def test_synthetic_sample_and_remove(self):
    synthetic = SyntheticDatabase(10, 10, number_features=3)  # 100 records: 10 entities, 10 records each
    synthetic2 = synthetic.sample_and_remove(50)
    self.assertEqual(len(synthetic.database.records), 50)
    self.assertEqual(len(synthetic2.database.records), 50)
    self.assertEqual(set(synthetic.database.records.keys()), set(synthetic.labels.keys()))
    self.assertEqual(set(synthetic2.database.records.keys()), set(synthetic2.labels.keys()))
    self.assertEqual(len(set(synthetic.database.records.keys()) & set(synthetic2.database.records.keys())), 0)
    self.assertEqual(len(set(synthetic.labels.keys()) & set(synthetic2.labels.keys())), 0)
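
# A usage sketch (assumed, not part of the original suite) making explicit the
# invariant the test above checks: sample_and_remove() moves records out of the
# database in place and returns them as a new SyntheticDatabase whose record
# identifiers and labels are disjoint from what remains behind.
def test_sample_and_remove_disjoint_split_sketch(self):
    full = SyntheticDatabase(20, 10, number_features=3)  # 200 records total
    train = full.sample_and_remove(120)                  # 120 records move to the new database
    self.assertEqual(len(train.database.records), 120)
    self.assertEqual(len(full.database.records), 80)     # remainder stays behind
    self.assertFalse(set(train.database.records) & set(full.database.records))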
import numpy as np
# Project-internal dependencies, assumed importable from local modules:
# Database, SyntheticDatabase, Experiment, fast_strong_cluster
# (pickle is only needed if the commented-out dump below is re-enabled)


def experiment_wrapper(dataset_name):
    """
    Experiment wrapper: takes the dataset name; all experiment parameters are set here.
    :param dataset_name: Name of the dataset to run on, one of 'synthetic', 'restaurant', 'abt-buy', 'trafficking'
    """
    if dataset_name == 'synthetic':
        number_entities = 100
        records_per_entity = 10
        train_database_size = 200
        train_class_balance = 0.5
        validation_database_size = 200
        corruption = 0.001  # 0.025
        number_thresholds = 30
        number_features = 10
        synthetic_database = SyntheticDatabase(number_entities, records_per_entity, number_features=number_features)
        corruption_array = corruption*np.random.normal(loc=0.0, scale=1.0,
                                                       size=[validation_database_size,
                                                             synthetic_database.database.feature_descriptor.number])
        synthetic_database.corrupt(corruption_array)
        synthetic_train = synthetic_database.sample_and_remove(train_database_size)
        synthetic_validation = synthetic_database.sample_and_remove(validation_database_size)
        synthetic_test = synthetic_database  # whatever remains after sampling is the test set
        thresholds = np.linspace(0, 1, number_thresholds)
        experiment = Experiment(synthetic_train.database, synthetic_validation.database, synthetic_test.database,
                                synthetic_train.labels, synthetic_validation.labels, synthetic_test.labels,
                                train_class_balance, thresholds)
        experiment.plot()
    else:
        number_thresholds = 5
        if dataset_name == 'restaurant':  # 864 records, 112 matches
            features_path = '../data/restaurant/merged.csv'
            labels_path = '../data/restaurant/labels.csv'
            train_database_size = 300
            train_class_balance = 0.4
            validation_database_size = 200
            database = Database(annotation_path=features_path)
        elif dataset_name == 'abt-buy':  # ~4900 records, 1300 matches
            features_path = '../data/Abt-Buy/merged.csv'
            labels_path = '../data/Abt-Buy/labels.csv'
            train_database_size = 300
            train_class_balance = 0.4
            validation_database_size = 300
            database = Database(annotation_path=features_path)
        elif dataset_name == 'trafficking':
            features_path = '../data/trafficking/features.csv'
            labels_path = '../data/trafficking/labels.csv'
            train_database_size = 300
            train_class_balance = 0.5
            validation_database_size = 300
            # database = Database(annotation_path=features_path)
        else:
            raise Exception('Invalid dataset name')
        thresholds = np.linspace(0, 1, number_thresholds)
        # labels = np.loadtxt(open(labels_path, 'rb'))
        # database_train = database.sample_and_remove(train_database_size)
        # database_validation = database.sample_and_remove(validation_database_size)
        # database_test = database
        # labels_train = dict()
        # labels_validation = dict()
        # labels_test = dict()
        # for identifier, label in enumerate(labels):
        #     if identifier in database_train.records:
        #         labels_train[identifier] = label
        #     elif identifier in database_validation.records:
        #         labels_validation[identifier] = label
        #     elif identifier in database_test.records:
        #         labels_test[identifier] = label
        #     else:
        #         raise Exception('Record identifier ' + str(identifier) + ' not in either database')
        ### Hard-coded trafficking subsamples (used regardless of the branch above)
        database_train = Database('../data/trafficking/cluster_subsample0_10000.csv',
                                  header_path='../data/trafficking/cluster_subsample_header_LM.csv',
                                  max_records=5000)
        database_validation = Database('../data/trafficking/cluster_subsample1_10000.csv',
                                       header_path='../data/trafficking/cluster_subsample_header_LM.csv',
                                       max_records=5000)
        database_test = Database('../data/trafficking/cluster_subsample2_10000.csv',
                                 header_path='../data/trafficking/cluster_subsample_header_LM.csv',
                                 max_records=1000)
        labels_train = fast_strong_cluster(database_train)
        labels_validation = fast_strong_cluster(database_validation)
        labels_test = fast_strong_cluster(database_test)
        ###
        experiment = Experiment(database_train, database_validation, database_test,
                                labels_train, labels_validation, labels_test,
                                train_class_balance, thresholds)
        # print('Saving results')
        # pickle.dump(experiment, open('experiment.p', 'wb'))
        experiment.plot()
    print('Finished')
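

# Minimal entry point (a usage sketch; the __main__ guard and dataset choice
# below are assumptions, not part of the original module):
if __name__ == '__main__':
    # Any of: 'synthetic', 'restaurant', 'abt-buy', 'trafficking'
    experiment_wrapper('synthetic')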