Example #1
def test_synthetic_sample_and_remove(self):
    synthetic = SyntheticDatabase(10, 10, number_features=3)  # 10 entities x 10 records each = 100 records
    synthetic2 = synthetic.sample_and_remove(50)
    # Each half keeps exactly 50 records.
    self.assertEqual(len(synthetic.database.records), 50)
    self.assertEqual(len(synthetic2.database.records), 50)
    # Labels stay aligned with the records in each half.
    self.assertEqual(set(synthetic.database.records.keys()), set(synthetic.labels.keys()))
    self.assertEqual(set(synthetic2.database.records.keys()), set(synthetic2.labels.keys()))
    # The two halves share no record identifiers or labels.
    self.assertEqual(len(set(synthetic.database.records.keys()) & set(synthetic2.database.records.keys())), 0)
    self.assertEqual(len(set(synthetic.labels.keys()) & set(synthetic2.labels.keys())), 0)
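
The same sample_and_remove call chains into the three-way split that Example #2 below performs. A minimal sketch, assuming only the SyntheticDatabase API exercised in the test (the 60/20/20 sizes are illustrative):

# Sketch: repeated sample_and_remove yields train/validation/test splits.
full = SyntheticDatabase(10, 10, number_features=3)  # 100 records
train = full.sample_and_remove(60)       # 60 records move into train
validation = full.sample_and_remove(20)  # 20 more move into validation
test = full                              # the remaining 20 form the test set
assert len(test.database.records) == 20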
Example #2
import numpy as np

# Assumes the project-local SyntheticDatabase, Database, Experiment, and
# fast_strong_cluster helpers are importable from the surrounding package.

def experiment_wrapper(dataset_name):
    """
    Experiment wrapper: takes the dataset name and keeps all experiment parameters in one place.
    :param dataset_name: Name of the dataset to run on, one of 'synthetic', 'restaurant', 'abt-buy', or 'trafficking'
    """
    if dataset_name == 'synthetic':
        number_entities = 100
        records_per_entity = 10
        train_database_size = 200
        train_class_balance = 0.5
        validation_database_size = 200
        corruption = 0.001  # alternative setting: 0.025
        number_thresholds = 30
        number_features = 10

        synthetic_database = SyntheticDatabase(number_entities, records_per_entity, number_features=number_features)
        # Additive Gaussian noise applied to the feature values
        corruption_array = corruption * np.random.normal(
            loc=0.0, scale=1.0,
            size=[validation_database_size, synthetic_database.database.feature_descriptor.number])
        synthetic_database.corrupt(corruption_array)
        synthetic_train = synthetic_database.sample_and_remove(train_database_size)
        synthetic_validation = synthetic_database.sample_and_remove(validation_database_size)
        synthetic_test = synthetic_database  # remaining records form the test set
        thresholds = np.linspace(0, 1, number_thresholds)
        experiment = Experiment(synthetic_train.database, synthetic_validation.database, synthetic_test.database,
                                synthetic_train.labels, synthetic_validation.labels, synthetic_test.labels,
                                train_class_balance, thresholds)
        experiment.plot()
    else:
        number_thresholds = 5
        if dataset_name == 'restaurant':  # 864 records, 112 matches
            features_path = '../data/restaurant/merged.csv'
            labels_path = '../data/restaurant/labels.csv'
            train_database_size = 300
            train_class_balance = 0.4
            validation_database_size = 200
            database = Database(annotation_path=features_path)
        elif dataset_name == 'abt-buy':  # ~4900 records, 1300 matches
            features_path = '../data/Abt-Buy/merged.csv'
            labels_path = '../data/Abt-Buy/labels.csv'
            train_database_size = 300
            train_class_balance = 0.4
            validation_database_size = 300
            database = Database(annotation_path=features_path)
        elif dataset_name == 'trafficking':
            features_path = '../data/trafficking/features.csv'
            labels_path = '../data/trafficking/labels.csv'
            train_database_size = 300
            train_class_balance = 0.5
            validation_database_size = 300
            # database = Database(annotation_path=features_path)  # disabled: trafficking data is loaded below
        else:
            raise ValueError('Invalid dataset name: ' + dataset_name)
        thresholds = np.linspace(0, 1, number_thresholds)
        # Generic path, currently disabled: load labels from labels_path and
        # split a single database into train/validation/test.
        # labels = np.loadtxt(open(labels_path, 'rb'))
        # database_train = database.sample_and_remove(train_database_size)
        # database_validation = database.sample_and_remove(validation_database_size)
        # database_test = database
        # labels_train = dict()
        # labels_validation = dict()
        # labels_test = dict()
        # for identifier, label in enumerate(labels):
        #     if identifier in database_train.records:
        #         labels_train[identifier] = label
        #     elif identifier in database_validation.records:
        #         labels_validation[identifier] = label
        #     elif identifier in database_test.records:
        #         labels_test[identifier] = label
        #     else:
        #         raise Exception('Record identifier ' + str(identifier) + ' not in either database')
        # Hard-coded trafficking subsamples (currently used for every non-synthetic dataset):
        database_train = Database('../data/trafficking/cluster_subsample0_10000.csv',
                                  header_path='../data/trafficking/cluster_subsample_header_LM.csv',
                                  max_records=5000)
        database_validation = Database('../data/trafficking/cluster_subsample1_10000.csv',
                                       header_path='../data/trafficking/cluster_subsample_header_LM.csv',
                                       max_records=5000)
        database_test = Database('../data/trafficking/cluster_subsample2_10000.csv',
                                 header_path='../data/trafficking/cluster_subsample_header_LM.csv',
                                 max_records=1000)

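        # Label each split with fast_strong_cluster, used here in place of the
        # file-based labels loaded in the disabled block above.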
        labels_train = fast_strong_cluster(database_train)
        labels_validation = fast_strong_cluster(database_validation)
        labels_test = fast_strong_cluster(database_test)

        experiment = Experiment(database_train, database_validation, database_test,
                                labels_train, labels_validation, labels_test,
                                train_class_balance, thresholds)
        # print('Saving results')
        # pickle.dump(experiment, open('experiment.p', 'wb'))
        experiment.plot()
    print('Finished')
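
A hypothetical entry point for the wrapper; 'synthetic' generates its data in memory, while the other dataset names expect the ../data/ CSVs referenced above:

# Hypothetical driver, assuming this module is executed directly.
if __name__ == '__main__':
    experiment_wrapper('synthetic')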