Example #1
    def handle(self, *args, **options):

        self.init_options(options)
        self.log('Started at: {}'.format(datetime.now()))

        if self.options.get("verbose") > 0:
            with open(self.options["model_file"], 'rb') as sf:
                describe_dedupe(sys.stdout, dedupe.StaticDedupe(sf))

        with open(self.options["points_file"], 'w', encoding="utf8") as outf:
            for test_pool in options["test_pool"]:
                self.process_test(test_pool, outf)
Example #2
    def deduper_setup(self, settings_file, training_file, field_list, selection, sample):
        """
        Trains the deduper (if the settings file does not exist); otherwise loads a static deduper object
        :param settings_file: settings file name
        :param training_file: training file name
        :param field_list: list of lists (field(string), comparator(string), missing?(bool))
        :param selection: sql statement selecting all relevant columns to use in deduplication
        :param sample: sample size of data to be used for training
        :return: deduper object
        """
        if os.path.exists(settings_file):
            print('Reading from ', settings_file)
            with open(settings_file, 'rb') as sf:
                self.deduper = dedupe.StaticDedupe(sf, num_cores=4)
        else:
            # Define the fields dedupe will pay attention to
            fields = []
            for field in field_list:
                fields.append({'field': field[0], 'type': field[1], 'has missing': field[2]})

            # Create a new deduper object and pass our data model to it.
            self.deduper = dedupe.Dedupe(fields, num_cores=4)

            data = db.pandas_read(selection).to_dict('index')

            print('Collecting sample data for active learning... this may take a while.')
            self.deduper.sample(data, sample)

            if os.path.exists(training_file):
                print('Reading labeled examples from ', training_file)
                with open(training_file) as tf:
                    self.deduper.readTraining(tf)

            print('Starting active labeling...')
            dedupe.convenience.consoleLabel(self.deduper)

            # When finished, save our labeled, training pairs to disk
            with open(training_file, 'w') as tf:
                self.deduper.writeTraining(tf)

            # `recall` is the proportion of true duplicate pairs that the learned
            # rules must cover. You may want to reduce this if you are making
            # too many blocks and too many comparisons.
            self.deduper.train(recall=0.90)

            with open(settings_file, 'wb') as sf:
                self.deduper.writeSettings(sf)

            self.deduper.cleanupTraining()
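
Once deduper_setup has loaded or trained a model, the resulting self.deduper can be used for clustering. A minimal sketch of a hypothetical caller (here called matcher, an instance of the surrounding class) using the dedupe 1.x threshold/match API that the rest of this example relies on; the field list, SQL, and file names are illustrative only:

# Hypothetical driver for deduper_setup(); all names below are illustrative.
field_list = [
    ['CompanyName', 'String', True],   # (field, comparator, has missing)
    ['Website', 'String', True],
]
selection = 'SELECT CompanyID, CompanyName, Website FROM Reporting.DimCompany'

matcher.deduper_setup('dedupe_settings', 'dedupe_training.json',
                      field_list, selection, sample=15000)

# Cluster with the trained (or loaded) deduper.
data = db.pandas_read(selection).to_dict('index')
threshold = matcher.deduper.threshold(data, recall_weight=1)
clustered_dupes = matcher.deduper.match(data, threshold)   # [(record_ids, scores), ...]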
Example #3
    def print_roc_points_quick_and_dirty(self, test_pool_file_name,
                                         output_points_file):
        clustered_dupes = []
        with open(self.options["model_file"], 'rb') as sf:
            dedupe_model = dedupe.StaticDedupe(sf)
            clustered_dupes = dedupe_model.match(self.dedupe_objects, 0)
        for t in ROC_POINTS:
            threshold = t / 100.0
            pairs = get_pairs_from_clusters(clustered_dupes,
                                            threshold=threshold)
            (metrics, test_results) = calcMetrics(pairs, self.test_data)
            self.print_test_output(test_results)

            metrics['Threshold'] = threshold
            metrics['TestName'] = os.path.basename(test_pool_file_name)
            output_points_file.write(json.dumps(metrics) + "\n")
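
get_pairs_from_clusters and calcMetrics are project helpers that are not shown in this snippet. A plausible sketch of the former, assuming the clusters are the (record_ids, scores) tuples returned by StaticDedupe.match and that a pair is kept when both of its cluster scores clear the threshold:

from itertools import combinations

def get_pairs_from_clusters(clustered_dupes, threshold=0.5):
    # Sketch only: the real project helper may differ.
    # Flatten each dedupe cluster into record-id pairs whose scores clear the threshold.
    pairs = set()
    for record_ids, scores in clustered_dupes:
        for (id_a, score_a), (id_b, score_b) in combinations(zip(record_ids, scores), 2):
            if min(score_a, score_b) >= threshold:
                pairs.add((id_a, id_b))
    return pairs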
Example #4
def deduplicate(
    df,
    recall_weight=1,
    sample_size=0.3,
    settings_file="training-data/dedupe_learned_settings",
    training_file="training-data/dedupe_training.json",
):
    fields = df.columns
    df, data_d = data_prep(df)

    if os.path.exists(settings_file):
        logger.info("Existing settings found. Loading from: %s", settings_file)
        with open(settings_file, "rb") as f:
            deduper = dedupe.StaticDedupe(f)
    else:
        fields = select_fields(fields)
        deduper = dedupe.Dedupe(fields)
        sample_num = math.floor(len(data_d) * sample_size)

        logger.info("Extracting data sample of %s records", sample_num)
        deduper.sample(data_d, sample_num)

        if os.path.exists(training_file):
            logger.info("Reading training examples from: %s", training_file)
            with open(training_file, "rb") as f:
                deduper.readTraining(f)

        logger.info("Starting active labeling")

        dedupe.consoleLabel(deduper)

        deduper.train()

        with open(training_file, "w") as tf:
            deduper.writeTraining(tf)
        with open(settings_file, "wb") as sf:
            deduper.writeSettings(sf)

    threshold = deduper.threshold(data_d, recall_weight=recall_weight)

    clustered_df = data_cluster(deduper, data_d, threshold)
    results = df.join(clustered_df, how="left")
    results.drop(["dictionary"], axis=1, inplace=True)

    return results
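
data_prep, select_fields, and data_cluster are project helpers that are not shown here. A hypothetical call, assuming deduplicate() receives a pandas DataFrame and the training/settings paths above are writable; the column names are illustrative:

import pandas as pd

# Illustrative input; these column names are assumptions, not part of the original module.
df = pd.DataFrame({
    "name": ["Acme Inc", "ACME Incorporated", "Globex"],
    "city": ["Berlin", "Berlin", "Springfield"],
})

results = deduplicate(df, recall_weight=1, sample_size=0.3)
print(results.head())  # original columns plus the cluster columns added by data_cluster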
Example #5
def deduper_eval(dataset_type: str, dataset):
    # Create deduper model
    with open('trained_{}_settings.json'.format(dataset_type), 'rb') as fin:
        deduper = dedupe.StaticDedupe(fin, num_cores=8)

    # Prepare the data
    if dataset_type in ['x2', 'x3']:
        cols = [
            'instance_id',
            'brand',
            'model_name',
            # 'model_number',
            'cpu_brand',
            'cpu_model',
            'cpu_type',
            'ram_capacity',
            'hdd_capacity',
            'ssd_capacity',
            'title',
            'screen_size',
            'model'
        ]
    else:
        cols = [
            'name', 'name_2', 'model', 'line', 'brand', 'size', 'product_type'
        ]
    to_dedupe = dataset[cols]
    to_dedupe_dict = to_dedupe.to_dict(orient='index')

    # Cluster (prediction stage)
    clustered_dupes = deduper.partition(to_dedupe_dict,
                                        partition_threshold[dataset_type])
    print('# duplicate sets', len(clustered_dupes))

    # Save the result
    res = []
    for el in clustered_dupes:
        for i in range(len(el[0])):
            for j in range(i + 1, len(el[0])):
                res.append((el[0][i], el[0][j]))

    res_df = pd.DataFrame(res)
    res_df.columns = ['left_instance_id', 'right_instance_id']
    return res_df
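
partition_threshold is a module-level mapping from dataset type to clustering threshold and is not shown above. A hypothetical invocation, assuming such a mapping, a pre-trained trained_x2_settings.json file, and a CSV containing the x2/x3 columns listed in the function:

import pandas as pd

# Assumed module-level configuration (not shown in the original snippet).
partition_threshold = {'x2': 0.5, 'x3': 0.5, 'x4': 0.5}

dataset = pd.read_csv('X2.csv')  # must contain instance_id, brand, ..., model
pairs_df = deduper_eval('x2', dataset)
pairs_df.to_csv('x2_candidate_pairs.csv', index=False)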
Example #6
    def match(self):
        '''
        generate the clusters
        '''
        print('Start Match Parent Method')
        print(datetime.now())
        with self.settings_file.get_file_bytes() as s:
            deduper = dedupe.StaticDedupe(s)

        src_data = self.data_source.get_data_dict()
        print('Start Threshold')
        print(datetime.now())
        threshold = deduper.threshold(src_data,
                                      recall_weight=self.recall_weight)
        print('threshold equals ' + str(threshold))
        print('Start Dedupe Match')
        print(datetime.now())
        clustered_dupes = deduper.match(src_data, threshold)
        print('Start Writing Clusters')
        print(datetime.now())
        self.write_clusters(clustered_dupes, src_data)
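
write_clusters, settings_file, and data_source are provided by the surrounding class and are not included in this snippet. A rough sketch of what a write_clusters implementation might look like, assuming it flattens the (record_ids, scores) tuples returned by match into rows keyed by a cluster id:

import csv

def write_clusters(clustered_dupes, src_data, path='clusters.csv'):
    # Sketch only: flatten dedupe clusters into (cluster_id, record_id, score) rows.
    with open(path, 'w', newline='') as f:
        writer = csv.writer(f)
        writer.writerow(['cluster_id', 'record_id', 'score'])
        for cluster_id, (record_ids, scores) in enumerate(clustered_dupes):
            for record_id, score in zip(record_ids, scores):
                writer.writerow([cluster_id, record_id, score])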
Example #7
File: duplicates.py  Project: rajivraj/RVD
    def is_duplicate(self, flaw):
        """
        Checks whether the flaw passed as a parameter is a duplicate.
        Uses training information already available in the training data folder.

        NOTE: should be called from the RVD repository directory.

        :param flaw: Flaw
        :return: bool
        """
        data_d = self.read_data(None, invalid=False)  # data dict
        # pprint.pprint(data_d)

        # Append the flaw to the data dictionary with the ID 0
        data_d[0] = flaw.document_duplicates()
        # pprint.pprint(data_d)

        if os.path.exists(self.settings_file):
            print("reading from", self.settings_file)
            with open(self.settings_file, "rb") as f:
                deduper = dedupe.StaticDedupe(f)
        else:
            red("Error: settings file does not exist, stoping")
            sys.exit(1)

        cyan("Finding the threshold for data...")
        threshold = deduper.threshold(data_d, recall_weight=1)

        cyan("Clustering...")
        clustered_dupes = deduper.match(data_d, threshold)
        # pprint.pprint(clustered_dupes)  # debug purposes

        # If ID 0 (which corresponds to the flaw passed as argument) appears in a cluster, it is a duplicate
        for cluster in clustered_dupes:
            ids, values = cluster
            for id in ids:
                # print(id)
                if int(id) == 0:
                    return True
        return False
Example #8
def blockDedupe(session_id, 
                table_name=None, 
                entity_table_name=None, 
                canonical=False):

    if not table_name:
        table_name = 'processed_{0}'.format(session_id)
    if not entity_table_name:
        entity_table_name = 'entity_{0}'.format(session_id)
    dd_session = worker_session.query(DedupeSession)\
        .get(session_id)
    deduper = dedupe.StaticDedupe(StringIO(dd_session.settings_file))
    engine = worker_session.bind
    metadata = MetaData()
    proc_table = Table(table_name, metadata,
        autoload=True, autoload_with=engine)
    entity_table = Table(entity_table_name, metadata,
        autoload=True, autoload_with=engine)
    for field in deduper.blocker.tfidf_fields:
        with engine.begin() as conn:
            fd = conn.execute('select record_id, {0} from "{1}"'.format(field, table_name))
            deduper.blocker.tfIdfBlock(fd, field)
    """ 
    SELECT p.* <-- need the fields that we trained on at least
        FROM processed as p
        LEFT OUTER JOIN entity_map as e
           ON p.record_id = e.record_id
        WHERE e.target_record_id IS NULL
    """
    proc_records = worker_session.query(proc_table)\
        .outerjoin(entity_table, proc_table.c.record_id == entity_table.c.record_id)\
        .filter(entity_table.c.target_record_id == None)
    fields = proc_table.columns.keys()
    full_data = ((getattr(row, 'record_id'), dict(zip(fields, row))) \
        for row in proc_records.yield_per(50000))
    return deduper.blocker(full_data)
Example #9
    def main(self):

        data_d = {}
        # import the specified CSV file

        data_d = csvhelpers.readData(self.input, self.field_names)

        logging.info('imported %d rows', len(data_d))

        # sanity check for provided field names in CSV file
        for field in self.field_definition:
            if field['type'] != 'Interaction':
                if not field['field'] in data_d[0]:

                    raise self.parser.error("Could not find field '" +
                                            field['field'] + "' in input")

        logging.info('using fields: %s' % [field['field']
                                           for field in self.field_definition])

        # If --skip_training has been selected, and we have a settings cache still
        # persisting from the last run, use it in this next run.
        # __Note:__ if you want to add more training data, don't use skip training
        if self.skip_training and os.path.exists(self.settings_file):

            # Load our deduper from the last training session cache.
            logging.info('reading from previous training cache %s'
                                                          % self.settings_file)
            with open(self.settings_file, 'rb') as f:
                deduper = dedupe.StaticDedupe(f)

            fields = {variable.field for variable in deduper.data_model.primary_fields}
            unique_d, parents = exact_matches(data_d, fields)
                
        else:
            # # Create a new deduper object and pass our data model to it.
            deduper = dedupe.Dedupe(self.field_definition)

            fields = {variable.field for variable in deduper.data_model.primary_fields}
            unique_d, parents = exact_matches(data_d, fields)

            # Set up our data sample
            logging.info('taking a sample of %d possible pairs', self.sample_size)
            deduper.sample(unique_d, self.sample_size)

            # Perform standard training procedures
            self.dedupe_training(deduper)

        # ## Blocking

        logging.info('blocking...')

        # ## Clustering

        # Find the threshold that will maximize a weighted average of our precision and recall. 
        # When we set the recall weight to 2, we are saying we care twice as much
        # about recall as we do precision.
        #
        # If we had more data, we would not pass in all the blocked data into
        # this function but a representative sample.

        logging.info('finding a good threshold with a recall_weight of %s' %
                     self.recall_weight)
        threshold = deduper.threshold(unique_d, recall_weight=self.recall_weight)

        # `match` will return sets of record IDs that dedupe
        # believes are all referring to the same entity.

        logging.info('clustering...')
        clustered_dupes = deduper.match(unique_d, threshold)

        expanded_clustered_dupes = []
        for cluster, scores in clustered_dupes:
            new_cluster = list(cluster)
            new_scores = list(scores)
            for row_id, score in zip(cluster, scores):
                children = parents.get(row_id, [])
                new_cluster.extend(children)
                new_scores.extend([score] * len(children))
            expanded_clustered_dupes.append((new_cluster, new_scores))

        clustered_dupes = expanded_clustered_dupes

        logging.info('# duplicate sets %s' % len(clustered_dupes))

        write_function = csvhelpers.writeResults
        # write out our results
        if self.destructive:
            write_function = csvhelpers.writeUniqueResults

        if self.output_file:
            with open(self.output_file, 'w', encoding='utf-8') as output_file:
                write_function(clustered_dupes, self.input, output_file)
        else:
            if sys.version < '3' :
                out = codecs.getwriter(locale.getpreferredencoding())(sys.stdout)
                write_function(clustered_dupes, self.input, out)
            else :
                write_function(clustered_dupes, self.input, sys.stdout)
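
csvhelpers and exact_matches come from the surrounding csvdedupe-style project and are not reproduced here. A plausible sketch of exact_matches, consistent with how it is used above: it keeps one representative per group of records that agree exactly on the chosen fields and returns a parent-to-children map used later to expand the clusters.

import collections

def exact_matches(data_d, match_fields):
    # Sketch only: collapse records that agree exactly on match_fields.
    groups = collections.defaultdict(list)
    for record_id, record in data_d.items():
        key = tuple(record.get(field) for field in match_fields)
        groups[key].append(record_id)

    unique_d = {}
    parents = {}
    for record_ids in groups.values():
        parent = record_ids[0]
        unique_d[parent] = data_d[parent]
        parents[parent] = record_ids[1:]  # children folded into the parent record
    return unique_d, parents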
Example #10
def train(clients):

    if os.path.exists(settings_file):
        with open(settings_file) as sf:
            print 'reading from', settings_file
            return dedupe.StaticDedupe(sf)

    else:
        fields = [
            {
                'field': 'city',
                'type': 'ShortString',
                'Has Missing': True
            },
            {
                'field': 'country',
                'type': 'ShortString',
                'Has Missing': True
            },
            {
                'field': 'zip',
                'type': 'ShortString',
                'Has Missing': True
            },
            {
                'field': 'state',
                'type': 'ShortString',
                'Has Missing': True
            },
            {
                'field': 'address',
                'type': 'String',
                'Has Missing': True
            },
            {
                'field': 'description',
                'type': 'Text',
                'Has Missing': True,
                'corpus':
                []  #map(lambda x: x['description'],clients.values())},
            },
            {
                'field': 'specific_issues',
                'type': 'Text',
                'Has Missing': True,
                'corpus':
                []  #map(lambda x: x['specific_issues'],clients.values())
            },
            {
                'field': 'exact_name',
                'type': 'String'
            },
            {
                'field': 'alis',
                'type': 'Set',
                'corpus': []  #map(lambda x: x['alis'],clients.values())
            },
            {
                'field': 'houseID',
                'type': 'Exact'
            },
            {
                'field': 'senate',
                'type': 'Exact'
            },
        ]
        #for definition in fields[:] :
        #    if definition.field != 'houseID':
        #        fields[k+"-houseID"] = {'type':'Interaction',
        #                                'Interaction Fields': ["houseID", k]}
        # Create a new deduper object and pass our data model to it.
        deduper = dedupe.Dedupe(fields)

        # To train dedupe, we feed it a random sample of records.
        deduper.sample(clients, 150000)

        # If we have training data saved from a previous run of dedupe,
        # look for it and load it in.
        # __Note:__ if you want to train from scratch, delete the training_file
        if os.path.exists(training_file):
            with open(training_file) as tf:
                print 'reading labeled examples from ', training_file
                deduper.readTraining(tf)

        # ## Active learning
        # Dedupe will find the next pair of records
        # it is least certain about and ask you to label them as duplicates
        # or not.
        # use 'y', 'n' and 'u' keys to flag duplicates
        # press 'f' when you are finished
        print 'starting active labeling...'

        dedupe.consoleLabel(deduper)

        deduper.train(ppc=0.1, uncovered_dupes=2)

        # When finished, save our training away to disk
        with open(training_file, 'w') as tf:
            deduper.writeTraining(tf)

        # Save our weights and predicates to disk.  If the settings file
        # exists, we will skip all the training and learning next time we run
        # this file.
        with open(settings_file, 'w') as sf:
            deduper.writeSettings(sf)

        deduper.cleanupTraining()

        return deduper
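Example #11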
def run_dedupe(settings_file, training_file, type):
    start_time = time.time()
    print("dedupe file ", dedupe.__file__)
    print("dedupe path ", dedupe.__path__)
    print("dedupe name ", dedupe.__name__)
    print("dedupe spec ", dedupe.__spec__)
    print("dedupe doc ", dedupe.__doc__)
    print("dedupe loader ", dedupe.__loader__)
    # Set the database connection from environment variable using
    # [dj_database_url](https://github.com/kennethreitz/dj-database-url)
    # For example:
    #   export DATABASE_URL=postgres://user:password@host/mydatabase
    db_conf = dj_database_url.config()

    if not db_conf:
        raise Exception(
            'set DATABASE_URL environment variable with your connection, e.g. '
            'export DATABASE_URL=postgres://user:password@host/mydatabase')

    read_con = psycopg2.connect(database=db_conf['NAME'],
                                user=db_conf['USER'],
                                password=db_conf['PASSWORD'],
                                host=db_conf['HOST'],
                                port=db_conf['PORT'],
                                cursor_factory=psycopg2.extras.RealDictCursor)

    write_con = psycopg2.connect(database=db_conf['NAME'],
                                 user=db_conf['USER'],
                                 password=db_conf['PASSWORD'],
                                 host=db_conf['HOST'],
                                 port=db_conf['PORT'])

    # We'll be using variations on this following select statement to pull
    # in campaign donor info.
    if type == 'IND':
        DONOR_SELECT = "SELECT donor_id, city, name, zip, state, street " \
                   "from processed_donors where person = 1 and name not like '%unitem%' "
    else:
        DONOR_SELECT = "SELECT donor_id, city, name, zip, state, street " \
                   "from processed_donors where person != 1 and name not like '%unitem%' "

    # ## Training
    if os.path.exists(settings_file):
        print('reading from ', settings_file)
        with open(settings_file, 'rb') as sf:
            deduper = dedupe.StaticDedupe(sf, num_cores=1)
    else:
        # Define the fields dedupe will pay attention to
        #
        # The street, city, and zip fields are often missing, so we'll
        # tell dedupe that, and we'll learn a model that takes that into
        # account
        fields = [
            {
                'field': 'name',
                'type': 'String'
            },
            {
                'field': 'street',
                'type': 'String',
                'has missing': True
            },
            {
                'field': 'city',
                'type': 'String',
                'has missing': True
            },
            {
                'field': 'state',
                'type': 'ShortString',
                'has missing': True
            },
            {
                'field': 'zip',
                'type': 'ShortString',
                'has missing': True
            },
        ]

        # Create a new deduper object and pass our data model to it.
        deduper = dedupe.Dedupe(fields, num_cores=1)

        # Named cursor runs server side with psycopg2
        with read_con.cursor('donor_select') as cur:
            cur.execute(DONOR_SELECT)
            temp_d = {i: row for i, row in enumerate(cur)}

        # If we have training data saved from a previous run of dedupe,
        # look for it and load it in.
        #
        # __Note:__ if you want to train from
        # scratch, delete the training_file
        if os.path.exists(training_file):
            print('reading labeled examples from ', training_file)
            with open(training_file) as tf:
                deduper.prepare_training(temp_d, tf)
        else:
            deduper.prepare_training(temp_d)

        del temp_d

        # ## Active learning

        print('starting active labeling...')
        # Starts the training loop. Dedupe will find the next pair of records
        # it is least certain about and ask you to label them as duplicates
        # or not.

        # use 'y', 'n' and 'u' keys to flag duplicates
        # press 'f' when you are finished
        dedupe.console_label(deduper)
        # When finished, save our labeled, training pairs to disk
        with open(training_file, 'w') as tf:
            deduper.write_training(tf)

        # Notice our argument here
        #
        # `recall` is the proportion of true duplicate pairs that the learned
        # rules must cover. You may want to reduce this if you are making
        # too many blocks and too many comparisons.
        deduper.train(recall=0.90)

        with open(settings_file, 'wb') as sf:
            deduper.write_settings(sf)

        # We can now remove some of the memory hogging objects we used
        # for training
        deduper.cleanup_training()

    print(f'deduper predicates: {deduper.predicates}')
    # ## Blocking
    print('blocking...')

    # To run blocking on such a large set of data, we create a separate table
    # that contains blocking keys and record ids
    print('creating blocking_map database')
    blocking_table = 'blocking_map_' + type
    with write_con:
        with write_con.cursor() as cur:
            cur.execute("DROP TABLE IF EXISTS " + blocking_table)
            cur.execute("CREATE TABLE " + blocking_table + " "
                        "(block_key text, donor_id INTEGER)")

    # If dedupe learned an Index Predicate, we have to take a pass
    # through the data and create indices.
    print('creating inverted index')

    for field in deduper.fingerprinter.index_fields:
        with read_con.cursor('field_values') as cur:
            cur.execute("SELECT DISTINCT %s FROM processed_donors" % field)
            field_data = (row[field] for row in cur)
            deduper.fingerprinter.index(field_data, field)

    # Now we are ready to write our blocking map table by creating a
    # generator that yields unique `(block_key, donor_id)` tuples.
    print('writing blocking map')

    with read_con.cursor('donor_select') as read_cur:
        read_cur.execute(DONOR_SELECT)

        full_data = ((row['donor_id'], row) for row in read_cur)
        b_data = deduper.fingerprinter(full_data)

        with write_con:
            with write_con.cursor() as write_cur:
                write_cur.copy_expert('COPY ' + blocking_table +
                                      ' FROM STDIN WITH CSV',
                                      Readable(b_data),
                                      size=100000)

    # free up memory by removing indices
    deduper.fingerprinter.reset_indices()

    logging.info("indexing block_key")
    with write_con:
        with write_con.cursor() as cur:
            cur.execute("CREATE UNIQUE INDEX ON " + blocking_table + " "
                        "(block_key text_pattern_ops, donor_id)")

    # ## Clustering
    entity_map_table = "entity_map_" + type
    with write_con:
        with write_con.cursor() as cur:
            cur.execute("DROP TABLE IF EXISTS " + entity_map_table)

            print('creating entity_map database')

            cur.execute("CREATE TABLE " + entity_map_table + " "
                        "(donor_id INTEGER, canon_id INTEGER, "
                        " cluster_score FLOAT, PRIMARY KEY(donor_id))")

    read_con1 = psycopg2.connect(database=db_conf['NAME'],
                                 user=db_conf['USER'],
                                 password=db_conf['PASSWORD'],
                                 host=db_conf['HOST'],
                                 port=db_conf['PORT'],
                                 cursor_factory=psycopg2.extras.RealDictCursor)
    with read_con1.cursor(
            'pairs', cursor_factory=psycopg2.extensions.cursor) as read_cur:
        read_cur.execute(
            "SELECT a.donor_id, "
            "row_to_json((SELECT d FROM (SELECT a.city, "
            "a.name, "
            "a.zip, "
            "a.state, "
            "a.street) d)), "
            "b.donor_id, "
            "row_to_json((SELECT d FROM (SELECT b.city, "
            "b.name, "
            "b.zip, "
            "b.state, "
            "b.street) d)) "
            "FROM (SELECT DISTINCT l.donor_id AS east, r.donor_id AS west "
            "FROM " + blocking_table + " AS l "
            "INNER JOIN " + blocking_table + " AS r "
            "USING (block_key) "
            "WHERE l.donor_id < r.donor_id) ids "
            "INNER JOIN processed_donors a ON ids.east=a.donor_id "
            "INNER JOIN processed_donors b ON ids.west=b.donor_id")

        print('clustering...')
        clustered_dupes = deduper.cluster(deduper.score(
            record_pairs(read_cur)),
                                          threshold=0.5)

        # ## Writing out results

        # We now have a sequence of tuples of donor ids that dedupe believes
        # all refer to the same entity. We write this out onto an entity map
        # table

        print('writing results')
        write_con1 = psycopg2.connect(database=db_conf['NAME'],
                                      user=db_conf['USER'],
                                      password=db_conf['PASSWORD'],
                                      host=db_conf['HOST'],
                                      port=db_conf['PORT'])
        with write_con1:
            with write_con1.cursor() as write_cur:
                write_cur.copy_expert('COPY ' + entity_map_table +
                                      ' FROM STDIN WITH CSV',
                                      Readable(cluster_ids(clustered_dupes)),
                                      size=100000)

    with write_con1:
        with write_con1.cursor() as cur:
            cur.execute("CREATE INDEX head_index_" + type + " ON " +
                        entity_map_table + " (canon_id)")

    # Print out the number of duplicates found

    # ## Payoff

    # With all this done, we can now begin to ask interesting questions
    # of the data
    #
    # For example, let's see who the top 10 donors are.

    locale.setlocale(locale.LC_ALL,
                     'en_US.UTF-8')  # for pretty printing numbers
    read_con2 = psycopg2.connect(database=db_conf['NAME'],
                                 user=db_conf['USER'],
                                 password=db_conf['PASSWORD'],
                                 host=db_conf['HOST'],
                                 port=db_conf['PORT'],
                                 cursor_factory=psycopg2.extras.RealDictCursor)
    # save entity map
    entity_map_filename = entity_map_table + "_" + settings_file.split(
        '/')[-1] + '_' + time.strftime('%d_%m_%y_%H%M',
                                       time.localtime()) + '.csv'
    donors_filename = 'processed_donors_' + settings_file.split(
        '/')[-1] + '_' + time.strftime('%d_%m_%y_%H%M',
                                       time.localtime()) + '.csv'
    with read_con2.cursor() as cur:
        with open(entity_map_filename, 'w') as file_out:
            cur.copy_expert(
                'COPY ' + entity_map_table + ' TO STDOUT WITH CSV HEADER',
                file_out)
        with open(donors_filename, 'w') as file_out:
            cur.copy_expert('COPY processed_donors TO STDOUT WITH CSV HEADER',
                            file_out)

    if type == 'IND':
        top_donor_where = 'donors.corp is null'
    else:
        top_donor_where = 'donors.corp is not null'
    # Create a temporary table so each group and unmatched record has
    # a unique id
    with read_con2.cursor() as cur:
        cur.execute(
            "CREATE TEMPORARY TABLE e_map "
            "AS SELECT COALESCE(canon_id, donor_id) AS canon_id, donor_id "
            "FROM " + entity_map_table + " "
            "RIGHT JOIN donors USING(donor_id)")

        cur.execute(
            "SELECT CONCAT_WS(' ', donors.first_name, donors.last_name, donors.corp) AS name, "
            "donation_totals.totals AS totals "
            "FROM (SELECT * FROM donors WHERE " + top_donor_where +
            ") as donors INNER JOIN "
            "(SELECT canon_id, SUM(CAST(contributions.amount AS FLOAT)) AS totals "
            " FROM contributions INNER JOIN e_map "
            " USING (donor_id) "
            " GROUP BY (canon_id) "
            " ORDER BY totals "
            " DESC) "
            "AS donation_totals ON donors.donor_id=donation_totals.canon_id "
            "WHERE donors.donor_id = donation_totals.canon_id and donation_totals.totals IS NOT NULL LIMIT 10"
        )

        print("Top Donors (deduped)")
        for row in cur:
            row['totals'] = locale.currency(row['totals'], grouping=True)
            print('%(totals)20s: %(name)s' % row)

        # Compare this to what we would have gotten if we hadn't done any
        # deduplication

        cur.execute(
            "SELECT name, totals FROM (SELECT CONCAT_WS(' ', donors.first_name, donors.last_name, donors.corp) AS name, "
            "SUM(CAST(contributions.amount AS FLOAT)) AS totals "
            "FROM donors INNER JOIN contributions "
            "USING (donor_id) "
            "WHERE " + top_donor_where + " "
            "GROUP BY (donor_id) "
            "ORDER BY totals DESC) AS t where totals IS NOT NULL "
            "LIMIT 10")

        print("Top Donors (raw)")
        for row in cur:
            row['totals'] = locale.currency(row['totals'], grouping=True)
            print('%(totals)20s: %(name)s' % row)

        cur.execute(
            " SELECT CONCAT_WS(' ', donors.first_name, donors.last_name, donors.corp) AS name, "
            " cluster_size, cluster_id "
            " FROM (SELECT * FROM donors WHERE " + top_donor_where +
            ") as donors "
            " INNER JOIN (SELECT count(*) AS cluster_size, canon_id AS cluster_id "
            " FROM " + entity_map_table + " "
            " GROUP BY canon_id) "
            " AS cluster_totals "
            " ON donors.donor_id = cluster_totals.cluster_id "
            " ORDER BY cluster_size DESC LIMIT 10")
        # ##Print stats that are saved to match_runs table
        # Biggest cluster first
        print("Biggest Clusters")
        biggest_name = ''
        for row in cur:
            print('%(cluster_size)20s: %(name)s' % row)
            if biggest_name == '':
                biggest_name = row['name'] + ':' + str(row['cluster_id'])
        # Then total of donors
        if type == 'IND':
            processed_donors_where = '= 1'
        else:
            processed_donors_where = '!= 1'
        cur.execute(
            " SELECT count(*) AS num_donors FROM processed_donors WHERE person "
            + processed_donors_where)
        for row in cur:
            number_of_donors = row['num_donors']
        # Then max, average and number of clusters
        cur.execute(
            " SELECT MAX(cluster_size) AS Biggest, AVG(cluster_size) AS Average, COUNT(cluster_id) AS number_of_clusters "
            " FROM (SELECT CONCAT_WS(' ', donors.first_name, donors.last_name, donors.corp) AS name, "
            " cluster_size, cluster_id "
            " FROM (SELECT * FROM donors WHERE " + top_donor_where +
            ") as donors"
            " INNER JOIN (SELECT count(*) AS cluster_size, canon_id AS cluster_id "
            " FROM " + entity_map_table + " "
            " GROUP BY canon_id) "
            " AS cluster_totals "
            " ON donors.donor_id = cluster_totals.cluster_id "
            " ORDER BY cluster_size DESC) AS stats")
        # Print the stats
        print("stats...")
        print(f"total number of donors: {number_of_donors}")
        for row in cur:
            print(
                'max: %(biggest)s, average: %(average)s, number of clusters: %(number_of_clusters)s'
                % row)
            biggest_size = row['biggest']
            average_size = row['average']
            number_of_clusters = row['number_of_clusters']
    # write to the match_run table
    runtime = time.time() - start_time
    donor_cluster_ratio = number_of_donors / number_of_clusters
    with write_con1.cursor() as cur:
        cur.execute(
            """ 
            INSERT INTO match_runs 
            (completed, predicates, total_clusters, avg_cluster_size, biggest_cluster_size, biggest_cluster,
             total_donors, donor_type, total_run_time, donor_cluster_ratio, settings_file)
            VALUES (NOW(), %s, %s, %s, %s, %s,
            %s, %s, %s, %s, %s) """,
            (' '.join(str(pred)
                      for pred in deduper.predicates), number_of_clusters,
             average_size, biggest_size, biggest_name, number_of_donors, type,
             runtime, donor_cluster_ratio, settings_file))
    write_con1.commit()

    read_con.close()
    write_con.close()
    read_con1.close()
    write_con1.close()
    read_con2.close()

    print('ran in', runtime, 'seconds')
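
record_pairs, cluster_ids, and Readable are helpers from the surrounding script that are not shown. Plausible sketches of the two generators, consistent with how they are used above: record_pairs turns the four-column SELECT into the (id, record) pairs that deduper.score expects, and cluster_ids flattens the clusters into the (donor_id, canon_id, cluster_score) rows copied into the entity map. The Readable CSV wrapper is omitted.

def record_pairs(result_set):
    # Sketch only: each row is (a_id, a_record_json, b_id, b_record_json).
    for a_record_id, a_record, b_record_id, b_record in result_set:
        yield (a_record_id, a_record), (b_record_id, b_record)


def cluster_ids(clustered_dupes):
    # Sketch only: use the first record id of each cluster as its canon_id.
    for cluster, scores in clustered_dupes:
        cluster_id = cluster[0]
        for donor_id, score in zip(cluster, scores):
            yield donor_id, cluster_id, score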
Example #12
File: cli.py  Project: melvin15may/ssdedupe
def train(con, config):
    if config['seed'] is not None:
        if os.environ.get('PYTHONHASHSEED', 'random') == 'random':
            logging.warning(
                'dedupe is only deterministic with hash randomization disabled. '
                'Set the PYTHONHASHSEED environment variable to a constant.')
        random.seed(config['seed'])
        numpy.random.seed(config['seed'])
    if config['use_saved_model']:
        print('reading from ', config['settings_file'])
        with open(config['settings_file'], 'rb') as sf:
            return dedupe.StaticDedupe(sf, num_cores=config['num_cores'])
    # Create a new deduper object and pass our data model to it.
    deduper = dedupe.Dedupe(config['all_fields'],
                            num_cores=config['num_cores'])

    cur = con.cursor(as_dict=True)

    cur.execute("""SELECT {all_columns}
                   FROM {schema}.entries_unique
                   ORDER BY _unique_id""".format(**config))
    temp_d = dict((i, unicode_to_str(row)) for i, row in enumerate(cur))
    deduper.sample(temp_d, 75000)

    del temp_d
    # If we have training data saved from a previous run of dedupe,
    # look for it and load it in.
    #
    # __Note:__ if you want to train from
    # scratch, delete the training_file
    if os.path.exists(config['training_file']):
        print('reading labeled examples from ', config['training_file'])
        with open(config['training_file']) as tf:
            deduper.readTraining(tf)

    if config['prompt_for_labels']:
        # ## Active learning
        print('starting active labeling...')
        # Starts the training loop. Dedupe will find the next pair of records
        # it is least certain about and ask you to label them as duplicates
        # or not.

        # use 'y', 'n' and 'u' keys to flag duplicates
        # press 'f' when you are finished
        dedupe.convenience.consoleLabel(deduper)
        # When finished, save our labeled, training pairs to disk
        with open(config['training_file'], 'w') as tf:
            deduper.writeTraining(tf)

    # `recall` is the proportion of true duplicate pairs that the learned
    # rules must cover. You may want to reduce this if you are making
    # too many blocks and too many comparisons.
    deduper.train(recall=config['recall'])

    with open(config['settings_file'], 'wb') as sf:
        deduper.writeSettings(sf)

    # We can now remove some of the memory hogging objects we used
    # for training
    deduper.cleanupTraining()
    return deduper
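
The config dictionary is assembled elsewhere in the CLI. A hypothetical example showing the keys this train() function reads, with illustrative values:

# Illustrative config; the real CLI builds this from its configuration file.
config = {
    'seed': 42,
    'use_saved_model': False,
    'settings_file': 'dedupe_settings',
    'training_file': 'dedupe_training.json',
    'num_cores': 4,
    'recall': 0.9,
    'prompt_for_labels': True,
    'schema': 'dedupe',
    'all_columns': 'entry_id, first_name, last_name, dob',
    'all_fields': [
        {'field': 'first_name', 'type': 'String'},
        {'field': 'last_name', 'type': 'String'},
        {'field': 'dob', 'type': 'ShortString', 'has missing': True},
    ],
}

deduper = train(con, config)  # con: a connection whose cursor(as_dict=True) yields dict rows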
Example #13
    def find_duplicates(self, train, push, label):
        """
        Find duplicates and print them via stdout
        """
        # data_d = self.read_data()
        data_d = self.read_data(label, invalid=False)
        # pprint.pprint(data_d)

        if train:
            deduper = self.train(data_d)
        else:
            if os.path.exists(self.settings_file):
                print("reading from", self.settings_file)
                with open(self.settings_file, "rb") as f:
                    deduper = dedupe.StaticDedupe(f)
            else:
                red("Error: settings file does not exist, stoping")
                sys.exit(1)

        cyan("Finding the threshold for data...")
        threshold = deduper.threshold(data_d, recall_weight=1)

        cyan("Clustering...")
        clustered_dupes = deduper.match(data_d, threshold)

        cyan("Number of duplicate sets: " + str(len(clustered_dupes)))
        for aset in clustered_dupes:
            yellow("Found a duplicated pair...")
            ids, values = aset
            primary_issue = None  # reflects the primary ticket in a set of
            # duplicates; all the duplicates should point
            # to this one
            for id in ids:
                # print(id)
                issue = self.repo.get_issue(int(id))

                # if duplicate in issue, do nothing
                labels = [l.name for l in issue.labels]
                if "duplicate" in labels:
                    gray(str(id) + " already duplicate, skipping it")
                    continue

                # print(issue)
                if primary_issue is None:
                    primary_issue = issue
                else:
                    # Indicate that this issue is duplicated
                    yellow(
                        "Marking "
                        + str(id)
                        + " as duplicate, referencing to: "
                        + str(primary_issue)
                    )
                    if push:
                        duplicate_text = (
                            "- <ins>DUPLICATE</ins>: Tagging this ticket as duplicate. Referencing to  #"
                            + str(primary_issue.number)
                            + "\n"
                        )
                        issue.create_comment(duplicate_text)
                        # labeling
                        issue.add_to_labels("duplicate")
                        issue.add_to_labels("triage")
Example #14
    def company_deduplicate(self):
        optp = optparse.OptionParser()
        optp.add_option(
            '-v',
            '--verbose',
            dest='verbose',
            action='count',
            help='Increase verbosity (specify multiple times for more)')
        (opts, args) = optp.parse_args()
        log_level = logging.WARNING
        if opts.verbose:
            if opts.verbose == 1:
                log_level = logging.INFO
            elif opts.verbose >= 2:
                log_level = logging.DEBUG
        logging.getLogger().setLevel(log_level)

        sql_data = 'SELECT CompanyID, CompanyName,Website,Email,Phone FROM Reporting.DimCompany'
        sql = 'SELECT C.CompanyID, C.CompanyName, C.Website,L.Address1,Y.[Name] AS [CityName],L.PostalCode ' \
              'FROM Reporting.DimCompany C LEFT JOIN [Reporting].[DimCompanyLocation] L ON L.CompanyID = C.CompanyID ' \
              'LEFT JOIN [Reporting].[DimCity] Y ON Y.CityID = L.CityID WHERE C.CompanyName IS NOT NULL'

        if os.path.exists(self.setting_file):
            print('reading from {}'.format(self.setting_file))
            with open(self.setting_file, 'rb') as sf:
                deduper = dedupe.StaticDedupe(sf, num_cores=4)

        fields = [{
            'field': 'CompanyID',
            'variable name': 'CompanyID',
            'type': 'String'
        }, {
            'field': 'CompanyName',
            'variable name': 'CompanyName',
            'type': 'Exists'
        }, {
            'field': 'Website',
            'variable name': 'Website',
            'type': 'String',
            'has missing': True
        }, {
            'field': 'Email',
            'variable name': 'Email',
            'type': 'String',
            'has missing': True
        }, {
            'field': 'Phone',
            'variable name': 'Phone',
            'type': 'String',
            'has missing': True
        }, {
            'type': 'Interaction',
            'interaction variables': ['CompanyName', 'Website']
        }, {
            'type': 'Interaction',
            'interaction variables': ['CompanyID', 'Email']
        }]

        deduper = dedupe.Dedupe(fields, num_cores=4)
        data = self.db.pandas_read(sql_data)
        temp_data = data.to_dict('index')  # iterate rows, not column labels
        deduper.sample(temp_data, sample_size=10000)
        del temp_data

        if os.path.exists(self.training_file):
            print('reading labeled examples from {}'.format(
                self.training_file))
            with open(self.training_file) as tf:
                deduper.readTraining(tf)

        print('start active labeling...')
        # ACTIVE LEARNING
        dedupe.convenience.consoleLabel(deduper)

        with open(self.training_file, 'w') as tf:
            deduper.writeTraining(tf)

        deduper.train(recall=0.90)

        with open(self.setting_file, 'wb') as sf:
            deduper.writeSettings(sf)

        deduper.cleanupTraining()

        # BLOCKING
        print('blocking...')

        print('creating blocking_map database')
        self.db.execute('DROP TABLE IF EXISTS blocking_map')
        self.db.execute('CREATE TABLE blocking_map '
                        '(block_key VARCHAR(200), company_id INTEGER) '
                        'CHARACTER SET utf8 COLLATE utf8_unicode_ci')
        print('creating inverted index')

        for field in deduper.blocker.index_fields:
            df = self.db.pandas_read(
                'SELECT DISTINCT {} FROM Staging.DimCompany WHERE {} IS NOT NULL'
                .format(field, field))
            field_data = (row[0] for row in df.itertuples(index=False))
            deduper.blocker.index(field_data, field)

        print('writing blocking map')

        d_company = self.db.pandas_read(sql)
        full_data = ((row['CompanyID'], dict(row)) for _, row in d_company.iterrows())
        b_data = deduper.blocker(full_data)

        val = list(b_data)

        self.db.bulk_insert('INSERT INTO blocking_map VALUES (?,?)', val)

        deduper.blocker.resetIndices()

        # PREPARE BLOCKING TABLE

        print('preparing blocking table... this takes a while')
        logging.info('indexing block_key')
        self.db.execute(
            'ALTER TABLE blocking_map ADD UNIQUE INDEX(block_key, company_id)')

        self.db.execute('DROP TABLE IF EXISTS plural_key')
        self.db.execute('DROP TABLE IF EXISTS plural_block')
        self.db.execute('DROP TABLE IF EXISTS covered_blocks')
        self.db.execute('DROP TABLE IF EXISTS smaller_coverage')

        logging.info('calculating plural_key')
        self.db.execute(
            'CREATE TABLE plural_key (block_key VARCHAR(200), block_id INTEGER UNSIGNED AUTO_INCREMENT, '
            'PRIMARY KEY (block_id)) (SELECT MIN(block_key) '
            'FROM  ('
            'SELECT block_key,GROUP_CONCAT(donor_id ORDER BY donor_id) AS block  '
            'FROM blocking_map  GROUP BY block_key HAVING COUNT(*) > 1'
            ') AS blocks '
            'GROUP BY block)')

        logging.info('creating block_key index')
        self.db.execute(
            'CREATE UNIQUE INDEX block_key_idx ON plural_key (block_key)')

        logging.info("calculating plural_block")
        self.db.execute(
            'CREATE TABLE plural_block ('
            'SELECT block_id, donor_id FROM blocking_map INNER JOIN plural_key  USING (block_key))'
        )

        logging.info("adding donor_id index and sorting index")
        self.db.execute(
            'ALTER TABLE plural_block ADD INDEX (donor_id), ADD UNIQUE INDEX (block_id, donor_id)'
        )

        self.db.execute('SET group_concat_max_len = 2048')

        logging.info("creating covered_blocks")
        self.db.execute(
            'CREATE TABLE covered_blocks ('
            'SELECT donor_id, GROUP_CONCAT(block_id ORDER BY block_id) AS sorted_ids  '
            'FROM plural_block  GROUP BY donor_id)')

        self.db.execute(
            "CREATE UNIQUE INDEX donor_idx ON covered_blocks (donor_id)")

        logging.info("creating smaller_coverage")
        self.db.execute(
            'CREATE TABLE smaller_coverage ('
            'SELECT donor_id, block_id,  TRIM(\',\' '
            'FROM SUBSTRING_INDEX(sorted_ids, block_id, 1)) AS smaller_ids  '
            'FROM plural_block INNER JOIN covered_blocks  USING (donor_id))')

        # CORRECT THIS SQL STATEMENT TO CORRECT THE ATTRIBUTES
        c_data = self.db.execute(
            'SELECT company_id, city, name, zip, state, address, occupation, '
            'employer, person, block_id, smaller_ids '
            'FROM smaller_coverage INNER JOIN processed_donors '
            'USING (company_id) ORDER BY (block_id)')

        print('clustering...')
        clustered_dupes = deduper.matchBlocks(self.candidates_gen(c_data),
                                              threshold=0.5)

        self.db.execute('DROP TABLE IF EXISTS entity_map')

        # WRITING OUT RESULTS

        print('creating entity_map database')
        self.db.execute(
            'CREATE TABLE entity_map ('
            'donor_id INTEGER, canon_id INTEGER,  cluster_score FLOAT, PRIMARY KEY(donor_id))'
        )

        for cluster, scores in clustered_dupes:
            cluster_id = cluster[0]
            for donor_id, score in zip(cluster, scores):
                self.db.execute('INSERT INTO entity_map VALUES (%s, %s, %s)',
                                (donor_id, cluster_id, score))

        self.db.execute("CREATE INDEX head_index ON entity_map (canon_id)")

        print('# of duplicate sets are: {}'.format(len(clustered_dupes)))
        self.file.df_to_excel(clustered_dupes, 'Clustered Companies')
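
candidates_gen is a method of the surrounding class that is not shown. A rough sketch modeled on the classic dedupe MySQL example, assuming c_data yields dict-like rows ordered by block_id and keyed by company_id, block_id, and smaller_ids:

def candidates_gen(result_set):
    # Sketch only: group rows by block_id and yield matchBlocks-style record lists.
    block_id = None
    records = []
    for row in result_set:
        if row['block_id'] != block_id:
            if records:
                yield records
            block_id = row['block_id']
            records = []
        smaller_ids = set(row['smaller_ids'].split(',')) if row['smaller_ids'] else set()
        records.append((row['company_id'], row, smaller_ids))
    if records:
        yield records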
Example #15
def deduping_function(fields, training_file, settings_file, input_file,
                      output_file, input_dict):
    """
    Function to dedupe rows given a
    fields: list of dictionaries of fields relevant for the training
    training_file: json with training set if the function was already run
    settings_file: file with settings if the function was already run
    input_file: CSV with values to dedup
    output_file: CSV where results are going to be written
    input_dict: dictionary with keys as id and values as dictionary of features for each row
    
    Result is to write the output_file
    """
    if os.path.exists(settings_file):
        print('reading from {}'.format(settings_file))
        with open(settings_file, 'rb') as f:
            deduper = dedupe.StaticDedupe(f)
    else:
        deduper = dedupe.Dedupe(fields)
        deduper.sample(input_dict)
        if os.path.exists(training_file):
            print('reading labeled examples from', training_file)
            with open(training_file) as tf:
                deduper.readTraining(tf)

        print('starting active labeling...')
        dedupe.consoleLabel(deduper)
        deduper.train()

        print('writing training file')
        with open(training_file, 'w') as tf:
            deduper.writeTraining(tf)
        print('writing settings file')
        with open(settings_file, 'wb') as sf:
            deduper.writeSettings(sf)

    print('blocking...')
    threshold = deduper.threshold(input_dict, recall_weight=2)
    print('clustering...')
    clustered_dupes = deduper.match(input_dict, threshold)
    print('# duplicate sets {}'.format(len(clustered_dupes)))

    cluster_membership = {}
    cluster_id = 0
    for (cluster_id, cluster) in enumerate(clustered_dupes):
        id_set, scores = cluster
        cluster_d = [input_dict[c] for c in id_set]
        canonical_rep = dedupe.canonicalize(cluster_d)
        for record_id, score in zip(id_set, scores):
            cluster_membership[record_id] = {
                "cluster id": cluster_id,
                "canonical representation": canonical_rep,
                "confidence": score
            }

    singleton_id = cluster_id + 1

    with open(output_file, 'w') as f_output:
        writer = csv.writer(f_output)

        with open(input_file) as f_input:
            reader = csv.reader(f_input)

            heading_row = next(reader)
            heading_row.insert(0, 'confidence_score')
            heading_row.insert(0, 'Cluster ID')
            canonical_keys = canonical_rep.keys()
            for key in canonical_keys:
                heading_row.append('canonical_' + key)

            writer.writerow(heading_row)

            for row in reader:
                row_id = int(row[0])
                if row_id in cluster_membership:
                    cluster_id = cluster_membership[row_id]["cluster id"]
                    canonical_rep = cluster_membership[row_id][
                        "canonical representation"]
                    row.insert(0, cluster_membership[row_id]['confidence'])
                    row.insert(0, cluster_id)
                    for key in canonical_keys:
                        row.append(canonical_rep[key])
                else:
                    row.insert(0, None)
                    row.insert(0, singleton_id)
                    singleton_id += 1
                    for key in canonical_keys:
                        row.append(None)
                writer.writerow(row)

    print('deduping successfully finished')
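
A hypothetical driver for deduping_function(), assuming a CSV whose first column holds an integer id (the function reads row[0] back as an int when writing the output); the field definitions and file names are illustrative:

import csv

fields = [
    {'field': 'name', 'type': 'String'},
    {'field': 'address', 'type': 'String', 'has missing': True},
]

input_file = 'companies.csv'
input_dict = {}
with open(input_file, encoding='utf8') as f:
    reader = csv.DictReader(f)
    id_column = reader.fieldnames[0]  # assumed to hold an integer id
    for row in reader:
        input_dict[int(row[id_column])] = row

deduping_function(fields, 'dedupe_training.json', 'dedupe_learned_settings',
                  input_file, 'companies_deduped.csv', input_dict)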
Example #16
        {
            'field': 'enddate',
            'type': 'DateTime',
            'dayfirst': True
        }
    ]

    start_time = time.time()
    debugger_setup()

    # if settings file exists
    if os.path.exists(tender_settings_file):
        # load from settings file
        print('reading from', tender_settings_file)
        with open(tender_settings_file) as sf:
            deduper = dedupe.StaticDedupe(sf)

        # construct results table
        create_table(results_table, results_schema, output_fields)
        # start the cluster_id at 0
        current_index = 0

        # loop over different dates
        for date_range in select_date_ranges:

            # construct the query for getting data to be deduped
            select_query = construct_query(input_fields, data_source,
                                           date_range, select_country)

            # get data from the database using the query
            selected_data = fetch_data(select_query)
Example #17
start_time = time.time()
with open(opts.CONFIG_PATH) as f:
    config = json.load(f)
con = psy.connect(cursor_factory=psycopg2.extras.RealDictCursor, **config)

c = con.cursor()

FIELD_NAMES = "entry_id, hash_ssn, hash_fname, hash_lname, dob, race, sex, block2010id"
INDIVIDUAL_SELECT = "SELECT %s FROM dedupe.entries" % FIELD_NAMES

# ## Training

if False:  #os.path.exists(settings_file):
    print('reading from ', settings_file)
    with open(settings_file, 'rb') as sf:
        deduper = dedupe.StaticDedupe(sf, num_cores=4)
else:

    # Define the fields dedupe will pay attention to
    #
    # The address, city, and zip fields are often missing, so we'll
    # tell dedupe that, and we'll learn a model that takes that into
    # account
    fields = [
        {
            'field': 'hash_ssn',
            'variable name': 'hash_ssn',
            'type': 'Exact',
            'has missing': True
        },
        {
Example #18
    cur.execute(DONOR_SELECT)
    for row in cur:
        temp_d[int(row[id_column])] = dedupe.core.frozendict(row)

    def random_pair_generator():
        for k1, k2 in random_pairs:
            yield (temp_d[k1], temp_d[k2])

    return tuple(pair for pair in random_pair_generator())


# ## Training

if os.path.exists(settings_file):
    print 'reading from ', settings_file
    deduper = dedupe.StaticDedupe(settings_file, num_processes=2)
else:

    # Select a large sample of duplicate pairs.  As the dataset grows,
    # duplicate pairs become relatively more rare so we have to take a
    # fairly large sample compared to `csv_example.py`
    print 'selecting random sample from donors table...'
    data_sample = getSample(c, 750000, 'donor_id', 'donors')

    # Define the fields dedupe will pay attention to
    #
    # The address, city, and zip fields are often missing, so we'll
    # tell dedupe that, and we'll learn a model that takes that into
    # account
    fields = {
        'name': {
Example #19
def deDuplication():

    input_file = 'csv_example_input.csv'
    output_file = 'csv_example_output.csv'
    settings_file = 'csv_example_learned_settings3'
    training_file = 'csv_example_training.json3'

    def preProcess(column):

        try:
            column = column.decode('utf-8')
        except AttributeError:
            pass
        column = unidecode(column)
        column = re.sub(' +', ' ', column)
        column = re.sub('\n', ' ', column)
        column = column.strip().strip('"').strip("'").lower().strip()

        if not column:
            column = None
        return column

    # Read in the data from CSV file:
    def readData(filename):

        data_d = {}
        with open(filename, encoding="utf-8") as f:
            reader = csv.DictReader(f)
            for row in reader:
                clean_row = [(k, preProcess(v)) for (k, v) in row.items()]
                row_id = row['id']
                data_d[row_id] = dict(clean_row)

        return data_d

    print('importing data ...')
    data_d = readData(input_file)

    if os.path.exists(settings_file):
        print('reading from', settings_file)
        with open(settings_file, 'rb') as f:
            deduper = dedupe.StaticDedupe(f)
    else:
        fields = [
            {
                'field': 'DisplayName_Processed',
                'type': 'String'
            },
            {
                'field': 'Email_Processed',
                'type': 'String'
            },
            {
                'field': 'Info',
                'type': 'String'
            },
        ]
        deduper = dedupe.Dedupe(fields)
        deduper.sample(data_d, 15000)

        if os.path.exists(training_file):
            print('reading labeled examples from ', training_file)
            with open(training_file, 'rb') as f:
                deduper.readTraining(f)

        print('starting active labeling...')

        dedupe.consoleLabel(deduper)

        deduper.train()

        with open(training_file, 'w') as tf:
            deduper.writeTraining(tf)

        with open(settings_file, 'wb') as sf:
            deduper.writeSettings(sf)

    threshold = deduper.threshold(data_d, recall_weight=1)

    print('clustering...')
    clustered_dupes = deduper.match(data_d, threshold)

    print('# duplicate sets', len(clustered_dupes))

    cluster_membership = {}
    cluster_id = 0
    for (cluster_id, cluster) in enumerate(clustered_dupes):
        id_set, scores = cluster
        cluster_d = [data_d[c] for c in id_set]
        canonical_rep = dedupe.canonicalize(cluster_d)
        for record_id, score in zip(id_set, scores):
            cluster_membership[record_id] = {
                "cluster id": cluster_id,
                "canonical representation": canonical_rep,
                "confidence": score
            }

    singleton_id = cluster_id + 1

    with open(output_file, 'w',
              encoding="utf-8") as f_output, open(input_file,
                                                  encoding="utf-8") as f_input:
        writer = csv.writer(f_output)
        reader = csv.reader(f_input)

        heading_row = next(reader)
        heading_row.insert(0, 'confidence_score')
        heading_row.insert(0, 'Cluster ID')
        canonical_keys = canonical_rep.keys()
        for key in canonical_keys:
            heading_row.append('canonical_' + key)

        writer.writerow(heading_row)

        for row in reader:
            row_id = row[0]
            if row_id in cluster_membership:
                cluster_id = cluster_membership[row_id]["cluster id"]
                canonical_rep = cluster_membership[row_id][
                    "canonical representation"]
                row.insert(0, cluster_membership[row_id]['confidence'])
                row.insert(0, cluster_id)
                for key in canonical_keys:
                    row.append(canonical_rep[key])
            else:
                row.insert(0, None)
                row.insert(0, singleton_id)
                singleton_id += 1
                for key in canonical_keys:
                    row.append(None)
            writer.writerow(row)
    return clustered_dupes
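
A note on the API used above: deduper.threshold() followed by deduper.match() is the dedupe 1.x interface. On dedupe 2.x the two calls are folded into a single partition() step; a minimal sketch of the equivalent call, assuming the same data_d dictionary:

    # dedupe 2.x: partition() scores, thresholds and clusters in one call
    clustered_dupes = deduper.partition(data_d, threshold=0.5)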
Example #20
def _train(settings_file, training_file, data, field_properties, sample_size,
           update_model):
    """Internal method that trains the deduper model from scratch or update
        an existing dedupe model.
        Parameters
        ----------
        settings_file : str
            A path to a settings file that will be loaded if it exists.
        training_file : str
            A path to a training file that will be loaded to keep training
            from.
        data : dict
            The dictionary form of the dataframe that dedupe requires.
        field_properties : dict
            The mapping of fields to their respective data types. Please
            see the dedupe documentation for further details.
        sample_size : float, default 0.3
            Specify the sample size used for training as a float from 0 to 1.
            By default it is 30% (0.3) of our data.
        update_model : bool, default False
            If True, allows the user to update an existing model by uploading
            a training file.
        Returns
        -------
        dedupe.Dedupe
            A dedupe model instance.
    """
    # Define the fields dedupe will pay attention to
    fields = []
    select_fields(fields, field_properties)

    if not update_model:

        # If a settings file already exists, we'll just load that and skip training
        if os.path.exists(settings_file):
            print('Reading from', settings_file)
            with open(settings_file, 'rb') as f:
                deduper = dedupe.StaticDedupe(f, num_cores=None)

        #Create a new deduper object and pass our data model to it.
        else:
            # Initialise dedupe
            deduper = dedupe.Dedupe(fields, num_cores=None)

            # Launch active learning
            deduper = _active_learning(data, sample_size, deduper,
                                       training_file, settings_file)

    else:
        # ## Training
        # Initialise dedupe
        deduper = dedupe.Dedupe(fields, num_cores=None)

        # Import existing model
        print('Reading labeled examples from ', training_file)
        with open(training_file, 'rb') as f:
            deduper.prepare_training(data, training_file=f)

        # Launch active learning
        deduper = _active_learning(data, sample_size, deduper, training_file,
                                   settings_file)

    return deduper
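
The select_fields and _active_learning helpers referenced above are not shown in this example. For orientation, a minimal sketch of what select_fields might do, assuming field_properties is a list of plain column names or (name, type) tuples; the exact behaviour in the original module may differ:

    def select_fields(fields, field_properties):
        # Hypothetical helper: turn simple column specs into dedupe field definitions
        for prop in field_properties:
            if isinstance(prop, str):
                fields.append({'field': prop, 'type': 'String'})
            else:
                name, field_type = prop
                fields.append({'field': name, 'type': field_type})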
Example #21
def start_dedupe(df):
    # If a settings file already exists, we'll just load that and skip training
    if os.path.exists(settings_file):
        print('reading from', settings_file)
        with open(settings_file, 'rb') as f:
            deduper = dedupe.StaticDedupe(f)

    else:
        # ## Training

        # Define the fields dedupe will pay attention to
        fields = [
            {
                'field': 'MESSZEIT',
                'type': 'Exact'
            },
            {
                'field': 'KENNUNG',
                'type': 'Exact'
            },
            {
                'field': 'GEOGR_BREITE',
                'type': 'Custom',
                'comparator': sim_num_abs
            },
            {
                'field': 'GEOGR_LAENGE',
                'type': 'Custom',
                'comparator': sim_num_abs
            },
            {
                'field': 'HORIZONTALE_SICHT',
                'type': 'ShortString',
                'has missing': True
            },
            {
                'field': 'WETTER',
                'type': 'Custom',
                'comparator': sim_ww,
                'has missing': True
            },
        ]

        # Create a new deduper object and pass our data model to it.
        deduper = dedupe.Dedupe(fields)

        # To train dedupe, we feed it a sample of records.
        deduper.sample(df, 15000, .5)

        # If we have training data saved from a previous run of dedupe,
        # look for it and load it in.
        # __Note:__ if you want to train from scratch, delete the training_file
        if os.path.exists(training_file):
            print('reading labeled examples from ', training_file)
            with open(training_file, 'rb') as f:
                deduper.readTraining(f)

        # ## Active learning
        # Dedupe will find the next pair of records
        # it is least certain about and ask you to label them as duplicates
        # or not.
        # use 'y', 'n' and 'u' keys to flag duplicates
        # press 'f' when you are finished
        print('starting active labeling...')

        dedupe.consoleLabel(deduper)

        # Using the examples we just labeled, train the deduper and learn
        # blocking predicates
        deduper.train()

        # When finished, save our training to disk
        with open(training_file, 'w') as tf:
            deduper.writeTraining(tf)

        # Save our weights and predicates to disk.  If the settings file
        # exists, we will skip all the training and learning next time we run
        # this file.
        with open(settings_file, 'wb') as sf:
            deduper.writeSettings(sf)

    threshold = deduper.threshold(df, recall_weight=1)
    print(f'threshold: {threshold}')

    print('clustering...')
    # clustered_dupes = deduper.match(df, .98)
    clustered_dupes = deduper.match(df, threshold)

    print(f'# duplicate sets {len(clustered_dupes)}')
    return clustered_dupes
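
The custom comparators sim_num_abs and sim_ww used in the field definitions are not included in this snippet. A dedupe 'Custom' comparator is simply a function that takes two field values and returns a number; a minimal sketch of an absolute-difference comparator (the name and behaviour here are assumptions, not the original implementation):

    def sim_num_abs(field_1, field_2):
        # Hypothetical comparator: absolute numeric distance between the two values
        try:
            return abs(float(field_1) - float(field_2))
        except (TypeError, ValueError):
            # Non-numeric or missing values: treat as maximally uninformative
            return 0.0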
Example #22
        for row in reader:
            clean_row = [(k, preProcess(v)) for (k, v) in row.items()]
            row_id = int(row['Id'])
            data_d[row_id] = dict(clean_row)

    return data_d


print 'importing data ...'
data_d = readData(input_file)

# ## Training

if os.path.exists(settings_file):
    print 'reading from', settings_file
    deduper = dedupe.StaticDedupe(settings_file)

else:
    # Define the fields dedupe will pay attention to
    #
    # Notice how we are telling dedupe to use a custom field comparator
    # for the 'Zip' field.
    fields = {
        'Site name': {
            'type': 'String'
        },
        'Address': {
            'type': 'String'
        },
        'Zip': {
            'type': 'Custom',
Example #23
def dedupe_dataframe(df, field_properties, canonicalize=False, config_name="dedupe_dataframe"):
    # Import Data
    print('running CUSTOM dedupe_dataframe LOCAL')
    config_name = config_name.replace(" ", "_")
    
    settings_file = config_name + '_learned_settings'
    training_file = config_name + '_training.json'

    print('importing data ...')

    df = clean_punctuation(df)
    
    specify_type(df, field_properties)                
    
    df['dictionary'] = df.apply(lambda x: dict(zip(df.columns,x.tolist())), axis=1)
    data_d = dict(zip(df.index,df.dictionary))
    for col in df.columns:
      print("df col - - - ", col)
    print(' df[dictionary].. shape', df.shape)
    # If a settings file already exists, we'll just load that and skip training
    if os.path.exists(settings_file):
        print('reading from', settings_file)
        with open(settings_file, 'rb') as f:
            deduper = dedupe.StaticDedupe(f)
    else:
        # ## Training
        # Define the fields dedupe will pay attention to
        
        fields = []
        select_fields(fields, field_properties)
        # print("fields", fields)
        # Create a new deduper object and pass our data model to it.
        deduper = dedupe.Dedupe(fields)

        # To train dedupe, we feed it a sample of records.
        deduper.sample(data_d, 15000)

        # If we have training data saved from a previous run of dedupe,
        # look for it and load it in.
        # __Note:__ if you want to train from scratch, delete the training_file
        if os.path.exists(training_file):
            print('reading labeled examples from ', training_file)
            with open(training_file, 'rb') as f:
                deduper.readTraining(f)
        
        # print('starting active labeling...')
        dedupe.consoleLabel(deduper)

        # Using the examples we just labeled, train the deduper and learn
        # blocking predicates
        print('calling train...')
        deduper.train()

        # When finished, save our training to disk
        with open(training_file, 'w') as tf:
            deduper.writeTraining(tf)

        # Save our weights and predicates to disk.  If the settings file
        # exists, we will skip all the training and learning next time we run
        # this file.
        with open(settings_file, 'wb') as sf:
            deduper.writeSettings(sf)
        print('saved our training to disk.')
    # ## Set threshold

    threshold = deduper.threshold(data_d, recall_weight=1)
    print('threshold=', threshold)
    # ## Clustering

    print('clustering...')
    clustered_dupes = deduper.match(data_d, threshold)

    print('# duplicate sets', len(clustered_dupes))

    # Convert data_d values to strings so that Price & LatLong fields won't raise a traceback during dedupe.canonicalize()
    
    for i in data_d.values():
        for key in i:
            if i[key] is not None:
                i[key] = str(i[key])
            
    # ## Writing Results

    cluster_membership = {}
    cluster_id = 0
    for (cluster_id, cluster) in enumerate(clustered_dupes):
        id_set, scores = cluster
        cluster_d = [data_d[c] for c in id_set]
        canonical_rep = dedupe.canonicalize(cluster_d)
        for record_id, score in zip(id_set, scores):
            cluster_membership[record_id] = {
                "cluster id" : cluster_id,
                "canonical representation" : canonical_rep,
                "confidence": score
            }

    cluster_index=[]
    for i in cluster_membership.items():
        cluster_index.append(i)

    # turn results into dataframe
    dfa = pd.DataFrame(cluster_index)

    dfa.rename(columns={0: 'Id index'}, inplace=True)

    canonical_list=[]
    
    if canonicalize == True:
        for i in dfa[1][0]['canonical representation'].keys():
            canonical_list.append(i)
            dfa[i + ' - ' + 'canonical'] = None
            dfa[i + ' - ' + 'canonical'] = dfa[1].apply(lambda x: x['canonical representation'][i])
    elif canonicalize == False:
        pass
    elif isinstance(canonicalize, list):
        for i in canonicalize:
            dfa[i + ' - ' + 'canonical'] = None
            dfa[i + ' - ' + 'canonical'] = dfa[1].apply(lambda x: x['canonical representation'][i])

    dfa.set_index('Id index', inplace=True)
  
    df = df.join(dfa)

    return df
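
A hedged usage sketch for the helper above, assuming a pandas DataFrame whose columns match the field_properties you pass in (the column names and config name here are illustrative only). Note that on a first run, with no saved settings file, this will launch the interactive console labeling loop:

    import pandas as pd

    people = pd.DataFrame({
        'name': ['Jane Doe', 'Jane  Doe', 'John Smith'],
        'address': ['12 Main St', '12 Main Street', '99 Oak Ave'],
    })
    # Returns the input frame joined with cluster ids and confidence scores
    deduped = dedupe_dataframe(people, ['name', 'address'], canonicalize=True,
                               config_name='people_dedupe')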
Example #24
def train(con, config):
    """Trains or retrieves a previously trained Dedupe object

    If config['use_saved_model'] is set, it will read the saved model from
    config['settings_file'] instead of training.

    If config['seed'] is set, both random.seed and numpy.random.seed will be set so
    the run will be deterministic given configuration.

    If config['prompt_for_labels'] is set, the console will be used to ask the user to
    help label examples

    Args:
        con (psycopg2.connection)
        config (dict) configuration options for a deduping run. Expected to have defaults applied

    Returns: (dedupe.Dedupe or dedupe.StaticDedupe)
    """
    if config['seed'] is not None:
        if os.environ.get('PYTHONHASHSEED', 'random') == 'random':
            raise EnvironmentError(
                """dedupe is only deterministic with hash randomization disabled.
                Set the PYTHONHASHSEED environment variable to a constant.""")
        random.seed(config['seed'])
        numpy.random.seed(config['seed'])
    if config['use_saved_model']:
        logging.info('reading saved model from %s', config['settings_file'])
        with open(config['settings_file'], 'rb') as sf:
            return dedupe.StaticDedupe(sf, num_cores=config['num_cores'])
    # Create a new deduper object and pass our data model to it.
    deduper = dedupe.Dedupe(config['all_fields'],
                            num_cores=config['num_cores'])

    module_name, class_name = config['classifier'].rsplit(".", 1)
    module = importlib.import_module(module_name)
    cls = getattr(module, class_name)
    deduper.classifier = cls(**config['hyperparameters'])

    # Named cursor runs server side with psycopg2
    cur = con.cursor('individual_select')

    cur.execute("""SELECT {all_columns}
                   FROM {schema}.entries_unique
                   ORDER BY _unique_id""".format(**config))
    temp_d = dict((i, row) for i, row in enumerate(cur))

    num_records = 75000
    logging.info('Creating sample of %s records', num_records)
    deduper.sample(temp_d, num_records)

    del temp_d
    # If we have training data saved from a previous run of dedupe,
    # look for it and load it in.
    #
    # __Note:__ if you want to train from
    # scratch, delete the training_file
    if os.path.exists(config['training_file']):
        logging.info('reading labeled examples from %s',
                     config['training_file'])
        with open(config['training_file']) as tf:
            deduper.readTraining(tf)

    if config['prompt_for_labels']:
        # ## Active learning
        logging.info('starting active labeling...')
        # Starts the training loop. Dedupe will find the next pair of records
        # it is least certain about and ask you to label them as duplicates
        # or not.

        # use 'y', 'n' and 'u' keys to flag duplicates
        # press 'f' when you are finished
        dedupe.convenience.consoleLabel(deduper)
        # When finished, save our labeled, training pairs to disk
        with open(config['training_file'], 'w') as tf:
            deduper.writeTraining(tf)

    # `recall` is the proportion of true dupe pairs that the learned
    # rules must cover. You may want to reduce this if you are making
    # too many blocks and too many comparisons.
    deduper.train(recall=config['recall'])

    with open(config['settings_file'], 'wb') as sf:
        deduper.writeSettings(sf)

    return deduper
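
For reference, an illustrative configuration for the train() function above; the keys are the ones the function reads, while every value shown here is an assumption:

    config = {
        'seed': 42,
        'use_saved_model': False,
        'settings_file': 'dedupe_settings',
        'training_file': 'dedupe_training.json',
        'num_cores': 2,
        'all_fields': [{'field': 'name', 'type': 'String'},
                       {'field': 'address', 'type': 'String', 'has missing': True}],
        'classifier': 'sklearn.linear_model.LogisticRegression',
        'hyperparameters': {'C': 1.0},
        'prompt_for_labels': True,
        'recall': 0.9,
        'schema': 'dedupe',
        'all_columns': '_unique_id, name, address',
    }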
Example #25
def eval_lightgbm_dedupe(dataset_type: str, dataset_rf, dataset_dedupe):
    # Create deduper model
    with open('../../trained_models/combined_2/trained_{}_settings.json'.format(dataset_type), 'rb') as fin:
        deduper = dedupe.StaticDedupe(fin, num_cores=NUM_CORES)

    cols = ['name', 'brand', 'size', 'product_type']

    to_dedupe = dataset_dedupe[cols]
    to_dedupe_dict = to_dedupe.to_dict(orient='index')

    # Cluster (prediction stage)
    clustered_dupes = deduper.partition(to_dedupe_dict, partition_threshold[dataset_type])
    print('# duplicate sets', len(clustered_dupes))

    # Create the record linkage model
    # Indexer
    indexer = rl.Index()
    indexer.add(rl.index.Block('product_type'))
    indexer.add(rl.index.Block('brand'))
    indexer.add(rl.index.Block('size'))
    candidate_links = indexer.index(dataset_rf)

    # Comparing
    compare_cl = rl.Compare(n_jobs=-1)
    compare_cl.exact('brand', 'brand')
    compare_cl.exact('product_type', 'product_type')
    compare_cl.exact('size', 'size')
    compare_cl.string('name', 'name', method='qgram')
    compare_cl.string('name', 'name', method='damerau_levenshtein')
    compare_cl.string('name', 'name', method='levenshtein')
    compare_cl.string('name', 'name', method='jarowinkler')
    compare_cl.string('name', 'name', method='smith_waterman')
    compare_cl.string('name', 'name', method='lcs')
    # compare_cl.string('price', 'price')

    # Features
    features = compare_cl.compute(candidate_links, dataset_rf)

    # Add dedupe features
    features['xy_same_entity'] = pd.Series(np.zeros(len(features)))
    features.xy_same_entity = 0.0

    # Save the result
    for el in clustered_dupes:
        for i in range(len(el[0])):
            for j in range(i + 1, len(el[0])):
                k = (el[0][i], el[0][j])
                r_k = (el[0][j], el[0][i])
                p = el[1][i] * el[1][j]

                if k in features.index:
                    features.loc[k, 'xy_same_entity'] = p

                if r_k in features.index:
                    features.loc[r_k, 'xy_same_entity'] = p

    # Now load the lightgbm
    bst = lgb.Booster(model_file='../../trained_models/combined_2/x4_lgb_classifier.txt')

    # Predict
    confs = bst.predict(features)
    features['label'] = confs

    # Save the csv file
    # Now export the left and right instance ids
    # Save the result
    res = []
    for i in range(len(confs)):
        record = features.iloc[i]
        label = record.label
        if label > 0.5:
            res.append(record.name)

    res_df = pd.DataFrame(res)
    res_df.columns = ['left_instance_id', 'right_instance_id']

    return res_df
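
NUM_CORES and partition_threshold are module-level values that are not part of this snippet; presumably something along the lines of:

    NUM_CORES = 4  # assumption: worker processes for StaticDedupe
    # assumption: per-dataset clustering thresholds keyed by dataset_type
    partition_threshold = {'x4': 0.5}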
Example #26
def main():
    reload(sys)
    sys.setdefaultencoding("utf-8")
    optp = optparse.OptionParser()
    optp.add_option('-v', '--verbose', dest='verbose', action='count',
                    help='Increase verbosity (specify multiple times for more)'
                    )
    (opts, args) = optp.parse_args()
    log_level = logging.WARNING
    if opts.verbose == 1:
        log_level = logging.INFO
    elif opts.verbose >= 2:
        log_level = logging.DEBUG
    logging.getLogger().setLevel(log_level)

  

    settings_file = 'crm_person_deduping_settings'
    training_file = 'crm_person_deduping_training.json'

    start_time = time.time()



    server = "name_of_server"
    user = "******"
    password = "******"
    con = pymssql.connect(server,user,password,"ew_DataMatching", as_dict=True)
    con.autocommit(True)
    c = con.cursor()

    con2 = pymssql.connect(server,user,password,"ew_DataMatching")

    c2 = con2.cursor()
    # c2.execute("SET net_write_timeout = 3600")






    COMPANY_SELECT = "SELECT * from [vr_unmatched_person_remainder]"

    # ## Training

    if os.path.exists(settings_file):
        print 'reading from ', settings_file
        with open(settings_file) as sf :
            deduper = dedupe.StaticDedupe(sf, num_cores=4)
    else:

        # Define the fields dedupe will pay attention to
        fields = [
            {'field': 'company_id', 'variable name': 'company', 'type': 'Exact'},
            {'field': 'fullname', 'variable name': 'fullname', 'type': 'Text',
             'has missing': True},
            {'field': 'lname', 'variable name': 'Surname', 'type': 'String'},
            {'field': 'initial', 'type': 'Exact', 'has missing': True},
            {'field': 'gender', 'type': 'Exact', 'has missing': True},
            {'type': 'Interaction',
             'interaction variables': ['fullname', 'company']},
            # {'type': 'Interaction',
            #  'interaction variables': ['AbbrFullName', 'Gender']},
            {'type': 'Interaction',
             'interaction variables': ['Surname', 'company']}
        ]


        # Create a new deduper object and pass our data model to it.
        deduper = dedupe.Dedupe(fields, num_cores=4)

        # We will sample pairs from the entire donor table for training
        c.execute(COMPANY_SELECT)
        temp_d = dict((i, row) for i, row in enumerate(c))

        print temp_d


        deduper.sample(temp_d,11000)
        del temp_d

        # If we have training data saved from a previous run of dedupe,
        # look for it and load it in.
        #
        # __Note:__ if you want to train from
        # scratch, delete the training_file
        if os.path.exists(training_file):
            print 'reading labeled examples from ', training_file
            with open(training_file) as tf :
                deduper.readTraining(tf)

        # ## Active learning

        print 'starting active labeling...'
        # Starts the training loop. Dedupe will find the next pair of records
        # it is least certain about and ask you to label them as duplicates
        # or not.

        # use 'y', 'n' and 'u' keys to flag duplicates
        # press 'f' when you are finished
        dedupe.convenience.consoleLabel(deduper)

        # Notice our two arguments here
        #
        # `ppc` limits the Proportion of Pairs Covered that we allow a
        # predicate to cover. If a predicate puts together a fraction of
        # possible pairs greater than the ppc, that predicate will be removed
        # from consideration. As the size of the data increases, the user
        # will generally want to reduce ppc.
        #
        # `uncovered_dupes` is the number of true dupes pairs in our training
        # data that we are willing to accept will never be put into any
        # block. If true duplicates are never in the same block, we will never
        # compare them, and may never declare them to be duplicates.
        #
        # However, requiring that we cover every single true dupe pair may
        # mean that we have to use blocks that put together many, many
        # distinct pairs that we'll have to expensively compare as well.
        deduper.train(ppc=0.01, uncovered_dupes=5)

        # When finished, save our labeled, training pairs to disk
        with open(training_file, 'w') as tf:
            deduper.writeTraining(tf)
        with open(settings_file, 'wb') as sf:
            deduper.writeSettings(sf)

        # We can now remove some of the memory-hogging objects we used
        # for training
        deduper.cleanupTraining()

    ## Blocking

    print 'blocking...'

    # To run blocking on such a large set of data, we create a separate table
    # that contains blocking keys and record ids
    print 'creating blocking_map database'
    c.execute("IF EXISTS (SELECT * FROM sys.objects WHERE object_id = OBJECT_ID(N'ew_DataMatching.dedupe.blocking_map'))"
   "DROP TABLE ew_DataMatching.dedupe.blocking_map")
    c.execute("CREATE TABLE ew_DataMatching.dedupe.blocking_map "
              "(block_key VARCHAR(200), matching_id INT)"
              )


    # If dedupe learned a TF-IDF blocking rule, we have to take a pass
    # through the data and create TF-IDF canopies. This can take up to an
    # hour
    print 'creating inverted index'


    for field in deduper.blocker.index_fields  :


        c2.execute("SELECT  %s FROM [vr_unmatched_person_remainder]" % field)
        field_data = (row[0] for row in c2)


        deduper.blocker.index(field_data, field)

    # Now we are ready to write our blocking map table by creating a
    # generator that yields unique `(block_key, donor_id)` tuples.
    print 'writing blocking map'

    c.execute(COMPANY_SELECT)


    full_data = ((row['matching_id'], row) for row in c)

   # for line in full_data:
   #       id_number,data = line
   #       if data['gender'] == '':
    #            print id_number

    b_data = deduper.blocker(full_data)



    def dbWriter(sql, rows) :
        conn  = pymssql.connect(server,user,password,"ew_DataMatching")


        cursor = conn.cursor()
        cursor.executemany(sql, rows)
        cursor.close()
        conn.commit()
        conn.close()
    results = dbWriter("INSERT INTO ew_DataMatching.dedupe.blocking_map VALUES (%s, %s)",b_data)

    print 'prepare blocking table. this will probably take a while ...'

    logging.info("indexing block_key")
    c.execute("CREATE INDEX blocking_map_key_idx ON dedupe.blocking_map (block_key)")

    c.execute("IF EXISTS (SELECT * FROM sys.objects WHERE object_id = OBJECT_ID(N'ew_DataMatching.dedupe.[plural_key]')) DROP TABLE ew_DataMatching.dedupe.[plural_key]")
    c.execute("IF EXISTS (SELECT * FROM sys.objects WHERE object_id = OBJECT_ID(N'ew_DataMatching.dedupe.[plural_block]')) DROP TABLE ew_DataMatching.dedupe.[plural_block]")
    c.execute("IF EXISTS (SELECT * FROM sys.objects WHERE object_id = OBJECT_ID(N'ew_DataMatching.dedupe.[covered_blocks]')) DROP TABLE ew_DataMatching.dedupe.[covered_blocks]")
    c.execute("IF EXISTS (SELECT * FROM sys.objects WHERE object_id = OBJECT_ID(N'ew_DataMatching.dedupe.[smaller_coverage]')) DROP TABLE ew_DataMatching.dedupe.[smaller_coverage]")

    # Many block_keys will only form blocks that contain a single
    # record. Since there are no comparisons possible within such a
    # singleton block, we can ignore them.
    logging.info("calculating plural_key")
    c.execute("CREATE TABLE ew_DataMatching.dedupe.plural_key "
              "(block_key VARCHAR(200), "
              " block_id INT identity(1,1), "
              " PRIMARY KEY (block_id)) ")

    c.execute("INSERT INTO dedupe.plural_key(block_key) SELECT block_key FROM ew_DataMatching.dedupe.blocking_map "
               "GROUP BY block_key HAVING COUNT(*) > 1")

    logging.info("creating block_key index")
    c.execute("CREATE UNIQUE INDEX block_key_idx ON ew_DataMatching.dedupe.plural_key (block_key)")

    logging.info("calculating plural_block")
    c.execute("SELECT distinct block_id, matching_id into dedupe.plural_block"
              " FROM ew_DataMatching.dedupe.blocking_map INNER JOIN ew_DataMatching.dedupe.plural_key "
              " on blocking_map.block_key = plural_key.block_key order by block_id")

    logging.info("adding donor_id index and sorting index")
    c.execute("CREATE INDEX matching_id ON dedupe.plural_block (matching_id)")
    c.execute("CREATE UNIQUE INDEX idx_matching_id_2 on dedupe.plural_block (block_id, matching_id)")

    # To use Kolb et al.'s Redundant-Free Comparison scheme, we need to
    # keep track of all the block_ids that are associated with a
    # particular donor record. The original code used MySQL's group_concat
    # function; the code below replaces it for SQL Server.


    logging.info("creating covered_blocks")
    c.execute("SELECT matching_id,"
               "dbo.trim_character(CAST(("
               "SELECT  cast(block_id as varchar)+',' "
               "FROM ew_DataMatching.dedupe.plural_block T WHERE plural_block.matching_id = T.matching_id order by block_id FOR XML PATH(''))as varchar(max)),',') AS sorted_ids into dedupe.covered_blocks FROM ew_DataMatching.dedupe.plural_block "
               "GROUP BY matching_id")

    c.execute("CREATE UNIQUE INDEX donor_idx ON ew_DataMatching.dedupe.covered_blocks (matching_id)")

    # In particular, for every block of records, we need to keep
    # track of a donor record's associated block_ids that are SMALLER than
    # the current block's id. Code converted for SQL Server.
    logging.info("creating smaller_coverage")
    c.execute("SELECT plural_block.matching_id, block_id, "
              "case when charindex(cast(block_id as varchar),sorted_ids,1) = 1 then null else substring(sorted_ids,1,charindex(cast(block_id as varchar),sorted_ids,1)-2) end"
              " AS smaller_ids INTO dedupe.smaller_coverage"
              " FROM ew_DataMatching.dedupe.plural_block INNER JOIN ew_DataMatching.dedupe.covered_blocks "
              " on plural_block.matching_id = covered_blocks.matching_id order by matching_id,block_id")

    con.commit()


    ## Clustering

    def candidates_gen(result_set) :
        lset = set

        block_id = None
        records = []
        i = 0
        for row in result_set :
            if row['block_id'] != block_id :
                if records :
                    yield records

                block_id = row['block_id']
                records = []
                i += 1

                if i % 10000 == 0 :
                    print i, "blocks"
                    print time.time() - start_time, "seconds"

            smaller_ids = row['smaller_ids']

            if smaller_ids :
                smaller_ids = lset(smaller_ids.split(','))



            else :
                smaller_ids = lset([])

            records.append((row['matching_id'], row, smaller_ids))
            #print records


        if records :
            yield records

    c.execute("SELECT MasterMatching.matching_id, fullname,gender, "
              " initial,lname,company_id,"


              " block_id, smaller_ids "
              "FROM dedupe.smaller_coverage "
              "INNER JOIN [vr_unmatched_person_remainder]  MasterMatching"
              " on MasterMatching.matching_id = smaller_coverage.matching_id order by block_id,matching_id")
   # for row in c:
      #  if row["company"] == 'Cheshunt School & Technology College':
       #     print('row = %r' % (row,))

    print 'clustering...'

    # 0.5 is the default threshold

    clustered_dupes = deduper.matchBlocks(candidates_gen(c),
                                          threshold=0.5)

    ## Writing out results

    # We now have a sequence of tuples of donor ids that dedupe believes
    # all refer to the same entity. We write this out onto an entity map
    # table
    c.execute("IF EXISTS (SELECT * FROM sys.objects WHERE object_id = OBJECT_ID(N'ew_DataMatching.dedupe.[company_entity_map]')) DROP TABLE ew_DataMatching.dedupe.[company_entity_map]")

    print 'creating entity_map database'
    c.execute("CREATE TABLE ew_DataMatching.dedupe.company_entity_map "
              "(matching_id INTEGER, canon_id INTEGER, score FLOAT"
              "  PRIMARY KEY(matching_id))")

    for cluster, scores in clustered_dupes:
        cluster_id = cluster[0]

        for matching_id, score in zip(cluster, scores):
            # print cluster_id, matching_id
            c.execute('INSERT INTO ew_DataMatching.dedupe.company_entity_map (matching_id, canon_id, score) VALUES (%s, %s, %s)',
                      (int(matching_id), int(cluster_id), round(score, 2)))

    con.commit()

    c.execute("CREATE INDEX head_index ON ew_DataMatching.dedupe.company_entity_map (canon_id)")
    con.commit()

    # Print out the number of duplicates found
    print '# duplicate sets'
    print len(clustered_dupes)



    print 'ran in', time.time() - start_time, 'seconds'
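
Note that train(ppc=..., uncovered_dupes=...) above targets an older dedupe release; in later versions those arguments were dropped and the blocking trade-off is controlled with a recall argument instead, e.g.:

    # later dedupe releases: require learned predicates to cover this
    # fraction of the labeled duplicate pairs
    deduper.train(recall=0.9)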
Example #27
def _train(settings_file, training_file, data, field_properties, sample_size):
    """Internal method that trains the deduper model if a training file does
    not exist.

        Parameters
        ----------
        settings_file : str
            A path to a settings file that will be loaded if it exists.
        training_file : str
            A path to a training file that will be loaded to keep training
            from.
        data : dict
            The dictionary form of the dataframe that dedupe requires.
        field_properties : dict
            The mapping of fields to their respective data types. Please
            see the dedupe documentation for further details.
        sample_size : float, default 0.3
            Specify the sample size used for training as a float from 0 to 1.
            By default it is 30% (0.3) of our data.

        Returns
        -------
        dedupe.Dedupe
            A dedupe model instance.
    """
    # If a settings file already exists, we'll just load that and skip training
    if os.path.exists(settings_file):
        print('reading from', settings_file)
        with open(settings_file, 'rb') as f:
            deduper = dedupe.StaticDedupe(f)
    else:
        # ## Training
        # Define the fields dedupe will pay attention to
        fields = []
        select_fields(fields, field_properties)

        # Create a new deduper object and pass our data model to it.
        deduper = dedupe.Dedupe(fields)

        # To train dedupe, we feed it a sample of records.
        sample_num = math.floor(len(data) * sample_size)
        deduper.sample(data, sample_num)

        # If we have training data saved from a previous run of dedupe,
        # look for it and load it in.
        # __Note:__ if you want to train from scratch, delete the training_file
        if os.path.exists(training_file):
            print('reading labeled examples from ', training_file)
            with open(training_file, 'rb') as f:
                deduper.readTraining(f)

        print('starting active labeling...')

        dedupe.consoleLabel(deduper)

        # Using the examples we just labeled, train the deduper and learn
        # blocking predicates
        deduper.train()

        # When finished, save our training to disk
        with open(training_file, 'w') as tf:
            deduper.writeTraining(tf)

        # Save our weights and predicates to disk.  If the settings file
        # exists, we will skip all the training and learning next time we run
        # this file.
        with open(settings_file, 'wb') as sf:
            deduper.writeSettings(sf)

    return deduper
def run_dedupe(settings_file, training_file, type):
    start_time = time.time()
    print("dedupe file ", dedupe.__file__)
    # Set the database connection from environment variable using
    # [dj_database_url](https://github.com/kennethreitz/dj-database-url)
    # For example:
    #   export DATABASE_URL=postgres://user:password@host/mydatabase
    db_conf = dj_database_url.config()

    if not db_conf:
        raise Exception(
            'set DATABASE_URL environment variable with your connection, e.g. '
            'export DATABASE_URL=postgres://user:password@host/mydatabase')

    read_con = psycopg2.connect(database=db_conf['NAME'],
                                user=db_conf['USER'],
                                password=db_conf['PASSWORD'],
                                host=db_conf['HOST'],
                                port=db_conf['PORT'],
                                cursor_factory=psycopg2.extras.RealDictCursor)

    write_con = psycopg2.connect(database=db_conf['NAME'],
                                 user=db_conf['USER'],
                                 password=db_conf['PASSWORD'],
                                 host=db_conf['HOST'],
                                 port=db_conf['PORT'])

    # We'll be using variations on this following select statement to pull
    # in campaign donor info.
    if type == 'IND':
        DONOR_SELECT = "SELECT donor_id, city, name, zip, state, street " \
                   "from processed_donors where person = 1 and name not like '%unitem%' "
    else:
        DONOR_SELECT = "SELECT donor_id, city, name, zip, state, street " \
                   "from processed_donors where person != 1 and name not like '%unitem%' "

    # Open Settings

    print('reading from ', settings_file)
    with open(settings_file, 'rb') as sf:
        deduper = dedupe.StaticDedupe(sf, num_cores=1)
    print(f'deduper predicates: {deduper.predicates}')

    blocking_table = 'blocking_map_' + type
    # ## Clustering
    entity_map_table = "entity_map_" + type
    with write_con:
        with write_con.cursor() as cur:
            cur.execute("DROP TABLE IF EXISTS " + entity_map_table)

            print('creating entity_map database')

            cur.execute("CREATE TABLE " + entity_map_table + " "
                        "(donor_id INTEGER, canon_id INTEGER, "
                        " cluster_score FLOAT, PRIMARY KEY(donor_id))")

    read_con1 = psycopg2.connect(database=db_conf['NAME'],
                                 user=db_conf['USER'],
                                 password=db_conf['PASSWORD'],
                                 host=db_conf['HOST'],
                                 port=db_conf['PORT'],
                                 cursor_factory=psycopg2.extras.RealDictCursor)
    with read_con1.cursor(
            'pairs', cursor_factory=psycopg2.extensions.cursor) as read_cur:
        read_cur.execute(
            "SELECT a.donor_id, "
            "row_to_json((SELECT d FROM (SELECT a.city, "
            "a.name, "
            "a.zip, "
            "a.state, "
            "a.street) d)), "
            "b.donor_id, "
            "row_to_json((SELECT d FROM (SELECT b.city, "
            "b.name, "
            "b.zip, "
            "b.state, "
            "b.street) d)) "
            "FROM (SELECT DISTINCT l.donor_id AS east, r.donor_id AS west "
            "FROM " + blocking_table + " AS l "
            "INNER JOIN " + blocking_table + " AS r "
            "USING (block_key) "
            "WHERE l.donor_id < r.donor_id) ids "
            "INNER JOIN processed_donors a ON ids.east=a.donor_id "
            "INNER JOIN processed_donors b ON ids.west=b.donor_id")

        print('clustering...')
        clustered_dupes = deduper.cluster(deduper.score(
            record_pairs(read_cur)),
                                          threshold=0.5)

        # ## Writing out results

        # We now have a sequence of tuples of donor ids that dedupe believes
        # all refer to the same entity. We write this out onto an entity map
        # table

        print('writing results')
        write_con1 = psycopg2.connect(database=db_conf['NAME'],
                                      user=db_conf['USER'],
                                      password=db_conf['PASSWORD'],
                                      host=db_conf['HOST'],
                                      port=db_conf['PORT'])
        with write_con1:
            with write_con1.cursor() as write_cur:
                write_cur.copy_expert('COPY ' + entity_map_table +
                                      ' FROM STDIN WITH CSV',
                                      Readable(cluster_ids(clustered_dupes)),
                                      size=100000)

    with write_con1:
        with write_con1.cursor() as cur:
            cur.execute("CREATE INDEX head_index_" + type + " ON " +
                        entity_map_table + " (canon_id)")

    # Print out the number of duplicates found

    # ## Payoff

    # With all this done, we can now begin to ask interesting questions
    # of the data
    #
    # For example, let's see who the top 10 donors are.

    locale.setlocale(locale.LC_ALL,
                     'en_US.UTF-8')  # for pretty printing numbers
    read_con2 = psycopg2.connect(database=db_conf['NAME'],
                                 user=db_conf['USER'],
                                 password=db_conf['PASSWORD'],
                                 host=db_conf['HOST'],
                                 port=db_conf['PORT'],
                                 cursor_factory=psycopg2.extras.RealDictCursor)
    # save entity map
    entity_map_filename = 'entity_map_' + settings_file.split(
        '/')[-1] + '_' + time.strftime('%d_%m_%y_%H%M',
                                       time.localtime()) + '.csv'
    donors_filename = 'processed_donors_' + settings_file.split(
        '/')[-1] + '_' + time.strftime('%d_%m_%y_%H%M',
                                       time.localtime()) + '.csv'
    with read_con2.cursor() as cur:
        with open(entity_map_filename, 'w') as file_out:
            cur.copy_expert(
                'COPY ' + entity_map_table + ' TO STDOUT WITH CSV HEADER',
                file_out)
        with open(donors_filename, 'w') as file_out:
            cur.copy_expert('COPY processed_donors TO STDOUT WITH CSV HEADER',
                            file_out)

    if type == 'IND':
        top_donor_where = 'donors.corp is null'
    else:
        top_donor_where = 'donors.corp is not null'
    # Create a temporary table so each group and unmatched record has
    # a unique id
    with read_con2.cursor() as cur:
        cur.execute(
            "CREATE TEMPORARY TABLE e_map "
            "AS SELECT COALESCE(canon_id, donor_id) AS canon_id, donor_id "
            "FROM " + entity_map_table + " "
            "RIGHT JOIN donors USING(donor_id)")

        cur.execute(
            "SELECT CONCAT_WS(' ', donors.first_name, donors.last_name, donors.corp) AS name, "
            "donation_totals.totals AS totals "
            "FROM (SELECT * FROM donors WHERE " + top_donor_where +
            ") as donors INNER JOIN "
            "(SELECT canon_id, SUM(CAST(contributions.amount AS FLOAT)) AS totals "
            " FROM contributions INNER JOIN e_map "
            " USING (donor_id) "
            " GROUP BY (canon_id) "
            " ORDER BY totals "
            " DESC) "
            "AS donation_totals ON donors.donor_id=donation_totals.canon_id "
            "WHERE donors.donor_id = donation_totals.canon_id and donation_totals.totals IS NOT NULL LIMIT 10"
        )

        print("Top Donors (deduped)")
        for row in cur:
            row['totals'] = locale.currency(row['totals'], grouping=True)
            print('%(totals)20s: %(name)s' % row)

        # Compare this to what we would have gotten if we hadn't done any
        # deduplication

        cur.execute(
            "SELECT name, totals FROM (SELECT CONCAT_WS(' ', donors.first_name, donors.last_name, donors.corp) AS name, "
            "SUM(CAST(contributions.amount AS FLOAT)) AS totals "
            "FROM donors INNER JOIN contributions "
            "USING (donor_id) "
            "WHERE " + top_donor_where + " "
            "GROUP BY (donor_id) "
            "ORDER BY totals DESC) AS t where totals IS NOT NULL "
            "LIMIT 10")

        print("Top Donors (raw)")
        for row in cur:
            row['totals'] = locale.currency(row['totals'], grouping=True)
            print('%(totals)20s: %(name)s' % row)

        cur.execute(
            " SELECT CONCAT_WS(' ', donors.first_name, donors.last_name, donors.corp) AS name, "
            " cluster_size, cluster_id "
            " FROM (SELECT * FROM donors WHERE " + top_donor_where +
            ") as donors "
            " INNER JOIN (SELECT count(*) AS cluster_size, canon_id AS cluster_id "
            " FROM " + entity_map_table + " "
            " GROUP BY canon_id) "
            " AS cluster_totals "
            " ON donors.donor_id = cluster_totals.cluster_id "
            " ORDER BY cluster_size DESC LIMIT 10")
        # ##Print stats that are saved to match_runs table
        # Biggest cluster first
        print("Biggest Clusters")
        biggest_name = ''
        for row in cur:
            print('%(cluster_size)20s: %(name)s' % row)
            if biggest_name == '':
                biggest_name = row['name'] + ':' + str(row['cluster_id'])
        # Then total of donors
        if type == 'IND':
            processed_donors_where = '= 1'
        else:
            processed_donors_where = '!= 1'
        cur.execute(
            " SELECT count(*) AS num_donors FROM processed_donors WHERE person "
            + processed_donors_where)
        for row in cur:
            number_of_donors = row['num_donors']
        # Then max, average and number of clusters
        cur.execute(
            " SELECT MAX(cluster_size) AS Biggest, AVG(cluster_size) AS Average, COUNT(cluster_id) AS number_of_clusters "
            " FROM (SELECT CONCAT_WS(' ', donors.first_name, donors.last_name, donors.corp) AS name, "
            " cluster_size, cluster_id "
            " FROM (SELECT * FROM donors WHERE " + top_donor_where +
            ") as donors"
            " INNER JOIN (SELECT count(*) AS cluster_size, canon_id AS cluster_id "
            " FROM " + entity_map_table + " "
            " GROUP BY canon_id) "
            " AS cluster_totals "
            " ON donors.donor_id = cluster_totals.cluster_id "
            " ORDER BY cluster_size DESC) AS stats")
        # Print the stats
        print("stats...")
        print(f"total number of donors: {number_of_donors}")
        for row in cur:
            print(
                'max: %(biggest)s, average: %(average)s, number of clusters: %(number_of_clusters)s'
                % row)
            biggest_size = row['biggest']
            average_size = row['average']
            number_of_clusters = row['number_of_clusters']
    # write to the match_run table
    runtime = time.time() - start_time
    donor_cluster_ratio = number_of_donors / number_of_clusters
    with write_con1.cursor() as cur:
        cur.execute(
            """ 
            INSERT INTO match_runs 
            (completed, predicates, total_clusters, avg_cluster_size, biggest_cluster_size, biggest_cluster,
             total_donors, donor_type, total_run_time, donor_cluster_ratio, settings_file)
            VALUES (NOW(), %s, %s, %s, %s, %s,
            %s, %s, %s, %s, %s) """,
            (' '.join(str(pred)
                      for pred in deduper.predicates), number_of_clusters,
             average_size, biggest_size, biggest_name, number_of_donors, type,
             runtime, donor_cluster_ratio, settings_file))
    write_con1.commit()

    read_con.close()
    write_con.close()
    read_con1.close()
    write_con1.close()
    read_con2.close()
    print('ran in', runtime, 'seconds')
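
The record_pairs, cluster_ids and Readable helpers used above come from the dedupe PostgreSQL example and are not reproduced here. Minimal sketches of the two generators, assuming the pairs cursor yields (id_a, record_a, id_b, record_b) rows as selected in the query above:

    def record_pairs(result_set):
        # Yield ((id, record), (id, record)) pairs in the shape deduper.score() expects
        for a_record_id, a_record, b_record_id, b_record in result_set:
            yield ((a_record_id, a_record), (b_record_id, b_record))

    def cluster_ids(clustered_dupes):
        # Flatten clusters into (donor_id, canon_id, score) rows for the entity map COPY
        for cluster, scores in clustered_dupes:
            cluster_id = cluster[0]
            for donor_id, score in zip(cluster, scores):
                yield donor_id, cluster_id, score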
Example #29
            row_id = int(row['Id'])
            data_d[row_id] = dict(clean_row)
            row_id += 1

    return data_d


print('importing data ...')
data_d = readData(input_file)

# ## Training

if os.path.exists(settings_file):
    print('reading from', settings_file)
    with open(settings_file, 'rb') as f:
        deduper = dedupe.StaticDedupe(f)

else:
    # Define the fields dedupe will pay attention to

    fields = [
        {
            'field': 'name',
            'type': 'String',
            'has missing': True
        },
        {
            'field': 'address',
            'type': 'String',
            'has missing': True
        },
Example #30
def process():
    """entry point"""
    api = sesamclient.Connection(sesamapi_base_url=INSTANCE,
                                 jwt_auth_token=JWT,
                                 timeout=60 * 10)
    result = api.session.get(
        api.sesamapi_base_url +
        "publishers/{}/entities?deleted=false&history=false".format(SOURCE))

    if result.status_code != 200:
        logging.warning('Sesam API returned non 200 code {}'.format(
            result.status_code))
        iterator = iter([])
    else:
        iterator = iter(result.json())
    raw_data = []

    for dataset_entity in iterator:
        raw_data.append(dataset_entity)
    # this is an in-memory service; for large datasets we would probably want a service with DB storage
    if len(raw_data) > 100_000:
        logging.warning(
            'This service is not suitable for datasets with more than 100 000 elements'
            ' and may lead to OutOfMemory errors')

    cleaned_data = read_data(raw_data)

    logging.info('reading from %s', SETTINGS_FILE)

    with open(SETTINGS_FILE, 'rb') as f:
        deduper = dedupe.StaticDedupe(f)

    threshold = deduper.threshold(cleaned_data, recall_weight=1)

    logging.info('clustering...')
    clustered_dupes = deduper.match(cleaned_data, threshold)
    logging.info('%d duplicate sets found', len(clustered_dupes))

    cluster_membership = {}
    cluster_id = 0

    for (cluster_id, cluster) in enumerate(clustered_dupes):
        id_set, scores = cluster
        cluster_d = [cleaned_data[c] for c in id_set]
        canonical_rep = dedupe.canonicalize(cluster_d)
        for record_id, score in zip(id_set, scores):
            cluster_membership[record_id] = {
                "cluster id": cluster_id,
                "canonical representation": canonical_rep,
                "confidence": score
            }

    result_dataset = []
    for row in raw_data:
        row_id = row['_id']
        if row_id in cluster_membership:
            result_dict = {
                '_id': row_id,
                'cluster_id': cluster_membership[row_id]["cluster id"],
                'confidence_score': cluster_membership[row_id]['confidence']
            }

            if ADD_ORIGINALS:
                result_dict['originals'] = {key: row[key] for key in KEYS}

            if ADD_CANONICALS:
                result_dict['canonical_rep'] = cluster_membership[row_id][
                    "canonical representation"]

            result_dataset.append(result_dict)
    if TARGET is not None:
        target_url = api.sesamapi_base_url + "receivers/{}/entities".format(
            TARGET)
        requests.post(target_url,
                      json.dumps(sorted(result_dataset,
                                        key=lambda k: k['cluster_id']),
                                 cls=NumpyEncoder),
                      headers={
                          'Authorization': 'Bearer {}'.format(JWT),
                          'Content-Type': 'application/json'
                      })
        return Response()

    return Response(json.dumps(sorted(result_dataset,
                                      key=lambda k: k['cluster_id']),
                               cls=NumpyEncoder),
                    mimetype='application/json')
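
NumpyEncoder is not defined in this snippet; it is presumably a json.JSONEncoder subclass that makes the numpy confidence scores serialisable, along the lines of:

    import json
    import numpy as np

    class NumpyEncoder(json.JSONEncoder):
        # Assumed helper: convert numpy scalars and arrays to plain Python types
        def default(self, obj):
            if isinstance(obj, np.integer):
                return int(obj)
            if isinstance(obj, np.floating):
                return float(obj)
            if isinstance(obj, np.ndarray):
                return obj.tolist()
            return super().default(obj)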