Example #1
def train_linker(df1, df2):
    if READ_FROM_SETTINGS_FILE:
        with open(SETTINGS_FILE, 'rb') as sf:
            linker = dedupe.StaticRecordLink(sf)
    else:
        linker = dedupe.RecordLink(FIELDS)
        # It's terrible that you have to do this next line!!!!
        linker.classifier = rlr.RegularizedLogisticRegression()

        linker.sample(df1, df2, BLOCKING_TRAINING_SAMPLE_SIZE)

        if READ_FROM_TRAINING_FILE:
            print('reading labeled examples from ', TRAINING_FILE)
            with open(TRAINING_FILE, 'rb') as tf:
                linker.readTraining(tf)
        else:
            dedupe.consoleLabel(linker)

        linker.train()

    if WRITE_SETTINGS_FILE:
        with open(SETTINGS_FILE, 'wb') as sf:
            linker.writeSettings(sf)
    if WRITE_TRAINING_FILE:
        with open(TRAINING_FILE, 'w') as tf:
            linker.writeTraining(tf)

    return linker
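
train_linker relies on module-level constants and imports that the snippet does not show. A minimal, illustrative sketch of that configuration (file names, field names, and values here are assumptions, not taken from the original):

import dedupe
import rlr  # provides the RegularizedLogisticRegression classifier swapped in above

# Illustrative configuration assumed by train_linker (values are placeholders).
READ_FROM_SETTINGS_FILE = False       # True: load a pre-trained model and skip training
WRITE_SETTINGS_FILE = True            # persist learned weights and predicates
READ_FROM_TRAINING_FILE = False       # True: reuse labeled pairs from an earlier session
WRITE_TRAINING_FILE = True
SETTINGS_FILE = 'record_link_settings'
TRAINING_FILE = 'record_link_training.json'
BLOCKING_TRAINING_SAMPLE_SIZE = 15000

FIELDS = [
    {'field': 'name', 'type': 'String'},
    {'field': 'address', 'type': 'String', 'has missing': True},
]

# df1 and df2 are dicts of records keyed by ID, e.g. {'a1': {'name': ..., 'address': ...}}
# linker = train_linker(df1, df2)
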
Example #2
def load_model(self, settings_file):
    if os.path.exists(settings_file):
        logging.info('reading settings from %s', settings_file)
        with open(settings_file, 'rb') as f:
            self.deduper = dedupe.StaticRecordLink(f)
    else:
        raise FileNotFoundError(settings_file)
    return self.deduper

    
Example #3

print('importing data ...')
data_1 = readData('AbtBuy_Abt.csv')
data_2 = readData('AbtBuy_Buy.csv')
def descriptions():
    for dataset in (data_1, data_2):
        for record in dataset.values():
            yield record['description']

if os.path.exists(settings_file):
    print('reading from', settings_file)
    with open(settings_file, 'rb') as sf:
        linker = dedupe.StaticRecordLink(sf)

else:
    fields = [
        {'field': 'title', 'type': 'String'},
        {'field': 'title', 'type': 'Text', 'corpus': descriptions()},
        {'field': 'description', 'type': 'Text',
         'has missing': True, 'corpus': descriptions()},
        {'field': 'price', 'type': 'Price', 'has missing': True}]

    linker = dedupe.RecordLink(fields)

    linker.sample(data_1, data_2, 15000)

    if os.path.exists(training_file):
        print('reading labeled examples from ', training_file)
        with open(training_file) as tf:
            linker.readTraining(tf)
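
The snippet assumes a readData helper (not shown) that loads each CSV into a dict of cleaned records keyed by a unique ID. A minimal sketch of what such a helper might look like:

import csv

def readData(filename):
    # Hypothetical helper: load a CSV into {record_id: {field: value}},
    # trimming and lower-casing values and mapping empty strings to None
    # so dedupe treats them as missing.
    data = {}
    with open(filename) as f:
        reader = csv.DictReader(f)
        for i, row in enumerate(reader):
            clean_row = {k: (v.strip().lower() or None) for k, v in row.items()}
            data[filename + str(i)] = clean_row
    return data
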
Example #4

def get_resolved_df(data_1, data_2, fields):

    # Training

    if os.path.exists(settings_file):
        print('reading from', settings_file)
        with open(settings_file, 'rb') as sf:
            linker = dedupe.StaticRecordLink(sf)

    else:
        # Create a new linker object and pass our data model to it.
        linker = dedupe.RecordLink(fields)
        # To train the linker, we feed it a sample of records.
        linker.sample(data_1, data_2, 15000)

        # If we have training data saved from a previous run of linker,
        # look for it and load it in.
        if os.path.exists(training_file):
            print('reading labeled examples from ', training_file)
            with open(training_file) as tf:
                linker.readTraining(tf)

        # ## Active learning
        print('starting active labeling...')
        dedupe.consoleLabel(linker)

        linker.train()

        # When finished, save our training away to disk
        with open(training_file, 'w') as tf:
            linker.writeTraining(tf)

        # Save our weights and predicates to disk.  If the settings file
        # exists, we will skip all the training and learning next time we run
        # this file.
        with open(settings_file, 'wb') as sf:
            linker.writeSettings(sf)

    # ## Clustering

    print('clustering...')
    linked_records = linker.match(data_1, data_2, 0)

    print('# duplicate sets', len(linked_records))
    print(linked_records)

    # ## Writing Results

    df_list = []

    for cluster, score in linked_records:

        # obtain the source filename and record index from each cluster ID
        cluster1 = cluster[0]
        cluster2 = cluster[1]
        match = re.match(r"(\w+)\.csv(\d+)", cluster1)
        filename, idx = match.groups()
        filename += '.csv'
        filename = pr.resource_filename('src.input', filename)
        idx = int(idx)
        print(filename)
        print(idx)

        # dataframe for the first record in the cluster
        df1 = pd.DataFrame(columns=pd.read_csv(filename).columns)
        df1.loc[0] = pd.read_csv(filename).iloc[idx]  # .ix was removed in modern pandas

        #for cluster 2
        match = re.match(r"(\w+)\.csv(\d+)", cluster2)
        filename, idx = match.groups()
        filename += '.csv'
        filename = pr.resource_filename('src.input', filename)
        idx = int(idx)
        print(filename)
        print(idx)

        # dataframe for the second record in the cluster
        df2 = pd.DataFrame(columns=pd.read_csv(filename).columns)
        df2.loc[0] = pd.read_csv(filename).iloc[idx]  # .ix was removed in modern pandas

        #common attribute
        df2['score'] = [score]
        df1['score'] = [score]

        df = pd.merge(df1, df2, on='score')
        df_list.append(df)

    results_df = pd.concat(df_list, ignore_index=True)
    return results_df
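
get_resolved_df expects record IDs of the form '<file name>.csv<row index>' (that is what the regex above parses) and relies on module-level settings_file and training_file. A small illustrative check of the ID convention, plus a hypothetical call:

import re

# Record IDs are assumed to look like 'AbtBuy_Abt.csv42': file name + row index.
m = re.match(r"(\w+)\.csv(\d+)", "AbtBuy_Abt.csv42")
print(m.groups())  # -> ('AbtBuy_Abt', '42')

# Hypothetical usage, assuming data_1/data_2 were keyed that way:
# results_df = get_resolved_df(data_1, data_2, fields)
# results_df.to_csv('linked_records.csv', index=False)
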
Example #5
settings_file = 'canonical_data_matching_learned_settings'

data_1, header = canonicalImport('tests/datasets/restaurant-1.csv')
data_2, _ = canonicalImport('tests/datasets/restaurant-2.csv')

training_pairs = dedupe.trainingDataLink(data_1, data_2, 'unique_id', 5000)

duplicates_s = set(frozenset(pair) for pair in training_pairs['match'])

t0 = time.time()

print('number of known duplicate pairs', len(duplicates_s))

if os.path.exists(settings_file):
    with open(settings_file, 'rb') as f:
        deduper = dedupe.StaticRecordLink(f)
else:
    fields = [{
        'field': 'name',
        'type': 'String'
    }, {
        'field': 'address',
        'type': 'String'
    }, {
        'field': 'cuisine',
        'type': 'String'
    }, {
        'field': 'city',
        'type': 'String'
    }]
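
The snippet breaks off after the field definition; duplicates_s and t0 suggest it goes on to score the linker against the known pairs. A rough sketch of that kind of scoring step (hypothetical code, not from the original):

# Hypothetical evaluation: compare the linker's output against the known pairs.
# `linked` is assumed to be a list of ((id_1, id_2), score) tuples.
def evaluate(linked, true_dupes):
    found = set(frozenset(pair) for pair, score in linked)
    true_positives = found & true_dupes
    print('precision', len(true_positives) / len(found) if found else 0.0)
    print('recall', len(true_positives) / len(true_dupes))
    print('ran in', time.time() - t0, 'seconds')
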
Example #6
def main_dedupe_method(data_d,spot_d,spot_data,chosen_cols3,country_code,settings_file,training_file,fields):
        
     
    print '''############# Welcome to our Matching Algorithm  ################
    =============== Initializing Part 1 - Data Retrieving =================\n'''
    start_time = time.time()
    
 
    print "Getting data from BP companies..."                
    
    # Select all companies from selected country
    con = psy.connect(database='msi_bp_%s' % (country_code), user = '******', host='basil.colours.local', password='******')
    con.set_client_encoding('UTF8')
    c = con.cursor(cursor_factory=psy.extras.RealDictCursor)
    
    
    q_companies = '''SELECT --DISTINCT on (auth.company_int_id_number)
	CONCAT(ROW_NUMBER() OVER(ORDER BY auth.company_int_id_number),'c') AS line_nr,
	auth.current_authority_pk as auth_nr,
	auth.s_acct_name as "name",
	auth.s_tax_number as vat,
	ads.address_line1 as address,
	ads.address_line3 as town,
	ads.postcode as postcode
	FROM gdm.authorities auth
	LEFT JOIN gdm.addresses ads
		ON ads.address_id_pk = auth.s_address_id
	WHERE auth.s_acct_name is not null
	ORDER BY auth.company_int_id_number'''    
    c.execute(q_companies)
    companies = c.fetchall()

    comp_d = {}
    for row in companies:
        clean_row = {ky: p.proc(row[ky]) for ky in chosen_cols3[1:]}
        row_id = row['line_nr']
        comp_d[row_id] = dict(clean_row)


    print "Saving all data into database..."

    all_data_d=md.merge_dicts(spot_d,comp_d)

    all_data=[]

    for k,v in all_data_d.iteritems():   
        new_row = [v[ky] for ky in chosen_cols3[1:]]
        new_row.insert(0,k)
        all_data.append(new_row)


    c2 = con.cursor()


    c2.execute('DROP TABLE IF EXISTS htc.all_data')

    field_string = ','.join('%s varchar(200)' % name for name in chosen_cols3)
    c2.execute('''CREATE TABLE htc.all_data
               (%s)''' %(field_string))

    num_cols = len(chosen_cols3)
    mog = "(" + ("%s,"*(num_cols -1)) + "%s)"
    args_str = ','.join(c2.mogrify(mog,x) for x in all_data)
    values = "("+ ','.join(x for x in chosen_cols3) +")"
    c2.execute("INSERT INTO htc.all_data %s VALUES %s" % (values, args_str))
    con.commit()


    print 'Data retrieval took ', time.time() - start_time, 'seconds (', (time.time() - start_time)/60, 'minutes)'
    start_time1 = time.time()


    print "Arranging data to create blocks..."

    u_keys = set()

    for n,m in all_data_d.iteritems():
        for k,v in m.iteritems():
            u_keys.add(v)

    u_keys = list(u_keys)


    block_keys = {}
    count = 0

    for k in u_keys:
        block_keys[k] = count
        count += 1

    
    print 'Checking blocks...'

    ids = []
    for k, v in all_data_d.items():
        ids.append([block_keys[v['name']],v['name'], k])
        if 'postcode' in v.keys():
            if v['postcode'] != None:
                ids.append([block_keys[v['postcode']],v['postcode'], k])
        if 'town' in v.keys():
            if v['town'] != None:
                ids.append([block_keys[v['town']],v['town'], k])


    print 'Writing tables...'

    column_names = ['block_key','block_id','record_id']

    c3 = con.cursor()

    print 'Writing htc.blocks_recordlink_test'

    ## Table with all blocks and companies ##
    c3.execute('DROP TABLE IF EXISTS htc.blocks_recordlink_test')
    field_string = ','.join('%s varchar(200)' % name for name in column_names)
    c3.execute('''CREATE TABLE htc.blocks_recordlink_test
               (%s)''' %(field_string))

    num_cols = len(column_names)
    mog = "(" + ("%s,"*(num_cols -1)) + "%s)"
    args_str = ','.join(c3.mogrify(mog,x) for x in ids)
    values = "("+ ','.join(x for x in column_names) +")"
    c3.execute("INSERT INTO htc.blocks_recordlink_test %s VALUES %s" % (values, args_str))
    con.commit()

    print 'Writing htc.plural_blocks'

    ## Table with all blocks that have more than one company ##

    c3.execute("DROP TABLE IF EXISTS htc.plural_blocks")
    c3.execute("DROP TABLE IF EXISTS htc.covered_blocks")

    c3.execute("""CREATE TABLE htc.plural_blocks as (
                SELECT b.block_id 
                FROM htc.blocks_recordlink_test b
                INNER JOIN (SELECT DISTINCT block_id 
                        FROM htc.blocks_recordlink_test 
                        WHERE record_id like '%c') bp
                ON bp.block_id = b.block_id
                INNER JOIN (SELECT DISTINCT block_id 
                        FROM htc.blocks_recordlink_test 
                        WHERE record_id not like '%c') comp
                ON comp.block_id = b.block_id
                GROUP BY b.block_id 
                HAVING COUNT(*) > 1)""")
    con.commit()

    print 'Writing htc.covered_blocks'

    ## Table with list of blocks for each company ##

    c3.execute("""CREATE TABLE htc.covered_blocks as (
                SELECT record_id, array_agg(r.block_key ORDER BY r.block_key) AS sorted_ids
                FROM htc.blocks_recordlink_test r
                INNER JOIN htc.plural_blocks bl
                ON r.block_id = bl.block_id
                GROUP BY record_id)""")
    con.commit()

    c3 = con.cursor(cursor_factory=psy.extras.RealDictCursor)

    print "Blocking..."

    ## Get all companies and blocks ##

    c3.execute("""SELECT br.record_id, m.*, br.block_key, cb.sorted_ids
               FROM htc.blocks_recordlink_test br
               LEFT JOIN 
		        (SELECT * FROM htc.all_data) m
                    ON m.line_nr = br.record_id
                INNER JOIN htc.covered_blocks cb
                    ON br.record_id = cb.record_id
                INNER JOIN htc.plural_blocks pb
                    ON br.block_id = pb.block_id
                ORDER BY br.block_key""")


    rec_ids = c3.fetchall()


    print 'Writing tables took ', time.time() - start_time1, 'seconds (', (time.time() - start_time1)/60, 'minutes)'
    start_time2 = time.time()

    print "Blocking..."
    ## Arrange data into the needed syntax to use in dedupe ##

    blocks = []
    last_key = 0
    count = 0
    bp_block = []
    comp_block = []


    for rec in rec_ids:
        if rec['block_key'] == last_key:
            if rec['record_id'][-1:] != 'c':
                bp_block.append(tuple([rec['record_id'],
                                       {re.sub('_bp','',your_key): p.proc(rec[your_key]) for your_key in chosen_cols3[1:]},
                                       set(rec['sorted_ids'][:rec['sorted_ids'].index(rec['block_key'])])
                                       ]))  
            else:
                comp_block.append(tuple([rec['record_id'],
                                         comp_d[rec['record_id']],
                                         set(rec['sorted_ids'][:rec['sorted_ids'].index(rec['block_key'])])
                                         ])) 

            
        else:
            if bp_block != [] and comp_block != []:
                blocks.append((bp_block, comp_block))
                bp_block = []
                comp_block = []
                if rec['record_id'][-1:] == 'c':
                    comp_block =[tuple([rec['record_id'],
                                        comp_d[rec['record_id']], 
                                        set(rec['sorted_ids'][:rec['sorted_ids'].index(rec['block_key'])])
                                        ])]
            else:
                bp_block = [tuple([rec['record_id'],
                                   {re.sub('_bp','',your_key): p.proc(rec[your_key]) for your_key in chosen_cols3[1:]},
                                   set(rec['sorted_ids'][:rec['sorted_ids'].index(rec['block_key'])])
                                   ])]
    
            last_key = rec['block_key']

        count +=1

    blocks.append((bp_block, comp_block))

    blocks = tuple(blocks)


    del rec_ids
    del all_data

    con.close() 



    # Dynamically create fields according to the chosen columns.
    # We still need to decide how to tell our model when only the name is available.
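    #
    # A rough, hypothetical sketch of deriving the field definition from the
    # chosen columns (the original receives `fields` as a parameter instead):
    #
    #     fields = [{'field': col, 'type': 'String', 'has missing': True}
    #               for col in chosen_cols3[1:]]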



    print 'Blocking took ', time.time() - start_time2, 'seconds (', (time.time() - start_time2)/60, 'minutes)'
    start_time3 = time.time()


    print "=============== Initializing Part 2 - Dedupe ===============\n"

    print 'Entering Dedupe ...'

    #================================ DEDUPING WITH BLOCKS ==============================================

    if os.path.exists(settings_file):
        print 'Reading from: ', settings_file
        with open(settings_file) as sf:
            linker = dedupe.StaticRecordLink(sf)

    else:
        
        linker = dedupe.RecordLink(fields)
        
        linker.sample(data_d,comp_d)

        if os.path.exists(training_file):
            print 'Reading labeled examples from: ', training_file
            with open(training_file) as tf :
                linker.readTraining(tf)

        print 'Starting Active Labeling...'
        dedupe.consoleLabel(linker)
        linker.train()
        with open(training_file, 'w') as tf :
            linker.writeTraining(tf)

        with open(settings_file, 'w') as sf :
            linker.writeSettings(sf)

    print 'Creating blocks...'

    threshold = linker.thresholdBlocks(blocks, recall_weight=2)

    print 'Generating clusters...'

    clusters = linker.matchBlocks(blocks, threshold)
    clust=list(clusters)

    print 'Deduping took ', time.time() - start_time3, 'seconds (', (time.time() - start_time3)/60, 'minutes)'
    start_time4 = time.time()  

    print 'Writing results to database...'

    con = psy.connect(database='msi_bp_%s' % (country_code), user = '******', host='basil.colours.local', password='******')
    c = con.cursor()

    c.execute('DROP TABLE if exists htc.recordlink_blocks_result')
    con.commit()
    c.execute('CREATE TABLE htc.recordlink_blocks_result (spot_id varchar, comp_id varchar, score double precision)')


    #register_adapter(numpy.int64, addapt_numpy_int64)
    num_cols = 3
    mog = "(" + ("%s,"*(num_cols -1)) + "%s)"
    args_str = ','.join(c.mogrify(mog,x[0]+(float(x[1]),)) for x in clust)
    values = "("+ ','.join(x for x in ['spot_id','comp_id', 'score']) +")"
    c.execute("INSERT INTO htc.recordlink_blocks_result %s VALUES %s" % (values, args_str))


    c.execute('DROP TABLE if exists htc.recordlink_blocks_result_all')

    c.execute("""CREATE TABLE htc.recordlink_blocks_result_all as (
               SELECT adt.line_nr as spot_id,br.comp_id as comp_id,br.score as score,
               CASE WHEN br.comp_id IS NULL THEN 'Not a Client'
               ELSE 'Check Match Score'
               END AS "Result",
               adt.name as name_spot,b.name as name_comp, b.auth_nr as auth_nr_comp
               FROM htc.all_data adt
               LEFT JOIN htc.recordlink_blocks_result br
                   ON br.spot_id = adt.line_nr
                  LEFT JOIN 
                    (SELECT *
                    FROM htc.recordlink_blocks_result br
                    INNER JOIN htc.all_data adt
                    ON adt.line_nr= br.comp_id) b
                ON b.spot_id = adt.line_nr
                WHERE adt.line_nr not like '%c%')""")
    con.commit()
    con.close()



    print 'Writing results took ', time.time() - start_time4, 'seconds (', (time.time() - start_time4)/60, 'minutes)'
    print 'Ended in ' , time.time() - start_time, 'seconds (', (time.time() - start_time)/60, 'minutes)'
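
main_dedupe_method leans on two project-specific helpers that the snippet does not include: p.proc (field cleaning) and md.merge_dicts (imported as p and md). Rough, hypothetical stand-ins that show the expected behaviour:

# Hypothetical stand-ins for the helpers referenced above (not the project's code).
def proc(value):
    # Normalise a raw field value: cast to text, trim, lower-case,
    # and map empty strings to None so dedupe treats them as missing.
    if value is None:
        return None
    value = str(value).strip().lower()
    return value or None

def merge_dicts(a, b):
    # Combine two {record_id: record} dicts; record IDs are assumed to be
    # unique across the two sources.
    merged = dict(a)
    merged.update(b)
    return merged
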
Example #7
    def main(self):

        data_1 = {}
        data_2 = {}
        # import the specified CSV file

        data_1 = csvhelpers.readData(self.input_1,
                                     self.field_names_1,
                                     delimiter=self.delimiter,
                                     prefix='input_1')
        data_2 = csvhelpers.readData(self.input_2,
                                     self.field_names_2,
                                     delimiter=self.delimiter,
                                     prefix='input_2')

        # fix price
        for field in self.field_definition:
            if field['type'] == 'Price':
                for key, record in data_1.items():
                    value = record[field['field']]
                    if value:
                        record[field['field']] = int(float(value))

        # fix latlon
        for field in self.field_definition:
            if field['type'] == 'LatLong':
                for key, record in data_1.items():
                    latitude = record[field['latitude']]
                    longitude = record[field['longitude']]
                    record[field['field']] = (
                        float(latitude),
                        float(longitude)) if latitude and longitude else None

        # fix price
        for field in self.field_definition:
            if field['type'] == 'Price':
                for key, record in data_2.items():
                    value = record[field['field']]
                    if value:
                        record[field['field']] = int(float(value))

        # fix latlon
        for field in self.field_definition:
            if field['type'] == 'LatLong':
                for key, record in data_2.items():
                    latitude = record[field['latitude']]
                    longitude = record[field['longitude']]
                    record[field['field']] = (
                        float(latitude),
                        float(longitude)) if latitude and longitude else None

        # sanity check for provided field names in CSV file
        for field in self.field_names_1:
            if field not in list(data_1.values())[0]:
                raise self.parser.error("Could not find field '" + field +
                                        "' in input")

        for field in self.field_names_2:
            if field not in list(data_2.values())[0]:
                raise self.parser.error("Could not find field '" + field +
                                        "' in input")

        if self.field_names_1 != self.field_names_2:
            for record_id, record in data_2.items():
                remapped_record = {}
                for new_field, old_field in zip(self.field_names_1,
                                                self.field_names_2):
                    remapped_record[new_field] = record[old_field]
                data_2[record_id] = remapped_record

        logging.info('imported %d rows from file 1', len(data_1))
        logging.info('imported %d rows from file 2', len(data_2))

        logging.info('using fields: %s' %
                     [field['field'] for field in self.field_definition])

        # If --skip_training has been selected and a settings cache still
        # persists from the last run, reuse it for this run.
        # __Note:__ if you want to add more training data, don't use --skip_training
        if self.skip_training and os.path.exists(self.settings_file):

            # Load our deduper from the last training session cache.
            logging.info('reading from previous training cache %s' %
                         self.settings_file)
            with open(self.settings_file, 'rb') as f:
                deduper = dedupe.StaticRecordLink(f)

            fields = {
                variable.field
                for variable in deduper.data_model.primary_fields
            }
            (nonexact_1, nonexact_2,
             exact_pairs) = exact_matches(data_1, data_2, fields)

        else:
            # # Create a new deduper object and pass our data model to it.
            deduper = dedupe.RecordLink(self.field_definition)

            fields = {
                variable.field
                for variable in deduper.data_model.primary_fields
            }
            (nonexact_1, nonexact_2,
             exact_pairs) = exact_matches(data_1, data_2, fields)

            # Set up our data sample
            logging.info('taking a sample of %d possible pairs',
                         self.sample_size)
            deduper.sample(nonexact_1, nonexact_2, self.sample_size)

            # Perform standard training procedures
            self.dedupe_training(deduper)

        # ## Blocking

        logging.info('blocking...')

        # ## Clustering

        # Find the threshold that will maximize a weighted average of our precision and recall.
        # When we set the recall weight to 2, we are saying we care twice as much
        # about recall as we do precision.
        #
        # If we had more data, we would not pass in all the blocked data into
        # this function but a representative sample.

        logging.info('finding a good threshold with a recall_weight of %s' %
                     self.recall_weight)
        threshold = deduper.threshold(data_1,
                                      data_2,
                                      recall_weight=self.recall_weight)

        # `duplicateClusters` will return sets of record IDs that dedupe
        # believes are all referring to the same entity.

        logging.info('clustering...')
        clustered_dupes = deduper.match(data_1, data_2, threshold)

        clustered_dupes.extend(exact_pairs)

        logging.info('# duplicate sets %s' % len(clustered_dupes))

        write_function = csvhelpers.writeLinkedResults
        # write out our results

        if self.output_file:
            if sys.version < '3':
                # Python 2's built-in open() has no encoding argument
                with open(self.output_file, 'wb') as output_file:
                    write_function(clustered_dupes, self.input_1, self.input_2,
                                   output_file, self.inner_join)
            else:
                with open(self.output_file, 'w',
                          encoding='utf-8') as output_file:
                    write_function(clustered_dupes, self.input_1, self.input_2,
                                   output_file, self.inner_join)
        else:
            write_function(clustered_dupes, self.input_1, self.input_2,
                           sys.stdout, self.inner_join)
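
The method assumes two helpers from the surrounding project: self.dedupe_training (presumably the usual console-labeling, train, and save routine) and exact_matches. A hypothetical sketch of an exact_matches helper with the shape used above:

def exact_matches(data_1, data_2, match_fields):
    # Hypothetical sketch: split out record pairs that agree exactly on every
    # field in match_fields, so active learning only sees the harder cases.
    index = {}
    for record_id, record in data_1.items():
        key = tuple(record[field] for field in match_fields)
        index.setdefault(key, []).append(record_id)

    nonexact_2 = {}
    exact_pairs = []
    matched_1 = set()
    for record_id, record in data_2.items():
        key = tuple(record[field] for field in match_fields)
        if key in index:
            for other_id in index[key]:
                exact_pairs.append(((other_id, record_id), 1.0))
                matched_1.add(other_id)
        else:
            nonexact_2[record_id] = record

    nonexact_1 = {rid: rec for rid, rec in data_1.items()
                  if rid not in matched_1}
    return nonexact_1, nonexact_2, exact_pairs
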
Example #8
def link_dataframes(dfa,
                    dfb,
                    field_properties,
                    config_name="link_dataframes",
                    n_cores=None):

    config_name = config_name.replace(" ", "_")

    settings_file = config_name + '_learned_settings'
    training_file = config_name + '_training.json'

    print('Importing data ...')

    dfa = clean_punctuation(dfa)
    specify_type(dfa, field_properties)

    dfa['index_field'] = dfa.index
    dfa['index_field'] = dfa['index_field'].apply(lambda x: "dfa" + str(x))
    dfa.set_index(['index_field'], inplace=True)

    data_1 = dfa.to_dict(orient='index')

    dfb = clean_punctuation(dfb)
    specify_type(dfb, field_properties)

    dfb['index_field'] = dfb.index
    dfb['index_field'] = dfb['index_field'].apply(lambda x: "dfb" + str(x))
    dfb.set_index(['index_field'], inplace=True)

    data_2 = dfb.to_dict(orient='index')
    # ---------------------------------------------------------------------------------

    # ## Training

    if os.path.exists(settings_file):
        print('Reading from', settings_file)
        with open(settings_file, 'rb') as sf:
            linker = dedupe.StaticRecordLink(sf, num_cores=n_cores)

    else:
        # Define the fields the linker will pay attention to
        #
        # Notice how we are telling the linker to use a custom field comparator
        # for the 'price' field.

        fields = []
        select_fields(fields, field_properties)

        # Create a new linker object and pass our data model to it.
        linker = dedupe.RecordLink(fields, num_cores=n_cores)
        # To train the linker, we feed it a sample of records.
        # If we have training data saved from a previous run of linker,
        # look for it and load it in.
        # __Note:__ if you want to train from scratch, delete the training_file
        if os.path.exists(training_file):
            print('Reading labeled examples from ', training_file)
            with open(training_file) as tf:
                linker.prepare_training(data_1, data_2,
                                        training_file=tf,
                                        sample_size=15000)
        else:
            linker.prepare_training(data_1, data_2, sample_size=15000)

        # ## Active learning
        # Dedupe will find the next pair of records
        # it is least certain about and ask you to label them as matches
        # or not.
        # use 'y', 'n' and 'u' keys to flag duplicates
        # press 'f' when you are finished
        print('Starting active labeling...')

        dedupe.console_label(linker)
        linker.train()

        # When finished, save our training away to disk
        with open(training_file, 'w') as tf:
            linker.write_training(tf)

        # Save our weights and predicates to disk.  If the settings file
        # exists, we will skip all the training and learning next time we run
        # this file.
        with open(settings_file, 'wb') as sf:
            linker.write_settings(sf)

    # ## Blocking

    # ## Clustering

    # Find the threshold that will maximize a weighted average of our
    # precision and recall.  When we set the recall weight to 2, we are
    # saying we care twice as much about recall as we do precision.
    #
    # If we had more data, we would not pass in all the blocked data into
    # this function but a representative sample.

    print('Clustering...')
    linked_records = linker.join(data_1, data_2, 0)

    print('# duplicate sets', len(linked_records))

    #Convert linked records into dataframe
    df_linked_records = pd.DataFrame(linked_records)

    df_linked_records['dfa_link'] = df_linked_records[0].apply(lambda x: x[0])
    df_linked_records['dfb_link'] = df_linked_records[0].apply(lambda x: x[1])
    df_linked_records.rename(columns={1: 'confidence'}, inplace=True)
    df_linked_records.drop(columns=[0], inplace=True)
    df_linked_records['cluster id'] = df_linked_records.index

    # For both dfa and dfb, add the cluster id and confidence score from linked_records
    dfa.index.rename('dfa_link', inplace=True)
    dfa = dfa.merge(df_linked_records, on='dfa_link', how='left')

    dfb.index.rename('dfb_link', inplace=True)
    dfb = dfb.merge(df_linked_records, on='dfb_link', how='left')

    # Concatenate results from dfa and dfb
    df_final = pd.concat([dfa, dfb], ignore_index=True, sort=True)  # DataFrame.append was removed in pandas 2.x
    df_final = df_final.sort_values(by=['cluster id'])
    df_final = df_final.drop(columns=['dfa_link', 'dfb_link'])

    return df_final
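
A hypothetical call, assuming field_properties is a list of column names as accepted by the accompanying select_fields/specify_type helpers; without a saved settings file this starts an interactive console-labeling session:

import pandas as pd

dfa = pd.DataFrame({'name': ['Acme Corp', 'Globex'], 'city': ['Berlin', 'Paris']})
dfb = pd.DataFrame({'name': ['ACME Corporation', 'Globex Inc'], 'city': ['Berlin', 'Paris']})

# Assumed field_properties format: plain column names present in both frames.
linked = link_dataframes(dfa, dfb, ['name', 'city'], config_name='demo_link')
print(linked[['name', 'city', 'cluster id', 'confidence']])
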
Example #9
settings_file = 'canonical_data_matching_learned_settings'

data_1, header = canonicalImport('tests/datasets/restaurant-1.csv')
data_2, _ = canonicalImport('tests/datasets/restaurant-2.csv')

training_pairs = dedupe.trainingDataLink(data_1, data_2, 'unique_id', 5000)

duplicates_s = set(frozenset(pair) for pair in training_pairs['match'])

t0 = time.time()

print 'number of known duplicate pairs', len(duplicates_s)

if os.path.exists(settings_file):
    deduper = dedupe.StaticRecordLink(settings_file)
else:
    fields = {
        'name': {
            'type': 'String'
        },
        'address': {
            'type': 'String'
        },
        'cuisine': {
            'type': 'String'
        },
        'city': {
            'type': 'String'
        }
    }