def train_linker(df1, df2):
    if READ_FROM_SETTINGS_FILE:
        with open(SETTINGS_FILE, 'rb') as sf:
            linker = dedupe.StaticRecordLink(sf)
    else:
        linker = dedupe.RecordLink(FIELDS)
        # It's terrible that you have to do this next line!!!!
        linker.classifier = rlr.RegularizedLogisticRegression()
        linker.sample(df1, df2, BLOCKING_TRAINING_SAMPLE_SIZE)
        if READ_FROM_TRAINING_FILE:
            print('reading labeled examples from ', TRAINING_FILE)
            with open(TRAINING_FILE, 'rb') as tf:
                linker.readTraining(tf)
        else:
            dedupe.consoleLabel(linker)
        linker.train()
    if WRITE_SETTINGS_FILE:
        with open(SETTINGS_FILE, 'wb') as sf:
            linker.writeSettings(sf)
    if WRITE_TRAINING_FILE:
        with open(TRAINING_FILE, 'w') as tf:
            linker.writeTraining(tf)
    return linker
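
# A minimal sketch of how train_linker might be driven. The module-level
# constants it references are assumed to look roughly like this; all names
# and values below are illustrative, not taken from the original module.
READ_FROM_SETTINGS_FILE = False
READ_FROM_TRAINING_FILE = False
WRITE_SETTINGS_FILE = True
WRITE_TRAINING_FILE = True
SETTINGS_FILE = 'linker_learned_settings'
TRAINING_FILE = 'linker_training.json'
BLOCKING_TRAINING_SAMPLE_SIZE = 15000
FIELDS = [{'field': 'name', 'type': 'String'},
          {'field': 'address', 'type': 'String', 'has missing': True}]

# Records keyed by id, as the dedupe 1.x API expects.
records_a = {'a1': {'name': 'acme corp', 'address': '1 main st'}}
records_b = {'b1': {'name': 'acme corporation', 'address': '1 main street'}}

linker = train_linker(records_a, records_b)
matches = linker.match(records_a, records_b, threshold=0.5)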
def load_model(self, settings_file):
    if os.path.exists(settings_file):
        # logging uses %-style lazy formatting; passing the filename as a
        # bare second positional argument would break message formatting
        logging.info('reading settings from %s', settings_file)
        with open(settings_file, 'rb') as f:
            self.deduper = dedupe.StaticRecordLink(f)
    else:
        raise Exception('settings file not found: %s' % settings_file)
    return data_d

print('importing data ...')
data_1 = readData('AbtBuy_Abt.csv')
data_2 = readData('AbtBuy_Buy.csv')

# Generator feeding the Text field corpora below
def descriptions():
    for dataset in (data_1, data_2):
        for record in dataset.values():
            yield record['description']

if os.path.exists(settings_file):
    print('reading from', settings_file)
    with open(settings_file, 'rb') as sf:
        linker = dedupe.StaticRecordLink(sf)
else:
    fields = [
        {'field': 'title', 'type': 'String'},
        {'field': 'title', 'type': 'Text', 'corpus': descriptions()},
        {'field': 'description', 'type': 'Text',
         'has missing': True, 'corpus': descriptions()},
        {'field': 'price', 'type': 'Price', 'has missing': True}]

    linker = dedupe.RecordLink(fields)
    linker.sample(data_1, data_2, 15000)

    if os.path.exists(training_file):
        print('reading labeled examples from ', training_file)
        with open(training_file) as tf:
            linker.readTraining(tf)
def get_resolved_df(data_1, data_2, fields):
    # Training
    if os.path.exists(settings_file):
        print('reading from', settings_file)
        with open(settings_file, 'rb') as sf:
            linker = dedupe.StaticRecordLink(sf)
    else:
        # Create a new linker object and pass our data model to it.
        linker = dedupe.RecordLink(fields)
        # To train the linker, we feed it a sample of records.
        linker.sample(data_1, data_2, 15000)

        # If we have training data saved from a previous run of linker,
        # look for it and load it in.
        if os.path.exists(training_file):
            print('reading labeled examples from ', training_file)
            with open(training_file) as tf:
                linker.readTraining(tf)

        # ## Active learning
        print('starting active labeling...')
        dedupe.consoleLabel(linker)
        linker.train()

        # When finished, save our training away to disk
        with open(training_file, 'w') as tf:
            linker.writeTraining(tf)

        # Save our weights and predicates to disk.  If the settings file
        # exists, we will skip all the training and learning next time we run
        # this file.
        with open(settings_file, 'wb') as sf:
            linker.writeSettings(sf)

    # ## Clustering
    print('clustering...')
    linked_records = linker.match(data_1, data_2, 0)

    print('# duplicate sets', len(linked_records))
    print(linked_records)

    # ## Writing Results
    df_list = []
    for cluster, score in linked_records:
        # obtain filename + record number for both cluster members
        cluster1, cluster2 = cluster[0], cluster[1]

        match = re.match(r"(\w+)\.csv(\d+)", cluster1)
        filename, idx = match.groups()
        filename += '.csv'
        filename = pr.resource_filename('src.input', filename)
        idx = int(idx)
        print(filename)
        print(idx)

        # dataframe for cluster member 1
        df1 = pd.DataFrame(columns=pd.read_csv(filename).columns)
        df1.loc[0] = pd.read_csv(filename).iloc[idx]

        # ... and for cluster member 2
        match = re.match(r"(\w+)\.csv(\d+)", cluster2)
        filename, idx = match.groups()
        filename += '.csv'
        filename = pr.resource_filename('src.input', filename)
        idx = int(idx)
        print(filename)
        print(idx)

        df2 = pd.DataFrame(columns=pd.read_csv(filename).columns)
        df2.loc[0] = pd.read_csv(filename).iloc[idx]

        # common attribute to merge on
        df1['score'] = [score]
        df2['score'] = [score]

        df = pd.merge(df1, df2, on='score')
        df_list.append(df)

    results_df = pd.concat(df_list, ignore_index=True)
    return results_df
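
# The cluster member IDs consumed above are assumed to follow the pattern
# '<stem>.csv<row>', i.e. the source filename with the row offset appended,
# which is what the regex pulls back apart. For example:
import re

match = re.match(r"(\w+)\.csv(\d+)", 'AbtBuy_Abt.csv42')
stem, row = match.groups()       # ('AbtBuy_Abt', '42')
source_file = stem + '.csv'      # 'AbtBuy_Abt.csv'
row_index = int(row)             # 42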
settings_file = 'canonical_data_matching_learned_settings'

data_1, header = canonicalImport('tests/datasets/restaurant-1.csv')
data_2, _ = canonicalImport('tests/datasets/restaurant-2.csv')

training_pairs = dedupe.trainingDataLink(data_1, data_2, 'unique_id', 5000)

duplicates_s = set(frozenset(pair) for pair in training_pairs['match'])

t0 = time.time()

print('number of known duplicate pairs', len(duplicates_s))

if os.path.exists(settings_file):
    with open(settings_file, 'rb') as f:
        deduper = dedupe.StaticRecordLink(f)
else:
    fields = [{'field': 'name', 'type': 'String'},
              {'field': 'address', 'type': 'String'},
              {'field': 'cuisine', 'type': 'String'},
              {'field': 'city', 'type': 'String'}]
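    # The snippet stops after the field definition. Under the dedupe 1.x API
    # the canonical benchmark would plausibly continue along these lines,
    # feeding the labels from trainingDataLink straight in via markPairs
    # (a sketch, not the original continuation):
    deduper = dedupe.RecordLink(fields)
    deduper.sample(data_1, data_2, 10000)
    deduper.markPairs(training_pairs)
    deduper.train()

    with open(settings_file, 'wb') as f:
        deduper.writeSettings(f)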
def main_dedupe_method(data_d, spot_d, spot_data, chosen_cols3, country_code,
                       settings_file, training_file, fields):
    print '''############# Welcome to our Matching Algorithm ################
    =============== Initializing Part 1 - Data Retrieving =================\n'''
    start_time = time.time()

    print "Getting data from BP companies..."
    # Select all companies from selected country
    con = psy.connect(database='msi_bp_%s' % (country_code), user='******',
                      host='basil.colours.local', password='******')
    con.set_client_encoding('UTF8')
    c = con.cursor(cursor_factory=psy.extras.RealDictCursor)
    q_companies = '''SELECT --DISTINCT on (auth.company_int_id_number)
        CONCAT(ROW_NUMBER() OVER(ORDER BY auth.company_int_id_number),'c') AS line_nr,
        auth.current_authority_pk as auth_nr,
        auth.s_acct_name as "name",
        auth.s_tax_number as vat,
        ads.address_line1 as address,
        ads.address_line3 as town,
        ads.postcode as postcode
        FROM gdm.authorities auth
        LEFT JOIN gdm.addresses ads ON ads.address_id_pk = auth.s_address_id
        WHERE auth.s_acct_name is not null
        ORDER BY auth.company_int_id_number'''
    c.execute(q_companies)
    companies = c.fetchall()

    comp_d = {}
    for row in companies:
        clean_row = {ky: p.proc(row[ky]) for ky in chosen_cols3[1:]}
        row_id = row['line_nr']
        comp_d[row_id] = dict(clean_row)

    print "Saving all data into database..."
    all_data_d = md.merge_dicts(spot_d, comp_d)
    all_data = []
    for k, v in all_data_d.iteritems():
        new_row = [v[ky] for ky in chosen_cols3[1:]]
        new_row.insert(0, k)
        all_data.append(new_row)

    c2 = con.cursor()
    c2.execute('DROP TABLE IF EXISTS htc.all_data')
    field_string = ','.join('%s varchar(200)' % name for name in chosen_cols3)
    c2.execute('''CREATE TABLE htc.all_data (%s)''' % (field_string))
    num_cols = len(chosen_cols3)
    mog = "(" + ("%s," * (num_cols - 1)) + "%s)"
    args_str = ','.join(c2.mogrify(mog, x) for x in all_data)
    values = "(" + ','.join(x for x in chosen_cols3) + ")"
    c2.execute("INSERT INTO htc.all_data %s VALUES %s" % (values, args_str))
    con.commit()
    print 'Data retrieval took ', time.time() - start_time, 'seconds (', (time.time() - start_time)/60, 'minutes)'

    start_time1 = time.time()
    print "Arranging data to create blocks..."
    u_keys = set()
    for n, m in all_data_d.iteritems():
        for k, v in m.iteritems():
            u_keys.add(v)
    u_keys = list(u_keys)

    block_keys = {}
    count = 0
    for k in u_keys:
        block_keys[k] = count
        count += 1

    print 'Checking blocks...'
    ids = []
    for k, v in all_data_d.items():
        ids.append([block_keys[v['name']], v['name'], k])
        if 'postcode' in v.keys():
            if v['postcode'] != None:
                ids.append([block_keys[v['postcode']], v['postcode'], k])
        if 'town' in v.keys():
            if v['town'] != None:
                ids.append([block_keys[v['town']], v['town'], k])

    print 'Writing tables...'
    column_names = ['block_key', 'block_id', 'record_id']
    c3 = con.cursor()

    print 'Writing htc.blocks_recordlink_test'
    ## Table with all blocks and companies ##
    c3.execute('DROP TABLE IF EXISTS htc.blocks_recordlink_test')
    field_string = ','.join('%s varchar(200)' % name for name in column_names)
    c3.execute('''CREATE TABLE htc.blocks_recordlink_test (%s)''' % (field_string))
    num_cols = len(column_names)
    mog = "(" + ("%s," * (num_cols - 1)) + "%s)"
    args_str = ','.join(c3.mogrify(mog, x) for x in ids)
    values = "(" + ','.join(x for x in column_names) + ")"
    c3.execute("INSERT INTO htc.blocks_recordlink_test %s VALUES %s" % (values, args_str))
    con.commit()

    print 'Writing htc.plural_blocks'
    ## Table with all blocks that have more than one company ##
    c3.execute("DROP TABLE IF EXISTS htc.plural_blocks")
    c3.execute("DROP TABLE IF EXISTS htc.covered_blocks")
    c3.execute("""CREATE TABLE htc.plural_blocks as (
        SELECT b.block_id
        FROM htc.blocks_recordlink_test b
        INNER JOIN (SELECT DISTINCT block_id FROM htc.blocks_recordlink_test
                    WHERE record_id like '%c') bp
            ON bp.block_id = b.block_id
        INNER JOIN (SELECT DISTINCT block_id FROM htc.blocks_recordlink_test
                    WHERE record_id not like '%c') comp
            ON comp.block_id = b.block_id
        GROUP BY b.block_id
        HAVING COUNT(*) > 1)""")
    con.commit()

    print 'Writing htc.covered_blocks'
    ## Table with list of blocks for each company ##
    c3.execute("""CREATE TABLE htc.covered_blocks as (
        SELECT record_id, array_agg(r.block_key ORDER BY r.block_key) AS sorted_ids
        FROM htc.blocks_recordlink_test r
        INNER JOIN htc.plural_blocks bl ON r.block_id = bl.block_id
        GROUP BY record_id)""")
    con.commit()

    c3 = con.cursor(cursor_factory=psy.extras.RealDictCursor)

    print "Blocking..."
    ## Get all companies and blocks ##
    c3.execute("""SELECT br.record_id, m.*, br.block_key, cb.sorted_ids
        FROM htc.blocks_recordlink_test br
        LEFT JOIN (SELECT * FROM htc.all_data) m ON m.line_nr = br.record_id
        INNER JOIN htc.covered_blocks cb ON br.record_id = cb.record_id
        INNER JOIN htc.plural_blocks pb ON br.block_id = pb.block_id
        ORDER BY br.block_key""")
    rec_ids = c3.fetchall()

    print 'Writing tables took ', time.time() - start_time1, 'seconds (', (time.time() - start_time1)/60, 'minutes)'

    start_time2 = time.time()
    print "Blocking..."
    ## Arrange data into the needed syntax to use in dedupe ##
    blocks = []
    last_key = 0
    count = 0
    bp_block = []
    comp_block = []
    for rec in rec_ids:
        if rec['block_key'] == last_key:
            if rec['record_id'][-1:] != 'c':
                bp_block.append(tuple([rec['record_id'],
                                       {re.sub('_bp', '', your_key): p.proc(rec[your_key])
                                        for your_key in chosen_cols3[1:]},
                                       set(rec['sorted_ids'][:rec['sorted_ids'].index(rec['block_key'])])]))
            else:
                comp_block.append(tuple([rec['record_id'],
                                         comp_d[rec['record_id']],
                                         set(rec['sorted_ids'][:rec['sorted_ids'].index(rec['block_key'])])]))
        else:
            if bp_block != [] and comp_block != []:
                blocks.append((bp_block, comp_block))
            bp_block = []
            comp_block = []
            if rec['record_id'][-1:] == 'c':
                comp_block = [tuple([rec['record_id'],
                                     comp_d[rec['record_id']],
                                     set(rec['sorted_ids'][:rec['sorted_ids'].index(rec['block_key'])])])]
            else:
                bp_block = [tuple([rec['record_id'],
                                   {re.sub('_bp', '', your_key): p.proc(rec[your_key])
                                    for your_key in chosen_cols3[1:]},
                                   set(rec['sorted_ids'][:rec['sorted_ids'].index(rec['block_key'])])])]
        last_key = rec['block_key']
        count += 1
    blocks.append((bp_block, comp_block))
    blocks = tuple(blocks)

    del rec_ids
    del all_data
    con.close()

    # Dynamically create fields according to the chosen columns.
    # We need to see how we can tell our model if we only have name to work with.
    print 'Blocking took ', time.time() - start_time2, 'seconds (', (time.time() - start_time2)/60, 'minutes)'

    start_time3 = time.time()
    print "=============== Initializing Part 2 - Dedupe ===============\n"
    print 'Entering Dedupe ...'

    #================================ DEDUPING WITH BLOCKS ==============================
    if os.path.exists(settings_file):
        print 'Reading from: ', settings_file
        # settings files are written by dedupe in binary mode, so read binary
        with open(settings_file, 'rb') as sf:
            linker = dedupe.StaticRecordLink(sf)
    else:
        linker = dedupe.RecordLink(fields)
        linker.sample(data_d, comp_d)
        if os.path.exists(training_file):
            print 'Reading labeled examples from: ', training_file
            with open(training_file) as tf:
                linker.readTraining(tf)

        print 'Starting Active Labeling...'
        dedupe.consoleLabel(linker)
        linker.train()

        with open(training_file, 'w') as tf:
            linker.writeTraining(tf)
        with open(settings_file, 'wb') as sf:
            linker.writeSettings(sf)

    print 'Creating blocks...'
    threshold = linker.thresholdBlocks(blocks, recall_weight=2)

    print 'Generating clusters...'
    clusters = linker.matchBlocks(blocks, threshold)
    clust = list(clusters)
    print 'Deduping took ', time.time() - start_time3, 'seconds (', (time.time() - start_time3)/60, 'minutes)'

    start_time4 = time.time()
    print 'Writing results to database...'
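    # For reference, the shape handed to thresholdBlocks/matchBlocks above:
    # `blocks` is a tuple of blocks, each block a pair of record lists, one
    # per side of the linkage, and each record a (record_id, record_dict,
    # smaller_block_ids) tuple -- the set holds keys of earlier blocks the
    # record already appeared in, so no pair is compared twice. The field
    # values here are illustrative only:
    #
    #   example_block = (
    #       [('s1', {'name': 'acme corp', 'town': 'london'}, set())],
    #       [('42c', {'name': 'acme corporation', 'town': 'london'}, set())],
    #   )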
    con = psy.connect(database='msi_bp_%s' % (country_code), user='******',
                      host='basil.colours.local', password='******')
    c = con.cursor()
    c.execute('DROP TABLE if exists htc.recordlink_blocks_result')
    con.commit()
    c.execute('CREATE TABLE htc.recordlink_blocks_result (spot_id varchar, comp_id varchar, score double precision)')
    #register_adapter(numpy.int64, addapt_numpy_int64)
    num_cols = 3
    mog = "(" + ("%s," * (num_cols - 1)) + "%s)"
    args_str = ','.join(c.mogrify(mog, x[0] + (float(x[1]),)) for x in clust)
    values = "(" + ','.join(x for x in ['spot_id', 'comp_id', 'score']) + ")"
    c.execute("INSERT INTO htc.recordlink_blocks_result %s VALUES %s" % (values, args_str))

    c.execute('DROP TABLE if exists htc.recordlink_blocks_result_all')
    c.execute("""CREATE TABLE htc.recordlink_blocks_result_all as (
        SELECT adt.line_nr as spot_id, br.comp_id as comp_id, br.score as score,
               CASE WHEN br.comp_id IS NULL THEN 'Not a Client'
                    ELSE 'Check Match Score' END AS "Result",
               adt.name as name_spot, b.name as name_comp, b.auth_nr as auth_nr_comp
        FROM htc.all_data adt
        LEFT JOIN htc.recordlink_blocks_result br ON br.spot_id = adt.line_nr
        LEFT JOIN (SELECT *
                   FROM htc.recordlink_blocks_result br
                   INNER JOIN htc.all_data adt ON adt.line_nr = br.comp_id) b
            ON b.spot_id = adt.line_nr
        WHERE adt.line_nr not like '%c%')""")
    con.commit()
    con.close()
    print 'Writing results took ', time.time() - start_time4, 'seconds (', (time.time() - start_time4)/60, 'minutes)'
    print 'Ended in ', time.time() - start_time, 'seconds (', (time.time() - start_time)/60, 'minutes)'
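
    # A note on the bulk inserts above: concatenating c.mogrify() fragments
    # into one INSERT works, but psycopg2 (>= 2.7) ships execute_values, which
    # does the same batching without manual string assembly. A sketch of the
    # equivalent call for the result rows, assuming the same cursor and
    # `clust` list:
    #
    #     from psycopg2.extras import execute_values
    #
    #     rows = [x[0] + (float(x[1]),) for x in clust]
    #     execute_values(c, "INSERT INTO htc.recordlink_blocks_result "
    #                       "(spot_id, comp_id, score) VALUES %s", rows)
    #     con.commit()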
def main(self):
    # import the specified CSV files
    data_1 = csvhelpers.readData(self.input_1, self.field_names_1,
                                 delimiter=self.delimiter, prefix='input_1')
    data_2 = csvhelpers.readData(self.input_2, self.field_names_2,
                                 delimiter=self.delimiter, prefix='input_2')

    # coerce Price and LatLong fields into the types dedupe expects,
    # in both inputs
    for data in (data_1, data_2):
        for field in self.field_definition:
            if field['type'] == 'Price':
                for key, record in data.items():
                    value = record[field['field']]
                    if value:
                        record[field['field']] = int(float(value))
            elif field['type'] == 'LatLong':
                for key, record in data.items():
                    latitude = record[field['latitude']]
                    longitude = record[field['longitude']]
                    record[field['field']] = (
                        float(latitude), float(longitude)) if latitude and longitude else None

    # sanity check for provided field names in CSV file
    for field in self.field_names_1:
        if field not in list(data_1.values())[0]:
            raise self.parser.error("Could not find field '" + field + "' in input")

    for field in self.field_names_2:
        if field not in list(data_2.values())[0]:
            raise self.parser.error("Could not find field '" + field + "' in input")

    if self.field_names_1 != self.field_names_2:
        for record_id, record in data_2.items():
            remapped_record = {}
            for new_field, old_field in zip(self.field_names_1, self.field_names_2):
                remapped_record[new_field] = record[old_field]
            data_2[record_id] = remapped_record

    logging.info('imported %d rows from file 1', len(data_1))
    logging.info('imported %d rows from file 2', len(data_2))

    logging.info('using fields: %s' % [field['field'] for field in self.field_definition])

    # If --skip_training has been selected, and we have a settings cache still
    # persisting from the last run, use it in this next run.
    # __Note:__ if you want to add more training data, don't use skip training
    if self.skip_training and os.path.exists(self.settings_file):
        # Load our deduper from the last training session cache.
        logging.info('reading from previous training cache %s' % self.settings_file)
        with open(self.settings_file, 'rb') as f:
            deduper = dedupe.StaticRecordLink(f)

        fields = {variable.field for variable in deduper.data_model.primary_fields}
        (nonexact_1, nonexact_2, exact_pairs) = exact_matches(data_1, data_2, fields)
    else:
        # Create a new deduper object and pass our data model to it.
        deduper = dedupe.RecordLink(self.field_definition)

        fields = {variable.field for variable in deduper.data_model.primary_fields}
        (nonexact_1, nonexact_2, exact_pairs) = exact_matches(data_1, data_2, fields)

        # Set up our data sample
        logging.info('taking a sample of %d possible pairs', self.sample_size)
        deduper.sample(nonexact_1, nonexact_2, self.sample_size)

        # Perform standard training procedures
        self.dedupe_training(deduper)

    # ## Blocking
    logging.info('blocking...')

    # ## Clustering
    # Find the threshold that will maximize a weighted average of our
    # precision and recall.  When we set the recall weight to 2, we are
    # saying we care twice as much about recall as we do precision.
    # If we had more data, we would not pass in all the blocked data into
    # this function but a representative sample.
    logging.info('finding a good threshold with a recall_weight of %s' %
                 self.recall_weight)
    threshold = deduper.threshold(data_1, data_2,
                                  recall_weight=self.recall_weight)

    # `match` will return sets of record IDs that dedupe
    # believes are all referring to the same entity.
    logging.info('clustering...')
    clustered_dupes = deduper.match(data_1, data_2, threshold)

    clustered_dupes.extend(exact_pairs)

    logging.info('# duplicate sets %s' % len(clustered_dupes))

    write_function = csvhelpers.writeLinkedResults

    # write out our results
    if self.output_file:
        if sys.version < '3':
            # Python 2's built-in open() takes no encoding argument
            with open(self.output_file, 'wb') as output_file:
                write_function(clustered_dupes, self.input_1, self.input_2,
                               output_file, self.inner_join)
        else:
            with open(self.output_file, 'w', encoding='utf-8') as output_file:
                write_function(clustered_dupes, self.input_1, self.input_2,
                               output_file, self.inner_join)
    else:
        write_function(clustered_dupes, self.input_1, self.input_2,
                       sys.stdout, self.inner_join)
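
# `exact_matches` is used above but not defined in this snippet. Presumably it
# peels off pairs that agree exactly on every primary field, so only the
# remaining records go through sampling and scoring. A sketch of such a
# helper under that assumption (not the original implementation):
def exact_matches(data_1, data_2, match_fields):
    fields = sorted(match_fields)
    # index the first dataset by the tuple of its match-field values
    index_1 = {tuple(rec[f] for f in fields): key
               for key, rec in data_1.items()}

    nonexact_2 = {}
    exact_pairs = []
    matched_1 = set()
    for key, rec in data_2.items():
        signature = tuple(rec[f] for f in fields)
        if signature in index_1:
            # score exact agreements as certain matches
            exact_pairs.append(((index_1[signature], key), 1.0))
            matched_1.add(index_1[signature])
        else:
            nonexact_2[key] = rec

    nonexact_1 = {k: v for k, v in data_1.items() if k not in matched_1}
    return nonexact_1, nonexact_2, exact_pairs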
def link_dataframes(dfa, dfb, field_properties, config_name="link_dataframes", n_cores=None):
    config_name = config_name.replace(" ", "_")

    settings_file = config_name + '_learned_settings'
    training_file = config_name + '_training.json'

    print('Importing data ...')

    dfa = clean_punctuation(dfa)
    specify_type(dfa, field_properties)

    dfa['index_field'] = dfa.index
    dfa['index_field'] = dfa['index_field'].apply(lambda x: "dfa" + str(x))
    dfa.set_index(['index_field'], inplace=True)

    data_1 = dfa.to_dict(orient='index')

    dfb = clean_punctuation(dfb)
    specify_type(dfb, field_properties)

    dfb['index_field'] = dfb.index
    dfb['index_field'] = dfb['index_field'].apply(lambda x: "dfb" + str(x))
    dfb.set_index(['index_field'], inplace=True)

    data_2 = dfb.to_dict(orient='index')
    # ---------------------------------------------------------------------------------

    # ## Training

    if os.path.exists(settings_file):
        print('Reading from', settings_file)
        with open(settings_file, 'rb') as sf:
            linker = dedupe.StaticRecordLink(sf, num_cores=n_cores)
    else:
        # Define the fields the linker will pay attention to;
        # select_fields builds the dedupe field definitions from
        # field_properties.
        fields = []
        select_fields(fields, field_properties)

        # Create a new linker object and pass our data model to it.
        linker = dedupe.RecordLink(fields, num_cores=n_cores)

        # If we have training data saved from a previous run of linker,
        # look for it and load it in.
        # __Note:__ if you want to train from scratch, delete the training_file
        if os.path.exists(training_file):
            print('Reading labeled examples from ', training_file)
            with open(training_file) as tf:
                linker.prepare_training(data_1, data_2,
                                        training_file=tf, sample_size=15000)
        else:
            # To train the linker, we feed it a sample of records.
            linker.prepare_training(data_1, data_2, sample_size=15000)

        # ## Active learning
        # Dedupe will find the next pair of records
        # it is least certain about and ask you to label them as matches
        # or not.
        # use 'y', 'n' and 'u' keys to flag duplicates
        # press 'f' when you are finished
        print('Starting active labeling...')

        dedupe.console_label(linker)

        linker.train()

        # When finished, save our training away to disk
        with open(training_file, 'w') as tf:
            linker.write_training(tf)

        # Save our weights and predicates to disk.  If the settings file
        # exists, we will skip all the training and learning next time we run
        # this file.
        with open(settings_file, 'wb') as sf:
            linker.write_settings(sf)

    # ## Blocking
    # ## Clustering
    # Find the threshold that will maximize a weighted average of our
    # precision and recall.  When we set the recall weight to 2, we are
    # saying we care twice as much about recall as we do precision.
    #
    # If we had more data, we would not pass in all the blocked data into
    # this function but a representative sample.
    print('Clustering...')
    linked_records = linker.join(data_1, data_2, 0)

    print('# duplicate sets', len(linked_records))

    # Convert linked records into a dataframe
    df_linked_records = pd.DataFrame(linked_records)

    df_linked_records['dfa_link'] = df_linked_records[0].apply(lambda x: x[0])
    df_linked_records['dfb_link'] = df_linked_records[0].apply(lambda x: x[1])
    df_linked_records.rename(columns={1: 'confidence'}, inplace=True)
    df_linked_records.drop(columns=[0], inplace=True)
    df_linked_records['cluster id'] = df_linked_records.index

    # For both dfa & dfb, add cluster id & confidence score from linked_records
    dfa.index.rename('dfa_link', inplace=True)
    dfa = dfa.merge(df_linked_records, on='dfa_link', how='left')

    dfb.index.rename('dfb_link', inplace=True)
    dfb = dfb.merge(df_linked_records, on='dfb_link', how='left')

    # Concatenate results from dfa + dfb
    # (pd.concat replaces DataFrame.append, which is deprecated)
    df_final = pd.concat([dfa, dfb], ignore_index=True, sort=True)
    df_final = df_final.sort_values(by=['cluster id'])
    df_final = df_final.drop(columns=['dfa_link', 'dfb_link'])

    return df_final
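
# A minimal usage sketch for link_dataframes. The field_properties format is
# whatever the clean_punctuation/specify_type/select_fields helpers expect;
# a flat list of column names is assumed here purely for illustration.
import pandas as pd

dfa = pd.DataFrame({'name': ['Acme Corp', 'Widget Ltd'],
                    'city': ['London', 'Leeds']})
dfb = pd.DataFrame({'name': ['ACME Corporation', 'Widgets Limited'],
                    'city': ['london', 'leeds']})

linked = link_dataframes(dfa, dfb, ['name', 'city'],
                         config_name="acme_widgets_demo")
print(linked[['name', 'city', 'cluster id', 'confidence']])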
settings_file = 'canonical_data_matching_learned_settings'

data_1, header = canonicalImport('tests/datasets/restaurant-1.csv')
data_2, _ = canonicalImport('tests/datasets/restaurant-2.csv')

training_pairs = dedupe.trainingDataLink(data_1, data_2, 'unique_id', 5000)

duplicates_s = set(frozenset(pair) for pair in training_pairs['match'])

t0 = time.time()

print 'number of known duplicate pairs', len(duplicates_s)

if os.path.exists(settings_file):
    deduper = dedupe.StaticRecordLink(settings_file)
else:
    fields = {'name': {'type': 'String'},
              'address': {'type': 'String'},
              'cuisine': {'type': 'String'},
              'city': {'type': 'String'}}