def train_linker(df1, df2):
    if READ_FROM_SETTINGS_FILE:
        with open(SETTINGS_FILE, 'rb') as sf:
            linker = dedupe.StaticRecordLink(sf)
    else:
        linker = dedupe.RecordLink(FIELDS)
        # It's terrible that you have to do this next line!!!!
        linker.classifier = rlr.RegularizedLogisticRegression()

        linker.sample(df1, df2, BLOCKING_TRAINING_SAMPLE_SIZE)

        if READ_FROM_TRAINING_FILE:
            print('reading labeled examples from ', TRAINING_FILE)
            with open(TRAINING_FILE, 'rb') as tf:
                linker.readTraining(tf)
        else:
            dedupe.consoleLabel(linker)

        linker.train()

    if WRITE_SETTINGS_FILE:
        with open(SETTINGS_FILE, 'wb') as sf:
            linker.writeSettings(sf)
    if WRITE_TRAINING_FILE:
        with open(TRAINING_FILE, 'w') as tf:
            linker.writeTraining(tf)

    return linker
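# The snippet above targets the older camelCase dedupe API (sample/readTraining/
# writeTraining/writeSettings). Below is a minimal sketch of the same flow using
# the dedupe 2.x method names that appear in later snippets in this section
# (prepare_training, console_label, write_training, write_settings). The
# module-level flags and file-name constants mirror the ones above and are
# assumptions for illustration, not part of the dedupe library itself.
import dedupe

def train_linker_v2(data_1, data_2):
    if READ_FROM_SETTINGS_FILE:
        with open(SETTINGS_FILE, 'rb') as sf:
            return dedupe.StaticRecordLink(sf)

    linker = dedupe.RecordLink(FIELDS)
    if READ_FROM_TRAINING_FILE:
        with open(TRAINING_FILE) as tf:
            linker.prepare_training(data_1, data_2, training_file=tf,
                                    sample_size=BLOCKING_TRAINING_SAMPLE_SIZE)
    else:
        linker.prepare_training(data_1, data_2,
                                sample_size=BLOCKING_TRAINING_SAMPLE_SIZE)
        dedupe.console_label(linker)  # interactive labeling in the console
    linker.train()

    if WRITE_TRAINING_FILE:
        with open(TRAINING_FILE, 'w') as tf:
            linker.write_training(tf)
    if WRITE_SETTINGS_FILE:
        with open(SETTINGS_FILE, 'wb') as sf:
            linker.write_settings(sf)
    return linker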
def executor(data1, data2):
    # fields1 and threshold are expected to be defined at module level.
    input1 = {
        i: {fields1[j]: value for j, value in enumerate(row)}
        for i, row in enumerate(data1)
    }
    input2 = {
        i: {fields1[j]: value for j, value in enumerate(row)}
        for i, row in enumerate(data2)
    }
    fields = [{'field': field, 'type': 'String'} for field in fields1]

    linker = dedupe.RecordLink(fields)
    linker.prepare_training(input1, input2, sample_size=1500)

    while True:
        labelling(linker)
        try:
            linker.train()
            break
        except ValueError:
            sys.stderr.write('\nYou need to do more training.\n')

    pairs = linker.join(input1, input2, threshold, 'many-to-many')

    matches = []
    for pair in pairs:
        matches.append((pair[0][0], pair[0][1], pair[1]))
    return matches
def train(self):
    # data_d = self._read_data(self.input_file)
    # self.data_d = data_d
    data_1 = self._read_data(self.input_file_base)
    data_2 = self._read_data(self.input_file)
    self.data_1 = data_1
    self.data_2 = data_2

    fields = self.fields
    deduper = dedupe.RecordLink(fields)
    training_file = self.training_file

    # ## Training data
    if os.path.exists(training_file):
        logging.info('reading labeled examples from %s', training_file)
        with open(training_file, 'rb') as f:
            deduper.prepare_training(data_1, data_2, f)
    else:
        deduper.prepare_training(data_1, data_2)

    print('starting active labeling...')
    dedupe.consoleLabel(deduper)
    deduper.train(recall=1)

    with open(training_file, 'w') as tf:
        deduper.writeTraining(tf)

    self.deduper = deduper
def match(data1, data2, fields1, fields2, threshold):
    # The threshold argument is not used; it is calculated automatically below.
    input1 = {
        i: {fields1[j]: value for j, value in enumerate(row)}
        for i, row in enumerate(data1)
    }
    input2 = {
        i: {fields1[j]: value for j, value in enumerate(row)}
        for i, row in enumerate(data2)
    }
    fields = [{'field': field, 'type': 'String'} for field in fields1]

    linker = dedupe.RecordLink(fields)
    linker.sample(input1, input2, sample_size=1500)

    while True:
        labelling(linker)
        try:
            linker.train()
            break
        except ValueError:
            sys.stderr.write('\nYou need to do more training.\n')

    threshold = linker.threshold(input1, input2, recall_weight=1)
    pairs = linker.match(input1, input2, threshold)

    matches = []
    for pair in pairs:
        matches.append((pair[0][0], pair[0][1], pair[1]))
    return matches
def setUp(self):
    random.seed(123)
    field_definition = [{'field': 'name', 'type': 'String'},
                        {'field': 'age', 'type': 'String'}]
    self.linker = dedupe.RecordLink(field_definition)
def setUp(self):
    random.seed(123)
    fields = {
        'name': {'type': 'String'},
        'age': {'type': 'String'},
    }
    self.linker = dedupe.RecordLink(fields)
def train(self, data_1, data_2, training_file):
    fields = self.fields
    deduper = dedupe.RecordLink(fields)

    # ## Training data
    if os.path.exists(training_file):
        logging.info('reading labeled examples from %s', training_file)
        with open(training_file, 'rb') as f:
            deduper.prepare_training(data_1, data_2, f)
    else:
        # deduper.prepare_training(data_1, data_2)
        raise Exception("no training data")

    # print('starting active labeling...')
    # dedupe.consoleLabel(deduper)
    deduper.train(recall=1)

    self.deduper = deduper
    return self
def match(data1, data2, fields1, fields2):
    data2_remapped = remap(data2, fields1, fields2) if fields1 != fields2 else data2

    fields = [{'field': field, 'type': 'String'} for field in fields1]

    linker = dedupe.RecordLink(fields)
    linker.sample(data1, data2_remapped, sample_size=1500)

    while True:
        labelling(linker)
        try:
            linker.train()
            break
        except ValueError:
            sys.stderr.write('\nYou need to do more training.\n')

    threshold = linker.threshold(data1, data2_remapped, recall_weight=1)
    pairs = linker.match(data1, data2_remapped, threshold)

    matches = []
    for pair in pairs:
        matches.append((pair[0][0], pair[0][1], pair[1]))
    return matches
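# `remap` is not shown in the snippet above. A plausible sketch, inferred from
# the inline remapping loops in the csvlink-style examples further down (records
# keyed by the second file's field names are re-keyed to the first file's
# names). Treat this as an assumption about the helper, not its actual
# definition.
def remap(data2, fields1, fields2):
    remapped = {}
    for record_id, record in data2.items():
        # re-key each record from the second schema onto the first one
        remapped[record_id] = {new_field: record[old_field]
                               for new_field, old_field in zip(fields1, fields2)}
    return remapped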
def link_dataframes(dfa, dfb, field_properties, config_name="link_dataframes", n_cores=None):

    config_name = config_name.replace(" ", "_")

    settings_file = config_name + '_learned_settings'
    training_file = config_name + '_training.json'

    print('Importing data ...')

    dfa = clean_punctuation(dfa)
    specify_type(dfa, field_properties)

    dfa['index_field'] = dfa.index
    dfa['index_field'] = dfa['index_field'].apply(lambda x: "dfa" + str(x))
    dfa.set_index(['index_field'], inplace=True)

    data_1 = dfa.to_dict(orient='index')

    dfb = clean_punctuation(dfb)
    specify_type(dfb, field_properties)

    dfb['index_field'] = dfb.index
    dfb['index_field'] = dfb['index_field'].apply(lambda x: "dfb" + str(x))
    dfb.set_index(['index_field'], inplace=True)

    data_2 = dfb.to_dict(orient='index')

    # ---------------------------------------------------------------------------------

    # ## Training

    if os.path.exists(settings_file):
        print('Reading from', settings_file)
        with open(settings_file, 'rb') as sf:
            linker = dedupe.StaticRecordLink(sf, num_cores=n_cores)
    else:
        # Define the fields the linker will pay attention to.
        fields = []
        select_fields(fields, field_properties)

        # Create a new linker object and pass our data model to it.
        linker = dedupe.RecordLink(fields, num_cores=n_cores)

        # If we have training data saved from a previous run of the linker,
        # look for it and load it in.
        # __Note:__ if you want to train from scratch, delete the training_file.
        if os.path.exists(training_file):
            print('Reading labeled examples from ', training_file)
            with open(training_file) as tf:
                linker.prepare_training(data_1, data_2,
                                        training_file=tf,
                                        sample_size=15000)
        else:
            # To train the linker, we feed it a sample of records.
            linker.prepare_training(data_1, data_2, sample_size=15000)

        # ## Active learning
        # Dedupe will find the next pair of records
        # it is least certain about and ask you to label them as matches
        # or not.
        # Use the 'y', 'n' and 'u' keys to flag duplicates;
        # press 'f' when you are finished.
        print('Starting active labeling...')
        dedupe.console_label(linker)
        linker.train()

        # When finished, save our training away to disk.
        with open(training_file, 'w') as tf:
            linker.write_training(tf)

        # Save our weights and predicates to disk. If the settings file
        # exists, we will skip all the training and learning next time we run
        # this file.
        with open(settings_file, 'wb') as sf:
            linker.write_settings(sf)

    # ## Blocking

    # ## Clustering

    # Find the threshold that will maximize a weighted average of our
    # precision and recall. When we set the recall weight to 2, we are
    # saying we care twice as much about recall as we do precision.
    #
    # If we had more data, we would not pass in all the blocked data into
    # this function but a representative sample.
    print('Clustering...')
    linked_records = linker.join(data_1, data_2, 0)

    print('# duplicate sets', len(linked_records))

    # Convert linked records into a dataframe
    df_linked_records = pd.DataFrame(linked_records)

    df_linked_records['dfa_link'] = df_linked_records[0].apply(lambda x: x[0])
    df_linked_records['dfb_link'] = df_linked_records[0].apply(lambda x: x[1])
    df_linked_records.rename(columns={1: 'confidence'}, inplace=True)
    df_linked_records.drop(columns=[0], inplace=True)
    df_linked_records['cluster id'] = df_linked_records.index

    # For both dfa & dfb, add cluster id & confidence score from linked_records
    dfa.index.rename('dfa_link', inplace=True)
    dfa = dfa.merge(df_linked_records, on='dfa_link', how='left')

    dfb.index.rename('dfb_link', inplace=True)
    dfb = dfb.merge(df_linked_records, on='dfb_link', how='left')

    # Concatenate results from dfa + dfb
    df_final = dfa.append(dfb, ignore_index=True, sort=True)
    df_final = df_final.sort_values(by=['cluster id'])
    df_final = df_final.drop(columns=['dfa_link', 'dfb_link'])

    return df_final
if os.path.exists(settings_file):
    print('reading from', settings_file)
    with open(settings_file, 'rb') as sf:
        linker = dedupe.StaticRecordLink(sf)
else:
    fields = [
        {'field': 'title', 'type': 'String'},
        {'field': 'title', 'type': 'Text', 'corpus': descriptions()},
        {'field': 'description', 'type': 'Text',
         'has missing': True, 'corpus': descriptions()},
        {'field': 'price', 'type': 'Price', 'has missing': True}]

    linker = dedupe.RecordLink(fields)

    linker.sample(data_1, data_2, 15000)

    if os.path.exists(training_file):
        print('reading labeled examples from ', training_file)
        with open(training_file) as tf:
            linker.readTraining(tf)

    # Dedupe will find the next pair of records it is least certain about
    # and ask you to label them as matches or not.
    # Use the 'y', 'n' and 'u' keys to flag duplicates;
    # press 'f' when you are finished.
    print('starting active labeling...')
    dedupe.consoleLabel(linker)
def main_dedupe_method(data_d, spot_d, spot_data, chosen_cols3, country_code, settings_file, training_file, fields):
    print '''#############  Welcome to our Matching Algorithm  ################
    =============== Initializing Part 1 - Data Retrieving =================\n'''

    start_time = time.time()

    print "Getting data from BP companies..."

    # Select all companies from selected country
    con = psy.connect(database='msi_bp_%s' % (country_code), user='******',
                      host='basil.colours.local', password='******')
    con.set_client_encoding('UTF8')
    c = con.cursor(cursor_factory=psy.extras.RealDictCursor)

    q_companies = '''SELECT --DISTINCT on (auth.company_int_id_number)
        CONCAT(ROW_NUMBER() OVER(ORDER BY auth.company_int_id_number),'c') AS line_nr,
        auth.current_authority_pk as auth_nr,
        auth.s_acct_name as "name",
        auth.s_tax_number as vat,
        ads.address_line1 as address,
        ads.address_line3 as town,
        ads.postcode as postcode
        FROM gdm.authorities auth
        LEFT JOIN gdm.addresses ads ON ads.address_id_pk = auth.s_address_id
        WHERE auth.s_acct_name is not null
        ORDER BY auth.company_int_id_number'''
    c.execute(q_companies)
    companies = c.fetchall()

    comp_d = {}
    for row in companies:
        clean_row = {ky: p.proc(row[ky]) for ky in chosen_cols3[1:]}
        row_id = row['line_nr']
        comp_d[row_id] = dict(clean_row)

    print "Saving all data into database..."

    all_data_d = md.merge_dicts(spot_d, comp_d)

    all_data = []
    for k, v in all_data_d.iteritems():
        new_row = [v[ky] for ky in chosen_cols3[1:]]
        new_row.insert(0, k)
        all_data.append(new_row)

    c2 = con.cursor()
    c2.execute('DROP TABLE IF EXISTS htc.all_data')
    field_string = ','.join('%s varchar(200)' % name for name in chosen_cols3)
    c2.execute('''CREATE TABLE htc.all_data (%s)''' % (field_string))
    num_cols = len(chosen_cols3)
    mog = "(" + ("%s," * (num_cols - 1)) + "%s)"
    args_str = ','.join(c2.mogrify(mog, x) for x in all_data)
    values = "(" + ','.join(x for x in chosen_cols3) + ")"
    c2.execute("INSERT INTO htc.all_data %s VALUES %s" % (values, args_str))
    con.commit()

    print 'Data retrieval took ', time.time() - start_time, 'seconds (', (time.time() - start_time)/60, 'minutes)'

    start_time1 = time.time()

    print "Arranging data to create blocks..."

    u_keys = set()
    for n, m in all_data_d.iteritems():
        for k, v in m.iteritems():
            u_keys.add(v)
    u_keys = list(u_keys)

    block_keys = {}
    count = 0
    for k in u_keys:
        block_keys[k] = count
        count += 1

    print 'Checking blocks...'

    ids = []
    for k, v in all_data_d.items():
        ids.append([block_keys[v['name']], v['name'], k])
        if 'postcode' in v.keys():
            if v['postcode'] != None:
                ids.append([block_keys[v['postcode']], v['postcode'], k])
        if 'town' in v.keys():
            if v['town'] != None:
                ids.append([block_keys[v['town']], v['town'], k])

    print 'Writing tables...'
    column_names = ['block_key', 'block_id', 'record_id']
    c3 = con.cursor()

    print 'Writing htc.blocks_recordlink_test'
    ## Table with all blocks and companies ##
    c3.execute('DROP TABLE IF EXISTS htc.blocks_recordlink_test')
    field_string = ','.join('%s varchar(200)' % name for name in column_names)
    c3.execute('''CREATE TABLE htc.blocks_recordlink_test (%s)''' % (field_string))
    num_cols = len(column_names)
    mog = "(" + ("%s," * (num_cols - 1)) + "%s)"
    args_str = ','.join(c3.mogrify(mog, x) for x in ids)
    values = "(" + ','.join(x for x in column_names) + ")"
    c3.execute("INSERT INTO htc.blocks_recordlink_test %s VALUES %s" % (values, args_str))
    con.commit()

    print 'Writing htc.plural_blocks'
    ## Table with all blocks that have more than one company ##
    c3.execute("DROP TABLE IF EXISTS htc.plural_blocks")
    c3.execute("DROP TABLE IF EXISTS htc.covered_blocks")
    c3.execute("""CREATE TABLE htc.plural_blocks as (
        SELECT b.block_id
        FROM htc.blocks_recordlink_test b
        INNER JOIN (SELECT DISTINCT block_id FROM htc.blocks_recordlink_test
                    WHERE record_id like '%c') bp ON bp.block_id = b.block_id
        INNER JOIN (SELECT DISTINCT block_id FROM htc.blocks_recordlink_test
                    WHERE record_id not like '%c') comp ON comp.block_id = b.block_id
        GROUP BY b.block_id
        HAVING COUNT(*) > 1)""")
    con.commit()

    print 'Writing htc.covered_blocks'
    ## Table with list of blocks for each company ##
    c3.execute("""CREATE TABLE htc.covered_blocks as (
        SELECT record_id, array_agg(r.block_key ORDER BY r.block_key) AS sorted_ids
        FROM htc.blocks_recordlink_test r
        INNER JOIN htc.plural_blocks bl ON r.block_id = bl.block_id
        GROUP BY record_id)""")
    con.commit()

    c3 = con.cursor(cursor_factory=psy.extras.RealDictCursor)

    print "Blocking..."
    ## Get all companies and blocks ##
    c3.execute("""SELECT br.record_id, m.*, br.block_key, cb.sorted_ids
        FROM htc.blocks_recordlink_test br
        LEFT JOIN (SELECT * FROM htc.all_data) m ON m.line_nr = br.record_id
        INNER JOIN htc.covered_blocks cb ON br.record_id = cb.record_id
        INNER JOIN htc.plural_blocks pb ON br.block_id = pb.block_id
        ORDER BY br.block_key""")
    rec_ids = c3.fetchall()

    print 'Writing tables took ', time.time() - start_time1, 'seconds (', (time.time() - start_time1)/60, 'minutes)'

    start_time2 = time.time()

    print "Blocking..."
    ## Arrange data into the needed syntax to use in dedupe ##
    blocks = []
    last_key = 0
    count = 0
    bp_block = []
    comp_block = []

    for rec in rec_ids:
        if rec['block_key'] == last_key:
            if rec['record_id'][-1:] != 'c':
                bp_block.append(tuple([rec['record_id'],
                                       {re.sub('_bp', '', your_key): p.proc(rec[your_key]) for your_key in chosen_cols3[1:]},
                                       set(rec['sorted_ids'][:rec['sorted_ids'].index(rec['block_key'])])]))
            else:
                comp_block.append(tuple([rec['record_id'],
                                         comp_d[rec['record_id']],
                                         set(rec['sorted_ids'][:rec['sorted_ids'].index(rec['block_key'])])]))
        else:
            if bp_block != [] and comp_block != []:
                blocks.append((bp_block, comp_block))
            bp_block = []
            comp_block = []
            if rec['record_id'][-1:] == 'c':
                comp_block = [tuple([rec['record_id'],
                                     comp_d[rec['record_id']],
                                     set(rec['sorted_ids'][:rec['sorted_ids'].index(rec['block_key'])])])]
            else:
                bp_block = [tuple([rec['record_id'],
                                   {re.sub('_bp', '', your_key): p.proc(rec[your_key]) for your_key in chosen_cols3[1:]},
                                   set(rec['sorted_ids'][:rec['sorted_ids'].index(rec['block_key'])])])]
        last_key = rec['block_key']
        count += 1

    blocks.append((bp_block, comp_block))
    blocks = tuple(blocks)

    del rec_ids
    del all_data

    con.close()

    # Dynamically create fields according to the chosen columns.
    # We still need to decide what to tell our model when only the name is available.

    print 'Blocking took ', time.time() - start_time2, 'seconds (', (time.time() - start_time2)/60, 'minutes)'

    start_time3 = time.time()

    print "=============== Initializing Part 2 - Dedupe ===============\n"
    print 'Entering Dedupe ...'

    # ================================ DEDUPING WITH BLOCKS ==============================================
    if os.path.exists(settings_file):
        print 'Reading from: ', settings_file
        with open(settings_file) as sf:
            linker = dedupe.StaticRecordLink(sf)
    else:
        linker = dedupe.RecordLink(fields)
        linker.sample(data_d, comp_d)

        if os.path.exists(training_file):
            print 'Reading labeled examples from: ', training_file
            with open(training_file) as tf:
                linker.readTraining(tf)

        print 'Starting Active Labeling...'
        dedupe.consoleLabel(linker)

        linker.train()

        with open(training_file, 'w') as tf:
            linker.writeTraining(tf)
        with open(settings_file, 'w') as sf:
            linker.writeSettings(sf)

    print 'Creating blocks...'
    threshold = linker.thresholdBlocks(blocks, recall_weight=2)

    print 'Generating clusters...'
    clusters = linker.matchBlocks(blocks, threshold)
    clust = list(clusters)

    print 'Deduping took ', time.time() - start_time3, 'seconds (', (time.time() - start_time3)/60, 'minutes)'

    start_time4 = time.time()
    print 'Writing results to database...'
    con = psy.connect(database='msi_bp_%s' % (country_code), user='******',
                      host='basil.colours.local', password='******')
    c = con.cursor()
    c.execute('DROP TABLE if exists htc.recordlink_blocks_result')
    con.commit()
    c.execute('CREATE TABLE htc.recordlink_blocks_result (spot_id varchar, comp_id varchar, score double precision)')

    # register_adapter(numpy.int64, addapt_numpy_int64)
    num_cols = 3
    mog = "(" + ("%s," * (num_cols - 1)) + "%s)"
    args_str = ','.join(c.mogrify(mog, x[0] + (float(x[1]),)) for x in clust)
    values = "(" + ','.join(x for x in ['spot_id', 'comp_id', 'score']) + ")"
    c.execute("INSERT INTO htc.recordlink_blocks_result %s VALUES %s" % (values, args_str))

    c.execute('DROP TABLE if exists htc.recordlink_blocks_result_all')
    c.execute("""CREATE TABLE htc.recordlink_blocks_result_all as (
        SELECT adt.line_nr as spot_id, br.comp_id as comp_id, br.score as score,
               CASE WHEN br.comp_id IS NULL THEN 'Not a Client'
                    ELSE 'Check Match Score' END AS "Result",
               adt.name as name_spot, b.name as name_comp, b.auth_nr as auth_nr_comp
        FROM htc.all_data adt
        LEFT JOIN htc.recordlink_blocks_result br ON br.spot_id = adt.line_nr
        LEFT JOIN (SELECT * FROM htc.recordlink_blocks_result br
                   INNER JOIN htc.all_data adt ON adt.line_nr = br.comp_id) b
              ON b.spot_id = adt.line_nr
        WHERE adt.line_nr not like '%c%')""")
    con.commit()
    con.close()

    print 'Writing results took ', time.time() - start_time4, 'seconds (', (time.time() - start_time4)/60, 'minutes)'
    print 'Ended in ', time.time() - start_time, 'seconds (', (time.time() - start_time)/60, 'minutes)'
def get_resolved_df(data_1, data_2, fields):

    # Training
    if os.path.exists(settings_file):
        print('reading from', settings_file)
        with open(settings_file, 'rb') as sf:
            linker = dedupe.StaticRecordLink(sf)
    else:
        # Create a new linker object and pass our data model to it.
        linker = dedupe.RecordLink(fields)
        # To train the linker, we feed it a sample of records.
        linker.sample(data_1, data_2, 15000)

        # If we have training data saved from a previous run of the linker,
        # look for it and load it in.
        if os.path.exists(training_file):
            print('reading labeled examples from ', training_file)
            with open(training_file) as tf:
                linker.readTraining(tf)

        # ## Active learning
        print('starting active labeling...')
        dedupe.consoleLabel(linker)
        linker.train()

        # When finished, save our training away to disk
        with open(training_file, 'w') as tf:
            linker.writeTraining(tf)

        # Save our weights and predicates to disk. If the settings file
        # exists, we will skip all the training and learning next time we run
        # this file.
        with open(settings_file, 'wb') as sf:
            linker.writeSettings(sf)

    # ## Clustering
    print('clustering...')
    linked_records = linker.match(data_1, data_2, 0)

    print('# duplicate sets', len(linked_records))
    print(linked_records)

    # ## Writing Results
    df_list = []
    for cluster, score in linked_records:
        # obtain filename + record number for each side of the cluster
        cluster1 = cluster[0]
        cluster2 = cluster[1]

        match = re.match(r"(\w+)\.csv(\d+)", cluster1)
        filename, idx = match.groups()
        filename += '.csv'
        filename = pr.resource_filename('src.input', filename)
        idx = int(idx)
        print(filename)
        print(idx)

        # dataframe for cluster 1
        df1 = pd.DataFrame(columns=pd.read_csv(filename).columns)
        df1.loc[0] = pd.read_csv(filename).ix[idx]

        # for cluster 2
        match = re.match(r"(\w+)\.csv(\d+)", cluster2)
        filename, idx = match.groups()
        filename += '.csv'
        filename = pr.resource_filename('src.input', filename)
        idx = int(idx)
        print(filename)
        print(idx)

        # dataframe for cluster 2
        df2 = pd.DataFrame(columns=pd.read_csv(filename).columns)
        df2.loc[0] = pd.read_csv(filename).ix[idx]

        # common attribute
        df2['score'] = [score]
        df1['score'] = [score]
        df = pd.merge(df1, df2, on='score')
        df_list.append(df)

    results_df = pd.concat(df_list, ignore_index=True)
    return results_df
        'has missing': True
    },
]

# Pick appropriate schema
if asset_class == "bonds":
    fields = common_fields + bond_only_fields
else:
    fields = common_fields

# Report the schema
logger.debug("Linkage Schema")
logger.debug(fields)

# Create a new linker object and pass our data model to it
linker = dedupe.RecordLink(fields, num_cores=num_cores)

# Set up the regularized logistic regression model
linker.classifier = RegularizedLogisticRegression()

# To train the linker, we feed it a sample of records
random.seed(1)
np.random.seed(1)
linker.sample(bad_data_dict, good_data_dict, 10000)

# Look for previous training data to bootstrap the linker
if os.path.exists(training_file):
    logger.info('Reading labeled examples from {}'.format(training_file))
    with open(training_file) as tf:
        training_pairs_prev = read_training(tf)
else:
def _merge_features(fields, g1_features, g2_features):
    while True:
        if len(g1_features) == len(g2_features) == 1:
            # Only one record on each side: link them directly with full confidence.
            linked_records = [
                ((list(g1_features.keys())[0], list(g2_features.keys())[0]), 1.0)
            ]
            g1_features = {}
            g2_features = {}
            break

        linker = dedupe.RecordLink(fields)
        linker.prepare_training(g1_features, g2_features)
        confirmed_matches = console_label(linker)
        linker.train()
        linked_records = linker.join(
            g1_features, g2_features, 0.0, constraint="one-to-one"
        )

        # remove records from linked_records that are in confirmed_matches
        for (e1, e2) in confirmed_matches:
            idx = 0
            while idx < len(linked_records):
                pair, _ = linked_records[idx]
                if e1 in pair or e2 in pair:
                    linked_records.pop(idx)
                else:
                    idx += 1

        # replace linked record scores with 1.0 if the user explicitly
        # marked them as equivalent. Then fill in other user-marked pairs
        # of linked records at the end
        idx = 0
        while idx < len(confirmed_matches):
            pair = confirmed_matches[idx]
            for lidx, (lpair, _) in enumerate(linked_records):
                if pair == lpair:
                    linked_records[lidx] = (pair, 1.0)
                    confirmed_matches.pop(idx)
                    idx -= 1  # cancel out the increment
                    break
            idx += 1
        linked_records.extend([(pair, 1.0) for pair in confirmed_matches])

        print(
            Style.BRIGHT + Fore.YELLOW + "Is this matching correct?" + Style.RESET_ALL
        )
        for (e1, e2), similarity in linked_records:
            for field in unique(field["field"] for field in fields):
                g1_val = g1_features[e1][field]
                g2_val = g2_features[e2][field]
                print(f"{g1_val:<50} | {g2_val:<50}")
            print(f"Similarity: {similarity}")
            print("-" * 20)

        ans = input(Fore.YELLOW + "[y/n]? " + Style.RESET_ALL)
        if ans.lower() == "y":
            print(
                Fore.GREEN
                + "All correct! Moving on to any stragglers"
                + Style.RESET_ALL
            )
            break
        else:
            print(Fore.RED + "Re-labeling..." + Style.RESET_ALL)

    leftover_g1 = {}
    leftover_g2 = {}
    linked_entities = _unpack_linked_records(linked_records)
    if len(linked_entities) != len(g1_features) or len(linked_entities) != len(
        g2_features
    ):
        leftover_g1 = set(g1_features.keys()).difference(linked_entities)
        leftover_g2 = set(g2_features.keys()).difference(linked_entities)
        leftover_g1 = {k: v for (k, v) in g1_features.items() if k in leftover_g1}
        leftover_g2 = {k: v for (k, v) in g2_features.items() if k in leftover_g2}

    return linked_records, leftover_g1, leftover_g2
def main(self):

    data_1 = {}
    data_2 = {}
    # import the specified CSV files
    data_1 = csvhelpers.readData(self.input_1, self.field_names_1,
                                 delimiter=self.delimiter,
                                 prefix='input_1')
    data_2 = csvhelpers.readData(self.input_2, self.field_names_2,
                                 delimiter=self.delimiter,
                                 prefix='input_2')

    # fix price
    for field in self.field_definition:
        if field['type'] == 'Price':
            for key, record in data_1.items():
                value = record[field['field']]
                if value:
                    record[field['field']] = int(float(value))

    # fix latlon
    for field in self.field_definition:
        if field['type'] == 'LatLong':
            for key, record in data_1.items():
                latitude = record[field['latitude']]
                longitude = record[field['longitude']]
                record[field['field']] = (
                    float(latitude), float(longitude)) if latitude and longitude else None

    # fix price
    for field in self.field_definition:
        if field['type'] == 'Price':
            for key, record in data_2.items():
                value = record[field['field']]
                if value:
                    record[field['field']] = int(float(value))

    # fix latlon
    for field in self.field_definition:
        if field['type'] == 'LatLong':
            for key, record in data_2.items():
                latitude = record[field['latitude']]
                longitude = record[field['longitude']]
                record[field['field']] = (
                    float(latitude), float(longitude)) if latitude and longitude else None

    # sanity check for provided field names in CSV file
    for field in self.field_names_1:
        if field not in list(data_1.values())[0]:
            raise self.parser.error("Could not find field '" + field + "' in input")

    for field in self.field_names_2:
        if field not in list(data_2.values())[0]:
            raise self.parser.error("Could not find field '" + field + "' in input")

    if self.field_names_1 != self.field_names_2:
        for record_id, record in data_2.items():
            remapped_record = {}
            for new_field, old_field in zip(self.field_names_1,
                                            self.field_names_2):
                remapped_record[new_field] = record[old_field]
            data_2[record_id] = remapped_record

    logging.info('imported %d rows from file 1', len(data_1))
    logging.info('imported %d rows from file 2', len(data_2))

    logging.info('using fields: %s' % [field['field']
                                       for field in self.field_definition])

    # If --skip_training has been selected, and we have a settings cache still
    # persisting from the last run, use it in this next run.
    # __Note:__ if you want to add more training data, don't use skip training
    if self.skip_training and os.path.exists(self.settings_file):

        # Load our deduper from the last training session cache.
        logging.info('reading from previous training cache %s'
                     % self.settings_file)
        with open(self.settings_file, 'rb') as f:
            deduper = dedupe.StaticRecordLink(f)

        fields = {variable.field
                  for variable in deduper.data_model.primary_fields}
        (nonexact_1,
         nonexact_2,
         exact_pairs) = exact_matches(data_1, data_2, fields)

    else:
        # # Create a new deduper object and pass our data model to it.
        deduper = dedupe.RecordLink(self.field_definition)

        fields = {variable.field
                  for variable in deduper.data_model.primary_fields}
        (nonexact_1,
         nonexact_2,
         exact_pairs) = exact_matches(data_1, data_2, fields)

        # Set up our data sample
        logging.info('taking a sample of %d possible pairs', self.sample_size)
        deduper.sample(nonexact_1, nonexact_2, self.sample_size)

        # Perform standard training procedures
        self.dedupe_training(deduper)

    # ## Blocking
    logging.info('blocking...')

    # ## Clustering
    # Find the threshold that will maximize a weighted average of our precision and recall.
    # When we set the recall weight to 2, we are saying we care twice as much
    # about recall as we do precision.
    #
    # If we had more data, we would not pass in all the blocked data into
    # this function but a representative sample.
    logging.info('finding a good threshold with a recall_weight of %s' %
                 self.recall_weight)
    threshold = deduper.threshold(data_1, data_2,
                                  recall_weight=self.recall_weight)

    # `duplicateClusters` will return sets of record IDs that dedupe
    # believes are all referring to the same entity.
    logging.info('clustering...')
    clustered_dupes = deduper.match(data_1, data_2, threshold)

    clustered_dupes.extend(exact_pairs)

    logging.info('# duplicate sets %s' % len(clustered_dupes))

    write_function = csvhelpers.writeLinkedResults
    # write out our results
    if self.output_file:
        if sys.version < '3':
            with open(self.output_file, 'wb', encoding='utf-8') as output_file:
                write_function(clustered_dupes, self.input_1, self.input_2,
                               output_file, self.inner_join)
        else:
            with open(self.output_file, 'w', encoding='utf-8') as output_file:
                write_function(clustered_dupes, self.input_1, self.input_2,
                               output_file, self.inner_join)
    else:
        write_function(clustered_dupes, self.input_1, self.input_2,
                       sys.stdout, self.inner_join)
def link_records(dataset, left_records_abbr, right_records_abbr,
                 fields_json_file_path,
                 known_matches_filepath='src/prodigy/known_matches.jsonl'):
    """
    Collect the best possible training data for linking records across
    multiple datasets. This recipe is an example of linking records across
    2 CSV files using the dedupe.io library.
    """
    load_dotenv(find_dotenv())
    account_name = os.environ.get('COSMOS_ACCOUNT_NAME')
    db_name = os.environ.get('COSMOS_DB_NAME')
    graph_name = os.environ.get('COSMOS_GRAPH_NAME')
    master_key = os.environ.get('COSMOS_MASTER_KEY')

    doc_qm = DocumentQueryManager(account_name, master_key, db_name)
    gremlin_qm = GremlinQueryManager(account_name, master_key,
                                     db_name, graph_name)

    # left_records = get_services(doc_qm, left_record_label)
    # right_records = get_services(doc_qm, right_record_label)
    left_record_file_path = f'data/processed/{left_records_abbr}_dedupe_data.csv'
    right_record_file_path = f'data/processed/{right_records_abbr}_dedupe_data.csv'
    left_records = readData(left_record_file_path)
    right_records = readData(right_record_file_path)

    db = connect()  # uses the settings in your prodigy.json

    output_path = f'data/processed/{left_records_abbr}_{right_records_abbr}_dedupe'
    os.makedirs(output_path, exist_ok=True)
    output_file = f'{output_path}/data_matching_output.csv'
    settings_file = f'{output_path}/data_matching_learned_settings'
    training_file = f'{output_path}/data_matching_training.json'

    def descriptions():
        for dataset in (left_records, right_records):
            for k, record in dataset.items():
                yield record['short_description']
                yield record['long_description']

    with open(fields_json_file_path) as fields_json_file:
        fields = json.load(fields_json_file)
        for field in fields:
            validate_field(field)
            if field['type'] == 'Text' and 'corpus' in field:
                func_name = field['corpus'][1:-1]
                field['corpus'] = locals()[func_name].__call__()

    # print(json.dumps(fields, indent=4))
    linker = dedupe.RecordLink(fields)

    # To train the linker, we feed it a sample of records.
    linker.sample(left_records, right_records,
                  500
                  # round(min(len(left_records) / 2, len(right_records) / 2))
                  )

    # If we have training data saved from a previous run of the linker,
    # look for it and load it in.
    examples = db.get_dataset(dataset)
    if examples:
        linker = update_linker(linker, examples)

    # Add known matches
    known_matches = []
    for match in JSONL(known_matches_filepath):
        left_match = None
        right_match = None
        for lr in left_records.values():
            if left_records_abbr in match and lr['name'] == match[
                    left_records_abbr].lower():
                left_match = lr
        for rr in right_records.values():
            if right_records_abbr in match and rr['name'] == match[
                    right_records_abbr].lower():
                right_match = rr
        if left_match and right_match:
            known_matches.append((left_match, right_match))

    linker.markPairs({'distinct': [], 'match': known_matches})

    def update(examples, linker=linker):
        linker = update_linker(linker, examples)

    def get_progress(session=0, total=0, loss=0, linker=linker):
        n_match = len(linker.training_pairs['match'])
        n_distinct = len(linker.training_pairs['distinct'])
        n_match_progress = min(1, (n_match / 10)) / 2
        n_distinct_progress = min(1, (n_distinct / 10)) / 2
        print("Examples Annotated: {0}/10 positive, {1}/10 negative".format(
            n_match, n_distinct))
        print(n_match_progress, n_distinct_progress)
        progress = min(1, n_match_progress + n_distinct_progress)
        return progress

    def on_exit(controller, linker=linker):
        print('TRAINING PAIRS')
        print(linker.training_pairs)
        print('=' * 50)
        print('=' * 50)
        linker.train()

        # Save our weights and predicates to disk.
        # If the settings file exists, we will skip all the training and
        # learning next time we run this file.
        with open(settings_file, 'wb') as sf:
            linker.writeSettings(sf)

        print('clustering...')
        linked_records = linker.match(left_records, right_records, 0)

        print('# duplicate sets', len(linked_records))

        # ## Writing Results
        # Write our original data back out to a CSV with a new column called
        # 'Cluster ID' which indicates which records refer to each other.
        cluster_membership = {}
        cluster_id = None
        for cluster_id, (cluster, score) in enumerate(linked_records):
            for record_id in cluster:
                cluster_membership[record_id] = (cluster_id, score)

        if cluster_id:
            unique_id = cluster_id + 1
        else:
            unique_id = 0

        with open(output_file, 'w') as f:
            writer = csv.writer(f)
            header_unwritten = True

            for fileno, filename in enumerate(
                    (left_record_file_path, right_record_file_path)):
                with open(filename) as f_input:
                    reader = csv.reader(f_input)

                    if header_unwritten:
                        heading_row = next(reader)
                        heading_row.insert(0, 'source file')
                        heading_row.insert(0, 'Link Score')
                        heading_row.insert(0, 'Cluster ID')
                        writer.writerow(heading_row)
                        header_unwritten = False
                    else:
                        next(reader)

                    for row_id, row in enumerate(reader):
                        cluster_details = cluster_membership.get(filename + str(row_id))
                        if cluster_details is None:
                            cluster_id = unique_id
                            unique_id += 1
                            score = None
                        else:
                            cluster_id, score = cluster_details
                        row.insert(0, fileno)
                        row.insert(0, score)
                        row.insert(0, cluster_id)
                        writer.writerow(row)

        print(cluster_membership)

    stream = record_pairs_stream(linker)

    with open('src/prodigy/record_pairs.html') as template_file:
        html_template = template_file.read()

    return {
        'view_id': 'html',
        'dataset': dataset,
        'stream': stream,
        'update': update,
        'progress': get_progress,
        'on_exit': on_exit,
        'config': {
            'html_template': html_template
        }
    }
def do_training_iteration(iteration_number, linkage_fields, bad_data_dict,
                          good_data_dict, asset_class, geo_setting,
                          clip_betas=True, clip_levels=None,
                          prior_weights=None):

    # Validate inputs
    if clip_betas and clip_levels is None:
        raise Exception("Please provide clip levels for option clip_betas=True")

    # Create a new linker object and pass our data model to it
    linker = dedupe.RecordLink(linkage_fields, num_cores=num_cores)

    # Set up the regularized logistic regression model
    linker.classifier = RegularizedLogisticRegression()

    # For the initial iteration, use the prior weights
    if iteration_number == 0:
        assert prior_weights is not None, "Must provide prior weights for initial training iteration"
        prior_intercept, prior_beta = prior_weights
        linker.classifier.weights = prior_beta
        linker.classifier.bias = prior_intercept

    # Else we use the latest iteration of the training set
    else:
        # To train the linker, we feed it a sample of records
        random.seed(1)
        np.random.seed(1)
        linker.sample(bad_data_dict, good_data_dict, 10000)

        # Look for the previous training file iteration
        trainset_file_prev_iter = '{}/training_set_build/fuzzy_{}_training_set_v{}.json'.format(
            scratch_dir, asset_class, "_it_{}".format(iteration_number - 1))
        assert os.path.exists(trainset_file_prev_iter), "Cannot find previous iteration training set"

        # Read the previous training set
        logger.info('Reading labeled examples from {}'.format(trainset_file_prev_iter))
        with open(trainset_file_prev_iter) as tf:
            training_pairs_prev = read_training(tf)
        linker.markPairs(training_pairs_prev)

        # Do the training
        logger.info("Training the linker, iteration {}".format(iteration_number))
        linker.train()
        linker.cleanupTraining()

    # Apply beta clipping if asked to
    if clip_betas:
        linker.classifier.weights = linker.classifier.weights.clip(
            min=clip_levels[:, 0], max=clip_levels[:, 1])

    # Finally, serialize the linker
    logger.info("Serializing the linker, iteration {}".format(iteration_number))
    store_linker(linker, scratch_dir + "/training_set_build", asset_class,
                 geo_setting, "_it_{}".format(iteration_number))

    return linker
def link_records(dataset, left_record_file_path, right_record_file_path,
                 fields_json_file_path):
    """
    Collect the best possible training data for linking records across
    multiple datasets. This recipe is an example of linking records across
    2 CSV files using the dedupe.io library.
    """
    db = connect()  # uses the settings in your prodigy.json
    output_file = 'data_matching_output.csv'
    settings_file = 'data_matching_learned_settings'
    training_file = 'data_matching_training.json'

    left_records = readData(left_record_file_path)
    right_records = readData(right_record_file_path)

    def descriptions():
        for dataset in (left_records, right_records):
            for record in dataset.values():
                yield record['description']

    with open(fields_json_file_path) as fields_json_file:
        fields = json.load(fields_json_file)
        for field in fields:
            validate_field(field)
            if field['type'] == 'Text' and 'corpus' in field:
                func_name = field['corpus'][1:-1]
                field['corpus'] = locals()[func_name].__call__()

    print('LEN RECORDS: ', len(left_records) / 2, len(right_records) / 2)
    print('MIN SAMPLE', min(len(left_records) / 2, len(right_records) / 2))
    print(fields)

    linker = dedupe.RecordLink(fields)

    # To train the linker, we feed it a sample of records.
    linker.sample(
        left_records,
        right_records,
        round(min(len(left_records) / 2, len(right_records) / 2))
    )

    print('getting examples')
    # If we have training data saved from a previous run of the linker,
    # look for it and load it in.
    examples = db.get_dataset(dataset)
    if len(examples) > 0:
        linker = update_linker(linker, examples)

    def update(examples, linker=linker):
        print(len(examples))
        linker = update_linker(linker, examples)

    def get_progress(session=0, total=0, loss=0, linker=linker):
        n_match = len(linker.training_pairs['match'])
        n_distinct = len(linker.training_pairs['distinct'])
        n_match_progress = min(1, (n_match / 10)) / 2
        n_distinct_progress = min(1, (n_distinct / 10)) / 2
        print("Examples Annotated: {0}/10 positive, {1}/10 negative".format(
            n_match, n_distinct))
        print(n_match_progress, n_distinct_progress)
        progress = min(1, n_match_progress + n_distinct_progress)
        return progress

    def on_exit(controller, linker=linker):
        linker.train()

        # Save our weights and predicates to disk. If the settings file
        # exists, we will skip all the training and learning next time we run
        # this file.
        with open(settings_file, 'wb') as sf:
            linker.writeSettings(sf)

        print('clustering...')
        linked_records = linker.match(left_records, right_records, 0)

        print('# duplicate sets', len(linked_records))

        # ## Writing Results
        # Write our original data back out to a CSV with a new column called
        # 'Cluster ID' which indicates which records refer to each other.
        cluster_membership = {}
        cluster_id = None
        for cluster_id, (cluster, score) in enumerate(linked_records):
            for record_id in cluster:
                cluster_membership[record_id] = (cluster_id, score)

        if cluster_id:
            unique_id = cluster_id + 1
        else:
            unique_id = 0

        with open(output_file, 'w') as f:
            writer = csv.writer(f)
            header_unwritten = True

            for fileno, filename in enumerate((left_record_file_path,
                                               right_record_file_path)):
                with open(filename) as f_input:
                    reader = csv.reader(f_input)

                    if header_unwritten:
                        heading_row = next(reader)
                        heading_row.insert(0, 'source file')
                        heading_row.insert(0, 'Link Score')
                        heading_row.insert(0, 'Cluster ID')
                        writer.writerow(heading_row)
                        header_unwritten = False
                    else:
                        next(reader)

                    for row_id, row in enumerate(reader):
                        cluster_details = cluster_membership.get(filename + str(row_id))
                        if cluster_details is None:
                            cluster_id = unique_id
                            unique_id += 1
                            score = None
                        else:
                            cluster_id, score = cluster_details
                        row.insert(0, fileno)
                        row.insert(0, score)
                        row.insert(0, cluster_id)
                        writer.writerow(row)

        print(cluster_membership)

    stream = record_pairs_stream(linker)

    with open('./record_pairs.html') as template_file:
        html_template = template_file.read()

    return {
        'view_id': 'html',
        'dataset': dataset,
        'stream': stream,
        'update': update,
        'progress': get_progress,
        'on_exit': on_exit,
        'config': {
            'html_template': html_template
        }
    }
def main(self):

    data_1 = {}
    data_2 = {}
    # import the specified CSV files
    data_1 = csvhelpers.readData(self.input_1, self.field_names_1,
                                 prefix='input_1')
    data_2 = csvhelpers.readData(self.input_2, self.field_names_2,
                                 prefix='input_2')

    # sanity check for provided field names in CSV file
    for field in self.field_names_1:
        if field not in data_1.values()[0]:
            raise parser.error("Could not find field '" + field + "' in input")

    for field in self.field_names_2:
        if field not in data_2.values()[0]:
            raise parser.error("Could not find field '" + field + "' in input")

    if self.field_names_1 != self.field_names_2:
        for record_id, record in data_2.items():
            remapped_record = {}
            for new_field, old_field in zip(self.field_names_1,
                                            self.field_names_2):
                remapped_record[new_field] = record[old_field]
            data_2[record_id] = remapped_record

    logging.info('imported %d rows from file 1', len(data_1))
    logging.info('imported %d rows from file 2', len(data_2))

    logging.info('using fields: %s' % self.field_definition.keys())

    # # Create a new deduper object and pass our data model to it.
    deduper = dedupe.RecordLink(self.field_definition)

    # Set up our data sample
    logging.info('taking a sample of %d possible pairs', self.sample_size)
    deduper.sample(data_1, data_2, self.sample_size)

    # If we have training data saved from a previous run of dedupe,
    # look for it and load it in.
    # __Note:__ if you want to train from scratch, delete the training_file
    if os.path.exists(self.training_file):
        logging.info('reading labeled examples from %s' % self.training_file)
        deduper.readTraining(self.training_file)
    elif self.skip_training:
        raise parser.error("You need to provide an existing training_file or "
                           "run this script without --skip_training")

    if not self.skip_training:
        logging.info('starting active labeling...')

        dedupe.consoleLabel(deduper)

        deduper.train()

        # When finished, save our training away to disk
        logging.info('saving training data to %s' % self.training_file)
        deduper.writeTraining(self.training_file)
    else:
        logging.info('skipping the training step')
        deduper.train()

    # ## Blocking
    logging.info('blocking...')

    # ## Clustering
    # Find the threshold that will maximize a weighted average of our precision and recall.
    # When we set the recall weight to 2, we are saying we care twice as much
    # about recall as we do precision.
    #
    # If we had more data, we would not pass in all the blocked data into
    # this function but a representative sample.
    logging.info('finding a good threshold with a recall_weight of %s' %
                 self.recall_weight)
    threshold = deduper.threshold(data_1, data_2,
                                  recall_weight=self.recall_weight)

    # `duplicateClusters` will return sets of record IDs that dedupe
    # believes are all referring to the same entity.
    logging.info('clustering...')
    clustered_dupes = deduper.match(data_1, data_2, threshold)

    logging.info('# duplicate sets %s' % len(clustered_dupes))

    write_function = csvhelpers.writeLinkedResults
    # write out our results
    if self.output_file:
        with open(self.output_file, 'w') as output_file:
            write_function(clustered_dupes, self.input_1, self.input_2,
                           output_file, self.inner_join)
    else:
        write_function(clustered_dupes, self.input_1, self.input_2,
                       sys.stdout, self.inner_join)