def active_training():
    print("MODE: Active training")

    # Load the data file to deduplicate
    try:
        data_d = read_messy_data(CONFIG.PATHS.INPUT_FILE)
    except IOError:
        print("Could not open the input records file - " + CONFIG.PATHS.INPUT_FILE)
        raise

    # Active training
    deduper = dedupe.Dedupe(CONFIG.DEDUPE.FIELDS)
    deduper.sample(data_d, CONFIG.DEDUPE.SAMPLE_SIZE)

    # Load labeled records from previous training sessions
    if CONFIG.GENERAL.LOAD_TRAINING:
        try:
            with open(CONFIG.PATHS.TRAINING_FILE) as f:
                deduper.readTraining(f)
        except IOError:
            print("Could not open the active-training file - " + CONFIG.PATHS.TRAINING_FILE)

    dedupe.consoleLabel(deduper)

    with open(CONFIG.PATHS.TRAINING_FILE, 'w') as tf:
        deduper.writeTraining(tf)
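# NOTE: `read_messy_data` is used above and in `deduplicate` below but is not
# defined in these snippets. A minimal sketch, assuming the input is a CSV
# file and records are keyed by row number (the real helper may clean fields
# differently):
import csv

def read_messy_data(filename):
    """Read a CSV file into the {record_id: {field: value}} dict dedupe expects."""
    data_d = {}
    with open(filename) as f:
        for i, row in enumerate(csv.DictReader(f)):
            # Normalize empty strings to None so dedupe treats them as missing.
            data_d[i] = {k: (v.strip() or None) for k, v in row.items()}
    return data_d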
def dedupe_training(self, deduper):
    # __Note:__ if you want to train from scratch, delete the training_file
    if os.path.exists(self.training_file):
        logging.info('reading labeled examples from %s' % self.training_file)
        with open(self.training_file) as tf:
            deduper.readTraining(tf)
    elif self.skip_training:
        raise self.parser.error(
            "You need to provide an existing training_file or run this script without --skip_training")

    if not self.skip_training:
        logging.info('starting active labeling...')
        dedupe.consoleLabel(deduper)

        # When finished, save our training away to disk
        logging.info('saving training data to %s' % self.training_file)
        if sys.version < '3':
            with open(self.training_file, 'wb') as tf:
                deduper.writeTraining(tf)
        else:
            with open(self.training_file, 'w') as tf:
                deduper.writeTraining(tf)
    else:
        logging.info('skipping the training step')

    deduper.train()

    # After training settings have been established make a cache file for reuse
    logging.info('caching training result set to file %s' % self.settings_file)
    with open(self.settings_file, 'wb') as sf:
        deduper.writeSettings(sf)
def train(self):
    data_1 = self._read_data(self.input_file_base)
    data_2 = self._read_data(self.input_file)
    self.data_1 = data_1
    self.data_2 = data_2

    fields = self.fields
    deduper = dedupe.RecordLink(fields)
    training_file = self.training_file

    # ## Training data
    if os.path.exists(training_file):
        logging.info('reading labeled examples from %s', training_file)
        with open(training_file, 'rb') as f:
            deduper.prepare_training(data_1, data_2, f)
    else:
        deduper.prepare_training(data_1, data_2)

    print('starting active labeling...')
    dedupe.consoleLabel(deduper)
    deduper.train(recall=1)

    with open(training_file, 'w') as tf:
        deduper.writeTraining(tf)

    self.deduper = deduper
def train_linker(df1, df2):
    if READ_FROM_SETTINGS_FILE:
        with open(SETTINGS_FILE, 'rb') as sf:
            linker = dedupe.StaticRecordLink(sf)
    else:
        linker = dedupe.RecordLink(FIELDS)
        # It's terrible that you have to do this next line!!!!
        linker.classifier = rlr.RegularizedLogisticRegression()
        linker.sample(df1, df2, BLOCKING_TRAINING_SAMPLE_SIZE)

        if READ_FROM_TRAINING_FILE:
            print('reading labeled examples from ', TRAINING_FILE)
            with open(TRAINING_FILE, 'rb') as tf:
                linker.readTraining(tf)
        else:
            dedupe.consoleLabel(linker)

        linker.train()

    if WRITE_SETTINGS_FILE:
        with open(SETTINGS_FILE, 'wb') as sf:
            linker.writeSettings(sf)
    if WRITE_TRAINING_FILE:
        with open(TRAINING_FILE, 'w') as tf:
            linker.writeTraining(tf)

    return linker
def deduplicate():
    print("MODE: Deduplicate")

    # Check that there will be labeled records to train a model with:
    if not (CONFIG.GENERAL.LOAD_TRAINING or CONFIG.GENERAL.PERFORM_ACTIVE_TRAINING):
        print("ERROR: Active training and loading from file are both disabled")
        return

    # Load the data file to deduplicate
    try:
        data_d = read_messy_data(CONFIG.PATHS.INPUT_FILE)
    except IOError:
        print("Could not open the input records file - " + CONFIG.PATHS.INPUT_FILE)
        raise

    if CONFIG.GENERAL.LOAD_SETTINGS:
        try:
            with open(CONFIG.PATHS.SETTINGS_FILE, 'rb') as f:
                deduper = dedupe.StaticDedupe(f)
        except IOError:
            print("Could not open the dedupe settings file - " + CONFIG.PATHS.SETTINGS_FILE)
            raise
    else:
        # Initialize the dedupe object
        deduper = dedupe.Dedupe(CONFIG.DEDUPE.FIELDS)

        # Load labeled records from previous training sessions
        if CONFIG.GENERAL.LOAD_TRAINING:
            try:
                with open(CONFIG.PATHS.TRAINING_FILE) as f:
                    deduper.readTraining(f)
            except IOError:
                print("Could not open the active-training file - " + CONFIG.PATHS.TRAINING_FILE)
                raise

        if CONFIG.GENERAL.PERFORM_ACTIVE_TRAINING:
            # Sampling and active training
            deduper.sample(data_d, CONFIG.DEDUPE.SAMPLE_SIZE)
            dedupe.consoleLabel(deduper)

        # Train the predictive model (logistic regression by default)
        deduper.train(index_predicates=CONFIG.DEDUPE.USE_INDEX_PREDICATES)

        # Save the active training, plus the predictive model and predicates
        with open(CONFIG.PATHS.TRAINING_FILE, 'w') as tf:
            deduper.writeTraining(tf)
        with open(CONFIG.PATHS.SETTINGS_FILE, 'wb') as sf:
            deduper.writeSettings(sf)

    try:
        # Compute a threshold for the logistic regression
        threshold = deduper.threshold(data_d, recall_weight=CONFIG.DEDUPE.RECALL_WEIGHT)
        # Group matches into clusters
        clustered_dupes = deduper.match(data_d, threshold)
        # Write to file
        write_clusters(clustered_dupes)
    except NameError:
        print("Error - could not initialize the dedupe object")
        raise
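# NOTE: `write_clusters` is not defined in these snippets. A minimal sketch,
# assuming a (hypothetical) CONFIG.PATHS.OUTPUT_FILE and a flat CSV of
# (cluster id, record id, confidence) rows:
import csv

def write_clusters(clustered_dupes):
    with open(CONFIG.PATHS.OUTPUT_FILE, 'w', newline='') as f:
        writer = csv.writer(f)
        writer.writerow(['cluster_id', 'record_id', 'confidence'])
        # dedupe.match returns an iterable of (record id tuple, score array) pairs
        for cluster_id, (id_set, scores) in enumerate(clustered_dupes):
            for record_id, score in zip(id_set, scores):
                writer.writerow([cluster_id, record_id, score])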
def run_dedupe(project):
    deduper, data = create_deduper(project)

    dedupe.consoleLabel(deduper)
    deduper.train()

    # TODO: lower the recall weight?
    threshold = deduper.threshold(data, recall_weight=1)
    clustered_dupes = deduper.match(data, threshold)

    print('# duplicate sets', len(clustered_dupes))
def trainIncoming(name):
    from geocoder.deduper import DatabaseGazetteer
    import simplejson as json
    import dedupe

    engine = create_engine(DB_CONN)

    deduper = DatabaseGazetteer([{'field': 'complete_address', 'type': 'Address'}],
                                engine=engine)

    sql_table = checkForTable(engine, name)
    if sql_table is None:
        sys.exit()

    primary_key = sql_table.primary_key.columns.keys()[0]

    messy_table = '''
        SELECT {0}, complete_address
        FROM {1}
        WHERE address_id IS NULL
          AND complete_address IS NOT NULL
    '''.format(primary_key, name)

    curs = engine.execute(messy_table)
    messy_data = ({'complete_address': r.complete_address} for r in curs)

    deduper.drawSample(messy_data, sample_size=30000)

    if os.path.exists('geocoder/data/training.json'):
        print('reading labeled examples from geocoder/data/training.json')
        with open('geocoder/data/training.json') as tf:
            deduper.readTraining(tf)

    dedupe.consoleLabel(deduper)

    deduper.train(ppc=0.1, index_predicates=False)

    # When finished, save our training away to disk
    with open('geocoder/data/training.json', 'w') as tf:
        deduper.writeTraining(tf)

    # Save our weights and predicates to disk. If the settings file
    # exists, we will skip all the training and learning next time we run
    # this file.
    with open('geocoder/dedupe.settings', 'wb') as sf:
        deduper.writeSettings(sf)

    deduper.cleanupTraining()
def train(self, data_d):
    """
    Train classifiers using logistic regression over the RVD flaws
    """
    cyan("Duplicates, training the classifiers using dedupe...")

    # Create a new deduper object and pass our data model to it.
    deduper = dedupe.Dedupe(self.fields)

    # If we have training data saved from a previous run of dedupe,
    # look for it and load it in.
    # __Note:__ if you want to train from scratch, delete the training_file
    if os.path.exists(self.training_file):
        gray("Reading labeled examples from " + self.training_file)
        with open(self.training_file, "rb") as f:
            deduper.prepare_training(data_d, f)
    else:
        deduper.prepare_training(data_d)

    # ## Active learning
    # Dedupe will find the next pair of records
    # it is least certain about and ask you to label them as duplicates
    # or not.
    # use 'y', 'n' and 'u' keys to flag duplicates
    # press 'f' when you are finished
    gray("Starting active labeling...")
    dedupe.consoleLabel(deduper)

    # Using the examples we just labeled, train the deduper and learn
    # blocking predicates
    gray("Training...")
    deduper.train()

    # When finished, save our training to disk
    gray("Saving results to training file...")
    with open(self.training_file, "w+") as tf:
        deduper.writeTraining(tf)

    # Save our weights and predicates to disk. If the settings file
    # exists, we will skip all the training and learning next time we run
    # this file.
    gray("Saving weights and predicates to settings file...")
    with open(self.settings_file, "wb+") as sf:
        deduper.writeSettings(sf)

    return deduper
def deduplicate(
    df,
    recall_weight=1,
    sample_size=0.3,
    settings_file="training-data/dedupe_learned_settings",
    training_file="training-data/dedupe_training.json",
):
    fields = df.columns
    df, data_d = data_prep(df)

    if os.path.exists(settings_file):
        logger.info("Existing settings found. Loading from: %s", settings_file)
        with open(settings_file, "rb") as f:
            deduper = dedupe.StaticDedupe(f)
    else:
        fields = select_fields(fields)
        deduper = dedupe.Dedupe(fields)

        sample_num = math.floor(len(data_d) * sample_size)
        logger.info("Extracting data sample of %s records", sample_num)
        deduper.sample(data_d, sample_num)

        if os.path.exists(training_file):
            logger.info("Reading training examples from: %s", training_file)
            with open(training_file, "rb") as f:
                deduper.readTraining(f)

        logger.info("Starting active labeling")
        dedupe.consoleLabel(deduper)

        deduper.train()

        with open(training_file, "w") as tf:
            deduper.writeTraining(tf)
        with open(settings_file, "wb") as sf:
            deduper.writeSettings(sf)

    threshold = deduper.threshold(data_d, recall_weight=recall_weight)
    clustered_df = data_cluster(deduper, data_d, threshold)
    results = df.join(clustered_df, how="left")
    results.drop(["dictionary"], axis=1, inplace=True)

    return results
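# NOTE: `data_prep`, `select_fields`, and `data_cluster` are helpers from the
# surrounding project and are not shown here. A minimal sketch of what
# `data_cluster` might look like, assuming it maps dedupe's clusters back onto
# the record ids used in `data_d` (the column names are assumptions):
import pandas as pd

def data_cluster(deduper, data_d, threshold):
    """Cluster data_d at the given threshold and return one row per record."""
    clustered_dupes = deduper.match(data_d, threshold)
    rows = []
    for cluster_id, (id_set, scores) in enumerate(clustered_dupes):
        for record_id, score in zip(id_set, scores):
            rows.append({"id": record_id, "cluster id": cluster_id, "confidence": score})
    return pd.DataFrame(rows).set_index("id")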
def manual_training(self, *args, **options):
    logger.info('Starting dedupe training.')
    data = {}

    logger.info('Loading training data...')
    fields = [
        {'field': 'First Name', 'type': 'String'},
        {'field': 'Last Name', 'type': 'String'},
        {'field': 'Mobile', 'type': 'String'},
    ]

    with open('temp/report1445218513088.csv', 'rb') as training_file:
        reader = csv.DictReader(training_file)
        identifier = 1
        for row in reader:
            clean = [(k, self.preprocess_column(v)) for (k, v) in row.items()]
            clean = dict(clean)
            # remove empty records
            delete = False
            for field in fields:
                if not clean[field['field']]:
                    delete = True
                    break
            if not delete:
                data[identifier] = clean
                identifier += 1
    logger.info('Loaded training data.')

    deduper = dedupe.Dedupe(fields)

    logger.info('Taking samples...')
    deduper.sample(data, 10000)

    dedupe.consoleLabel(deduper)
    deduper.train()

    with open('training.json', 'wb') as training_file:
        deduper.writeTraining(training_file)
    with open('settings.json', 'wb') as settings_file:
        deduper.writeSettings(settings_file)

    logger.info('Beginning blocking.')
    logger.info('Computing thresholds...')
    threshold = deduper.threshold(data, recall_weight=2)

    logger.info('Clustering...')
    clustered_dupes = deduper.match(data, threshold)
    logger.info('%s duplicate sets.' % len(clustered_dupes))

    with open('results', 'wb') as output_file:
        output_file.write(repr(clustered_dupes))

    logger.info('Finished blocking.')
    logger.info('Finished dedupe training.')
def active_train(self, input_data, training_file):
    data_d = input_data
    fields = self.fields
    deduper = dedupe.Dedupe(fields)

    # ## Training data
    if os.path.exists(training_file):
        logging.info('reading labeled examples from %s', training_file)
        with open(training_file, 'rb') as f:
            deduper.prepare_training(data_d, f)
    else:
        deduper.prepare_training(data_d)

    print('starting active labeling...')
    dedupe.consoleLabel(deduper)
    deduper.train()

    with open(training_file, 'w') as tf:
        deduper.writeTraining(tf)

    self.deduper = deduper
    return self
def active_train(self, data_1, data_2, training_file):
    fields = self.fields
    deduper = dedupe.RecordLink(fields)

    # ## Training data
    if os.path.exists(training_file):
        logging.info('reading labeled examples from %s', training_file)
        with open(training_file, 'rb') as f:
            deduper.prepare_training(data_1, data_2, f)
    else:
        deduper.prepare_training(data_1, data_2)

    print('starting active labeling...')
    dedupe.consoleLabel(deduper)
    deduper.train(recall=1)

    with open(training_file, 'w') as tf:
        deduper.writeTraining(tf)

    self.deduper = deduper
    return self
def label_and_train(d_dict):
    deduper = dedupe.Dedupe(FIELDS)

    print("Sampling data...")
    deduper.sample(d_dict, 15_000)

    if TRAINING_PATH.exists():
        with TRAINING_PATH.open('rb') as f:
            print(f"Loading trained examples from {TRAINING_PATH.name}.")
            deduper.readTraining(f)

    print("Starting active labeling...")
    dedupe.consoleLabel(deduper)

    deduper.train()

    with TRAINING_PATH.open('w') as tf:
        deduper.writeTraining(tf)
    with SETTINGS_PATH.open('wb') as sf:
        deduper.writeSettings(sf)
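# NOTE: FIELDS, TRAINING_PATH, and SETTINGS_PATH are module-level constants in
# the original project. A plausible shape, purely illustrative (the field
# definitions and file names here are assumptions):
from pathlib import Path

FIELDS = [
    {'field': 'name', 'type': 'String'},
    {'field': 'address', 'type': 'String', 'has missing': True},
]
TRAINING_PATH = Path('training.json')
SETTINGS_PATH = Path('learned_settings')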
def train():
    fields = [
        {'field': 'album_name', 'variable name': 'album', 'type': 'String'},
        {'field': 'track_name', 'variable name': 'track', 'type': 'String'},
        {'field': 'artist_name', 'variable name': 'artist', 'type': 'String'},
        {'field': 'track_duration', 'type': 'Price', 'has missing': True},
        {'type': 'Interaction', 'interaction variables': ['artist', 'album']},
    ]

    deduper = dedupe.Dedupe(fields)
    deduper.sample(data_d, 150000)

    if os.path.exists(training_file):
        print 'reading labeled examples from ', training_file
        with open(training_file) as tf:
            deduper.readTraining(tf)

    print 'starting active labeling...'
    dedupe.consoleLabel(deduper)
    deduper.train()

    with open(training_file, 'w') as tf:
        deduper.writeTraining(tf)
    with open(settings_file, 'w') as sf:
        deduper.writeSettings(sf)
def start_active_labeling(self, training_file, settings):
    if not self.settings_used:
        print 'Active Labeling...'
        dedupe.consoleLabel(self.duper)
        self.duper.train()
        self.save(training_file, settings)
def main(self):
    data_1 = {}
    data_2 = {}

    # import the specified CSV files
    data_1 = csvhelpers.readData(self.input_1, self.field_names_1, prefix='input_1')
    data_2 = csvhelpers.readData(self.input_2, self.field_names_2, prefix='input_2')

    # sanity check for provided field names in CSV file
    for field in self.field_names_1:
        if field not in list(data_1.values())[0]:
            raise self.parser.error("Could not find field '" + field + "' in input")
    for field in self.field_names_2:
        if field not in list(data_2.values())[0]:
            raise self.parser.error("Could not find field '" + field + "' in input")

    if self.field_names_1 != self.field_names_2:
        for record_id, record in data_2.items():
            remapped_record = {}
            for new_field, old_field in zip(self.field_names_1, self.field_names_2):
                remapped_record[new_field] = record[old_field]
            data_2[record_id] = remapped_record

    logging.info('imported %d rows from file 1', len(data_1))
    logging.info('imported %d rows from file 2', len(data_2))

    logging.info('using fields: %s' % [field['field'] for field in self.field_definition])

    # Create a new deduper object and pass our data model to it.
    deduper = dedupe.RecordLink(self.field_definition)

    # Set up our data sample
    logging.info('taking a sample of %d possible pairs', self.sample_size)
    deduper.sample(data_1, data_2, self.sample_size)

    # If we have training data saved from a previous run of dedupe,
    # look for it and load it in.
    # __Note:__ if you want to train from scratch, delete the training_file
    if os.path.exists(self.training_file):
        logging.info('reading labeled examples from %s' % self.training_file)
        with open(self.training_file) as tf:
            deduper.readTraining(tf)
    elif self.skip_training:
        raise self.parser.error(
            "You need to provide an existing training_file or run this script without --skip_training")

    if not self.skip_training:
        logging.info('starting active labeling...')
        dedupe.consoleLabel(deduper)

        # When finished, save our training away to disk
        logging.info('saving training data to %s' % self.training_file)
        if sys.version < '3':
            with open(self.training_file, 'wb') as tf:
                deduper.writeTraining(tf)
        else:
            with open(self.training_file, 'w') as tf:
                deduper.writeTraining(tf)
    else:
        logging.info('skipping the training step')

    deduper.train()

    # ## Blocking
    logging.info('blocking...')

    # ## Clustering
    # Find the threshold that will maximize a weighted average of our
    # precision and recall. When we set the recall weight to 2, we are
    # saying we care twice as much about recall as we do precision.
    #
    # If we had more data, we would not pass in all the blocked data into
    # this function but a representative sample.
    logging.info('finding a good threshold with a recall_weight of %s' % self.recall_weight)
    threshold = deduper.threshold(data_1, data_2, recall_weight=self.recall_weight)

    # `duplicateClusters` will return sets of record IDs that dedupe
    # believes are all referring to the same entity.
    logging.info('clustering...')
    clustered_dupes = deduper.match(data_1, data_2, threshold)

    logging.info('# duplicate sets %s' % len(clustered_dupes))

    write_function = csvhelpers.writeLinkedResults

    # write out our results
    if self.output_file:
        if sys.version < '3':
            with open(self.output_file, 'wb') as output_file:
                write_function(clustered_dupes, self.input_1, self.input_2,
                               output_file, self.inner_join)
        else:
            with open(self.output_file, 'w') as output_file:
                write_function(clustered_dupes, self.input_1, self.input_2,
                               output_file, self.inner_join)
    else:
        write_function(clustered_dupes, self.input_1, self.input_2,
                       sys.stdout, self.inner_join)
# look for it and load it in.
# __Note:__ if you want to train from scratch, delete the training_file
if os.path.exists(training_file):
    print('reading labeled examples from ', training_file)
    with open(training_file) as tf:
        gazetteer.readTraining(tf)

# ## Active learning
# Dedupe will find the next pair of records
# it is least certain about and ask you to label them as matches
# or not.
# use 'y', 'n' and 'u' keys to flag duplicates
# press 'f' when you are finished
print('starting active labeling...')
dedupe.consoleLabel(gazetteer)

gazetteer.train()

# When finished, save our training away to disk
with open(training_file, 'w') as tf:
    gazetteer.writeTraining(tf)

# Make the canonical set
gazetteer.index(canonical)

# Save our weights and predicates to disk. If the settings file
# exists, we will skip all the training and learning next time we run
# this file.
with open(settings_file, 'wb') as sf:
    gazetteer.writeSettings(sf, index=True)
def main(self):
    data_d = {}
    # import the specified CSV file
    data_d = csvhelpers.readData(self.input, self.field_names)

    logging.info('imported %d rows', len(data_d))

    # sanity check for provided field names in CSV file
    for field in self.field_definition:
        if self.field_definition[field]['type'] != 'Interaction':
            if field not in data_d[0]:
                raise parser.error("Could not find field '" + field + "' in input")

    logging.info('using fields: %s' % self.field_definition.keys())

    # Create a new deduper object and pass our data model to it.
    deduper = dedupe.Dedupe(self.field_definition)

    # Set up our data sample
    logging.info('taking a sample of %d possible pairs', self.sample_size)
    deduper.sample(data_d, self.sample_size)

    # If we have training data saved from a previous run of dedupe,
    # look for it and load it in.
    # __Note:__ if you want to train from scratch, delete the training_file
    if os.path.exists(self.training_file):
        logging.info('reading labeled examples from %s' % self.training_file)
        deduper.readTraining(self.training_file)
    elif self.skip_training:
        raise parser.error(
            "You need to provide an existing training_file or run this script without --skip_training")

    if not self.skip_training:
        logging.info('starting active labeling...')
        dedupe.consoleLabel(deduper)
        deduper.train()

        # When finished, save our training away to disk
        logging.info('saving training data to %s' % self.training_file)
        deduper.writeTraining(self.training_file)
    else:
        logging.info('skipping the training step')
        deduper.train()

    # ## Blocking
    logging.info('blocking...')

    # ## Clustering
    # Find the threshold that will maximize a weighted average of our
    # precision and recall. When we set the recall weight to 2, we are
    # saying we care twice as much about recall as we do precision.
    #
    # If we had more data, we would not pass in all the blocked data into
    # this function but a representative sample.
    logging.info('finding a good threshold with a recall_weight of %s' % self.recall_weight)
    threshold = deduper.threshold(data_d, recall_weight=self.recall_weight)

    # `duplicateClusters` will return sets of record IDs that dedupe
    # believes are all referring to the same entity.
    logging.info('clustering...')
    clustered_dupes = deduper.match(data_d, threshold)

    logging.info('# duplicate sets %s' % len(clustered_dupes))

    write_function = csvhelpers.writeResults
    # write out our results
    if self.destructive:
        write_function = csvhelpers.writeUniqueResults

    if self.output_file:
        with open(self.output_file, 'w') as output_file:
            write_function(clustered_dupes, self.input, output_file)
    else:
        write_function(clustered_dupes, self.input, sys.stdout)
def gazetteer_dataframes(messy_df, canonical_df, field_properties, recall_weight,
                         n_matches, config_name="gazetteer_dataframes"):
    config_name = config_name.replace(" ", "_")

    settings_file = config_name + '_learned_settings'
    training_file = config_name + '_training.json'

    print('importing data ...')

    messy_df = clean_punctuation(messy_df)
    specify_type(messy_df, field_properties)

    messy_df['index_field'] = messy_df.index
    messy_df['index_field'] = messy_df['index_field'].apply(
        lambda x: "messy_df" + str(x))
    messy_df.set_index(['index_field'], inplace=True)

    data_1 = messy_df.to_dict(orient='index')

    canonical_df = clean_punctuation(canonical_df)
    specify_type(canonical_df, field_properties)

    canonical_df['index_field'] = canonical_df.index
    canonical_df['index_field'] = canonical_df['index_field'].apply(
        lambda x: "canonical_df" + str(x))
    canonical_df.set_index(['index_field'], inplace=True)

    data_2 = canonical_df.to_dict(orient='index')

    # -----------------------------------------------------------------------
    # ## Training

    if os.path.exists(settings_file):
        print('reading from', settings_file)
        with open(settings_file, 'rb') as sf:
            gazetteer = dedupe.StaticGazetteer(sf)
    else:
        # Define the fields the gazetteer will pay attention to
        fields = []
        select_fields(fields, field_properties)

        # Create a new gazetteer object and pass our data model to it.
        gazetteer = dedupe.Gazetteer(fields)

        # To train the gazetteer, we feed it a sample of records.
        gazetteer.sample(data_1, data_2, 15000)

        # If we have training data saved from a previous run of the gazetteer,
        # look for it and load it in.
        # __Note:__ if you want to train from scratch, delete the training_file
        if os.path.exists(training_file):
            print('reading labeled examples from ', training_file)
            with open(training_file) as tf:
                gazetteer.readTraining(tf)

        # ## Active learning
        # Dedupe will find the next pair of records
        # it is least certain about and ask you to label them as matches
        # or not.
        # use 'y', 'n' and 'u' keys to flag duplicates
        # press 'f' when you are finished
        print('starting active labeling...')
        dedupe.consoleLabel(gazetteer)

        gazetteer.train()

        # When finished, save our training away to disk
        with open(training_file, 'w') as tf:
            gazetteer.writeTraining(tf)

        # Make the canonical set
        gazetteer.index(data_2)

        # Save our weights and predicates to disk. If the settings file exists,
        # we will skip all training and learning next time we run this file.
        with open(settings_file, 'wb') as sf:
            gazetteer.writeSettings(sf, index=True)

        gazetteer.cleanupTraining()

    gazetteer.index(data_2)

    # Calculate the threshold
    print('start calculating threshold')
    threshold = gazetteer.threshold(data_1, recall_weight)
    print('Threshold: {}'.format(threshold))

    results = gazetteer.match(data_1, threshold=threshold, n_matches=n_matches)

    results_df = pd.DataFrame(results)

    results_df['messy_df_link'] = results_df[0].apply(lambda x: x[0][0])
    results_df['messy_df_link'] = results_df['messy_df_link'].str.strip('messy_df')
    results_df['messy_df_link'] = results_df['messy_df_link'].astype(int)

    results_df['canonical_df_link'] = results_df[0].apply(lambda x: x[0][1])
    results_df['canonical_df_link'] = results_df['canonical_df_link'].str.strip('canonical_df')
    results_df['canonical_df_link'] = results_df['canonical_df_link'].astype(int)

    results_df['confidence'] = results_df[0].apply(lambda x: x[1])
    results_df['cluster id'] = results_df.index

    results_df = results_df.rename(columns={0: 'results'})
    results_df['results'] = results_df['results'].astype(str)

    # `results_df_copy` was referenced but never defined in the original
    # snippet; a plain copy of results_df appears to be what was intended.
    results_df_copy = results_df.copy()

    # For both messy_df & canonical_df, add cluster id & confidence score
    # from results_df
    messy_df.index.rename('messy_df_link', inplace=True)
    messy_df = messy_df.rename(columns={'unique_id': 'messy_unique_id'})
    messy_df = messy_df.merge(results_df_copy, on='messy_df_link', how='left')

    canonical_df.index.rename('canonical_df_link', inplace=True)
    canonical_df = canonical_df.rename(columns={'unique_id': 'canonical_unique_id'})
    canonical_df = canonical_df.merge(results_df_copy, on='canonical_df_link', how='left')

    # Merge messy_df & canonical_df together
    final_df = messy_df.merge(canonical_df, on='results')

    return final_df
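# A minimal usage sketch for gazetteer_dataframes above (the file names and
# field properties here are assumptions, not part of the original code):
#
#     import pandas as pd
#     messy = pd.read_csv('messy.csv')
#     canonical = pd.read_csv('canonical.csv')
#     final = gazetteer_dataframes(messy, canonical,
#                                  field_properties=['name', 'address'],
#                                  recall_weight=1, n_matches=1)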
def train(clients):
    if os.path.exists(settings_file):
        with open(settings_file) as sf:
            print 'reading from', settings_file
            return dedupe.StaticDedupe(sf)
    else:
        fields = [
            {'field': 'city', 'type': 'ShortString', 'Has Missing': True},
            {'field': 'country', 'type': 'ShortString', 'Has Missing': True},
            {'field': 'zip', 'type': 'ShortString', 'Has Missing': True},
            {'field': 'state', 'type': 'ShortString', 'Has Missing': True},
            {'field': 'address', 'type': 'String', 'Has Missing': True},
            {'field': 'description', 'type': 'Text', 'Has Missing': True,
             'corpus': []},  # map(lambda x: x['description'], clients.values())
            {'field': 'specific_issues', 'type': 'Text', 'Has Missing': True,
             'corpus': []},  # map(lambda x: x['specific_issues'], clients.values())
            {'field': 'exact_name', 'type': 'String'},
            {'field': 'alis', 'type': 'Set',
             'corpus': []},  # map(lambda x: x['alis'], clients.values())
            {'field': 'houseID', 'type': 'Exact'},
            {'field': 'senate', 'type': 'Exact'},
        ]

        # for definition in fields[:]:
        #     if definition.field != 'houseID':
        #         fields[k + "-houseID"] = {'type': 'Interaction',
        #                                   'Interaction Fields': ["houseID", k]}

        # Create a new deduper object and pass our data model to it.
        deduper = dedupe.Dedupe(fields)

        # To train dedupe, we feed it a random sample of records.
        deduper.sample(clients, 150000)

        # If we have training data saved from a previous run of dedupe,
        # look for it and load it in.
        # __Note:__ if you want to train from scratch, delete the training_file
        if os.path.exists(training_file):
            with open(training_file) as tf:
                print 'reading labeled examples from ', training_file
                deduper.readTraining(tf)

        # ## Active learning
        # Dedupe will find the next pair of records
        # it is least certain about and ask you to label them as duplicates
        # or not.
        # use 'y', 'n' and 'u' keys to flag duplicates
        # press 'f' when you are finished
        print 'starting active labeling...'
        dedupe.consoleLabel(deduper)

        deduper.train(ppc=0.1, uncovered_dupes=2)

        # When finished, save our training away to disk
        with open(training_file, 'w') as tf:
            deduper.writeTraining(tf)

        # Save our weights and predicates to disk. If the settings file
        # exists, we will skip all the training and learning next time we run
        # this file.
        with open(settings_file, 'w') as sf:
            deduper.writeSettings(sf)

        deduper.cleanupTraining()

        return deduper
def get_resolved_df(data_1, data_2, fields):
    # Training
    if os.path.exists(settings_file):
        print('reading from', settings_file)
        with open(settings_file, 'rb') as sf:
            linker = dedupe.StaticRecordLink(sf)
    else:
        # Create a new linker object and pass our data model to it.
        linker = dedupe.RecordLink(fields)
        # To train the linker, we feed it a sample of records.
        linker.sample(data_1, data_2, 15000)

        # If we have training data saved from a previous run of the linker,
        # look for it and load it in.
        if os.path.exists(training_file):
            print('reading labeled examples from ', training_file)
            with open(training_file) as tf:
                linker.readTraining(tf)

        # ## Active learning
        print('starting active labeling...')
        dedupe.consoleLabel(linker)
        linker.train()

        # When finished, save our training away to disk
        with open(training_file, 'w') as tf:
            linker.writeTraining(tf)

        # Save our weights and predicates to disk. If the settings file
        # exists, we will skip all the training and learning next time we run
        # this file.
        with open(settings_file, 'wb') as sf:
            linker.writeSettings(sf)

    # ## Clustering
    print('clustering...')
    linked_records = linker.match(data_1, data_2, 0)

    print('# duplicate sets', len(linked_records))
    print(linked_records)

    # ## Writing Results
    df_list = []
    for cluster, score in linked_records:
        # obtain filename + record number for each half of the cluster
        cluster1 = cluster[0]
        cluster2 = cluster[1]

        match = re.match(r"(\w+)\.csv(\d+)", cluster1)
        filename, idx = match.groups()
        filename += '.csv'
        filename = pr.resource_filename('src.input', filename)
        idx = int(idx)
        print(filename)
        print(idx)

        # dataframe for cluster 1
        df1 = pd.DataFrame(columns=pd.read_csv(filename).columns)
        df1.loc[0] = pd.read_csv(filename).iloc[idx]

        # for cluster 2
        match = re.match(r"(\w+)\.csv(\d+)", cluster2)
        filename, idx = match.groups()
        filename += '.csv'
        filename = pr.resource_filename('src.input', filename)
        idx = int(idx)
        print(filename)
        print(idx)

        # dataframe for cluster 2
        df2 = pd.DataFrame(columns=pd.read_csv(filename).columns)
        df2.loc[0] = pd.read_csv(filename).iloc[idx]

        # common attribute
        df2['score'] = [score]
        df1['score'] = [score]

        df = pd.merge(df1, df2, on='score')
        df_list.append(df)

    results_df = pd.concat(df_list, ignore_index=True)
    return results_df
def deDuplication():
    input_file = 'csv_example_input.csv'
    output_file = 'csv_example_output.csv'
    settings_file = 'csv_example_learned_settings3'
    training_file = 'csv_example_training.json3'

    def preProcess(column):
        try:
            column = column.decode('utf-8')
        except AttributeError:
            pass
        column = unidecode(column)
        column = re.sub(' +', ' ', column)
        column = re.sub('\n', ' ', column)
        column = column.strip().strip('"').strip("'").lower().strip()
        if not column:
            column = None
        return column

    # Read in the data from the CSV file:
    def readData(filename):
        data_d = {}
        with open(filename, encoding="utf-8") as f:
            reader = csv.DictReader(f)
            for row in reader:
                clean_row = [(k, preProcess(v)) for (k, v) in row.items()]
                row_id = row['id']
                data_d[row_id] = dict(clean_row)
        return data_d

    print('importing data ...')
    data_d = readData(input_file)

    if os.path.exists(settings_file):
        print('reading from', settings_file)
        with open(settings_file, 'rb') as f:
            deduper = dedupe.StaticDedupe(f)
    else:
        fields = [
            {'field': 'DisplayName_Processed', 'type': 'String'},
            {'field': 'Email_Processed', 'type': 'String'},
            {'field': 'Info', 'type': 'String'},
        ]

        deduper = dedupe.Dedupe(fields)
        deduper.sample(data_d, 15000)

        if os.path.exists(training_file):
            print('reading labeled examples from ', training_file)
            with open(training_file, 'rb') as f:
                deduper.readTraining(f)

        print('starting active labeling...')
        dedupe.consoleLabel(deduper)
        deduper.train()

        with open(training_file, 'w') as tf:
            deduper.writeTraining(tf)
        with open(settings_file, 'wb') as sf:
            deduper.writeSettings(sf)

    threshold = deduper.threshold(data_d, recall_weight=1)

    print('clustering...')
    clustered_dupes = deduper.match(data_d, threshold)

    print('# duplicate sets', len(clustered_dupes))

    cluster_membership = {}
    cluster_id = 0
    for (cluster_id, cluster) in enumerate(clustered_dupes):
        id_set, scores = cluster
        cluster_d = [data_d[c] for c in id_set]
        canonical_rep = dedupe.canonicalize(cluster_d)
        for record_id, score in zip(id_set, scores):
            cluster_membership[record_id] = {
                "cluster id": cluster_id,
                "canonical representation": canonical_rep,
                "confidence": score,
            }

    singleton_id = cluster_id + 1

    with open(output_file, 'w', encoding="utf-8") as f_output, \
            open(input_file, encoding="utf-8") as f_input:
        writer = csv.writer(f_output)
        reader = csv.reader(f_input)

        heading_row = next(reader)
        heading_row.insert(0, 'confidence_score')
        heading_row.insert(0, 'Cluster ID')
        canonical_keys = canonical_rep.keys()
        for key in canonical_keys:
            heading_row.append('canonical_' + key)
        writer.writerow(heading_row)

        for row in reader:
            row_id = row[0]
            if row_id in cluster_membership:
                cluster_id = cluster_membership[row_id]["cluster id"]
                canonical_rep = cluster_membership[row_id]["canonical representation"]
                row.insert(0, cluster_membership[row_id]['confidence'])
                row.insert(0, cluster_id)
                for key in canonical_keys:
                    # write the canonical value as text; encoding to bytes here
                    # would emit b'...' literals into the CSV
                    row.append(canonical_rep[key])
            else:
                row.insert(0, None)
                row.insert(0, singleton_id)
                singleton_id += 1
                for key in canonical_keys:
                    row.append(None)
            writer.writerow(row)

    return clustered_dupes
def dedupe_training(category, recall_weight=1):
    assert category in ['mangas', 'animes'], "Only mangas or animes needs training"

    data = {}
    for work in Work.objects.filter(category__slug=category[:-1]).values_list('title', 'vo_title', 'id'):
        data[work[2]] = {'title': work[0], 'vo_title': work[1]}
        for field in ['title', 'vo_title']:
            if not data[work[2]][field]:
                data[work[2]][field] = None

    fields = [
        {'field': 'title', 'type': 'String'},
        {'field': 'vo_title', 'type': 'String'},
    ]

    output_file = os.path.join(DEDUPE_DIR, category + '_output.csv')
    settings_file = os.path.join(DEDUPE_DIR, category + '_learned_settings')
    training_file = os.path.join(DEDUPE_DIR, category + '_training.json')

    deduper = Dedupe(fields)
    deduper.sample(data)

    # Load labeled examples before active labeling, so previous sessions count.
    if os.path.exists(training_file):
        print('reading labeled examples from ', training_file)
        with open(training_file, 'rb') as f:
            deduper.readTraining(f)

    consoleLabel(deduper)
    deduper.train()

    with open(training_file, 'w') as tf:
        deduper.writeTraining(tf)
    with open(settings_file, 'wb') as sf:
        deduper.writeSettings(sf)

    # recall_weight defines the value you give to recall as opposed to the
    # value of precision
    threshold = deduper.threshold(data, recall_weight)

    print('clustering...')
    clustered_dupes = deduper.match(data, threshold)
    print('# duplicate sets', len(clustered_dupes))

    input_file = os.path.join(DEDUPE_DIR, category + '.csv')
    with open(input_file, 'w', newline='', encoding='utf-8') as f:
        writer = csv.writer(f, delimiter='\t')
        writer.writerow(['id', 'title', 'vo_title'])
        for work_id in data:
            title = data[work_id]['title']
            vo_title = data[work_id]['vo_title']
            writer.writerow([work_id, title, vo_title])

    cluster_membership = {}
    cluster_id = 0
    for (cluster_id, cluster) in enumerate(clustered_dupes):
        id_set, scores = cluster
        cluster_d = [data[c] for c in id_set]
        canonical_rep = canonicalize(cluster_d)
        for record_id, score in zip(id_set, scores):
            cluster_membership[record_id] = {
                "cluster id": cluster_id,
                "canonical representation": canonical_rep,
                "confidence": score,
            }

    singleton_id = cluster_id + 1

    with open(output_file, 'w', newline='', encoding='utf-8') as f_output, \
            open(input_file, newline='', encoding='utf-8') as f_input:
        writer = csv.writer(f_output, delimiter='\t')
        reader = csv.reader(f_input, delimiter='\t')

        heading_row = next(reader)
        heading_row.insert(0, 'confidence_score')
        heading_row.insert(0, 'Cluster ID')
        canonical_keys = canonical_rep.keys()
        for key in canonical_keys:
            heading_row.append('canonical_' + key)
        writer.writerow(heading_row)

        for row in reader:
            row_id = int(row[0])
            if row_id in cluster_membership:
                cluster_id = cluster_membership[row_id]["cluster id"]
                canonical_rep = cluster_membership[row_id]["canonical representation"]
                row.insert(0, cluster_membership[row_id]['confidence'])
                row.insert(0, cluster_id)
                for key in canonical_keys:
                    row.append(canonical_rep[key])
            else:
                row.insert(0, None)
                row.insert(0, singleton_id)
                singleton_id += 1
                for key in canonical_keys:
                    row.append(None)
            writer.writerow(row)

    print('output file written')
def start_dedupe(df):
    # If a settings file already exists, we'll just load that and skip training
    if os.path.exists(settings_file):
        print('reading from', settings_file)
        with open(settings_file, 'rb') as f:
            deduper = dedupe.StaticDedupe(f)
    else:
        # ## Training
        # Define the fields dedupe will pay attention to
        fields = [
            {'field': 'MESSZEIT', 'type': 'Exact'},
            {'field': 'KENNUNG', 'type': 'Exact'},
            {'field': 'GEOGR_BREITE', 'type': 'Custom', 'comparator': sim_num_abs},
            {'field': 'GEOGR_LAENGE', 'type': 'Custom', 'comparator': sim_num_abs},
            {'field': 'HORIZONTALE_SICHT', 'type': 'ShortString', 'has missing': True},
            {'field': 'WETTER', 'type': 'Custom', 'comparator': sim_ww, 'has missing': True},
        ]

        # Create a new deduper object and pass our data model to it.
        deduper = dedupe.Dedupe(fields)

        # To train dedupe, we feed it a sample of records.
        deduper.sample(df, 15000, .5)

        # If we have training data saved from a previous run of dedupe,
        # look for it and load it in.
        # __Note:__ if you want to train from scratch, delete the training_file
        if os.path.exists(training_file):
            print('reading labeled examples from ', training_file)
            with open(training_file, 'rb') as f:
                deduper.readTraining(f)

        # ## Active learning
        # Dedupe will find the next pair of records
        # it is least certain about and ask you to label them as duplicates
        # or not.
        # use 'y', 'n' and 'u' keys to flag duplicates
        # press 'f' when you are finished
        print('starting active labeling...')
        dedupe.consoleLabel(deduper)

        # Using the examples we just labeled, train the deduper and learn
        # blocking predicates
        deduper.train()

        # When finished, save our training to disk
        with open(training_file, 'w') as tf:
            deduper.writeTraining(tf)

        # Save our weights and predicates to disk. If the settings file
        # exists, we will skip all the training and learning next time we run
        # this file.
        with open(settings_file, 'wb') as sf:
            deduper.writeSettings(sf)

    threshold = deduper.threshold(df, recall_weight=1)
    print(f'threshold: {threshold}')

    print('clustering...')
    # clustered_dupes = deduper.match(df, .98)
    clustered_dupes = deduper.match(df, threshold)

    print(f'# duplicate sets {len(clustered_dupes)}')
    return clustered_dupes
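# NOTE: the custom comparators `sim_num_abs` and `sim_ww` are defined elsewhere
# in the original project. A plausible sketch of an absolute-difference
# comparator for the numeric latitude/longitude fields (the exact scaling used
# by the original is an assumption):
def sim_num_abs(field_1, field_2):
    """Return the absolute difference between two numeric field values.

    dedupe treats a Custom comparator's output as a distance-like feature,
    so smaller values mean more similar records.
    """
    return abs(float(field_1) - float(field_2))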
def main(self):
    data_1 = {}
    data_2 = {}

    # import the specified CSV files
    data_1 = csvhelpers.readData(self.input_1, self.field_names_1, prefix='input_1')
    data_2 = csvhelpers.readData(self.input_2, self.field_names_2, prefix='input_2')

    # sanity check for provided field names in CSV file
    for field in self.field_names_1:
        if field not in data_1.values()[0]:
            raise parser.error("Could not find field '" + field + "' in input")
    for field in self.field_names_2:
        if field not in data_2.values()[0]:
            raise parser.error("Could not find field '" + field + "' in input")

    if self.field_names_1 != self.field_names_2:
        for record_id, record in data_2.items():
            remapped_record = {}
            for new_field, old_field in zip(self.field_names_1, self.field_names_2):
                remapped_record[new_field] = record[old_field]
            data_2[record_id] = remapped_record

    logging.info('imported %d rows from file 1', len(data_1))
    logging.info('imported %d rows from file 2', len(data_2))

    logging.info('using fields: %s' % self.field_definition.keys())

    # Create a new deduper object and pass our data model to it.
    deduper = dedupe.RecordLink(self.field_definition)

    # Set up our data sample
    logging.info('taking a sample of %d possible pairs', self.sample_size)
    deduper.sample(data_1, data_2, self.sample_size)

    # If we have training data saved from a previous run of dedupe,
    # look for it and load it in.
    # __Note:__ if you want to train from scratch, delete the training_file
    if os.path.exists(self.training_file):
        logging.info('reading labeled examples from %s' % self.training_file)
        deduper.readTraining(self.training_file)
    elif self.skip_training:
        raise parser.error(
            "You need to provide an existing training_file or run this script without --skip_training")

    if not self.skip_training:
        logging.info('starting active labeling...')
        dedupe.consoleLabel(deduper)
        deduper.train()

        # When finished, save our training away to disk
        logging.info('saving training data to %s' % self.training_file)
        deduper.writeTraining(self.training_file)
    else:
        logging.info('skipping the training step')
        deduper.train()

    # ## Blocking
    logging.info('blocking...')

    # ## Clustering
    # Find the threshold that will maximize a weighted average of our
    # precision and recall. When we set the recall weight to 2, we are
    # saying we care twice as much about recall as we do precision.
    #
    # If we had more data, we would not pass in all the blocked data into
    # this function but a representative sample.
    logging.info('finding a good threshold with a recall_weight of %s' % self.recall_weight)
    threshold = deduper.threshold(data_1, data_2, recall_weight=self.recall_weight)

    # `duplicateClusters` will return sets of record IDs that dedupe
    # believes are all referring to the same entity.
    logging.info('clustering...')
    clustered_dupes = deduper.match(data_1, data_2, threshold)

    logging.info('# duplicate sets %s' % len(clustered_dupes))

    write_function = csvhelpers.writeLinkedResults

    # write out our results
    if self.output_file:
        with open(self.output_file, 'w') as output_file:
            write_function(clustered_dupes, self.input_1, self.input_2,
                           output_file, self.inner_join)
    else:
        write_function(clustered_dupes, self.input_1, self.input_2,
                       sys.stdout, self.inner_join)
        'field': 'primary_author_name',
        'type': 'String'
    }, {
        'field': 'full_book_title',
        'type': 'String'
    }]

# Initialise deduplication object
deduper = dedupe.Dedupe(variables)

# Train the model
deduper.sample(data, 150000, .5)  # This randomly generates 150000 pairs of records to compare
dedupe.consoleLabel(deduper)  # Call up command line tool for manually labelling training pairs.
deduper.train()  # Trains the model to work out which pairs are the same

# Write training to disk
with open(training_file, 'w') as tf:
    deduper.writeTraining(tf)
with open(settings_file, 'wb') as sf:
    deduper.writeSettings(sf)

# Now make our predictions
threshold = deduper.threshold(data, recall_weight=1)  # We care equally about precision and recall

print("clustering ...")
# look for it and load it in.
# __Note:__ if you want to train from scratch, delete the training_file
if os.path.exists(training_file):
    print('reading labeled examples from ', training_file)
    with open(training_file) as tf:
        linker.readTraining(tf)

# ## Active learning
# Dedupe will find the next pair of records
# it is least certain about and ask you to label them as matches
# or not.
# use 'y', 'n' and 'u' keys to flag duplicates
# press 'f' when you are finished
print('starting active labeling...')
dedupe.consoleLabel(linker)
linker.train()

# When finished, save our training away to disk
with open(training_file, 'w') as tf:
    linker.writeTraining(tf)

# Save our weights and predicates to disk. If the settings file
# exists, we will skip all the training and learning next time we run
# this file.
with open(settings_file, 'wb') as sf:
    linker.writeSettings(sf)

# ## Blocking
# look for it and load it in.
# __Note:__ if you want to train from scratch, delete the training_file
if os.path.exists(training_file):
    print 'reading labeled examples from ', training_file
    with open(training_file, 'rb') as f:
        deduper.readTraining(f)

# ## Active learning
# Dedupe will find the next pair of records
# it is least certain about and ask you to label them as duplicates
# or not.
# use 'y', 'n' and 'u' keys to flag duplicates
# press 'f' when you are finished
print 'starting active labeling...'
dedupe.consoleLabel(deduper)
deduper.train()

# When finished, save our training away to disk
with open(training_file, 'w') as tf:
    deduper.writeTraining(tf)

# Save our weights and predicates to disk. If the settings file
# exists, we will skip all the training and learning next time we run
# this file.
with open(settings_file, 'w') as sf:
    deduper.writeSettings(sf)

# ## Blocking
def main_dedupe_method(data_d, spot_d, spot_data, chosen_cols3, country_code,
                       settings_file, training_file, fields):
    print '''#############  Welcome to our Matching Algorithm  ################
    =============== Initializing Part 1 - Data Retrieving =================\n'''

    start_time = time.time()

    print "Getting data from BP companies..."

    # Select all companies from the selected country
    con = psy.connect(database='msi_bp_%s' % (country_code), user='******',
                      host='basil.colours.local', password='******')
    con.set_client_encoding('UTF8')
    c = con.cursor(cursor_factory=psy.extras.RealDictCursor)

    q_companies = '''SELECT --DISTINCT on (auth.company_int_id_number)
        CONCAT(ROW_NUMBER() OVER(ORDER BY auth.company_int_id_number),'c') AS line_nr,
        auth.current_authority_pk as auth_nr,
        auth.s_acct_name as "name",
        auth.s_tax_number as vat,
        ads.address_line1 as address,
        ads.address_line3 as town,
        ads.postcode as postcode
        FROM gdm.authorities auth
        LEFT JOIN gdm.addresses ads ON ads.address_id_pk = auth.s_address_id
        WHERE auth.s_acct_name is not null
        ORDER BY auth.company_int_id_number'''

    c.execute(q_companies)
    companies = c.fetchall()

    comp_d = {}
    for row in companies:
        clean_row = {ky: p.proc(row[ky]) for ky in chosen_cols3[1:]}
        row_id = row['line_nr']
        comp_d[row_id] = dict(clean_row)

    print "Saving all data into database..."

    all_data_d = md.merge_dicts(spot_d, comp_d)

    all_data = []
    for k, v in all_data_d.iteritems():
        new_row = [v[ky] for ky in chosen_cols3[1:]]
        new_row.insert(0, k)
        all_data.append(new_row)

    c2 = con.cursor()
    c2.execute('DROP TABLE IF EXISTS htc.all_data')
    field_string = ','.join('%s varchar(200)' % name for name in chosen_cols3)
    c2.execute('''CREATE TABLE htc.all_data (%s)''' % (field_string))

    num_cols = len(chosen_cols3)
    mog = "(" + ("%s," * (num_cols - 1)) + "%s)"
    args_str = ','.join(c2.mogrify(mog, x) for x in all_data)
    values = "(" + ','.join(x for x in chosen_cols3) + ")"
    c2.execute("INSERT INTO htc.all_data %s VALUES %s" % (values, args_str))
    con.commit()

    print 'Data retrieval took ', time.time() - start_time, 'seconds (', (time.time() - start_time)/60, 'minutes)'

    start_time1 = time.time()

    print "Arranging data to create blocks..."

    u_keys = set()
    for n, m in all_data_d.iteritems():
        for k, v in m.iteritems():
            u_keys.add(v)
    u_keys = list(u_keys)

    block_keys = {}
    count = 0
    for k in u_keys:
        block_keys[k] = count
        count += 1

    print 'Checking blocks...'

    ids = []
    for k, v in all_data_d.items():
        ids.append([block_keys[v['name']], v['name'], k])
        if 'postcode' in v.keys():
            if v['postcode'] is not None:
                ids.append([block_keys[v['postcode']], v['postcode'], k])
        if 'town' in v.keys():
            if v['town'] is not None:
                ids.append([block_keys[v['town']], v['town'], k])

    print 'Writing tables...'
    column_names = ['block_key', 'block_id', 'record_id']

    c3 = con.cursor()

    print 'Writing htc.blocks_recordlink_test'

    ## Table with all blocks and companies ##
    c3.execute('DROP TABLE IF EXISTS htc.blocks_recordlink_test')
    field_string = ','.join('%s varchar(200)' % name for name in column_names)
    c3.execute('''CREATE TABLE htc.blocks_recordlink_test (%s)''' % (field_string))

    num_cols = len(column_names)
    mog = "(" + ("%s," * (num_cols - 1)) + "%s)"
    args_str = ','.join(c3.mogrify(mog, x) for x in ids)
    values = "(" + ','.join(x for x in column_names) + ")"
    c3.execute("INSERT INTO htc.blocks_recordlink_test %s VALUES %s" % (values, args_str))
    con.commit()

    print 'Writing htc.plural_blocks'

    ## Table with all blocks that have more than one company ##
    c3.execute("DROP TABLE IF EXISTS htc.plural_blocks")
    c3.execute("DROP TABLE IF EXISTS htc.covered_blocks")
    c3.execute("""CREATE TABLE htc.plural_blocks as (
        SELECT b.block_id
        FROM htc.blocks_recordlink_test b
        INNER JOIN (SELECT DISTINCT block_id FROM htc.blocks_recordlink_test
                    WHERE record_id like '%c') bp
            ON bp.block_id = b.block_id
        INNER JOIN (SELECT DISTINCT block_id FROM htc.blocks_recordlink_test
                    WHERE record_id not like '%c') comp
            ON comp.block_id = b.block_id
        GROUP BY b.block_id
        HAVING COUNT(*) > 1)""")
    con.commit()

    print 'Writing htc.covered_blocks'

    ## Table with the list of blocks for each company ##
    c3.execute("""CREATE TABLE htc.covered_blocks as (
        SELECT record_id, array_agg(r.block_key ORDER BY r.block_key) AS sorted_ids
        FROM htc.blocks_recordlink_test r
        INNER JOIN htc.plural_blocks bl ON r.block_id = bl.block_id
        GROUP BY record_id)""")
    con.commit()

    c3 = con.cursor(cursor_factory=psy.extras.RealDictCursor)

    ## Get all companies and blocks ##
    c3.execute("""SELECT br.record_id, m.*, br.block_key, cb.sorted_ids
        FROM htc.blocks_recordlink_test br
        LEFT JOIN (SELECT * FROM htc.all_data) m ON m.line_nr = br.record_id
        INNER JOIN htc.covered_blocks cb ON br.record_id = cb.record_id
        INNER JOIN htc.plural_blocks pb ON br.block_id = pb.block_id
        ORDER BY br.block_key""")
    rec_ids = c3.fetchall()

    print 'Writing tables took ', time.time() - start_time1, 'seconds (', (time.time() - start_time1)/60, 'minutes)'

    start_time2 = time.time()

    print "Blocking..."
    ## Arrange data into the syntax dedupe needs for blocked matching ##
    blocks = []
    last_key = 0
    count = 0
    bp_block = []
    comp_block = []

    for rec in rec_ids:
        if rec['block_key'] == last_key:
            if rec['record_id'][-1:] != 'c':
                bp_block.append(tuple([rec['record_id'],
                                       {re.sub('_bp', '', your_key): p.proc(rec[your_key])
                                        for your_key in chosen_cols3[1:]},
                                       set(rec['sorted_ids'][:rec['sorted_ids'].index(rec['block_key'])])]))
            else:
                comp_block.append(tuple([rec['record_id'],
                                         comp_d[rec['record_id']],
                                         set(rec['sorted_ids'][:rec['sorted_ids'].index(rec['block_key'])])]))
        else:
            if bp_block != [] and comp_block != []:
                blocks.append((bp_block, comp_block))
            bp_block = []
            comp_block = []
            if rec['record_id'][-1:] == 'c':
                comp_block = [tuple([rec['record_id'],
                                     comp_d[rec['record_id']],
                                     set(rec['sorted_ids'][:rec['sorted_ids'].index(rec['block_key'])])])]
            else:
                bp_block = [tuple([rec['record_id'],
                                   {re.sub('_bp', '', your_key): p.proc(rec[your_key])
                                    for your_key in chosen_cols3[1:]},
                                   set(rec['sorted_ids'][:rec['sorted_ids'].index(rec['block_key'])])])]
        last_key = rec['block_key']
        count += 1

    blocks.append((bp_block, comp_block))
    blocks = tuple(blocks)

    del rec_ids
    del all_data

    con.close()

    # Dynamically create fields according to the chosen columns.
    # We still need to work out what to tell the model when we only have
    # the name to work with.

    print 'Blocking took ', time.time() - start_time2, 'seconds (', (time.time() - start_time2)/60, 'minutes)'

    start_time3 = time.time()

    print "=============== Initializing Part 2 - Dedupe ===============\n"
    print 'Entering Dedupe ...'

    # ====================== DEDUPING WITH BLOCKS ======================
    if os.path.exists(settings_file):
        print 'Reading from: ', settings_file
        with open(settings_file) as sf:
            linker = dedupe.StaticRecordLink(sf)
    else:
        linker = dedupe.RecordLink(fields)
        linker.sample(data_d, comp_d)

        if os.path.exists(training_file):
            print 'Reading labeled examples from: ', training_file
            with open(training_file) as tf:
                linker.readTraining(tf)

        print 'Starting Active Labeling...'
        dedupe.consoleLabel(linker)

        linker.train()

        with open(training_file, 'w') as tf:
            linker.writeTraining(tf)
        with open(settings_file, 'w') as sf:
            linker.writeSettings(sf)

    print 'Creating blocks...'
    threshold = linker.thresholdBlocks(blocks, recall_weight=2)

    print 'Generating clusters...'
    clusters = linker.matchBlocks(blocks, threshold)
    clust = list(clusters)

    print 'Deduping took ', time.time() - start_time3, 'seconds (', (time.time() - start_time3)/60, 'minutes)'

    start_time4 = time.time()

    print 'Writing results to database...'
    con = psy.connect(database='msi_bp_%s' % (country_code), user='******',
                      host='basil.colours.local', password='******')
    c = con.cursor()

    c.execute('DROP TABLE if exists htc.recordlink_blocks_result')
    con.commit()
    c.execute('CREATE TABLE htc.recordlink_blocks_result (spot_id varchar, comp_id varchar, score double precision)')

    # register_adapter(numpy.int64, addapt_numpy_int64)
    num_cols = 3
    mog = "(" + ("%s," * (num_cols - 1)) + "%s)"
    args_str = ','.join(c.mogrify(mog, x[0] + (float(x[1]),)) for x in clust)
    values = "(" + ','.join(x for x in ['spot_id', 'comp_id', 'score']) + ")"
    c.execute("INSERT INTO htc.recordlink_blocks_result %s VALUES %s" % (values, args_str))

    c.execute('DROP TABLE if exists htc.recordlink_blocks_result_all')
    c.execute("""CREATE TABLE htc.recordlink_blocks_result_all as (
        SELECT adt.line_nr as spot_id, br.comp_id as comp_id, br.score as score,
               CASE WHEN br.comp_id IS NULL THEN 'Not a Client'
                    ELSE 'Check Match Score'
               END AS "Result",
               adt.name as name_spot, b.name as name_comp, b.auth_nr as auth_nr_comp
        FROM htc.all_data adt
        LEFT JOIN htc.recordlink_blocks_result br ON br.spot_id = adt.line_nr
        LEFT JOIN (SELECT * FROM htc.recordlink_blocks_result br
                   INNER JOIN htc.all_data adt ON adt.line_nr = br.comp_id) b
            ON b.spot_id = adt.line_nr
        WHERE adt.line_nr not like '%c%')""")
    con.commit()
    con.close()

    print 'Writing results took ', time.time() - start_time4, 'seconds (', (time.time() - start_time4)/60, 'minutes)'
    print 'Ended in ', time.time() - start_time, 'seconds (', (time.time() - start_time)/60, 'minutes)'