def write_clusters(self, clustered_dupes, src_data):
    '''write data with id, cluster_id, conf_score, <canonical columns>'''
    with StringIO() as f:
        is_header = True
        writer = csv.writer(f)
        for (cluster_id, cluster) in enumerate(clustered_dupes):
            id_set, scores = cluster
            # zip id_set and scores together into {id: score}
            score_dict = {id: score for id, score in zip(id_set, scores)}
            all_cluster_rows = [src_data[c] for c in id_set]  # list of dicts
            canonical_rep = dedupe.canonicalize(all_cluster_rows)
            for rec in all_cluster_rows:
                rec['cluster_id'] = cluster_id
                for can_col in canonical_rep:
                    rec['canonical_' + can_col] = canonical_rep[can_col]
                rec['confidence'] = score_dict[int(rec[self.data_source.id_col])]
                if is_header:
                    writer.writerow(rec.keys())
                    is_header = False
                writer.writerow(rec.values())
        self.outp_file.write_file(f.getvalue())
def main():
    d_dict = load_dict()
    if SETTINGS_PATH.exists():
        print(f"Reading settings from {SETTINGS_PATH.name}.")
        with SETTINGS_PATH.open('rb') as f:
            deduper = dedupe.StaticDedupe(f)
    else:
        deduper = label_and_train(d_dict)

    if not DUPES_PATH.exists():
        threshold = deduper.threshold(d_dict, recall_weight=1)
        print("Clustering...")
        clustered_dupes = deduper.match(d_dict, threshold)
        print(f"Writing {len(clustered_dupes)} clusters to {DUPES_PATH.name}.")
        DUPES_PATH.write_bytes(pickle.dumps(clustered_dupes))

    clustered_dupes = pickle.loads(DUPES_PATH.read_bytes())
    for cluster in clustered_dupes:
        id_set, scores = cluster
        cluster_dicts = [d_dict[c] for c in id_set]
        min_index, min_score = get_the_freaking_minimum_index_and_score(scores)
        lowest_score_dict = cluster_dicts[min_index]
        canonical_rep = dedupe.canonicalize(cluster_dicts)
        print(f"\nFound {len(cluster_dicts)} registrations for:")
        print_dict(canonical_rep)
        print(f"Least confident dupe ({min_score}) is:")
        print_dict(lowest_score_dict)
        input("Press enter for next cluster.")
def cluster(self):
    deduper = self.deduper
    if not self.data_d:
        self.data_d = self._read_data(self.input_file)
    data_d = self.data_d

    threshold = deduper.threshold(data_d, recall_weight=1)

    # ## Clustering
    print('clustering...')
    clustered_dupes = deduper.match(data_d, threshold)
    print('# duplicate sets', len(clustered_dupes))

    cluster_membership = {}
    cluster_id = 0
    for (cluster_id, cluster) in enumerate(clustered_dupes):
        id_set, scores = cluster
        cluster_d = [data_d[c] for c in id_set]
        canonical_rep = dedupe.canonicalize(cluster_d)
        for record_id, score in zip(id_set, scores):
            cluster_membership[record_id] = {
                'cluster id': cluster_id,
                'canonical representation': canonical_rep,
                'confidence': score
            }

    singleton_id = cluster_id + 1

    output_file = self.output_file
    input_file = self.input_file
    with open(output_file, 'w') as f_output, open(input_file) as f_input:
        writer = csv.writer(f_output)
        reader = csv.reader(f_input)

        heading_row = next(reader)
        heading_row.insert(0, 'confidence_score')
        heading_row.insert(0, 'Cluster ID')
        canonical_keys = canonical_rep.keys()
        for key in canonical_keys:
            heading_row.append('canonical_' + key)
        writer.writerow(heading_row)

        for row in reader:
            # row_id = int(row[0])
            row_id = row[0]
            if row_id in cluster_membership:
                cluster_id = cluster_membership[row_id]["cluster id"]
                canonical_rep = cluster_membership[row_id]["canonical representation"]
                row.insert(0, cluster_membership[row_id]['confidence'])
                row.insert(0, cluster_id)
                for key in canonical_keys:
                    row.append(canonical_rep[key].encode('utf8'))
            else:
                row.insert(0, None)
                row.insert(0, singleton_id)
                singleton_id += 1
                for key in canonical_keys:
                    row.append(None)
            writer.writerow(row)
def writeCanonRep(session_id, name_pattern='cr_{0}'):
    engine = worker_session.bind
    metadata = MetaData()
    entity = Table('entity_{0}'.format(session_id), metadata,
                   autoload=True, autoload_with=engine, keep_existing=True)
    proc_table = Table('processed_{0}'.format(session_id), metadata,
                       autoload=True, autoload_with=engine, keep_existing=True)

    cr_cols = [Column('record_id', String, primary_key=True)]
    for col in proc_table.columns:
        if col.name != 'record_id':
            cr_cols.append(Column(col.name, col.type))
    cr = Table(name_pattern.format(session_id), metadata, *cr_cols)
    cr.drop(bind=engine, checkfirst=True)
    cr.create(bind=engine)

    cols = [entity.c.entity_id]
    col_names = [c for c in proc_table.columns.keys() if c != 'record_id']
    for name in col_names:
        cols.append(label(name, func.array_agg(getattr(proc_table.c, name))))
    rows = worker_session.query(*cols)\
        .filter(entity.c.record_id == proc_table.c.record_id)\
        .group_by(entity.c.entity_id)

    names = cr.columns.keys()
    with open('/tmp/{0}.csv'.format(name_pattern.format(session_id)), 'wb') as f:
        writer = UnicodeCSVWriter(f)
        writer.writerow(names)
        for row in rows:
            r = [row.entity_id]
            dicts = [dict(**{n: None for n in col_names}) for i in range(len(row[1]))]
            for idx, dct in enumerate(dicts):
                for name in col_names:
                    dicts[idx][name] = unicode(getattr(row, name)[idx])
            canon_form = dedupe.canonicalize(dicts)
            r.extend([canon_form[k] for k in names if canon_form.get(k) is not None])
            writer.writerow(r)

    canon_table_name = name_pattern.format(session_id)
    copy_st = 'COPY "{0}" ('.format(canon_table_name)
    for idx, name in enumerate(names):
        if idx < len(names) - 1:
            copy_st += '"{0}", '.format(name)
        else:
            copy_st += '"{0}") '.format(name)
    copy_st += "FROM STDIN WITH (FORMAT CSV, HEADER TRUE, DELIMITER ',', NULL ' ')"

    conn = engine.raw_connection()
    cur = conn.cursor()
    with open('/tmp/{0}.csv'.format(name_pattern.format(session_id)), 'rb') as f:
        cur.copy_expert(copy_st, f)
    conn.commit()
def cluster(self):
    # Set threshold
    print('threshold...')
    threshold = self.deduper.threshold(self.dictionary, recall_weight=1)

    # ## Clustering
    print('clustering...')
    clustered_dupes = self.deduper.match(self.dictionary, threshold)

    cluster_membership = {}
    cluster_id = 0
    for (cluster_id, cluster) in enumerate(clustered_dupes):
        id_set, scores = cluster
        cluster_d = [self.dictionary[c] for c in id_set]
        canonical_rep = dedupe.canonicalize(cluster_d)
        for record_id, score in zip(id_set, scores):
            cluster_membership[record_id] = {
                "cluster id": cluster_id,
                "canonical representation": canonical_rep,
                "confidence": score
            }

    cluster_index = []
    for i in cluster_membership.items():
        cluster_index.append(i)

    dfa = pd.DataFrame(cluster_index)
    dfa.rename(columns={0: 'Id'}, inplace=True)
    dfa['cluster id'] = dfa[1].apply(lambda x: x["cluster id"])
    dfa['confidence'] = dfa[1].apply(lambda x: x["confidence"])

    canonical_list = []
    for i in dfa[1][0]['canonical representation'].keys():
        canonical_list.append(i)
        dfa[i + ' - ' + 'canonical'] = None
        dfa[i + ' - ' + 'canonical'] = dfa[1].apply(
            lambda x: x['canonical representation'][i])

    dfa.set_index('Id', inplace=True)
    self.dataframe = self.dataframe.join(dfa)
    self.dataframe[self.entity["output_fields"] + ['cluster id', 'confidence']].to_csv(
        self.entity["name"].lower() + "_output.csv")
    # self.dataframe[self.entity["output_fields"] + ['cluster id', 'confidence']].to_sql(
    #     self.entity["name"].lower() + "_output", con=db.engine, if_exists='replace')
    return self.dataframe
def rundedupe2():
    threshold = deduper.threshold(data_d, recall_weight=1)

    print('clustering...')
    clustered_dupes = deduper.match(data_d, threshold)
    print('# duplicate sets', len(clustered_dupes))

    cluster_membership = {}
    cluster_id = 0
    for (cluster_id, cluster) in enumerate(clustered_dupes):
        id_set, scores = cluster
        cluster_d = [data_d[c] for c in id_set]
        canonical_rep = dedupe.canonicalize(cluster_d)
        for record_id, score in zip(id_set, scores):
            cluster_membership[record_id] = {
                "cluster id": cluster_id,
                "canonical representation": canonical_rep,
                "confidence": score
            }

    singleton_id = cluster_id + 1

    with open(os.getcwd() + "/media/output_files/" + output_file, 'w') as f_output, \
            open(input_file) as f_input:
        writer = csv.writer(f_output)
        reader = csv.reader(f_input)

        heading_row = next(reader)
        heading_row.insert(0, 'confidence_score')
        heading_row.insert(0, 'Cluster ID')
        canonical_keys = canonical_rep.keys()
        for key in canonical_keys:
            heading_row.append('canonical_' + key)
        writer.writerow(heading_row)

        for row in reader:
            row_id = int(row[0])
            if row_id in cluster_membership:
                cluster_id = cluster_membership[row_id]["cluster id"]
                canonical_rep = cluster_membership[row_id]["canonical representation"]
                row.insert(0, cluster_membership[row_id]['confidence'])
                row.insert(0, cluster_id)
                for key in canonical_keys:
                    row.append(canonical_rep[key].encode('utf8'))
            else:
                row.insert(0, None)
                row.insert(0, singleton_id)
                singleton_id += 1
                for key in canonical_keys:
                    row.append(None)
            writer.writerow(row)
def gen_cluster_members(self, clustered_dupes, src_data):
    '''format and canonicalize the matched data'''
    # clustered_dupes looks like [((<ids>), (<conf scores>)), ...]
    cluster_membership = {}
    for (cluster_id, cluster) in enumerate(clustered_dupes):
        id_set, scores = cluster
        cluster_d = [src_data[c] for c in id_set]
        canonical_rep = dedupe.canonicalize(cluster_d)
        for record_id, score in zip(id_set, scores):
            cluster_membership[record_id] = {
                "cluster id": cluster_id,
                "canonical representation": canonical_rep,
                "confidence": score
            }
    return cluster_membership
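For reference, a minimal, self-contained sketch of the structures the snippets above consume; the records, ids, and scores here are made up for illustration, and only dedupe.canonicalize() comes from the library.

import dedupe

# Hypothetical source records keyed by id (illustrative data only).
src_data = {
    1: {'name': 'Acme Inc', 'city': 'Chicago'},
    2: {'name': 'ACME Incorporated', 'city': 'Chicago'},
    3: {'name': 'Zenith LLC', 'city': 'Boston'},
}

# deduper.match() returns tuples of (record ids, confidence scores);
# here a single cluster is written out by hand.
clustered_dupes = [((1, 2), (0.97, 0.85))]

for cluster_id, (id_set, scores) in enumerate(clustered_dupes):
    cluster_d = [src_data[i] for i in id_set]
    # canonicalize() picks one representative value per field
    canonical_rep = dedupe.canonicalize(cluster_d)
    print(cluster_id, canonical_rep, dict(zip(id_set, scores)))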
print('clustering...')
clustered_dupes = deduper.match(data_d, threshold)
print('# duplicate sets', len(clustered_dupes))

# ## Writing Results

# Write our original data back out to a CSV with a new column called
# 'Cluster ID' which indicates which records refer to each other.

cluster_membership = {}
cluster_id = 0
for (cluster_id, cluster) in enumerate(clustered_dupes):
    id_set, scores = cluster
    cluster_d = [data_d[c] for c in id_set]
    canonical_rep = dedupe.canonicalize(cluster_d)
    for record_id, score in zip(id_set, scores):
        cluster_membership[record_id] = {
            "cluster id": cluster_id,
            "canonical representation": canonical_rep,
            "confidence": score
        }

singleton_id = cluster_id + 1

with open(output_file, 'w') as f_output:
    writer = csv.writer(f_output)
    with open(input_file) as f_input:
        reader = csv.reader(f_input)
print 'clustering...'
clustered_dupes = deduper.match(data_d, threshold)
print '# duplicate sets', len(clustered_dupes)

# ## Writing Results

# Write our original data back out to a CSV with a new column called
# 'Cluster ID' which indicates which records refer to each other.

cluster_membership = {}
cluster_id = 0
for (cluster_id, cluster) in enumerate(clustered_dupes):
    id_set, conf_score = cluster
    cluster_d = [data_d[c] for c in id_set]
    canonical_rep = dedupe.canonicalize(cluster_d)
    for record_id in id_set:
        cluster_membership[record_id] = {
            "cluster id": cluster_id,
            "canonical representation": canonical_rep,
            "confidence": conf_score
        }

singleton_id = cluster_id + 1

with open(output_file, 'w') as f_output:
    writer = csv.writer(f_output)
    with open(input_file) as f_input:
        reader = csv.reader(f_input)
def _cluster(deduper, data, threshold, canonicalize):
    """Internal method that clusters the data.

    Parameters
    ----------
    deduper : dedupe.Deduper
        A trained instance of dedupe.
    data : dict
        The dedupe formatted data dictionary.
    threshold : dedupe.Threshold
        The threshold used for clustering.
    canonicalize : bool or list, default False
        Option that provides the canonical records as additional columns.
        Specifying a list of column names only canonicalizes those columns.

    Returns
    -------
    pd.DataFrame
        A dataframe storing the clustering results.
    """
    # ## Clustering
    print('clustering...')
    clustered_dupes = deduper.match(data, threshold)
    print('# duplicate sets', len(clustered_dupes))

    # Convert data_d to string so that Price & LatLong won't get traceback
    # during dedupe.canonicalize()
    for i in data.values():
        for key in i:
            if i[key] is None:
                pass
            else:
                i[key] = str(i[key])

    df_data = []
    # ## Writing Results
    cluster_id = 0
    for (cluster_id, cluster) in enumerate(clustered_dupes):
        id_set, scores = cluster
        cluster_d = [data[c] for c in id_set]
        canonical_rep = None
        if canonicalize:
            canonical_rep = dedupe.canonicalize(cluster_d)
        for record_id, score in zip(id_set, scores):
            tmp = {
                'Id': record_id,
                'cluster id': cluster_id,
                'confidence': score,
            }
            if canonicalize:
                fields_to_canon = canonical_rep.keys()
                if isinstance(canonicalize, list):
                    fields_to_canon = canonicalize
                for key in fields_to_canon:
                    canon_key = 'canonical_' + key
                    tmp[canon_key] = canonical_rep[key]
            df_data.append(tmp)

    clustered_df = pd.DataFrame(df_data)
    clustered_df = clustered_df.set_index('Id')

    return clustered_df
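The stringification step above matters because dedupe.canonicalize() compares field values as strings. A minimal sketch of that same preprocessing, using made-up records and a hypothetical numeric 'price' field:

import dedupe

# Illustrative cluster with a numeric field; values are cast to str first,
# mirroring the conversion loop in _cluster() above.
cluster_d = [
    {'name': 'Acme Inc', 'price': 10.0},
    {'name': 'ACME Incorporated', 'price': 12.5},
]
cluster_d = [{k: str(v) for k, v in rec.items()} for rec in cluster_d]
print(dedupe.canonicalize(cluster_d))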
def deduping_function(fields, training_file, settings_file, input_file, output_file, input_dict):
    """Dedupe rows given:

    fields: list of dictionaries of fields relevant for the training
    training_file: JSON with the training set, if the function was already run
    settings_file: file with settings, if the function was already run
    input_file: CSV with values to dedupe
    output_file: CSV where results are going to be written
    input_dict: dictionary with keys as id and values as a dictionary of features for each row

    The result is written to output_file.
    """
    if os.path.exists(settings_file):
        print('reading from {}'.format(settings_file))
        with open(settings_file, 'rb') as f:
            deduper = dedupe.StaticDedupe(f)
    else:
        deduper = dedupe.Dedupe(fields)
        deduper.sample(input_dict)

        if os.path.exists(training_file):
            print('reading labeled examples from {}'.format(training_file))
            with open(training_file) as tf:
                deduper.readTraining(tf)

        print('starting active labeling...')
        dedupe.consoleLabel(deduper)
        deduper.train()

        print('writing training file')
        with open(training_file, 'w') as tf:
            deduper.writeTraining(tf)

        print('writing settings file')
        with open(settings_file, 'wb') as sf:
            deduper.writeSettings(sf)

    print('blocking...')
    threshold = deduper.threshold(input_dict, recall_weight=2)

    print('clustering...')
    clustered_dupes = deduper.match(input_dict, threshold)
    print('# duplicate sets {}'.format(len(clustered_dupes)))

    cluster_membership = {}
    cluster_id = 0
    for (cluster_id, cluster) in enumerate(clustered_dupes):
        id_set, scores = cluster
        cluster_d = [input_dict[c] for c in id_set]
        canonical_rep = dedupe.canonicalize(cluster_d)
        for record_id, score in zip(id_set, scores):
            cluster_membership[record_id] = {
                "cluster id": cluster_id,
                "canonical representation": canonical_rep,
                "confidence": score
            }

    singleton_id = cluster_id + 1

    with open(output_file, 'w') as f_output:
        writer = csv.writer(f_output)
        with open(input_file) as f_input:
            reader = csv.reader(f_input)

            heading_row = next(reader)
            heading_row.insert(0, 'confidence_score')
            heading_row.insert(0, 'Cluster ID')
            canonical_keys = canonical_rep.keys()
            for key in canonical_keys:
                heading_row.append('canonical_' + key)
            writer.writerow(heading_row)

            for row in reader:
                row_id = int(row[0])
                if row_id in cluster_membership:
                    cluster_id = cluster_membership[row_id]["cluster id"]
                    canonical_rep = cluster_membership[row_id]["canonical representation"]
                    row.insert(0, cluster_membership[row_id]['confidence'])
                    row.insert(0, cluster_id)
                    for key in canonical_keys:
                        row.append(canonical_rep[key].encode('utf8'))
                else:
                    row.insert(0, None)
                    row.insert(0, singleton_id)
                    singleton_id += 1
                    for key in canonical_keys:
                        row.append(None)
                writer.writerow(row)

    print('deduping successfully finished')
def dedupe_dataframe(df, field_properties, canonicalize=False,
                     config_name="dedupe_dataframe"):
    # Import Data
    print('running CUSTOM dedupe_dataframe LOCAL')
    config_name = config_name.replace(" ", "_")

    settings_file = config_name + '_learned_settings'
    training_file = config_name + '_training.json'

    print('importing data ...')
    df = clean_punctuation(df)
    specify_type(df, field_properties)

    df['dictionary'] = df.apply(lambda x: dict(zip(df.columns, x.tolist())), axis=1)
    data_d = dict(zip(df.index, df.dictionary))

    for col in df.columns:
        print("df col - - - ", col)
    print(' df[dictionary].. shape', df.shape)

    # If a settings file already exists, we'll just load that and skip training
    if os.path.exists(settings_file):
        print('reading from', settings_file)
        with open(settings_file, 'rb') as f:
            deduper = dedupe.StaticDedupe(f)
    else:
        # ## Training
        # Define the fields dedupe will pay attention to
        fields = []
        select_fields(fields, field_properties)
        # print("fields", fields)

        # Create a new deduper object and pass our data model to it.
        deduper = dedupe.Dedupe(fields)

        # To train dedupe, we feed it a sample of records.
        deduper.sample(data_d, 15000)

        # If we have training data saved from a previous run of dedupe,
        # look for it and load it in.
        # __Note:__ if you want to train from scratch, delete the training_file
        if os.path.exists(training_file):
            print('reading labeled examples from ', training_file)
            with open(training_file, 'rb') as f:
                deduper.readTraining(f)

        # print('starting active labeling...')
        dedupe.consoleLabel(deduper)

        # Using the examples we just labeled, train the deduper and learn
        # blocking predicates
        print('calling train...')
        deduper.train()

        # When finished, save our training to disk
        with open(training_file, 'w') as tf:
            deduper.writeTraining(tf)

        # Save our weights and predicates to disk. If the settings file
        # exists, we will skip all the training and learning next time we run
        # this file.
        with open(settings_file, 'wb') as sf:
            deduper.writeSettings(sf)
        print('saved our training to disk.')

    # ## Set threshold
    threshold = deduper.threshold(data_d, recall_weight=1)
    print('threshold=', threshold)

    # ## Clustering
    print('clustering...')
    clustered_dupes = deduper.match(data_d, threshold)
    print('# duplicate sets', len(clustered_dupes))

    # Convert data_d to string so that Price & LatLong won't get traceback
    # during dedupe.canonicalize()
    for i in data_d.values():
        for key in i:
            if i[key] is None:
                pass
            else:
                i[key] = str(i[key])

    # ## Writing Results
    cluster_membership = {}
    cluster_id = 0
    for (cluster_id, cluster) in enumerate(clustered_dupes):
        id_set, scores = cluster
        cluster_d = [data_d[c] for c in id_set]
        canonical_rep = dedupe.canonicalize(cluster_d)
        for record_id, score in zip(id_set, scores):
            cluster_membership[record_id] = {
                "cluster id": cluster_id,
                "canonical representation": canonical_rep,
                "confidence": score
            }

    cluster_index = []
    for i in cluster_membership.items():
        cluster_index.append(i)

    # turn results into dataframe
    dfa = pd.DataFrame(cluster_index)
    dfa.rename(columns={0: 'Id index'}, inplace=True)

    canonical_list = []
    if canonicalize == True:
        for i in dfa[1][0]['canonical representation'].keys():
            canonical_list.append(i)
            dfa[i + ' - ' + 'canonical'] = None
            dfa[i + ' - ' + 'canonical'] = dfa[1].apply(
                lambda x: x['canonical representation'][i])
    elif canonicalize == False:
        pass
    elif isinstance(canonicalize, list):
        for i in canonicalize:
            dfa[i + ' - ' + 'canonical'] = None
            dfa[i + ' - ' + 'canonical'] = dfa[1].apply(
                lambda x: x['canonical representation'][i])

    dfa.set_index('Id index', inplace=True)
    df = df.join(dfa)

    return df
def process():
    """entry point"""
    api = sesamclient.Connection(sesamapi_base_url=INSTANCE, jwt_auth_token=JWT, timeout=60 * 10)
    result = api.session.get(
        api.sesamapi_base_url + "publishers/{}/entities?deleted=false&history=false".format(SOURCE))
    if result.status_code != 200:
        logging.warning('Sesam API returned non 200 code {}'.format(result.status_code))
        iterator = iter([])
    else:
        iterator = iter(result.json())

    raw_data = []
    for dataset_entity in iterator:
        raw_data.append(dataset_entity)

    # this is an in-memory service; for large datasets we would probably want a service with DB storage
    if len(raw_data) > 100_000:
        logging.warning(
            'This service is not suitable for datasets with more than 100 000 elements'
            ' and may lead to OutOfMemory errors')

    cleaned_data = read_data(raw_data)

    logging.info('reading from %s', SETTINGS_FILE)
    with open(SETTINGS_FILE, 'rb') as f:
        deduper = dedupe.StaticDedupe(f)

    threshold = deduper.threshold(cleaned_data, recall_weight=1)

    logging.info('clustering...')
    clustered_dupes = deduper.match(cleaned_data, threshold)
    logging.info('%d duplicate sets found', len(clustered_dupes))

    cluster_membership = {}
    cluster_id = 0
    for (cluster_id, cluster) in enumerate(clustered_dupes):
        id_set, scores = cluster
        cluster_d = [cleaned_data[c] for c in id_set]
        canonical_rep = dedupe.canonicalize(cluster_d)
        for record_id, score in zip(id_set, scores):
            cluster_membership[record_id] = {
                "cluster id": cluster_id,
                "canonical representation": canonical_rep,
                "confidence": score
            }

    result_dataset = []
    for row in raw_data:
        row_id = row['_id']
        if row_id in cluster_membership:
            result_dict = {
                '_id': row_id,
                'cluster_id': cluster_membership[row_id]["cluster id"],
                'confidence_score': cluster_membership[row_id]['confidence']
            }
            if ADD_ORIGINALS:
                result_dict['originals'] = {key: row[key] for key in KEYS}
            if ADD_CANONICALS:
                result_dict['canonical_rep'] = cluster_membership[row_id]["canonical representation"]
            result_dataset.append(result_dict)

    if TARGET is not None:
        target_url = api.sesamapi_base_url + "receivers/{}/entities".format(TARGET)
        requests.post(target_url,
                      json.dumps(sorted(result_dataset, key=lambda k: k['cluster_id']),
                                 cls=NumpyEncoder),
                      headers={
                          'Authorization': 'Bearer {}'.format(JWT),
                          'Content-Type': 'application/json'
                      })
        return Response()

    return Response(json.dumps(sorted(result_dataset, key=lambda k: k['cluster_id']),
                               cls=NumpyEncoder),
                    mimetype='application/json')
def dedupe_training(category, recall_weight=1):
    assert category in ['mangas', 'animes'], "Only mangas or animes need training"

    data = {}
    for work in Work.objects.filter(category__slug=category[:-1]).values_list('title', 'vo_title', 'id'):
        data[work[2]] = {'title': work[0], 'vo_title': work[1]}
        for field in ['title', 'vo_title']:
            if not data[work[2]][field]:
                data[work[2]][field] = None

    fields = [
        {'field': 'title', 'type': 'String'},
        {'field': 'vo_title', 'type': 'String'},
    ]

    output_file = os.path.join(DEDUPE_DIR, category + '_output.csv')
    settings_file = os.path.join(DEDUPE_DIR, category + '_learned_settings')
    training_file = os.path.join(DEDUPE_DIR, category + '_training.json')

    deduper = Dedupe(fields)
    deduper.sample(data)
    consoleLabel(deduper)

    if os.path.exists(training_file):
        print('reading labeled examples from ', training_file)
        with open(training_file, 'rb') as f:
            deduper.readTraining(f)

    deduper.train()

    with open(training_file, 'w') as tf:
        deduper.writeTraining(tf)
    with open(settings_file, 'wb') as sf:
        deduper.writeSettings(sf)

    # recall_weight defines the value you give to recall as opposed to precision
    threshold = deduper.threshold(data, recall_weight)

    print('clustering...')
    clustered_dupes = deduper.match(data, threshold)
    print('# duplicate sets', len(clustered_dupes))

    input_file = os.path.join(DEDUPE_DIR, category + '.csv')
    with open(input_file, 'w', newline='', encoding='utf-8') as f:
        writer = csv.writer(f, delimiter='\t')
        writer.writerow(['id', 'title', 'vo_title'])
        for work_id in data:
            title = data[work_id]['title']
            vo_title = data[work_id]['vo_title']
            writer.writerow([work_id, title, vo_title])

    cluster_membership = {}
    cluster_id = 0
    for (cluster_id, cluster) in enumerate(clustered_dupes):
        id_set, scores = cluster
        cluster_d = [data[c] for c in id_set]
        canonical_rep = canonicalize(cluster_d)
        for record_id, score in zip(id_set, scores):
            cluster_membership[record_id] = {
                "cluster id": cluster_id,
                "canonical representation": canonical_rep,
                "confidence": score
            }

    singleton_id = cluster_id + 1

    with open(output_file, 'w', newline='', encoding='utf-8') as f_output, \
            open(input_file, newline='', encoding='utf-8') as f_input:
        writer = csv.writer(f_output, delimiter='\t')
        reader = csv.reader(f_input, delimiter='\t')

        heading_row = next(reader)
        heading_row.insert(0, 'confidence_score')
        heading_row.insert(0, 'Cluster ID')
        canonical_keys = canonical_rep.keys()
        for key in canonical_keys:
            heading_row.append('canonical_' + key)
        writer.writerow(heading_row)

        for row in reader:
            row_id = int(row[0])
            if row_id in cluster_membership:
                cluster_id = cluster_membership[row_id]["cluster id"]
                canonical_rep = cluster_membership[row_id]["canonical representation"]
                row.insert(0, cluster_membership[row_id]['confidence'])
                row.insert(0, cluster_id)
                for key in canonical_keys:
                    row.append(canonical_rep[key])
            else:
                row.insert(0, None)
                row.insert(0, singleton_id)
                singleton_id += 1
                for key in canonical_keys:
                    row.append(None)
            writer.writerow(row)

    print('output file written')
def dupe_aff(aff_data):
    import dedupe
    import logging

    log_level = logging.DEBUG
    logging.getLogger().setLevel(log_level)

    fields = [
        {'field': 'name', 'type': 'String'},
    ]

    data = {}
    cnt = 0
    for aff in aff_data:
        data[cnt] = {"id": cnt, "name": aff}
        cnt += 1

    deduper = dedupe.Dedupe(fields)
    deduper.sample(data, 15000)

    print('starting active labeling...')
    dedupe.consoleLabel(deduper)
    deduper.train()

    print('blocking...')
    threshold = deduper.threshold(data, recall_weight=2)

    # `match` will return sets of record IDs that dedupe
    # believes are all referring to the same entity.
    print('clustering...')
    clustered_dupes = deduper.match(data, threshold)
    print('# duplicate sets', len(clustered_dupes))

    # ## Writing Results
    # Write our original data back out to a CSV with a new column called
    # 'Cluster ID' which indicates which records refer to each other.
    cluster_membership = {}
    cluster_id = 0
    for (cluster_id, cluster) in enumerate(clustered_dupes):
        id_set, scores = cluster
        cluster_d = [data[c] for c in id_set]
        for x in cluster_d:
            x["id"] = str(x["id"])
        canonical_rep = dedupe.canonicalize(cluster_d)
        for record_id, score in zip(id_set, scores):
            cluster_membership[record_id] = {
                "cluster id": cluster_id,
                "canonical representation": canonical_rep,
                "confidence": score
            }

    singleton_id = cluster_id + 1

    clusters = []
    for c in clustered_dupes:
        clusters.append([data[k] for k in c[0]])
    clusters = sorted(clusters, key=lambda x: len(x), reverse=True)
def train(input_file, train_sample, fields):
    try:
        data_d = Dedupetrainer.readData(input_file)

        # Define the fields dedupe will pay attention to
        # Create a new deduper object and pass our data model to it.
        deduper = dedupe.Dedupe(fields)
        print("***********************")
        deduper.sample(data_d, train_sample)
        dedupe.consoleLabel(deduper)
        deduper.train()

        threshold = deduper.threshold(data_d, recall_weight=1)

        # ## Clustering
        # `match` will return sets of record IDs that dedupe
        # believes are all referring to the same entity.
        print('clustering...')
        clustered_dupes = deduper.match(data_d, threshold)
        print('# duplicate sets', len(clustered_dupes))

        cluster_membership = {}
        cluster_id = 0
        for (cluster_id, cluster) in enumerate(clustered_dupes):
            id_set, scores = cluster
            cluster_d = [data_d[c] for c in id_set]
            canonical_rep = dedupe.canonicalize(cluster_d)
            for record_id, score in zip(id_set, scores):
                cluster_membership[record_id] = {
                    "cluster id": cluster_id,
                    "canonical representation": canonical_rep,
                    "confidence": score
                }

        singleton_id = cluster_id + 1

        output_file = "./media/output.csv"
        with open(output_file, 'w') as f_output, open(input_file) as f_input:
            writer = csv.writer(f_output)
            reader = csv.reader(f_input)

            heading_row = next(reader)
            heading_row.insert(0, 'confidence_score')
            heading_row.insert(0, 'Cluster ID')
            canonical_keys = canonical_rep.keys()
            for key in canonical_keys:
                heading_row.append('canonical_' + key)
            writer.writerow(heading_row)

            for row in reader:
                row_id = int(row[0])
                if row_id in cluster_membership:
                    cluster_id = cluster_membership[row_id]["cluster id"]
                    canonical_rep = cluster_membership[row_id]["canonical representation"]
                    row.insert(0, cluster_membership[row_id]['confidence'])
                    row.insert(0, cluster_id)
                    for key in canonical_keys:
                        row.append(canonical_rep[key].encode('utf8'))
                else:
                    row.insert(0, None)
                    row.insert(0, singleton_id)
                    singleton_id += 1
                    for key in canonical_keys:
                        row.append(None)
                writer.writerow(row)

        print("process has been completed")
        return "success"
    except Exception as e:
        print("Error in catch ", str(e))
        return "failure"
def clusterreview(request):
    username = request.session['username']
    if request.method == 'POST':
        print("cluster review post")
        selects = request.POST.getlist("_selected_action")
        print(selects)
        clusterid = request.POST.get("clusterid")
        # clusterCora.objects.filter(clusterid=clusterid).update(is_checked=1)
        aaa = clusterCora.objects.filter(clusterid=clusterid)
        sett = set(aitem.cora_id for aitem in aaa)
        print("sett", sett)
        noset = set(int(noitem) for noitem in selects)
        print('noset:', noset)
        iyes = list(sett.difference(noset))
        print("iyes", iyes)
        clusterCora.objects.filter(cora_id__in=noset).update(is_checked=0)  # no
        clusterCora.objects.filter(cora_id__in=set(iyes)).update(is_checked=1)  # yes

        uids = WorkerInfo.objects.filter(user=username).values_list('id', flat=True)
        msg = {"clusterid": clusterid, "include": iyes, 'exclude': list(noset)}
        WorkLog.objects.create(
            task=dxaconstants.ERTASK.Cora,
            operate_message=json.dumps(msg),
            operate_flag=dxaconstants.WorkerOperation.clusterReview,
            operate_user=uids[0],
            operate_system=dxaconstants.TestSystems.Dedupe)

        num = clusterCora.objects.filter(user=username, is_checked=-1).count()
        print("num:", num)
        if num == 0:
            clusteridlist = clusterCora.objects.values('clusterid').distinct()
            print('clusterlist:', clusteridlist)
            examples = {'distinct': [], 'match': []}
            for clusteridd in clusteridlist:
                print(clusteridd)
                print(clusteridd['clusterid'])
                aaa = clusterCora.objects.filter(clusterid=clusteridd['clusterid'])
                # sett = set(aitem.cora_id for aitem in aaa)
                nosets = clusterCora.objects.filter(
                    clusterid=clusteridd['clusterid'], is_checked=0)  # no
                yessets = clusterCora.objects.filter(
                    clusterid=clusteridd['clusterid'], is_checked=1)  # yes
                nocoraids = set(aitem.cora_id for aitem in nosets)
                yescoraids = set(aitem.cora_id for aitem in yessets)
                inosescora = Cora.objects.filter(id__in=nocoraids)
                iyesescora = Cora.objects.filter(id__in=yescoraids)
                for nocora in inosescora:
                    d1 = dict([('text', nocora.text), ('id', str(nocora.id))])
                    for yescora in iyesescora:
                        d2 = dict([('text', yescora.text), ('id', str(yescora.id))])
                        record_pair = (d1, d2)
                        # record_pair = dict([('text', ditem.text1), ('id', ditem.id1)]) + dict(
                        #     [('text', ditem.text2), ('id', ditem.id2)])
                        print(record_pair)
                        examples['distinct'].append(record_pair)
                for yescora1 in iyesescora:
                    d1 = dict([('text', yescora1.text), ('id', str(yescora1.id))])
                    for yescora2 in iyesescora:
                        d2 = dict([('text', yescora2.text), ('id', str(yescora2.id))])
                        if not yescora1.id == yescora2.id:
                            record_pair = (d1, d2)
                            examples['match'].append(record_pair)

            if os.path.exists(training_file):
                print('reading labeled examples from ', training_file)
                with open(training_file, 'rb') as af:
                    deduper.readTraining(af)

            deduper.markPairs(examples)
            deduper.train()

            # When finished, save our training away to disk
            with open(training_file, 'w') as tf:
                deduper.writeTraining(tf)

            # Save our weights and predicates to disk. If the settings file
            # exists, we will skip all the training and learning next time we run
            # this file.
            with open(settings_file, 'wb') as sf:
                deduper.writeSettings(sf)

            threshold = deduper.threshold(temp_d, recall_weight=1)

            print('clustering...')
            clustered_dupes = deduper.match(temp_d, threshold)
            print('# duplicate sets', len(clustered_dupes))

            cluster_id = 0
            print(clustered_dupes)

            aaa = models.clusterTemp.objects.filter(user=username)
            if aaa:
                aaa.delete()
            models.clusterCanonicalRepresentation.objects.filter(user=username).delete()

            for (cluster_id, cluster) in enumerate(clustered_dupes):
                id_set, scores = cluster
                print(id_set)
                cluster_d = [temp_d[c] for c in id_set]
                print(cluster_d)
                canonical_rep = dedupe.canonicalize(cluster_d)
                models.clusterCanonicalRepresentation.objects.create(
                    clusterid=cluster_id, canonrep=canonical_rep,
                    user=username, task=dxaconstants.ERTASK.Cora)
                for record_id, score in zip(id_set, scores):
                    # cluster_membershipRound2[record_id] = {
                    #     "cluster id": cluster_id,
                    #     "canonical representation": canonical_rep,
                    #     "confidence": score
                    # }
                    models.clusterTemp.objects.create(
                        task_record_id=record_id, task=dxaconstants.ERTASK.Cora,
                        clusterid=cluster_id, confidence=score, user=username)
            # print(cluster_membershipRound2)

            aaaaa = models.clusterTemp.objects.all()
            sett = set(bitem.task_record_id for bitem in aaaaa)
            corass = Cora.objects.all()
            alls = set(bitem.id for bitem in corass)
            isingle = list(alls.difference(sett))
            for single in isingle:
                models.clusterTemp.objects.create(
                    task_record_id=single, task=dxaconstants.ERTASK.Cora,
                    clusterid=-1, confidence=0.0, user=username)

            return redirect('/dedupe/addtoclusters')
        else:
            a = clusterCora.objects.filter(user=username, is_checked=-1).aggregate(Min('clusterid'))
            datas = clusterCora.objects.filter(clusterid=a['clusterid__min'])
            sett = set(citem.cora_id for citem in datas)
            coras = Cora.objects.filter(id__in=sett)
            print(coras)
            return render(request, 'dedupe/clusterreview.html',
                          {'data': coras, 'clusterid': a['clusterid__min']})

    a = clusterCora.objects.filter(user=username, is_checked=-1).aggregate(Min('clusterid'))
    print(a)
    print(a['clusterid__min'])
    datas = clusterCora.objects.filter(clusterid=a['clusterid__min'])
    print(datas)
    sett = set(bitem.cora_id for bitem in datas)
    coras = Cora.objects.filter(id__in=sett)
    print(coras)
    return render(request, 'dedupe/clusterreview.html',
                  {'data': coras, 'clusterid': a['clusterid__min']})
def deDuplication():
    input_file = 'csv_example_input.csv'
    output_file = 'csv_example_output.csv'
    settings_file = 'csv_example_learned_settings3'
    training_file = 'csv_example_training.json3'

    def preProcess(column):
        try:
            column = column.decode('utf-8')
        except AttributeError:
            pass
        column = unidecode(column)
        column = re.sub(' +', ' ', column)
        column = re.sub('\n', ' ', column)
        column = column.strip().strip('"').strip("'").lower().strip()
        if not column:
            column = None
        return column

    # Read in the data from the CSV file:
    def readData(filename):
        data_d = {}
        with open(filename, encoding="utf-8") as f:
            reader = csv.DictReader(f)
            for row in reader:
                clean_row = [(k, preProcess(v)) for (k, v) in row.items()]
                row_id = row['id']
                data_d[row_id] = dict(clean_row)
        return data_d

    print('importing data ...')
    data_d = readData(input_file)

    if os.path.exists(settings_file):
        print('reading from', settings_file)
        with open(settings_file, 'rb') as f:
            deduper = dedupe.StaticDedupe(f)
    else:
        fields = [
            {'field': 'DisplayName_Processed', 'type': 'String'},
            {'field': 'Email_Processed', 'type': 'String'},
            {'field': 'Info', 'type': 'String'},
        ]
        deduper = dedupe.Dedupe(fields)
        deduper.sample(data_d, 15000)

        if os.path.exists(training_file):
            print('reading labeled examples from ', training_file)
            with open(training_file, 'rb') as f:
                deduper.readTraining(f)

        print('starting active labeling...')
        dedupe.consoleLabel(deduper)
        deduper.train()

        with open(training_file, 'w') as tf:
            deduper.writeTraining(tf)
        with open(settings_file, 'wb') as sf:
            deduper.writeSettings(sf)

    threshold = deduper.threshold(data_d, recall_weight=1)

    print('clustering...')
    clustered_dupes = deduper.match(data_d, threshold)
    print('# duplicate sets', len(clustered_dupes))

    cluster_membership = {}
    cluster_id = 0
    for (cluster_id, cluster) in enumerate(clustered_dupes):
        id_set, scores = cluster
        cluster_d = [data_d[c] for c in id_set]
        canonical_rep = dedupe.canonicalize(cluster_d)
        for record_id, score in zip(id_set, scores):
            cluster_membership[record_id] = {
                "cluster id": cluster_id,
                "canonical representation": canonical_rep,
                "confidence": score
            }

    singleton_id = cluster_id + 1

    with open(output_file, 'w', encoding="utf-8") as f_output, \
            open(input_file, encoding="utf-8") as f_input:
        writer = csv.writer(f_output)
        reader = csv.reader(f_input)

        heading_row = next(reader)
        heading_row.insert(0, 'confidence_score')
        heading_row.insert(0, 'Cluster ID')
        canonical_keys = canonical_rep.keys()
        for key in canonical_keys:
            heading_row.append('canonical_' + key)
        writer.writerow(heading_row)

        for row in reader:
            row_id = row[0]
            if row_id in cluster_membership:
                cluster_id = cluster_membership[row_id]["cluster id"]
                canonical_rep = cluster_membership[row_id]["canonical representation"]
                row.insert(0, cluster_membership[row_id]['confidence'])
                row.insert(0, cluster_id)
                for key in canonical_keys:
                    row.append(canonical_rep[key])
            else:
                row.insert(0, None)
                row.insert(0, singleton_id)
                singleton_id += 1
                for key in canonical_keys:
                    row.append(None)
            writer.writerow(row)

    return clustered_dupes
def dedupe_dataframe(df, field_properties, canonicalize=False,
                     config_name="dedupe_dataframe", recall_weight=1,
                     sample_size=0.3):
    """Deduplicates a dataframe given fields of interest.

    Parameters
    ----------
    df : pd.DataFrame
        The dataframe to deduplicate.
    field_properties : list
        A list specifying what fields to use for deduplicating records.
    canonicalize : bool, default False
        Option that provides the canonical record as additional columns.
    config_name : str, default dedupe_dataframe
        The configuration file name. Note that this will be used as a prefix
        to save the settings and training files.
    recall_weight : int, default 1
        Find the threshold that will maximize a weighted average of our
        precision and recall. When we set the recall weight to 2, we are
        saying we care twice as much about recall as we do precision.
    sample_size : float, default 0.3
        Specify the sample size used for training as a float from 0 to 1.
        By default it is 30% (0.3) of our data.

    Returns
    -------
    pd.DataFrame
        A pandas dataframe that contains the cluster id and confidence
        score. Optionally, it will contain canonicalized columns for all
        attributes of the record.
    """
    # Import Data
    config_name = config_name.replace(" ", "_")

    settings_file = config_name + '_learned_settings'
    training_file = config_name + '_training.json'

    print('importing data ...')
    df = clean_punctuation(df)
    specify_type(df, field_properties)

    df['dictionary'] = df.apply(
        lambda x: dict(zip(df.columns, x.tolist())), axis=1)
    data_d = dict(zip(df.index, df.dictionary))

    # If a settings file already exists, we'll just load that and skip training
    if os.path.exists(settings_file):
        print('reading from', settings_file)
        with open(settings_file, 'rb') as f:
            deduper = dedupe.StaticDedupe(f)
    else:
        # ## Training
        # Define the fields dedupe will pay attention to
        fields = []
        select_fields(fields, field_properties)

        # Create a new deduper object and pass our data model to it.
        deduper = dedupe.Dedupe(fields)

        # To train dedupe, we feed it a sample of records.
        sample_num = math.floor(len(df) * sample_size)
        deduper.sample(data_d, sample_num)

        # If we have training data saved from a previous run of dedupe,
        # look for it and load it in.
        # __Note:__ if you want to train from scratch, delete the training_file
        if os.path.exists(training_file):
            print('reading labeled examples from ', training_file)
            with open(training_file, 'rb') as f:
                deduper.readTraining(f)

        print('starting active labeling...')
        dedupe.consoleLabel(deduper)

        # Using the examples we just labeled, train the deduper and learn
        # blocking predicates
        deduper.train()

        # When finished, save our training to disk
        with open(training_file, 'w') as tf:
            deduper.writeTraining(tf)

        # Save our weights and predicates to disk. If the settings file
        # exists, we will skip all the training and learning next time we run
        # this file.
        with open(settings_file, 'wb') as sf:
            deduper.writeSettings(sf)

    # ## Set threshold
    threshold = deduper.threshold(data_d, recall_weight=recall_weight)

    # ## Clustering
    print('clustering...')
    clustered_dupes = deduper.match(data_d, threshold)
    print('# duplicate sets', len(clustered_dupes))

    # Convert data_d to string so that Price & LatLong won't get traceback
    # during dedupe.canonicalize()
    for i in data_d.values():
        for key in i:
            if i[key] is None:
                pass
            else:
                i[key] = str(i[key])

    # ## Writing Results
    cluster_membership = {}
    cluster_id = 0
    for (cluster_id, cluster) in enumerate(clustered_dupes):
        id_set, scores = cluster
        cluster_d = [data_d[c] for c in id_set]
        canonical_rep = dedupe.canonicalize(cluster_d)
        for record_id, score in zip(id_set, scores):
            cluster_membership[record_id] = {
                "cluster id": cluster_id,
                "canonical representation": canonical_rep,
                "confidence": score
            }

    cluster_index = []
    for i in cluster_membership.items():
        cluster_index.append(i)

    # turn results into dataframe
    dfa = pd.DataFrame(cluster_index)
    dfa.rename(columns={0: 'Id'}, inplace=True)

    dfa['cluster id'] = dfa[1].apply(lambda x: x["cluster id"])
    dfa['confidence'] = dfa[1].apply(lambda x: x["confidence"])

    canonical_list = []
    if canonicalize:
        for i in dfa[1][0]['canonical representation'].keys():
            canonical_list.append(i)
            dfa[i + ' - ' + 'canonical'] = None
            dfa[i + ' - ' + 'canonical'] = dfa[1].apply(
                lambda x: x['canonical representation'][i])
    elif type(canonicalize) == list:
        for i in canonicalize:
            dfa[i + ' - ' + 'canonical'] = None
            dfa[i + ' - ' + 'canonical'] = dfa[1].apply(
                lambda x: x['canonical representation'][i])

    dfa.set_index('Id', inplace=True)

    df = df.join(dfa)
    df.drop(columns=[1, 'dictionary'], inplace=True)

    return df
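A hypothetical call of the helper above; the dataframe, column names, and config_name are illustrative assumptions, and the first run still prompts for interactive console labeling before clustering.

import pandas as pd

# Tiny illustrative frame; column names are assumptions, not from the source.
df = pd.DataFrame({
    'name': ['Acme Inc', 'ACME Incorporated', 'Zenith LLC'],
    'address': ['1 Main St', '1 Main Street', '9 Elm Ave'],
})

# First run trains interactively and writes 'company_match_learned_settings'
# and 'company_match_training.json'; later runs reuse those files.
result = dedupe_dataframe(df, ['name', 'address'],
                          canonicalize=True,
                          config_name='company_match')
print(result[['cluster id', 'confidence']])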