def train_gazetteer(messy, canonical, model_settings=None, should_index=False):
    """
    Train and return a dedupe.Gazetteer using the specified messy and
    canonical dictionaries. The messy and canonical objects should have the
    same structure:
    - The key is a unique ID.
    - The value is another dictionary of field:value pairs. This dictionary
      must contain at least 'country', 'name', and 'address' keys.

    Reads a training.json file containing positive and negative matches.
    """
    if model_settings:
        gazetteer = dedupe.StaticGazetteer(model_settings)
    else:
        fields = [
            {'field': 'country', 'type': 'Exact'},
            {'field': 'name', 'type': 'String'},
            {'field': 'address', 'type': 'String'},
        ]

        gazetteer = dedupe.Gazetteer(fields)
        gazetteer.sample(messy, canonical, 15000)

        training_file = os.path.join(settings.BASE_DIR, 'api', 'data',
                                     'training.json')
        with open(training_file) as tf:
            gazetteer.readTraining(tf)

        gazetteer.train()
        gazetteer.cleanupTraining()

    if should_index:
        index_start = datetime.now()
        logger.info('Indexing started')
        gazetteer.index(canonical)
        index_duration = datetime.now() - index_start
        logger.info('Indexing finished ({})'.format(index_duration))
        logger.info('Cleanup training')

    return gazetteer
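
# A minimal usage sketch for train_gazetteer (not part of the original module).
# The record IDs and field values below are made up, and it assumes `dedupe`,
# `os`, `settings`, `logger`, and `datetime` are imported/configured as in the
# surrounding project, with a training.json present under BASE_DIR/api/data.
messy_example = {
    'm1': {'country': 'US', 'name': 'Acme Corp', 'address': '350 Second St'},
}
canonical_example = {
    'c1': {'country': 'US', 'name': 'ACME Corporation', 'address': '350 2nd Street'},
}

gazetteer = train_gazetteer(messy_example, canonical_example, should_index=True)
matches = gazetteer.match(messy_example, threshold=0.5, n_matches=1)
print(matches)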
def gazetteer_dataframes(messy_df, canonical_df, field_properties, recall_weight,
                         n_matches, config_name="gazetteer_dataframes"):

    config_name = config_name.replace(" ", "_")

    settings_file = config_name + '_learned_settings'
    training_file = config_name + '_training.json'

    print('importing data ...')

    messy_df = clean_punctuation(messy_df)
    specify_type(messy_df, field_properties)

    messy_df['index_field'] = messy_df.index
    messy_df['index_field'] = messy_df['index_field'].apply(
        lambda x: "messy_df" + str(x))
    messy_df.set_index(['index_field'], inplace=True)

    data_1 = messy_df.to_dict(orient='index')

    canonical_df = clean_punctuation(canonical_df)
    specify_type(canonical_df, field_properties)

    canonical_df['index_field'] = canonical_df.index
    canonical_df['index_field'] = canonical_df['index_field'].apply(
        lambda x: "canonical_df" + str(x))
    canonical_df.set_index(['index_field'], inplace=True)

    data_2 = canonical_df.to_dict(orient='index')

    # ---------------------------------------------------------------------
    # ## Training

    if os.path.exists(settings_file):
        print('reading from', settings_file)
        with open(settings_file, 'rb') as sf:
            gazetteer = dedupe.StaticGazetteer(sf)

    else:
        # Define the fields the gazetteer will pay attention to.
        fields = []
        select_fields(fields, field_properties)

        # Create a new gazetteer object and pass our data model to it.
        gazetteer = dedupe.Gazetteer(fields)

        # To train the gazetteer, we feed it a sample of records.
        gazetteer.sample(data_1, data_2, 15000)

        # If we have training data saved from a previous run of the linker,
        # look for it and load it in.
        # __Note:__ if you want to train from scratch, delete the training_file.
        if os.path.exists(training_file):
            print('reading labeled examples from ', training_file)
            with open(training_file) as tf:
                gazetteer.readTraining(tf)

        # ## Active learning
        # Dedupe will find the next pair of records
        # it is least certain about and ask you to label them as matches
        # or not.
        # Use the 'y', 'n' and 'u' keys to flag duplicates;
        # press 'f' when you are finished.
        print('starting active labeling...')

        dedupe.consoleLabel(gazetteer)
        gazetteer.train()

        # When finished, save our training away to disk.
        with open(training_file, 'w') as tf:
            gazetteer.writeTraining(tf)

        # Make the canonical set.
        gazetteer.index(data_2)

        # Save our weights and predicates to disk. If the settings file exists,
        # we will skip all training and learning next time we run this file.
        with open(settings_file, 'wb') as sf:
            gazetteer.writeSettings(sf, index=True)

        gazetteer.cleanupTraining()

    gazetteer.index(data_2)

    # Calculate the threshold.
    print('start calculating threshold')
    threshold = gazetteer.threshold(data_1, recall_weight)
    print('Threshold: {}'.format(threshold))

    results = gazetteer.match(data_1, threshold=threshold, n_matches=n_matches)

    results_df = pd.DataFrame(results)

    results_df['messy_df_link'] = results_df[0].apply(lambda x: x[0][0])
    results_df['messy_df_link'] = results_df['messy_df_link'].str.strip('messy_df')
    results_df['messy_df_link'] = results_df['messy_df_link'].astype(int)

    results_df['canonical_df_link'] = results_df[0].apply(lambda x: x[0][1])
    results_df['canonical_df_link'] = results_df['canonical_df_link'].str.strip('canonical_df')
    results_df['canonical_df_link'] = results_df['canonical_df_link'].astype(int)

    results_df['confidence'] = results_df[0].apply(lambda x: x[1])
    results_df['cluster id'] = results_df.index

    results_df = results_df.rename(columns={0: 'results'})
    results_df['results'] = results_df['results'].astype(str)

    # results_df_copy was not defined in the original snippet; a plain copy of
    # results_df is assumed here.
    results_df_copy = results_df.copy()

    # For both messy_df & canonical_df, add cluster id & confidence score from
    # results_df.
    messy_df.index.rename('messy_df_link', inplace=True)
    messy_df = messy_df.rename(columns={'unique_id': 'messy_unique_id'})
    messy_df = messy_df.merge(results_df_copy, on='messy_df_link', how='left')

    canonical_df.index.rename('canonical_df_link', inplace=True)
    canonical_df = canonical_df.rename(columns={'unique_id': 'canonical_unique_id'})
    canonical_df = canonical_df.merge(results_df_copy, on='canonical_df_link',
                                      how='left')

    # Merge messy_df & canonical_df together.
    final_df = messy_df.merge(canonical_df, on='results')

    return final_df
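
# A hypothetical call to gazetteer_dataframes (not part of the original code).
# It assumes the helpers used above (clean_punctuation, specify_type,
# select_fields) are importable from the surrounding module, and that
# field_properties is a list of column names; the DataFrames are made up.
import pandas as pd

messy = pd.DataFrame({'name': ['acme corp', 'globex'],
                      'address': ['350 second st', '1 main st']})
canonical = pd.DataFrame({'name': ['ACME Corporation', 'Globex LLC'],
                          'address': ['350 2nd Street', '1 Main Street']})

linked = gazetteer_dataframes(messy, canonical,
                              field_properties=['name', 'address'],
                              recall_weight=1,
                              n_matches=1)
print(linked.head())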
print('importing data ...')
messy = readData(messy_file)
print('N data 1 records: {}'.format(len(messy)))

canonical = readData(canon_file)
print('N data 2 records: {}'.format(len(canonical)))


def descriptions():
    for dataset in (messy, canonical):
        for record in dataset.values():
            yield record['description']


if os.path.exists(settings_file):
    print('reading from', settings_file)
    with open(settings_file, 'rb') as sf:
        gazetteer = dedupe.StaticGazetteer(sf)

else:
    # Define the fields the gazetteer will pay attention to.
    #
    # Notice how we are telling the gazetteer to use a custom
    # field comparator for the 'price' field.
    fields = [{'field': 'title',
               'type': 'String'},
              {'field': 'title',
               'type': 'Text',
               'corpus': descriptions()},
              {'field': 'description',
def match():
    try:
        post = json.loads(request.data)
    except ValueError:
        r = {
            'status': 'error',
            'message': '''
                The content of your request should be a string
                encoded JSON object.
            ''',
            'object': request.data,
        }
        resp = make_response(json.dumps(r), 400)
        resp.headers['Content-Type'] = 'application/json'
        return resp

    r, status_code, sess = validate_post(post)

    if r['status'] != 'error':
        api_key = post['api_key']
        session_id = post['session_id']
        n_matches = post.get('num_matches', 5)
        obj = post['object']

        field_defs = json.loads(sess.field_defs)
        model_fields = sorted(list(set([f['field'] for f in field_defs])))
        fields = ', '.join(['r.{0}'.format(f) for f in model_fields])

        engine = db_session.bind
        entity_table = Table('entity_{0}'.format(session_id), Base.metadata,
                             autoload=True, autoload_with=engine,
                             keep_existing=True)

        try:
            hash_me = []
            for field in model_fields:
                if obj[field]:
                    hash_me.append(unicode(obj[field]))
                else:
                    hash_me.append('')
            hash_me = ';'.join(hash_me)
        except KeyError, e:
            r['status'] = 'error'
            r['message'] = 'Sent fields "{0}" do not match model fields "{1}"'\
                .format(','.join(obj.keys()), ','.join(model_fields))
            resp = make_response(json.dumps(r), 400)
            resp.headers['Content-Type'] = 'application/json'
            return resp

        if set(obj.keys()).isdisjoint(set(model_fields)):
            r['status'] = 'error'
            r['message'] = 'Sent fields "{0}" do not match model fields "{1}"'\
                .format(','.join(obj.keys()), ','.join(model_fields))
            resp = make_response(json.dumps(r), 400)
            resp.headers['Content-Type'] = 'application/json'
            return resp

        md5_hash = md5(unidecode(hash_me)).hexdigest()
        exact_match = db_session.query(entity_table)\
            .filter(entity_table.c.source_hash == md5_hash).first()

        match_list = []
        if exact_match:
            sel = text('''
                SELECT {0}
                FROM "raw_{1}" AS r
                JOIN "entity_{1}" AS e
                  ON r.record_id = e.record_id
                WHERE e.entity_id = :entity_id
                LIMIT :limit
            '''.format(fields, session_id))
            rows = []
            with engine.begin() as conn:
                rows = list(conn.execute(sel,
                                         entity_id=exact_match.entity_id,
                                         limit=n_matches))
            for row in rows:
                d = {f: getattr(row, f) for f in model_fields}
                d['entity_id'] = exact_match.entity_id
                d['match_confidence'] = '1.0'
                match_list.append(d)
        else:
            deduper = dedupe.StaticGazetteer(StringIO(sess.gaz_settings_file))
            for k, v in obj.items():
                obj[k] = preProcess(unicode(v))
            block_keys = tuple([b[0] for b in list(deduper.blocker([('blob', obj)]))])

            # Sometimes the blocker does not find blocks.
            # In this case we can't match.
            if block_keys:
                sel = text('''
                    SELECT r.record_id, {1}
                    FROM "processed_{0}" AS r
                    JOIN (
                        SELECT record_id
                        FROM "match_blocks_{0}"
                        WHERE block_key IN :block_keys
                    ) AS s
                      ON r.record_id = s.record_id
                '''.format(session_id, fields))
                with engine.begin() as conn:
                    data_d = {int(i[0]): dict(zip(model_fields, i[1:]))
                              for i in list(conn.execute(sel, block_keys=block_keys))}
                if data_d:
                    deduper.index(data_d)
                    linked = deduper.match({'blob': obj},
                                           threshold=0,
                                           n_matches=n_matches)
                    if linked:
                        ids = []
                        confs = {}
                        for l in linked[0]:
                            id_set, confidence = l
                            ids.extend([i for i in id_set if i != 'blob'])
                            confs[id_set[1]] = confidence
                        ids = tuple(set(ids))
                        sel = text('''
                            SELECT {0}, r.record_id, e.entity_id
                            FROM "raw_{1}" AS r
                            JOIN "entity_{1}" AS e
                              ON r.record_id = e.record_id
                            WHERE r.record_id IN :ids
                        '''.format(fields, session_id))
                        matches = []
                        with engine.begin() as conn:
                            matches = list(conn.execute(sel, ids=ids))
                        for match in matches:
                            m = {f: getattr(match, f) for f in model_fields}
                            m['record_id'] = getattr(match, 'record_id')
                            m['entity_id'] = getattr(match, 'entity_id')
                            # m['match_confidence'] = float(confs[str(m['entity_id'])])
                            match_list.append(m)
            else:
                if sentry:
                    sentry.captureMessage('Unable to block record', extra=post)

        r['matches'] = match_list
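
# A hypothetical client call against the match endpoint above (not part of the
# original code). The route path '/match/' and the host are assumptions; the
# payload fields mirror what the handler reads: api_key, session_id, object,
# and the optional num_matches.
import json
import requests

payload = {
    'api_key': '6bf73c41-404e-47ae-bc2d-051e935c298e',
    'session_id': 'example-session-id',
    'num_matches': 5,
    'object': {'name': 'Acme Corp', 'address': '350 Second St'},
}
response = requests.post('http://localhost:5000/match/',
                         data=json.dumps(payload),
                         headers={'Content-Type': 'application/json'})
print(response.json())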
def add_entity():
    ''' 
    Add an entry to the entity map.
    POST data should be a string encoded JSON object which looks like:

    {
        "object": {
            "city":"Macon",
            "cont_name":"Kinght & Fisher, LLP",
            "zip":"31201",
            "firstname":null,
            "employer":null,
            "address":"350 Second St",
            "record_id":3,
            "type":"Monetary",
            "occupation":null
        },
        "api_key":"6bf73c41-404e-47ae-bc2d-051e935c298e",
        "match_id": 100,
    }

    The object key should contain a mapping of fields that are in the data
    model. If the record_id field is present, an attempt will be made to look
    up the record in the raw / processed table before making the entry. If
    match_id is present, the record will be added as a member of the entity
    referenced by the id.
    '''
    r = {
        'status': 'ok',
        'message': ""
    }
    status_code = 200

    session_id = flask_session['session_id']

    try:
        post = json.loads(request.data)
    except ValueError:
        r = {
            'status': 'error',
            'message': '''
                The content of your request should be a string
                encoded JSON object.
            ''',
            'object': request.data,
        }
        resp = make_response(json.dumps(r), 400)
        resp.headers['Content-Type'] = 'application/json'
        return resp

    obj = post['object']
    record_id = obj.get('record_id')
    if record_id:
        del obj['record_id']

    match_id = json.loads(request.data).get('match_id')

    sess = db_session.query(DedupeSession).get(session_id)
    field_defs = json.loads(sess.field_defs)
    fds = {}
    for fd in field_defs:
        try:
            fds[fd['field']].append(fd['type'])
        except KeyError:
            fds[fd['field']] = [fd['type']]

    if not set(fds.keys()) == set(obj.keys()):
        r['status'] = 'error'
        r['message'] = "The fields in the object do not match the fields in the model"
        status_code = 400
    else:
        engine = db_session.bind
        proc_table = Table('processed_{0}'.format(session_id), Base.metadata,
                           autoload=True, autoload_with=engine,
                           keep_existing=True)
        row = db_session.query(proc_table)\
            .filter(proc_table.c.record_id == record_id)\
            .first()
        if not row:  # pragma: no cover
            raw_table = Table('raw_{0}'.format(session_id), Base.metadata,
                              autoload=True, autoload_with=engine,
                              keep_existing=True)
            # proc_table_name was undefined in the original snippet; the
            # processed table is assumed to be named from session_id, as
            # elsewhere in this view.
            proc_ins = 'INSERT INTO "processed_{0}" (SELECT record_id, '\
                .format(session_id)
            for idx, field in enumerate(fds.keys()):
                try:
                    field_types = fds[field]
                except KeyError:
                    field_types = ['String']
                # TODO: Need to figure out how to parse a LatLong field type
                if 'Price' in field_types:
                    col_def = 'COALESCE(CAST("{0}" AS DOUBLE PRECISION), 0.0) AS {0}'.format(field)
                else:
                    col_def = 'CAST(TRIM(COALESCE(LOWER("{0}"), \'\')) AS VARCHAR) AS {0}'.format(field)
                if idx < len(fds.keys()) - 1:
                    proc_ins += '{0}, '.format(col_def)
                else:
                    proc_ins += '{0} '.format(col_def)
            else:
                proc_ins += 'FROM "raw_{0}" WHERE record_id = :record_id)'\
                    .format(session_id)

            with engine.begin() as conn:
                record_id = conn.execute(raw_table.insert()
                                         .returning(raw_table.c.record_id), **obj)
                conn.execute(text(proc_ins), record_id=record_id)

        hash_me = ';'.join([preProcess(unicode(obj[i])) for i in fds.keys()])
        md5_hash = md5(unidecode(hash_me)).hexdigest()

        entity = {
            'entity_id': unicode(uuid4()),
            'record_id': record_id,
            'source_hash': md5_hash,
            'clustered': True,
            'checked_out': False,
        }
        entity_table = Table('entity_{0}'.format(session_id), Base.metadata,
                             autoload=True, autoload_with=engine,
                             keep_existing=True)

        if match_id:
            entity['target_record_id'] = match_id
            entity_id = db_session.query(entity_table.c.entity_id)\
                .filter(entity_table.c.record_id == match_id)\
                .first()
            entity['entity_id'] = entity_id.entity_id

        with engine.begin() as conn:
            conn.execute(entity_table.insert(), **entity)
        deduper = dedupe.StaticGazetteer(StringIO(sess.gaz_settings_file))
        for k, v in obj.items():
            obj[k] = preProcess(unicode(v))
        block_keys = [{'record_id': b[1], 'block_key': b[0]}
                      for b in list(deduper.blocker([(record_id, obj)]))]
        with engine.begin() as conn:
            conn.execute(text('''
                INSERT INTO "match_blocks_{0}" (
                    block_key,
                    record_id
                )
                VALUES (:block_key, :record_id)
            '''.format(sess.id)), *block_keys)

    if sess.review_count:
        sess.review_count = sess.review_count - 1
        db_session.add(sess)
        db_session.commit()

    resp = make_response(json.dumps(r), status_code)
    resp.headers['Content-Type'] = 'application/json'
    return resp
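
# A hypothetical client call against the add_entity endpoint above (not part of
# the original code). The route path '/add-entity/' and the host are
# assumptions; the payload is the example shown in the view's docstring.
import json
import requests

payload = {
    'object': {
        'city': 'Macon',
        'cont_name': 'Kinght & Fisher, LLP',
        'zip': '31201',
        'firstname': None,
        'employer': None,
        'address': '350 Second St',
        'record_id': 3,
        'type': 'Monetary',
        'occupation': None,
    },
    'api_key': '6bf73c41-404e-47ae-bc2d-051e935c298e',
    'match_id': 100,
}
response = requests.post('http://localhost:5000/add-entity/',
                         data=json.dumps(payload),
                         headers={'Content-Type': 'application/json'})
print(response.status_code, response.json())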
def _train(settings_file, training_file, clean_data, messy_data,
           field_properties, sample_size, update_model, n_cores):
    """Internal method that trains the deduper model from scratch or updates
    an existing dedupe model.

    Parameters
    ----------
    settings_file : str
        A path to a settings file that will be loaded if it exists.
    training_file : str
        A path to a training file that will be loaded to resume training from.
    clean_data : dict
        The dictionary form of the gazette that gazetteer_dedupe requires.
    messy_data : dict
        The dictionary form of the messy data that needs to be deduplicated
        (and canonicalized).
    field_properties : dict
        The mapping of fields to their respective data types. Please see the
        dedupe documentation for further details.
    sample_size : float, default 0.3
        Specify the sample size used for training as a float from 0 to 1.
        By default it is 30% (0.3) of our data.
    update_model : bool, default False
        If True, it allows the user to update an existing model by uploading
        a training file.
    n_cores : int, default None
        Specify the number of cores to use during clustering.
        By default n_cores is equal to None (i.e. use multiprocessing equal
        to CPU count).

    Returns
    -------
    dedupe.Gazetteer
        A gazetteer model instance.
    """
    # Define the fields dedupe will pay attention to
    fields = []
    select_fields(fields, [field_properties])

    if not update_model:

        # If a settings file already exists, we'll just load that and skip training
        if os.path.exists(settings_file):
            print('Reading from', settings_file)
            with open(settings_file, 'rb') as f:
                deduper = dedupe.StaticGazetteer(f, num_cores=n_cores)

        # Create a new deduper object and pass our data model to it.
        else:
            # Initialise dedupe
            deduper = dedupe.Gazetteer(fields, num_cores=n_cores)

            # Launch active learning
            deduper = _active_learning(clean_data, messy_data, sample_size,
                                       deduper, training_file, settings_file)

    else:
        # ## Training
        # Initialise dedupe
        deduper = dedupe.Gazetteer(fields, num_cores=n_cores)

        # Import existing model
        print('Reading labeled examples from ', training_file)
        with open(training_file, 'rb') as f:
            deduper.prepare_training(clean_data, messy_data, training_file=f)

        # Launch active learning
        deduper = _active_learning(clean_data, messy_data, sample_size,
                                   deduper, training_file, settings_file)

    return deduper
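
# A hypothetical call to _train (not part of the original code). The record
# dictionaries and file names below are made up, and the field_properties
# value assumes the same column-name convention used by select_fields
# elsewhere in this module.
clean = {'c1': {'name': 'acme corporation', 'address': '350 2nd street'}}
messy = {'m1': {'name': 'acme corp', 'address': '350 second st'}}

deduper = _train(settings_file='gazetteer_learned_settings',
                 training_file='gazetteer_training.json',
                 clean_data=clean,
                 messy_data=messy,
                 field_properties=['name', 'address'],
                 sample_size=0.3,
                 update_model=False,
                 n_cores=None)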
def link_games(
        gazetteer,
        paths,
        id_prefixes=None,
        id_fields=None,
        training_file=None,
        manual_labelling=False,
        threshold=None,
        recall_weight=1,
        output=None,
        pretty_print=True,
):
    """ find links for games """

    paths = tuple(arg_to_iter(paths))
    if len(paths) < 2:
        raise ValueError(
            f"need at least 2 files to link games, but received {paths}")

    id_prefixes = tuple(arg_to_iter(id_prefixes))
    id_prefixes = id_prefixes + tuple(map(_filename, paths[len(id_prefixes):]))
    id_fields = tuple(arg_to_iter(id_fields))
    id_fields = id_fields + tuple(
        f"{prefix}_id" for prefix in id_prefixes[len(id_fields):])

    games_canonical = _load_games(paths[0])
    data_canonical = _make_data(games_canonical, id_fields[0], id_prefixes[0])
    del games_canonical

    LOGGER.info("loaded %d games in the canonical dataset <%s>",
                len(data_canonical), paths[0])

    data_link = {}
    for path, id_field, id_prefix in zip(paths[1:], id_fields[1:], id_prefixes[1:]):
        games = _load_games(path)
        data_link.update(_make_data(games, id_field, id_prefix))
        del games

    LOGGER.info("loaded %d games to link in the datasets %s",
                len(data_link), paths[1:])

    if training_file:
        gazetteer_trained = _train_gazetteer(
            data_canonical,
            data_link,
            training_file=training_file,
            manual_labelling=manual_labelling,
            pretty_print=pretty_print,
        )

        if isinstance(gazetteer, str):
            LOGGER.info("saving gazetteer model to <%s>", gazetteer)
            with smart_open(gazetteer, "wb") as file_obj:
                gazetteer_trained.writeSettings(file_obj)

        gazetteer = gazetteer_trained
        del gazetteer_trained

    elif isinstance(gazetteer, str):
        LOGGER.info("reading gazetteer model from <%s>", gazetteer)
        with smart_open(gazetteer, "rb") as file_obj:
            gazetteer = dedupe.StaticGazetteer(file_obj)

    gazetteer.index(data_canonical)
    LOGGER.info("using gazetteer model %r", gazetteer)

    threshold = threshold or gazetteer.threshold(data_link,
                                                 recall_weight=recall_weight)
    LOGGER.info("using threshold %.3f", threshold)

    clusters = gazetteer.match(messy_data=data_link,
                               threshold=threshold,
                               n_matches=None,
                               generator=True)

    links = defaultdict(set)
    del gazetteer

    for cluster in clusters:
        for (id_link, id_canonical), _ in cluster:
            links[id_canonical].add(id_link)
    del clusters

    LOGGER.info("found links for %d items", len(links))

    if output == "-":
        for id_canonical, linked in links.items():
            LOGGER.info("%s <-> %s", id_canonical, linked)
    elif output:
        LOGGER.info("saving clusters as JSON to <%s>", output)
        links_sorted = {
            key: sorted(value) for key, value in links.items() if key and value
        }
        json_formats = {"sort_keys": True, "indent": 4} if pretty_print else {}
        with smart_open(output, "w") as file_obj:
            serialize_json(obj=links_sorted, file=file_obj, **json_formats)
        del links_sorted

    return links
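
# A hypothetical call to link_games (not part of the original code). The
# dataset paths, saved-model path, and output path are made up; the first path
# is treated as the canonical dataset and the remaining paths as datasets to
# link, as described by the function above.
links = link_games(
    gazetteer="gazetteer_settings.pickle",
    paths=["canonical_games.jl", "other_games.jl"],
    training_file="gazetteer_training.json",
    manual_labelling=False,
    recall_weight=1,
    output="links.json",
)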
output_file = 'address_matching_output.csv'
settings_file = 'address_matching_learned_settings'
training_file = 'address_matching_training.json'

canonical_file = 'data/chicago_addresses.csv'
messy_file = 'data/messy_addresses.csv'

print 'importing data ...'
messy_addresses = readData(messy_file)

canonical_addresses = readData(canonical_file)

# ## Training

if os.path.exists(settings_file):
    print 'reading from', settings_file
    with open(settings_file) as sf:
        linker = dedupe.StaticGazetteer(sf, num_cores=2)

else:
    # Define the fields dedupe will pay attention to
    fields = [{'field': 'Address', 'type': 'Address'}]

    # Create a new linker object and pass our data model to it.
    linker = dedupe.Gazetteer(fields, num_cores=2)

    # To train dedupe, we feed it a random sample of records.
    linker.sample(messy_addresses, canonical_addresses, 30000)

    if os.path.exists(training_file):
        print 'reading labeled examples from ', training_file
        with open(training_file) as tf:
            linker.readTraining(tf)