def _train_gazetteer(
    data_1,
    data_2,
    fields=DEDUPE_FIELDS,
    training_file=None,
    manual_labelling=False,
    pretty_print=False,
):
    LOGGER.info("training gazetteer with fields: %r", fields)

    gazetteer = dedupe.Gazetteer(fields)

    if training_file and smart_exists(training_file):
        LOGGER.info("reading existing training from <%s>", training_file)
        with open(training_file) as file_obj:
            gazetteer.prepare_training(data_1=data_1,
                                       data_2=data_2,
                                       training_file=file_obj,
                                       sample_size=50_000)
    else:
        gazetteer.prepare_training(data_1=data_1,
                                   data_2=data_2,
                                   sample_size=50_000)

    if manual_labelling:
        LOGGER.info("start interactive labelling")
        dedupe.convenience.console_label(gazetteer)

    if training_file:
        LOGGER.info("write training data back to <%s>", training_file)
        with open(training_file, "w") as file_obj:
            # bug in dedupe preventing training from being serialized correctly
            # gazetteer.write_training(file_obj)
            if pretty_print:
                _write_training(gazetteer, file_obj, sort_keys=True, indent=4)
            else:
                _write_training(gazetteer, file_obj)

    LOGGER.info("done labelling, begin training")
    gazetteer.train(recall=0.9, index_predicates=True)

    gazetteer.cleanup_training()

    return gazetteer
def train_gazetteer(messy, canonical, model_settings=None, should_index=False):
    """
    Train and return a dedupe.Gazetteer using the specified messy and canonical
    dictionaries. The messy and canonical objects should have the same
    structure:
      - The key is a unique ID
      - The value is another dictionary of field:value pairs. This dictionary
        must contain at least 'country', 'name', and 'address' keys.

    Reads a training.json file containing positive and negative matches.
    """
    if model_settings:
        gazetteer = dedupe.StaticGazetteer(model_settings)
    else:
        fields = [
            {
                'field': 'country',
                'type': 'Exact'
            },
            {
                'field': 'name',
                'type': 'String'
            },
            {
                'field': 'address',
                'type': 'String'
            },
        ]

        gazetteer = dedupe.Gazetteer(fields)
        gazetteer.sample(messy, canonical, 15000)
        training_file = os.path.join(settings.BASE_DIR, 'api', 'data',
                                     'training.json')
        with open(training_file) as tf:
            gazetteer.readTraining(tf)
        gazetteer.train()
        gazetteer.cleanupTraining()

    if should_index:
        index_start = datetime.now()
        logger.info('Indexing started')
        gazetteer.index(canonical)
        index_duration = datetime.now() - index_start
        logger.info('Indexing finished ({})'.format(index_duration))
        logger.info('Cleanup training')

    return gazetteer
Example #3
def _train_gazetteer(
    data_1,
    data_2,
    fields=DEDUPE_FIELDS,
    training_file=None,
    manual_labelling=False,
    pretty_print=False,
):
    LOGGER.info("training gazetteer with fields: %r", fields)

    gazetteer = dedupe.Gazetteer(fields)
    gazetteer.sample(data_1, data_2, 50_000)

    if training_file and smart_exists(training_file):
        LOGGER.info("reading existing training from <%s>", training_file)
        with smart_open(training_file, "r") as file_obj:
            gazetteer.readTraining(file_obj)

    if manual_labelling:
        LOGGER.info("start interactive labelling")
        dedupe.convenience.consoleLabel(gazetteer)

    if training_file:
        LOGGER.info("write training data back to <%s>", training_file)
        with smart_open(training_file, "w") as file_obj:
            gazetteer.writeTraining(file_obj)
        if pretty_print:
            with smart_open(training_file, "r") as file_obj:
                training = parse_json(file_obj)
            with smart_open(training_file, "w") as file_obj:
                serialize_json(obj=training,
                               file=file_obj,
                               sort_keys=True,
                               indent=4)

    LOGGER.info("done labelling, begin training")
    gazetteer.train(recall=0.9, index_predicates=True)

    gazetteer.cleanupTraining()

    return gazetteer
Example #4
        fields = [{
            'field': 'title',
            'type': 'Text',
            'corpus': descriptions()
        }, {
            'field': 'description',
            'type': 'Text',
            'has missing': True,
            'corpus': descriptions()
        }, {
            'field': 'price',
            'type': 'Price',
            'has missing': True
        }]

        # Create a new gazetteer object and pass our data model to it.
        gazetteer = dedupe.Gazetteer(fields)

        # If we have training data saved from a previous run of gazetteer,
        # look for it and load it in.
        # __Note:__ if you want to train from scratch, delete the training_file
        if os.path.exists(training_file):
            print('reading labeled examples from ', training_file)
            with open(training_file) as tf:
                gazetteer.prepare_training(messy, canonical, training_file=tf)
        else:
            gazetteer.prepare_training(messy, canonical)

        # ## Active learning
        # Dedupe will find the next pair of records
        # it is least certain about and ask you to label them as matches
        # or not.
Example #5
if os.path.exists(settings_file):
    print('reading from', settings_file)
    with open(settings_file, 'rb') as sf:
        gazetteer = dedupe.StaticGazetteer(sf)
else:
    fields = [{'field' : 'name', 'type': 'String'},
              {'field' : 'address', 'type': 'String', 'has missing' : True},
              {'field' : 'city', 'type': 'String', 'has missing' : True},
              {'field' : 'state', 'type': 'Exact', 'has missing': True},
              {'field' : 'zip', 'type': 'ShortString', 'has missing' : True},
              {'field' : 'phone', 'type': 'ShortString', 'has missing' : True},
              {'field' : 'country', 'type': 'Exact', 'has missing' : True}
              ]
 
    # Create a new gazetteer object and pass our data model to it.
    gazetteer = dedupe.Gazetteer(fields, num_cores=2)
 
    gazetteer.markPairs(labeled_examples)
    print("labelled pairs are loaded from database")
      
    #print("Prepair for training using sample data")
    gazetteer.prepare_training(messy_data,canonical_data, blocked_proportion=0.6)
    # gazetteer.sample(messy_erp, canonical_sfdc, 15000)   
    # deduper.prepare_training(temp_d, training_file=tf)
    #del messy_erp
          
    # ## Active learning
    dedupe.consoleLabel(gazetteer)    
    gazetteer.train(index_predicates=True)

    # When finished, save our training away to disk
Example #6
def gazetteer_dataframes(messy_df,
                         canonical_df,
                         field_properties,
                         recall_weight,
                         n_matches,
                         config_name="gazetteer_dataframes"):

    config_name = config_name.replace(" ", "_")

    settings_file = config_name + '_learned_settings'
    training_file = config_name + '_training.json'

    print('importing data ...')

    messy_df = clean_punctuation(messy_df)
    specify_type(messy_df, field_properties)

    messy_df['index_field'] = messy_df.index
    messy_df['index_field'] = messy_df['index_field'].apply(
        lambda x: "messy_df" + str(x))
    messy_df.set_index(['index_field'], inplace=True)

    data_1 = messy_df.to_dict(orient='index')

    canonical_df = clean_punctuation(canonical_df)
    specify_type(canonical_df, field_properties)

    canonical_df['index_field'] = canonical_df.index
    canonical_df['index_field'] = canonical_df['index_field'].apply(
        lambda x: "canonical_df" + str(x))
    canonical_df.set_index(['index_field'], inplace=True)

    data_2 = canonical_df.to_dict(orient='index')

    # ---------------------------------------------------------------------------------

    # ## Training

    if os.path.exists(settings_file):
        print('reading from', settings_file)
        with open(settings_file, 'rb') as sf:
            gazetteer = dedupe.StaticGazetteer(sf)

    else:
        # Define the fields the linker will pay attention to, based on the
        # supplied field_properties.

        fields = []
        select_fields(fields, field_properties)

        # Create a new gazetteer object and pass our data model to it.
        gazetteer = dedupe.Gazetteer(fields)

        # To train the gazetteer, we feed it a sample of records.
        gazetteer.sample(data_1, data_2, 15000)

        # If we have training data saved from a previous run of linker,
        # look for it and load it in.
        # __Note:__ if you want to train from scratch, delete the training_file
        if os.path.exists(training_file):
            print('reading labeled examples from ', training_file)
            with open(training_file) as tf:
                gazetteer.readTraining(tf)

        # ## Active learning
        # Dedupe will find the next pair of records
        # it is least certain about and ask you to label them as matches
        # or not.
        # use 'y', 'n' and 'u' keys to flag duplicates
        # press 'f' when you are finished
        print('starting active labeling...')

        dedupe.consoleLabel(gazetteer)

        gazetteer.train()

        # When finished, save our training away to disk

        with open(training_file, 'w') as tf:
            gazetteer.writeTraining(tf)

        # Make the canonical set
        gazetteer.index(data_2)

        # Save our weights and predicates to disk. If the settings file exists,
        # we will skip all training and learning next time we run this file.
        with open(settings_file, 'wb') as sf:
            gazetteer.writeSettings(sf, index=True)

        gazetteer.cleanupTraining()

    gazetteer.index(data_2)
    # Calculate the matching threshold
    print('start calculating threshold')
    threshold = gazetteer.threshold(data_1, recall_weight)
    print('Threshold: {}'.format(threshold))

    results = gazetteer.match(data_1, threshold=threshold, n_matches=n_matches)

    results_df = pd.DataFrame(results)

    results_df['messy_df_link'] = results_df[0].apply(lambda x: x[0][0])
    results_df['messy_df_link'] = results_df['messy_df_link'].str.strip(
        'messy_df')
    results_df['messy_df_link'] = results_df['messy_df_link'].astype(int)

    results_df['canonical_df_link'] = results_df[0].apply(lambda x: x[0][1])
    results_df['canonical_df_link'] = results_df[
        'canonical_df_link'].str.strip('canonical_df')
    results_df['canonical_df_link'] = results_df['canonical_df_link'].astype(
        int)

    results_df['confidence'] = results_df[0].apply(lambda x: x[1])
    results_df['cluster id'] = results_df.index

    results_df = results_df.rename(columns={0: 'results'})
    results_df['results'] = results_df['results'].astype(str)

    # For both messy_df & canonical_df, add cluster id & confidence score from results_df
    messy_df.index.rename('messy_df_link', inplace=True)
    messy_df = messy_df.rename(columns={'unique_id': 'messy_unique_id'})
    messy_df = messy_df.merge(results_df, on='messy_df_link', how='left')

    canonical_df.index.rename('canonical_df_link', inplace=True)
    canonical_df = canonical_df.rename(
        columns={'unique_id': 'canonical_unique_id'})
    canonical_df = canonical_df.merge(results_df,
                                      on='canonical_df_link',
                                      how='left')

    # Merge messy_df & canonical_df together
    final_df = messy_df.merge(canonical_df, on='results')

    return final_df
Example #7
def getMatchingReady(session_id):
    addRowHash(session_id)
    cleanupTables(session_id)
    engine = worker_session.bind
    with engine.begin() as conn:
        conn.execute('DROP TABLE IF EXISTS "match_blocks_{0}"'\
            .format(session_id))
        conn.execute(''' 
            CREATE TABLE "match_blocks_{0}" (
                block_key VARCHAR, 
                record_id BIGINT
            )
            '''.format(session_id))
    sess = worker_session.query(DedupeSession).get(session_id)
    field_defs = json.loads(sess.field_defs)

    # Save Gazetteer settings
    d = dedupe.Gazetteer(field_defs)

    # Disabling canopy based predicates for now
    for definition in d.data_model.primary_fields:
        definition.predicates = [predicate
                                 for predicate in definition.predicates
                                 if predicate.type != 'TfidfPredicate']

    d.readTraining(StringIO(sess.training_data))
    d.train()
    g_settings = StringIO()
    d.writeSettings(g_settings)
    g_settings.seek(0)
    sess.gaz_settings_file = g_settings.getvalue()
    worker_session.add(sess)
    worker_session.commit()

    # Write match_block table
    model_fields = list(set([f['field'] for f in field_defs]))
    fields = ', '.join(['p.{0}'.format(f) for f in model_fields])
    sel = ''' 
        SELECT 
          p.record_id, 
          {0}
        FROM "processed_{1}" AS p 
        LEFT JOIN "exact_match_{1}" AS e 
          ON p.record_id = e.match 
        WHERE e.record_id IS NULL;
        '''.format(fields, session_id)
    conn = engine.connect()
    rows = conn.execute(sel)
    data = ((getattr(row, 'record_id'), dict(zip(model_fields, row[1:]))) \
        for row in rows)
    block_gen = d.blocker(data)
    s = StringIO()
    writer = UnicodeCSVWriter(s)
    writer.writerows(block_gen)
    conn.close()
    s.seek(0)
    conn = engine.raw_connection()
    curs = conn.cursor()
    try:
        curs.copy_expert('COPY "match_blocks_{0}" FROM STDIN CSV'\
            .format(session_id), s)
        conn.commit()
    except Exception as e: # pragma: no cover
        conn.rollback()
        raise e
Example #8
def _train(settings_file, training_file, clean_data, messy_data,
           field_properties, sample_size, update_model, n_cores):
    """Internal method that trains the deduper model from scratch or update
        an existing dedupe model.
        Parameters
        ----------
        settings_file : str
            A path to a settings file that will be loaded if it exists.
        training_file : str
            A path to a training file that will be loaded to keep training
            from.
        clean_data : dict
            The dictionary form of the gazette that gazetteer_dedupe requires.
        messy_data : dict
            The dictionary form of the messy data that needs to be deduplicated 
            (and canonicalized)
        field_properties : dict
            The mapping of fields to their respective data types. Please
            see the dedupe documentation for further details.
        sample_size : float, default 0.3
            Specify the sample size used for training as a float from 0 to 1.
            By default it is 30% (0.3) of our data.
        update_model : bool, default False
            If True, it allows user to update existing model by uploading
            training file.
        n_cores : int, default None
            Specify the number of cores to use during clustering.
            By default n_cores is equal to None (i.e. use a multiprocessing pool sized to the CPU count).
        Returns
        -------
        dedupe.Gazetteer
            A gazetteer model instance.
    """
    # Define the fields dedupe will pay attention to
    fields = []
    select_fields(fields, [field_properties])

    if not update_model:

        # If a settings file already exists, we'll just load that and skip training
        if os.path.exists(settings_file):
            print('Reading from', settings_file)
            with open(settings_file, 'rb') as f:
                deduper = dedupe.StaticGazetteer(f, num_cores=n_cores)

        #Create a new deduper object and pass our data model to it.
        else:
            # Initialise dedupe
            deduper = dedupe.Gazetteer(fields, num_cores=n_cores)

            # Launch active learning
            deduper = _active_learning(clean_data, messy_data, sample_size,
                                       deduper, training_file, settings_file)

    else:
        # ## Training
        # Initialise dedupe
        deduper = dedupe.Gazetteer(fields, num_cores=n_cores)

        # Import existing model
        print('Reading labeled examples from ', training_file)
        with open(training_file, 'rb') as f:
            deduper.prepare_training(clean_data, messy_data, training_file=f)

        # Launch active learning
        deduper = _active_learning(clean_data, messy_data, sample_size,
                                   deduper, training_file, settings_file)

    return deduper
Example #9
    def setUp(self):
        self.deduper = dedupe.Gazetteer([{'field': 'name', 'type': 'String'}])
        self.sSND = dedupe.training.semiSupervisedNonDuplicates