Example #1
def train_linker(df1, df2):
    if READ_FROM_SETTINGS_FILE:
        with open(SETTINGS_FILE, 'rb') as sf:
            linker = dedupe.StaticRecordLink(sf)
    else:
        linker = dedupe.RecordLink(FIELDS)
        # It's terrible that you have to do this next line!!!!
        linker.classifier = rlr.RegularizedLogisticRegression()

        linker.sample(df1, df2, BLOCKING_TRAINING_SAMPLE_SIZE)

        if READ_FROM_TRAINING_FILE:
            print('reading labeled examples from ', TRAINING_FILE)
            with open(TRAINING_FILE, 'rb') as tf:
                linker.readTraining(tf)
        else:
            dedupe.consoleLabel(linker)

        linker.train()

    if WRITE_SETTINGS_FILE:
        with open(SETTINGS_FILE, 'wb') as sf:
            linker.writeSettings(sf)
    if WRITE_TRAINING_FILE:
        with open(TRAINING_FILE, 'w') as tf:
            linker.writeTraining(tf)

    return linker
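
Example #1 leans on several module-level names (the flag constants, FIELDS, and the file paths) that the snippet does not show. A minimal driver sketch with placeholder values for those names is given below; the real project defines its own values, and train_linker is the function above.

import dedupe
import rlr  # provides the RegularizedLogisticRegression used in the snippet above

# Placeholder values for the constants the snippet assumes are defined elsewhere.
READ_FROM_SETTINGS_FILE = False
WRITE_SETTINGS_FILE = True
READ_FROM_TRAINING_FILE = False
WRITE_TRAINING_FILE = True
SETTINGS_FILE = 'linker_settings'
TRAINING_FILE = 'linker_training.json'
BLOCKING_TRAINING_SAMPLE_SIZE = 1500
FIELDS = [{'field': 'name', 'type': 'String'},
          {'field': 'address', 'type': 'String', 'has missing': True}]

# dedupe expects each dataset as a dict of {record_id: {field: value}}.
df1 = {0: {'name': 'Acme Corp', 'address': '1 Main St'},
       1: {'name': 'Globex', 'address': '9 Side Rd'}}
df2 = {0: {'name': 'ACME Corporation', 'address': '1 Main Street'},
       1: {'name': 'Globex Ltd', 'address': None}}

linker = train_linker(df1, df2)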
Example #2
 def executor(data1, data2):
     input1 = {
         i: {fields1[j]: value
             for j, value in enumerate(row)}
         for i, row in enumerate(data1)
     }
     input2 = {
         i: {fields1[j]: value
             for j, value in enumerate(row)}
         for i, row in enumerate(data2)
     }
     fields = [{'field': field, 'type': 'String'} for field in fields1]
     linker = dedupe.RecordLink(fields)
     linker.prepare_training(input1, input2, sample_size=1500)
     while True:
         labelling(linker)
         try:
             linker.train()
             break
         except:
             sys.stderr.write('\nYou need to do more training.\n')
     pairs = linker.join(input1, input2, threshold, 'many-to-many')
     matches = []
     for pair in pairs:
         matches.append((pair[0][0], pair[0][1], pair[1]))
     return matches
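
Examples #2 and #4 (and #8 further down) call a labelling() helper that is not included in the snippets. A minimal sketch, assuming it simply wraps dedupe's interactive console labeler (console_label in dedupe 2.x; older releases spell it consoleLabel):

import dedupe

def labelling(linker):
    # Ask the user to label uncertain record pairs: 'y' = match, 'n' = distinct,
    # 'u' = unsure, 'f' = finished. The labels are stored on the linker.
    dedupe.console_label(linker)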
Example #3
    def train(self):
        # data_d = self._read_data(self.input_file)
        # self.data_d = data_d
        data_1 = self._read_data(self.input_file_base)
        data_2 = self._read_data(self.input_file)
        self.data_1 = data_1
        self.data_2 = data_2

        fields = self.fields
        deduper = dedupe.RecordLink(fields)
        training_file = self.training_file

        # ## training data
        if os.path.exists(training_file):
            logging.info('reading labeled examples from %s', training_file)
            with open(training_file, 'rb') as f:
                deduper.prepare_training(data_1, data_2, f)
        else:
            deduper.prepare_training(data_1, data_2)

        print('starting active labeling...')
        dedupe.consoleLabel(deduper)
        deduper.train(recall=1)

        with open(training_file, 'w') as tf:
            deduper.writeTraining(tf)

        self.deduper = deduper
Example #4
def match(data1, data2, fields1, fields2,
          threshold):  # threshold not used, as it is automatically calculated
    input1 = {
        i: {fields1[j]: value
            for j, value in enumerate(row)}
        for i, row in enumerate(data1)
    }
    input2 = {
        i: {fields1[j]: value
            for j, value in enumerate(row)}
        for i, row in enumerate(data2)
    }
    fields = [{'field': field, 'type': 'String'} for field in fields1]
    linker = dedupe.RecordLink(fields)
    linker.sample(input1, input2, sample_size=1500)
    while True:
        labelling(linker)
        try:
            linker.train()
            break
        except ValueError:
            sys.stderr.write('\nYou need to do more training.\n')
    threshold = linker.threshold(input1, input2, recall_weight=1)
    pairs = linker.match(input1, input2, threshold)
    matches = []
    for pair in pairs:
        matches.append((pair[0][0], pair[0][1], pair[1]))
    return matches
Example #5
File: test_api.py Project: jgoleary/dedupe
    def setUp(self):
        random.seed(123)

        field_definition = [{
            'field': 'name',
            'type': 'String'
        }, {
            'field': 'age',
            'type': 'String'
        }]
        self.linker = dedupe.RecordLink(field_definition)
Example #6
 def setUp(self):
     random.seed(123)
     fields = {
         'name': {
             'type': 'String'
         },
         'age': {
             'type': 'String'
         },
     }
     self.linker = dedupe.RecordLink(fields)
Example #7
    def train(self, data_1, data_2, training_file):
        fields = self.fields
        deduper = dedupe.RecordLink(fields)
        # ## training data
        if os.path.exists(training_file):
            logging.info('reading labeled examples from %s', training_file)
            with open(training_file, 'rb') as f:
                deduper.prepare_training(data_1, data_2, f)
        else:
            # deduper.prepare_training(data_1, data_2)
            raise Exception("no training data")

        # print('starting active labeling...')
        # dedupe.consoleLabel(deduper)
        deduper.train(recall=1)
        self.deduper = deduper
        return self
Example #8
def match(data1, data2, fields1, fields2):
    data2_remapped = remap(data2, fields1,
                           fields2) if fields1 != fields2 else data2
    fields = [{'field': field, 'type': 'String'} for field in fields1]
    linker = dedupe.RecordLink(fields)
    linker.sample(data1, data2_remapped, sample_size=1500)
    while True:
        labelling(linker)
        try:
            linker.train()
            break
        except ValueError:
            sys.stderr.write('\nYou need to do more training.\n')
    threshold = linker.threshold(data1, data2_remapped, recall_weight=1)
    pairs = linker.match(data1, data2_remapped, threshold)
    matches = []
    for pair in pairs:
        matches.append((pair[0][0], pair[0][1], pair[1]))
    return matches
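
Example #8 also depends on a remap() helper that is not shown. A minimal sketch, assuming it renames each record's keys from the second schema (fields2) to the first (fields1), mirroring the remapping loops in Examples #15 and #19:

def remap(data, fields1, fields2):
    # data is a dict of {record_id: {field: value}} keyed by fields2 names;
    # rebuild each record so its values are keyed by the matching fields1 names.
    remapped = {}
    for record_id, record in data.items():
        remapped[record_id] = {new_field: record[old_field]
                               for new_field, old_field in zip(fields1, fields2)}
    return remapped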
Example #9
def link_dataframes(dfa,
                    dfb,
                    field_properties,
                    config_name="link_dataframes",
                    n_cores=None):

    config_name = config_name.replace(" ", "_")

    settings_file = config_name + '_learned_settings'
    training_file = config_name + '_training.json'

    print('Importing data ...')

    dfa = clean_punctuation(dfa)
    specify_type(dfa, field_properties)

    dfa['index_field'] = dfa.index
    dfa['index_field'] = dfa['index_field'].apply(lambda x: "dfa" + str(x))
    dfa.set_index(['index_field'], inplace=True)

    data_1 = dfa.to_dict(orient='index')

    dfb = clean_punctuation(dfb)
    specify_type(dfb, field_properties)

    dfb['index_field'] = dfb.index
    dfb['index_field'] = dfb['index_field'].apply(lambda x: "dfb" + str(x))
    dfb.set_index(['index_field'], inplace=True)

    data_2 = dfb.to_dict(orient='index')
    # ---------------------------------------------------------------------------------

    # ## Training

    if os.path.exists(settings_file):
        print('Reading from', settings_file)
        with open(settings_file, 'rb') as sf:
            linker = dedupe.StaticRecordLink(sf, num_cores=n_cores)

    else:
        # Define the fields the linker will pay attention to; they are built
        # from field_properties by select_fields() below.

        fields = []
        select_fields(fields, field_properties)

        # Create a new linker object and pass our data model to it.
        linker = dedupe.RecordLink(fields, num_cores=n_cores)
        # To train the linker, we feed it a sample of records.
        linker.prepare_training(data_1, data_2, sample_size=15000)

        # If we have training data saved from a previous run of linker,
        # look for it and load it in.
        # __Note:__ if you want to train from scratch, delete the training_file
        if os.path.exists(training_file):
            print('Reading labeled examples from ', training_file)
            with open(training_file) as tf:
                linker.prepare_training(data_1, data_2, training_file=tf)

        # ## Active learning
        # Dedupe will find the next pair of records
        # it is least certain about and ask you to label them as matches
        # or not.
        # use 'y', 'n' and 'u' keys to flag duplicates
        # press 'f' when you are finished
        print('Starting active labeling...')

        dedupe.console_label(linker)
        linker.train()

        # When finished, save our training away to disk
        with open(training_file, 'w') as tf:
            linker.write_training(tf)

        # Save our weights and predicates to disk.  If the settings file
        # exists, we will skip all the training and learning next time we run
        # this file.
        with open(settings_file, 'wb') as sf:
            linker.write_settings(sf)

    # ## Blocking

    # ## Clustering

    # Find the threshold that will maximize a weighted average of our
    # precision and recall.  When we set the recall weight to 2, we are
    # saying we care twice as much about recall as we do precision.
    #
    # If we had more data, we would not pass in all the blocked data into
    # this function but a representative sample.

    print('Clustering...')
    linked_records = linker.join(data_1, data_2, 0)

    print('# duplicate sets', len(linked_records))

    #Convert linked records into dataframe
    df_linked_records = pd.DataFrame(linked_records)

    df_linked_records['dfa_link'] = df_linked_records[0].apply(lambda x: x[0])
    df_linked_records['dfb_link'] = df_linked_records[0].apply(lambda x: x[1])
    df_linked_records.rename(columns={1: 'confidence'}, inplace=True)
    df_linked_records.drop(columns=[0], inplace=True)
    df_linked_records['cluster id'] = df_linked_records.index

    # For both dfa & dfb, add cluster id & confidence score from linked_records
    dfa.index.rename('dfa_link', inplace=True)
    dfa = dfa.merge(df_linked_records, on='dfa_link', how='left')

    dfb.index.rename('dfb_link', inplace=True)
    dfb = dfb.merge(df_linked_records, on='dfb_link', how='left')

    #Concatenate results from dfa + dfb
    df_final = dfa.append(dfb, ignore_index=True, sort=True)
    df_final = df_final.sort_values(by=['cluster id'])
    df_final = df_final.drop(columns=['dfa_link', 'dfb_link'])

    return df_final
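
A hypothetical invocation sketch for link_dataframes follows. The clean_punctuation(), specify_type() and select_fields() helpers are not shown, so the exact format they expect for field_properties is an assumption; plain column names are used here.

import pandas as pd

dfa = pd.DataFrame({'name': ['Acme Corp', 'Globex'],
                    'city': ['Berlin', 'Paris']})
dfb = pd.DataFrame({'name': ['ACME Corporation', 'Globex Ltd'],
                    'city': ['Berlin', 'Paris']})

# First run is interactive: dedupe.console_label() asks for labels on the console.
df_linked = link_dataframes(dfa, dfb, field_properties=['name', 'city'])
print(df_linked[['cluster id', 'confidence', 'name', 'city']])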
            
Example #10

if os.path.exists(settings_file):
    print('reading from', settings_file)
    with open(settings_file, 'rb') as sf:
        linker = dedupe.StaticRecordLink(sf)

else:
    fields = [
        {'field': 'title', 'type': 'String'},
        {'field': 'title', 'type': 'Text', 'corpus': descriptions()},
        {'field': 'description', 'type': 'Text',
         'has missing': True, 'corpus': descriptions()},
        {'field': 'price', 'type': 'Price', 'has missing': True}]

    linker = dedupe.RecordLink(fields)

    linker.sample(data_1, data_2, 15000)

    if os.path.exists(training_file):
        print('reading labeled examples from ', training_file)
        with open(training_file) as tf:
            linker.readTraining(tf)

    # Dedupe will find the next pair of records it is least certain about and
    # ask you to label them as matches or not. Use the 'y', 'n' and 'u' keys to
    # flag duplicates; press 'f' when you are finished.

    print('starting active labeling...')

    dedupe.consoleLabel(linker)
Example #11
def main_dedupe_method(data_d,spot_d,spot_data,chosen_cols3,country_code,settings_file,training_file,fields):
        
     
    print '''############# Welcome to our Matching Algorithm  ################
    =============== Initializing Part 1 - Data Retrieving =================\n'''
    start_time = time.time()
    
 
    print "Getting data from BP companies..."                
    
    # Select all companies from selected country
    con = psy.connect(database='msi_bp_%s' % (country_code), user = '******', host='basil.colours.local', password='******')
    con.set_client_encoding('UTF8')
    c = con.cursor(cursor_factory=psy.extras.RealDictCursor)
    
    
    q_companies = '''SELECT --DISTINCT on (auth.company_int_id_number)
	CONCAT(ROW_NUMBER() OVER(ORDER BY auth.company_int_id_number),'c') AS line_nr,
	auth.current_authority_pk as auth_nr,
	auth.s_acct_name as "name",
	auth.s_tax_number as vat,
	ads.address_line1 as address,
	ads.address_line3 as town,
	ads.postcode as postcode
	FROM gdm.authorities auth
	LEFT JOIN gdm.addresses ads
		ON ads.address_id_pk = auth.s_address_id
	WHERE auth.s_acct_name is not null
	ORDER BY auth.company_int_id_number'''    
    c.execute(q_companies)
    companies = c.fetchall()

    comp_d = {}
    for row in companies:
        clean_row = {ky: p.proc(row[ky]) for ky in chosen_cols3[1:]}
        row_id = row['line_nr']
        comp_d[row_id] = dict(clean_row)


    print "Saving all data into database..."

    all_data_d=md.merge_dicts(spot_d,comp_d)

    all_data=[]

    for k,v in all_data_d.iteritems():   
        new_row = [v[ky] for ky in chosen_cols3[1:]]
        new_row.insert(0,k)
        all_data.append(new_row)


    c2 = con.cursor()


    c2.execute('DROP TABLE IF EXISTS htc.all_data')

    field_string = ','.join('%s varchar(200)' % name for name in chosen_cols3)
    c2.execute('''CREATE TABLE htc.all_data
               (%s)''' %(field_string))

    num_cols = len(chosen_cols3)
    mog = "(" + ("%s,"*(num_cols -1)) + "%s)"
    args_str = ','.join(c2.mogrify(mog,x) for x in all_data)
    values = "("+ ','.join(x for x in chosen_cols3) +")"
    c2.execute("INSERT INTO htc.all_data %s VALUES %s" % (values, args_str))
    con.commit()


    print 'Data retrieval took ', time.time() - start_time, 'seconds (', (time.time() - start_time)/60, 'minutes)'
    start_time1 = time.time()


    print "Arranging data to create blocks..."

    u_keys = set()

    for n,m in all_data_d.iteritems():
        for k,v in m.iteritems():
            u_keys.add(v)

    u_keys = list(u_keys)


    block_keys = {}
    count = 0

    for k in u_keys:
        block_keys[k] = count
        count += 1

    
    print 'Checking blocks...'

    ids = []
    for k, v in all_data_d.items():
        ids.append([block_keys[v['name']],v['name'], k])
        if 'postcode' in v.keys():
            if v['postcode'] != None:
                ids.append([block_keys[v['postcode']],v['postcode'], k])
        if 'town' in v.keys():
            if v['town'] != None:
                ids.append([block_keys[v['town']],v['town'], k])


    print 'Writing tables...'

    column_names = ['block_key','block_id','record_id']

    c3 = con.cursor()

    print 'Writing htc.blocks_recordlink_test'

    ## Table with all blocks and companies ##
    c3.execute('DROP TABLE IF EXISTS htc.blocks_recordlink_test')
    field_string = ','.join('%s varchar(200)' % name for name in column_names)
    c3.execute('''CREATE TABLE htc.blocks_recordlink_test
               (%s)''' %(field_string))

    num_cols = len(column_names)
    mog = "(" + ("%s,"*(num_cols -1)) + "%s)"
    args_str = ','.join(c3.mogrify(mog,x) for x in ids)
    values = "("+ ','.join(x for x in column_names) +")"
    c3.execute("INSERT INTO htc.blocks_recordlink_test %s VALUES %s" % (values, args_str))
    con.commit()

    print 'Writing htc.plural_blocks'

    ## Table with all blocks that have more than one company ##

    c3.execute("DROP TABLE IF EXISTS htc.plural_blocks")
    c3.execute("DROP TABLE IF EXISTS htc.covered_blocks")

    c3.execute("""CREATE TABLE htc.plural_blocks as (
                SELECT b.block_id 
                FROM htc.blocks_recordlink_test b
                INNER JOIN (SELECT DISTINCT block_id 
                        FROM htc.blocks_recordlink_test 
                        WHERE record_id like '%c') bp
                ON bp.block_id = b.block_id
                INNER JOIN (SELECT DISTINCT block_id 
                        FROM htc.blocks_recordlink_test 
                        WHERE record_id not like '%c') comp
                ON comp.block_id = b.block_id
                GROUP BY b.block_id 
                HAVING COUNT(*) > 1)""")
    con.commit()

    print 'Writing htc.covered_blocks'

    ## Table with list of blocks for each company ##

    c3.execute("""CREATE TABLE htc.covered_blocks as (
                SELECT record_id, array_agg(r.block_key ORDER BY r.block_key) AS sorted_ids
                FROM htc.blocks_recordlink_test r
                INNER JOIN htc.plural_blocks bl
                ON r.block_id = bl.block_id
                GROUP BY record_id)""")
    con.commit()

    c3 = con.cursor(cursor_factory=psy.extras.RealDictCursor)

    print "Blocking..."

    ## Get all companies and blocks ##

    c3.execute("""SELECT br.record_id, m.*, br.block_key, cb.sorted_ids
               FROM htc.blocks_recordlink_test br
               LEFT JOIN 
		        (SELECT * FROM htc.all_data) m
                    ON m.line_nr = br.record_id
                INNER JOIN htc.covered_blocks cb
                    ON br.record_id = cb.record_id
                INNER JOIN htc.plural_blocks pb
                    ON br.block_id = pb.block_id
                ORDER BY br.block_key""")


    rec_ids = c3.fetchall()


    print 'Writing tables took ', time.time() - start_time1, 'seconds (', (time.time() - start_time1)/60, 'minutes)'
    start_time2 = time.time()

    print "Blocking..."
    ## Arrange data into the needed syntax to use in dedupe ##

    blocks = []
    last_key = 0
    count = 0
    bp_block = []
    comp_block = []


    for rec in rec_ids:
        if rec['block_key'] == last_key:
            if rec['record_id'][-1:] != 'c':
                bp_block.append(tuple([rec['record_id'],
                                       {re.sub('_bp','',your_key): p.proc(rec[your_key]) for your_key in chosen_cols3[1:]},
                                       set(rec['sorted_ids'][:rec['sorted_ids'].index(rec['block_key'])])
                                       ]))  
            else:
                comp_block.append(tuple([rec['record_id'],
                                         comp_d[rec['record_id']],
                                         set(rec['sorted_ids'][:rec['sorted_ids'].index(rec['block_key'])])
                                         ])) 

            
        else:
            if bp_block != [] and comp_block != []:
                blocks.append((bp_block, comp_block))
                bp_block = []
                comp_block = []
                if rec['record_id'][-1:] == 'c':
                    comp_block =[tuple([rec['record_id'],
                                        comp_d[rec['record_id']], 
                                        set(rec['sorted_ids'][:rec['sorted_ids'].index(rec['block_key'])])
                                        ])]
            else:
                bp_block = [tuple([rec['record_id'],
                                   {re.sub('_bp','',your_key): p.proc(rec[your_key]) for your_key in chosen_cols3[1:]},
                                   set(rec['sorted_ids'][:rec['sorted_ids'].index(rec['block_key'])])
                                   ])]
    
            last_key = rec['block_key']

        count +=1

    blocks.append((bp_block, comp_block))

    blocks = tuple(blocks)


    del rec_ids
    del all_data

    con.close() 



    # Dynamically create fields according to the chosen columns.
    # We still need to decide what to tell the model when only the name field is available.



    print 'Blocking took ', time.time() - start_time2, 'seconds (', (time.time() - start_time2)/60, 'minutes)'
    start_time3 = time.time()


    print "=============== Initializing Part 2 - Dedupe ===============\n"

    print 'Entering Dedupe ...'

    #================================ DEDUPING WITH BLOCKS ==============================================

    if os.path.exists(settings_file):
        print 'Reading from: ', settings_file
        with open(settings_file) as sf :
        
            linker = dedupe.StaticRecordLink(sf)

    else:
        
        linker = dedupe.RecordLink(fields)
        
        linker.sample(data_d,comp_d)

        if os.path.exists(training_file):
            print 'Reading labeled examples from: ', training_file
            with open(training_file) as tf :
                linker.readTraining(tf)

        print 'Starting Active Labeling...'
        dedupe.consoleLabel(linker)
        linker.train()
        with open(training_file, 'w') as tf :
            linker.writeTraining(tf)

        with open(settings_file, 'w') as sf :
            linker.writeSettings(sf)

    print 'Creating blocks...'

    threshold = linker.thresholdBlocks(blocks, recall_weight=2)

    print 'Generating clusters...'

    clusters = linker.matchBlocks(blocks, threshold)
    clust=list(clusters)

    print 'Deduping took ', time.time() - start_time3, 'seconds (', (time.time() - start_time3)/60, 'minutes)'
    start_time4 = time.time()  

    print 'Writing results to database...'

    con = psy.connect(database='msi_bp_%s' % (country_code), user = '******', host='basil.colours.local', password='******')
    c = con.cursor()

    c.execute('DROP TABLE if exists htc.recordlink_blocks_result')
    con.commit()
    c.execute('CREATE TABLE htc.recordlink_blocks_result (spot_id varchar, comp_id varchar, score double precision)')


    #register_adapter(numpy.int64, addapt_numpy_int64)
    num_cols = 3
    mog = "(" + ("%s,"*(num_cols -1)) + "%s)"
    args_str = ','.join(c.mogrify(mog,x[0]+(float(x[1]),)) for x in clust)
    values = "("+ ','.join(x for x in ['spot_id','comp_id', 'score']) +")"
    c.execute("INSERT INTO htc.recordlink_blocks_result %s VALUES %s" % (values, args_str))


    c.execute('DROP TABLE if exists htc.recordlink_blocks_result_all')

    c.execute("""CREATE TABLE htc.recordlink_blocks_result_all as (
               SELECT adt.line_nr as spot_id,br.comp_id as comp_id,br.score as score,
               CASE WHEN br.comp_id IS NULL THEN 'Not a Client'
               ELSE 'Check Match Score'
               END AS "Result",
               adt.name as name_spot,b.name as name_comp, b.auth_nr as auth_nr_comp
               FROM htc.all_data adt
               LEFT JOIN htc.recordlink_blocks_result br
                   ON br.spot_id = adt.line_nr
                  LEFT JOIN 
                    (SELECT *
                    FROM htc.recordlink_blocks_result br
                    INNER JOIN htc.all_data adt
                    ON adt.line_nr= br.comp_id) b
                ON b.spot_id = adt.line_nr
                WHERE adt.line_nr not like '%c%')""")
    con.commit()
    con.close()



    print 'Writing results took ', time.time() - start_time4, 'seconds (', (time.time() - start_time4)/60, 'minutes)'
    print 'Ended in ' , time.time() - start_time, 'seconds (', (time.time() - start_time)/60, 'minutes)'
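
Example #11 calls p.proc() and md.merge_dicts() from helper modules that are not shown. A minimal sketch of merge_dicts, assuming it simply folds several record dictionaries into one (later keys win on collision):

def merge_dicts(*dicts):
    # Combine the spot and company record dictionaries into a single lookup.
    merged = {}
    for d in dicts:
        merged.update(d)
    return merged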
Example #12

def get_resolved_df(data_1, data_2, fields):

    # Training

    if os.path.exists(settings_file):
        print('reading from', settings_file)
        with open(settings_file, 'rb') as sf:
            linker = dedupe.StaticRecordLink(sf)

    else:
        # Create a new linker object and pass our data model to it.
        linker = dedupe.RecordLink(fields)
        # To train the linker, we feed it a sample of records.
        linker.sample(data_1, data_2, 15000)

        # If we have training data saved from a previous run of linker,
        # look for it and load it in.
        if os.path.exists(training_file):
            print('reading labeled examples from ', training_file)
            with open(training_file) as tf:
                linker.readTraining(tf)

        # ## Active learning
        print('starting active labeling...')
        dedupe.consoleLabel(linker)

        linker.train()

        # When finished, save our training away to disk
        with open(training_file, 'w') as tf:
            linker.writeTraining(tf)

        # Save our weights and predicates to disk.  If the settings file
        # exists, we will skip all the training and learning next time we run
        # this file.
        with open(settings_file, 'wb') as sf:
            linker.writeSettings(sf)

    # ## Clustering

    print('clustering...')
    linked_records = linker.match(data_1, data_2, 0)

    print('# duplicate sets', len(linked_records))
    print(linked_records)

    # ## Writing Results

    df_list = []

    for cluster, score in linked_records:

        #obtain filename + record no clusters
        cluster1 = cluster[0]
        cluster2 = cluster[1]
        match = re.match(r"(\w+)\.csv(\d+)", cluster1)
        filename, idx = match.groups()
        filename += '.csv'
        filename = pr.resource_filename('src.input', filename)
        idx = int(idx)
        print(filename)
        print(idx)

        #dataframe for cluster - 1
        df1 = pd.DataFrame(columns=pd.read_csv(filename).columns)
        df1.loc[0] = pd.read_csv(filename).ix[idx]

        #for cluster 2
        match = re.match(r"(\w+)\.csv(\d+)", cluster2)
        filename, idx = match.groups()
        filename += '.csv'
        filename = pr.resource_filename('src.input', filename)
        idx = int(idx)
        print(filename)
        print(idx)

        #dataframe for cluster 2
        df2 = pd.DataFrame(columns=pd.read_csv(filename).columns)
        df2.loc[0] = pd.read_csv(filename).ix[idx]

        #common attribute
        df2['score'] = [score]
        df1['score'] = [score]

        df = pd.merge(df1, df2, on='score')
        df_list.append(df)

    results_df = pd.concat(df_list, ignore_index=True)
    return results_df
Example #13

            'has missing': True
        },
    ]

    # Pick appropriate schema
    if asset_class == "bonds":
        fields = common_fields + bond_only_fields
    else:
        fields = common_fields

    # Report the schema
    logger.debug("Linkage Schema")
    logger.debug(fields)

    # Create a new linker object and pass our data model to it
    linker = dedupe.RecordLink(fields, num_cores=num_cores)

    # Set up the regularized logistic regression model
    linker.classifier = RegularizedLogisticRegression()

    # To train the linker, we feed it a sample of records
    random.seed(1)
    np.random.seed(1)
    linker.sample(bad_data_dict, good_data_dict, 10000)

    # Look for previous training data to bootstrap the linker
    if os.path.exists(training_file):
        logger.info('Reading labeled examples from {}'.format(training_file))
        with open(training_file) as tf:
            training_pairs_prev = read_training(tf)
    else:
Example #14
def _merge_features(fields, g1_features, g2_features):
    while True:
        if len(g1_features) == len(g2_features) == 1:
            linked_records = (
                (list(g1_features.keys())[0], list(g2_features.keys())[0]),
                1.0,
            )
            g1_features = []
            g2_features = []
            break

        linker = dedupe.RecordLink(fields)
        linker.prepare_training(g1_features, g2_features)
        confirmed_matches = console_label(linker)
        linker.train()
        linked_records = linker.join(
            g1_features, g2_features, 0.0, constraint="one-to-one"
        )

        # remove records from linked_records that are in confirmed_matches
        for (e1, e2) in confirmed_matches:
            idx = 0
            while idx < len(linked_records):
                pair = linked_records[idx]
                for (pair, _) in linked_records:
                    if e1 in pair or e2 in pair:
                        linked_records.pop(idx)
                        break
                idx += 1

        # replace linked record scores with 1.0 if the user explicitly
        # marked them as equivalent. Then fill in other user-marked pairs
        # of linked records at the end
        idx = 0
        while idx < len(confirmed_matches):
            pair = confirmed_matches[idx]
            for lidx, (lpair, _) in enumerate(linked_records):
                if pair == lpair:
                    linked_records[lidx] = (pair, 1.0)
                    confirmed_matches.pop(idx)
                    idx -= 1  # cancel out the increment
                    break
            idx += 1
        linked_records.extend([(pair, 1.0) for pair in confirmed_matches])

        print(
            Style.BRIGHT + Fore.YELLOW + "Is this matching correct?" + Style.RESET_ALL
        )
        for (e1, e2), similarity in linked_records:
            for field in unique(field["field"] for field in fields):
                g1_val = g1_features[e1][field]
                g2_val = g2_features[e2][field]
                print(f"{g1_val:<50} | {g2_val:<50}")
            print(f"Similarity: {similarity}")
            print("-" * 20)
        ans = input(Fore.YELLOW + "[y/n]? " + Style.RESET_ALL)
        if ans.lower() == "y":
            print(
                Fore.GREEN
                + "All correct! Moving on to any stragglers"
                + Style.RESET_ALL
            )
            break
        else:
            print(Fore.RED + "Re-labeling..." + Style.RESET_ALL)
    linked_entities = _unpack_linked_records(linked_records)
    if len(linked_entities) != len(g1_features) or len(linked_entities) != len(
        g2_features
    ):
        leftover_g1 = set(g1_features.keys()).difference(linked_entities)
        leftover_g2 = set(g2_features.keys()).difference(linked_entities)
        leftover_g1 = {k: v for (k, v) in g1_features.items() if k in leftover_g1}
        leftover_g2 = {k: v for (k, v) in g2_features.items() if k in leftover_g2}
    return linked_records, leftover_g1, leftover_g2
Example #15
    def main(self):

        data_1 = {}
        data_2 = {}
        # import the specified CSV file

        data_1 = csvhelpers.readData(self.input_1,
                                     self.field_names_1,
                                     delimiter=self.delimiter,
                                     prefix='input_1')
        data_2 = csvhelpers.readData(self.input_2,
                                     self.field_names_2,
                                     delimiter=self.delimiter,
                                     prefix='input_2')

        # fix price
        for field in self.field_definition:
            if field['type'] == 'Price':
                for key, record in data_1.items():
                    value = record[field['field']]
                    if value:
                        record[field['field']] = int(float(value))

        # fix latlon
        for field in self.field_definition:
            if field['type'] == 'LatLong':
                for key, record in data_1.items():
                    latitude = record[field['latitude']]
                    longitude = record[field['longitude']]
                    record[field['field']] = (
                        float(latitude),
                        float(longitude)) if latitude and longitude else None

        # fix price
        for field in self.field_definition:
            if field['type'] == 'Price':
                for key, record in data_2.items():
                    value = record[field['field']]
                    if value:
                        record[field['field']] = int(float(value))

        # fix latlon
        for field in self.field_definition:
            if field['type'] == 'LatLong':
                for key, record in data_2.items():
                    latitude = record[field['latitude']]
                    longitude = record[field['longitude']]
                    record[field['field']] = (
                        float(latitude),
                        float(longitude)) if latitude and longitude else None

        # sanity check for provided field names in CSV file
        for field in self.field_names_1:
            if field not in list(data_1.values())[0]:
                raise self.parser.error("Could not find field '" + field +
                                        "' in input")

        for field in self.field_names_2:
            if field not in list(data_2.values())[0]:
                raise self.parser.error("Could not find field '" + field +
                                        "' in input")

        if self.field_names_1 != self.field_names_2:
            for record_id, record in data_2.items():
                remapped_record = {}
                for new_field, old_field in zip(self.field_names_1,
                                                self.field_names_2):
                    remapped_record[new_field] = record[old_field]
                data_2[record_id] = remapped_record

        logging.info('imported %d rows from file 1', len(data_1))
        logging.info('imported %d rows from file 2', len(data_2))

        logging.info('using fields: %s' %
                     [field['field'] for field in self.field_definition])

        # If --skip_training has been selected, and we have a settings cache still
        # persisting from the last run, use it in this next run.
        # __Note:__ if you want to add more training data, don't use skip training
        if self.skip_training and os.path.exists(self.settings_file):

            # Load our deduper from the last training session cache.
            logging.info('reading from previous training cache %s' %
                         self.settings_file)
            with open(self.settings_file, 'rb') as f:
                deduper = dedupe.StaticRecordLink(f)

            fields = {
                variable.field
                for variable in deduper.data_model.primary_fields
            }
            (nonexact_1, nonexact_2,
             exact_pairs) = exact_matches(data_1, data_2, fields)

        else:
            # # Create a new deduper object and pass our data model to it.
            deduper = dedupe.RecordLink(self.field_definition)

            fields = {
                variable.field
                for variable in deduper.data_model.primary_fields
            }
            (nonexact_1, nonexact_2,
             exact_pairs) = exact_matches(data_1, data_2, fields)

            # Set up our data sample
            logging.info('taking a sample of %d possible pairs',
                         self.sample_size)
            deduper.sample(nonexact_1, nonexact_2, self.sample_size)

            # Perform standard training procedures
            self.dedupe_training(deduper)

        # ## Blocking

        logging.info('blocking...')

        # ## Clustering

        # Find the threshold that will maximize a weighted average of our precision and recall.
        # When we set the recall weight to 2, we are saying we care twice as much
        # about recall as we do precision.
        #
        # If we had more data, we would not pass in all the blocked data into
        # this function but a representative sample.

        logging.info('finding a good threshold with a recall_weight of %s' %
                     self.recall_weight)
        threshold = deduper.threshold(data_1,
                                      data_2,
                                      recall_weight=self.recall_weight)

        # `duplicateClusters` will return sets of record IDs that dedupe
        # believes are all referring to the same entity.

        logging.info('clustering...')
        clustered_dupes = deduper.match(data_1, data_2, threshold)

        clustered_dupes.extend(exact_pairs)

        logging.info('# duplicate sets %s' % len(clustered_dupes))

        write_function = csvhelpers.writeLinkedResults
        # write out our results

        if self.output_file:
            if sys.version < '3':
                with open(self.output_file, 'wb',
                          encoding='utf-8') as output_file:
                    write_function(clustered_dupes, self.input_1, self.input_2,
                                   output_file, self.inner_join)
            else:
                with open(self.output_file, 'w',
                          encoding='utf-8') as output_file:
                    write_function(clustered_dupes, self.input_1, self.input_2,
                                   output_file, self.inner_join)
        else:
            write_function(clustered_dupes, self.input_1, self.input_2,
                           sys.stdout, self.inner_join)
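
Example #15 relies on an exact_matches() helper that is not part of the snippet. A minimal sketch, assuming it splits off record pairs whose compared fields are identical so that only the remaining records go through fuzzy matching:

def exact_matches(data_1, data_2, match_fields):
    # Key each record on the fields the linker compares; identical keys across
    # the two datasets are treated as exact matches and removed from the
    # inputs that dedupe will fuzzily match.
    nonexact_2 = {}
    exact_pairs = []
    seen = {}

    for key_1, record in data_1.items():
        record_key = tuple(record[field] for field in match_fields)
        seen[record_key] = key_1

    for key_2, record in data_2.items():
        record_key = tuple(record[field] for field in match_fields)
        if record_key in seen:
            exact_pairs.append(((seen[record_key], key_2), 1.0))
        else:
            nonexact_2[key_2] = record

    matched_1 = {pair[0][0] for pair in exact_pairs}
    nonexact_1 = {key: rec for key, rec in data_1.items() if key not in matched_1}

    return nonexact_1, nonexact_2, exact_pairs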
Example #16
def link_records(dataset,
                 left_records_abbr,
                 right_records_abbr,
                 fields_json_file_path,
                 known_matches_filepath='src/prodigy/known_matches.jsonl'):
    """
    Collect the best possible training data for linking records across multiple
    datasets. This recipe is an example of linking records across 2 CSV files
    using the dedupe.io library.
    """

    load_dotenv(find_dotenv())

    account_name = os.environ.get('COSMOS_ACCOUNT_NAME')
    db_name = os.environ.get('COSMOS_DB_NAME')
    graph_name = os.environ.get('COSMOS_GRAPH_NAME')
    master_key = os.environ.get('COSMOS_MASTER_KEY')

    doc_qm = DocumentQueryManager(account_name, master_key, db_name)
    gremlin_qm = GremlinQueryManager(account_name, master_key, db_name,
                                     graph_name)

    # left_records = get_services(doc_qm, left_record_label)
    # right_records = get_services(doc_qm, right_record_label)
    left_record_file_path = f'data/processed/{left_records_abbr}_dedupe_data.csv'
    right_record_file_path = f'data/processed/{right_records_abbr}_dedupe_data.csv'
    left_records = readData(left_record_file_path)
    right_records = readData(right_record_file_path)

    db = connect()  # uses the settings in your prodigy.json

    output_path = f'data/processed/{left_records_abbr}_{right_records_abbr}_dedupe'
    os.makedirs(output_path, exist_ok=True)
    output_file = f'{output_path}/data_matching_output.csv'
    settings_file = f'{output_path}/data_matching_learned_settings'
    training_file = f'{output_path}/data_matching_training.json'

    def descriptions():
        for dataset in (left_records, right_records):
            for k, record in dataset.items():
                yield record['short_description']
                yield record['long_description']

    with open(fields_json_file_path) as fields_json_file:
        fields = json.load(fields_json_file)

    for field in fields:
        validate_field(field)
        if field['type'] == 'Text' and 'corpus' in field:
            func_name = field['corpus'][1:-1]
            field['corpus'] = locals()[func_name].__call__()

    # print(json.dumps(fields, indent=4))

    linker = dedupe.RecordLink(fields)
    # To train the linker, we feed it a sample of records.
    linker.sample(left_records, right_records, 500
                  # round(min(len(left_records) / 2, len(right_records) / 2))
                  )

    # If we have training data saved from a previous run of linker,
    # look for it and load it in.
    examples = db.get_dataset(dataset)
    if examples:
        linker = update_linker(linker, examples)

    # Add known matches
    known_matches = []
    for match in JSONL(known_matches_filepath):
        left_match = None
        right_match = None
        for lr in left_records.values():
            if left_records_abbr in match and lr['name'] == match[
                    left_records_abbr].lower():
                left_match = lr
        for rr in right_records.values():
            if right_records_abbr in match and rr['name'] == match[
                    right_records_abbr].lower():
                right_match = rr
        if left_match and right_match:
            known_matches.append((left_match, right_match))
    linker.markPairs({'distinct': [], 'match': known_matches})

    def update(examples, linker=linker):
        linker = update_linker(linker, examples)

    def get_progress(session=0, total=0, loss=0, linker=linker):
        n_match = len(linker.training_pairs['match'])
        n_distinct = len(linker.training_pairs['distinct'])
        n_match_progress = min(1, (n_match / 10)) / 2
        n_distinct_progress = min(1, (n_distinct / 10)) / 2
        print("Examples Annotated: {0}/10 positive, {1}/10 negative".format(
            n_match, n_distinct))
        print(n_match_progress, n_distinct_progress)
        progress = min(1, n_match_progress + n_distinct_progress)
        return progress

    def on_exit(controller, linker=linker):
        print('TRAINING PAIRS')
        print(linker.training_pairs)
        print('=' * 50)
        print('=' * 50)
        linker.train()
        # Save our weights and predicates to disk.  If the settings file
        # exists, we will skip all the training and learning next time we run
        # this file.
        with open(settings_file, 'wb') as sf:
            linker.writeSettings(sf)

        print('clustering...')
        linked_records = linker.match(left_records, right_records, 0)
        print('# duplicate sets', len(linked_records))

        # ## Writing Results

        # Write our original data back out to a CSV with a new column called
        # 'Cluster ID' which indicates which records refer to each other.

        cluster_membership = {}
        cluster_id = None
        for cluster_id, (cluster, score) in enumerate(linked_records):
            for record_id in cluster:
                cluster_membership[record_id] = (cluster_id, score)

        if cluster_id:
            unique_id = cluster_id + 1
        else:
            unique_id = 0

        with open(output_file, 'w') as f:
            writer = csv.writer(f)

            header_unwritten = True

            for fileno, filename in enumerate(
                (left_record_file_path, right_record_file_path)):
                with open(filename) as f_input:
                    reader = csv.reader(f_input)

                    if header_unwritten:
                        heading_row = next(reader)
                        heading_row.insert(0, 'source file')
                        heading_row.insert(0, 'Link Score')
                        heading_row.insert(0, 'Cluster ID')
                        writer.writerow(heading_row)
                        header_unwritten = False
                    else:
                        next(reader)

                    for row_id, row in enumerate(reader):
                        cluster_details = cluster_membership.get(filename +
                                                                 str(row_id))
                        if cluster_details is None:
                            cluster_id = unique_id
                            unique_id += 1
                            score = None
                        else:
                            cluster_id, score = cluster_details
                        row.insert(0, fileno)
                        row.insert(0, score)
                        row.insert(0, cluster_id)
                        writer.writerow(row)

        print(cluster_membership)

    stream = record_pairs_stream(linker)

    with open('src/prodigy/record_pairs.html') as template_file:
        html_template = template_file.read()

    return {
        'view_id': 'html',
        'dataset': dataset,
        'stream': stream,
        'update': update,
        'progress': get_progress,
        'on_exit': on_exit,
        'config': {
            'html_template': html_template
        }
    }
Example #17

def do_training_iteration(iteration_number,
                          linkage_fields,
                          bad_data_dict,
                          good_data_dict,
                          asset_class,
                          geo_setting,
                          clip_betas=True,
                          clip_levels=None,
                          prior_weights=None):

    # Validate inputs
    if clip_betas and clip_levels is None:
        raise Exception(
            "Please provide clip levels for option clip_betas=True")

    # Create a new linker object and pass our data model to it
    linker = dedupe.RecordLink(linkage_fields, num_cores=num_cores)

    # Set up the regularized logistic regression model
    linker.classifier = RegularizedLogisticRegression()

    # For the initial iteration, use the prior weights
    if iteration_number == 0:
        assert prior_weights is not None, "Must provide prior weights for initial training iteration"
        prior_intercept, prior_beta = prior_weights
        linker.classifier.weights = prior_beta
        linker.classifier.bias = prior_intercept

    # Else we use the latest iteration of the training set
    else:

        # To train the linker, we feed it a sample of records
        random.seed(1)
        np.random.seed(1)
        linker.sample(bad_data_dict, good_data_dict, 10000)

        # Look for the previous training file iteration
        trainset_file_prev_iter = '{}/training_set_build/fuzzy_{}_training_set_v{}.json'.format(
            scratch_dir, asset_class, "_it_{}".format(iteration_number - 1))
        assert os.path.exists(trainset_file_prev_iter
                              ), "Cannot find previous iteration training set"

        # Read the previous training set
        logger.info(
            'Reading labeled examples from {}'.format(trainset_file_prev_iter))
        with open(trainset_file_prev_iter) as tf:
            training_pairs_prev = read_training(tf)
        linker.markPairs(training_pairs_prev)

        # Do the training
        logger.info(
            "Training the linker, iteration {}".format(iteration_number))
        linker.train()
        linker.cleanupTraining()

        # Apply beta clipping if asked to
        if clip_betas:
            linker.classifier.weights = linker.classifier.weights.clip(
                min=clip_levels[:, 0], max=clip_levels[:, 1])

    # Finally, serialize the linker
    logger.info(
        "Serializing the linker, iteration {}".format(iteration_number))
    store_linker(linker, scratch_dir + "/training_set_build", asset_class,
                 geo_setting, "_it_{}".format(iteration_number))
    return linker
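
A hypothetical first-iteration call for do_training_iteration; the shapes assumed below (one classifier weight per linkage field, clip bounds as an (n, 2) array of [min, max] pairs) are illustrative only, since the data model's real feature count is not visible in this snippet, and the data/field arguments are just the function's own parameter names.

import numpy as np

n_features = len(linkage_fields)                    # assumption: one weight per field
clip_levels = np.array([[-5.0, 5.0]] * n_features)  # [min, max] bound per weight
prior_weights = (-3.0, np.zeros(n_features))        # (intercept, beta vector)

linker = do_training_iteration(0, linkage_fields,
                               bad_data_dict, good_data_dict,
                               asset_class, geo_setting,
                               clip_levels=clip_levels,
                               prior_weights=prior_weights)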
Example #18
def link_records(dataset, left_record_file_path, right_record_file_path, fields_json_file_path):
    """
    Collect the best possible training data for linking records across multiple
    datasets. This recipe is an example of linking records across 2 CSV files
    using the dedupe.io library.
    """

    db = connect()  # uses the settings in your prodigy.json

    output_file = 'data_matching_output.csv'
    settings_file = 'data_matching_learned_settings'
    training_file = 'data_matching_training.json'

    left_records = readData(left_record_file_path)
    right_records = readData(right_record_file_path)

    def descriptions():
        for dataset in (left_records, right_records):
            for record in dataset.values():
                yield record['description']

    with open(fields_json_file_path) as fields_json_file:
        fields = json.load(fields_json_file)

    for field in fields:
        validate_field(field)
        if field['type'] == 'Text' and 'corpus' in field:            
            func_name = field['corpus'][1:-1]
            field['corpus'] = locals()[func_name].__call__()

    print('LEN RECORDS: ', len(left_records) / 2, len(right_records) / 2)
    print('MIN SAMPLE', min(len(left_records) / 2, len(right_records) / 2))
    print(fields)

    linker = dedupe.RecordLink(fields)
    # To train the linker, we feed it a sample of records.
    linker.sample(
        left_records,
        right_records,
        round(min(len(left_records) / 2, len(right_records) / 2))
    )

    print('getting examples')
    # If we have training data saved from a previous run of linker,
    # look for it and load it in.
    examples = db.get_dataset(dataset)
    if len(examples) > 0:
        linker = update_linker(linker, examples)

    def update(examples, linker=linker):
        print(len(examples))
        linker = update_linker(linker, examples)

    def get_progress(session=0, total=0, loss=0, linker=linker):
        n_match = len(linker.training_pairs['match'])
        n_distinct = len(linker.training_pairs['distinct'])
        n_match_progress = min(1, (n_match / 10)) / 2
        n_distinct_progress = min(1, (n_distinct / 10)) / 2
        print("Examples Annotated: {0}/10 positive, {1}/10 negative".format(n_match, n_distinct))
        print(n_match_progress, n_distinct_progress)
        progress = min(1, n_match_progress + n_distinct_progress)
        return progress

    def on_exit(controller, linker=linker):
        linker.train()
        # Save our weights and predicates to disk.  If the settings file
        # exists, we will skip all the training and learning next time we run
        # this file.
        with open(settings_file, 'wb') as sf:
            linker.writeSettings(sf)

        print('clustering...')
        linked_records = linker.match(left_records, right_records, 0)
        print('# duplicate sets', len(linked_records))

        # ## Writing Results

        # Write our original data back out to a CSV with a new column called 
        # 'Cluster ID' which indicates which records refer to each other.

        cluster_membership = {}
        cluster_id = None
        for cluster_id, (cluster, score) in enumerate(linked_records):
            for record_id in cluster:
                cluster_membership[record_id] = (cluster_id, score)

        if cluster_id:
            unique_id = cluster_id + 1
        else:
            unique_id = 0


        with open(output_file, 'w') as f:
            writer = csv.writer(f)

            header_unwritten = True

            for fileno, filename in enumerate((left_record_file_path, right_record_file_path)):
                with open(filename) as f_input:
                    reader = csv.reader(f_input)

                    if header_unwritten:
                        heading_row = next(reader)
                        heading_row.insert(0, 'source file')
                        heading_row.insert(0, 'Link Score')
                        heading_row.insert(0, 'Cluster ID')
                        writer.writerow(heading_row)
                        header_unwritten = False
                    else:
                        next(reader)

                    for row_id, row in enumerate(reader):
                        cluster_details = cluster_membership.get(filename + str(row_id))
                        if cluster_details is None:
                            cluster_id = unique_id
                            unique_id += 1
                            score = None
                        else:
                            cluster_id, score = cluster_details
                        row.insert(0, fileno)
                        row.insert(0, score)
                        row.insert(0, cluster_id)
                        writer.writerow(row)

        print(cluster_membership)

    stream = record_pairs_stream(linker)

    with open('./record_pairs.html') as template_file:
        html_template = template_file.read()

    return {
        'view_id': 'html',
        'dataset': dataset,
        'stream': stream,
        'update': update,
        'progress': get_progress,
        'on_exit': on_exit,
        'config': {
            'html_template': html_template
        }
    }
Example #19
  def main(self) :

    data_1 = {}
    data_2 = {}
    # import the specified CSV file

    data_1 = csvhelpers.readData(self.input_1, 
                                 self.field_names_1,
                                 prefix='input_1')
    data_2 = csvhelpers.readData(self.input_2, 
                                 self.field_names_2,
                                 prefix='input_2')

    # sanity check for provided field names in CSV file
    for field in self.field_names_1 :
      if field not in data_1.values()[0]:
        raise parser.error("Could not find field '" + field + "' in input")

    for field in self.field_names_2 :
      if field not in data_2.values()[0]:
        raise parser.error("Could not find field '" + field + "' in input")


    if self.field_names_1 != self.field_names_2 :
      for record_id, record in data_2.items() :
        remapped_record = {}
        for new_field, old_field in zip(self.field_names_1, self.field_names_2) :
          remapped_record[new_field] = record[old_field]
        data_2[record_id] = remapped_record
    
    logging.info('imported %d rows from file 1', len(data_1))
    logging.info('imported %d rows from file 2', len(data_2))

    logging.info('using fields: %s' % self.field_definition.keys())
    # # Create a new deduper object and pass our data model to it.
    deduper = dedupe.RecordLink(self.field_definition)

    # Set up our data sample
    logging.info('taking a sample of %d possible pairs', self.sample_size)
    deduper.sample(data_1, data_2, self.sample_size)

    # If we have training data saved from a previous run of dedupe,
    # look for it and load it in.
    # __Note:__ if you want to train from scratch, delete the training_file

    if os.path.exists(self.training_file):
      logging.info('reading labeled examples from %s' % self.training_file)
      deduper.readTraining(self.training_file)
    elif self.skip_training:
      raise parser.error("You need to provide an existing training_file or run this script without --skip_training")

    if not self.skip_training:
      logging.info('starting active labeling...')

      dedupe.consoleLabel(deduper)
      deduper.train()

      # When finished, save our training away to disk
      logging.info('saving training data to %s' % self.training_file)
      deduper.writeTraining(self.training_file)
    else:
      logging.info('skipping the training step')
      deduper.train()

    # ## Blocking

    logging.info('blocking...')

    # ## Clustering

    # Find the threshold that will maximize a weighted average of our precision and recall. 
    # When we set the recall weight to 2, we are saying we care twice as much
    # about recall as we do precision.
    #
    # If we had more data, we would not pass in all the blocked data into
    # this function but a representative sample.

    logging.info('finding a good threshold with a recall_weight of %s' % 
                 self.recall_weight)
    threshold = deduper.threshold(data_1, data_2, recall_weight=self.recall_weight)

    # `duplicateClusters` will return sets of record IDs that dedupe
    # believes are all referring to the same entity.

    logging.info('clustering...')
    clustered_dupes = deduper.match(data_1, data_2, threshold)

    logging.info('# duplicate sets %s' % len(clustered_dupes))

    write_function = csvhelpers.writeLinkedResults
    # write out our results

    if self.output_file :
      with open(self.output_file, 'w') as output_file :
        write_function(clustered_dupes, 
                       self.input_1, 
                       self.input_2, 
                       output_file,
                       self.inner_join)
    else :
        write_function(clustered_dupes, 
                       self.input_1, 
                       self.input_2, 
                       sys.stdout,
                       self.inner_join)