Example #1
    def write_clusters(self, clustered_dupes, src_data):
        '''
        write data with id, cluster_id, conf_score, <canonical columns>
        '''
        with StringIO() as f:
            is_header = True
            writer = csv.writer(f)
            for (cluster_id, cluster) in enumerate(clustered_dupes):
                id_set, scores = cluster
                # map each record id to its confidence score
                score_dict = {rec_id: score for rec_id, score in zip(id_set, scores)}
                all_cluster_rows = [src_data[c]
                                    for c in id_set]  # list of record dicts
                canonical_rep = dedupe.canonicalize(all_cluster_rows)
                for rec in all_cluster_rows:
                    rec['cluster_id'] = cluster_id

                    for can_col in canonical_rep:
                        rec['canonical_' + can_col] = canonical_rep[can_col]
                    rec['confidence'] = score_dict[int(
                        rec[self.data_source.id_col])]
                    if is_header:
                        writer.writerow(rec.keys())
                        is_header = False
                    writer.writerow(rec.values())
            self.outp_file.write_file(f.getvalue())
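A minimal sketch of the call shape this method relies on: dedupe.canonicalize takes the list of record dicts that make up one cluster and returns a single representative dict with one value per field. The field names and values below are hypothetical.

import dedupe

cluster_rows = [
    {"name": "ACME Corp.", "address": "123 Main St"},
    {"name": "Acme Corporation", "address": "123 Main Street"},
]

# One dict comes back, keyed by the same fields, holding the value chosen
# as most representative of the cluster for each field.
canonical_rep = dedupe.canonicalize(cluster_rows)
print(canonical_rep)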
Example #2
def main():
    d_dict = load_dict()
    if SETTINGS_PATH.exists():
        print(f"Reading settings from {SETTINGS_PATH.name}.")
        with SETTINGS_PATH.open('rb') as f:
            deduper = dedupe.StaticDedupe(f)
    else:
        deduper = label_and_train(d_dict)

    if not DUPES_PATH.exists():
        threshold = deduper.threshold(d_dict, recall_weight=1)

        print("Clustering...")
        clustered_dupes = deduper.match(d_dict, threshold)

        print(f"Writing {len(clustered_dupes)} clusters to {DUPES_PATH.name}.")
        DUPES_PATH.write_bytes(pickle.dumps(clustered_dupes))

    clustered_dupes = pickle.loads(DUPES_PATH.read_bytes())

    for cluster in clustered_dupes:
        id_set, scores = cluster
        cluster_dicts = [d_dict[c] for c in id_set]
        min_index, min_score = get_the_freaking_minimum_index_and_score(scores)
        lowest_score_dict = cluster_dicts[min_index]
        canonical_rep = dedupe.canonicalize(cluster_dicts)
        print(f"\nFound {len(cluster_dicts)} registrations for:")
        print_dict(canonical_rep)
        print(f"Least confident dupe ({min_score}) is:")
        print_dict(lowest_score_dict)
        input("Press enter for next cluster.")
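The helper get_the_freaking_minimum_index_and_score is not defined in this snippet; a plausible version, assuming scores is the plain sequence of per-record confidences from the cluster tuple, could look like this.

def get_the_freaking_minimum_index_and_score(scores):
    # Return (index, value) of the lowest confidence score in the cluster.
    min_index = min(range(len(scores)), key=lambda i: scores[i])
    return min_index, scores[min_index]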
Example #3
    def cluster(self):
        deduper = self.deduper
        if not self.data_d:
            self.data_d = self._read_data(self.input_file)
        data_d = self.data_d
        threshold = deduper.threshold(data_d, recall_weight=1)

        # ## clustering
        print('clustering...')
        clustered_dupes = deduper.match(data_d, threshold)

        print('# duplicate sets', len(clustered_dupes))
        cluster_membership = {}
        cluster_id = 0
        for (cluster_id, cluster) in enumerate(clustered_dupes):
            id_set, scores = cluster
            cluster_d = [data_d[c] for c in id_set]
            canonical_rep = dedupe.canonicalize(cluster_d)
            for record_id, score in zip(id_set, scores):
                cluster_membership[record_id] = {
                    'cluster id': cluster_id,
                    'canonical representation' : canonical_rep,
                    'confidence': score
                }

        singleton_id = cluster_id + 1

        output_file = self.output_file
        input_file = self.input_file
        with open(output_file, 'w') as f_output, open(input_file) as f_input:
            writer = csv.writer(f_output)
            reader = csv.reader(f_input)
        
            heading_row = next(reader)
            heading_row.insert(0, 'confidence_score')
            heading_row.insert(0, 'Cluster ID')
            canonical_keys = canonical_rep.keys()
            for key in canonical_keys:
                heading_row.append('canonical_' + key)
        
            writer.writerow(heading_row)
        
            for row in reader:
                # row_id = int(row[0])
                row_id = row[0]
                if row_id in cluster_membership:
                    cluster_id = cluster_membership[row_id]["cluster id"]
                    canonical_rep = cluster_membership[row_id]["canonical representation"]
                    row.insert(0, cluster_membership[row_id]['confidence'])
                    row.insert(0, cluster_id)
                    for key in canonical_keys:
                        row.append(canonical_rep[key])
                else:
                    row.insert(0, None)
                    row.insert(0, singleton_id)
                    singleton_id += 1
                    for key in canonical_keys:
                        row.append(None)
                writer.writerow(row)
Example #4
def writeCanonRep(session_id, name_pattern='cr_{0}'):
    engine = worker_session.bind
    metadata = MetaData()
    entity = Table('entity_{0}'.format(session_id), metadata,
        autoload=True, autoload_with=engine, keep_existing=True)
    proc_table = Table('processed_{0}'.format(session_id), metadata,
        autoload=True, autoload_with=engine, keep_existing=True)

    cr_cols = [Column('record_id', String, primary_key=True)]
    for col in proc_table.columns:
        if col.name != 'record_id':
            cr_cols.append(Column(col.name, col.type))
    cr = Table(name_pattern.format(session_id), metadata, *cr_cols)
    cr.drop(bind=engine, checkfirst=True)
    cr.create(bind=engine)

    cols = [entity.c.entity_id]
    col_names = [c for c in proc_table.columns.keys() if c != 'record_id']
    for name in col_names:
        cols.append(label(name, func.array_agg(getattr(proc_table.c, name))))
    rows = worker_session.query(*cols)\
        .filter(entity.c.record_id == proc_table.c.record_id)\
        .group_by(entity.c.entity_id)
    names = cr.columns.keys()
    with open('/tmp/{0}.csv'.format(name_pattern.format(session_id)), 'wb') as f:
        writer = UnicodeCSVWriter(f)
        writer.writerow(names)
        for row in rows:
            r = [row.entity_id]
            dicts = [{n: None for n in col_names} for _ in range(len(row[1]))]
            for idx, dct in enumerate(dicts):
                for name in col_names:
                    dct[name] = unicode(getattr(row, name)[idx])
            canon_form = dedupe.canonicalize(dicts)
            r.extend([canon_form[k] for k in names if canon_form.get(k) is not None])
            writer.writerow(r)
    canon_table_name = name_pattern.format(session_id)
    copy_st = 'COPY "{0}" ('.format(canon_table_name)
    copy_st += ', '.join('"{0}"'.format(name) for name in names)
    copy_st += ") FROM STDIN WITH (FORMAT CSV, HEADER TRUE, DELIMITER ',', NULL ' ')"
    conn = engine.raw_connection()
    cur = conn.cursor()
    with open('/tmp/{0}.csv'.format(name_pattern.format(session_id)), 'rb') as f:
        cur.copy_expert(copy_st, f)
    conn.commit()
Example #5
    def cluster(self):
        # Set threshold
        print('threshold...')
        threshold = self.deduper.threshold(self.dictionary, recall_weight=1)

        # ## Clustering

        print('clustering...')
        clustered_dupes = self.deduper.match(self.dictionary, threshold)

        cluster_membership = {}
        cluster_id = 0
        for (cluster_id, cluster) in enumerate(clustered_dupes):
            id_set, scores = cluster
            cluster_d = [self.dictionary[c] for c in id_set]
            canonical_rep = dedupe.canonicalize(cluster_d)
            for record_id, score in zip(id_set, scores):
                cluster_membership[record_id] = {
                    "cluster id": cluster_id,
                    "canonical representation": canonical_rep,
                    "confidence": score
                }

        cluster_index = list(cluster_membership.items())

        dfa = pd.DataFrame(cluster_index)

        dfa.rename(columns={0: 'Id'}, inplace=True)

        dfa['cluster id'] = dfa[1].apply(lambda x: x["cluster id"])
        dfa['confidence'] = dfa[1].apply(lambda x: x["confidence"])

        canonical_list = []

        for i in dfa[1][0]['canonical representation'].keys():
            canonical_list.append(i)
            dfa[i + ' - ' + 'canonical'] = None
            dfa[i + ' - ' + 'canonical'] = dfa[1].apply(
                lambda x: x['canonical representation'][i])

        dfa.set_index('Id', inplace=True)
        self.dataframe = self.dataframe.join(dfa)
        self.dataframe[self.entity["output_fields"] +
                       ['cluster id', 'confidence']].to_csv(
                           self.entity["name"].lower() + "_output.csv")
        #self.dataframe[self.entity["output_fields"] + ['cluster id', 'confidence']].to_sql(self.entity["name"].lower()+"_output", con=db.engine, if_exists='replace')
        return self.dataframe
Example #6
def rundedupe2():
    threshold = deduper.threshold(data_d, recall_weight=1)
    print('clustering...')
    clustered_dupes = deduper.match(data_d, threshold)
    print('# duplicate sets', len(clustered_dupes))
    cluster_membership = {}

    cluster_id = 0
    for (cluster_id, cluster) in enumerate(clustered_dupes):
        id_set, scores = cluster
        cluster_d = [data_d[c] for c in id_set]
        canonical_rep = dedupe.canonicalize(cluster_d)
        for record_id, score in zip(id_set, scores):
            cluster_membership[record_id] = {
                "cluster id": cluster_id,
                "canonical representation": canonical_rep,
                "confidence": score
            }
    singleton_id = cluster_id + 1
    with open(os.getcwd() + "/media/output_files/" + output_file,
              'w') as f_output, open(input_file) as f_input:
        writer = csv.writer(f_output)
        reader = csv.reader(f_input)
        heading_row = next(reader)
        heading_row.insert(0, 'confidence_score')
        heading_row.insert(0, 'Cluster ID')
        canonical_keys = canonical_rep.keys()
        for key in canonical_keys:
            heading_row.append('canonical_' + key)

        writer.writerow(heading_row)

        for row in reader:
            row_id = int(row[0])
            if row_id in cluster_membership:
                cluster_id = cluster_membership[row_id]["cluster id"]
                canonical_rep = cluster_membership[row_id][
                    "canonical representation"]
                row.insert(0, cluster_membership[row_id]['confidence'])
                row.insert(0, cluster_id)
                for key in canonical_keys:
                    row.append(canonical_rep[key])
            else:
                row.insert(0, None)
                row.insert(0, singleton_id)
                singleton_id += 1
                for key in canonical_keys:
                    row.append(None)
            writer.writerow(row)
Example #7
    def gen_cluster_members(self, clustered_dupes, src_data):
        '''
        format and canonicalize the matched data
        '''
        # clustered_dupes looks like [((<ids>), (<conf scores>)), ...]
        cluster_membership = {}
        for (cluster_id, cluster) in enumerate(clustered_dupes):
            id_set, scores = cluster
            cluster_d = [src_data[c] for c in id_set]
            canonical_rep = dedupe.canonicalize(cluster_d)
            for record_id, score in zip(id_set, scores):
                cluster_membership[record_id] = {
                    "cluster id": cluster_id,
                    "canonical representation": canonical_rep,
                    "confidence": score
                }
        return cluster_membership
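A hedged usage sketch for the helper above: clustered_dupes is assumed to come from deduper.match(), src_data maps record id to record dict, and exporter stands in for whatever instance this method lives on.

membership = exporter.gen_cluster_members(clustered_dupes, src_data)
for record_id, info in membership.items():
    print(record_id, info["cluster id"], round(info["confidence"], 3))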
Example #8
print('clustering...')
clustered_dupes = deduper.match(data_d, threshold)

print('# duplicate sets', len(clustered_dupes))

# ## Writing Results

# Write our original data back out to a CSV with a new column called
# 'Cluster ID' which indicates which records refer to each other.

cluster_membership = {}
cluster_id = 0
for (cluster_id, cluster) in enumerate(clustered_dupes):
    id_set, scores = cluster
    cluster_d = [data_d[c] for c in id_set]
    canonical_rep = dedupe.canonicalize(cluster_d)
    for record_id, score in zip(id_set, scores):
        cluster_membership[record_id] = {
            "cluster id": cluster_id,
            "canonical representation": canonical_rep,
            "confidence": score
        }

singleton_id = cluster_id + 1

with open(output_file, 'w') as f_output:
    writer = csv.writer(f_output)

    with open(input_file) as f_input:
        reader = csv.reader(f_input)
Example #9
print('clustering...')
clustered_dupes = deduper.match(data_d, threshold)

print('# duplicate sets', len(clustered_dupes))

# ## Writing Results

# Write our original data back out to a CSV with a new column called 
# 'Cluster ID' which indicates which records refer to each other.

cluster_membership = {}
cluster_id = 0
for (cluster_id, cluster) in enumerate(clustered_dupes):
    id_set, conf_score = cluster
    cluster_d = [data_d[c] for c in id_set]
    canonical_rep = dedupe.canonicalize(cluster_d)
    for record_id in id_set:
        cluster_membership[record_id] = {
            "cluster id" : cluster_id,
            "canonical representation" : canonical_rep,
            "confidence": conf_score
        }

singleton_id = cluster_id + 1

with open(output_file, 'w') as f_output:
    writer = csv.writer(f_output)

    with open(input_file) as f_input:
        reader = csv.reader(f_input)
Example #10
def _cluster(deduper, data, threshold, canonicalize):
    """Internal method that clusters the data.

        Parameters
        ----------
        deduper : dedupe.Deduper
            A trained instance of dedupe.
        data : dict
            The dedupe formatted data dictionary.
        threshold : dedupe.Threshold
            The threshold used for clustering.
        canonicalize : bool or list, default False
            Option that provides the canonical records as additional columns.
            Specifying a list of column names only canonicalizes those columns.

        Returns
        -------
        pd.DataFrame
            A dataframe storing the clustering results.
    """
    # ## Clustering
    print('clustering...')
    clustered_dupes = deduper.match(data, threshold)

    print('# duplicate sets', len(clustered_dupes))

    # Convert data_d to string so that Price & LatLong won't get traceback
    # during dedupe.canonicalize()
    for record in data.values():
        for key in record:
            if record[key] is not None:
                record[key] = str(record[key])
    
    df_data = []
    # ## Writing Results
    cluster_id = 0
    for (cluster_id, cluster) in enumerate(clustered_dupes):
        id_set, scores = cluster
        cluster_d = [data[c] for c in id_set]

        canonical_rep = None
        if canonicalize:
            canonical_rep = dedupe.canonicalize(cluster_d)

        for record_id, score in zip(id_set, scores):
            tmp = {
                'Id': record_id,
                'cluster id': cluster_id,
                'confidence': score,
            }

            if canonicalize:
                fields_to_canon = canonical_rep.keys()

                if isinstance(canonicalize, list):
                    fields_to_canon = canonicalize

                for key in fields_to_canon:
                    canon_key = 'canonical_' + key
                    tmp[canon_key] = canonical_rep[key]

            df_data.append(tmp)

    clustered_df = pd.DataFrame(df_data)
    clustered_df = clustered_df.set_index('Id')

    return clustered_df
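A possible call site for _cluster, following the docstring above; deduper and data are assumed to be a trained Dedupe instance and the dedupe-formatted dict it describes, and the 'name' column is just an illustrative field.

threshold = deduper.threshold(data, recall_weight=1)
clustered_df = _cluster(deduper, data, threshold, canonicalize=['name'])
print(clustered_df.head())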
Example #11
def deduping_function(fields, training_file, settings_file, input_file,
                      output_file, input_dict):
    """
    Function to dedupe rows given a
    fields: list of dictionaries of fields relevant for the training
    training_file: json with training set if the function was already run
    settings_file: file with settings if the function was already run
    input_file: CSV with values to dedup
    output_file: CSV where results are going to be written
    input_dict: dictionary with keys as id and values as dictionary of features for each row
    
    Result is to write the output_file
    """
    if os.path.exists(settings_file):
        print('reading from {}'.format(settings_file))
        with open(settings_file, 'rb') as f:
            deduper = dedupe.StaticDedupe(f)
    else:
        deduper = dedupe.Dedupe(fields)
        deduper.sample(input_dict)
        if os.path.exists(training_file):
            print('reading labeled examples from', training_file)
            with open(training_file) as tf:
                deduper.readTraining(tf)

        print('starting active labeling...')
        dedupe.consoleLabel(deduper)
        deduper.train()

        print('writing training file')
        with open(training_file, 'w') as tf:
            deduper.writeTraining(tf)
        print('writing settings file')
        with open(settings_file, 'wb') as sf:
            deduper.writeSettings(sf)

    print('blocking...')
    threshold = deduper.threshold(input_dict, recall_weight=2)
    print('clustering...')
    clustered_dupes = deduper.match(input_dict, threshold)
    print('# duplicate sets {}'.format(len(clustered_dupes)))

    cluster_membership = {}
    cluster_id = 0
    for (cluster_id, cluster) in enumerate(clustered_dupes):
        id_set, scores = cluster
        cluster_d = [input_dict[c] for c in id_set]
        canonical_rep = dedupe.canonicalize(cluster_d)
        for record_id, score in zip(id_set, scores):
            cluster_membership[record_id] = {
                "cluster id": cluster_id,
                "canonical representation": canonical_rep,
                "confidence": score
            }

    singleton_id = cluster_id + 1

    with open(output_file, 'w') as f_output:
        writer = csv.writer(f_output)

        with open(input_file) as f_input:
            reader = csv.reader(f_input)

            heading_row = next(reader)
            heading_row.insert(0, 'confidence_score')
            heading_row.insert(0, 'Cluster ID')
            canonical_keys = canonical_rep.keys()
            for key in canonical_keys:
                heading_row.append('canonical_' + key)

            writer.writerow(heading_row)

            for row in reader:
                row_id = int(row[0])
                if row_id in cluster_membership:
                    cluster_id = cluster_membership[row_id]["cluster id"]
                    canonical_rep = cluster_membership[row_id][
                        "canonical representation"]
                    row.insert(0, cluster_membership[row_id]['confidence'])
                    row.insert(0, cluster_id)
                    for key in canonical_keys:
                        row.append(canonical_rep[key])
                else:
                    row.insert(0, None)
                    row.insert(0, singleton_id)
                    singleton_id += 1
                    for key in canonical_keys:
                        row.append(None)
                writer.writerow(row)

    print('deduping successfully finished')
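A hedged sketch of the inputs deduping_function expects, based only on its docstring; the field names, paths, and records below are invented, and the call will start console labeling unless the settings file already exists.

fields = [
    {'field': 'name', 'type': 'String'},
    {'field': 'address', 'type': 'String'},
]
input_dict = {
    1: {'name': 'acme corp', 'address': '123 main st'},
    2: {'name': 'acme corporation', 'address': '123 main street'},
}
deduping_function(fields, 'names_training.json', 'names_settings',
                  'names_input.csv', 'names_output.csv', input_dict)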
Example #12
def dedupe_dataframe(df, field_properties, canonicalize=False, config_name="dedupe_dataframe"):
    # Import Data
    print('running CUSTOM dedupe_dataframe LOCAL')
    config_name = config_name.replace(" ", "_")
    
    settings_file = config_name + '_learned_settings'
    training_file = config_name + '_training.json'

    print('importing data ...')

    df = clean_punctuation(df)
    
    specify_type(df, field_properties)                
    
    df['dictionary'] = df.apply(lambda x: dict(zip(df.columns,x.tolist())), axis=1)
    data_d = dict(zip(df.index,df.dictionary))
    for col in df.columns:
      print("df col - - - ", col)
    print(' df[dictionary].. shape', df.shape)
    # If a settings file already exists, we'll just load that and skip training
    if os.path.exists(settings_file):
        print('reading from', settings_file)
        with open(settings_file, 'rb') as f:
            deduper = dedupe.StaticDedupe(f)
    else:
        # ## Training
        # Define the fields dedupe will pay attention to
        
        fields = []
        select_fields(fields, field_properties)
        # print("fields", fields)
        # Create a new deduper object and pass our data model to it.
        deduper = dedupe.Dedupe(fields)

        # To train dedupe, we feed it a sample of records.
        deduper.sample(data_d, 15000)

        # If we have training data saved from a previous run of dedupe,
        # look for it and load it in.
        # __Note:__ if you want to train from scratch, delete the training_file
        if os.path.exists(training_file):
            print('reading labeled examples from ', training_file)
            with open(training_file, 'rb') as f:
                deduper.readTraining(f)
        
        # print('starting active labeling...')
        dedupe.consoleLabel(deduper)

        # Using the examples we just labeled, train the deduper and learn
        # blocking predicates
        print('calling train...')
        deduper.train()

        # When finished, save our training to disk
        with open(training_file, 'w') as tf:
            deduper.writeTraining(tf)

        # Save our weights and predicates to disk.  If the settings file
        # exists, we will skip all the training and learning next time we run
        # this file.
        with open(settings_file, 'wb') as sf:
            deduper.writeSettings(sf)
        print('saved our training to disk.')
    # ## Set threshold

    threshold = deduper.threshold(data_d, recall_weight=1)
    print('threshold=', threshold)
    # ## Clustering

    print('clustering...')
    clustered_dupes = deduper.match(data_d, threshold)

    print('# duplicate sets', len(clustered_dupes))

    # Convert data_d to string so that Price & LatLong won't get traceback during dedupe.canonicalize()
    
    for record in data_d.values():
        for key in record:
            if record[key] is not None:
                record[key] = str(record[key])
            
    # ## Writing Results

    cluster_membership = {}
    cluster_id = 0
    for (cluster_id, cluster) in enumerate(clustered_dupes):
        id_set, scores = cluster
        cluster_d = [data_d[c] for c in id_set]
        canonical_rep = dedupe.canonicalize(cluster_d)
        for record_id, score in zip(id_set, scores):
            cluster_membership[record_id] = {
                "cluster id" : cluster_id,
                "canonical representation" : canonical_rep,
                "confidence": score
            }

    cluster_index = list(cluster_membership.items())

    # turn results into dataframe
    dfa = pd.DataFrame(cluster_index)

    dfa.rename(columns={0: 'Id index'}, inplace=True)

    canonical_list=[]
    
    if canonicalize is True:
        for i in dfa[1][0]['canonical representation'].keys():
            canonical_list.append(i)
            dfa[i + ' - ' + 'canonical'] = dfa[1].apply(
                lambda x: x['canonical representation'][i])
    elif isinstance(canonicalize, list):
        for i in canonicalize:
            dfa[i + ' - ' + 'canonical'] = dfa[1].apply(
                lambda x: x['canonical representation'][i])

    dfa.set_index('Id index', inplace=True)
  
    df = df.join(dfa)

    return df
Example #13
def process():
    """entry point"""
    api = sesamclient.Connection(sesamapi_base_url=INSTANCE,
                                 jwt_auth_token=JWT,
                                 timeout=60 * 10)
    result = api.session.get(
        api.sesamapi_base_url +
        "publishers/{}/entities?deleted=false&history=false".format(SOURCE))

    if result.status_code != 200:
        logging.warning('Sesam API returned non 200 code {}'.format(
            result.status_code))
        iterator = iter([])
    else:
        iterator = iter(result.json())
    raw_data = []

    for dataset_entity in iterator:
        raw_data.append(dataset_entity)
    # this is in memory service, for large datasets we probably may want to have service with DB storage
    if len(raw_data) > 100_000:
        logging.warning(
            'This service is not suitable for datasets with more than 100 000 elements'
            ' and may lead to OutOfMemory errors')

    cleaned_data = read_data(raw_data)

    logging.info('reading from %s', SETTINGS_FILE)

    with open(SETTINGS_FILE, 'rb') as f:
        deduper = dedupe.StaticDedupe(f)

    threshold = deduper.threshold(cleaned_data, recall_weight=1)

    logging.info('clustering...')
    clustered_dupes = deduper.match(cleaned_data, threshold)
    logging.info('%d duplicate sets found', len(clustered_dupes))

    cluster_membership = {}
    cluster_id = 0

    for (cluster_id, cluster) in enumerate(clustered_dupes):
        id_set, scores = cluster
        cluster_d = [cleaned_data[c] for c in id_set]
        canonical_rep = dedupe.canonicalize(cluster_d)
        for record_id, score in zip(id_set, scores):
            cluster_membership[record_id] = {
                "cluster id": cluster_id,
                "canonical representation": canonical_rep,
                "confidence": score
            }

    result_dataset = []
    for row in raw_data:
        row_id = row['_id']
        if row_id in cluster_membership:
            result_dict = {
                '_id': row_id,
                'cluster_id': cluster_membership[row_id]["cluster id"],
                'confidence_score': cluster_membership[row_id]['confidence']
            }

            if ADD_ORIGINALS:
                result_dict['originals'] = {key: row[key] for key in KEYS}

            if ADD_CANONICALS:
                result_dict['canonical_rep'] = cluster_membership[row_id][
                    "canonical representation"]

            result_dataset.append(result_dict)
    if TARGET is not None:
        target_url = api.sesamapi_base_url + "receivers/{}/entities".format(
            TARGET)
        requests.post(target_url,
                      json.dumps(sorted(result_dataset,
                                        key=lambda k: k['cluster_id']),
                                 cls=NumpyEncoder),
                      headers={
                          'Authorization': 'Bearer {}'.format(JWT),
                          'Content-Type': 'application/json'
                      })
        return Response()

    return Response(json.dumps(sorted(result_dataset,
                                      key=lambda k: k['cluster_id']),
                               cls=NumpyEncoder),
                    mimetype='application/json')
Example #14
def dedupe_training(category, recall_weight=1):
    assert category in ['mangas', 'animes'], "Only mangas or animes need training"
    data = {}
    for work in Work.objects.filter(category__slug=category[:-1]).values_list('title', 'vo_title', 'id'):
        data[work[2]] = {'title': work[0], 'vo_title': work[1]}
        for field in ['title', 'vo_title']:
            if not data[work[2]][field]:
                data[work[2]][field] = None
    fields = [
        {'field': 'title', 'type': 'String'},
        {'field': 'vo_title', 'type': 'String'},
    ]
    output_file = os.path.join(DEDUPE_DIR, category + '_output.csv')
    settings_file = os.path.join(DEDUPE_DIR, category + '_learned_settings')
    training_file = os.path.join(DEDUPE_DIR, category+'_training.json')

    deduper = Dedupe(fields)
    deduper.sample(data)
    consoleLabel(deduper)

    if os.path.exists(training_file):
        print('reading labeled examples from ', training_file)
        with open(training_file, 'rb') as f:
            deduper.readTraining(f)

    deduper.train()

    with open(training_file, 'w') as tf:
        deduper.writeTraining(tf)

    with open(settings_file, 'wb') as sf:
        deduper.writeSettings(sf)

    # recall_weight defines the weight you give to recall as opposed to precision
    threshold = deduper.threshold(data, recall_weight)

    print('clustering...')
    clustered_dupes = deduper.match(data, threshold)

    print('# duplicate sets', len(clustered_dupes))

    input_file = os.path.join(DEDUPE_DIR, category + '.csv')
    with open(input_file, 'w', newline='', encoding='utf-8') as f:
        writer = csv.writer(f, delimiter='\t')
        writer.writerow(['id', 'title', 'vo_title'])
        for work_id in data:
            title = data[work_id]['title']
            vo_title = data[work_id]['vo_title']
            writer.writerow([work_id, title, vo_title])
    cluster_membership = {}
    cluster_id = 0
    for (cluster_id, cluster) in enumerate(clustered_dupes):
        id_set, scores = cluster
        cluster_d = [data[c] for c in id_set]
        canonical_rep = canonicalize(cluster_d)
        for record_id, score in zip(id_set, scores):
            cluster_membership[record_id] = {
                "cluster id" : cluster_id,
                "canonical representation" : canonical_rep,
                "confidence": score
            }
    singleton_id = cluster_id + 1

    with open(output_file, 'w', newline='', encoding='utf-8') as f_output, open(input_file, newline='', encoding='utf-8') as f_input:
        writer = csv.writer(f_output, delimiter='\t')
        reader = csv.reader(f_input, delimiter='\t')

        heading_row = next(reader)
        heading_row.insert(0, 'confidence_score')
        heading_row.insert(0, 'Cluster ID')
        canonical_keys = canonical_rep.keys()
        for key in canonical_keys:
            heading_row.append('canonical_' + key)

        writer.writerow(heading_row)

        for row in reader:
            row_id = int(row[0])
            if row_id in cluster_membership:
                cluster_id = cluster_membership[row_id]["cluster id"]
                canonical_rep = cluster_membership[row_id]["canonical representation"]
                row.insert(0, cluster_membership[row_id]['confidence'])
                row.insert(0, cluster_id)
                for key in canonical_keys:
                    row.append(canonical_rep[key])
            else:
                row.insert(0, None)
                row.insert(0, singleton_id)
                singleton_id += 1
                for key in canonical_keys:
                    row.append(None)
            writer.writerow(row)
    print('output file written')
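As the recall_weight comment above notes, larger values favor recall over precision when picking the threshold; with the threshold API used throughout these examples, the trade-off looks roughly like this (the specific values are illustrative only).

precision_leaning = deduper.threshold(data, recall_weight=0.5)  # fewer, surer matches
balanced = deduper.threshold(data, recall_weight=1)
recall_leaning = deduper.threshold(data, recall_weight=2)       # more matches, more noise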
Example #15
def dupe_aff(aff_data):
    import dedupe
    import logging

    log_level = logging.DEBUG
    logging.getLogger().setLevel(log_level)

    fields = [
        {
            'field': 'name',
            'type': 'String'
        },
    ]
    data = {}
    cnt = 0
    for aff in aff_data:
        data[cnt] = {"id": cnt, "name": aff}
        cnt += 1
    deduper = dedupe.Dedupe(fields)
    deduper.sample(data, 15000)
    print('starting active labeling...')
    dedupe.consoleLabel(deduper)
    deduper.train()
    print('blocking...')
    threshold = deduper.threshold(data, recall_weight=2)

    # `match` will return sets of record IDs that dedupe
    # believes are all referring to the same entity.

    print('clustering...')
    clustered_dupes = deduper.match(data, threshold)

    print('# duplicate sets', len(clustered_dupes))

    # ## Writing Results

    # Write our original data back out to a CSV with a new column called
    # 'Cluster ID' which indicates which records refer to each other.

    cluster_membership = {}
    cluster_id = 0
    for (cluster_id, cluster) in enumerate(clustered_dupes):
        id_set, scores = cluster
        cluster_d = [data[c] for c in id_set]
        for x in cluster_d:
            x["id"] = str(x["id"])
        canonical_rep = dedupe.canonicalize(cluster_d)
        for record_id, score in zip(id_set, scores):
            cluster_membership[record_id] = {
                "cluster id": cluster_id,
                "canonical representation": canonical_rep,
                "confidence": score
            }

    singleton_id = cluster_id + 1

    clusters = []
    for c in clustered_dupes:
        clusters.append([data[k] for k in c[0]])

    clusters = sorted(clusters, key=lambda x: len(x), reverse=True)
Example #16
    def train(input_file, train_sample, fields):
        try:
            data_d = Dedupetrainer.readData(input_file)
            # Define the fields dedupe will pay attention to
            # Create a new deduper object and pass our data model to it.
            deduper = dedupe.Dedupe(fields)
            print("***********************")
            deduper.sample(data_d, train_sample)
            dedupe.consoleLabel(deduper)
            deduper.train()
            threshold = deduper.threshold(data_d, recall_weight=1)

            # ## Clustering

            # `match` will return sets of record IDs that dedupe
            # believes are all referring to the same entity.

            print('clustering...')
            clustered_dupes = deduper.match(data_d, threshold)

            print('# duplicate sets', len(clustered_dupes))
            cluster_membership = {}
            cluster_id = 0
            for (cluster_id, cluster) in enumerate(clustered_dupes):
                id_set, scores = cluster
                cluster_d = [data_d[c] for c in id_set]
                canonical_rep = dedupe.canonicalize(cluster_d)
                for record_id, score in zip(id_set, scores):
                    cluster_membership[record_id] = {
                        "cluster id": cluster_id,
                        "canonical representation": canonical_rep,
                        "confidence": score
                    }

            singleton_id = cluster_id + 1
            output_file = "./media/output.csv"
            with open(output_file,
                      'w') as f_output, open(input_file) as f_input:
                writer = csv.writer(f_output)
                reader = csv.reader(f_input)

                heading_row = next(reader)
                heading_row.insert(0, 'confidence_score')
                heading_row.insert(0, 'Cluster ID')
                canonical_keys = canonical_rep.keys()
                for key in canonical_keys:
                    heading_row.append('canonical_' + key)

                writer.writerow(heading_row)

                for row in reader:
                    row_id = int(row[0])
                    if row_id in cluster_membership:
                        cluster_id = cluster_membership[row_id]["cluster id"]
                        canonical_rep = cluster_membership[row_id][
                            "canonical representation"]
                        row.insert(0, cluster_membership[row_id]['confidence'])
                        row.insert(0, cluster_id)
                        for key in canonical_keys:
                            row.append(canonical_rep[key])
                    else:
                        row.insert(0, None)
                        row.insert(0, singleton_id)
                        singleton_id += 1
                        for key in canonical_keys:
                            row.append(None)
                    writer.writerow(row)

            print("process has been completed")
            return "success"

        except Exception as e:
            print("Error in catch ", str(e))
            return "failure"
Example #17
def clusterreview(request):
    username = request.session['username']
    if request.method == 'POST':
        print("cluster review post")
        selects = request.POST.getlist("_selected_action")

        print(selects)
        clusterid = request.POST.get("clusterid")
        # clusterCora.objects.filter(clusterid=clusterid).update(is_checked=1)

        aaa = clusterCora.objects.filter(clusterid=clusterid)
        sett = set(aitem.cora_id for aitem in aaa)
        print("sett", sett)
        noset = set(int(noitem) for noitem in selects)
        print('noset:', noset)
        iyes = list(sett.difference(noset))
        print("iyes", iyes)
        clusterCora.objects.filter(cora_id__in=noset).update(
            is_checked=0)  # no
        clusterCora.objects.filter(cora_id__in=set(iyes)).update(
            is_checked=1)  # yes
        uids = WorkerInfo.objects.filter(user=username).values_list('id',
                                                                    flat=True)
        msg = {"clusterid": clusterid, "include": iyes, 'exclude': list(noset)}
        WorkLog.objects.create(
            task=dxaconstants.ERTASK.Cora,
            operate_message=json.dumps(msg),
            operate_flag=dxaconstants.WorkerOperation.clusterReview,
            operate_user=uids[0],
            operate_system=dxaconstants.TestSystems.Dedupe)

        num = clusterCora.objects.filter(user=username, is_checked=-1).count()
        print("num:", num)

        if num == 0:
            clusteridlist = clusterCora.objects.values('clusterid').distinct()
            print('clusterlist:', clusteridlist)
            examples = {'distinct': [], 'match': []}
            for clusteridd in clusteridlist:
                print(clusteridd)
                print(clusteridd['clusterid'])
                aaa = clusterCora.objects.filter(
                    clusterid=clusteridd['clusterid'])
                # sett = set(aitem.cora_id for aitem in aaa)
                nosets = clusterCora.objects.filter(
                    clusterid=clusteridd['clusterid'], is_checked=0)  # no
                yessets = clusterCora.objects.filter(
                    clusterid=clusteridd['clusterid'], is_checked=1)  # yes
                nocoraids = set(aitem.cora_id for aitem in nosets)
                yescoraids = set(aitem.cora_id for aitem in yessets)
                inosescora = Cora.objects.filter(id__in=nocoraids)
                iyesescora = Cora.objects.filter(id__in=yescoraids)
                for nocora in inosescora:
                    d1 = dict([('text', nocora.text), ('id', str(nocora.id))])
                    for yescora in iyesescora:
                        d2 = dict([('text', yescora.text),
                                   ('id', str(yescora.id))])
                        record_pair = (d1, d2)
                        # record_pair = dict([('text', ditem.text1), ('id', ditem.id1)]) + dict(
                        #     [('text', ditem.text2), ('id', ditem.id2)])
                        print(record_pair)
                        examples['distinct'].append(record_pair)
                for yescora1 in iyesescora:
                    d1 = dict([('text', yescora1.text),
                               ('id', str(yescora1.id))])
                    for yescora2 in iyesescora:
                        d2 = dict([('text', yescora2.text),
                                   ('id', str(yescora2.id))])
                        if not yescora1.id == yescora2.id:
                            record_pair = (d1, d2)
                            examples['match'].append(record_pair)
            if os.path.exists(training_file):
                print('reading labeled examples from ', training_file)
                with open(training_file, 'rb') as af:
                    deduper.readTraining(af)
            deduper.markPairs(examples)
            deduper.train()
            # When finished, save our training away to disk
            with open(training_file, 'w') as tf:
                deduper.writeTraining(tf)
            # Save our weights and predicates to disk.  If the settings file
            # exists, we will skip all the training and learning next time we run
            # this file.
            with open(settings_file, 'wb') as sf:
                deduper.writeSettings(sf)
            threshold = deduper.threshold(temp_d, recall_weight=1)
            print('clustering...')
            clustered_dupes = deduper.match(temp_d, threshold)

            print('# duplicate sets', len(clustered_dupes))

            cluster_id = 0
            print(clustered_dupes)

            aaa = models.clusterTemp.objects.filter(user=username)
            if aaa:
                aaa.delete()
            models.clusterCanonicalRepresentation.objects.filter(
                user=username).delete()
            for (cluster_id, cluster) in enumerate(clustered_dupes):
                id_set, scores = cluster
                print(id_set)
                cluster_d = [temp_d[c] for c in id_set]
                print(cluster_d)
                canonical_rep = dedupe.canonicalize(cluster_d)
                models.clusterCanonicalRepresentation.objects.create(
                    clusterid=cluster_id,
                    canonrep=canonical_rep,
                    user=username,
                    task=dxaconstants.ERTASK.Cora)
                for record_id, score in zip(id_set, scores):
                    # cluster_membershipRound2[record_id] = {
                    #     "cluster id": cluster_id,
                    #     "canonical representation": canonical_rep,
                    #     "confidence": score
                    #
                    # }
                    models.clusterTemp.objects.create(
                        task_record_id=record_id,
                        task=dxaconstants.ERTASK.Cora,
                        clusterid=cluster_id,
                        confidence=score,
                        user=username)
            # print(cluster_membershipRound2)

            aaaaa = models.clusterTemp.objects.all()
            sett = set(bitem.task_record_id for bitem in aaaaa)
            corass = Cora.objects.all()
            alls = set(bitem.id for bitem in corass)
            isingle = list(alls.difference(sett))
            for single in isingle:
                models.clusterTemp.objects.create(
                    task_record_id=single,
                    task=dxaconstants.ERTASK.Cora,
                    clusterid=-1,
                    confidence=0.0,
                    user=username)
            return redirect('/dedupe/addtoclusters')
        else:
            a = clusterCora.objects.filter(user=username,
                                           is_checked=-1).aggregate(
                                               Min('clusterid'))
            datas = clusterCora.objects.filter(clusterid=a['clusterid__min'])
            sett = set(citem.cora_id for citem in datas)
            coras = Cora.objects.filter(id__in=sett)
            print(coras)
            return render(request, 'dedupe/clusterreview.html', {
                'data': coras,
                'clusterid': a['clusterid__min']
            })
    a = clusterCora.objects.filter(user=username,
                                   is_checked=-1).aggregate(Min('clusterid'))
    print(a)
    print(a['clusterid__min'])
    datas = clusterCora.objects.filter(clusterid=a['clusterid__min'])
    print(datas)
    sett = set(bitem.cora_id for bitem in datas)
    coras = Cora.objects.filter(id__in=sett)
    print(coras)
    return render(request, 'dedupe/clusterreview.html', {
        'data': coras,
        'clusterid': a['clusterid__min']
    })
Example #18
def deDuplication():

    input_file = 'csv_example_input.csv'
    output_file = 'csv_example_output.csv'
    settings_file = 'csv_example_learned_settings3'
    training_file = 'csv_example_training.json3'

    def preProcess(column):

        try:
            column = column.decode('utf-8')
        except AttributeError:
            pass
        column = unidecode(column)
        column = re.sub(' +', ' ', column)
        column = re.sub('\n', ' ', column)
        column = column.strip().strip('"').strip("'").lower().strip()

        if not column:
            column = None
        return column

    # Read in the data from CSV file:
    def readData(filename):

        data_d = {}
        with open(filename, encoding="utf-8") as f:
            reader = csv.DictReader(f)
            for row in reader:
                clean_row = [(k, preProcess(v)) for (k, v) in row.items()]
                row_id = row['id']
                data_d[row_id] = dict(clean_row)

        return data_d

    print('importing data ...')
    data_d = readData(input_file)

    if os.path.exists(settings_file):
        print('reading from', settings_file)
        with open(settings_file, 'rb') as f:
            deduper = dedupe.StaticDedupe(f)
    else:
        fields = [
            {
                'field': 'DisplayName_Processed',
                'type': 'String'
            },
            {
                'field': 'Email_Processed',
                'type': 'String'
            },
            {
                'field': 'Info',
                'type': 'String'
            },
        ]
        deduper = dedupe.Dedupe(fields)
        deduper.sample(data_d, 15000)

        if os.path.exists(training_file):
            print('reading labeled examples from ', training_file)
            with open(training_file, 'rb') as f:
                deduper.readTraining(f)

        print('starting active labeling...')

        dedupe.consoleLabel(deduper)

        deduper.train()

        with open(training_file, 'w') as tf:
            deduper.writeTraining(tf)

        with open(settings_file, 'wb') as sf:
            deduper.writeSettings(sf)

    threshold = deduper.threshold(data_d, recall_weight=1)

    print('clustering...')
    clustered_dupes = deduper.match(data_d, threshold)

    print('# duplicate sets', len(clustered_dupes))

    cluster_membership = {}
    cluster_id = 0
    for (cluster_id, cluster) in enumerate(clustered_dupes):
        id_set, scores = cluster
        cluster_d = [data_d[c] for c in id_set]
        canonical_rep = dedupe.canonicalize(cluster_d)
        for record_id, score in zip(id_set, scores):
            cluster_membership[record_id] = {
                "cluster id": cluster_id,
                "canonical representation": canonical_rep,
                "confidence": score
            }

    singleton_id = cluster_id + 1

    with open(output_file, 'w',
              encoding="utf-8") as f_output, open(input_file,
                                                  encoding="utf-8") as f_input:
        writer = csv.writer(f_output)
        reader = csv.reader(f_input)

        heading_row = next(reader)
        heading_row.insert(0, 'confidence_score')
        heading_row.insert(0, 'Cluster ID')
        canonical_keys = canonical_rep.keys()
        for key in canonical_keys:
            heading_row.append('canonical_' + key)

        writer.writerow(heading_row)

        for row in reader:
            row_id = row[0]
            if row_id in cluster_membership:
                cluster_id = cluster_membership[row_id]["cluster id"]
                canonical_rep = cluster_membership[row_id][
                    "canonical representation"]
                row.insert(0, cluster_membership[row_id]['confidence'])
                row.insert(0, cluster_id)
                for key in canonical_keys:
                    row.append(canonical_rep[key])
            else:
                row.insert(0, None)
                row.insert(0, singleton_id)
                singleton_id += 1
                for key in canonical_keys:
                    row.append(None)
            writer.writerow(row)
    return clustered_dupes
Example #19
def dedupe_dataframe(df, field_properties, canonicalize=False,
                     config_name="dedupe_dataframe", recall_weight=1,
                     sample_size=0.3):
    """Deduplicates a dataframe given fields of interest.

        Parameters
        ----------
        df : pd.DataFrame
            The dataframe to deduplicate.
        field_properties : list
            A list specifying what fields to use for deduplicating records.
        canonicalize : bool, default False
            Option that provides the canonical record as additional columns.
        config_name : str, default dedupe_dataframe
            The configuration file name. Note that this will be used as 
            a prefix to save the settings and training files.
        recall_weight : int, default 1
            Find the threshold that will maximize a weighted average of our
            precision and recall.  When we set the recall weight to 2, we are
            saying we care twice as much about recall as we do precision.
        sample_size : float, default 0.3
            Specify the sample size used for training as a float from 0 to 1.
            By default it is 30% (0.3) of our data.

        Returns
        -------
        pd.DataFrame
            A pandas dataframe that contains the cluster id and confidence
            score. Optionally, it will contain canonicalized columns for all
            attributes of the record.

    """
    # Import Data  
    config_name = config_name.replace(" ", "_")
   
    settings_file = config_name + '_learned_settings'
    training_file = config_name + '_training.json'

    print('importing data ...')

    df = clean_punctuation(df)
    
    specify_type(df, field_properties)                
    
    df['dictionary'] = df.apply(
        lambda x: dict(zip(df.columns, x.tolist())), axis=1)
    data_d = dict(zip(df.index, df.dictionary))
    
    # If a settings file already exists, we'll just load that and skip training
    if os.path.exists(settings_file):
        print('reading from', settings_file)
        with open(settings_file, 'rb') as f:
            deduper = dedupe.StaticDedupe(f)
    else:
        # ## Training

        # Define the fields dedupe will pay attention to
        
        fields = []
        select_fields(fields, field_properties)

        # Create a new deduper object and pass our data model to it.
        deduper = dedupe.Dedupe(fields)

        # To train dedupe, we feed it a sample of records.
        sample_num = math.floor(len(df) * sample_size)
        deduper.sample(data_d, sample_num)

        # If we have training data saved from a previous run of dedupe,
        # look for it and load it in.
        # __Note:__ if you want to train from scratch, delete the training_file
        if os.path.exists(training_file):
            print('reading labeled examples from ', training_file)
            with open(training_file, 'rb') as f:
                deduper.readTraining(f)

        print('starting active labeling...')

        dedupe.consoleLabel(deduper)

        # Using the examples we just labeled, train the deduper and learn
        # blocking predicates
        deduper.train()

        # When finished, save our training to disk
        with open(training_file, 'w') as tf:
            deduper.writeTraining(tf)

        # Save our weights and predicates to disk.  If the settings file
        # exists, we will skip all the training and learning next time we run
        # this file.
        with open(settings_file, 'wb') as sf:
            deduper.writeSettings(sf)

    # ## Set threshold
    threshold = deduper.threshold(data_d, recall_weight=recall_weight)

    # ## Clustering
    print('clustering...')
    clustered_dupes = deduper.match(data_d, threshold)

    print('# duplicate sets', len(clustered_dupes))

    # Convert data_d to string so that Price & LatLong won't get traceback
    # during dedupe.canonicalize()
    for record in data_d.values():
        for key in record:
            if record[key] is not None:
                record[key] = str(record[key])
            
    # ## Writing Results
    cluster_membership = {}
    cluster_id = 0
    for (cluster_id, cluster) in enumerate(clustered_dupes):
        id_set, scores = cluster
        cluster_d = [data_d[c] for c in id_set]
        canonical_rep = dedupe.canonicalize(cluster_d)
        for record_id, score in zip(id_set, scores):
            cluster_membership[record_id] = {
                "cluster id": cluster_id,
                "canonical representation": canonical_rep,
                "confidence": score
            }

    cluster_index = list(cluster_membership.items())

    # turn results into dataframe
    dfa = pd.DataFrame(cluster_index)
    dfa.rename(columns={0: 'Id'}, inplace=True)
    
    dfa['cluster id'] = dfa[1].apply(lambda x: x["cluster id"])
    dfa['confidence'] = dfa[1].apply(lambda x: x["confidence"])

    canonical_list = []
    
    if canonicalize is True:
        for i in dfa[1][0]['canonical representation'].keys():
            canonical_list.append(i)
            dfa[i + ' - ' + 'canonical'] = dfa[1].apply(
                lambda x: x['canonical representation'][i])
    elif isinstance(canonicalize, list):
        for i in canonicalize:
            dfa[i + ' - ' + 'canonical'] = dfa[1].apply(
                lambda x: x['canonical representation'][i])

    dfa.set_index('Id', inplace=True)
    df = df.join(dfa)            
    df.drop(columns=[1, 'dictionary'], inplace=True)

    return df
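A hedged usage sketch for dedupe_dataframe as defined above; the dataframe, column names, and config name are invented, and field_properties is assumed to be a plain list of the columns to compare, as the docstring suggests. Unless a settings file from a previous run exists, the call will drop into interactive console labeling.

import pandas as pd

df = pd.DataFrame({
    'name': ['ACME Corp.', 'Acme Corporation', 'Widgets Inc'],
    'city': ['Springfield', 'Springfield', 'Shelbyville'],
})
deduped = dedupe_dataframe(df, ['name', 'city'], canonicalize=True,
                           config_name='toy_dedupe', recall_weight=1)
print(deduped[['cluster id', 'confidence']])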