dedupe_illegal_books.py
# Import necessary libraries
import csv

import dedupe
import pandas as pd
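# NOTE: this script assumes the dedupe 1.x API (sample / consoleLabel /
# threshold / match); dedupe 2.x renamed these to prepare_training,
# console_label and partition.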
# Paths to key files
input_file = "data/reduced_super_book.csv"
output_file = "data/dedupe_matches.csv"
settings_file = "data/dedupe_learned_settings.csv"  # binary learned settings, despite the .csv extension
training_file = "data/dedupe_training.json"
# Data was preprocessed in R.
# Read in the csv:
df = pd.read_csv(input_file)
# Replace NaN with None (dedupe requires missing values to be None)
df = df.where((pd.notnull(df)), None)
# Convert into dictionary of records
data = df.to_dict(orient="index")
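# The dictionary keys are the DataFrame's 0-based integer index; the
# output-writing pass at the bottom relies on these matching the CSV row order.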
# Create a variables list for Dedupe to work with
variables = [
    {'field': 'primary_author_name', 'type': 'String'},
    {'field': 'full_book_title', 'type': 'String'},
]
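# The 'String' type compares fields with dedupe's affine-gap string distance,
# a reasonable default for short free-text fields like names and titles.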
# Initialise deduplication object
deduper = dedupe.Dedupe(variables)
# Train the model
deduper.sample(data, 150000, .5)  # Draw 150,000 candidate record pairs for training (blocked_proportion=0.5 mixes blocked and random pairs)
dedupe.consoleLabel(deduper)  # Command-line tool for manually labelling training pairs
deduper.train()  # Learn field weights and blocking rules from the labelled pairs
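# consoleLabel shows candidate pairs one at a time and asks
# (y)es / (n)o / (u)nsure / (f)inished; the usual guidance is to label at
# least ten positive and ten negative pairs before finishing.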
# Write training to disk
with open(training_file, 'w') as tf:
    deduper.writeTraining(tf)
with open(settings_file, 'wb') as sf:
    deduper.writeSettings(sf)
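# A trained model can be reloaded later without repeating the labelling step.
# A minimal sketch, assuming the dedupe 1.x StaticDedupe interface:
#
#     with open(settings_file, 'rb') as sf:
#         deduper = dedupe.StaticDedupe(sf)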
# Now make our predictions
threshold = deduper.threshold(data, recall_weight=1)  # recall_weight=1 weights precision and recall equally
print("clustering ...")
clustered_dupes = deduper.match(data, threshold)
print("# duplicate sets", len(clustered_dupes))
# Write the predictions to CSV
cluster_membership = {}
cluster_id = 0
for cluster_id, cluster in enumerate(clustered_dupes):
    id_set, scores = cluster
    cluster_d = [data[c] for c in id_set]
    canonical_rep = dedupe.canonicalize(cluster_d)
    for record_id, score in zip(id_set, scores):
        cluster_membership[record_id] = {
            "cluster id": cluster_id,
            "canonical representation": canonical_rep,
            "confidence": score,
        }
singleton_id = cluster_id + 1
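# Records that fall into no cluster are written out as singletons below,
# each given a fresh cluster id starting after the last real cluster.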
row_id = -1
with open(output_file, 'w', newline='') as f_output, open(input_file) as f_input:
    writer = csv.writer(f_output)
    reader = csv.reader(f_input)
    heading_row = next(reader)
    heading_row.insert(0, 'confidence_score')
    heading_row.insert(0, 'Cluster ID')
    canonical_keys = canonical_rep.keys()  # same fields for every cluster; assumes at least one cluster was found
    for key in canonical_keys:
        heading_row.append('canonical_' + key)
    writer.writerow(heading_row)
    for row in reader:
        row_id += 1  # data rows line up with the 0-based DataFrame index used as record ids
        if row_id in cluster_membership:
            cluster_id = cluster_membership[row_id]["cluster id"]
            canonical_rep = cluster_membership[row_id]["canonical representation"]
            row.insert(0, cluster_membership[row_id]['confidence'])
            row.insert(0, cluster_id)
            for key in canonical_keys:
                row.append(canonical_rep[key])  # write the string as-is; .encode('utf8') would emit b'...' literals in Python 3
        else:
            row.insert(0, None)
            row.insert(0, singleton_id)
            singleton_id += 1
            for key in canonical_keys:
                row.append(None)
        writer.writerow(row)