def _readTraining(self, file_name, training_pairs):
    """Read labeled training pairs from a JSON file.

    Parameters
    ----------
    file_name : str
        Path to a JSON file mapping labels ("0"/"1") to lists of
        record pairs.
    training_pairs : dict
        NOTE: this argument is ignored and rebuilt from the file;
        it is kept only for interface compatibility with callers.

    Returns
    -------
    (training_pairs, training_data) : tuple
        ``training_pairs`` is ``{0: [...], 1: [...]}`` where each entry
        is a pair of ``core.frozendict`` records; ``training_data`` is
        the result of ``training.addTrainingData``.
    """
    with open(file_name, "r") as f:
        training_pairs_raw = json.load(f)

    # Rebuild the label -> [pair, ...] mapping with hashable records.
    # .items() instead of .iteritems(): works on both Python 2 and 3.
    training_pairs = {0: [], 1: []}
    for label, examples in training_pairs_raw.items():
        for pair in examples:
            training_pairs[int(label)].append(
                (core.frozendict(pair[0]), core.frozendict(pair[1])))

    training_data = training.addTrainingData(training_pairs,
                                             self.data_model,
                                             self.training_data)

    return (training_pairs, training_data)
def _readTraining(self, file_name, training_pairs):
    """Read labeled training pairs from a JSON file.

    The file is decoded with ``self.training_decoder`` (a custom
    ``json.JSONDecoder`` subclass) and is expected to map labels
    ("0"/"1") to lists of record pairs.

    Parameters
    ----------
    file_name : str
        Path to the JSON training file.
    training_pairs : dict
        NOTE: this argument is ignored and rebuilt from the file;
        it is kept only for interface compatibility with callers.

    Returns
    -------
    (training_pairs, training_data) : tuple
        ``training_pairs`` is ``{0: [...], 1: [...]}`` where each entry
        is a pair of ``core.frozendict`` records; ``training_data`` is
        the result of ``training.addTrainingData``.
    """
    with open(file_name, 'r') as f:
        training_pairs_raw = json.load(f, cls=self.training_decoder)

    # Rebuild the label -> [pair, ...] mapping with hashable records.
    # .items() instead of .iteritems(): works on both Python 2 and 3.
    training_pairs = {0: [], 1: []}
    for label, examples in training_pairs_raw.items():
        for pair in examples:
            training_pairs[int(label)].append(
                (core.frozendict(pair[0]), core.frozendict(pair[1])))

    training_data = training.addTrainingData(training_pairs,
                                             self.data_model,
                                             self.training_data)

    return (training_pairs, training_data)
def makeSampleDict(session_id, fields):
    """Fetch all processed records that are not yet in the entity map.

    Reflects the session-specific ``processed_<id>`` and ``entity_<id>``
    tables and returns the requested ``fields`` for every processed
    record with no entry in the entity map, i.e.::

        SELECT p.<fields>
        FROM processed AS p
        LEFT JOIN entity AS e ON p.record_id = e.record_id
        WHERE e.target_record_id IS NULL

    Parameters
    ----------
    session_id : str
        Suffix identifying the per-session tables.
    fields : sequence of str
        Column names of the processed table to select.

    Returns
    -------
    dict
        Maps a 0-based enumeration index to a ``frozendict`` of
        ``{field: value}`` for each matching row.
    """
    session = worker_session
    engine = session.bind
    metadata = MetaData()
    proc_table = Table('processed_%s' % session_id, metadata,
                       autoload=True, autoload_with=engine)
    entity_table = Table('entity_%s' % session_id, metadata,
                         autoload=True, autoload_with=engine)

    cols = [getattr(proc_table.c, f) for f in fields]

    # .is_(None) renders as "IS NULL" explicitly (equivalent to the
    # SQLAlchemy "== None" idiom, but unambiguous to readers/linters).
    curs = session.query(*cols)\
                  .outerjoin(entity_table,
                             proc_table.c.record_id ==
                             entity_table.c.record_id)\
                  .filter(entity_table.c.target_record_id.is_(None))

    result = dict((i, frozendict(zip(fields, row)))
                  for i, row in enumerate(curs))
    return result
def canonicalImport(filename):
    """Load a labeled CSV of records and derive the true-duplicate pairs.

    Rows sharing the same ``unique_id`` column value are true duplicates.
    Every other column is lightly normalized ("road" -> "rd", collapsed
    whitespace, stripped quotes).

    Parameters
    ----------
    filename : str
        Path to a CSV file whose header includes a ``unique_id`` column.

    Returns
    -------
    (data_d, header, duplicates_s) : tuple
        ``data_d`` maps row index -> ``frozendict`` of normalized fields;
        ``header`` is the list of column names; ``duplicates_s`` is a set
        of ``frozenset`` pairs of row indices that are true duplicates.
    """
    data_d = {}
    duplicates_d = {}
    with open(filename) as f:
        reader = csv.reader(f)
        # next(reader) instead of reader.next(): the builtin works on
        # both Python 2.6+ and Python 3; the method is Python-2-only.
        header = next(reader)
        for i, row in enumerate(reader):
            instance = {}
            for j, col in enumerate(row):
                if header[j] == "unique_id":
                    # unique_id marks ground-truth clusters, not data.
                    duplicates_d.setdefault(col, []).append(i)
                else:
                    # NOTE: heavier normalization (stop words, stripping
                    # punctuation/abbreviations) was considered but is
                    # deliberately limited to these two substitutions.
                    col = re.sub(r"\broad\b", "rd", col)
                    col = re.sub(" +", " ", col)
                    instance[header[j]] = col.strip().strip('"').strip("'")
            data_d[i] = frozendict(instance)

    # Expand each cluster of size > 1 into all unordered index pairs.
    duplicates_s = set()
    for unique_id in duplicates_d:
        if len(duplicates_d[unique_id]) > 1:
            for pair in combinations(duplicates_d[unique_id], 2):
                duplicates_s.add(frozenset(pair))

    return (data_d, header, duplicates_s)