def index_all(self, data: Data):
    """Build an index for every indexed field from the records in *data*.

    For each field in ``self.index_fields``, collect the distinct truthy
    values appearing in the records and hand that set to ``self.index``.
    Falsy values (empty strings, None, 0, ...) are skipped.
    """
    records = data.values()
    for field in self.index_fields:
        field_values = set()
        for record in records:
            value = record[field]
            if value:
                field_values.add(value)
        self.index(field_values, field)
def training_data_dedupe(data: Data,
                         common_key: str,
                         training_size: int = 50000) -> TrainingData:  # pragma: nocover
    """Construct training data for consumption by the ActiveLearning
    markPairs method from an already deduplicated dataset.

    Args:
        data: Dictionary of records, where the keys are record_ids and
              the values are dictionaries with the keys being field names
        common_key: The name of the record field that uniquely identifies
                    a match
        training_size: the rough limit of the number of training examples,
                       defaults to 50000

    .. note::

        Every match must be identified by the sharing of a common key.
        This function assumes that if two records do not share a common
        key then they are distinct records.
    """
    identified_records: Dict[str, List[RecordID]]
    identified_records = collections.defaultdict(list)
    matched_pairs: Set[Tuple[RecordID, RecordID]] = set()
    unique_record_ids: Set[RecordID] = set()

    # a list of record_ids associated with each common_key
    for record_id, record in data.items():
        unique_record_ids.add(record_id)
        identified_records[record[common_key]].append(record_id)

    # all combinations of matched_pairs from each common_key group;
    # sorting gives every matched pair one canonical (low, high) form
    for record_ids in identified_records.values():
        if len(record_ids) > 1:
            matched_pairs.update(
                itertools.combinations(sorted(record_ids), 2))  # type: ignore

    # calculate indices using dedupe.core.randomPairs to avoid
    # the memory cost of enumerating all possible pairs
    unique_record_ids_l = list(unique_record_ids)
    pair_indices = randomPairs(len(unique_record_ids), training_size)
    distinct_pairs: Set[Tuple[RecordID, RecordID]] = set()
    for i, j in pair_indices:
        # BUG FIX: canonicalize each sampled pair into the same sorted
        # order used for matched_pairs. unique_record_ids_l is built from
        # a set, so its order is arbitrary; without this normalization a
        # true match sampled in reverse order would survive the
        # subtraction below and be mislabeled as 'distinct'.
        key_a = unique_record_ids_l[i]
        key_b = unique_record_ids_l[j]
        if key_b < key_a:
            key_a, key_b = key_b, key_a
        distinct_pairs.add((key_a, key_b))

    distinct_pairs -= matched_pairs

    matched_records = [(data[key_1], data[key_2])
                       for key_1, key_2 in matched_pairs]
    distinct_records = [(data[key_1], data[key_2])
                        for key_1, key_2 in distinct_pairs]

    training_pairs: TrainingData
    training_pairs = {'match': matched_records,
                      'distinct': distinct_records}

    return training_pairs
def sqlite_id_type(data: Data) -> Literal['text', 'integer']:
    """Choose the SQLite column type suited to this dataset's record ids.

    Inspects the exact type of one (arbitrary) record id: ``str`` and
    ``bytes`` map to ``'text'``, ``int`` maps to ``'integer'``.

    Raises:
        ValueError: if the sampled record id is of any other type.
        StopIteration: if *data* is empty.
    """
    # NOTE: exact-type lookup on purpose — e.g. bool (a subclass of int)
    # must NOT be accepted, matching a `type(x) is int` style check.
    type_map = {bytes: 'text', str: 'text', int: 'integer'}
    sample_id = next(iter(data.keys()))
    column_type = type_map.get(type(sample_id))
    if column_type is None:
        raise ValueError('Invalid type for record id')
    return column_type
def training_data_link(data_1: Data,
                       data_2: Data,
                       common_key: str,
                       training_size: int = 50000) -> TrainingData:  # pragma: nocover
    '''
    Construct training data for consumption by the func:`mark_pairs`
    method from already linked datasets.

    Args:
        data_1: Dictionary of records from first dataset, where the
                keys are record_ids and the values are dictionaries
                with the keys being field names
        data_2: Dictionary of records from second dataset, same form
                as data_1
        common_key: The name of the record field that uniquely identifies
                    a match
        training_size: the rough limit of the number of training examples,
                       defaults to 50000

    .. note::

        Every match must be identified by the sharing of a common key.
        This function assumes that if two records do not share a common
        key then they are distinct records.
    '''
    # Group record ids from both datasets under their shared key value:
    # key value -> ([ids from data_1], [ids from data_2])
    grouped: Dict[str, Tuple[List[RecordID], List[RecordID]]]
    grouped = collections.defaultdict(lambda: ([], []))
    for record_id, record in data_1.items():
        grouped[record[common_key]][0].append(record_id)
    for record_id, record in data_2.items():
        grouped[record[common_key]][1].append(record_id)

    # Every cross-product within a group is a known match. Pairs are
    # always oriented (data_1 id, data_2 id).
    matched_pairs: Set[Tuple[RecordID, RecordID]] = set()
    for left_ids, right_ids in grouped.values():
        if left_ids and right_ids:
            matched_pairs.update(itertools.product(left_ids, right_ids))

    # Sample cross-dataset pairs at random and keep those not known to
    # match; sampling avoids enumerating the full cross product.
    ids_1 = list(data_1.keys())
    ids_2 = list(data_2.keys())
    sampled_indices = randomPairsMatch(len(data_1), len(data_2),
                                       training_size)
    candidate_pairs = ((ids_1[i], ids_2[j]) for i, j in sampled_indices)
    distinct_pairs: Set[Tuple[RecordID, RecordID]] = {
        pair for pair in candidate_pairs if pair not in matched_pairs}

    matched_records = [(data_1[left], data_2[right])
                       for left, right in matched_pairs]
    distinct_records = [(data_1[left], data_2[right])
                        for left, right in distinct_pairs]

    training_pairs: TrainingData
    training_pairs = {'match': matched_records,
                      'distinct': distinct_records}

    return training_pairs