예제 #1
0
파일: blocking.py 프로젝트: ahill187/dedupe
 def index_all(self, data: Data):
     """Index the distinct, truthy values of every field in ``self.index_fields``.

     Args:
         data: Mapping whose values are records; each record is subscripted
             with the field names listed in ``self.index_fields``.

     For each field, the set of non-empty values seen across all records
     is handed to ``self.index`` together with the field name.
     """
     for field_name in self.index_fields:
         seen_values = set()
         for row in data.values():
             # falsy values (empty strings, None, 0, ...) are not indexed
             if row[field_name]:
                 seen_values.add(row[field_name])
         self.index(seen_values, field_name)
예제 #2
0
def training_data_dedupe(data: Data,
                         common_key: str,
                         training_size: int = 50000) -> TrainingData:  # pragma: nocover
    """Construct training data for consumption by the ``mark_pairs``
    method from an already deduplicated dataset.

    Args:
        data: Dictionary of records, where the keys are record_ids and
            the values are dictionaries with the keys being
            field names

        common_key: The name of the record field that uniquely identifies
            a match

        training_size: the rough limit of the number of training examples,
            defaults to 50000

    Returns:
        A dictionary with two keys, ``'match'`` and ``'distinct'``, each
        holding a list of ``(record, record)`` tuples taken from ``data``.

    Raises:
        KeyError: if a record does not contain ``common_key``.

    .. note::

        Every match must be identified by the sharing of a common key.
        This function assumes that if two records do not share a common key
        then they are distinct records.
    """
    identified_records: Dict[str, List[RecordID]]
    identified_records = collections.defaultdict(list)

    # a list of record_ids associated with each common_key
    for record_id, record in data.items():
        identified_records[record[common_key]].append(record_id)

    # all combinations of matched_pairs from each common_key group;
    # because each group is sorted first, every matched pair is stored
    # with the smaller record_id in the first position
    matched_pairs: Set[Tuple[RecordID, RecordID]] = set()
    for record_ids in identified_records.values():
        if len(record_ids) > 1:
            matched_pairs.update(itertools.combinations(sorted(record_ids), 2))  # type: ignore

    # calculate indices using dedupe.core.randomPairs to avoid
    # the memory cost of enumerating all possible pairs
    # (the keys of ``data`` are already unique, no extra set is needed)
    all_record_ids = list(data.keys())
    pair_indices = randomPairs(len(all_record_ids), training_size)

    # A sampled pair follows the order of ``all_record_ids``, which is not
    # necessarily sorted, so it may be the mirror image of a matched pair.
    # Check both orientations so a true match is never labelled distinct.
    distinct_pairs: Set[Tuple[RecordID, RecordID]] = set()
    for i, j in pair_indices:
        key_1, key_2 = all_record_ids[i], all_record_ids[j]
        if ((key_1, key_2) not in matched_pairs
                and (key_2, key_1) not in matched_pairs):
            distinct_pairs.add((key_1, key_2))

    matched_records = [(data[key_1], data[key_2])
                       for key_1, key_2 in matched_pairs]

    distinct_records = [(data[key_1], data[key_2])
                        for key_1, key_2 in distinct_pairs]

    training_pairs: TrainingData
    training_pairs = {'match': matched_records,
                      'distinct': distinct_records}

    return training_pairs
예제 #3
0
파일: core.py 프로젝트: shajinzheng/dedupe
def sqlite_id_type(data: Data) -> Literal['text', 'integer']:
    """Return the id type name ('text' or 'integer') for the keys of ``data``.

    Only the first key of ``data`` is inspected; all other keys are
    assumed to share its type.

    Args:
        data: Dictionary of records whose keys are the record ids.

    Returns:
        ``'text'`` when the first key is exactly a ``str`` or ``bytes``,
        ``'integer'`` when it is exactly an ``int``.

    Raises:
        ValueError: if ``data`` is empty, or if the first key is of any
            other type (subclasses such as ``bool`` are rejected too,
            because the check is on the exact type).
    """
    try:
        example = next(iter(data.keys()))
    except StopIteration:
        # An escaping StopIteration is easy to misread and is turned into
        # a RuntimeError inside generators; report invalid input instead.
        raise ValueError('Cannot infer record id type from empty data') from None

    python_type = type(example)

    if python_type is bytes or python_type is str:
        return 'text'
    elif python_type is int:
        return 'integer'
    else:
        raise ValueError('Invalid type for record id')
예제 #4
0
def training_data_link(
        data_1: Data,
        data_2: Data,
        common_key: str,
        training_size: int = 50000) -> TrainingData:  # pragma: nocover
    '''
    Build training data for the :func:`mark_pairs` method out of two
    datasets whose links are already known.

    Args:

        data_1: Dictionary of records from the first dataset; keys are
                record_ids, values are dictionaries keyed by field name
        data_2: Dictionary of records from the second dataset, in the
                same form as data_1
        common_key: The name of the record field that uniquely identifies
                    a match
        training_size: the rough limit of the number of training examples,
                       defaults to 50000

    Returns:

        A dictionary with the keys ``'match'`` and ``'distinct'``, each
        holding a list of ``(record from data_1, record from data_2)``
        tuples.

    .. note::

         Every match must be identified by the sharing of a common key.
         This function assumes that if two records do not share a common key
         then they are distinct records.
    '''

    # common_key value -> (record_ids seen in data_1, record_ids seen in data_2)
    ids_by_key: Dict[str, Tuple[List[RecordID], List[RecordID]]]
    ids_by_key = collections.defaultdict(lambda: ([], []))

    for side, dataset in enumerate((data_1, data_2)):
        for record_id, record in dataset.items():
            ids_by_key[record[common_key]][side].append(record_id)

    # every cross-dataset pairing within one common_key group is a match
    matched_pairs: Set[Tuple[RecordID, RecordID]] = set()
    for left_ids, right_ids in ids_by_key.values():
        if left_ids and right_ids:
            matched_pairs.update(itertools.product(left_ids, right_ids))

    all_ids_1 = list(data_1.keys())
    all_ids_2 = list(data_2.keys())

    # sample cross-dataset pairs by index, keeping only those that are
    # not known matches
    distinct_pairs: Set[Tuple[RecordID, RecordID]] = set()
    for i, j in randomPairsMatch(len(data_1), len(data_2), training_size):
        candidate = (all_ids_1[i], all_ids_2[j])
        if candidate not in matched_pairs:
            distinct_pairs.add(candidate)

    training_pairs: TrainingData
    training_pairs = {
        'match': [(data_1[id_1], data_2[id_2])
                  for id_1, id_2 in matched_pairs],
        'distinct': [(data_1[id_1], data_2[id_2])
                     for id_1, id_2 in distinct_pairs],
    }

    return training_pairs