예제 #1
0
파일: core.py 프로젝트: shajinzheng/dedupe
def sqlite_id_type(data: Data) -> Literal['text', 'integer']:

    example = next(iter(data.keys()))
    python_type = type(example)

    if python_type is bytes or python_type is str:
        return 'text'
    elif python_type is int:
        return 'integer'
    else:
        raise ValueError('Invalid type for record id')
예제 #2
0
def training_data_link(
        data_1: Data,
        data_2: Data,
        common_key: str,
        training_size: int = 50000) -> TrainingData:  # pragma: nocover
    '''
    Construct training data for consumption by the func:`mark_pairs`
    method from already linked datasets.

    Args:

        data_1: Dictionary of records from first dataset, where the
                keys are record_ids and the values are dictionaries
                with the keys being field names
        data_2: Dictionary of records from second dataset, same form as
                data_1
        common_key: The name of the record field that uniquely identifies
                    a match
        training_size: the rough limit of the number of training examples,
                       defaults to 50000

    .. note::

         Every match must be identified by the sharing of a common key.
         This function assumes that if two records do not share a common key
         then they are distinct records.
    '''

    identified_records: Dict[str, Tuple[List[RecordID], List[RecordID]]]
    identified_records = collections.defaultdict(lambda: ([], []))
    matched_pairs: Set[Tuple[RecordID, RecordID]] = set()
    distinct_pairs: Set[Tuple[RecordID, RecordID]] = set()

    for record_id, record in data_1.items():
        identified_records[record[common_key]][0].append(record_id)

    for record_id, record in data_2.items():
        identified_records[record[common_key]][1].append(record_id)

    for keys_1, keys_2 in identified_records.values():
        if keys_1 and keys_2:
            matched_pairs.update(itertools.product(keys_1, keys_2))

    keys_1 = list(data_1.keys())
    keys_2 = list(data_2.keys())

    random_pairs = [
        (keys_1[i], keys_2[j])
        for i, j in randomPairsMatch(len(data_1), len(data_2), training_size)
    ]

    distinct_pairs = {
        pair
        for pair in random_pairs if pair not in matched_pairs
    }

    matched_records = [(data_1[key_1], data_2[key_2])
                       for key_1, key_2 in matched_pairs]
    distinct_records = [(data_1[key_1], data_2[key_2])
                        for key_1, key_2 in distinct_pairs]

    training_pairs: TrainingData
    training_pairs = {'match': matched_records, 'distinct': distinct_records}

    return training_pairs