def sample_product(self, data_1, data_2, blocked_proportion, sample_size):
    # Record ids in data_2 are offset by len(data_1) so that keys from
    # the two datasets never collide.
    offset = len(data_1)

    blocked_sample_size = int(blocked_proportion * sample_size)
    predicates = list(self.data_model.predicates(index_predicates=False))

    deque_1 = sampling.randomDeque(data_1)
    deque_2 = sampling.randomDeque(data_2)

    # Draw pairs that share a blocking predicate: cheap candidates
    # that are more likely to be matches than uniformly random pairs.
    blocked_sample_keys = sampling.linkBlockedSample(blocked_sample_size,
                                                     predicates,
                                                     deque_1,
                                                     deque_2)

    # Fill the rest of the budget with uniformly random cross pairs.
    random_sample_size = sample_size - len(blocked_sample_keys)
    random_sample_keys = core.randomPairsMatch(len(deque_1),
                                               len(deque_2),
                                               random_sample_size)
    random_sample_keys = {(a, b + offset) for a, b in random_sample_keys}

    self.candidates = [(data_1[k1], data_2[k2])
                       for k1, k2 in blocked_sample_keys | random_sample_keys]

    self.distances = self.transform(self.candidates)
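# A minimal standalone sketch of the key-offset scheme used above,
# assuming (as core.index does with an offset) that data_1 is keyed
# from 0 and data_2 from len(data_1); names and values are illustrative.
data_1 = {0: {'name': 'ann'}, 1: {'name': 'bob'}}
offset = len(data_1)  # 2
data_2 = {offset + i: rec
          for i, rec in enumerate([{'name': 'anne'}, {'name': 'carl'}])}

# randomPairsMatch-style keys index each dataset separately; shifting
# the second index by `offset` makes the combined pair keys unambiguous.
random_sample_keys = {(0, 0), (1, 1)}
shifted = {(a, b + offset) for a, b in random_sample_keys}
candidates = [(data_1[k1], data_2[k2]) for k1, k2 in sorted(shifted)]
# -> [({'name': 'ann'}, {'name': 'anne'}), ({'name': 'bob'}, {'name': 'carl'})]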
import collections
import itertools

# randomPairsMatch is expected from the surrounding module
# (dedupe.core in older releases of the upstream project).


def trainingDataLink(data_1, data_2, common_key, training_size=50000):  # pragma: nocover
    '''
    Construct training data for consumption by the ActiveLearning
    markPairs method from already linked datasets.

    Arguments:

    data_1 -- Dictionary of records from first dataset, where the
              keys are record_ids and the values are dictionaries
              with the keys being field names

    data_2 -- Dictionary of records from second dataset, same form
              as data_1

    common_key -- The name of the record field that uniquely
                  identifies a match

    training_size -- the rough limit of the number of training
                     examples, defaults to 50000

    Warning:

    Every match must be identified by the sharing of a common key.
    This function assumes that if two records do not share a common
    key then they are distinct records.
    '''
    # Group record ids from both datasets by their common-key value.
    identified_records = collections.defaultdict(lambda: [[], []])
    matched_pairs = set()

    for record_id, record in data_1.items():
        identified_records[record[common_key]][0].append(record_id)

    for record_id, record in data_2.items():
        identified_records[record[common_key]][1].append(record_id)

    # Every cross-product of ids that share a key value is a match.
    for keys_1, keys_2 in identified_records.values():
        if keys_1 and keys_2:
            matched_pairs.update(itertools.product(keys_1, keys_2))

    keys_1 = list(data_1.keys())
    keys_2 = list(data_2.keys())

    random_pairs = [(keys_1[i], keys_2[j])
                    for i, j in randomPairsMatch(len(data_1),
                                                 len(data_2),
                                                 training_size)]

    # Random pairs that are not known matches are treated as distinct.
    distinct_pairs = (pair for pair in random_pairs
                      if pair not in matched_pairs)

    matched_records = [(data_1[key_1], data_2[key_2])
                       for key_1, key_2 in matched_pairs]
    distinct_records = [(data_1[key_1], data_2[key_2])
                        for key_1, key_2 in distinct_pairs]

    training_pairs = {'match': matched_records,
                      'distinct': distinct_records}

    return training_pairs
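# Hedged usage sketch for trainingDataLink: two toy datasets linked by
# a shared 'ssn' field; record ids and field names are illustrative.
data_1 = {'a1': {'ssn': '123', 'name': 'Ann'},
          'a2': {'ssn': '456', 'name': 'Bob'}}
data_2 = {'b1': {'ssn': '123', 'name': 'Ann B.'},
          'b2': {'ssn': '789', 'name': 'Carl'}}

training_pairs = trainingDataLink(data_1, data_2,
                                  common_key='ssn',
                                  training_size=10)
# training_pairs['match'] holds the pair of records sharing ssn '123';
# training_pairs['distinct'] holds randomly drawn non-matching pairs.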
def sample(self, data_1, data_2, sample_size=150000,
           blocked_proportion=.5):
    '''
    Draws a random sample of combinations of records from
    the first and second datasets, and initializes active
    learning with this sample

    Arguments:

    data_1 -- Dictionary of records from first dataset, where the
              keys are record_ids and the values are dictionaries
              with the keys being field names

    data_2 -- Dictionary of records from second dataset, same form
              as data_1

    sample_size -- Size of the sample to draw

    blocked_proportion -- Proportion of the sample drawn from pairs
                          that share a blocking predicate, defaults
                          to 0.5
    '''
    if len(data_1) == 0:
        raise ValueError(
            'Dictionary of records from first dataset is empty.')
    elif len(data_2) == 0:
        raise ValueError(
            'Dictionary of records from second dataset is empty.')

    # Ensure data_1 is the smaller dataset.
    if len(data_1) > len(data_2):
        data_1, data_2 = data_2, data_1

    data_1 = core.index(data_1)
    self.sampled_records_1 = Sample(data_1, 500)

    offset = len(data_1)
    data_2 = core.index(data_2, offset)
    self.sampled_records_2 = Sample(data_2, 500)

    blocked_sample_size = int(blocked_proportion * sample_size)
    predicates = list(self.data_model.predicates(index_predicates=False))

    deque_1 = sampling.randomDeque(data_1)
    deque_2 = sampling.randomDeque(data_2)

    blocked_sample_keys = sampling.linkBlockedSample(blocked_sample_size,
                                                     predicates,
                                                     deque_1,
                                                     deque_2)

    random_sample_size = sample_size - len(blocked_sample_keys)
    random_sample_keys = core.randomPairsMatch(len(deque_1),
                                               len(deque_2),
                                               random_sample_size)
    random_sample_keys = {(a, b + offset)
                          for a, b in random_sample_keys}

    data_sample = ((data_1[k1], data_2[k2])
                   for k1, k2 in blocked_sample_keys | random_sample_keys)

    data_sample = core.freezeData(data_sample)

    self._loadSample(data_sample)
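# Hedged usage sketch, assuming the dedupe 1.x API in which a method
# with this signature lives on RecordLink; the field definition and
# input dictionaries are illustrative.
import dedupe

fields = [{'field': 'name', 'type': 'String'}]
linker = dedupe.RecordLink(fields)

# data_1 and data_2: {record_id: {field_name: value}} dictionaries.
data_1 = {'a1': {'name': 'Ann'}, 'a2': {'name': 'Bob'}}
data_2 = {'b1': {'name': 'Anne'}, 'b2': {'name': 'Carl'}}

linker.sample(data_1, data_2, sample_size=1000, blocked_proportion=0.5)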
def _sample(self: HasDataModel,
            data_1,
            data_2,
            blocked_proportion,
            sample_size) -> List[TrainingExample]:

    offset = len(data_1)

    blocked_sample_size = int(blocked_proportion * sample_size)
    predicates = list(self.data_model.predicates(index_predicates=False))

    deque_1 = sampling.randomDeque(data_1)
    deque_2 = sampling.randomDeque(data_2)

    blocked_sample_keys = sampling.linkBlockedSample(blocked_sample_size,
                                                     predicates,
                                                     deque_1,
                                                     deque_2)

    random_sample_size = sample_size - len(blocked_sample_keys)
    random_sample_keys = core.randomPairsMatch(len(deque_1),
                                               len(deque_2),
                                               random_sample_size)

    unique_random_sample_keys = {(a, b + offset)
                                 for a, b in random_sample_keys}

    return [(data_1[k1], data_2[k2])
            for k1, k2 in blocked_sample_keys | unique_random_sample_keys]
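# Sketch of the type aliases the signature above assumes; the shapes
# are inferred from how the values are used here, not copied verbatim
# from the project's typing module.
from typing import Any, Dict, List, Tuple, Union

RecordID = Union[int, str]
RecordDict = Dict[str, Any]                      # field name -> value
TrainingExample = Tuple[RecordDict, RecordDict]  # (record from data_1, record from data_2)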
def sample(self, data_1, data_2, blocked_proportion, sample_size):
    offset = len(data_1)

    blocked_sample_size = int(blocked_proportion * sample_size)
    # In this variant the predicates come from the distances object
    # rather than a data_model.
    predicates = list(self.distances.predicates(index_predicates=False))

    deque_1 = sampling.randomDeque(data_1)
    deque_2 = sampling.randomDeque(data_2)

    blocked_sample_keys = sampling.linkBlockedSample(
        blocked_sample_size, predicates, deque_1, deque_2)

    random_sample_size = sample_size - len(blocked_sample_keys)
    random_sample_keys = core.randomPairsMatch(len(deque_1),
                                               len(deque_2),
                                               random_sample_size)

    random_sample_keys = {(a, b + offset) for a, b in random_sample_keys}

    return [(data_1[k1], data_2[k2])
            for k1, k2 in blocked_sample_keys | random_sample_keys]
import collections
import itertools
from typing import Dict, List, Set, Tuple

# Data, TrainingData, RecordID and randomPairsMatch are expected from
# the surrounding package (dedupe._typing and dedupe.core upstream).


def training_data_link(data_1: Data,
                       data_2: Data,
                       common_key: str,
                       training_size: int = 50000) -> TrainingData:  # pragma: nocover
    '''
    Construct training data for consumption by the :func:`mark_pairs`
    method from already linked datasets.

    Args:
        data_1: Dictionary of records from first dataset, where the
            keys are record_ids and the values are dictionaries with
            the keys being field names
        data_2: Dictionary of records from second dataset, same form
            as data_1
        common_key: The name of the record field that uniquely
            identifies a match
        training_size: the rough limit of the number of training
            examples, defaults to 50000

    .. note::

        Every match must be identified by the sharing of a common key.
        This function assumes that if two records do not share a common
        key then they are distinct records.
    '''
    identified_records: Dict[str, Tuple[List[RecordID], List[RecordID]]]
    identified_records = collections.defaultdict(lambda: ([], []))
    matched_pairs: Set[Tuple[RecordID, RecordID]] = set()
    distinct_pairs: Set[Tuple[RecordID, RecordID]] = set()

    for record_id, record in data_1.items():
        identified_records[record[common_key]][0].append(record_id)

    for record_id, record in data_2.items():
        identified_records[record[common_key]][1].append(record_id)

    for keys_1, keys_2 in identified_records.values():
        if keys_1 and keys_2:
            matched_pairs.update(itertools.product(keys_1, keys_2))

    keys_1 = list(data_1.keys())
    keys_2 = list(data_2.keys())

    random_pairs = [(keys_1[i], keys_2[j])
                    for i, j in randomPairsMatch(len(data_1),
                                                 len(data_2),
                                                 training_size)]

    distinct_pairs = {pair for pair in random_pairs
                      if pair not in matched_pairs}

    matched_records = [(data_1[key_1], data_2[key_2])
                       for key_1, key_2 in matched_pairs]
    distinct_records = [(data_1[key_1], data_2[key_2])
                        for key_1, key_2 in distinct_pairs]

    training_pairs: TrainingData
    training_pairs = {'match': matched_records,
                      'distinct': distinct_records}

    return training_pairs
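# Hedged usage sketch, assuming the dedupe 2.x flow suggested by the
# docstring's reference to :func:`mark_pairs`: the returned TrainingData
# is fed to a configured RecordLink instance (here called `linker`,
# with data_1/data_2 as in the trainingDataLink example above).
training_pairs = training_data_link(data_1, data_2, common_key='ssn')
linker.mark_pairs(training_pairs)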