def sample_product(self, data_1, data_2, blocked_proportion, sample_size):
    '''
    Build a sample of record pairs across two datasets (a mix of
    blocked pairs and random pairs) and store it on the instance.

    Sets self.candidates to the list of sampled record pairs and
    self.distances to self.transform(self.candidates).

    Arguments:

    data_1 -- Mapping of records from the first dataset; sampled
              keys are looked up in it directly

    data_2 -- Mapping of records from the second dataset; its keys
              are presumably shifted by len(data_1) -- TODO confirm
              against the caller

    blocked_proportion -- Proportion of sample_size requested from
                          the blocked sampler

    sample_size -- Total number of pairs to aim for
    '''
    # Random pair indices for the second dataset are shifted by this
    # amount before being used as keys into data_2.
    key_shift = len(data_1)
    n_blocked = int(blocked_proportion * sample_size)

    simple_predicates = list(
        self.data_model.predicates(index_predicates=False))

    shuffled_1 = sampling.randomDeque(data_1)
    shuffled_2 = sampling.randomDeque(data_2)

    blocked_keys = sampling.linkBlockedSample(n_blocked,
                                              simple_predicates,
                                              shuffled_1,
                                              shuffled_2)

    # Whatever the blocked sampler did not supply is requested as
    # random pairs.
    n_random = sample_size - len(blocked_keys)
    raw_random_keys = core.randomPairsMatch(len(shuffled_1),
                                            len(shuffled_2),
                                            n_random)
    shifted_random_keys = set(
        (left, right + key_shift) for left, right in raw_random_keys)

    pair_keys = blocked_keys | shifted_random_keys
    self.candidates = [(data_1[left], data_2[right])
                       for left, right in pair_keys]
    self.distances = self.transform(self.candidates)
def sample(self, data_1, data_2, sample_size=150000, blocked_proportion=.5):
    '''
    Draws a random sample of combinations of records from
    the first and second datasets, and initializes active
    learning with this sample

    Raises ValueError if either dataset is empty.

    Arguments:

    data_1 -- Dictionary of records from first dataset, where the
              keys are record_ids and the values are dictionaries
              with the keys being field names

    data_2 -- Dictionary of records from second dataset, same
              form as data_1

    sample_size -- Size of the sample to draw

    blocked_proportion -- Proportion of sample_size requested from
                          the blocked sampler
    '''
    if len(data_1) == 0:
        raise ValueError(
            'Dictionary of records from first dataset is empty.')
    if len(data_2) == 0:
        raise ValueError(
            'Dictionary of records from second dataset is empty.')

    # Make sure the first dataset is never the larger of the two.
    if len(data_2) < len(data_1):
        data_1, data_2 = data_2, data_1

    data_1 = core.index(data_1)
    self.sampled_records_1 = Sample(data_1, 500)

    # The second dataset is indexed starting after the first one.
    offset = len(data_1)
    data_2 = core.index(data_2, offset)
    self.sampled_records_2 = Sample(data_2, 500)

    n_blocked = int(blocked_proportion * sample_size)
    simple_predicates = list(
        self.data_model.predicates(index_predicates=False))

    shuffled_1 = sampling.randomDeque(data_1)
    shuffled_2 = sampling.randomDeque(data_2)

    blocked_keys = sampling.linkBlockedSample(n_blocked,
                                              simple_predicates,
                                              shuffled_1,
                                              shuffled_2)

    # Fill the remainder of the sample with random pairs; their
    # second index is shifted to match the keys of data_2.
    n_random = sample_size - len(blocked_keys)
    raw_random_keys = core.randomPairsMatch(len(shuffled_1),
                                            len(shuffled_2),
                                            n_random)
    shifted_random_keys = set(
        (left, right + offset) for left, right in raw_random_keys)

    pair_keys = blocked_keys | shifted_random_keys
    record_pairs = ((data_1[left], data_2[right])
                    for left, right in pair_keys)

    self._loadSample(core.freezeData(record_pairs))
def _sample(self: HasDataModel,
            data_1,
            data_2,
            blocked_proportion,
            sample_size) -> List[TrainingExample]:
    '''
    Return a list of record pairs drawn across two datasets: pairs
    chosen by the blocked sampler plus random pairs.

    Arguments:

    data_1 -- Mapping of records from the first dataset; sampled
              keys are looked up in it directly

    data_2 -- Mapping of records from the second dataset; its keys
              are presumably shifted by len(data_1) -- TODO confirm
              against the caller

    blocked_proportion -- Proportion of sample_size requested from
                          the blocked sampler

    sample_size -- Total number of pairs to aim for
    '''
    # Random pair indices for the second dataset are shifted by this
    # amount before being used as keys into data_2.
    key_shift = len(data_1)
    n_blocked = int(blocked_proportion * sample_size)

    simple_predicates = list(
        self.data_model.predicates(index_predicates=False))

    shuffled_1 = sampling.randomDeque(data_1)
    shuffled_2 = sampling.randomDeque(data_2)

    blocked_keys = sampling.linkBlockedSample(n_blocked,
                                              simple_predicates,
                                              shuffled_1,
                                              shuffled_2)

    # The rest of the sample is requested as random pairs.
    n_random = sample_size - len(blocked_keys)
    raw_random_keys = core.randomPairsMatch(len(shuffled_1),
                                            len(shuffled_2),
                                            n_random)
    shifted_random_keys = set(
        (left, right + key_shift) for left, right in raw_random_keys)

    pair_keys = blocked_keys | shifted_random_keys
    return [(data_1[left], data_2[right]) for left, right in pair_keys]
def sample(self, data_1, data_2, sample_size=150000, blocked_proportion=.5):
    '''
    Draws a random sample of combinations of records from
    the first and second datasets, and initializes active
    learning with this sample

    Arguments:

    data_1 -- Dictionary of records from first dataset, where the
              keys are record_ids and the values are dictionaries
              with the keys being field names

    data_2 -- Dictionary of records from second dataset, same
              form as data_1

    sample_size -- Size of the sample to draw

    blocked_proportion -- Proportion of sample_size requested from
                          the blocked sampler
    '''
    # Make sure the first dataset is never the larger of the two.
    if len(data_2) < len(data_1):
        data_1, data_2 = data_2, data_1

    data_1 = core.index(data_1)

    # The second dataset is indexed starting after the first one.
    offset = len(data_1)
    data_2 = core.index(data_2, offset)

    n_blocked = int(blocked_proportion * sample_size)

    # Only predicates whose type is 'SimplePredicate' are used.
    simple_predicates = []
    for predicate in predicateGenerator(self.data_model):
        if predicate.type == 'SimplePredicate':
            simple_predicates.append(predicate)

    data_1 = sampling.randomDeque(data_1)
    data_2 = sampling.randomDeque(data_2)

    blocked_keys = sampling.linkBlockedSample(n_blocked,
                                              simple_predicates,
                                              data_1,
                                              data_2)

    # Fill the remainder of the sample with random pairs; their
    # second index is shifted to match the keys of data_2.
    n_random = sample_size - len(blocked_keys)
    raw_random_keys = dedupe.core.randomPairsMatch(len(data_1),
                                                   len(data_2),
                                                   n_random)
    shifted_random_keys = {(left, right + offset)
                           for left, right in raw_random_keys}

    # Back to dictionaries so records can be looked up by key.
    data_1 = dict(data_1)
    data_2 = dict(data_2)

    pair_keys = blocked_keys | shifted_random_keys
    record_pairs = ((data_1[left], data_2[right])
                    for left, right in pair_keys)

    self._loadSample(core.freezeData(record_pairs))
def sample(self, data_1, data_2, sample_size=150000, blocked_proportion=.5):
    '''
    Draws a random sample of combinations of records from
    the first and second datasets, and initializes active
    learning with this sample

    Arguments:

    data_1 -- Dictionary of records from first dataset, where the
              keys are record_ids and the values are dictionaries
              with the keys being field names

    data_2 -- Dictionary of records from second dataset, same
              form as data_1

    sample_size -- Size of the sample to draw

    blocked_proportion -- Proportion of sample_size requested from
                          the blocked sampler
    '''
    def _is_simple(predicate):
        # Keep only predicates whose type is 'SimplePredicate'.
        return predicate.type == 'SimplePredicate'

    # Swap so that the first dataset is not the larger one.
    if len(data_2) < len(data_1):
        data_1, data_2 = data_2, data_1

    data_1 = core.index(data_1)

    # Index the second dataset starting after the first one.
    offset = len(data_1)
    data_2 = core.index(data_2, offset)

    blocked_target = int(blocked_proportion * sample_size)
    usable_predicates = list(
        filter(_is_simple, predicateGenerator(self.data_model)))

    data_1 = sampling.randomDeque(data_1)
    data_2 = sampling.randomDeque(data_2)

    keys_from_blocking = sampling.linkBlockedSample(blocked_target,
                                                    usable_predicates,
                                                    data_1,
                                                    data_2)

    # The random sampler supplies whatever blocking did not.
    random_target = sample_size - len(keys_from_blocking)
    index_pairs = dedupe.core.randomPairsMatch(len(data_1),
                                               len(data_2),
                                               random_target)
    # Shift the second index so it matches the keys of data_2.
    keys_from_random = {(i, j + offset) for i, j in index_pairs}

    # Back to dictionaries so records can be looked up by key.
    data_1 = dict(data_1)
    data_2 = dict(data_2)

    sampled_keys = keys_from_blocking | keys_from_random
    pairs = ((data_1[i], data_2[j]) for i, j in sampled_keys)
    pairs = core.freezeData(pairs)

    self._loadSample(pairs)
def sample(self, data, sample_size=15000, blocked_proportion=0.5):
    '''
    Draw a sample of record pairs from the dataset
    (a mix of random pairs & pairs of similar records)
    and initialize active learning with this sample

    Arguments:

    data -- Dictionary of records, where the keys are record_ids
            and the values are dictionaries with the keys being
            field names

    sample_size -- Size of the sample to draw

    blocked_proportion -- Proportion of the sample that will be
                          blocked
    '''
    data = core.index(data)
    self.sampled_records = Sample(data, 900)

    n_blocked = int(blocked_proportion * sample_size)
    simple_predicates = list(
        self.data_model.predicates(index_predicates=False))

    data = sampling.randomDeque(data)

    blocked_keys = sampling.dedupeBlockedSample(n_blocked,
                                                simple_predicates,
                                                data)

    # Fill the remainder of the sample with random pairs.
    n_random = sample_size - len(blocked_keys)
    random_keys = set(core.randomPairs(len(data), n_random))

    # Back to a dictionary so records can be looked up by key.
    data = dict(data)

    pair_keys = blocked_keys | random_keys
    record_pairs = [(data[left], data[right])
                    for left, right in pair_keys]

    self._loadSample(core.freezeData(record_pairs))
def sample(self, data, sample_size=15000, blocked_proportion=0.5):
    '''
    Draw a sample of record pairs from the dataset
    (a mix of random pairs & pairs of similar records)
    and initialize active learning with this sample

    Arguments:

    data -- Dictionary of records, where the keys are record_ids
            and the values are dictionaries with the keys being
            field names

    sample_size -- Size of the sample to draw

    blocked_proportion -- Proportion of the sample that will be
                          blocked
    '''
    data = core.index(data)
    self.sampled_records = Sample(data, 900)

    blocked_target = int(blocked_proportion * sample_size)
    usable_predicates = list(
        self.data_model.predicates(index_predicates=False))

    data = sampling.randomDeque(data)

    keys_from_blocking = sampling.dedupeBlockedSample(blocked_target,
                                                      usable_predicates,
                                                      data)

    # The random sampler supplies whatever blocking did not.
    random_target = sample_size - len(keys_from_blocking)
    keys_from_random = set(core.randomPairs(len(data), random_target))

    # Back to a dictionary so records can be looked up by key.
    data = dict(data)

    pairs = []
    for first, second in keys_from_blocking | keys_from_random:
        pairs.append((data[first], data[second]))

    self._loadSample(pairs)
def sample(self, data_1, data_2, blocked_proportion, sample_size):
    '''
    Return a list of record pairs drawn across two datasets: pairs
    chosen by the blocked sampler plus random pairs.

    Arguments:

    data_1 -- Mapping of records from the first dataset; sampled
              keys are looked up in it directly

    data_2 -- Mapping of records from the second dataset; its keys
              are presumably shifted by len(data_1) -- TODO confirm
              against the caller

    blocked_proportion -- Proportion of sample_size requested from
                          the blocked sampler

    sample_size -- Total number of pairs to aim for
    '''
    # Random pair indices for the second dataset are shifted by this
    # amount before being used as keys into data_2.
    key_shift = len(data_1)
    n_blocked = int(blocked_proportion * sample_size)

    # NOTE(review): predicates are read from self.distances here;
    # confirm that attribute is the intended source of predicates.
    simple_predicates = list(
        self.distances.predicates(index_predicates=False))

    shuffled_1 = sampling.randomDeque(data_1)
    shuffled_2 = sampling.randomDeque(data_2)

    blocked_keys = sampling.linkBlockedSample(n_blocked,
                                              simple_predicates,
                                              shuffled_1,
                                              shuffled_2)

    # The rest of the sample is requested as random pairs.
    n_random = sample_size - len(blocked_keys)
    raw_random_keys = core.randomPairsMatch(len(shuffled_1),
                                            len(shuffled_2),
                                            n_random)
    shifted_random_keys = set(
        (left, right + key_shift) for left, right in raw_random_keys)

    pair_keys = blocked_keys | shifted_random_keys
    return [(data_1[left], data_2[right]) for left, right in pair_keys]
def sample(self, data, blocked_proportion, sample_size):
    '''
    Return a list of record pairs drawn from a single dataset: pairs
    chosen by the blocked sampler plus random pairs.

    Arguments:

    data -- Collection of records accepted by sampling.randomDeque;
            the shuffled result is turned into a dictionary for
            key lookup

    blocked_proportion -- Proportion of sample_size requested from
                          the blocked sampler

    sample_size -- Total number of pairs to aim for
    '''
    n_blocked = int(blocked_proportion * sample_size)
    simple_predicates = list(
        self.data_model.predicates(index_predicates=False))

    data = sampling.randomDeque(data)

    blocked_keys = sampling.dedupeBlockedSample(n_blocked,
                                                simple_predicates,
                                                data)

    # Fill the remainder of the sample with random pairs.
    n_random = sample_size - len(blocked_keys)
    random_keys = set(core.randomPairs(len(data), n_random))

    # Back to a dictionary so records can be looked up by key.
    data = dict(data)

    pair_keys = blocked_keys | random_keys
    return [(data[left], data[right]) for left, right in pair_keys]
def sample(self, data, blocked_proportion, sample_size):
    '''
    Return a list of record pairs drawn from a single dataset,
    combining pairs from the blocked sampler with random pairs.

    Arguments:

    data -- Collection of records accepted by sampling.randomDeque;
            the shuffled result is turned into a dictionary for
            key lookup

    blocked_proportion -- Proportion of sample_size requested from
                          the blocked sampler

    sample_size -- Total number of pairs to aim for
    '''
    blocked_target = int(blocked_proportion * sample_size)
    usable_predicates = list(
        self.data_model.predicates(index_predicates=False))

    data = sampling.randomDeque(data)

    keys_from_blocking = sampling.dedupeBlockedSample(blocked_target,
                                                      usable_predicates,
                                                      data)

    # The random sampler supplies whatever blocking did not.
    random_target = sample_size - len(keys_from_blocking)
    keys_from_random = set(core.randomPairs(len(data), random_target))

    # Back to a dictionary so records can be looked up by key.
    data = dict(data)

    pairs = []
    for first, second in keys_from_blocking | keys_from_random:
        pairs.append((data[first], data[second]))
    return pairs
def sample(self, data, sample_size=15000, blocked_proportion=0.5):
    '''
    Draw a sample of record pairs from the dataset
    (a mix of random pairs & pairs of similar records)
    and initialize active learning with this sample

    Arguments:

    data -- Dictionary of records, where the keys are record_ids
            and the values are dictionaries with the keys being
            field names

    sample_size -- Size of the sample to draw

    blocked_proportion -- Proportion of the sample that will be
                          blocked
    '''
    data = core.index(data)

    n_blocked = int(blocked_proportion * sample_size)
    simple_predicates = list(
        predicateGenerator(self.data_model,
                           index_predicates=False,
                           canopies=self.canopies))

    data = sampling.randomDeque(data)

    blocked_keys = sampling.dedupeBlockedSample(n_blocked,
                                                simple_predicates,
                                                data)

    # Fill the remainder of the sample with random pairs.
    n_random = sample_size - len(blocked_keys)
    random_keys = set(dedupe.core.randomPairs(len(data), n_random))

    # Back to a dictionary so records can be looked up by key.
    data = dict(data)

    pair_keys = blocked_keys | random_keys
    record_pairs = [(data[left], data[right])
                    for left, right in pair_keys]
    record_pairs = core.freezeData(record_pairs)

    # data can be a very large object, so we'll free it up as soon
    # as possible
    del data

    self._loadSample(record_pairs)
def sample(self, data, sample_size=15000, blocked_proportion=0.5):
    '''
    Draw a sample of record pairs from the dataset
    (a mix of random pairs & pairs of similar records)
    and initialize active learning with this sample

    Arguments:

    data -- Dictionary of records, where the keys are record_ids
            and the values are dictionaries with the keys being
            field names

    sample_size -- Size of the sample to draw

    blocked_proportion -- Proportion of the sample that will be
                          blocked
    '''
    data = core.index(data)

    n_blocked = int(blocked_proportion * sample_size)

    # Only predicates whose type is 'SimplePredicate' are used.
    simple_predicates = []
    for predicate in predicateGenerator(self.data_model):
        if predicate.type == 'SimplePredicate':
            simple_predicates.append(predicate)

    data = sampling.randomDeque(data)

    blocked_keys = sampling.dedupeBlockedSample(n_blocked,
                                                simple_predicates,
                                                data)

    # Fill the remainder of the sample with random pairs.
    n_random = sample_size - len(blocked_keys)
    random_keys = set(dedupe.core.randomPairs(len(data), n_random))

    # Back to a dictionary so records can be looked up by key.
    data = dict(data)

    pair_keys = blocked_keys | random_keys
    # Kept lazy: the pairs are only materialised by the consumer.
    record_pairs = ((data[left], data[right])
                    for left, right in pair_keys)

    self._loadSample(core.freezeData(record_pairs))