def __init__(self, data_model, data_1, data_2, blocked_proportion,
             sample_size, original_length_1, original_length_2):
    """Index both datasets, sample candidate pairs and build the blocker.

    Arguments:

    data_model -- stored on the instance and handed to
                  RecordLinkBlockLearner
    data_1 -- records of the first dataset; passed through core.index
    data_2 -- records of the second dataset; passed through core.index
              with an offset equal to len() of the indexed first dataset
    blocked_proportion -- forwarded to the parent class's sample()
    sample_size -- forwarded to the parent class's sample()
    original_length_1 -- forwarded to RecordLinkBlockLearner
    original_length_2 -- forwarded to RecordLinkBlockLearner
    """
    self.data_model = data_model

    data_1 = core.index(data_1)
    # NOTE(review): the offset presumably keeps the ids of the two
    # datasets from colliding -- confirm against core.index.
    data_2 = core.index(data_2, len(data_1))

    self.candidates = super().sample(
        data_1, data_2, blocked_proportion, sample_size
    )

    self.blocker = RecordLinkBlockLearner(
        data_model,
        self.candidates,
        data_1,
        data_2,
        original_length_1,
        original_length_2,
    )

    self._common_init()
def __init__(self, data_model, data_1, data_2, blocked_proportion,
             sample_size, original_length_1, original_length_2,
             index_include):
    """Index both datasets, sample candidates, build blocker and
    classifier, then seed the learner with five initial labels.

    Arguments:

    data_model -- stored on the instance; handed to
                  RecordLinkBlockLearner and RLRLearner
    data_1 -- records of the first dataset; passed through core.index
    data_2 -- records of the second dataset; passed through core.index
              with an offset equal to len() of the indexed first dataset
    blocked_proportion -- forwarded to self._sample()
    sample_size -- forwarded to self._sample()
    original_length_1 -- forwarded to RecordLinkBlockLearner
    original_length_2 -- forwarded to RecordLinkBlockLearner
    index_include -- object with copy()/append(); a copy extended with
                     one extra pair is forwarded to
                     RecordLinkBlockLearner
    """
    self.data_model = data_model

    data_1 = core.index(data_1)
    data_2 = core.index(data_2, len(data_1))

    self.candidates = self._sample(
        data_1, data_2, blocked_proportion, sample_size
    )

    # Pair the first record of a randomly chosen candidate with itself.
    random_pair = random.choice(self.candidates)
    exact_match = (random_pair[0], random_pair[0])

    # Extend a copy so the object passed in by the caller is not mutated.
    index_include = index_include.copy()
    index_include.append(exact_match)

    self.blocker = RecordLinkBlockLearner(
        data_model,
        self.candidates,
        data_1,
        data_2,
        original_length_1,
        original_length_2,
        index_include,
    )

    self.classifier = RLRLearner(self.data_model)
    self.classifier.candidates = self.candidates

    self._common_init()

    # Four copies of the self-pair labelled 1 plus the random pair
    # labelled 0.  NOTE(review): 1/0 presumably mean match/distinct --
    # confirm against mark().
    seed_pairs = [exact_match, exact_match, exact_match, exact_match,
                  random_pair]
    seed_labels = [1, 1, 1, 1, 0]
    self.mark(seed_pairs, seed_labels)
def sample(self, data_1, data_2, sample_size=15000, blocked_proportion=.5,
           original_length_1=None, original_length_2=None):
    """Draw a sample of record pairs across the two datasets and
    initialize active learning with it.

    Arguments:

    data_1 -- Dictionary of records from the first dataset, where the
              keys are record_ids and the values are dictionaries
              with the keys being field names
    data_2 -- Dictionary of records from the second dataset, same
              form as data_1
    sample_size -- Size of the sample to draw (default 15000)
    blocked_proportion -- forwarded to the active learner's
                          sample_product() (default .5)
    original_length_1 -- length handed to Sample for the first
                         dataset; defaults to len() of the indexed
                         data_1
    original_length_2 -- length handed to Sample for the second
                         dataset; defaults to len() of the indexed
                         data_2
    """
    self._checkData(data_1, data_2)

    # First dataset: index it, then keep a Sample of it on the instance.
    data_1 = core.index(data_1)
    length_1 = (len(data_1) if original_length_1 is None
                else original_length_1)
    self.sampled_records_1 = Sample(data_1, 600, length_1)

    # Second dataset: indexed with an offset of len(data_1).
    data_2 = core.index(data_2, len(data_1))
    length_2 = (len(data_2) if original_length_2 is None
                else original_length_2)
    self.sampled_records_2 = Sample(data_2, 600, length_2)

    self.active_learner = self.ActiveLearner(self.data_model)
    self.active_learner.sample_product(
        data_1, data_2, blocked_proportion, sample_size
    )
def sample_product(self, data_1, data_2, blocked_proportion, sample_size,
                   original_length_1=None, original_length_2=None):
    """Sample candidate pairs across two datasets and initialize the
    classifier and blocker with them.

    Arguments:

    data_1 -- records of the first dataset; passed through core.index
    data_2 -- records of the second dataset; passed through core.index
              with an offset equal to len() of the indexed first dataset
    blocked_proportion -- forwarded to the parent's sample_product()
    sample_size -- forwarded to the parent's sample_product()
    original_length_1 -- handed to Sample for the first dataset
                         (default None)
    original_length_2 -- handed to Sample for the second dataset
                         (default None)

    Returns the pair (Sample of data_1, Sample of data_2).
    """
    data_1 = core.index(data_1)
    data_2 = core.index(data_2, len(data_1))

    self.candidates = super().sample_product(
        data_1, data_2, blocked_proportion, sample_size
    )
    self.classifier._init(self.candidates)

    first_sample = Sample(data_1, 600, original_length_1)
    second_sample = Sample(data_2, 600, original_length_2)

    self.blocker._init_product(
        self.candidates, first_sample, second_sample, data_2
    )

    return first_sample, second_sample
def sample(self, data_1, data_2, sample_size=150000, blocked_proportion=.5):
    """Draw a sample of record pairs across the two datasets (blocked
    pairs plus random pairs) and initialize active learning with it.

    Arguments:

    data_1 -- Dictionary of records from the first dataset, where the
              keys are record_ids and the values are dictionaries
              with the keys being field names
    data_2 -- Dictionary of records from the second dataset, same
              form as data_1
    sample_size -- Size of the sample to draw (default 150000)
    blocked_proportion -- Fraction of sample_size requested from
                          sampling.linkBlockedSample (default .5)

    Raises ValueError when either dataset is empty.
    """
    if not len(data_1):
        raise ValueError(
            'Dictionary of records from first dataset is empty.')
    if not len(data_2):
        raise ValueError(
            'Dictionary of records from second dataset is empty.')

    # From here on the smaller dataset always plays the role of data_1.
    if len(data_2) < len(data_1):
        data_1, data_2 = data_2, data_1

    data_1 = core.index(data_1)
    self.sampled_records_1 = Sample(data_1, 500)

    id_offset = len(data_1)
    data_2 = core.index(data_2, id_offset)
    self.sampled_records_2 = Sample(data_2, 500)

    n_blocked = int(blocked_proportion * sample_size)
    simple_predicates = list(
        self.data_model.predicates(index_predicates=False))

    shuffled_1 = sampling.randomDeque(data_1)
    shuffled_2 = sampling.randomDeque(data_2)

    blocked_keys = sampling.linkBlockedSample(
        n_blocked, simple_predicates, shuffled_1, shuffled_2)

    # Whatever the blocked sample did not supply is drawn at random.
    n_random = sample_size - len(blocked_keys)
    raw_random_keys = core.randomPairsMatch(
        len(shuffled_1), len(shuffled_2), n_random)
    # NOTE(review): the shift presumably lines the second id up with the
    # keys produced by core.index(data_2, offset) -- confirm.
    random_keys = {(left, right + id_offset)
                   for left, right in raw_random_keys}

    record_pairs = ((data_1[key_1], data_2[key_2])
                    for key_1, key_2 in blocked_keys | random_keys)
    record_pairs = core.freezeData(record_pairs)

    self._loadSample(record_pairs)
def sample(self, data_1, data_2, sample_size=150000, blocked_proportion=.5):
    """Draw a sample of record pairs across the two datasets (blocked
    pairs plus random pairs) and initialize active learning with it.

    Arguments:

    data_1 -- Dictionary of records from the first dataset, where the
              keys are record_ids and the values are dictionaries
              with the keys being field names
    data_2 -- Dictionary of records from the second dataset, same
              form as data_1
    sample_size -- Size of the sample to draw (default 150000)
    blocked_proportion -- Fraction of sample_size requested from
                          sampling.linkBlockedSample (default .5)
    """
    # The smaller dataset always plays the role of data_1.
    if len(data_2) < len(data_1):
        data_1, data_2 = data_2, data_1

    data_1 = core.index(data_1)
    id_offset = len(data_1)
    data_2 = core.index(data_2, id_offset)

    n_blocked = int(blocked_proportion * sample_size)

    # Only predicates whose type is 'SimplePredicate' are used.
    simple_predicates = [
        candidate
        for candidate in predicateGenerator(self.data_model)
        if candidate.type == 'SimplePredicate'
    ]

    data_1 = sampling.randomDeque(data_1)
    data_2 = sampling.randomDeque(data_2)

    blocked_keys = sampling.linkBlockedSample(
        n_blocked, simple_predicates, data_1, data_2)

    # Whatever the blocked sample did not supply is drawn at random.
    n_random = sample_size - len(blocked_keys)
    raw_random_keys = dedupe.core.randomPairsMatch(
        len(data_1), len(data_2), n_random)
    # NOTE(review): the shift presumably lines the second id up with the
    # keys produced by core.index(data_2, offset) -- confirm.
    random_keys = {(left, right + id_offset)
                   for left, right in raw_random_keys}

    # Back to dict form so records can be looked up by key.
    data_1 = dict(data_1)
    data_2 = dict(data_2)

    record_pairs = ((data_1[key_1], data_2[key_2])
                    for key_1, key_2 in blocked_keys | random_keys)
    record_pairs = core.freezeData(record_pairs)

    self._loadSample(record_pairs)
def sample(self, data_1, data_2, sample_size=150000, blocked_proportion=.5):
    """Sample pairs of records, one from each dataset, and load them as
    the training sample for active learning.

    Arguments:

    data_1 -- Dictionary of records from the first dataset, where the
              keys are record_ids and the values are dictionaries
              with the keys being field names
    data_2 -- Dictionary of records from the second dataset, same
              form as data_1
    sample_size -- Size of the sample to draw (default 150000)
    blocked_proportion -- Fraction of sample_size requested from
                          sampling.linkBlockedSample (default .5)
    """
    # Swap so that data_1 is never the longer of the two.
    if len(data_1) > len(data_2):
        data_1, data_2 = data_2, data_1

    data_1 = core.index(data_1)
    shift = len(data_1)
    data_2 = core.index(data_2, shift)

    blocked_target = int(blocked_proportion * sample_size)

    # Keep only the predicates whose type is 'SimplePredicate'.
    usable_predicates = [
        p for p in predicateGenerator(self.data_model)
        if p.type == 'SimplePredicate'
    ]

    data_1 = sampling.randomDeque(data_1)
    data_2 = sampling.randomDeque(data_2)

    keys_from_blocking = sampling.linkBlockedSample(
        blocked_target, usable_predicates, data_1, data_2)

    # Fill the rest of the sample with random pairs.
    random_target = sample_size - len(keys_from_blocking)
    unshifted = dedupe.core.randomPairsMatch(
        len(data_1), len(data_2), random_target)
    # NOTE(review): second ids are shifted by len(indexed data_1),
    # presumably to match the keys of the offset-indexed data_2 --
    # confirm against core.index.
    keys_from_random = {(i, j + shift) for i, j in unshifted}

    data_1 = dict(data_1)
    data_2 = dict(data_2)

    all_keys = keys_from_blocking | keys_from_random
    data_sample = ((data_1[i], data_2[j]) for i, j in all_keys)
    data_sample = core.freezeData(data_sample)

    self._loadSample(data_sample)
def __init__(self, distances, data, blocked_proportion, sample_size,
             original_length, index_include):
    """Index the data, sample candidate pairs, build the block learner
    and seed the learner with five initial labels.

    Arguments:

    distances -- stored on the instance and handed to
                 DedupeBlockLearner
    data -- records to deduplicate; passed through core.index
    blocked_proportion -- forwarded to the parent class's sample()
    sample_size -- forwarded to the parent class's sample()
    original_length -- forwarded to DedupeBlockLearner
    index_include -- object with copy()/append(); a copy extended with
                     one extra pair is forwarded to DedupeBlockLearner
    """
    logger.debug("Initializing DedupeDisagreementLearner class")

    self.distances = distances
    logger.debug(
        f"labeler.DedupeDisagreementLearner distances type: {type(distances)}"
    )
    logger.debug(
        f"labeler.DedupeDisagreementLearner self.distances type: {type(self.distances)}"
    )

    data = core.index(data)
    self.candidates = super().sample(data, blocked_proportion, sample_size)

    # Pair the first record of a randomly chosen candidate with itself.
    random_pair = random.choice(self.candidates)
    exact_match = (random_pair[0], random_pair[0])

    # Extend a copy so the object passed in by the caller is not mutated.
    index_include = index_include.copy()
    index_include.append(exact_match)

    self.blocker = DedupeBlockLearner(
        distances,
        self.candidates,
        data,
        original_length,
        index_include,
    )

    self._common_init()

    logger.debug("Initializing with 5 random values")
    # Four copies of the self-pair labelled 1 plus the random pair
    # labelled 0.  NOTE(review): 1/0 presumably mean match/distinct --
    # confirm against mark().
    seed_pairs = [exact_match, exact_match, exact_match, exact_match,
                  random_pair]
    seed_labels = [1, 1, 1, 1, 0]
    self.mark(seed_pairs, seed_labels)
def __init__(self, data_model, data, blocked_proportion, sample_size,
             index_include):
    """Index the data, sample candidates, build blocker and classifier,
    then seed the learner with five initial labels.

    Arguments:

    data_model -- stored on the instance; handed to DedupeBlockLearner
                  and RLRLearner
    data -- records to deduplicate; passed through core.index
    blocked_proportion -- forwarded to self._sample()
    sample_size -- forwarded to self._sample()
    index_include -- object with copy()/append(); a copy extended with
                     one extra pair is forwarded to DedupeBlockLearner
    """
    self.data_model = data_model

    data = core.index(data)
    self.candidates = self._sample(data, blocked_proportion, sample_size)

    # Pair the first record of a randomly chosen candidate with itself.
    random_pair = random.choice(self.candidates)
    exact_match = (random_pair[0], random_pair[0])

    # Extend a copy so the object passed in by the caller is not mutated.
    index_include = index_include.copy()
    index_include.append(exact_match)

    self.blocker = DedupeBlockLearner(
        data_model, self.candidates, data, index_include
    )

    self.classifier = RLRLearner(self.data_model)
    self.classifier.candidates = self.candidates

    self._common_init()

    # Four copies of the self-pair labelled 1 plus the random pair
    # labelled 0.  NOTE(review): 1/0 presumably mean match/distinct --
    # confirm against mark().
    seed_pairs = [exact_match, exact_match, exact_match, exact_match,
                  random_pair]
    seed_labels = [1, 1, 1, 1, 0]
    self.mark(seed_pairs, seed_labels)
def sample(self, data, sample_size=15000, blocked_proportion=0.5,
           original_length=None):
    """Draw a sample of record pairs from the dataset (a mix of random
    pairs & pairs of similar records) and initialize active learning
    with this sample.

    Arguments:

    data -- Dictionary of records, where the keys are record_ids and
            the values are dictionaries with the keys being field
            names
    sample_size -- Size of the sample to draw (default 15000)
    blocked_proportion -- Proportion of the sample that will be
                          blocked (default 0.5)
    original_length -- Length of original data, should be set if
                       `data` is a sample of full data; defaults to
                       len() of the indexed data
    """
    self._checkData(data)

    data = core.index(data)

    full_length = len(data) if original_length is None else original_length
    self.sampled_records = Sample(data, 2000, full_length)

    self.active_learner = self.ActiveLearner(self.data_model)
    self.active_learner.sample_combo(data, blocked_proportion, sample_size)
def sample(self, data, sample_size=15000, blocked_proportion=0.5):
    """Draw a sample of record pairs from the dataset (a mix of random
    pairs & pairs of similar records) and initialize active learning
    with this sample.

    Arguments:

    data -- Dictionary of records, where the keys are record_ids and
            the values are dictionaries with the keys being field
            names
    sample_size -- Size of the sample to draw (default 15000)
    blocked_proportion -- Proportion of the sample that will be
                          blocked (default 0.5)
    """
    data = core.index(data)
    self.sampled_records = Sample(data, 900)

    n_blocked = int(blocked_proportion * sample_size)
    simple_predicates = list(
        self.data_model.predicates(index_predicates=False))

    data = sampling.randomDeque(data)
    blocked_keys = sampling.dedupeBlockedSample(
        n_blocked, simple_predicates, data)

    # Whatever the blocked sample did not supply is drawn at random.
    n_random = sample_size - len(blocked_keys)
    random_keys = set(core.randomPairs(len(data), n_random))

    # Back to dict form so records can be looked up by key.
    data = dict(data)

    record_pairs = [(data[first], data[second])
                    for first, second in blocked_keys | random_keys]

    self._loadSample(record_pairs)
def sample(self, data, sample_size=15000, blocked_proportion=0.5):
    """Sample pairs of records from a single dataset (blocked pairs
    plus random pairs), freeze them and load them for active learning.

    Arguments:

    data -- Dictionary of records, where the keys are record_ids and
            the values are dictionaries with the keys being field
            names
    sample_size -- Size of the sample to draw (default 15000)
    blocked_proportion -- Proportion of the sample that will be
                          blocked (default 0.5)
    """
    data = core.index(data)
    self.sampled_records = Sample(data, 900)

    blocked_target = int(blocked_proportion * sample_size)
    usable_predicates = list(
        self.data_model.predicates(index_predicates=False))

    data = sampling.randomDeque(data)
    keys_from_blocking = sampling.dedupeBlockedSample(
        blocked_target, usable_predicates, data)

    # Fill the rest of the sample with random pairs.
    random_target = sample_size - len(keys_from_blocking)
    keys_from_random = set(core.randomPairs(len(data), random_target))

    data = dict(data)

    all_keys = keys_from_blocking | keys_from_random
    data_sample = [(data[i], data[j]) for i, j in all_keys]
    data_sample = core.freezeData(data_sample)

    self._loadSample(data_sample)
def __init__(self, distances, data, blocked_proportion, sample_size,
             original_length, index_include):
    """Index the data, sample candidate pairs with a DedupeSampler,
    build the block learner and seed the learner with five labels.

    Arguments:

    distances -- stored on the instance; handed to DedupeSampler and
                 BlockLearner
    data -- records to deduplicate; passed through core.index
    blocked_proportion -- forwarded to the sampler's sample()
    sample_size -- forwarded to the sampler's sample()
    original_length -- forwarded to BlockLearner
    index_include -- object with copy()/append(); a copy extended with
                     one extra pair is forwarded to BlockLearner
    """
    self.distances = distances
    self.sampler = DedupeSampler(distances)

    data = core.index(data)
    self.candidates = self.sampler.sample(
        data, blocked_proportion, sample_size
    )
    logger.info(f"self.candidates: {len(self.candidates)}")

    # Pair the first record of a randomly chosen candidate with itself.
    random_pair = random.choice(self.candidates)
    exact_match = (random_pair[0], random_pair[0])

    # Extend a copy so the object passed in by the caller is not mutated.
    index_include = index_include.copy()
    index_include.append(exact_match)

    self.block_learner = BlockLearner(
        distances,
        self.candidates,
        data,
        original_length,
        index_include,
    )

    self._common_init()

    logger.debug("Initializing with 5 random values")
    # Four copies of the self-pair labelled 1 plus the random pair
    # labelled 0.  NOTE(review): 1/0 presumably mean match/distinct --
    # confirm against mark().
    seed_pairs = [exact_match, exact_match, exact_match, exact_match,
                  random_pair]
    seed_labels = [1, 1, 1, 1, 0]
    self.mark(seed_pairs, seed_labels)
def __init__(self, data_model, data, blocked_proportion, sample_size,
             original_length):
    """Index the data, sample candidate pairs and build the blocker.

    Arguments:

    data_model -- stored on the instance and handed to
                  DedupeBlockLearner
    data -- records to deduplicate; passed through core.index
    blocked_proportion -- forwarded to the parent class's sample()
    sample_size -- forwarded to the parent class's sample()
    original_length -- forwarded to DedupeBlockLearner
    """
    self.data_model = data_model

    data = core.index(data)

    self.candidates = super().sample(data, blocked_proportion, sample_size)

    self.blocker = DedupeBlockLearner(
        data_model,
        self.candidates,
        data,
        original_length,
    )

    self._common_init()
def sample_combo(self, data, blocked_proportion, sample_size,
                 original_length=None):
    """Sample candidate pairs from one dataset and initialize the
    classifier and blocker with them.

    Arguments:

    data -- records to deduplicate; passed through core.index
    blocked_proportion -- forwarded to the parent's sample_combo()
    sample_size -- forwarded to the parent's sample_combo()
    original_length -- handed to Sample (default None)

    Returns the Sample built from the indexed data.
    """
    data = core.index(data)

    self.candidates = super().sample_combo(
        data, blocked_proportion, sample_size
    )
    self.classifier._init(self.candidates)

    record_sample = Sample(data, 2000, original_length)
    self.blocker._init_combo(self.candidates, record_sample, data)

    return record_sample
def sample(self, data, sample_size=15000, blocked_proportion=0.5):
    """Draw a sample of record pairs from the dataset (a mix of random
    pairs & pairs of similar records) and initialize active learning
    with this sample.

    Arguments:

    data -- Dictionary of records, where the keys are record_ids and
            the values are dictionaries with the keys being field
            names
    sample_size -- Size of the sample to draw (default 15000)
    blocked_proportion -- Proportion of the sample that will be
                          blocked (default 0.5)
    """
    data = core.index(data)

    n_blocked = int(blocked_proportion * sample_size)
    simple_predicates = list(
        predicateGenerator(
            self.data_model,
            index_predicates=False,
            canopies=self.canopies,
        )
    )

    data = sampling.randomDeque(data)
    blocked_keys = sampling.dedupeBlockedSample(
        n_blocked, simple_predicates, data)

    # Whatever the blocked sample did not supply is drawn at random.
    n_random = sample_size - len(blocked_keys)
    random_keys = set(dedupe.core.randomPairs(len(data), n_random))

    # Back to dict form so records can be looked up by key.
    data = dict(data)

    record_pairs = [(data[first], data[second])
                    for first, second in blocked_keys | random_keys]
    record_pairs = core.freezeData(record_pairs)

    # data can be a very large object, so drop our reference to it
    # before loading the sample.
    del data

    self._loadSample(record_pairs)
def sample(self, data, sample_size=15000, blocked_proportion=0.5):
    """Sample pairs of records from a single dataset (blocked pairs
    plus random pairs), freeze them and load them for active learning.

    Arguments:

    data -- Dictionary of records, where the keys are record_ids and
            the values are dictionaries with the keys being field
            names
    sample_size -- Size of the sample to draw (default 15000)
    blocked_proportion -- Proportion of the sample that will be
                          blocked (default 0.5)
    """
    data = core.index(data)

    blocked_target = int(blocked_proportion * sample_size)

    # Keep only the predicates whose type is 'SimplePredicate'.
    usable_predicates = [
        p for p in predicateGenerator(self.data_model)
        if p.type == 'SimplePredicate'
    ]

    data = sampling.randomDeque(data)
    keys_from_blocking = sampling.dedupeBlockedSample(
        blocked_target, usable_predicates, data)

    # Fill the rest of the sample with random pairs.
    random_target = sample_size - len(keys_from_blocking)
    keys_from_random = set(
        dedupe.core.randomPairs(len(data), random_target))

    data = dict(data)

    all_keys = keys_from_blocking | keys_from_random
    # Generator: the record lookups happen when freezeData consumes it.
    data_sample = ((data[i], data[j]) for i, j in all_keys)
    data_sample = core.freezeData(data_sample)

    self._loadSample(data_sample)