def _trainBlocker(self, ppc=1, uncovered_dupes=1):
    """Learn blocking predicates from labeled pairs and build the blocker.

    Augments the labeled 'distinct' pairs with semi-supervised confident
    non-duplicates before training, then stores the learned predicates,
    stop words, and a ready-to-use Blocker on self.
    """
    # Work on a copy so the user's labeled pairs are left untouched.
    labeled = copy.deepcopy(self.training_pairs)
    blocker_types = self._blockerTypes()

    # Pad the 'distinct' side with pairs the model is confident are
    # non-duplicates, drawn from the data sample.
    nondupes = training.semiSupervisedNonDuplicates(self.data_sample,
                                                    self.data_model,
                                                    sample_size=32000)
    labeled['distinct'].extend(nondupes)

    candidate_predicates = blocking.predicateGenerator(blocker_types,
                                                       self.data_model)

    self.predicates, self.stop_words = dedupe.blocking.blockTraining(
        labeled,
        candidate_predicates,
        ppc,
        uncovered_dupes,
        self.pool,
        self._linkage_type)

    self.blocker = self._Blocker(self.predicates, self.pool, self.stop_words)
def _learnBlocking(self, eta, epsilon):
    """Learn a good blocking of the data.

    Extends the labeled non-duplicate pairs with semi-supervised confident
    non-duplicates, builds a TF-IDF document-frequency index over the
    concatenated field values of (up to) the first 2000 sample pairs, and
    returns the predicates learned by dedupe.blocking.blockTraining.
    """
    confident_nonduplicates = training.semiSupervisedNonDuplicates(
        self.data_sample, self.data_model)
    self.training_pairs[0].extend(confident_nonduplicates)

    predicate_functions = (
        predicates.wholeFieldPredicate,
        predicates.tokenFieldPredicate,
        predicates.commonIntegerPredicate,
        predicates.sameThreeCharStartPredicate,
        predicates.sameFiveCharStartPredicate,
        predicates.sameSevenCharStartPredicate,
        predicates.nearIntegersPredicate,
        predicates.commonFourGram,
        predicates.commonSixGram,
    )
    tfidf_thresholds = [0.2, 0.4, 0.6, 0.8]

    # Only fields with actual data participate in the full-string index.
    fields = [k for k, v in self.data_model["fields"].items()
              if v["type"] != "Missing Data"]

    # Map each record id to its space-joined field values.  A dict
    # comprehension replaces the original manual nested loop; later
    # occurrences of the same record id overwrite earlier ones, exactly
    # as repeated assignment did.  The slice caps the work at 2000 pairs.
    full_string_records = {
        k: " ".join(v[field] for field in fields)
        for pair in self.data_sample[0:2000]
        for (k, v) in pair
    }

    df_index = tfidf.documentFrequency(full_string_records)

    return dedupe.blocking.blockTraining(self.training_pairs,
                                         predicate_functions,
                                         fields,
                                         tfidf_thresholds,
                                         df_index,
                                         eta,
                                         epsilon)
def _learnBlocking(self, eta, epsilon):
    """Learn a good blocking of the data"""
    # Add confidently-nonduplicate sampled pairs to the labeled
    # non-duplicates before training the blocker.
    distinct_pairs = training.semiSupervisedNonDuplicates(
        self.data_sample,
        self.data_model,
        sample_size=32000)
    self.training_pairs[0].extend(distinct_pairs)

    predicate_set = predicateGenerator(self.blocker_types, self.data_model)

    return dedupe.blocking.blockTraining(self.training_pairs,
                                         predicate_set,
                                         eta,
                                         epsilon)
def _learnBlocking(self, eta, epsilon):
    """Learn a good blocking of the data"""
    nondupes = training.semiSupervisedNonDuplicates(self.data_sample,
                                                    self.data_model,
                                                    sample_size=32000)
    # Index 0 holds the labeled non-duplicate pairs.
    self.training_pairs[0].extend(nondupes)

    candidates = predicateGenerator(self.blocker_types, self.data_model)
    learned = dedupe.blocking.blockTraining(
        self.training_pairs, candidates, eta, epsilon)
    return learned
def _trainBlocker(self, ppc=1, uncovered_dupes=1):  # pragma: no cover
    """Train blocking predicates and construct the blocker.

    Leaves self.training_pairs untouched by training on a deep copy
    augmented with semi-supervised confident non-duplicates.
    """
    pairs = copy.deepcopy(self.training_pairs)

    nondupes = training.semiSupervisedNonDuplicates(self.data_sample,
                                                    self.data_model,
                                                    sample_size=32000)
    pairs[u'distinct'].extend(nondupes)

    predicate_set = predicateGenerator(self.data_model)

    self.predicates, self.stop_words = dedupe.training.blockTraining(
        pairs,
        predicate_set,
        ppc,
        uncovered_dupes,
        self._linkage_type)

    self.blocker = self._Blocker(self.predicates, self.stop_words)
def _trainBlocker(self, ppc, uncovered_dupes, index_predicates):  # pragma: no cover
    """Train blocking predicates and build the Blocker.

    Works on a deep copy of the labeled pairs, padded with
    semi-supervised confident non-duplicates drawn via the classifier.
    """
    pairs = copy.deepcopy(self.training_pairs)

    nondupes = training.semiSupervisedNonDuplicates(self.data_sample,
                                                    self.data_model,
                                                    self.classifier,
                                                    sample_size=32000)
    pairs[u'distinct'].extend(nondupes)

    candidate_predicates = self.data_model.predicates(index_predicates,
                                                      self.canopies)

    self.predicates = dedupe.training.blockTraining(pairs,
                                                    candidate_predicates,
                                                    ppc,
                                                    uncovered_dupes,
                                                    self._linkage_type)

    self.blocker = blocking.Blocker(self.predicates)
def _trainBlocker(self, ppc, uncovered_dupes, index_predicates):  # pragma: no cover
    """Learn blocking rules and attach a Blocker to self."""
    # Deep-copy so the caller's labeled pairs are never mutated.
    training_pairs = copy.deepcopy(self.training_pairs)

    # Classifier-guided sampling of pairs that are very likely distinct.
    confident_nonduplicates = training.semiSupervisedNonDuplicates(
        self.data_sample,
        self.data_model,
        self.classifier,
        sample_size=32000)
    training_pairs[u'distinct'].extend(confident_nonduplicates)

    predicate_set = self.data_model.predicates(index_predicates,
                                               self.canopies)

    self.predicates = dedupe.training.blockTraining(
        training_pairs, predicate_set, ppc, uncovered_dupes,
        self._linkage_type)
    self.blocker = blocking.Blocker(self.predicates)