def _trainClassifier(self, alpha=.1):  # pragma : no cover
    """
    Refit the data model on the current training data.

    Keyword arguments:
    alpha -- value handed unchanged to core.trainModel as its third
             argument (presumably the regularization strength --
             confirm against core.trainModel)

    The model returned by core.trainModel replaces self.data_model,
    after which the learned weights are logged.
    """
    fitted_model = core.trainModel(self.training_data,
                                   self.data_model,
                                   alpha)
    self.data_model = fitted_model

    self._logLearnedWeights()
def _trainClassifier(self, alpha=None):  # pragma : no cover
    """
    Refit the data model on the current training data.

    Keyword arguments:
    alpha -- value passed to core.trainModel as its last argument;
             when None, it is obtained from self._regularizer()

    The model returned by core.trainModel replaces self.data_model,
    after which the learned weights are logged.
    """
    # Only ask for a regularizer when the caller did not supply one.
    penalty = self._regularizer() if alpha is None else alpha

    fitted_model = core.trainModel(self.training_data,
                                   self.data_model,
                                   self.learner,
                                   penalty)
    self.data_model = fitted_model

    self._logLearnedWeights()
def _trainClassifier(self, alpha=None):  # pragma : no cover
    """
    Train self.data_model from self.training_data using self.learner.

    Keyword arguments:
    alpha -- forwarded to core.trainModel as its final argument; a
             value of None means "use whatever self._regularizer()
             returns"

    Side effects: self.data_model is rebound to the result of
    core.trainModel and self._logLearnedWeights() is called.
    """
    chosen_alpha = alpha
    if chosen_alpha is None:
        chosen_alpha = self._regularizer()

    self.data_model = core.trainModel(
        self.training_data,
        self.data_model,
        self.learner,
        chosen_alpha,
    )
    self._logLearnedWeights()
def train(self, data_sample, training_source=None):
    """
    Learn field weights from file of labeled examples or round of
    interactive labeling

    Keyword arguments:
    data_sample -- a sample of record pairs
    training_source -- either a path to a file of labeled examples or
                       a labeling function

    Raises ValueError when training_source is neither a string nor a
    function (this includes the default value, None).

    In the sample of record_pairs, each element is a tuple of two
    records. Each record is, in turn, a tuple of the record's key and
    a record dictionary.

    In the record dictionary the keys are the names of the
    record field and values are the record values.

    For example, a data_sample with only one pair of records,

    [
      (
       (854, {'city': 'san francisco',
              'address': '300 de haro st.',
              'name': "sally's cafe & bakery",
              'cuisine': 'american'}),
       (855, {'city': 'san francisco',
              'address': '1328 18th st.',
              'name': 'san francisco bbq',
              'cuisine': 'thai'})
       )
     ]

    The labeling function will be used to do active learning. The
    function will be supplied a list of examples that the learner
    is the most 'curious' about, that is examples where we are most
    uncertain about how they should be labeled. The labeling function
    will label these, and based upon what we learn from these
    examples, the labeling function will be supplied with new
    examples that the learner is now most curious about. This will
    continue until the labeling function sends a message that it
    is done labeling.

    The labeling function must be a function that takes two
    arguments. The first argument is a sequence of pairs of
    records. The second argument is the data model.

    The labeling function must return two outputs. The function
    must return a dictionary of labeled pairs and a finished flag.

    The dictionary of labeled pairs must have two keys, 1 and 0,
    corresponding to record pairs that are duplicates or
    nonduplicates respectively. The values of the dictionary must
    be a sequence of records pairs, like the sequence that was
    passed in.

    The 'finished' flag should take the value False for active
    learning to continue, and the value True to stop active learning.

    i.e.

    labelFunction(record_pairs, data_model) :
        ...
        return (labeled_pairs, finished)

    For a working example, see consoleLabel in training

    Labeled example files are typically generated by saving the
    examples labeled in a previous session. If you need details
    for this file see the method writeTraining.
    """
    self.data_sample = data_sample

    from_file = isinstance(training_source, str)
    from_labeler = isinstance(training_source, types.FunctionType)

    if not (from_file or from_labeler):
        raise ValueError(
            "training_source must be a path to a file of labeled "
            "examples (str) or a labeling function, got %r"
            % (training_source,))

    if from_file:
        logging.info("reading training from file")
        if self.training_data is None:
            self._initializeTraining(training_source)

        (self.training_pairs,
         self.training_data) = self._readTraining(training_source,
                                                  self.training_data)
    else:
        if self.training_data is None:
            self._initializeTraining()

        (self.training_data,
         self.training_pairs,
         self.data_model) = training.activeLearning(self.data_sample,
                                                    self.data_model,
                                                    training_source,
                                                    self.training_data,
                                                    self.training_pairs)

    # One fold per three units of summed label (the number of duplicate
    # pairs if labels are 0/1), clamped to [2, 20]. Floor-divide and
    # cast so the fold count is a plain int whatever the label dtype or
    # division semantics.
    label_total = numpy.sum(self.training_data["label"])
    n_folds = int(min(max(2, label_total // 3), 20))
    logging.info("%d folds", n_folds)

    alpha = crossvalidation.gridSearch(self.training_data,
                                       core.trainModel,
                                       self.data_model,
                                       k=n_folds)

    self.data_model = core.trainModel(self.training_data,
                                      self.data_model,
                                      alpha)

    self._logLearnedWeights()
def train(self, data_sample, training_source=None):
    """
    Learn field weights from file of labeled examples or round of
    interactive labeling

    Keyword arguments:
    data_sample -- a sample of record pairs
    training_source -- either a path to a file of labeled examples or
                       a labeling function

    Raises ValueError when training_source is neither a string nor a
    function (this includes the default value, None).

    In the sample of record_pairs, each element is a tuple of two
    records. Each record is, in turn, a tuple of the record's key and
    a record dictionary.

    In the record dictionary the keys are the names of the
    record field and values are the record values.

    For example, a data_sample with only one pair of records,

    [
      (
       (854, {'city': 'san francisco',
              'address': '300 de haro st.',
              'name': "sally's cafe & bakery",
              'cuisine': 'american'}),
       (855, {'city': 'san francisco',
              'address': '1328 18th st.',
              'name': 'san francisco bbq',
              'cuisine': 'thai'})
       )
     ]

    The labeling function will be used to do active learning. The
    function will be supplied a list of examples that the learner
    is the most 'curious' about, that is examples where we are most
    uncertain about how they should be labeled. The labeling function
    will label these, and based upon what we learn from these
    examples, the labeling function will be supplied with new
    examples that the learner is now most curious about. This will
    continue until the labeling function sends a message that it
    is done labeling.

    The labeling function must be a function that takes two
    arguments. The first argument is a sequence of pairs of
    records. The second argument is the data model.

    The labeling function must return two outputs. The function
    must return a dictionary of labeled pairs and a finished flag.

    The dictionary of labeled pairs must have two keys, 1 and 0,
    corresponding to record pairs that are duplicates or
    nonduplicates respectively. The values of the dictionary must
    be a sequence of records pairs, like the sequence that was
    passed in.

    The 'finished' flag should take the value False for active
    learning to continue, and the value True to stop active learning.

    i.e.

    labelFunction(record_pairs, data_model) :
        ...
        return (labeled_pairs, finished)

    For a working example, see consoleLabel in training

    Labeled example files are typically generated by saving the
    examples labeled in a previous session. If you need details
    for this file see the method writeTraining.
    """
    self.data_sample = data_sample

    is_path = isinstance(training_source, str)
    is_labeler = isinstance(training_source, types.FunctionType)

    if not (is_path or is_labeler):
        raise ValueError(
            'training_source must be a path to a file of labeled '
            'examples (str) or a labeling function, got %r'
            % (training_source,))

    if is_path:
        logging.info('reading training from file')
        if self.training_data is None:
            self._initializeTraining(training_source)

        (self.training_pairs,
         self.training_data) = self._readTraining(training_source,
                                                  self.training_data)
    else:
        if self.training_data is None:
            self._initializeTraining()

        (self.training_data,
         self.training_pairs,
         self.data_model) = training.activeLearning(self.data_sample,
                                                    self.data_model,
                                                    training_source,
                                                    self.training_data,
                                                    self.training_pairs)

    # One fold per three units of summed label (the number of duplicate
    # pairs if labels are 0/1), clamped to [2, 20]. The earlier
    # un-floored assignment was dead code (immediately overwritten) and
    # has been dropped. Floor-divide and cast so the fold count is a
    # plain int whatever the label dtype or division semantics.
    label_total = numpy.sum(self.training_data['label'])
    n_folds = int(min(max(2, label_total // 3), 20))
    logging.info('%d folds', n_folds)

    alpha = crossvalidation.gridSearch(self.training_data,
                                       core.trainModel,
                                       self.data_model,
                                       k=n_folds)

    self.data_model = core.trainModel(self.training_data,
                                      self.data_model,
                                      alpha)

    self._logLearnedWeights()