def test_disambiguator_store(self):
    # Round-trip test: fit a trivial classifier that picks between "stam"
    # (tree trunk) and "romp" (body trunk) as the Dutch translation of the
    # English noun "trunk", persist it, reload it, and compare state.
    lempos = u"trunk/n"
    # FIXME: store_fit() should only accept unicode strings
    target_names = u"stam romp".encode("utf-8").split()
    vocab = u"boom hoofd".split()

    # Two-feature toy data set with perfectly separable classes
    samples = np.array([[0, 1], [1, 0], [0, 1], [1, 0]])
    labels = np.array([1, 0, 1, 0])

    fitted_clf = NearestCentroid()
    fitted_clf.fit(samples, labels)
    expected_centroids = fitted_clf.centroids_
    expected_score = fitted_clf.score(samples, labels)

    # Persist the fitted model together with its vocab and target names
    store_fname = tempfile.NamedTemporaryFile().name
    store = DisambiguatorStore(store_fname, "w")
    store.save_estimator(NearestCentroid())
    store.save_vocab(vocab)
    store.store_fit(lempos, fitted_clf)
    store.save_target_names(lempos, target_names)
    store.close()

    # Reload everything from the store into a fresh estimator
    reader = DisambiguatorStore(store_fname)
    restored_clf = reader.load_estimator()
    restored_vocab = reader.load_vocab()
    reader.restore_fit(lempos, restored_clf)
    restored_target_names = reader.load_target_names(lempos)

    # The restored estimator must be indistinguishable from the original
    assert_array_equal(expected_centroids, restored_clf.centroids_)
    assert target_names == restored_target_names
    assert vocab == restored_vocab
    assert expected_score == restored_clf.score(samples, labels)
class ModelBuilder(object):
    """
    Class for fitting disambiguators on samples and storing them in HDF5
    file.

    Parameters
    ----------
    data_generator: DataSetGenerator instance
        Labeled samples generator
    samp_hdf_fname: str
        Name of HDF5 file containing context samples
    models_hdf_fname: str
        Name of HDF5 file for storing disambiguation models
    classifier: classifier instance
        Classifier instance from sklearn, to be fitted for each ambiguous
        source lempos. Possibly a Pipeline instance.
    counts_fname: str
        Name of pickle file containing lemma counts. If used in combination
        with classifier that supports class weights (e.g. SGDClassifier) or
        class priors (e.g. Naive Bayes), classes will be weighted according
        to their lemma frequency.
    with_vocab_mask: bool, optional
        If true, a vocabulary mask is stored for each model. This is
        required when the classifier performs feature selection and the
        vocabulary has to be pruned accordingly.
    """

    # feature selectors that reduce the vocabulary
    FEATURE_SELECTORS = _BaseFilter, RFE, _LearntSelectorMixin

    def __init__(self, data_generator, models_hdf_fname, classifier,
                 counts_fname=None, with_vocab_mask=False, **kwargs):
        # NOTE(review): extra **kwargs are accepted but silently ignored here;
        # presumably consumed by subclasses -- confirm against callers.
        self.data_generator = data_generator
        self.models_hdf_fname = models_hdf_fname
        self.classifier = classifier
        self.counts_fname = counts_fname
        self.with_vocab_mask = with_vocab_mask

    def run(self):
        """
        build disambiguation models
        """
        # Three-phase pipeline; _finish() is defined elsewhere in this class
        # (outside this excerpt).
        self._prepare()
        self._build()
        self._finish()

    def _prepare(self):
        # Set up the output store and any optional inputs (lemma counts,
        # vocabulary for masking) before model building starts.
        self.start_time = time.time()
        self.disambiguator_count = 0
        log.info("creating models file " + self.models_hdf_fname)
        self.models_hdfile = DisambiguatorStore(self.models_hdf_fname, "w")
        # Store the unfitted classifier once; per-lempos fits are stored
        # separately via store_fit() in _build_disambiguator().
        self.models_hdfile.save_estimator(self.classifier)
        # FIXME: hmm, a bit sneaky...
        # Copies the vocabulary straight from the samples file into the
        # models file, reaching into the generator's samp_hdfile attribute.
        self.models_hdfile.copy_vocab(self.data_generator.samp_hdfile)
        if self.counts_fname:
            log.info("reading counts from " + self.counts_fname)
            # NOTE(review): file handle from open() is never closed explicitly;
            # relies on GC. Also cPickle.load on the counts file assumes a
            # trusted pickle -- do not point this at untrusted input.
            self.counts_dict = cPickle.load(open(self.counts_fname))
        if self.with_vocab_mask:
            log.info("storage with vocabulary masks")
            self.vocab = self.models_hdfile.load_vocab()

    def _build(self):
        # Fit one disambiguator per data set; data sets without any target
        # lempos produce no model, only an error log entry.
        for data_set in self.data_generator:
            if data_set.target_lempos:
                self._build_disambiguator(data_set)
            else:
                log.error("no samples and thus no disambiguation models for " +
                          data_set.source_lempos)

    def _build_disambiguator(self, data_set):
        # Fit self.classifier on one ambiguous source lempos and persist the
        # fit plus its target names (and optionally a vocabulary mask).
        log.info(u"building disambiguator for {} with {} translations".format(
            data_set.source_lempos,
            len(data_set.target_lempos)))
        if self.counts_fname:
            # _set_class_weights() is defined elsewhere in this class
            # (outside this excerpt).
            self._set_class_weights(data_set.target_lempos)
        try:
            self.classifier.fit(data_set.samples, data_set.targets)
        except ValueError, error:
            # Match sklearn's exact error messages for the "no features
            # survived selection" condition; message text varies across
            # sklearn/numpy versions, hence the list of alternatives.
            if ( error.args in [
                ("zero-size array to reduction operation maximum which has no "
                 "identity",),
                ("zero-size array to maximum.reduce without identity",),
                ('Invalid threshold: all features are discarded.',)] ):
                # this happens when there are no features selected
                # e.g. when using SelectFpr
                log.error("No model created, because no features selected!")
                return
            else:
                raise error
        self.models_hdfile.store_fit(data_set.source_lempos, self.classifier)
        self.models_hdfile.save_target_names(data_set.source_lempos,
                                             data_set.target_lempos)
        if self.with_vocab_mask:
            # _save_vocab_mask() is defined elsewhere in this class
            # (outside this excerpt).
            self._save_vocab_mask(data_set.source_lempos)
        self.disambiguator_count += 1