class TranslationClassifier(object):

    """
    Class for translation disambiguation using per-lempos disambiguators
    stored in an HDF5 file
    """

    def __init__(self, models_fname):
        self.models = DisambiguatorStore(models_fname)
        self.classifier = self.models.load_estimator()
        self.vocab = self.models.load_vocab(as_dict=True)

    def score(self, source_lempos, context_vec):
        """
        score translation candidates for a source lempos combination,
        returning a dict mapping target lempos combinations to scores
        """
        try:
            self.models.restore_fit(source_lempos, self.classifier)
        except KeyError:
            log.debug(u"no model available for source lempos " + source_lempos)
            return {}

        target_names = self.models.load_target_names(source_lempos)
        return self._predict(context_vec, target_names)

    def _predict(self, context_vec, target_names):
        """
        return a dict mapping target names to scores
        """
        # FIXME: some estimators have no predict_proba method,
        # e.g. NearestCentroid
        preds = self.classifier.predict_proba(context_vec)
        return dict(zip(target_names, preds[0]))
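# A minimal usage sketch (not part of the original module): the models file
# name is a placeholder, and it is assumed that load_vocab(as_dict=True) maps
# each context lemma to its column index, so a bag-of-lemmas context vector
# can be built with the module's numpy import (np). The lemmas and the source
# lempos are taken from the "trunk/n" example used in the test below.

def example_score_trunk(models_fname="models.hdf5"):
    classifier = TranslationClassifier(models_fname)
    # dense context vector over the stored vocabulary
    context_vec = np.zeros((1, len(classifier.vocab)))
    for lemma in (u"boom", u"hoofd"):
        i = classifier.vocab.get(lemma)
        if i is not None:
            context_vec[0, i] = 1.0
    # returns e.g. {u"stam/n": 0.83, u"romp/n": 0.17},
    # or {} when no model was fitted for the source lempos
    return classifier.score(u"trunk/n", context_vec)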
def test_disambiguator_store(self):
    # Create a silly classifier that disambiguates between "stam" (tree
    # trunk) or "romp" (body trunk) as the Dutch translation of the
    # English noun "trunk"
    lempos = u"trunk/n"
    # FIXME: store_fit() should only accept unicode strings
    target_names = u"stam romp".encode("utf-8").split()
    vocab = u"boom hoofd".split()

    X = np.array([[0, 1],
                  [1, 0],
                  [0, 1],
                  [1, 0]])
    y = np.array([1, 0, 1, 0])

    estimator = NearestCentroid()
    estimator.fit(X, y)
    centroids = estimator.centroids_
    score = estimator.score(X, y)

    # Store estimator
    fname = tempfile.NamedTemporaryFile().name
    f = DisambiguatorStore(fname, "w")
    f.save_estimator(NearestCentroid())
    f.save_vocab(vocab)
    f.store_fit(lempos, estimator)
    f.save_target_names(lempos, target_names)
    f.close()

    # Restore estimator
    f2 = DisambiguatorStore(fname)
    estimator2 = f2.load_estimator()
    vocab2 = f2.load_vocab()
    f2.restore_fit(lempos, estimator2)
    target_names2 = f2.load_target_names(lempos)
    centroids2 = estimator2.centroids_
    score2 = estimator2.score(X, y)

    assert_array_equal(centroids, centroids2)
    assert target_names == target_names2
    assert vocab == vocab2
    assert score == score2
def trace_nc(graphs_fname, model_fname, n=None, source_pos=[],
             outf=codecs.getwriter('utf8')(sys.stdout)):
    if isinstance(outf, basestring):
        outf = codecs.open(outf, "w", encoding="utf-8")

    graphs = cPickle.load(open(graphs_fname))
    model = DisambiguatorStore(model_fname)
    estimator = model.load_estimator()
    vocab = np.array(model.load_vocab())
    vocab_dict = model.load_vocab(as_dict=True)
    score_attr = "centroid_score"

    for graph_count, graph in enumerate(graphs[:n]):
        source_string = graph.source_string()
        outf.write(100 * "=" + "\n")
        outf.write(u"{}: {}\n".format(graph_count + 1, source_string))
        outf.write(100 * "=" + "\n\n")

        reverse_lookup = make_reverse_lookup(graph)
        source_node_vectors = make_source_node_vectors(graph, vocab_dict)
        source_graph_vector = sp.csr_matrix(source_node_vectors.sum(axis=0))

        for sn, node_vec in zip(graph.source_nodes_iter(ordered=True),
                                source_node_vectors):
            if source_pos and graph.pos(sn) not in source_pos:
                continue

            try:
                source_lempos = u" ".join(graph.node[sn]["lex_lempos"])
            except KeyError:
                log.debug(u"(annotated) source lempos {0} not in "
                          u"lexicon\n".format(graph.lempos(sn)))
                continue

            try:
                model.restore_fit(source_lempos, estimator)
            except KeyError:
                log.debug(u"no model available for (lexicon) source lempos "
                          u"{0}\n".format(source_lempos))
                continue

            context_vec = source_graph_vector - node_vec
            context_vec = context_vec.toarray()
            normalize(context_vec, copy=False)

            try:
                mask = model.load_vocab_mask(source_lempos)[:]
            except KeyError:
                local_vocab = vocab
            else:
                context_vec = context_vec[:, mask]
                local_vocab = vocab[mask]

            try:
                centroids = estimator.centroids_
            except AttributeError:
                # pipeline: NearestCentroid is the final step
                centroids = estimator.steps[-1][-1].centroids_

            target_lempos_list = model.load_target_names(source_lempos)

            outf.write(100 * "-" + "\n")
            outf.write(source_lempos + "\n")
            outf.write(100 * "-" + "\n\n")

            for target_lempos, target_centroid in zip(target_lempos_list,
                                                      centroids):
                prod = target_centroid * context_vec
                outf.write(u"==> {:<24} {:1.4f} {}\n".format(
                    target_lempos, prod.sum(), prod.sum() * 100 * "X"))

            outf.write("\n")

            for target_lempos, target_centroid in zip(target_lempos_list,
                                                      centroids):
                prod = target_centroid * context_vec
                indices = target_centroid.argsort()[::-1]

                outf.write("\n" + source_string + "\n\n")
                outf.write(u"{:<64} ==> {:<24} {:1.4f} {}\n".format(
                    source_lempos, target_lempos, prod.sum(),
                    prod.sum() * 100 * "X"))

                for i in indices:
                    if prod[0, i] > 0:
                        context_lemma = local_vocab[i]
                        sources = ",".join(reverse_lookup[context_lemma])
                        bar = prod[0, i] * 100 * "*"
                        outf.write(u"{:<64} --> {:<24} {:1.4f} {}\n".format(
                            sources, context_lemma, prod[0, i], bar))

                outf.write("\n")
class ModelBuilder(object):

    """
    Class for fitting disambiguators on samples and storing them in an HDF5
    file.

    Parameters
    ----------
    data_generator: DataSetGenerator instance
        Labeled samples generator
    samp_hdf_fname: str
        Name of HDF5 file containing context samples
    models_hdf_fname: str
        Name of HDF5 file for storing disambiguation models
    classifier: classifier instance
        Classifier instance from sklearn, to be fitted for each ambiguous
        source lempos. Possibly a Pipeline instance.
    counts_fname: str
        Name of pickle file containing lemma counts. If used in combination
        with a classifier that supports class weights (e.g. SGDClassifier) or
        class priors (e.g. Naive Bayes), classes will be weighted according
        to their lemma frequency.
    with_vocab_mask: bool, optional
        If true, a vocabulary mask is stored for each model. This is required
        when the classifier performs feature selection and the vocabulary has
        to be pruned accordingly.
    """

    # feature selectors that reduce the vocabulary
    FEATURE_SELECTORS = _BaseFilter, RFE, _LearntSelectorMixin

    def __init__(self, data_generator, models_hdf_fname, classifier,
                 counts_fname=None, with_vocab_mask=False, **kwargs):
        self.data_generator = data_generator
        self.models_hdf_fname = models_hdf_fname
        self.classifier = classifier
        self.counts_fname = counts_fname
        self.with_vocab_mask = with_vocab_mask

    def run(self):
        """
        build disambiguation models
        """
        self._prepare()
        self._build()
        self._finish()

    def _prepare(self):
        self.start_time = time.time()
        self.disambiguator_count = 0
        log.info("creating models file " + self.models_hdf_fname)
        self.models_hdfile = DisambiguatorStore(self.models_hdf_fname, "w")
        self.models_hdfile.save_estimator(self.classifier)
        # FIXME: hmm, a bit sneaky...
        self.models_hdfile.copy_vocab(self.data_generator.samp_hdfile)

        if self.counts_fname:
            log.info("reading counts from " + self.counts_fname)
            self.counts_dict = cPickle.load(open(self.counts_fname))

        if self.with_vocab_mask:
            log.info("storage with vocabulary masks")
            self.vocab = self.models_hdfile.load_vocab()

    def _build(self):
        for data_set in self.data_generator:
            if data_set.target_lempos:
                self._build_disambiguator(data_set)
            else:
                log.error("no samples and thus no disambiguation models for " +
                          data_set.source_lempos)

    def _build_disambiguator(self, data_set):
        log.info(u"building disambiguator for {} with {} translations".format(
            data_set.source_lempos, len(data_set.target_lempos)))

        if self.counts_fname:
            self._set_class_weights(data_set.target_lempos)

        try:
            self.classifier.fit(data_set.samples, data_set.targets)
        except ValueError, error:
            if error.args in [
                    ("zero-size array to reduction operation maximum which has no "
                     "identity",),
                    ("zero-size array to maximum.reduce without identity",),
                    ("Invalid threshold: all features are discarded.",)]:
                # this happens when no features are selected,
                # e.g. when using SelectFpr
                log.error("No model created, because no features selected!")
                return
            else:
                raise error

        self.models_hdfile.store_fit(data_set.source_lempos, self.classifier)
        self.models_hdfile.save_target_names(data_set.source_lempos,
                                             data_set.target_lempos)
        if self.with_vocab_mask:
            self._save_vocab_mask(data_set.source_lempos)
        self.disambiguator_count += 1
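# A minimal sketch of driving ModelBuilder (not part of the original module).
# The construction of the DataSetGenerator is left out because its signature
# is not shown here; the classifier below is just one plausible choice, a
# Pipeline ending in NearestCentroid with SelectFpr feature selection, so
# with_vocab_mask=True is required to keep the stored vocabulary in sync with
# the selected features.

from sklearn.feature_selection import SelectFpr, chi2
from sklearn.neighbors import NearestCentroid
from sklearn.pipeline import Pipeline


def example_build_models(data_generator, models_hdf_fname="models.hdf5"):
    classifier = Pipeline([("select", SelectFpr(chi2)),
                           ("classify", NearestCentroid())])
    builder = ModelBuilder(data_generator, models_hdf_fname, classifier,
                           with_vocab_mask=True)
    # fits one disambiguator per ambiguous source lempos and stores every fit,
    # plus its target names and vocabulary mask, in the HDF5 models file
    builder.run()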
def print_centroids(models_fname, lemma=None, pos=None, minimum=0, n=None,
                    outf=codecs.getwriter('utf8')(sys.stdout)):
    # If used in combination with a feature selector,
    # models must be built using with_vocab_mask=True
    # FIXME: messy code below
    models = DisambiguatorStore(models_fname)
    classifier = models.load_estimator()
    full_vocab = np.array(models.load_vocab())
    fits = models.file[models.FITS_PATH]
    line = 78 * "=" + "\n"
    subline = " " + 74 * "-" + "\n"

    if isinstance(outf, basestring):
        outf = codecs.open(outf, "w", encoding="utf-8")

    if lemma:
        if lemma in fits:
            lemma_list = [lemma]
        else:
            lemma_list = []
    else:
        lemma_list = fits

    for lemma in lemma_list:
        for lemma_pos in fits[lemma]:
            if not pos or lemma_pos == pos:
                lempos = lemma + u"/" + lemma_pos
                outf.write(line + lempos + "\n" + line)
                models.restore_fit(lempos, classifier)

                if isinstance(classifier, NearestCentroid):
                    centroids_ = classifier.centroids_
                else:
                    # NearestCentroid is the last item in the Pipeline
                    nc = classifier.steps[-1][-1]
                    assert isinstance(nc, NearestCentroid)
                    centroids_ = nc.centroids_

                target_names = models.load_target_names(lempos)
                target_n = 0

                try:
                    vocab_mask = models.load_vocab_mask(lempos)[:]
                except KeyError:
                    vocab = full_vocab
                else:
                    vocab = full_vocab[vocab_mask]

                for target, centroid in zip(target_names, centroids_):
                    target_n += 1
                    outf.write(subline)
                    outf.write(u" [{}] {} ---> {}\n".format(
                        target_n, lempos, target))
                    outf.write(subline)
                    indices = centroid.argsort().tolist()
                    indices.reverse()

                    for i in indices[:n]:
                        if centroid[i] > minimum:
                            outf.write(u" {0:>16.8f} {1:<16} {2}\n".format(
                                centroid[i], vocab[i],
                                centroid[i] * 100 * "*"))
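# A small sketch of calling print_centroids (file names are placeholders):
# dump the ten strongest context features per translation of the lemma
# "trunk" as a noun to a UTF-8 text file, skipping centroid weights at or
# below 0.01.

def example_print_trunk_centroids(models_fname="models.hdf5"):
    print_centroids(models_fname, lemma=u"trunk", pos=u"n", minimum=0.01,
                    n=10, outf="centroids.txt")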