class TranslationClassifier(object):
    """
    Class for translation disambiguation using per-lempos disambiguators
    stored in an HDF5 file
    """

    def __init__(self, models_fname):
        self.models = DisambiguatorStore(models_fname)
        self.classifier = self.models.load_estimator()
        self.vocab = self.models.load_vocab(as_dict=True)

    def score(self, source_lempos, context_vec):
        """
        Score translation candidates for a source lempos combination,
        returning a dict mapping target lempos combinations to scores
        """
        try:
            # restore the fitted model parameters for this source lempos
            self.models.restore_fit(source_lempos, self.classifier)
        except KeyError:
            log.debug(u"no model available for source lempos " +
                      source_lempos)
            return {}

        target_names = self.models.load_target_names(source_lempos)
        return self._predict(context_vec, target_names)

    def _predict(self, context_vec, target_names):
        """
        Return a dict mapping target names to scores
        """
        # FIXME: some estimators have no predict_proba method,
        # e.g. NearestCentroid
        preds = self.classifier.predict_proba(context_vec)
        return dict(zip(target_names, preds[0]))
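# The FIXME in _predict() above flags that estimators such as NearestCentroid
# expose no predict_proba(). The following is a minimal sketch (not part of
# the original code) of a drop-in replacement for TranslationClassifier._predict
# that probes for scoring methods in order of preference. It assumes class
# labels are 0..k-1 indices into target_names (as in the test fixture below)
# and ignores that binary estimators return a one-dimensional decision_function.
def _predict_with_fallback(self, context_vec, target_names):
    """
    Return a dict mapping target names to scores, degrading gracefully
    for estimators without predict_proba
    """
    if hasattr(self.classifier, "predict_proba"):
        preds = self.classifier.predict_proba(context_vec)[0]
    elif hasattr(self.classifier, "decision_function"):
        preds = self.classifier.decision_function(context_vec)[0]
    else:
        # e.g. NearestCentroid: put all mass on the single predicted class
        winner = self.classifier.predict(context_vec)[0]
        preds = [1.0 if i == winner else 0.0
                 for i in range(len(target_names))]
    return dict(zip(target_names, preds))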
def test_disambiguator_store(self):
    # Create a silly classifier that disambiguates between "stam" (tree
    # trunk) and "romp" (body trunk) as the Dutch translation of the
    # English noun "trunk"
    lempos = u"trunk/n"
    # FIXME: store_fit() should only accept unicode strings
    target_names = u"stam romp".encode("utf-8").split()
    vocab = u"boom hoofd".split()

    X = np.array([[0, 1],
                  [1, 0],
                  [0, 1],
                  [1, 0]])
    y = np.array([1, 0, 1, 0])

    estimator = NearestCentroid()
    estimator.fit(X, y)
    centroids = estimator.centroids_
    score = estimator.score(X, y)

    # Store estimator
    fname = tempfile.NamedTemporaryFile().name
    f = DisambiguatorStore(fname, "w")
    f.save_estimator(NearestCentroid())
    f.save_vocab(vocab)
    f.store_fit(lempos, estimator)
    f.save_target_names(lempos, target_names)
    f.close()

    # Restore estimator
    f2 = DisambiguatorStore(fname)
    estimator2 = f2.load_estimator()
    vocab2 = f2.load_vocab()
    f2.restore_fit(lempos, estimator2)
    target_names2 = f2.load_target_names(lempos)
    centroids2 = estimator2.centroids_
    score2 = estimator2.score(X, y)

    assert_array_equal(centroids, centroids2)
    assert target_names == target_names2
    assert vocab == vocab2
    assert score == score2
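# Hypothetical usage of TranslationClassifier with the toy store written by
# the test above; fname and the vocabulary ("boom", "hoofd") come from the
# test. Note that score() as written would fail on NearestCentroid, which
# lacks predict_proba (see the fallback sketched earlier).
classifier = TranslationClassifier(fname)
context_vec = np.array([[0, 1]])  # context contains "hoofd" but not "boom"
print(classifier.score(u"trunk/n", context_vec))
# expected shape of the result: {"stam": ..., "romp": ...}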
def trace_nc(graphs_fname, model_fname, n=None, source_pos=[],
             outf=codecs.getwriter('utf8')(sys.stdout)):
    if isinstance(outf, basestring):
        outf = codecs.open(outf, "w", encoding="utf-8")

    graphs = cPickle.load(open(graphs_fname))
    model = DisambiguatorStore(model_fname)
    estimator = model.load_estimator()
    vocab = np.array(model.load_vocab())
    vocab_dict = model.load_vocab(as_dict=True)
    score_attr = "centroid_score"

    for graph_count, graph in enumerate(graphs[:n]):
        source_string = graph.source_string()
        outf.write(100 * "=" + "\n")
        outf.write(u"{}: {}\n".format(graph_count + 1, source_string))
        outf.write(100 * "=" + "\n\n")

        reverse_lookup = make_reverse_lookup(graph)
        source_node_vectors = make_source_node_vectors(graph, vocab_dict)
        source_graph_vector = sp.csr_matrix(source_node_vectors.sum(axis=0))

        for sn, node_vec in zip(graph.source_nodes_iter(ordered=True),
                                source_node_vectors):
            if source_pos and graph.pos(sn) not in source_pos:
                continue

            try:
                source_lempos = u" ".join(graph.node[sn]["lex_lempos"])
            except KeyError:
                log.debug(u"(annotated) source lempos {0} not in "
                          u"lexicon\n".format(graph.lempos(sn)))
                continue

            try:
                model.restore_fit(source_lempos, estimator)
            except KeyError:
                log.debug(u"no model available for (lexicon) source lempos "
                          u"{0}\n".format(source_lempos))
                continue

            # context vector = graph vector minus the node's own vector
            context_vec = source_graph_vector - node_vec
            context_vec = context_vec.toarray()
            normalize(context_vec, copy=False)

            try:
                mask = model.load_vocab_mask(source_lempos)[:]
            except KeyError:
                local_vocab = vocab
            else:
                context_vec = context_vec[:, mask]
                local_vocab = vocab[mask]

            try:
                centroids = estimator.centroids_
            except AttributeError:
                # NearestCentroid is the last step in a Pipeline
                centroids = estimator.steps[-1][-1].centroids_

            target_lempos_list = model.load_target_names(source_lempos)

            outf.write(100 * "-" + "\n")
            outf.write(source_lempos + "\n")
            outf.write(100 * "-" + "\n\n")

            for target_lempos, target_centroid in zip(target_lempos_list,
                                                      centroids):
                prod = target_centroid * context_vec
                # bar length is proportional to the score
                outf.write(u"==> {:<24} {:1.4f} {}\n".format(
                    target_lempos, prod.sum(), int(prod.sum() * 100) * "X"))
            outf.write("\n")

            for target_lempos, target_centroid in zip(target_lempos_list,
                                                      centroids):
                prod = target_centroid * context_vec
                indices = target_centroid.argsort()[::-1]
                outf.write("\n" + source_string + "\n\n")
                outf.write(u"{:<64} ==> {:<24} {:1.4f} {}\n".format(
                    source_lempos, target_lempos, prod.sum(),
                    int(prod.sum() * 100) * "X"))

                for i in indices:
                    if prod[0, i] > 0:
                        context_lemma = local_vocab[i]
                        sources = ",".join(reverse_lookup[context_lemma])
                        bar = int(prod[0, i] * 100) * "*"
                        outf.write(u"{:<64} --> {:<24} {:1.4f} {}\n".format(
                            sources, context_lemma, prod[0, i], bar))
                outf.write("\n")
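# Hypothetical invocation of trace_nc(); the pickled graphs file and the
# HDF5 model file are placeholders. This would trace the first 10 graphs,
# restricted to source nodes tagged as nouns, writing the report to a file:
trace_nc("graphs.pkl", "models.hdf5", n=10, source_pos=["n"],
         outf="trace.txt")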
def print_centroids(models_fname, lemma=None, pos=None, minimum=0, n=None,
                    outf=codecs.getwriter('utf8')(sys.stdout)):
    # If used in combination with a feature selector, models must be built
    # with with_vocab_mask=True
    # FIXME: messy code below
    models = DisambiguatorStore(models_fname)
    classifier = models.load_estimator()
    full_vocab = np.array(models.load_vocab())
    fits = models.file[models.FITS_PATH]
    line = 78 * "=" + "\n"
    subline = " " + 74 * "-" + "\n"

    if isinstance(outf, basestring):
        outf = codecs.open(outf, "w", encoding="utf-8")

    if lemma:
        if lemma in fits:
            lemma_list = [lemma]
        else:
            lemma_list = []
    else:
        lemma_list = fits

    for lemma in lemma_list:
        for lemma_pos in fits[lemma]:
            if not pos or lemma_pos == pos:
                lempos = lemma + u"/" + lemma_pos
                outf.write(line + lempos + "\n" + line)
                models.restore_fit(lempos, classifier)

                if isinstance(classifier, NearestCentroid):
                    centroids_ = classifier.centroids_
                else:
                    # NearestCentroid is the last step in a Pipeline
                    nc = classifier.steps[-1][-1]
                    assert isinstance(nc, NearestCentroid)
                    centroids_ = nc.centroids_

                target_names = models.load_target_names(lempos)
                target_n = 0

                try:
                    vocab_mask = models.load_vocab_mask(lempos)[:]
                except KeyError:
                    vocab = full_vocab
                else:
                    vocab = full_vocab[vocab_mask]

                for target, centroid in zip(target_names, centroids_):
                    target_n += 1
                    outf.write(subline)
                    outf.write(u" [{}] {} ---> {}\n".format(
                        target_n, lempos, target))
                    outf.write(subline)
                    # features sorted by descending centroid weight
                    indices = centroid.argsort().tolist()
                    indices.reverse()

                    for i in indices[:n]:
                        if centroid[i] > minimum:
                            outf.write(u" {0:>16.8f} {1:<16} {2}\n".format(
                                centroid[i], vocab[i],
                                int(centroid[i] * 100) * "*"))
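# Hypothetical invocation of print_centroids(); the model filename is a
# placeholder. This would print the 20 strongest context features (with
# centroid weight above 0.01) for every translation of the noun "trunk":
print_centroids("models.hdf5", lemma=u"trunk", pos=u"n", minimum=0.01, n=20,
                outf="centroids.txt")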