Example #1
class TranslationClassifier(object):
    """
    Class for translation disambiguation using per-lempos disambiguators
    stored in an HDF5 file
    """
    
    def __init__(self, models_fname):
        self.models = DisambiguatorStore(models_fname)
        self.classifier = self.models.load_estimator()
        self.vocab = self.models.load_vocab(as_dict=True)
        
    def score(self, source_lempos, context_vec):
        """
        Score translation candidates for a source lempos combination,
        returning a dict mapping target lempos combinations to scores
        """
        try:
            self.models.restore_fit(source_lempos, self.classifier)
        except KeyError:
            log.debug(u"no model available for source lempos " + 
                      source_lempos)
            return {}  
        
        target_names = self.models.load_target_names(source_lempos)
        return self._predict(context_vec, target_names)
    
    def _predict(self, context_vec, target_names):
        """
        return a dict mapping target names to scores
        """
        # FIXME: some estimators have no predict_proba method
        # e.g. NearestCentroid
        preds = self.classifier.predict_proba(context_vec)
        return dict(zip(target_names, preds[0]))
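
A minimal usage sketch for TranslationClassifier above (not part of the original code). The import path, the models file name and the context lemmas are placeholders, and it assumes that load_vocab(as_dict=True) maps each vocabulary lemma to its column index, as its use in the later examples suggests.

import numpy as np

from classify import TranslationClassifier   # assumed import path

classifier = TranslationClassifier("models.hdf5")   # HDF5 file with per-lempos fits

# Build a dense context vector over the stored vocabulary.
context_vec = np.zeros((1, len(classifier.vocab)))
for lemma in [u"boom", u"blad"]:              # illustrative context lemmas
    if lemma in classifier.vocab:
        context_vec[0, classifier.vocab[lemma]] += 1

scores = classifier.score(u"trunk/n", context_vec)   # {} if no model for this lempos
for target_lempos, score in sorted(scores.items(), key=lambda kv: -kv[1]):
    print target_lempos, score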
Example #2
 def test_disambiguator_store(self):
     # Create a silly classifier that disambiguates between "stam" (tree
     # trunk) or "romp" (body trunk) as the Dutch translation of the
     # English noun "trunk"
     lempos = u"trunk/n"
     # FIXME: store_fit() should only accept unicode strings
     target_names = u"stam romp".encode("utf-8").split()
     vocab = u"boom hoofd".split()
     
     X = np.array([[0,1],
                   [1,0],
                   [0,1],
                   [1,0]])
     y = np.array([1,0,1,0])
     
     estimator = NearestCentroid()
     estimator.fit(X, y)
     
     centroids = estimator.centroids_
     score = estimator.score(X, y)
     
     # Store estimator
     fname = tempfile.NamedTemporaryFile().name
     f = DisambiguatorStore(fname, "w")
     f.save_estimator(NearestCentroid())
     f.save_vocab(vocab)
     f.store_fit(lempos, estimator)
     f.save_target_names(lempos, target_names)
     f.close()
     
     # Restore estimator    
     f2 = DisambiguatorStore(fname) 
     estimator2 = f2.load_estimator()
     vocab2 = f2.load_vocab()
     f2.restore_fit(lempos, estimator2)
     target_names2 = f2.load_target_names(lempos)
     centroids2 = estimator2.centroids_
     score2 = estimator2.score(X, y)
     
     assert_array_equal(centroids, centroids2)
     assert target_names == target_names2
     assert vocab == vocab2
     assert score == score2
Example #3
def trace_nc(graphs_fname, model_fname, n=None, source_pos=[],
             outf=codecs.getwriter('utf8')(sys.stdout)):
    if isinstance(outf, basestring):
        outf = codecs.open(outf, "w", encoding="utf-8")
    graphs = cPickle.load(open(graphs_fname))
    model = DisambiguatorStore(model_fname)
    estimator = model.load_estimator()
    vocab = np.array(model.load_vocab())
    vocab_dict = model.load_vocab(as_dict=True)
    score_attr = "centroid_score"
    
    for graph_count, graph in enumerate(graphs[:n]):
        source_string = graph.source_string()
        outf.write( 100 * "=" + "\n")
        outf.write( u"{}: {}\n".format(graph_count + 1, source_string))
        outf.write( 100 * "=" + "\n\n")
        
        reverse_lookup = make_reverse_lookup(graph)
        source_node_vectors = make_source_node_vectors(graph, vocab_dict)
        source_graph_vector = sp.csr_matrix(source_node_vectors.sum(axis=0))
        
        for sn, node_vec in zip(graph.source_nodes_iter(ordered=True), 
                               source_node_vectors):
            if source_pos and graph.pos(sn) not in source_pos:
                continue
            
            try:
                source_lempos = u" ".join(graph.node[sn]["lex_lempos"])
            except KeyError:
                log.debug(u"(annotated) source lempos {0} not in"
                          u"lexicon\n".format(graph.lempos(sn)))
                continue
            
            try:
                model.restore_fit(source_lempos, estimator)
            except KeyError:
                log.debug(u"no model available for (lexicon) source lempos "
                          u"{0}\n".format(source_lempos))
                continue
            
            context_vec = source_graph_vector - node_vec
            context_vec = context_vec.toarray()
            normalize(context_vec, copy=False)
            
            try:
                mask = model.load_vocab_mask(source_lempos)[:]
            except KeyError:
                local_vocab = vocab
            else:
                context_vec = context_vec[:,mask]
                local_vocab = vocab[mask]
            
            try:
                centroids = estimator.centroids_
            except AttributeError:
                # estimator is a Pipeline; the NearestCentroid is its last step
                centroids = estimator.steps[-1][-1].centroids_
                
            target_lempos_list = model.load_target_names(source_lempos)
                           
            outf.write( 100 * "-" + "\n")
            outf.write( source_lempos + "\n")
            outf.write( 100 * "-" + "\n\n")
            
            for target_lempos, target_centroid in zip(target_lempos_list,
                                                      centroids):
                prod = target_centroid * context_vec
                outf.write( u"==> {:<24} {:1.4f}  {}\n".format(
                    target_lempos, 
                    prod.sum(),
                    prod.sum() * 100 * "X"))
            outf.write("\n")
            
            for target_lempos, target_centroid in zip(target_lempos_list,
                                                      centroids):
                prod = target_centroid * context_vec
                indices = target_centroid.argsort()[::-1]
                
                outf.write(  "\n" + source_string + "\n\n")
                
                outf.write( u"{:<64} ==> {:<24} {:1.4f}  {}\n".format(
                    source_lempos,
                    target_lempos, 
                    prod.sum(),
                    prod.sum() * 100 * "X"))
                
                for i in indices:
                    if prod[0,i] > 0:
                        context_lemma = local_vocab[i]
                        sources = ",".join(reverse_lookup[context_lemma])
                        bar = int(prod[0,i] * 100) * "*"
                        outf.write( u"{:<64} --> {:<24} {:1.4f}  {}\n".format(
                            sources, 
                            context_lemma, 
                            prod[0,i],
                            bar))
                outf.write("\n")
Example #4
def print_centroids(models_fname, lemma=None, pos=None, minimum=0, n=None,
                    outf=codecs.getwriter('utf8')(sys.stdout) ):
    # If used in combination with a feature selector,
    # models must be built using with_vocab_mask=True
    
    # FIXME: messy code below
    models = DisambiguatorStore(models_fname)
    classifier = models.load_estimator()
    full_vocab = np.array(models.load_vocab())
    fits = models.file[models.FITS_PATH]
    line = 78 * "=" + "\n"
    subline = "    " + 74 * "-" + "\n"
    if isinstance(outf, basestring):
        outf = codecs.open(outf, "w", encoding="utf-8")
        
    if lemma:
        if lemma in fits:
            lemma_list = [lemma]
        else:
            lemma_list = []
    else:
        lemma_list = fits
    
    for lemma in lemma_list:
        for lemma_pos in fits[lemma]:
            if not pos or lemma_pos == pos:
                lempos = lemma + u"/" + lemma_pos
                outf.write(line + lempos + "\n" + line)
                models.restore_fit(lempos, classifier)
                
                if isinstance(classifier, NearestCentroid):
                    centroids_ = classifier.centroids_
                else:
                    # NearestCentroid is last item in Pipeline
                    nc = classifier.steps[-1][-1]
                    assert isinstance(nc, NearestCentroid)
                    centroids_ = nc.centroids_
                    
                target_names = models.load_target_names(lempos)
                target_n = 0
                
                try:
                    vocab_mask = models.load_vocab_mask(lempos)[:]
                except KeyError:
                    vocab = full_vocab
                else:
                    vocab = full_vocab[vocab_mask]
                    
                
                for target, centroid in zip(target_names, centroids_):
                    target_n += 1
                    outf.write(subline)
                    outf.write(u"    [{}] {} ---> {}\n".format(
                        target_n,
                        lempos,
                        target))
                    outf.write(subline)
                    indices = centroid.argsort().tolist()
                    indices.reverse()
                    
                    for i in indices[:n]:
                        if centroid[i] > minimum:
                            outf.write(u"    {0:>16.8f}    {1:<16}    {2}\n".format(
                                centroid[i],
                                vocab[i],
                                centroid[i] * 100 * "*"))
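
A short, hedged usage sketch for print_centroids above; the models file name and the lemma/POS values are placeholders (mirroring the "trunk/n" lempos used in the test of Example #2).

print_centroids("models.hdf5",
                lemma=u"trunk",    # limit output to this lemma (all lemmas if None)
                pos=u"n",          # limit output to this POS tag
                minimum=0.01,      # skip features with near-zero centroid weight
                n=25)              # print at most 25 features per centroid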