Example #1
class TranslationClassifier(object):
    """
    Class for translation disambiguation using per-lempos disambiguators
    stored in an HDF5 file
    """
    
    def __init__(self, models_fname):
        self.models = DisambiguatorStore(models_fname)
        self.classifier = self.models.load_estimator()
        self.vocab = self.models.load_vocab(as_dict=True)
        
    def score(self, source_lempos, context_vec):
        """
        score translation candidates for a source lempos combination,
        returning a dict mapping target lempos combinations to scores
        """
        try:
            self.models.restore_fit(source_lempos, self.classifier)
        except KeyError:
            log.debug(u"no model available for source lempos " + 
                      source_lempos)
            return {}  
        
        target_names = self.models.load_target_names(source_lempos)
        return self._predict(context_vec, target_names)
    
    def _predict(self, context_vec, target_names):
        """
        return a dict mapping target names to scores
        """
        # FIXME: some estimators have no predict_proba method
        # e.g. NearestCentroid
        preds = self.classifier.predict_proba(context_vec)
        return dict(zip(target_names, preds[0]))
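A minimal usage sketch of the class above (the models file name and context lemmas are hypothetical, and load_vocab(as_dict=True) is assumed to map lemmas to column indices):

import numpy as np

# Hypothetical usage of TranslationClassifier; "models.hdf5" and the context
# lemmas are made up for illustration.
clf = TranslationClassifier("models.hdf5")
context_vec = np.zeros((1, len(clf.vocab)))     # one row, one column per vocabulary lemma
for lemma in (u"boom", u"blad"):                # lemmas observed in the source context
    if lemma in clf.vocab:
        context_vec[0, clf.vocab[lemma]] = 1.0
scores = clf.score(u"trunk/n", context_vec)
# e.g. {u"stam/n": 0.83, u"romp/n": 0.17} -- one probability per target lempos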
Example #2
 def test_disambiguator_store(self):
     # Create a silly classifier that disambiguates between "stam" (tree
     # trunk) and "romp" (body trunk) as the Dutch translation of the
     # English noun "trunk"
     lempos = u"trunk/n"
     # FIXME: store_fit() should only accept unicode strings
     target_names = u"stam romp".encode("utf-8").split()
     vocab = u"boom hoofd".split()
     
     X = np.array([[0,1],
                   [1,0],
                   [0,1],
                   [1,0]])
     y = np.array([1,0,1,0])
     
     estimator = NearestCentroid()
     estimator.fit(X, y)
     
     centroids = estimator.centroids_
     score = estimator.score(X, y)
     
     # Store estimator
     fname = tempfile.NamedTemporaryFile().name
     f = DisambiguatorStore(fname, "w")
     f.save_estimator(NearestCentroid())
     f.save_vocab(vocab)
     f.store_fit(lempos, estimator)
     f.save_target_names(lempos, target_names)
     f.close()
     
     # Restore estimator    
     f2 = DisambiguatorStore(fname) 
     estimator2 = f2.load_estimator()
     vocab2 = f2.load_vocab()
     f2.restore_fit(lempos, estimator2)
     target_names2 = f2.load_target_names(lempos)
     centroids2 = estimator2.centroids_
     score2 = estimator2.score(X, y)
     
     assert_array_equal(centroids, centroids2)
     assert target_names == target_names2
     assert vocab == vocab2
     assert score == score2
Example #3
def trace_nc(graphs_fname, model_fname, n=None, source_pos=[],
             outf=codecs.getwriter('utf8')(sys.stdout)):
    if isinstance(outf, basestring):
        outf = codecs.open(outf, "w", encoding="utf-8")
    graphs = cPickle.load(open(graphs_fname))
    model = DisambiguatorStore(model_fname)
    estimator = model.load_estimator()
    vocab = np.array(model.load_vocab())
    vocab_dict = model.load_vocab(as_dict=True)
    score_attr = "centroid_score"
    
    for graph_count, graph in enumerate(graphs[:n]):
        source_string = graph.source_string()
        outf.write( 100 * "=" + "\n")
        outf.write( u"{}: {}\n".format(graph_count + 1, source_string))
        outf.write( 100 * "=" + "\n\n")
        
        reverse_lookup = make_reverse_lookup(graph)
        source_node_vectors = make_source_node_vectors(graph, vocab_dict)
        source_graph_vector = sp.csr_matrix(source_node_vectors.sum(axis=0))
        
        for sn, node_vec in zip(graph.source_nodes_iter(ordered=True), 
                               source_node_vectors):
            if source_pos and graph.pos(sn) not in source_pos:
                continue
            
            try:
                source_lempos = u" ".join(graph.node[sn]["lex_lempos"])
            except KeyError:
                log.debug(u"(annotated) source lempos {0} not in"
                          u"lexicon\n".format(graph.lempos(sn)))
                continue
            
            try:
                model.restore_fit(source_lempos, estimator)
            except KeyError:
                log.debug(u"no model available for (lexicon) source lempos "
                          u"{0}\n".format(source_lempos))
                continue
            
            context_vec = source_graph_vector - node_vec
            context_vec = context_vec.toarray()
            normalize(context_vec, copy=False)
            
            try:
                mask = model.load_vocab_mask(source_lempos)[:]
            except KeyError:
                local_vocab = vocab
            else:
                context_vec = context_vec[:,mask]
                local_vocab = vocab[mask]
            
            try:
                centroids = estimator.centroids_
            except AttributeError:
                # pipeline
                centroids = estimator.steps[-1][-1].centroids_  
                
            target_lempos_list = model.load_target_names(source_lempos)
                           
            outf.write( 100 * "-" + "\n")
            outf.write( source_lempos + "\n")
            outf.write( 100 * "-" + "\n\n")
            
            for target_lempos, target_centroid in zip(target_lempos_list,
                                                      centroids):
                prod = target_centroid * context_vec
                outf.write( u"==> {:<24} {:1.4f}  {}\n".format(
                    target_lempos, 
                    prod.sum(),
                    int(prod.sum() * 100) * "X"))  # bar length proportional to the score
            outf.write("\n")
            
            for target_lempos, target_centroid in zip(target_lempos_list,
                                                      centroids):
                prod = target_centroid * context_vec
                indices = target_centroid.argsort()[::-1]
                
                outf.write(  "\n" + source_string + "\n\n")
                
                outf.write( u"{:<64} ==> {:<24} {:1.4f}  {}\n".format(
                    source_lempos,
                    target_lempos, 
                    prod.sum(),
                    int(prod.sum() * 100) * "X"))
                
                for i in indices:
                    if prod[0,i] > 0:
                        context_lemma = local_vocab[i]
                        sources = ",".join(reverse_lookup[context_lemma])
                        bar = int(prod[0,i] * 100) * "*"
                        outf.write( u"{:<64} --> {:<24} {:1.4f}  {}\n".format(
                            sources, 
                            context_lemma, 
                            prod[0,i],
                            bar))
                outf.write("\n")
Example #4
class ModelBuilder(object):
    """
    Class for fitting disambiguators on samples and storing them in an HDF5 file.
    
    Parameters
    ----------
    data_generator: DataSetGenerator instance
        Labeled samples generator
    samp_hdf_fname: str
        Name of HDF5 file containing context samples
    models_hdf_fname: str
        Name of HDF5 file for storing disambiguation models
    classifier: classifier instance
        Classifier instance from sklearn, to be fitted for each ambiguous 
        source lempos. Possibly a Pipeline instance.
    counts_fname: str
        Name of pickle file containing lemma counts. If used in combination 
        with a classifier that supports class weights (e.g. SGDClassifier)
        or class priors (e.g. Naive Bayes), classes will be weighted 
        according to their lemma frequency.
    with_vocab_mask: bool, optional
        If true, a vocabulary mask is stored for each model. This is required 
        when the classifier performs feature selection and the vocabulary has
        to be pruned accordingly.
    """
    
    # feature selectors that reduce the vocabulary
    FEATURE_SELECTORS = _BaseFilter, RFE, _LearntSelectorMixin
    
    def __init__(self, data_generator, models_hdf_fname, classifier,
                 counts_fname=None, with_vocab_mask=False, **kwargs):
        self.data_generator = data_generator
        self.models_hdf_fname = models_hdf_fname
        self.classifier = classifier
        self.counts_fname = counts_fname
        self.with_vocab_mask = with_vocab_mask
        
    def run(self):
        """
        build disambiguation models
        """
        self._prepare()
        self._build()
        self._finish()

    def _prepare(self):
        self.start_time = time.time() 
        self.disambiguator_count = 0
        
        log.info("creating models file " + self.models_hdf_fname)
        self.models_hdfile = DisambiguatorStore(self.models_hdf_fname, "w")
        
        self.models_hdfile.save_estimator(self.classifier)
        
        # FIXME: hmm, a bit sneaky...
        self.models_hdfile.copy_vocab(self.data_generator.samp_hdfile)
        
        if self.counts_fname:
            log.info("reading counts from " + self.counts_fname)
            self.counts_dict = cPickle.load(open(self.counts_fname))
        
        if self.with_vocab_mask:
            log.info("storage with vocabulary masks")
            self.vocab = self.models_hdfile.load_vocab()
            
    def _build(self):
        for data_set in self.data_generator:
            if data_set.target_lempos:
                self._build_disambiguator(data_set)
            else:
                log.error("no samples and thus no disambiguation models for " +
                          data_set.source_lempos)
            
    def _build_disambiguator(self, data_set):
        log.info(u"building disambiguator for {} with {} translations".format(
            data_set.source_lempos, len(data_set.target_lempos)))
        
        if self.counts_fname:
            self._set_class_weights(data_set.target_lempos)
        
        try:
            self.classifier.fit(data_set.samples, data_set.targets)  
        except ValueError as error:
            if ( error.args in [
                ("zero-size array to reduction operation maximum which has no "
                 "identity",),
                ("zero-size array to maximum.reduce without identity",),
                ('Invalid threshold: all features are discarded.',)] ):
                # this happens when there are no features selected 
                # e.g. when using SelectFpr
                log.error("No model created, because no features selected!")
                return
            else:
                raise
            
        self.models_hdfile.store_fit(data_set.source_lempos, self.classifier)
        self.models_hdfile.save_target_names(data_set.source_lempos,
                                             data_set.target_lempos)
        if self.with_vocab_mask:        
            self._save_vocab_mask(data_set.source_lempos)
        self.disambiguator_count += 1  
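A hedged sketch of how ModelBuilder might be driven; the data generator, file names and classifier choice below are assumptions for illustration, not part of the original code:

from sklearn.naive_bayes import MultinomialNB

# data_generator is assumed to be a DataSetGenerator instance that yields
# labelled samples per ambiguous source lempos, as the docstring describes.
builder = ModelBuilder(data_generator,
                       "models.hdf5",              # hypothetical output HDF5 file
                       MultinomialNB(),            # a classifier with class priors
                       counts_fname="counts.pkl",  # hypothetical pickled lemma counts
                       with_vocab_mask=False)
builder.run()   # runs _prepare(), _build() and _finish()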
Example #5
def print_centroids(models_fname, lemma=None, pos=None, minimum=0, n=None,
                    outf=codecs.getwriter('utf8')(sys.stdout) ):
    # If used in combination with a feature selector,
    # models must be built using with_vocab_mask=True
    
    # FIXME: messy code below
    models = DisambiguatorStore(models_fname)
    classifier = models.load_estimator()
    full_vocab = np.array(models.load_vocab())
    fits = models.file[models.FITS_PATH]
    line = 78 * "=" + "\n"
    subline = "    " + 74 * "-" + "\n"
    if isinstance(outf, basestring):
        outf = codecs.open(outf, "w", encoding="utf-8")
        
    if lemma:
        if lemma in fits:
            lemma_list = [lemma]
        else:
            lemma_list = []
    else:
        lemma_list = fits
    
    for lemma in lemma_list:
        for lemma_pos in fits[lemma]:
            if not pos or lemma_pos == pos:
                lempos = lemma + u"/" + lemma_pos
                outf.write(line + lempos + "\n" + line)
                models.restore_fit(lempos, classifier)
                
                if isinstance(classifier, NearestCentroid):
                    centroids_ = classifier.centroids_
                else:
                    # NearestCentroid is last item in Pipeline
                    nc = classifier.steps[-1][-1]
                    assert isinstance(nc, NearestCentroid)
                    centroids_ = nc.centroids_
                    
                target_names = models.load_target_names(lempos)
                target_n = 0
                
                try:
                    vocab_mask = models.load_vocab_mask(lempos)[:]
                except KeyError:  # no vocab mask stored for this lempos
                    vocab = full_vocab
                else:
                    vocab = full_vocab[vocab_mask]
                    
                
                for target, centroid in zip(target_names, centroids_):
                    target_n += 1
                    outf.write(subline)
                    outf.write(u"    [{}] {} ---> {}\n".format(
                        target_n,
                        lempos,
                        target))
                    outf.write(subline)
                    indices = centroid.argsort().tolist()
                    indices.reverse()
                    
                    for i in indices[:n]:
                        if centroid[i] > minimum:
                            outf.write(u"    {0:>16.8f}    {1:<16}    {2}\n".format(
                                centroid[i],
                                vocab[i],
                                int(centroid[i] * 100) * "*"))
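A usage sketch for print_centroids (the models file, lemma and thresholds are hypothetical):

# Hypothetical call: write the 20 largest centroid weights above 0.01 for the
# models fitted for the lempos "trunk/n".
print_centroids("models.hdf5", lemma=u"trunk", pos=u"n", minimum=0.01, n=20)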