Example #1
 def test_disambiguator_store(self):
     # Create a silly classifier that disambiguates between "stam" (tree
     # trunk) or "romp" (body trunk) as the Dutch translation of the
     # English noun "trunk"
     lempos = u"trunk/n"
     # FIXME: store_fit() should only accept unicode strings
     target_names = u"stam romp".encode("utf-8").split()
     vocab = u"boom hoofd".split()
     
     # Four bag-of-words contexts over vocab [boom, hoofd]: "boom" (tree)
     # marks class 0 ("stam"), "hoofd" (head) marks class 1 ("romp")
     X = np.array([[0,1],
                   [1,0],
                   [0,1],
                   [1,0]])
     y = np.array([1,0,1,0])
     
     estimator = NearestCentroid()
     estimator.fit(X, y)
     
     centroids = estimator.centroids_
     score = estimator.score(X, y)
     
     # Store estimator in a temporary HDF5 file (only the generated name
     # is used; the NamedTemporaryFile object itself is discarded)
     fname = tempfile.NamedTemporaryFile().name
     f = DisambiguatorStore(fname, "w")
     # an unfitted estimator is saved once; fitted parameters are stored
     # separately per lempos by store_fit()
     f.save_estimator(NearestCentroid())
     f.save_vocab(vocab)
     f.store_fit(lempos, estimator)
     f.save_target_names(lempos, target_names)
     f.close()
     
     # Restore estimator
     f2 = DisambiguatorStore(fname)
     estimator2 = f2.load_estimator()
     vocab2 = f2.load_vocab()
     f2.restore_fit(lempos, estimator2)
     target_names2 = f2.load_target_names(lempos)
     centroids2 = estimator2.centroids_
     score2 = estimator2.score(X, y)
     
     assert_array_equal(centroids, centroids2)
     assert target_names == target_names2
     assert vocab == vocab2
     assert score == score2
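
For context, a hedged sketch of prediction-time use of the same store; it reuses only the DisambiguatorStore calls exercised by the test above, and the file name and context vector are illustrative:

import numpy as np

# Hypothetical prediction-time round-trip, mirroring the test above
# (DisambiguatorStore assumed importable from the surrounding project)
store = DisambiguatorStore("models.hdf5")            # illustrative file name
estimator = store.load_estimator()                   # unfitted estimator
store.restore_fit(u"trunk/n", estimator)             # reload fitted parameters
target_names = store.load_target_names(u"trunk/n")
store.close()

# One bag-of-words context over the stored vocab [boom, hoofd]:
# "hoofd" (head) is present, so the expected prediction is "romp"
context = np.array([[0, 1]])
prediction = estimator.predict(context)[0]
print target_names[prediction]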
Example #2
class ModelBuilder(object):
    """
    Class for fitting disambiguators on samples and storing them in HDF5 file.
    
    Parameters
    ----------
    data_generator: DataSetGenerator instance
        Labeled samples generator
    models_hdf_fname: str
        Name of HDF5 file for storing disambiguation models
    classifier: classifier instance
        Classifier instance from sklearn, to be fitted for each ambiguous 
        source lempos. Possibly a Pipeline instance.
    counts_fname: str
        Name of pickle file containing lemma counts. If used in combination 
        with a classifier that supports class weights (e.g. SGDClassifier)
        or class priors (e.g. Naive Bayes), classes will be weighted 
        according to their lemma frequency.
    with_vocab_mask: bool, optional
        If True, a vocabulary mask is stored for each model. This is required
        when the classifier performs feature selection and the vocabulary has
        to be pruned accordingly.
    """
    
    # feature selectors that reduce the vocabulary
    FEATURE_SELECTORS = _BaseFilter, RFE, _LearntSelectorMixin
    
    def __init__(self, data_generator, models_hdf_fname, classifier,
                 counts_fname=None, with_vocab_mask=False, **kwargs):
        self.data_generator = data_generator
        self.models_hdf_fname = models_hdf_fname
        self.classifier = classifier
        self.counts_fname = counts_fname
        self.with_vocab_mask = with_vocab_mask
        
    def run(self):
        """
        Build disambiguation models.
        """
        self._prepare()
        self._build()
        self._finish()

    def _prepare(self):
        self.start_time = time.time() 
        self.disambiguator_count = 0
        
        log.info("creating models file " + self.models_hdf_fname)
        self.models_hdfile = DisambiguatorStore(self.models_hdf_fname, "w")
        
        # the unfitted classifier is stored once; fitted parameters are
        # added later per lempos by store_fit()
        self.models_hdfile.save_estimator(self.classifier)
        
        # FIXME: reaching into the data generator's sample file is a bit sneaky
        self.models_hdfile.copy_vocab(self.data_generator.samp_hdfile)
        
        if self.counts_fname:
            log.info("reading counts from " + self.counts_fname)
            with open(self.counts_fname, "rb") as counts_file:
                self.counts_dict = cPickle.load(counts_file)
        
        if self.with_vocab_mask:
            log.info("storage with vocabulary masks")
            self.vocab = self.models_hdfile.load_vocab()
            
    def _build(self):
        for data_set in self.data_generator:
            if data_set.target_lempos:
                self._build_disambiguator(data_set)
            else:
                log.error("no samples and thus no disambiguation models for " +
                          data_set.source_lempos)
            
    def _build_disambiguator(self, data_set):
        log.info(u"building disambiguator for {} with {} translations".format(
            data_set.source_lempos, len(data_set.target_lempos)))
        
        if self.counts_fname:
            self._set_class_weights(data_set.target_lempos)
        
        try:
            self.classifier.fit(data_set.samples, data_set.targets)  
        except ValueError as error:
            if error.args in [
                ("zero-size array to reduction operation maximum which has no "
                 "identity",),
                ("zero-size array to maximum.reduce without identity",),
                ("Invalid threshold: all features are discarded.",)]:
                # this happens when no features survive selection,
                # e.g. when using SelectFpr
                log.error("No model created, because no features selected!")
                return
            else:
                # bare raise preserves the original traceback
                raise
            
        self.models_hdfile.store_fit(data_set.source_lempos, self.classifier)
        self.models_hdfile.save_target_names(data_set.source_lempos,
                                             data_set.target_lempos)
        if self.with_vocab_mask:        
            self._save_vocab_mask(data_set.source_lempos)
        self.disambiguator_count += 1
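
A minimal usage sketch for ModelBuilder, assuming the surrounding project's DataSetGenerator (its constructor arguments are not shown in this excerpt, so the call below is hypothetical); the Pipeline pairs a feature selector with a classifier that supports class priors, so both counts_fname and with_vocab_mask apply:

import logging

from sklearn.feature_selection import SelectFpr, chi2
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline

logging.basicConfig(level=logging.INFO)

# Feature selection (which may discard all features, triggering the
# ValueError handled in _build_disambiguator) followed by naive Bayes
classifier = Pipeline([("select", SelectFpr(chi2)),
                       ("classify", MultinomialNB())])

# Hypothetical call; the real DataSetGenerator signature is not shown here
data_generator = DataSetGenerator("samples.hdf5")

builder = ModelBuilder(data_generator,
                       models_hdf_fname="models.hdf5",
                       classifier=classifier,
                       counts_fname="lemma_counts.pkl",  # enables class weighting
                       with_vocab_mask=True)             # SelectFpr prunes the vocab
builder.run()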