Example #1
def print_freq(samp_fname, lemma=None, pos=None, minimum=0,
               outf=codecs.getwriter('utf8')(sys.stdout)):
    hdfile = h5py.File(samp_fname, "r")
    vocab = [t.decode("utf-8") for t in hdfile["vocab"][()]]
    line = 78 * "=" + "\n"
    
    if lemma:
        if lemma in hdfile["samples"]:
            lemma_list = [lemma]
        else:
            lemma_list = []
    else:
        lemma_list = hdfile["samples"]
    
    for lemma in lemma_list:
        for lemma_pos in hdfile["samples"][lemma]:
            if not pos or lemma_pos == pos:
                outf.write(line + lemma + u"/" + lemma_pos + "\n" + line)
                group = hdfile["samples"][lemma][lemma_pos]
                # read the counts as floats rather than the stored small integer
                # dtype, so that calling .sum() below does not overflow
                sample_mat = coo_matrix_from_hdf5(group, dtype="f")
                sample_mat = sample_mat.tocsr()
                sums = sample_mat.sum(axis=0)
                total = float(sums.sum())
                indices = sums.argsort()
                indices = indices[0,:].tolist()[0]
                indices.reverse()
                
                for i in indices:
                    if sums[0,i] > minimum:
                        outf.write(u"{0:>16.8}{1:>16.8f}%     {2}\n".format(
                            sums[0,i],
                            (100 * sums[0,i]) / total,
                            vocab[i]))
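
A minimal usage sketch for print_freq; the samples file name and the lemma below are purely hypothetical and only illustrate the call signature:

# print context-term frequencies for one hypothetical lemma/POS group
print_freq("de-en_samples.hdf5", lemma=u"Teller", pos="n", minimum=5)

# without a lemma, frequencies are printed for every lemma/POS group in the file
print_freq("de-en_samples.hdf5", minimum=10)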
Example #2
def print_samples(samp_fname, lemma=None, pos=None, outf=codecs.getwriter('utf8')(sys.stdout)):
    hdfile = h5py.File(samp_fname, "r")
    vocab = [t.decode("utf-8") for t in hdfile["vocab"][()]]
    line = 78 * "=" + "\n"
    
    if lemma:
        if lemma in hdfile["samples"]:
            lemma_list = [lemma]
        else:
            lemma_list = []
    else:
        lemma_list = hdfile["samples"]
    
    for lemma in lemma_list:
        for lemma_pos in hdfile["samples"][lemma]:
            if not pos or lemma_pos == pos:
                outf.write(line + lemma + u"/" + lemma_pos + "\n" + line)
                group = hdfile["samples"][lemma][lemma_pos]
                sample_mat = coo_matrix_from_hdf5(group)
                sample_mat = sample_mat.tocsr()
                for count, row in enumerate(sample_mat):
                    outf.write(u"{0:<9d}: ".format(count + 1))
                    # use a separate name so the enumerate() counter above is not shadowed
                    pairs = [ (vocab[j], freq)
                              for j, freq in zip(row.indices, row.data) ]
                    pairs.sort()
                    string = u", ".join(u"{}:{}".format(*p) for p in pairs)
                    outf.write(string + u"\n")
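
The same kind of sketch applies to print_samples; the file name and lemma are again hypothetical:

# dump the raw context-term counts of every sample row for one lemma/POS group
print_samples("de-en_samples.hdf5", lemma=u"Teller", pos="n")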
Example #3
def extend_samples(samp_hdf_fname, tdict_pkl_fname, reverse_tdict_pkl_fname,
                   ext_hdf_fname, max_samp=None):
    log.info("opening original samples file " + samp_hdf_fname)
    samp_hdfile = h5py.File(samp_hdf_fname, "r") 
    
    ext_mat = make_extension_matrix(samp_hdfile, tdict_pkl_fname, reverse_tdict_pkl_fname)
    
    log.info("creating extended samples file " + ext_hdf_fname)
    ext_hdfile = h5py.File(ext_hdf_fname, "w") 
    ext_samples = ext_hdfile.create_group("samples") 
    
    log.info("copying vocabulary ({0} terms)".format(len(samp_hdfile["vocab"])))
    ext_hdfile.create_dataset("vocab", data=samp_hdfile["vocab"])
    i = 0
    
    for lemma, lemma_group in samp_hdfile["samples"].iteritems():
        for pos, pos_group in lemma_group.iteritems():
            log.info(u"{0}: creating extended samples for {1}/{2}".format(i, lemma,pos))
            samp_mat = coo_matrix_from_hdf5(pos_group).tocsr()
            mat = (samp_mat * ext_mat).tocoo()
            group = ext_hdfile.create_group(u"samples/{0}/{1}".format(lemma, pos))
            coo_matrix_to_hdf5(mat, group, data_dtype="i1", compression="gzip")
            
            i += 1
            if i == max_samp:
                log.info("reached maximum number of samples")
                break
        if i == max_samp:
            break
    
    log.info("closing " + samp_hdf_fname)
    samp_hdfile.close()          

    log.info("closing " + ext_hdf_fname)
    ext_hdfile.close()          
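
A hedged example of how extend_samples might be invoked; the pickled translation dictionaries and HDF5 file names are placeholders, not files shipped with the snippets above:

# extend the original samples with translation-dictionary context,
# stopping after the first 100 lemma/POS groups
extend_samples("de-en_samples.hdf5",
               "de-en_tdict.pkl",
               "en-de_tdict.pkl",
               "de-en_samples_ext.hdf5",
               max_samp=100)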
Example #4
    def test_translation_classifier(self):
        models_hdf_fname = config["test_data_dir"] + "/de-en_models.hdf5_"

        # make a translation classifier that uses this model
        trans_clf = TranslationClassifier(models_hdf_fname)

        # load a couple of vectors from the samples (i.e. the training
        # material) to test the translation classifier
        f = h5py.File(config["test_data_dir"] + "/de-en_samples.hdf5_", "r")
        source_lempos = "Teller/n"
        targets = "basket/n dial/n disc/n dish/n disk/n plate/n".split()

        for target_lempos in targets:
            log.info(u"True translation = " + target_lempos)

            m = coo_matrix_from_hdf5(f["/samples/" + target_lempos])
            m = m.tocsr()

            for vector in m[:10]:
                scores = trans_clf.score(source_lempos, vector)
                best = sorted(scores.items(), key=operator.itemgetter(1))[-1]
                log.info(u"Predicted translation = {} (P={})".format(*best))
Example #5
def make_new_samples(sample_hdfile, filtered_hdfile, columns_selector):
    org_samples = sample_hdfile["samples"]
    filtered_samples = filtered_hdfile.create_group("samples")

    for lemma, lemma_group in org_samples.iteritems():
        for pos, pos_group in lemma_group.iteritems():
            lempos = lemma + u"/" + pos
            log.info("adding filtered samples for " + lempos)
            sample_mat = coo_matrix_from_hdf5(pos_group)
            sample_mat = sample_mat.tocsc()
            # select only columns corresponding to filtered vocabulary,
            # removing other columns
            sample_mat = sample_mat[:, columns_selector]
            # get indices of non-empty rows
            sample_mat = sample_mat.tolil()
            rows_selector = sample_mat.rows.nonzero()[0]
            # select only non-empty rows, removing empty rows
            sample_mat = sample_mat.tocsr()
            sample_mat = sample_mat[rows_selector]
            sample_mat = sample_mat.tocoo()
            filtered_group = filtered_samples.create_group(lempos)
            coo_matrix_to_hdf5(sample_mat, filtered_group, data_dtype="=i1", compression="gzip")
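
A sketch of how make_new_samples might be driven, under the assumption that columns_selector is an integer index array over the original vocabulary columns; the file names and the particular filter are illustrative only:

import numpy as np
import h5py

sample_hdfile = h5py.File("de-en_samples.hdf5", "r")
filtered_hdfile = h5py.File("de-en_samples_filtered.hdf5", "w")

# as an arbitrary example, keep only the first half of the vocabulary columns,
# copying the matching vocabulary entries into the filtered file as well
vocab = sample_hdfile["vocab"][()]
columns_selector = np.arange(len(vocab) // 2)
filtered_hdfile.create_dataset("vocab", data=vocab[columns_selector])

make_new_samples(sample_hdfile, filtered_hdfile, columns_selector)

sample_hdfile.close()
filtered_hdfile.close()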
Example #6
def make_models(tab_fname, samp_hdf_fname, models_hdf_fname, classifier,
                save_classifier_func=save_nb_classifier_to_hdf5, counts_pkl_fname=None,
                max_models=None, source_lempos_subset=None, vocab_i=None, vocab_j=None):
    start_time = time.time() 
    log.info("opening samples file " + samp_hdf_fname)
    sample_hdfile = h5py.File(samp_hdf_fname, "r")
    samples = sample_hdfile["samples"]
    
    if counts_pkl_fname:
        log.info("reading counts from " + counts_pkl_fname)
        counts_dict = cPickle.load(open(counts_pkl_fname, "rb"))
    else:
        class_prior = None

    log.info("creating models file " + models_hdf_fname)
    models_hdfile = h5py.File(models_hdf_fname, "w")
    models = models_hdfile.create_group("models")
    # Pickle classifier and include in hdf5 file.
    # This saves the parameters from __init__.
    # This is before a call to fit(), so class_log_prior_ and 
    # feature_log_prob_ are excluded.
    # Loading this pickled object requires its class (e.g. MultinomialNB)
    # to be part of the current namespace.
    # An alternative is to use the _get_params() and set_params() methods
    # from the BaseEstimator class.
    log.info("saving classifier {0}".format(classifier))
    models["classifier_pickle"] = cPickle.dumps(classifier) 
    
    log.info("copying vocabulary ({0} terms)".format(len(sample_hdfile["vocab"])))
    # create new type for variable-length strings
    # see http://code.google.com/p/h5py/wiki/HowTo#Variable-length_strings
    str_type = h5py.new_vlen(str)
    models_hdfile.create_dataset("vocab", data=sample_hdfile["vocab"][vocab_i:vocab_j], 
                                 dtype=str_type)
    
    prev_source_lempos = None
    models_count = 0
    
    for line in codecs.open(tab_fname, encoding="utf8"):
        if models_count == max_models:
            log.info("reached max number of models")
            break
        
        source_label, target_label = line.rstrip().split("\t")[1:3]
        # strip corpus POS tag
        source_lempos = source_label.rsplit("/", 1)[0]
        target_lempos = target_label.rsplit("/", 1)[0]
        
        if source_lempos_subset and source_lempos not in source_lempos_subset:
            log.debug(u"skipping model for {} -> {}".format(source_lempos,
                                                           target_lempos))
            continue

        try:
            samp_group = samples[target_lempos]
        except KeyError:
            log.warning("found no sample for " + target_lempos)
            continue
                
        sm = coo_matrix_from_hdf5(samp_group)        
        
        # hdf5 cannot store an array of unicode strings, so use byte strings
        # for label names
        target_lempos = target_lempos.encode("utf-8")
        
        if prev_source_lempos == source_lempos:
            if target_lempos in target_names:
                # this is due to an old bug in the code that finds
                # translation ambiguities in the lexicon - test becomes
                # redundant in the future
                log.warn(u"skipping duplicate target lempos " + target_lempos.decode("utf-8"))
            else:
                data = sp.vstack([data, sm])
                target_count += 1
                target_names.append(target_lempos)
                # concat new targets depending on number of instances
                new_targets = np.zeros((sm.shape[0],)) + target_count
                targets = np.hstack((targets, new_targets))
        else:
            if prev_source_lempos and target_count:
                data = data.tocsr()[:, vocab_i:vocab_j]
                log.debug(u"fitting classifer for {} with {} targets on {} instances with {} features".format(
                    prev_source_lempos, len(target_names), data.shape[0], data.shape[1]))
                
                if counts_pkl_fname:
                    class_prior = get_class_priors(counts_dict, target_names)
                    # convert to list to prevent an error message from scikit-learn
                    class_prior = list(class_prior)

                classifier.fit(data, targets, class_prior=class_prior)
                    
                class_group = models.create_group(prev_source_lempos)
                log.info("saving classifier model for " + prev_source_lempos)
                save_classifier_func(class_group, classifier)
                class_group.create_dataset("target_names", data=target_names)
                models_count += 1
                
            # init data for new model
            data = sm
            targets = np.zeros((sm.shape[0],))
            target_count = 0
            target_names = [target_lempos]
            
        prev_source_lempos = source_lempos
    
    log.info("saved {} models".format(models_count))
    log.info("closing models file " + models_hdf_fname)    
    models_hdfile.close()
    size = os.path.getsize(models_hdf_fname) / float(1024.0 ** 2)
    elapsed_time = time.time() - start_time
    log.info("elapsed time: {0}".format(datetime.timedelta(seconds=elapsed_time)))
    log.info("average time per model: {0}".format(
        datetime.timedelta(seconds=elapsed_time/float(models_count))))
    log.info("models file size: {0:.2f} MB".format(size))
    log.info("average model size: {:.2f} MB".format(size / float(models_count)))
    sample_hdfile.close()
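
Finally, a hedged sketch of a make_models call; the file names and the vocabulary slice are illustrative, and it assumes an older scikit-learn release whose MultinomialNB.fit still accepts the class_prior keyword used above:

from sklearn.naive_bayes import MultinomialNB

# build at most 10 models, one per ambiguous source lempos in the tab file,
# restricting the feature space to the first 50000 vocabulary columns
make_models("de-en_ambig.tab",
            "de-en_samples.hdf5",
            "de-en_models.hdf5",
            classifier=MultinomialNB(),
            max_models=10,
            vocab_i=0,
            vocab_j=50000)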