# Imports assumed by the functions below. The project-internal helpers
# (coo_matrix_from_hdf5, coo_matrix_to_hdf5, make_extension_matrix,
# TranslationClassifier, save_nb_classifier_to_hdf5, get_class_priors, config)
# are expected to come from the project's own modules.
import codecs
import cPickle
import datetime
import logging
import operator
import os
import sys
import time

import h5py
import numpy as np
import scipy.sparse as sp

# module-level logger
log = logging.getLogger(__name__)


def print_freq(samp_fname, lemma=None, pos=None, minimum=0,
               outf=codecs.getwriter('utf8')(sys.stdout)):
    hdfile = h5py.File(samp_fname, "r")
    vocab = [t.decode("utf-8") for t in hdfile["vocab"][()]]
    line = 78 * "=" + "\n"

    if lemma:
        if lemma in hdfile["samples"]:
            lemma_list = [lemma]
        else:
            lemma_list = []
    else:
        lemma_list = hdfile["samples"]

    for lemma in lemma_list:
        for lemma_pos in hdfile["samples"][lemma]:
            if not pos or lemma_pos == pos:
                outf.write(line + lemma + u"/" + lemma_pos + "\n" + line)
                group = hdfile["samples"][lemma][lemma_pos]
                # use a dtype wider than the stored integer counts ("f"),
                # because calling .sum on a narrow integer dtype can overflow
                sample_mat = coo_matrix_from_hdf5(group, dtype="f")
                sample_mat = sample_mat.tocsr()
                sums = sample_mat.sum(axis=0)
                total = float(sums.sum())
                indices = sums.argsort()
                indices = indices[0, :].tolist()[0]
                indices.reverse()

                for i in indices:
                    if sums[0, i] > minimum:
                        outf.write(u"{0:>16.8}{1:>16.8f}% {2}\n".format(
                            sums[0, i],
                            (100 * sums[0, i]) / total,
                            vocab[i]))
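
# The reader functions in this module rely on coo_matrix_from_hdf5, which is
# defined elsewhere in the project. A minimal sketch of what such a helper
# could look like is given here for illustration only; it assumes the HDF5
# group stores "data", "row" and "col" datasets plus a "shape" attribute,
# which may differ from the actual on-disk layout.
def coo_matrix_from_hdf5_sketch(group, dtype=None):
    from scipy.sparse import coo_matrix
    data = group["data"][()]
    if dtype:
        data = data.astype(dtype)
    return coo_matrix((data, (group["row"][()], group["col"][()])),
                      shape=tuple(group.attrs["shape"]))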
def print_samples(samp_fname, lemma=None, pos=None,
                  outf=codecs.getwriter('utf8')(sys.stdout)):
    hdfile = h5py.File(samp_fname, "r")
    vocab = [t.decode("utf-8") for t in hdfile["vocab"][()]]
    line = 78 * "=" + "\n"

    if lemma:
        if lemma in hdfile["samples"]:
            lemma_list = [lemma]
        else:
            lemma_list = []
    else:
        lemma_list = hdfile["samples"]

    for lemma in lemma_list:
        for lemma_pos in hdfile["samples"][lemma]:
            if not pos or lemma_pos == pos:
                outf.write(line + lemma + u"/" + lemma_pos + "\n" + line)
                group = hdfile["samples"][lemma][lemma_pos]
                sample_mat = coo_matrix_from_hdf5(group)
                sample_mat = sample_mat.tocsr()

                for count, row in enumerate(sample_mat):
                    outf.write(u"{0:<9d}: ".format(count + 1))
                    # map each non-zero column to its vocabulary term and
                    # frequency
                    pairs = [(vocab[j], freq)
                             for j, freq in zip(row.indices, row.data)]
                    pairs.sort()
                    string = u", ".join(u"{}:{}".format(*p) for p in pairs)
                    outf.write(string + u"\n")
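
# Example of how the two printing functions above might be called; the sample
# file name and the lemma/POS pair are hypothetical and only illustrate the
# expected call signatures.
def _demo_print(samp_fname="de-en_samples.hdf5"):
    print_freq(samp_fname, lemma="Teller", pos="n", minimum=5)
    print_samples(samp_fname, lemma="Teller", pos="n")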
def extend_samples(samp_hdf_fname, tdict_pkl_fname, reverse_tdict_pkl_fname,
                   ext_hdf_fname, max_samp=None):
    log.info("opening original samples file " + samp_hdf_fname)
    samp_hdfile = h5py.File(samp_hdf_fname, "r")

    ext_mat = make_extension_matrix(samp_hdfile, tdict_pkl_fname,
                                    reverse_tdict_pkl_fname)

    log.info("creating extended samples file " + ext_hdf_fname)
    ext_hdfile = h5py.File(ext_hdf_fname, "w")
    ext_samples = ext_hdfile.create_group("samples")

    log.info("copying vocabulary ({0} terms)".format(len(samp_hdfile["vocab"])))
    ext_hdfile.create_dataset("vocab", data=samp_hdfile["vocab"])

    i = 0

    for lemma, lemma_group in samp_hdfile["samples"].iteritems():
        for pos, pos_group in lemma_group.iteritems():
            log.info(u"{0}: creating extended samples for {1}/{2}".format(
                i, lemma, pos))
            samp_mat = coo_matrix_from_hdf5(pos_group).tocsr()
            mat = (samp_mat * ext_mat).tocoo()
            group = ext_hdfile.create_group(u"samples/{0}/{1}".format(lemma, pos))
            coo_matrix_to_hdf5(mat, group, data_dtype="i1", compression="gzip")

            i += 1
            if i == max_samp:
                log.info("reached maximum number of samples")
                break

        if i == max_samp:
            break

    log.info("closing " + samp_hdf_fname)
    samp_hdfile.close()
    log.info("closing " + ext_hdf_fname)
    ext_hdfile.close()
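
# extend_samples and make_new_samples store matrices through
# coo_matrix_to_hdf5, which is likewise defined elsewhere in the project. A
# sketch of a possible counterpart to the reader sketch above, under the same
# assumed layout of "data"/"row"/"col" datasets and a "shape" attribute:
def coo_matrix_to_hdf5_sketch(mat, group, data_dtype=None, compression=None):
    group.create_dataset("data", data=mat.data, dtype=data_dtype,
                         compression=compression)
    group.create_dataset("row", data=mat.row, compression=compression)
    group.create_dataset("col", data=mat.col, compression=compression)
    group.attrs["shape"] = mat.shape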
def test_translation_classifier(self):
    models_hdf_fname = config["test_data_dir"] + "/de-en_models.hdf5_"

    # make a translation classifier that uses this model
    trans_clf = TranslationClassifier(models_hdf_fname)

    # load a couple of vectors from the samples (i.e. the training
    # material) to test the translation classifier
    f = h5py.File(config["test_data_dir"] + "/de-en_samples.hdf5_", "r")
    source_lempos = "Teller/n"
    targets = "basket/n dial/n disc/n dish/n disk/n plate/n".split()

    for target_lempos in targets:
        log.info(u"True translation = " + target_lempos)
        m = coo_matrix_from_hdf5(f["/samples/" + target_lempos])
        m = m.tocsr()

        for vector in m[:10]:
            scores = trans_clf.score(source_lempos, vector)
            best = sorted(scores.items(), key=operator.itemgetter(1))[-1]
            log.info(u"Predicted translation = {} (P={})".format(*best))
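
# The test above exercises TranslationClassifier.score(source_lempos, vector),
# which returns a mapping from target lempos to a score. Given the models file
# layout written by make_models below (one group per source lempos, holding a
# fitted classifier plus a "target_names" dataset), a score method could
# plausibly look like this sketch; load_classifier_from_hdf5 is a hypothetical
# loader counterpart, and the real implementation elsewhere in the project may
# differ.
def _score_sketch(models_hdfile, source_lempos, vector):
    group = models_hdfile["models"][source_lempos]
    clf = load_classifier_from_hdf5(group)  # hypothetical loader
    probs = clf.predict_proba(vector)[0]
    return dict(zip(group["target_names"][()], probs))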
def make_new_samples(sample_hdfile, filtered_hdfile, columns_selector):
    org_samples = sample_hdfile["samples"]
    filtered_samples = filtered_hdfile.create_group("samples")

    for lemma, lemma_group in org_samples.iteritems():
        for pos, pos_group in lemma_group.iteritems():
            lempos = lemma + u"/" + pos
            log.info("adding filtered samples for " + lempos)
            sample_mat = coo_matrix_from_hdf5(pos_group)
            sample_mat = sample_mat.tocsc()
            # select only columns corresponding to filtered vocabulary,
            # removing other columns
            sample_mat = sample_mat[:, columns_selector]
            # get indices of non-empty rows
            sample_mat = sample_mat.tolil()
            rows_selector = sample_mat.rows.nonzero()[0]
            # select only non-empty rows, removing empty rows
            sample_mat = sample_mat.tocsr()
            sample_mat = sample_mat[rows_selector]
            sample_mat = sample_mat.tocoo()
            filtered_group = filtered_samples.create_group(lempos)
            coo_matrix_to_hdf5(sample_mat, filtered_group, data_dtype="=i1",
                               compression="gzip")
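
# make_new_samples expects a columns_selector that picks out the columns of
# the retained vocabulary. A minimal sketch of how such a selector might be
# built, assuming the filtered vocabulary is given as a set of terms to keep;
# the variable names here are illustrative only.
def make_columns_selector_sketch(full_vocab, keep_terms):
    # indices of the columns whose terms survive the vocabulary filter
    return np.array([i for i, term in enumerate(full_vocab)
                     if term in keep_terms])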
def make_models(tab_fname, samp_hdf_fname, models_hdf_fname, classifier,
                save_classifier_func=save_nb_classifier_to_hdf5,
                counts_pkl_fname=None, max_models=None,
                source_lempos_subset=None, vocab_i=None, vocab_j=None):
    start_time = time.time()

    log.info("opening samples file " + samp_hdf_fname)
    sample_hdfile = h5py.File(samp_hdf_fname, "r")
    samples = sample_hdfile["samples"]

    if counts_pkl_fname:
        log.info("reading counts from " + counts_pkl_fname)
        counts_dict = cPickle.load(open(counts_pkl_fname))
    else:
        class_prior = None

    log.info("creating models file " + models_hdf_fname)
    models_hdfile = h5py.File(models_hdf_fname, "w")
    models = models_hdfile.create_group("models")

    # Pickle classifier and include in hdf5 file.
    # This saves the parameters from __init__.
    # This is before a call to fit(), so class_log_prior_ and
    # feature_log_prob_ are excluded.
    # Loading this pickled object requires its class (e.g. MultinomialNB)
    # to be part of the current namespace.
    # Alternative is to use the _get_params() and set_params() methods
    # from the BaseEstimator class.
    log.info("saving classifier {0}".format(classifier))
    models["classifier_pickle"] = cPickle.dumps(classifier)

    log.info("copying vocabulary ({0} terms)".format(len(sample_hdfile["vocab"])))
    # create new type for variable-length strings
    # see http://code.google.com/p/h5py/wiki/HowTo#Variable-length_strings
    str_type = h5py.new_vlen(str)
    models_hdfile.create_dataset("vocab",
                                 data=sample_hdfile["vocab"][vocab_i:vocab_j],
                                 dtype=str_type)

    prev_source_lempos = None
    models_count = 0

    for line in codecs.open(tab_fname, encoding="utf8"):
        if models_count == max_models:
            log.info("reached max number of models")
            break

        source_label, target_label = line.rstrip().split("\t")[1:3]
        # strip corpus POS tag
        source_lempos = source_label.rsplit("/", 1)[0]
        target_lempos = target_label.rsplit("/", 1)[0]

        if source_lempos_subset and source_lempos not in source_lempos_subset:
            log.debug(u"skipping model for {} -> {}".format(source_lempos,
                                                            target_lempos))
            continue

        try:
            samp_group = samples[target_lempos]
        except KeyError:
            log.warning("found no sample for " + target_lempos)
            continue

        sm = coo_matrix_from_hdf5(samp_group)
        # hdf5 cannot store an array of unicode strings, so use byte strings
        # for label names
        target_lempos = target_lempos.encode("utf-8")

        if prev_source_lempos == source_lempos:
            if target_lempos in target_names:
                # this is due to an old bug in the code that finds
                # translation ambiguities in the lexicon - test becomes
                # redundant in the future
                log.warn(u"skipping duplicate target lempos " +
                         target_lempos.decode("utf-8"))
            else:
                data = sp.vstack([data, sm])
                target_count += 1
                target_names.append(target_lempos)
                # concat new targets depending on number of instances
                new_targets = np.zeros((sm.shape[0],)) + target_count
                targets = np.hstack((targets, new_targets))
        else:
            if prev_source_lempos and target_count:
                data = data.tocsr()[:, vocab_i:vocab_j]
                log.debug(u"fitting classifier for {} with {} targets on "
                          u"{} instances with {} features".format(
                              prev_source_lempos, len(target_names),
                              data.shape[0], data.shape[1]))

                if counts_pkl_fname:
                    class_prior = get_class_priors(counts_dict, target_names)
                    # convert to list to prevent an error message from
                    # scikit-learn
                    class_prior = list(class_prior)

                classifier.fit(data, targets, class_prior=class_prior)
                class_group = models.create_group(prev_source_lempos)
                log.info("saving classifier model for " + prev_source_lempos)
                save_classifier_func(class_group, classifier)
                class_group.create_dataset("target_names", data=target_names)
                models_count += 1

            # init data for new model
            data = sm
            targets = np.zeros((sm.shape[0],))
            target_count = 0
            target_names = [target_lempos]

        prev_source_lempos = source_lempos

    log.info("saved {} models".format(models_count))
    log.info("closing models file " + models_hdf_fname)
    models_hdfile.close()

    size = os.path.getsize(models_hdf_fname) / float(1024.0 ** 2)
    elapsed_time = time.time() - start_time
    log.info("elapsed time: {0}".format(
        datetime.timedelta(seconds=elapsed_time)))
    log.info("average time per model: {0}".format(
        datetime.timedelta(seconds=elapsed_time / float(models_count))))
    log.info("models file size: {0:.2f} MB".format(size))
    log.info("average model size: {:.2f} MB".format(size / float(models_count)))
    sample_hdfile.close()
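
# make_models delegates persistence of the fitted model to
# save_classifier_func, whose default save_nb_classifier_to_hdf5 is defined
# elsewhere in the project. For a fitted naive Bayes classifier it plausibly
# stores the arrays learned by fit(); a sketch under that assumption:
def save_nb_classifier_to_hdf5_sketch(group, classifier):
    group.create_dataset("class_log_prior_",
                         data=classifier.class_log_prior_,
                         compression="gzip")
    group.create_dataset("feature_log_prob_",
                         data=classifier.feature_log_prob_,
                         compression="gzip")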