def extend_samples(samp_hdf_fname, tdict_pkl_fname, reverse_tdict_pkl_fname,
                   ext_hdf_fname, max_samp=None):
    """
    Create an extended-samples HDF5 file from an original samples file.

    For every lemma/pos group in the original file, the sample matrix is
    multiplied by the extension matrix (built from the translation
    dictionaries) and the result is stored under the same samples/lemma/pos
    path in the new file. The vocabulary dataset is copied verbatim.

    Parameters:
        samp_hdf_fname: filename of the original samples HDF5 file (read)
        tdict_pkl_fname: pickled translation dictionary filename
        reverse_tdict_pkl_fname: pickled reverse translation dict filename
        ext_hdf_fname: filename of the extended samples HDF5 file (created)
        max_samp: optional cap on the number of lemma/pos groups processed;
            None (the default) means process everything
    """
    log.info("opening original samples file " + samp_hdf_fname)
    samp_hdfile = h5py.File(samp_hdf_fname, "r")
    ext_hdfile = None
    try:
        ext_mat = make_extension_matrix(samp_hdfile, tdict_pkl_fname,
                                        reverse_tdict_pkl_fname)
        log.info("creating extended samples file " + ext_hdf_fname)
        ext_hdfile = h5py.File(ext_hdf_fname, "w")
        ext_hdfile.create_group("samples")
        log.info("copying vocabulary ({0} terms)".format(
            len(samp_hdfile["vocab"])))
        ext_hdfile.create_dataset("vocab", data=samp_hdfile["vocab"])
        i = 0
        for lemma, lemma_group in samp_hdfile["samples"].iteritems():
            for pos, pos_group in lemma_group.iteritems():
                log.info(u"{0}: creating extended samples for {1}/{2}".format(
                    i, lemma, pos))
                # extension is a sparse matrix product; csr * ext is efficient
                samp_mat = coo_matrix_from_hdf5(pos_group).tocsr()
                mat = (samp_mat * ext_mat).tocoo()
                group = ext_hdfile.create_group(
                    u"samples/{0}/{1}".format(lemma, pos))
                coo_matrix_to_hdf5(mat, group, data_dtype="i1",
                                   compression="gzip")
                i += 1
                if i == max_samp:
                    log.info("reached maximum number of samples")
                    break
            # propagate the inner break out of the lemma loop
            if i == max_samp:
                break
    finally:
        # close both files even if an error occurred above; original close
        # order (source first, then target) is preserved on the success path
        log.info("closing " + samp_hdf_fname)
        samp_hdfile.close()
        if ext_hdfile is not None:
            log.info("closing " + ext_hdf_fname)
            ext_hdfile.close()
def make_new_samples(sample_hdfile, filtered_hdfile, columns_selector):
    """
    Write a filtered copy of every lemma/pos sample matrix.

    Each matrix from sample_hdfile["samples"] is reduced to the columns
    picked out by columns_selector; rows that become entirely empty after
    the column filtering are dropped. The results are stored under a new
    "samples" group in filtered_hdfile, keyed by "lemma/pos".

    Parameters:
        sample_hdfile: open HDF5 file holding the original samples
        filtered_hdfile: open HDF5 file to receive the filtered samples
        columns_selector: column index/mask selecting the kept vocabulary
    """
    src_samples = sample_hdfile["samples"]
    dest_samples = filtered_hdfile.create_group("samples")

    for lemma, lemma_group in src_samples.iteritems():
        for pos, pos_group in lemma_group.iteritems():
            lempos = lemma + u"/" + pos
            log.info("adding filtered samples for " + lempos)

            # csc format makes the column selection cheap
            mat = coo_matrix_from_hdf5(pos_group).tocsc()[:, columns_selector]

            # lil exposes per-row element lists, so a row is non-empty
            # exactly when its list is truthy
            as_lil = mat.tolil()
            keep_rows = as_lil.rows.nonzero()[0]

            # csr format makes the row selection cheap; then back to coo
            # for serialization
            mat = as_lil.tocsr()[keep_rows].tocoo()

            dest_group = dest_samples.create_group(lempos)
            coo_matrix_to_hdf5(mat, dest_group, data_dtype="=i1",
                               compression="gzip")