def iter_XY(config):
    """Yield ``(gram_vector, label)`` pairs read from the gzip'd gram file.

    ``config`` is unpacked into ``(embeddingconfig, gramconfig, runconfig)``.
    Every line of the gram file is split on whitespace and handled by length:

    * exactly ``gramconfig.gram_size`` tokens: the whole line is the gram and
      the label is looked up for the empty string ``''``;
    * exactly ``gramconfig.gram_size + 1`` tokens: the token at index
      ``gramconfig.skippos`` is taken out as the skipped word and used for
      the label lookup, the remaining tokens form the gram;
    * any other length: a message is printed to ``stderr`` and the line is
      dropped.

    Grams for which the embedding step raises ``KeyError`` are silently
    skipped.
    """
    embeddingconfig, gramconfig, runconfig = config
    embedding = WordEmbedding(embeddingpath(embeddingconfig))

    # With filter_unknown set, no fallback vector is handed to embed();
    # otherwise a zero vector sized from embedding.values.shape[1] is used.
    # NOTE(review): presumably embed() raises KeyError for unknown words when
    # the fallback is None -- confirm against WordEmbedding.embed.
    fallback = (
        None if runconfig.filter_unknown
        else zeros(embedding.values.shape[1])
    )

    with gzip.open(grampath(gramconfig), mode='rt') as gram_file:
        for raw_line in gram_file:
            tokens = raw_line.split()
            token_count = len(tokens)

            if token_count == gramconfig.gram_size:
                # Line carries no skipped word.
                gram, skipped = tokens, ''
            elif token_count == gramconfig.gram_size + 1:
                # Exactly one extra token: cut it out at the skip position.
                pos = gramconfig.skippos
                gram = tokens[:pos] + tokens[pos + 1:]
                skipped = tokens[pos]
            else:
                print('Skip length > 1 not supported. Dropping Gram:',
                      *tokens, file=stderr)
                continue

            lbl = word_to_label(gramconfig.skipwords, skipped)
            try:
                yield embedding.embed(gram, fallback), lbl
            except KeyError:
                pass
def create_grams(gramconfig):
    """Generate the shuffled gram file for ``gramconfig`` unless it exists.

    Steps:

    1. Ensure the ``data/grams`` directory exists.
    2. Return immediately if ``config.grampath(gramconfig)`` is already a file.
    3. Read the gzip'd corpus line by line and write one gram per line to an
       uncompressed file whose path is the gram path with its last suffix
       removed. Each written line is the gram with the skipped words
       re-inserted at ``gramconfig.skippos``, joined by single spaces.
    4. Shuffle that file in place with ``/usr/bin/shuf``.
    5. Hand the shuffled file to ``compress``.
       NOTE(review): ``compress`` presumably produces the final gram path --
       confirm against its definition.

    Raises:
        subprocess.CalledProcessError: if ``shuf`` exits with a non-zero
            status. Previously the exit status was ignored and a file that
            had not been (fully) shuffled would have been compressed anyway.
    """
    makedirs('data/grams', exist_ok=True)
    corpuspath = config.corpuspath(gramconfig.corpus)
    grampath = config.grampath(gramconfig)
    if path.isfile(grampath):
        return

    # Same directory as the final file, file name minus its last suffix.
    uncompressed_grampath = str(Path(grampath).parent / Path(grampath).stem)
    c = gramconfig

    # The output file may overwrite a leftover from an earlier, aborted run.
    with gzip.open(corpuspath, mode='rt') as inf, \
            open(uncompressed_grampath, mode='wt') as outf:
        for line in inf:
            for gram, skip in iter_grams(line, c.gram_size, c.skipwords,
                                         c.skippos, c.filter_skips):
                completegram = ' '.join(
                    chain(gram[:c.skippos], skip, gram[c.skippos:]))
                print(completegram, file=outf)

    # The file is closed (and therefore flushed) here, so shuf sees all data.
    # check=True turns a failed shuffle into an error instead of silently
    # compressing unshuffled data.
    subprocess.run(
        ['/usr/bin/shuf', '-o', uncompressed_grampath, uncompressed_grampath],
        check=True,
    )
    compress(uncompressed_grampath)