예제 #1
0
def iter_XY(config):
    """Yield ``(gram_vector, label)`` pairs read from the configured gram file.

    ``config`` is unpacked as ``(embeddingconfig, gramconfig, runconfig)``.
    Every line of the gzip-compressed gram file is split on whitespace and
    handled according to its token count:

    * ``gramconfig.gram_size`` tokens: the line is the gram itself and the
      label is ``word_to_label(gramconfig.skipwords, '')``.
    * ``gramconfig.gram_size + 1`` tokens: the token at
      ``gramconfig.skippos`` is taken out of the gram and passed to
      ``word_to_label`` to obtain the label.
    * any other count: the line is reported on stderr and dropped.

    A gram whose embedding lookup raises ``KeyError`` is dropped silently.
    When ``runconfig.filter_unknown`` is false, a ``zeros(...)`` vector sized
    from ``embedding.values.shape[1]`` is handed to ``embedding.embed`` as its
    second argument (presumably the fallback for unknown words -- confirm
    against ``WordEmbedding.embed``); otherwise ``None`` is passed.
    """
    embeddingconfig, gramconfig, runconfig = config

    embedding = WordEmbedding(embeddingpath(embeddingconfig))
    unknown_embedding = None
    if not runconfig.filter_unknown:
        unknown_embedding = zeros(embedding.values.shape[1])

    with gzip.open(grampath(gramconfig), mode='rt') as f:
        for line in f:
            tokens = line.split()
            if len(tokens) == gramconfig.gram_size:
                # No skip word present on this line.
                gram = tokens
                lbl = word_to_label(gramconfig.skipwords, '')
            elif len(tokens) == gramconfig.gram_size + 1:
                # Exactly one extra token: the skip word at skippos.
                gram = tokens[:gramconfig.skippos] + tokens[gramconfig.skippos + 1:]
                skip = tokens[gramconfig.skippos]
                lbl = word_to_label(gramconfig.skipwords, skip)
            else:
                # NOTE: this branch also fires for lines that are too short
                # (including blank lines), not only for multi-word skips.
                print('Skip length > 1 not supported. Dropping Gram:', *tokens, file=stderr)
                continue

            # Keep the try body to the lookup alone so that a KeyError raised
            # while the generator is suspended at the yield (e.g. through
            # generator.throw) is not swallowed by this handler.
            try:
                gramvec = embedding.embed(gram, unknown_embedding)
            except KeyError:
                continue
            yield gramvec, lbl
예제 #2
0
def create_grams(gramconfig):
    """Create the shuffled gram file for ``gramconfig`` unless it already exists.

    The gzip-compressed corpus at ``config.corpuspath(gramconfig.corpus)`` is
    read line by line; every ``(gram, skip)`` pair produced by ``iter_grams``
    is written as one space-separated line, with ``skip`` spliced into the
    gram at ``gramconfig.skippos``. The intermediate plain-text file is then
    shuffled in place with ``/usr/bin/shuf`` and handed to ``compress``
    (presumably producing the file at ``config.grampath(gramconfig)`` --
    confirm against ``compress``).

    Returns immediately, after ensuring ``data/grams`` exists, when the target
    gram file is already present.

    Raises:
        subprocess.CalledProcessError: if ``shuf`` exits with a non-zero
            status, so an unshuffled file is never compressed silently.
    """
    makedirs('data/grams', exist_ok=True)
    corpuspath = config.corpuspath(gramconfig.corpus)
    grampath = config.grampath(gramconfig)
    if path.isfile(grampath):
        return

    # Same directory as the target, file name without its final suffix.
    target = Path(grampath)
    uncompressed_grampath = str(target.parent / target.stem)

    c = gramconfig
    with gzip.open(corpuspath, mode='rt') as inf, \
            open(uncompressed_grampath, mode='wt') as outf:  # may overwrite
        for line in inf:
            for gram, skip in iter_grams(line, c.gram_size, c.skipwords, c.skippos, c.filter_skips):
                # `skip` is chained as an iterable of tokens (presumably a
                # sequence of words, possibly empty -- verify in iter_grams).
                completegram = ' '.join(chain(gram[:c.skippos], skip, gram[c.skippos:]))
                print(completegram, file=outf)

    # GNU shuf reads all of its input before opening the -o file, so using the
    # same path for input and output is safe. check=True surfaces a failed
    # shuffle instead of compressing unshuffled data.
    subprocess.run(
        ['/usr/bin/shuf', '-o', uncompressed_grampath, uncompressed_grampath],
        check=True,
    )
    compress(uncompressed_grampath)