Example #1
import numpy as np

from nlppreprocess import utils  # package-local I/O helpers (assumed import path)


def main(path_all, path_train, path_val, size):
    print("[nlppreprocess.split_corpus] Processing ...")
    iterator = utils.read_sentences(path_all)

    # Count the total number of sentences in the corpus.
    with open(path_all) as f:
        N = sum(1 for _ in f)
    print("[nlppreprocess.split_corpus] Total size: %d" % N)

    # Reproducibly choose `size` sentence indices for the validation split.
    perm = np.random.RandomState(1234).permutation(N)
    val_index = set(perm[-size:])

    print("[nlppreprocess.split_corpus] Writing ...")
    with open(path_train, "w", encoding="utf-8") as f_train, \
         open(path_val, "w", encoding="utf-8") as f_val:
        for i, s in enumerate(iterator):
            line = " ".join(s)
            if i in val_index:
                f_val.write("%s\n" % line)
            else:
                f_train.write("%s\n" % line)
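For reference, a minimal usage sketch of the split above; the file paths and the validation size are made up for illustration, and the import path is assumed from the log prefix in the example.

from nlppreprocess import split_corpus  # assumed import path

# Hold out 5,000 sentences for validation; the rest go to the training file.
split_corpus.main("corpus.txt", "corpus.train.txt", "corpus.val.txt", size=5000)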
Example #2
import os

import gensim

from nlppreprocess import utils  # package-local I/O helpers (assumed import path)


def main(path, char):
    assert os.path.exists(path)

    if not char:
        # Word-level mode: iterate over whitespace-tokenized sentences.
        iterator = utils.read_sentences(path)
    else:
        print("[nlppreprocess.create_dictionary] NOTE: char-level mode!")
        iterator = CharIterator(path)  # character-level iterator defined in the same module

    print("[nlppreprocess.create_dictionary] Processing ...")
    # Build the full vocabulary; prune_at=None disables gensim's automatic pruning.
    dictionary = gensim.corpora.Dictionary(iterator, prune_at=None)
    vocab = dictionary.token2id
    print("[nlppreprocess.create_dictionary] Vocabulary size: %d" % len(vocab))

    # Word-level and char-level dictionaries are saved under different suffixes.
    path_dic = path + ".dictionary" if not char else path + ".char.dictionary"
    dictionary.save_as_text(path_dic)
    print("[nlppreprocess.create_dictionary] Saved the dictionary to %s" % path_dic)
Example #3
from nlppreprocess import utils  # package-local I/O helpers (assumed import path)


def main(path_in, path_out):
    print("[nlppreprocess.tokenizer] Processing ...")
    iterator = utils.read_sentences(path_in)
    iterator = Tokenizer_with_nltk(iterator)  # NLTK-based tokenizer defined in the same module
    utils.write_sentences(iterator, path_out)
Example #4
from nlppreprocess import utils  # package-local I/O helpers (assumed import path)


def main(path_in, path_out, prune_at, min_count):
    print("[nlppreprocess.replace_rare_words] Processing ...")
    iterator = utils.read_sentences(path_in)
    # Map words below the vocabulary/frequency thresholds to an UNK symbol.
    iterator = ReplaceRareWords(iterator, prune_at, min_count)
    count_UNK_rate(iterator)  # report the rate of replaced tokens (same module)
    utils.write_sentences(iterator, path_out)
Example #5
from nlppreprocess import utils  # package-local I/O helpers (assumed import path)


def main(path_in, path_out):
    print("[nlppreprocess.replace_digits] Processing ...")
    iterator = utils.read_sentences(path_in)
    iterator = ReplaceDigits(iterator)  # normalize digit characters (same module)
    utils.write_sentences(iterator, path_out)
Example #6
from nlppreprocess import utils  # package-local I/O helpers (assumed import path)


def main(path_in, path_out):
    print("[nlppreprocess.generate_counts] Processing ...")
    iterator = utils.read_sentences(path_in)
    iterator = AppendEdgeOfSent(iterator)  # add sentence-boundary markers (same module)
    utils.write_sentences(iterator, path_out)
Example #7
from nlppreprocess import utils  # package-local I/O helpers (assumed import path)


def main(path_in, path_out):
    print("[nlppreprocess.lowercase] Processing ...")
    iterator = utils.read_sentences(path_in)
    iterator = Lowercase(iterator)  # lowercase every token (same module)
    utils.write_sentences(iterator, path_out)
Example #8
from nlppreprocess import utils  # package-local I/O helpers (assumed import path)


def main(path_in, path_out):
    print("[nlppreprocess.append_eos] Processing ...")
    iterator = utils.read_sentences(path_in)
    iterator = AppendEOS(iterator)  # append an end-of-sentence token (same module)
    utils.write_sentences(iterator, path_out)
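Examples #3 to #8 are all per-file passes that read path_in and write path_out, so a full preprocessing run can be sketched as a chain of such calls. This is only a sketch: the module names below are assumed from the log prefixes in the examples, and the file names and thresholds are illustrative.

# Assumed module layout; adjust to the actual package structure.
from nlppreprocess import (tokenizer, lowercase, replace_digits,
                           replace_rare_words, append_eos)

tokenizer.main("corpus.txt", "corpus.tok")
lowercase.main("corpus.tok", "corpus.lower")
replace_digits.main("corpus.lower", "corpus.digit")
replace_rare_words.main("corpus.digit", "corpus.rare", prune_at=300000, min_count=5)
append_eos.main("corpus.rare", "corpus.preprocessed")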