Exemplo n.º 1
0
def get_quatrains():
    """Return a list of the quatrains made up only of in-vocabulary characters.

    Poems are taken from ``get_all_corpus()``. A poem is kept when
    ``is_quatrain`` accepts it and every character of every entry in
    ``poem['sentences']`` is contained in the ``ch2int`` object returned as
    the second element of ``get_vocab()``.
    """
    _, ch2int = get_vocab()

    def quatrain_filter(poem):
        # Guard clause: non-quatrains are rejected before their sentences
        # are inspected at all.
        if not is_quatrain(poem):
            return False
        # all() stops at the first out-of-vocabulary character.
        return all(
            ch in ch2int
            for sentence in poem['sentences']
            for ch in sentence
        )

    return list(filter(quatrain_filter, get_all_corpus()))
Exemplo n.º 2
0
def get_quatrains():
    """Return the quatrains whose every character is in the vocabulary ``ch2int``.

    The result is whatever the built-in ``filter`` produces for the corpus
    (a list on Python 2, a lazy iterator on Python 3).
    """
    _, ch2int = get_vocab()

    def quatrain_filter(poem):
        # Only poems accepted by is_quatrain have their sentences scanned.
        if is_quatrain(poem):
            has_unknown_char = any(
                ch not in ch2int
                for sentence in poem['sentences']
                for ch in sentence
            )
            return not has_unknown_char
        return False

    # Per the original author's note: get_all_corpus() returns the poem
    # records from all the poem data files; each record holds one poem's
    # title, author, dynasty and sentences.
    return filter(quatrain_filter, get_all_corpus())
Exemplo n.º 3
0
def _gen_vocab():
    """Build the character vocabulary and write it to ``_vocab_path`` as JSON.

    Counts how often each character occurs across ``poem['sentences']`` for
    every poem returned by ``get_all_corpus()``, orders the characters by
    descending count, keeps the first ``VOCAB_SIZE - 2`` of them and dumps
    that list to ``_vocab_path`` (opened as UTF-8). Progress is printed every
    10000 poems.

    Returns:
        None. The result is the file written to ``_vocab_path``.
    """
    # Single-argument print(...) emits the same text under Python 2 and is
    # also valid syntax under Python 3 (the bare print statement is not).
    print("Generating the vocabulary ...")
    corpus = get_all_corpus()
    char_cnts = dict()
    for idx, poem in enumerate(corpus):
        for sentence in poem['sentences']:
            for ch in sentence:
                char_cnts[ch] = char_cnts.get(ch, 0) + 1
        if 0 == (idx + 1) % 10000:
            print("[Vocabulary] %d/%d poems have been processed." % (idx + 1, len(corpus)))
    # Most frequent characters first; the sort is stable, so characters with
    # equal counts keep the dict's iteration order.
    # NOTE(review): the "- 2" presumably leaves room for two special tokens
    # added elsewhere -- confirm against the code that loads the vocabulary.
    vocab = sorted(char_cnts, key=lambda ch: -char_cnts[ch])[:VOCAB_SIZE - 2]
    with codecs.open(_vocab_path, 'w', 'utf-8') as fout:
        json.dump(vocab, fout)
    print("The vocabulary has been built.")