Example #1
def _gen_embedding(ndim, alignment=False):
    print "Generating %d-dim word embedding ..." %ndim
    int2ch, ch2int = get_vocab()
    ch_lists = []
    quatrains = get_quatrains()
    for idx, poem in enumerate(quatrains):
        for sentence in poem['sentences']:
            ch_lists.append(filter(lambda ch: ch in ch2int, sentence))
        if alignment:
            # the i-th characters across the poem's lines, used to boost duizhang (parallel structure)
            i_characters = [[sentence[j] for sentence in poem['sentences']] for j in range(len(poem['sentences'][0]))]
            for characters in i_characters:
                ch_lists.append(filter(lambda ch: ch in ch2int, characters))
        if 0 == (idx+1)%10000:
            print "[Word2Vec] %d/%d poems have been processed." %(idx+1, len(quatrains))
    print "Hold on. This may take some time ..."
    model = models.Word2Vec(ch_lists, size=ndim, min_count=5)
    embedding = uniform(-1.0, 1.0, [VOCAB_SIZE, ndim])
    for idx, ch in enumerate(int2ch):
        if ch in model.wv:
            embedding[idx,:] = model.wv[ch]
    if alignment:
        model.save(_w2v_with_alignment_model_path)
        print "Word2Vec model is saved."
        np.save(_w2v_with_alignment_path, embedding)
        print "Word embedding is saved."
    else:
        model.save(_w2v_model_path)
        print "Word2Vec model is saved."
        np.save(_w2v_path, embedding)
        print "Word embedding is saved."
Example #2
def _rank_all_words():
    segmenter = Segmenter()  # word segmenter (sxhy dict)
    stopwords = get_stopwords()
    print("Start TextRank over the selected quatrains ...")
    quatrains = get_quatrains()
    adjlist = dict()
    for idx, poem in enumerate(quatrains):
        if 0 == (idx + 1) % 10000:
            print("[TextRank] Scanning %d/%d poems ..." %
                  (idx + 1, len(quatrains)))
        for sentence in poem['sentences']:
            # list() so segs can be sliced and iterated more than once in Python 3
            segs = list(filter(lambda word: word not in stopwords,
                               segmenter.segment(sentence)))  # segmentation result
            for seg in segs:
                if seg not in adjlist:
                    adjlist[seg] = dict()
            for i, seg in enumerate(segs):
                for other in segs[i + 1:]:
                    if seg != other:
                        # count the co-occurrence in both directions
                        adjlist[seg][other] = adjlist[seg].get(other, 0) + 1.0
                        adjlist[other][seg] = adjlist[other].get(seg, 0) + 1.0
    for word in adjlist:
        w_sum = sum(weight for other, weight in adjlist[word].items())  # total edge weight
        for other in adjlist[word]:
            adjlist[word][other] /= w_sum  # normalize outgoing weights to sum to 1
    print("[TextRank] Weighted graph has been built.")
    _text_rank(adjlist)
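
_text_rank itself is not shown in these examples. For orientation, here is a minimal power-iteration sketch over the weighted adjacency list built above, using the standard TextRank update with damping factor 0.85; the function signature, tolerance, and variable names are illustrative assumptions, not code from the project:

def _text_rank(adjlist, d=0.85, eps=1e-6):
    # start every word with an equal score
    scores = dict((word, 1.0) for word in adjlist)
    while True:
        new_scores = dict()
        for word in adjlist:
            # each neighbour contributes its score times the (normalized)
            # weight it assigns to this word
            rank = sum(scores[other] * adjlist[other][word]
                       for other in adjlist[word])
            new_scores[word] = (1.0 - d) + d * rank
        delta = max(abs(new_scores[w] - scores[w]) for w in scores)
        scores = new_scores
        if delta < eps:  # stop once the scores have converged
            return scores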
Example #3
def _rank_all_words():
    segmenter = Segmenter()  # sentence segmenter
    stopwords = get_stopwords()  # stopword list
    print "Start TextRank over the selected quatrains ..."
    quatrains = get_quatrains()  # the collection of quatrains
    adjlist = dict()
    for idx, poem in enumerate(quatrains):  # for each poem
        if 0 == (idx + 1) % 10000:
            print "[TextRank] Scanning %d/%d poems ..." % (idx + 1,
                                                           len(quatrains))
        for sentence in poem['sentences']:  # for each line of the poem
            segs = filter(lambda word: word not in stopwords,
                          segmenter.segment(sentence))  # segments that are not stopwords
            for seg in segs:  # for each segment
                if seg not in adjlist:
                    adjlist[seg] = dict()  # one adjacency dict per segment
            for i, seg in enumerate(segs):
                # compare with every later segment; this builds the
                # co-occurrence graph that TextRank runs over
                for other in segs[i + 1:]:
                    if seg != other:
                        # count the co-occurrence in both directions
                        adjlist[seg][other] = adjlist[seg][other]+1 \
                                if other in adjlist[seg] else 1.0
                        adjlist[other][seg] = adjlist[other][seg]+1 \
                                if seg in adjlist[other] else 1.0
    for word in adjlist:
        w_sum = sum(
            weight
            for other, weight in adjlist[word].items())  # total weight of this word's edges
        for other in adjlist[word]:
            adjlist[word][other] /= w_sum  # normalize each edge weight by the total
    print "[TextRank] Weighted graph has been built."
    _text_rank(adjlist)
Example #4
def get_pop_quatrains(num=100000):
    cnts = get_word_cnts()
    segmenter = Segmenter()
    quatrains = get_quatrains()
    min_word_cnts = [_min_word_cnt(cnts, quatrain, segmenter)
                     for quatrain in quatrains]
    indexes = sorted(range(len(quatrains)), key=lambda i: -min_word_cnts[i])
    return [quatrains[index] for index in indexes[:min(num, len(indexes))]]
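
_min_word_cnt is referenced above but not shown. A plausible sketch consistent with the call site (this helper is an assumption, not project code): score each quatrain by the corpus count of its rarest segmented word, so the descending sort keeps the quatrains built from the most common vocabulary.

def _min_word_cnt(cnts, quatrain, segmenter):
    # corpus count of the rarest word appearing in this quatrain
    min_cnt = float('inf')
    for sentence in quatrain['sentences']:
        for seg in segmenter.segment(sentence):
            min_cnt = min(min_cnt, cnts.get(seg, 0))
    return min_cnt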
Example #5
def eval_train_data():
    evaluator = RhymeEvaluator()

    quatrains = get_quatrains()
    poems = list(map(lambda quatrain: quatrain['sentences'], quatrains)) # Strip out metadata information

    print( "Testing {} quatrains from the corpus.".format(len(poems)))
    eval_poems(evaluator, poems)
Example #6
def _gen_word_cnts():
    counters = dict()
    segmenter = Segmenter()
    quatrains = get_quatrains()
    for idx, poem in enumerate(quatrains):
        for sentence in poem['sentences']:
            segs = segmenter.segment(sentence)
            for seg in segs:
                counters[seg] = counters[seg]+1 if seg in counters else 1
        if 0 == (idx+1)%10000:
            print "[Word Count] %d/%d quatrains has been processed." %(idx+1, len(quatrains))
    with codecs.open(_wc_path, 'w', 'utf-8') as fout:
        json.dump(counters, fout)
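
The counterpart get_word_cnts used in Example #4 is presumably just the inverse of this dump. A minimal sketch under that assumption:

def get_word_cnts():
    # reload the word-count dictionary written by _gen_word_cnts
    with codecs.open(_wc_path, 'r', 'utf-8') as fin:
        return json.load(fin)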
Example #7
def _train(self):
    print("Start training Word2Vec for planner ...")
    quatrains = get_quatrains()
    segmenter = Segmenter()
    seg_lists = []
    for idx, quatrain in enumerate(quatrains):
        seg_list = []
        for sentence in quatrain['sentences']:
            seg_list.extend([seg for seg in segmenter.segment(sentence)
                             if seg in self.ranks])
        seg_lists.append(seg_list)
        if 0 == (idx+1)%10000:
            print("[Plan Word2Vec] %d/%d quatrains have been processed." %(idx+1, len(quatrains)))
    print("Hold on. This may take some time ...")
    self.model = models.Word2Vec(seg_lists, size=512, min_count=5)
    self.model.save(_model_path)
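
Once trained and saved, the planner model can be queried for related keywords through gensim's KeyedVectors interface. A usage sketch; the query word is illustrative and must appear in the trained vocabulary:

model = models.Word2Vec.load(_model_path)
for word, similarity in model.wv.most_similar(u'春风', topn=10):
    print("%s\t%.3f" % (word, similarity))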
Example #8
def _gen_embedding(ndim):
    print "Generating %d-dim word embedding ..." % ndim
    int2ch, ch2int = get_vocab()
    ch_lists = []
    quatrains = get_quatrains()
    for idx, poem in enumerate(quatrains):
        for sentence in poem['sentences']:
            ch_lists.append(filter(lambda ch: ch in ch2int, sentence))
        if 0 == (idx + 1) % 10000:
            print "[Word2Vec] %d/%d poems have been processed." % (
                idx + 1, len(quatrains))
    print "Hold on. This may take some time ..."
    model = models.Word2Vec(ch_lists, size=ndim, min_count=5)
    embedding = uniform(-1.0, 1.0, [VOCAB_SIZE, ndim])
    for idx, ch in enumerate(int2ch):
        if ch in model.wv:
            embedding[idx, :] = model.wv[ch]
    np.save(_w2v_path, embedding)
    print "Word embedding is saved."
Example #9
def _train(self):
    print "Start training Word2Vec for planner ..."
    quatrains = get_quatrains()
    segmenter = Segmenter()  # segments whole words, unlike splitting into single characters
    seg_lists = []
    for idx, quatrain in enumerate(quatrains):
        seg_list = []
        for sentence in quatrain['sentences']:
            seg_list.extend(
                filter(lambda seg: seg in self.ranks,
                       segmenter.segment(sentence)))
        seg_lists.append(seg_list)
        if 0 == (idx + 1) % 10000:
            print "[Plan Word2Vec] %d/%d quatrains have been processed." % (
                idx + 1, len(quatrains))
    print "Hold on. This may take some time ..."
    self.model = models.Word2Vec(seg_lists, size=512,
                                 min_count=5)  # trains the word-vector model
    self.model.save(_model_path)
Example #10
File: word2vec.py  Project: 862604947/TEST
def _gen_embedding(ndim):  # generate an ndim-dimensional word embedding
    print "Generating %d-dim word embedding ..." % ndim
    int2ch, ch2int = get_vocab()  # load the vocabulary
    ch_lists = []
    quatrains = get_quatrains()  # all quatrains that meet the filtering rules
    for idx, poem in enumerate(quatrains):  # for each quatrain
        for sentence in poem['sentences']:  # for each line of the poem
            ch_lists.append(filter(lambda ch: ch in ch2int,
                                   sentence))  # keep characters present in ch2int
        if 0 == (idx + 1) % 10000:
            print "[Word2Vec] %d/%d poems have been processed." % (
                idx + 1, len(quatrains))
    print "Hold on. This may take some time ..."
    model = models.Word2Vec(ch_lists, size=ndim,
                            min_count=5)  # ch_lists is the training corpus; ndim the vector size
    embedding = uniform(-1.0, 1.0,
                        [VOCAB_SIZE, ndim])  # uniform random matrix, one ndim-dim vector per row
    for idx, ch in enumerate(int2ch):
        if ch in model.wv:  # if this character has a trained vector
            embedding[idx, :] = model.wv[ch]  # copy it into ch's row of the embedding
    np.save(_w2v_path, embedding)
    print "Word embedding is saved."
Example #11
    def extract_couplets_with_tag(self, tag_dir, tag_name):
        tag_path = os.path.join(tag_dir, tag_name)
        if not os.path.exists(tag_path):
            raise ValueError('There are no valid tags')

        tag_sets = set()
        with codecs.open(tag_path, 'r', 'utf-8') as fin:
            # TODO: read from tags(default: one tag one line)
            line = fin.readline().strip()
            while line:
                tag_sets.add(line)
                line = fin.readline().strip()

        couplets_dict = quatrains.get_quatrains()
        couplets = [couplet['sentences'] for couplet in couplets_dict]

        seg = Segmenter()
        tag_couplet_path = os.path.join(DATA_PROCESSED_DIR, 'tag_couplets.txt')

        # create/truncate the output file before appending to it
        with open(tag_couplet_path, 'w') as fout:
            print 'create tag_couplets.txt'

        data = []

        with codecs.open(tag_couplet_path, 'a', 'utf-8') as fout:
            for couplet in couplets:
                seg_list_0 = seg.segment(couplet[0])
                seg_list_1 = seg.segment(couplet[1])
                flag_0 = set(seg_list_0) & tag_sets
                flag_1 = set(seg_list_1) & tag_sets
                if flag_0 or flag_1:
                    fout.write(couplet[0])
                    fout.write('\n')
                    fout.write(couplet[1])
                    fout.write('\n')
                    fout.write('\n')
                    data.append(couplet)
        return data
Example #12
from generate import Generator
from plan import Planner

if __name__ == '__main__':
    evaluator = RhymeEvaluator()
    '''
    print "Evaluating rule-based method ..."
    scores = []
    with codecs.open('results.txt', 'r', 'utf-8') as fin:
        line = fin.readline()
        while line:
            scores.append(evaluator.eval(split_sentences(line.strip())))
            line = fin.readline()
    print "Mean score = %f, standard deviation = %f" % (np.mean(scores), np.std(scores))
    '''
    quatrains = get_quatrains()
    print "Testing %d quatrains from the corpus." % len(quatrains)
    scores = []
    for quatrain in quatrains:
        score = evaluator.eval(quatrain['sentences'])
        scores.append(score)
    print "Mean score = %f, standard deviation = %f" % (np.mean(scores),
                                                        np.std(scores))
    num = 100
    print "Testing %d poems generated by RNN ..." % num
    scores = []
    planner = Planner()
    generator = Generator()
    for _ in range(num):
        keywords = planner.plan(u'')
        assert 4 == len(keywords)