예제 #1
0
                        prob_sum -= 1./(rank+1)
        shuffle(words)

    def plan(self, text):
        """Pick two keywords from ``text``, ordered by their rank.

        Every sentence produced by ``split_sentences(text)`` is tokenized
        with ``jieba.lcut``; only tokens that are keys of ``self.ranks`` are
        kept. The kept tokens are sorted by ascending ``self.ranks`` value
        and runs of identical neighbours are collapsed to one entry.

        Returns:
            The resulting list. When fewer than two distinct keywords were
            found, ``self.expand(words, 2)`` is called on it first
            (presumably padding the list in place -- confirm against
            ``expand``); otherwise it is cut down to its first two entries.
        """
        ranked = []
        for sentence in split_sentences(text):
            for token in jieba.lcut(sentence):
                if token in self.ranks:
                    ranked.append(token)
        # Stable sort: equal-rank tokens keep their order of appearance.
        ranked.sort(key = lambda token : self.ranks[token])

        # Collapse runs of identical neighbours left by the sort.
        words = []
        for token in ranked:
            if not words or token != words[-1]:
                words.append(token)

        if len(words) >= 2:
            del words[2:]
        else:
            self.expand(words, 2)
        return words

if __name__ == '__main__':
    # Smoke test for Planner.expand: for every training row, take a random
    # prefix of 1-3 entries (skipping the first column), ask the planner to
    # expand it to four entries and check the resulting length.
    planner = Planner()
    kw_train_data = get_kw_train_data()
    for row in kw_train_data:
        num = randint(1,3)
        candidates = row[1:]
        uprint(candidates)
        print("num = %d" %num)
        # Fresh slice, so expand() works on its own copy of the prefix.
        guess = candidates[:num]
        planner.expand(guess, 4)
        uprintln(guess)
        assert len(guess) == 4
        print()

예제 #2
0
def get_word_cnts():
    """Load the word-count table stored as JSON at ``_wc_path``.

    If no file exists at ``_wc_path`` yet, ``_gen_word_cnts()`` is called
    first (presumably it writes that file -- confirm against its
    definition).

    Returns:
        The object decoded from the UTF-8 JSON file at ``_wc_path``.
    """
    cache_ready = os.path.exists(_wc_path)
    if not cache_ready:
        _gen_word_cnts()
    with codecs.open(_wc_path, 'r', 'utf-8') as fin:
        word_cnts = json.load(fin)
    return word_cnts

def _min_word_cnt(cnts, poem, segmenter):
    """Return the smallest count among all segments of a poem.

    Args:
        cnts: mapping from segment to its count; every segment produced
            by ``segmenter`` must be a key, otherwise ``KeyError`` is raised.
        poem: mapping whose ``'sentences'`` entry is an iterable of sentences.
        segmenter: object whose ``segment(sentence)`` yields the segments
            of one sentence.

    Returns:
        The minimum of ``cnts[seg]`` over every segment of every sentence,
        capped at ``2**31 - 1``; that cap is also the result when the poem
        yields no segments at all.
    """
    upper_bound = (1<<31)-1
    seg_cnts = [cnts[seg]
                for sentence in poem['sentences']
                for seg in segmenter.segment(sentence)]
    # Putting the cap first keeps it as the answer for an empty poem.
    return min([upper_bound] + seg_cnts)

# TODO(vera): remove unnecessary logic
def get_pop_quatrains(num = 100000):
    """Return up to ``num`` quatrains, best-scoring first.

    Each quatrain from ``get_quatrains()`` is scored with
    ``_min_word_cnt`` (the count of its least frequent segment according to
    ``get_word_cnts()``). Quatrains are returned in descending score order;
    quatrains with equal scores keep their original relative order.

    Args:
        num: maximum number of quatrains to return. Values larger than the
            number of available quatrains simply return all of them.

    Returns:
        A new list holding the selected quatrains.
    """
    cnts = get_word_cnts()
    segmenter = Segmenter()
    quatrains = get_quatrains()
    min_word_cnts = [_min_word_cnt(cnts, quatrain, segmenter)
                     for quatrain in quatrains]
    # reverse=True keeps the sort stable, so ties stay in corpus order
    # exactly as they did with the negated key.
    indexes = sorted(range(len(quatrains)),
                     key = lambda i: min_word_cnts[i], reverse = True)
    # Slicing already clamps to the list length; no explicit min() needed.
    return [quatrains[index] for index in indexes[:num]]

if __name__ == '__main__':
    # Print the 20 words with the highest counts, most frequent first.
    cnts = get_word_cnts()
    words = sorted(cnts, key = lambda w: cnts[w], reverse = True)
    uprintln(words[:20])