# NOTE(review): the next two statements are the tail of a definition that
# begins before this chunk. Their original nesting is not visible from here,
# so they are reproduced unchanged -- restore their indentation when the
# enclosing definition is re-assembled.
prob_sum -= 1./(rank+1)
shuffle(words)


# NOTE(review): `plan` takes `self`, so it is presumably a method of a class
# whose header lies outside this chunk -- re-indent under that class when
# restoring the file layout.
def plan(self, text):
    """Choose two keywords for *text*.

    Every sentence returned by ``split_sentences(text)`` is cut with
    ``jieba.lcut``; only tokens present in ``self.ranks`` are kept. The kept
    tokens are ordered by ascending ``self.ranks[token]`` and adjacent
    repeats are dropped.

    Returns:
        A list holding the first two tokens of that ordering. When fewer
        than two tokens survive, the list is first handed to
        ``self.expand(words, 2)`` (presumably padding it in place to two
        entries -- confirm against ``expand``) and then returned.
    """
    # Flatten the per-sentence token lists, keeping only ranked tokens.
    candidates = []
    for sentence in split_sentences(text):
        candidates.extend(
            tok for tok in jieba.lcut(sentence) if tok in self.ranks
        )
    candidates.sort(key=lambda tok: self.ranks[tok])

    # After sorting, a token is kept only when it differs from the one
    # immediately before it.
    words = [
        tok for pos, tok in enumerate(candidates)
        if pos == 0 or tok != candidates[pos - 1]
    ]

    if len(words) >= 2:
        del words[2:]
    else:
        self.expand(words, 2)
    return words


if __name__ == '__main__':
    # Manual smoke test: seed `expand` with one to three entries taken from
    # each training row and check that it always ends up with four.
    planner = Planner()
    for record in get_kw_train_data():
        seed_size = randint(1, 3)
        uprint(record[1:])
        print("num = %d" % seed_size)
        seed = record[1:seed_size + 1]
        planner.expand(seed, 4)
        uprintln(seed)
        assert len(seed) == 4
        print()
def get_word_cnts():
    """Load the word-count table from its JSON cache file.

    If no file exists at ``_wc_path``, ``_gen_word_cnts()`` is called first
    (presumably it writes that file -- confirm against its definition).

    Returns:
        The object decoded from the UTF-8 JSON file at ``_wc_path``.
    """
    if not os.path.exists(_wc_path):
        _gen_word_cnts()
    with codecs.open(_wc_path, 'r', 'utf-8') as fin:
        return json.load(fin)


def _min_word_cnt(cnts, poem, segmenter):
    """Return the smallest count found among all segments of *poem*.

    Args:
        cnts: Object indexed by segment (``cnts[seg]``) that yields a count.
        poem: Mapping whose ``'sentences'`` entry is an iterable of
            sentences.
        segmenter: Object whose ``segment(sentence)`` returns an iterable of
            segments.

    Returns:
        The minimum of ``cnts[seg]`` over every segment of every sentence,
        never larger than ``(1 << 31) - 1``. That ceiling is also the result
        when the poem produces no segments at all.

    Raises:
        Whatever ``cnts[seg]`` raises for a segment it does not contain
        (``KeyError`` for a plain dict).
    """
    min_cnt = (1 << 31) - 1
    for sentence in poem['sentences']:
        for seg in segmenter.segment(sentence):
            min_cnt = min(min_cnt, cnts[seg])
    return min_cnt


# TODO(vera): remove unnecessary logic
# NOTE(review): the unused enumerate() index and the redundant min() slice
# bound have been dropped below; confirm whether this TODO covers anything
# else before deleting it.
def get_pop_quatrains(num = 100000):
    """Return the quatrains whose rarest word is most frequent.

    Each quatrain from ``get_quatrains()`` is scored with ``_min_word_cnt``
    and the quatrains are ordered by descending score. The sort is stable,
    so equally scored quatrains keep their original relative order.

    Args:
        num: Upper bound on how many quatrains to return.

    Returns:
        A list of at most ``num`` quatrains, highest score first.
    """
    cnts = get_word_cnts()
    segmenter = Segmenter()
    quatrains = get_quatrains()
    min_word_cnts = [
        _min_word_cnt(cnts, quatrain, segmenter) for quatrain in quatrains
    ]
    indexes = sorted(range(len(quatrains)), key=lambda i: -min_word_cnts[i])
    # Slicing already clamps to the list length, so no explicit min() is
    # needed for the upper bound.
    return [quatrains[index] for index in indexes[:num]]


if __name__ == '__main__':
    # Print the twenty words with the highest counts.
    cnts = get_word_cnts()
    words = sorted(cnts, key=lambda w: -cnts[w])
    uprintln(words[:20])