def read_from_sentence_txt(start, emission, transition):
    ## ./result/sentence.txt
    print('read from sentence.txt')
    for line in open(SENTENCE_FILE):
        line = util.as_text(line.strip())
        if len(line) < 2:
            continue
        if not util.is_chinese(line):
            continue

        ## for start
        start.setdefault(line[0], 0)
        start[line[0]] += 1

        ## for emission
        pinyin_list = topinyin(line)
        char_list = [c for c in line]
        for hanzi, pinyin in zip(char_list, pinyin_list):
            emission.setdefault(hanzi, {})
            emission[hanzi].setdefault(pinyin, 0)
            emission[hanzi][pinyin] += 1

        ## for transition
        for f, t in zip(line[:-1], line[1:]):
            transition.setdefault(f, {})
            transition[f].setdefault(t, 0)
            transition[f][t] += 1
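The three tables filled in above are raw counts; before they can drive a Viterbi-style pinyin-to-hanzi decoder they have to be normalized into probability distributions. A minimal sketch of that step, assuming the plain dict-of-dicts layout built by read_from_sentence_txt (the helper names normalize_counts and normalize_start are illustrative, not part of the original code):

def normalize_counts(table):
    # table: {state: {symbol: count}}, the shape of both emission and transition.
    probs = {}
    for state, counts in table.items():
        total = float(sum(counts.values()))
        probs[state] = {symbol: count / total for symbol, count in counts.items()}
    return probs


def normalize_start(start):
    # start: {hanzi: count} -> {hanzi: probability}
    total = float(sum(start.values()))
    return {hanzi: count / total for hanzi, count in start.items()}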
def stdOut(sentence_result_lists, dicts, scope):
    lists = list()
    j = 0
    sentence_nbr = len(dicts)
    # Damp each score by sentence position: earlier sentences keep more weight.
    sentence_result_lists_tmp = [
        (key, score * (1 - math.log(key + 1) / math.log(sentence_nbr)))
        for key, score in sentence_result_lists
    ]
    sentence_result_lists2 = sorted(sentence_result_lists_tmp,
                                    key=lambda item: item[1],
                                    reverse=True)
    try:
        for distattr3 in sentence_result_lists2:
            sentence_id = distattr3[0]
            tmp = dicts[sentence_id]
            tmp2 = [x for x in tmp if is_chinese(x)]
            if (len(tmp2) < 8 or contain_redundant(
                    redundant_dict='../resource/redundant_dict.txt',
                    string_with_redundant=tmp)):
                continue
            j += 1
            result_str = removePrefix(tmp.strip(" "), "”")
            result = distattr2(sentence_id, result_str)
            lists.append(result)
            if j >= scope:
                break
        std = sorted(lists,
                     key=lambda x: 0.5 * len(x.strs) / (x.ids + 1),
                     reverse=True)
    except Exception:
        std = lists
    return std
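The factor 1 - log(key + 1) / log(sentence_nbr) used above discounts a sentence's score the later it appears in the document. A quick numeric check of that curve for a hypothetical 100-sentence document:

import math

sentence_nbr = 100
for position in (0, 9, 49, 99):
    damp = 1 - math.log(position + 1) / math.log(sentence_nbr)
    print(position, round(damp, 3))
# 0 1.0      the first sentence keeps its full score
# 9 0.5      the tenth sentence keeps half
# 49 0.151
# 99 0.0     the last sentence is discarded entirely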
def change(filepath):
    f = open(filepath, 'r', encoding=testDecode(filepath))
    fnew = open(filepath[:-4] + '_new.txt', 'w+', encoding="utf-8")  # write the result into a new file
    fristLine = None
    while True:
        if not fristLine:  # fristLine is empty, read the next line
            fristLine = f.readline()
            if not fristLine:
                break
        newLine = fristLine.strip()
        # print(newLine)
        if len(newLine) != 0:  # the first line has text
            # read the next non-blank line; on EOF, flush the pending line and exit
            secondLine = f.readline()
            while secondLine and len(secondLine.strip()) == 0:
                secondLine = f.readline()
            if not secondLine:  # no second line left, so exit
                fnew.write(fristLine.strip())
                break
            firstLast = fristLine.strip()[-1]
            if firstLast in [
                    "。", "*", ":", ">", "」", ')', '?', '!', ')', '=', '”', '^',
                    '*', '】', '▲', '▽', '☆', '○', '¨', '╔', '?', '》', ';'
            ]:  # a paragraph break is allowed here
                fnew.write(fristLine.rstrip())
                fnew.write("\n")
                fristLine = " %s" % (secondLine.strip())
            elif firstLast in [",", "…", "、", '「', '(', "(", "<", '—'] \
                    or is_chinese(firstLast) or firstLast.isalnum():  # no paragraph break: join the lines
                fnew.write(fristLine.rstrip())
                fristLine = secondLine.strip()
            else:
                print("special character %s" % firstLast)
                fnew.write(fristLine.rstrip())
                fnew.write("\n")
                fristLine = " %s" % (secondLine.strip())
            fnew.flush()
        else:  # the first line is blank, read again
            fristLine = f.readline()
    f.close()
    fnew.close()
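A small round-trip sketch of what change() produces. The file contents are made up, and testDecode is assumed to detect and return the file's encoding name (here it would simply need to return 'utf-8'):

with open("demo.txt", "w", encoding="utf-8") as fh:
    fh.write("他走进房间,\n看了看四周。\n桌上放着一本书。\n")

change("demo.txt")

print(open("demo_new.txt", encoding="utf-8").read())
# 他走进房间,看了看四周。    <- joined, because "," forbids a paragraph break
# 桌上放着一本书。            <- kept separate, because "。" allows one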
def extract_chinese_sentences(content):
    content = util.as_text(content)
    content = content.replace(' ', '')
    content = content.replace('\t', '')
    sentences = []
    s = ''
    for c in content:
        if util.is_chinese(c):
            s += c
        else:
            sentences.append(s)
            s = ''
    sentences.append(s)
    return [s.strip() for s in sentences if len(s.strip()) > 1]
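A short usage sketch. The input string is made up, and util.is_chinese is assumed to be True only for CJK ideographs, so ASCII letters and any punctuation act as separators:

text = "今天天气很好, see you tomorrow, 明天我们去爬山。"
print(extract_chinese_sentences(text))
# ['今天天气很好', '明天我们去爬山']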
def read_from_word_txt(start, emission, transition):
    ## ! refinement based on word.txt
    print('read from word.txt')
    _base = 1000.
    _min_value = 2.
    for line in open(WORD_FILE):
        line = util.as_text(line.strip())
        if '=' not in line:
            continue
        if len(line) < 3:
            continue
        ls = line.split('=')
        if len(ls) != 2:
            continue
        word, num = ls
        word = word.strip()
        num = num.strip()
        if len(num) == 0:
            continue
        num = float(num)
        num = max(_min_value, num / _base)
        if not util.is_chinese(word):
            continue

        ## for start
        start.setdefault(word[0], 0)
        start[word[0]] += num

        ## for emission
        pinyin_list = topinyin(word)
        char_list = [c for c in word]
        for hanzi, pinyin in zip(char_list, pinyin_list):
            emission.setdefault(hanzi, {})
            emission[hanzi].setdefault(pinyin, 0)
            emission[hanzi][pinyin] += num

        ## for transition
        for f, t in zip(word[:-1], word[1:]):
            transition.setdefault(f, {})
            transition[f].setdefault(t, 0)
            transition[f][t] += num
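The weighting applied to each word frequency is easy to miss in the middle of the parsing code. Pulled out on its own (weight is an illustrative name and the sample counts are made up), it behaves like this:

_base = 1000.
_min_value = 2.

def weight(raw_count):
    # Frequent words are damped by _base, rare words are floored at _min_value,
    # so every dictionary entry still contributes something to the counts.
    return max(_min_value, raw_count / _base)

print(weight(3049230.0))  # 3049.23  (frequent word, scaled down)
print(weight(500.0))      # 2.0      (rare word, floored)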
def stdOut(self, rank, dicts, top):
    lists = list()
    j = 0
    try:
        for sentence_id in rank:
            tmp = dicts[sentence_id]
            tmp2 = [x for x in tmp if is_chinese(x)]
            if (len(tmp2) < 8 or contain_redundant(
                    redundant_dict='../resource/redundant_dict.txt',
                    string_with_redundant=tmp)):
                continue
            j += 1
            result_str = removePrefix(tmp.replace(" ", ""), "”")
            result = distattr2(sentence_id, result_str)
            lists.append(result)
            if j >= top:
                break
        std = sorted(lists, key=lambda x: x.ids)
    except Exception:
        std = lists
    return std
def calc_x_offset(self, text, size):
    # Chinese characters are counted as 1.6x the font size, everything else as 0.8x.
    offset = sum([size * 1.6 for st in text if is_chinese(st)])
    offset += sum([size * 0.8 for st in text if not is_chinese(st)])
    return offset
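A quick sanity check of the width heuristic, with the class context stripped away (assumes is_chinese is available as in the other snippets): at size 10, a CJK character counts as 16 units wide and everything else as 8.

text, size = "你好ab", 10
offset = sum(size * 1.6 for ch in text if is_chinese(ch)) \
       + sum(size * 0.8 for ch in text if not is_chinese(ch))
print(offset)  # 2 * 16 + 2 * 8 = 48.0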
def testcChinese():
    for str in "我1.。a":  # only "我" should be True
        print("%s is chinese:%r" % (str, is_chinese(str)))
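None of the snippets above actually defines is_chinese. A minimal sketch of a compatible implementation, assuming only the CJK Unified Ideographs block needs to match (which is what the test expects: only "我" is True, while "1", ".", "。" and "a" are not):

def is_chinese(text):
    # True when every character lies in the CJK Unified Ideographs block
    # (U+4E00-U+9FFF); works for a single character or a whole string,
    # matching both styles of call seen in the snippets above.
    return bool(text) and all('\u4e00' <= ch <= '\u9fff' for ch in text)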
def main():
    # load data
    conn = MySQLdb.connect(host=args.host,
                           user=args.user,
                           passwd=args.passwd,
                           db=args.db,
                           charset='utf8')
    cur = conn.cursor()
    cur.execute('select id, content_html from t_crawler_obj limit ' +
                args.file[0] + ',' + args.file[1])
    data = cur.fetchall()

    # load model
    model = doc2vec.Doc2Vec.load(args.model)

    # parse data with BeautifulSoup
    dicts1 = dict()
    for line in data:
        ids, content_html = line
        content = BeautifulSoup(content_html, "html.parser")
        dicts1[ids] = content.get_text()

    # split sentences
    # nested dict dicts2 -> key: paper, value: dicttmp -> key: sentence id, value: sentence string
    dicts2 = defaultdict(dict)
    for key, value in dicts1.items():
        lists = cut_sentence_new(value)
        dicttmp = dict()
        for key2, value2 in enumerate(lists):
            dicttmp[key2] = value2
        dicts2[key] = dicttmp

    # split words
    # dicts3 -> key: paper, value: dicttmp -> key: sentence id, value: token list
    dicts3 = defaultdict(dict)
    analyse.set_stop_words('../resource/stop_words.txt')
    for key, value in dicts2.items():
        dicttmp = dict()
        for key2, value2 in value.items():
            seg_list = jieba.cut(
                string_parser(punc_file='../resource/punc_file.txt',
                              string_with_punc=value2))
            seg_list = filter(lambda x: x != " ", seg_list)
            lists = list(seg_list)
            if len(lists) >= 3:  # keep sentences with at least 3 tokens
                dicttmp[key2] = lists
        dicts3[key] = dicttmp

    # vectorization and TextRank
    for key, value in dicts3.items():
        dictrember = dict()
        X = list()
        i = 0
        for key2, value2 in value.items():
            dictrember[i] = key2  # i: row index in X; key2: sentence order
            X.append(model.infer_vector(value2))
            i += 1
        X = np.array(X, dtype='float32')
        distance_matrix = pairwise_distances(X, metric='cosine')
        rank = rankgetter(distance_matrix=distance_matrix,
                          dictrember=dictrember)
        j = 0
        try:
            lists = list()
            for info in rank:
                ind = info.ids  # sentence order
                tmp = dicts2[key][ind]
                tmp2 = [x for x in tmp if is_chinese(x)]
                if (len(tmp2) < 8 or contain_redundant(
                        redundant_dict='../resource/redundant_dict.txt',
                        string_with_redundant=dicts2[key][ind])):
                    continue
                j += 1
                result_str = removePrefix(dicts2[key][ind].replace(" ", ""), "”")
                result = distattr2(ind, result_str)
                lists.append(result)
                if j >= args.top:
                    break
            # print the results in original sentence order
            stdOut = sorted(lists, key=lambda x: x.ids)
            for key3, sentence3 in enumerate(stdOut):
                print(str(key) + " " + str(key3 + 1) + ": " + sentence3.strs)
        except Exception:
            print("No More Qualified Sentence!")
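rankgetter is referenced above but not shown. A plausible sketch of that step, assumed rather than taken from the original code: turn the cosine distance matrix into a similarity graph and run PageRank over it, returning objects with an ids field so the loop above can consume them (RankedSentence is an illustrative stand-in for the distattr result type).

from collections import namedtuple

import networkx as nx
import numpy as np

RankedSentence = namedtuple('RankedSentence', ['ids', 'score'])

def rankgetter_sketch(distance_matrix, dictrember):
    # similarity = 1 - cosine distance; clip negatives and zero the diagonal
    # so a sentence does not vote for itself.
    similarity = np.clip(1.0 - distance_matrix, 0.0, None)
    np.fill_diagonal(similarity, 0.0)
    graph = nx.from_numpy_array(similarity)        # weighted, undirected
    scores = nx.pagerank(graph, weight='weight')   # TextRank-style scoring
    ranked = sorted(scores.items(), key=lambda kv: kv[1], reverse=True)
    # map matrix row index back to the sentence's position in the document
    return [RankedSentence(ids=dictrember[i], score=s) for i, s in ranked]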