def sentence_evaluate(self, sentence):
    seg_list = jieba.cut(
        string_parser(punc_file='../resource/punc_file.txt',
                      string_with_punc=sentence))
    seg_list = filter(lambda x: x != " ", seg_list)
    lists = list(seg_list)
    return lists
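# A minimal usage sketch (hypothetical, not part of the original pipeline):
# it demonstrates the same clean-then-segment flow on a standalone string,
# assuming jieba is imported and string_parser/punc_file.txt exist as above.
def _demo_tokenize(sentence=u"今天天气很好，我们去公园吧"):
    cleaned = string_parser(punc_file='../resource/punc_file.txt',
                            string_with_punc=sentence)
    return [t for t in jieba.cut(cleaned) if t != " "]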
def main():
    conn = MySQLdb.connect(host=args.host, user=args.user, passwd=args.passwd,
                           db=args.db, charset='utf8')
    cur = conn.cursor()
    cur.execute('select id, content from t_news_detail order by rand() limit 200000')
    data = cur.fetchall()
    # parse data by beautiful soup
    with open("../resource/split_words.txt", "wb") as f:
        for line in data:
            ids, content_html = line
            content = BeautifulSoup(content_html, "html.parser")
            text = content.get_text()
            try:
                sentences = cut_sentence_new(text)
                for key2, value2 in enumerate(sentences):
                    label = str(ids) + "_" + str(key2)
                    seg_list = jieba.cut(
                        string_parser(punc_file='../resource/punc_file.txt',
                                      string_with_punc=value2))
                    words = list(filter(lambda x: x != " ", seg_list))
                    # only save sentences with at least 3 tokens
                    if len(words) >= 3:
                        vector = ",".join(words)
                        f.write(label + ',' + vector + '\n')
            except Exception:
                continue
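# Each line written to split_words.txt is "<newsId>_<sentenceIndex>" followed
# by the comma-separated tokens of that sentence, e.g. (hypothetical values):
#
#   1024_0,北京,今天,天气,晴朗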
def sentence_evaluate(sentence):
    # POS-tag the cleaned sentence, then keep noun-like tokens
    # (flag contains 'n') of length >= 2 that are not stop words
    seg_list = pseg.cut(
        string_parser(punc_file='../resource/punc_file.txt',
                      string_with_punc=sentence))
    seg_list = filter(
        lambda x: x.word != "" and x.word not in stopw
        and 'n' in x.flag and len(x.word) >= 2,
        seg_list)
    words = map(lambda x: x.word, seg_list)
    words = filter(lambda x: len(x) != 0, words)
    return words
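# Quick illustration of the noun filter above (a hypothetical sketch,
# assuming `import jieba.posseg as pseg`). pseg.cut yields pair objects with
# .word and .flag; the `'n' in x.flag` test keeps plain nouns ('n', 'nr',
# 'ns', 'nt', ...) and also matches compound tags such as 'vn'.
def _demo_pos_filter(sentence=u"机器学习正在改变世界"):
    return [(p.word, p.flag) for p in pseg.cut(sentence) if 'n' in p.flag]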
def evaluate(content, keyword):
    scope = 6
    if keyword[0] is not None:
        try:
            model_word[keyword[0]]
        except KeyError:
            # out-of-vocabulary keyword: fall back to its jieba segments
            keyword = list(jieba.cut(
                string_parser(punc_file='../resource/punc_file.txt',
                              string_with_punc=keyword[0])))
    # split text into sentences
    lists = cut_sentence_new(content)
    dictssentence = {key: value.strip("\n") for key, value in enumerate(lists)}
    # split sentences into words; only sentences longer than `scope` are
    # considered, and the top 20 words by importance are kept
    words_importance = evaluate_words(dictssentence, keyword)[:20]
    words = map(lambda (word, importances): word, words_importance)
    keywords_list = map(lambda x: word2vec_evaluate(x), words)
    keywords_list = filter(lambda x: len(x) != 0, keywords_list)
    # average the word vectors into a single document vector
    denominator = len(keywords_list)
    agg = reduce(lambda x, y: x + y, keywords_list)
    agg_final = map(lambda x: str(x), agg / denominator)
    return agg_final
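# word2vec_evaluate is defined elsewhere in the repo. A minimal sketch that
# is consistent with how it is used above (return the word's vector, or an
# empty list for out-of-vocabulary words so the caller's len(x) != 0 filter
# drops it), assuming model_word is a loaded gensim word2vec model:
def _word2vec_evaluate_sketch(word):
    try:
        return model_word[word]  # numpy vector lookup
    except KeyError:
        return []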
def evaluate():
    args = parser.parse_args()
    scope = 6
    content = args.content
    keyword = [args.keyword]
    if keyword[0] is not None:
        try:
            model_word[keyword[0]]
        except KeyError:
            # out-of-vocabulary keyword: fall back to its jieba segments
            keyword = list(jieba.cut(
                string_parser(punc_file='../resource/punc_file.txt',
                              string_with_punc=keyword[0])))
    # split text into sentences
    lists = cut_sentence_new(content)
    dictssentence = {key: value.strip("\n") for key, value in enumerate(lists)}
    # split sentences into words; only sentences longer than `scope` are
    # considered, and the top 20 words by importance are kept
    words_importance = evaluate_words(dictssentence, keyword)[:20]
    words = map(lambda (word, importances): word, words_importance)
    # rank sentences; fall back to the traditional scorer when the
    # vector-based ranking yields nothing
    sentence_result_lists = evaluate_sentence(dictssentence)
    if len(sentence_result_lists) == 0:
        result_final = evaluate_sentence_tradition(dictssentence, words_importance)
        outputString = dictssentence.get(result_final[0].ids, "")
        return (outputString, words)
    sentence = stdOut(sentence_result_lists, dictssentence, scope)
    if len(sentence) == 0:
        result_final = evaluate_sentence_tradition(dictssentence, words_importance)
        outputString = dictssentence.get(result_final[0].ids, "")
        return (outputString, words)
    outputString = sentence[0].strs
    return (outputString, words)
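# Hypothetical invocation (flag names inferred from args.content and
# args.keyword; the actual argparse definition lives elsewhere):
#
#   python evaluate.py --content "<article text>" --keyword "人工智能"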
def main():
    # load data
    conn = MySQLdb.connect(host=args.host, user=args.user, passwd=args.passwd,
                           db=args.db, charset='utf8')
    cur = conn.cursor()
    cur.execute('select id, content_html from t_crawler_obj limit '
                + args.file[0] + ',' + args.file[1])
    data = cur.fetchall()
    # load model
    model = doc2vec.Doc2Vec.load(args.model)
    # parse data by beautiful soup
    dicts1 = dict()
    for line in data:
        ids, content_html = line
        content = BeautifulSoup(content_html, "html.parser")
        dicts1[ids] = content.get_text()
    # split sentences
    # nested dict: dicts2 -> key: paper id, value: dicttmp -> key: sentence id,
    # value: sentence string
    dicts2 = defaultdict(dict)
    for key, value in dicts1.items():
        lists = cut_sentence_new(value)
        dicttmp = dict()
        for key2, value2 in enumerate(lists):
            dicttmp[key2] = value2
        dicts2[key] = dicttmp
    # split words: dicts3 -> key: paper id, value: dicttmp -> key: sentence id,
    # value: token list
    dicts3 = defaultdict(dict)
    analyse.set_stop_words('../resource/stop_words.txt')
    for key, value in dicts2.items():
        dicttmp = dict()
        for key2, value2 in value.items():
            seg_list = jieba.cut(
                string_parser(punc_file='../resource/punc_file.txt',
                              string_with_punc=value2))
            words = list(filter(lambda x: x != " ", seg_list))
            # only keep sentences with at least 3 tokens
            if len(words) >= 3:
                dicttmp[key2] = words
        dicts3[key] = dicttmp
    # vectorization and textrank
    for key, value in dicts3.items():
        dictrember = dict()
        X = list()
        i = 0
        for key2, value2 in value.items():
            dictrember[i] = key2  # i: row index in X; key2: sentence order
            X.append(model.infer_vector(value2))
            i += 1
        X = np.array(X, dtype='float32')
        distance_matrix = pairwise_distances(X, metric='cosine')
        rank = rankgetter(distance_matrix=distance_matrix, dictrember=dictrember)
        j = 0
        try:
            lists = list()
            for info in rank:
                ind = info.ids  # sentence order
                tmp = dicts2[key][ind]
                # measure length on Chinese characters only, and skip
                # sentences that hit the redundant-phrase dictionary
                tmp2 = filter(lambda x: is_chinese(x), tmp)
                if (len(tmp2) < 8 or contain_redundant(
                        redundant_dict='../resource/redundant_dict.txt',
                        string_with_redundant=dicts2[key][ind])):
                    continue
                j += 1
                result_str = removePrefix(dicts2[key][ind].replace(" ", ""),
                                          "”".decode("utf8"))
                result = distattr2(ind, result_str)
                lists.append(result)
                if j >= args.top:
                    break
            # print the selected sentences in their original order
            stdOut = sorted(lists, key=lambda x: x.ids)
            for key3, sentence3 in enumerate(stdOut):
                print str(key) + " " + str(key3 + 1) + ": " + sentence3.strs
        except Exception:
            print("No More Qualified Sentence!")
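# rankgetter is defined elsewhere in the repo. A TextRank-style sketch of
# what it is expected to do here (hypothetical implementation, assuming
# networkx is available): convert cosine distances to similarities, run
# PageRank over the sentence graph, and return items exposing .ids ordered
# by descending score.
import networkx as nx
from collections import namedtuple

_RankItem = namedtuple('_RankItem', ['ids', 'score'])

def _rankgetter_sketch(distance_matrix, dictrember):
    graph = nx.from_numpy_array(1.0 - distance_matrix)
    scores = nx.pagerank(graph)
    ranked = sorted(scores.items(), key=lambda kv: kv[1], reverse=True)
    return [_RankItem(dictrember[i], s) for i, s in ranked]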