def process_mail(mail, exist=0):
    """Score the sentences of a mail by LexRank importance and title similarity.

    Args:
        mail: Object whose ``text`` attribute holds the sentence collection.
            The whole object is handed to ``cal_similarity``.
        exist: Unused; kept so existing callers keep working.

    Returns:
        A ``(vec, important, title_sim)`` tuple where ``vec`` is ``mail.text``,
        ``important`` holds the min-max scaled LexRank scores with NaN entries
        replaced by 0, and ``title_sim`` holds the min-max scaled title
        similarities (left untouched when all values are equal).
    """
    vec = mail.text

    matrix, title_sim = cal_similarity(mail)
    important = lexrank(matrix, len(vec), 0.001)

    # Min-max scale the LexRank scores.  When every score is identical the
    # spread is zero; the plain formula would compute 0/0, i.e. NaN for every
    # entry, which the scrub below turns into 0.  Produce those zeros directly
    # instead of dividing by zero.
    lowest = min(important)
    spread = max(important) - lowest
    if spread != 0:
        important = (important - lowest) / spread
        degenerate = math.isnan(important[0])
    else:
        important = important - lowest
        degenerate = True

    # Same guarded scaling for the title similarities; a flat vector is
    # returned unscaled.
    sim_lowest = min(title_sim)
    sim_spread = max(title_sim) - sim_lowest
    if sim_spread != 0:
        title_sim = (title_sim - sim_lowest) / sim_spread

    if degenerate:
        # Diagnostic dump of the similarity matrix behind the unusable scores.
        print('similarity')
        print(matrix)

    # Any remaining NaN score counts as "not important".
    for k, score in enumerate(important):
        if math.isnan(score):
            important[k] = 0

    return vec, important, title_sim
def lexrankSentences(match):
    """Return the abstract plus the first two sentences LexRank yields.

    Args:
        match: A two-item sequence ``(abstract, text)``.  ``text`` is expected
            to contain ``<P>...</P>`` paragraph markup.

    Returns:
        ``(abstract, s1, s2)``.  ``s1`` and ``s2`` are the first and second
        items returned by ``lexrank`` (presumably the top-ranked ones -
        confirm against ``lexrank``); both are empty strings when fewer than
        two items come back.
    """
    abstract, text = match

    # Newlines become spaces so every extracted paragraph is a single line.
    flattened = re.sub(r'\n', " ", text)
    paragraphs = re.findall(r'<P>([\w\W]+?)</P>', flattened)

    ranked = lexrank(paragraphs)
    if len(ranked) < 2:
        return abstract, '', ''
    return abstract, ranked[0], ranked[1]
def oz(yontem, so, haber):
    """Summarise ``haber`` with the selected method and compression ratio.

    Args:
        yontem: Method name: ``"textrank"``, ``"kumeleme"`` or ``"lexrank"``.
            Any other value falls back to ``klasik``.
        so: Compression ratio for the summary length.  ``""``, ``0`` and
            ``'0'`` are rejected.
        haber: The sentences to summarise.

    Returns:
        ``None`` after printing a prompt when ``haber`` is empty or ``so`` is
        missing; ``haber`` itself when it has a single element; otherwise the
        selected sentences joined with ``' . '`` and terminated by ``' . '``.
    """
    if len(haber) < 1:
        # "Please enter a sentence"
        print('Lütfen Cümle Giriniz')
        return None

    if so in ("", 0, '0'):
        # "Please enter a compression ratio for the summary length"
        print('Lütfen Özet Uzunluğu için Sıkıştırma Oranı Giriniz')
        return None

    if len(haber) < 2:
        # Nothing to rank with a single element.
        return haber

    if yontem == "textrank":
        summariser = textrank
    elif yontem == "kumeleme":
        summariser = kumeleme
    elif yontem == "lexrank":
        summariser = lexrank
    else:
        summariser = klasik

    chosen_sentences = summariser(haber, so)
    return ' . '.join(chosen_sentences) + ' . '
def summarize(txtfilepath, maxlength, model):
    """Build an extractive summary of a UTF-8 text file.

    The text is split with ``sent_splitter_ja``, scored with
    ``lexrank.lexrank`` and the best-scoring sentences are appended, one per
    line, while the summary stays shorter than ``maxlength``.

    Args:
        txtfilepath: Path of the UTF-8 encoded text file to summarise.
        maxlength: Length limit.  A sentence is added only while the summary
            so far plus that sentence is strictly shorter than this value.
        model: Passed through unchanged to ``lexrank.lexrank`` together with
            the ``"word2vec"`` mode string.

    Returns:
        The selected sentences, each followed by a newline, in descending
        score order.  An empty string when the file contains no sentences or
        the best sentence alone does not fit.
    """
    # The context manager closes the file even if reading fails.
    with open(txtfilepath, "r", encoding="UTF-8") as text_file:
        text = text_file.read()

    sentences = list(sent_splitter_ja(text))
    if not sentences:
        # Nothing to rank; indexing the first candidate would raise IndexError.
        return ""

    rank = lexrank.lexrank(sentences, len(sentences), 0.5, "word2vec", model)

    # Repeated sentences collapse into one entry that keeps the position of
    # its first occurrence and the score of its last occurrence.
    score_by_sentence = {}
    for i, sentence in enumerate(sentences):
        score_by_sentence[sentence] = rank[i]

    # Highest score first; the stable sort keeps document order among ties.
    ordered = sorted(score_by_sentence, key=lambda s: -score_by_sentence[s])

    ret = ""
    for sentence in ordered:
        # The separator newline of the candidate is not counted in the check.
        if len(ret + sentence) >= maxlength:
            break
        ret += sentence + "\n"
    return ret
def process_document(title, sen2=1, exist=1):
    """Crawl an article and score its sentences by LexRank and title similarity.

    Args:
        title: Article title passed to ``crawl_article``.  Underscores are
            replaced by spaces before stop-word removal.
        sen2: When equal to 1 the article is vectorised with
            ``construct_2sen_vec``; any other value uses
            ``construct_sentence_vec``.
        exist: Unused; kept so existing callers keep working.

    Returns:
        A ``(vec, important, title_sim)`` tuple: the sentence vector, the
        min-max scaled LexRank scores (NaN scores are zeroed before scaling;
        all zeros when every score is equal) and the min-max scaled title
        similarities (left unscaled when all values are equal).
    """
    article, _ = crawl_article(title)
    if sen2 == 1:
        vec = construct_2sen_vec(article)
    else:
        vec = construct_sentence_vec(article)

    # Wrap the document in the structure cal_similarity consumes.
    tmp = mail_struct()
    tmp.name = title
    tmp.text = vec
    tmp.remove_stop_ver_title = stop_word_remove(title.replace('_', ' '))

    matrix, title_sim = cal_similarity(tmp)
    important = lexrank(matrix, len(vec), 0.001)

    # Scrub NaN scores first so they cannot poison min()/max() below.
    for k, score in enumerate(important):
        if math.isnan(score):
            important[k] = 0

    # Min-max scale the scores.  A zero spread means every score is equal;
    # subtracting the minimum then already yields the all-zero result, and
    # dividing would only produce NaN.
    lowest = min(important)
    spread = max(important) - lowest
    important = important - lowest
    if spread != 0:
        important = important / spread

    # Same guard for the title similarities; a flat vector is returned
    # unscaled, matching how process_mail treats it.
    sim_lowest = min(title_sim)
    sim_spread = max(title_sim) - sim_lowest
    if sim_spread != 0:
        title_sim = (title_sim - sim_lowest) / sim_spread

    return vec, important, title_sim