Пример #1
0
def process_mail(mail, exist=0):
    """Score the sentences of a mail with LexRank and title similarity.

    Args:
        mail: Object whose ``text`` attribute holds the sentence collection.
            The whole object is also handed to ``cal_similarity``.
        exist: Unused; kept so existing callers keep working.

    Returns:
        A tuple ``(vec, important, title_sim)`` where ``vec`` is
        ``mail.text``, ``important`` is the min-max normalized result of
        ``lexrank`` with NaN entries replaced by 0, and ``title_sim`` is the
        second value returned by ``cal_similarity``, min-max normalized
        unless all of its values are equal.
    """
    vec = mail.text
    matrix, title_sim = cal_similarity(mail)

    important = lexrank(matrix, len(vec), 0.001)
    lowest, highest = min(important), max(important)
    # NOTE(review): when every score is equal this is a 0/0 division; the
    # code below relies on that producing NaN (as array types such as NumPy
    # do) rather than raising -- confirm the type returned by lexrank.
    important = (important - lowest) / (highest - lowest)

    sim_low, sim_high = min(title_sim), max(title_sim)
    if sim_high - sim_low != 0:
        title_sim = (title_sim - sim_low) / (sim_high - sim_low)

    # Diagnostic output: dump the similarity matrix when scoring broke down.
    if math.isnan(important[0]):
        print('similarity')
        print(matrix)

    # Replace undefined scores with 0 so callers always get finite values.
    for k, score in enumerate(important):
        if math.isnan(score):
            important[k] = 0
    return vec, important, title_sim
Пример #2
0
def lexrankSentences(match):
    """Return the abstract plus the first two entries of the LexRank result.

    ``match`` is unpacked as ``(abstract, text)``. Newlines in ``text`` are
    replaced by spaces, the contents of every ``<P>...</P>`` element are
    collected, and that list is passed to ``lexrank``. If the result holds
    fewer than two entries, both returned sentences are empty strings.
    """
    abstract, text = match
    flattened = re.sub(r'\n', " ", text)
    paragraphs = re.findall(r'<P>([\w\W]+?)</P>', flattened)
    ranked = lexrank(paragraphs)
    if len(ranked) < 2:
        return abstract, '', ''
    return abstract, ranked[0], ranked[1]
Пример #3
0
def oz(yontem, so, haber):
    """Summarize ``haber`` with the method named by ``yontem``.

    Prints a message and returns ``None`` when ``haber`` is empty or when
    the compression ratio ``so`` is ``""``, ``0`` or ``'0'``. An input of
    length one is returned unchanged. Otherwise the chosen summarizer
    (``textrank``, ``kumeleme``, ``lexrank``, or ``klasik`` as the fallback)
    is called with ``(haber, so)`` and its result is joined with ``' . '``.
    """
    if len(haber) < 1:
        print('Lütfen Cümle Giriniz')
        return None

    if so == "" or so == 0 or so == '0':
        print('Lütfen Özet Uzunluğu için Sıkıştırma Oranı Giriniz')
        return None

    if len(haber) < 2:
        return haber

    # Pick the summarizer lazily so only the selected name is looked up.
    if yontem == "textrank":
        summarizer = textrank
    elif yontem == "kumeleme":
        summarizer = kumeleme
    elif yontem == "lexrank":
        summarizer = lexrank
    else:
        summarizer = klasik

    selected = summarizer(haber, so)
    return ' . '.join(selected) + ' . '
Пример #4
0
def summarize(txtfilepath, maxlength, model):
    """Build a summary from the highest-ranked sentences of a text file.

    Args:
        txtfilepath: Path of the UTF-8 text file to summarize.
        maxlength: Upper bound on the summary length. A sentence is added
            only while the text built so far (newlines included) plus that
            sentence is strictly shorter than this value.
        model: Passed through unchanged to ``lexrank.lexrank``.

    Returns:
        The selected sentences in descending rank order, each followed by
        ``"\n"``. Returns ``""`` when no sentence fits or the file yields no
        sentences.
    """
    # The context manager closes the file even if reading fails.
    with open(txtfilepath, "r", encoding="UTF-8") as text_file:
        text = text_file.read()

    sentences = list(sent_splitter_ja(text))
    rank = lexrank.lexrank(sentences, len(sentences), 0.5, "word2vec", model)

    # Keyed by sentence text: a repeated sentence keeps its first position
    # and takes the rank of its last occurrence.
    score_by_sentence = {
        sentence: rank[i] for i, sentence in enumerate(sentences)
    }
    ordered = [
        sentence
        for sentence, _ in sorted(
            score_by_sentence.items(), key=lambda item: -item[1]
        )
    ]

    parts = []
    used = 0
    for sentence in ordered:
        # Stop at the first sentence that does not fit; later, shorter
        # sentences are deliberately not considered.
        if used + len(sentence) >= maxlength:
            break
        parts.append(sentence + "\n")
        used += len(sentence) + 1
    return "".join(parts)
Пример #5
0
def process_document(title, sen2=1, exist=1):
    """Crawl an article and score its sentences with LexRank.

    Args:
        title: Article title passed to ``crawl_article``. Underscores are
            replaced by spaces before stop-word removal.
        sen2: When equal to 1 the sentence collection is built with
            ``construct_2sen_vec``, otherwise with ``construct_sentence_vec``.
        exist: Unused; kept so existing callers keep working.

    Returns:
        A tuple ``(vec, important, title_sim)``: the sentence collection,
        the min-max normalized ``lexrank`` result with NaN entries replaced
        by 0, and the min-max normalized second value returned by
        ``cal_similarity``.
    """
    article, _art_title = crawl_article(title)
    if sen2 == 1:
        vec = construct_2sen_vec(article)
    else:
        vec = construct_sentence_vec(article)

    tmp = mail_struct()
    tmp.name = title
    tmp.text = vec
    tmp.remove_stop_ver_title = stop_word_remove(title.replace('_', ' '))

    matrix, title_sim = cal_similarity(tmp)
    important = lexrank(matrix, len(vec), 0.001)
    for k, score in enumerate(important):
        if math.isnan(score):
            important[k] = 0

    # Guard both normalizations: when all values are equal the range is 0
    # and the division would be 0/0.
    low, high = min(important), max(important)
    if high - low != 0:
        important = (important - low) / (high - low)
    else:
        # All scores equal: every normalized score becomes 0.
        important = important - low

    sim_low, sim_high = min(title_sim), max(title_sim)
    if sim_high - sim_low != 0:
        title_sim = (title_sim - sim_low) / (sim_high - sim_low)

    return vec, important, title_sim