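"""Hybrid.py -- hybrid extractive summarizer.

For each case file, sentence scores from gensim's TextRank internals
(PageRank over a sentence-similarity graph) are combined with normalized
word-frequency scores; the top sentences are then packed with a 0/1
knapsack so the final summary stays within a MAX_WORDS word budget.
"""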
import os
import heapq

import nltk
# nltk.download('punkt')  # needed once for nltk.word_tokenize

# NOTE: gensim.summarization was removed in gensim 4.0; this script needs gensim < 4.0.
from gensim.summarization.summarizer import (
    _set_graph_edge_weights,
    _build_corpus,
    _clean_text_by_sentences,
    _build_hasheable_corpus,
)
from gensim.summarization.pagerank_weighted import pagerank_weighted as _pagerank
from gensim.summarization.commons import build_graph as _build_graph
from gensim.summarization.commons import remove_unreachable_nodes as _remove_unreachable_nodes
from knapsack import knapsack  # 0/1 knapsack solver (local module or the `knapsack` PyPI package)

DATASET_LOCATION = "/home/carry/opensoft/OpenSoft-Data/All_FT"
DESTINATION_LOCATION = "/home/carry/opensoft/Opensoft-2019/api"
# Skip hidden files (e.g. ".gitignore") in the dataset directory.
case_filenames = [f for f in os.listdir(DATASET_LOCATION) if not f.startswith(".")]
MAX_WORDS = 100  # word budget for each summary
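
# For every case file: score sentences with TextRank (gensim internals) and
# with normalized word frequencies, merge the two scores, then select a
# subset of the best sentences with a knapsack so the summary fits MAX_WORDS.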
for i, case_filename in enumerate(case_filenames):
    with open("{}/{}".format(DATASET_LOCATION, case_filename)) as f:
        text = f.read().strip()
    text = text.split("\n", 6)[6]  # drop the first six lines (file header/metadata)

    # TextRank: split into sentences, build the sentence-similarity graph,
    # and run weighted PageRank over it (all via gensim's internals).
    sentences = _clean_text_by_sentences(text)
    if not sentences:
        continue  # nothing to summarize
    sent_for_nltk = [sent.text for sent in sentences]
    nltk_str = " ".join(sent_for_nltk)
    corpus = _build_corpus(sentences)
    hashable_corpus = _build_hasheable_corpus(corpus)
    graph = _build_graph(hashable_corpus)
    _set_graph_edge_weights(graph)
    _remove_unreachable_nodes(graph)
    pagerank_scores = _pagerank(graph)
    sentences_by_corpus = dict(zip(hashable_corpus, sentences))
    # (the final sentence is left out of the PageRank lists)
    get_sentences = [sentences_by_corpus[doc] for doc in hashable_corpus[:-1]]
    get_scores = [pagerank_scores.get(doc) for doc in hashable_corpus[:-1]]
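
    # Normalized word frequencies: count every token (lowercased, to match
    # the lowercased lookups below) and scale counts into [0, 1] by the
    # maximum count.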
    word_frequencies = {}
    for word in nltk.word_tokenize(nltk_str.lower()):
        word_frequencies[word] = word_frequencies.get(word, 0) + 1
    maximum_frequency = max(word_frequencies.values())
    for word in word_frequencies:
        word_frequencies[word] /= maximum_frequency
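
    # Sentence sanitization: gensim splits on every period, which breaks
    # sentences at abbreviations ("No.", "vs.") and tokens containing "/".
    # Re-merge a sentence with the next one when its last token looks like
    # an abbreviation.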
    sentence_scores = {}
    stopped_sentences = []
    sani_sent_list = []
    new_sent = sent_for_nltk[0]
    for j in range(len(sent_for_nltk) - 1):
        last_word = new_sent.split(" ")[-1]
        if last_word and last_word[-1] != ".":
            new_sent += "."
        last_word = last_word.rstrip(".")  # strip the trailing period before the abbreviation test
        if len(last_word) < 4 or "." in last_word or "/" in last_word:
            # Looks like an abbreviation: glue the next sentence on.
            new_sent += " " + sent_for_nltk[j + 1]
        else:
            sani_sent_list.append(new_sent)
            new_sent = sent_for_nltk[j + 1]
    if not new_sent.endswith("."):
        new_sent += "."
    sani_sent_list.append(new_sent)
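
    # Frequency score: average of the normalized frequencies of a sentence's
    # tokens (total frequency mass divided by sentence length).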
    for sent in sani_sent_list:
        j = 0
        stopped_sent_words = []
        for word in nltk.word_tokenize(sent.lower()):
            j += 1
            if word in word_frequencies:
                sentence_scores[sent] = sentence_scores.get(sent, 0) + word_frequencies[word]
                stopped_sent_words.append(word)
        stopped_sentences.append(" ".join(stopped_sent_words))
        if j and sent in sentence_scores:
            sentence_scores[sent] /= j  # normalize by sentence length
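
    # Hybrid score: start from the frequency score, then add the PageRank
    # scores of the gensim sentences that make up each merged sentence
    # (j tracks where the previous merged sentence ended, l how many gensim
    # sentences the current one consumed).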
    # Sentences pruned from the graph have no PageRank entry; treat them as 0.
    get_scores = [score if score is not None else 0 for score in get_scores]
    j = 0
    l = 0
    final_sentence_scores = {}
    for sent in sani_sent_list:
        j += l
        l = 0
        final_sentence_scores[sent] = sentence_scores.get(sent, 0)
        for k, sentence in enumerate(get_sentences[j:], start=j):
            if sentence.text[-1] != ".":
                sentence.text += "."
            final_sentence_scores[sent] += get_scores[k]
            l += 1
            if sent.endswith(sentence.text):
                break  # reached the last gensim sentence of this merged sentence
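
    # Selection: take the 30 highest-scoring sentences, then solve a 0/1
    # knapsack (weight = word count, value = score per word) so the chosen
    # sentences fit within the MAX_WORDS budget.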
    summary_sentences = heapq.nlargest(30, final_sentence_scores, key=final_sentence_scores.get)
    sizes = [len(s.split(" ")) for s in summary_sentences]
    values = [final_sentence_scores[s] / len(s.split(" ")) for s in summary_sentences]
    max_value, selected_indices = knapsack(sizes, values).solve(MAX_WORDS)
    summary = " ".join(summary_sentences[s] for s in selected_indices)
    words_in_summary = len(summary.split(" "))
    print("\n File : {} \n Summary : {} \n Words : {} \n".format(case_filename, summary, words_in_summary))
    with open("{}/{}".format(DESTINATION_LOCATION, case_filename), "w") as out:
        out.write(summary)