# mock-up of summarising
# load up a text file
# split into sentences
# load up the corresponding tags from the database
# find the sentences that contain the largest number of tags
# maybe include bigram analysis? (see the sketch at the end of this file)
# write the best candidates to the database - no need yet (see the sketch at the end of this file)
import nltk
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.tokenize import TweetTokenizer, sent_tokenize
import searchlib as lib
from untokenize import untokenize
path = 'C:/Users/bazilj/PycharmProjects/nltk_test/text_samples/CS Results Retrieving archived data.txt'
with open(path, 'r') as source_file:
    contents = source_file.read()
clean = lib.clean_text(contents)
tokens = lib.tokenize(clean)
lemma = WordNetLemmatizer()
token_dict = {}
lemmed = []
for token in tokens:
    # lemmatize each token once, using its POS tag to pick the wordnet part of speech
    root = lemma.lemmatize(token, lib.get_wordnet_pos(nltk.pos_tag(nltk.word_tokenize(token))[0][1]))
    # dictionary of original word -> lemmatised transformation
    token_dict[token] = root
    # root words on their own, for the frequency distribution
    lemmed.append(root)
# calculate the frequency distribution. This is where the quality of the summarisation
# could be improved by swapping in a different scoring algorithm
fdist1 = nltk.FreqDist(lemmed)
# use the whole distribution as the scoring set: lemma -> frequency
scoring_set = dict(fdist1)
# split into sentences
tokenizer_words = TweetTokenizer()
tokens_sentences = [tokenizer_words.tokenize(t) for t in sent_tokenize(contents)]
# score sentences based on the frequency distribution of the words
for sentence in tokens_sentences:
    # lemmatize every word in the sentence so it can be matched against the scoring set
    sentence_roots = [lemma.lemmatize(sentence_word, lib.get_wordnet_pos(nltk.pos_tag(nltk.word_tokenize(sentence_word))[0][1]))
                      for sentence_word in sentence]
    # map each original word to its lemma: {orig_word: lemmed_word}
    sentence_dict = dict(zip(sentence, sentence_roots))
    # where the lemma appears in the scoring set, replace it with the word's frequency score
    for term, term_lemma in sentence_dict.items():
        if term_lemma in scoring_set:
            sentence_dict[term] = scoring_set[term_lemma]
    # sum the scores, skipping numbers and any words that were never matched
    total_score = 0
    for word, score in sentence_dict.items():
        if not word.isnumeric() and isinstance(score, int):
            total_score += score
    # add the score to the start of the sentence token list
    sentence.insert(0, total_score)
# find highest ranked sentences and return in the correct order
final_list = {}
# keep each sentence paired with its original position so the order can be restored later
remaining = list(enumerate(tokens_sentences))
for i in range(min(6, len(remaining))):
    # position (within `remaining`) of the sentence with the highest score; the score sits at index 0
    best_pos = max(range(len(remaining)), key=lambda j: remaining[j][1][0])
    # storing the original sentence index as the key preserves the position
    original_index, best_sentence = remaining.pop(best_pos)
    final_list[original_index] = best_sentence
print(sorted(final_list))
# detokenize and print the selected sentences in their original order
for index in sorted(final_list):
    print(index)
    statement = final_list[index]
    # remove the score that was inserted at the start of the sentence
    statement.pop(0)
    condensed = untokenize(statement)
    print(condensed)
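
# A minimal sketch of the bigram analysis mentioned in the header comment - an assumption
# about how it might work, not part of the original script: build a frequency distribution
# over lemmatised bigrams, which could later be folded into the sentence scoring in the same
# way as the unigram counts above.
bigram_fdist = nltk.FreqDist(nltk.bigrams(lemmed))
print(bigram_fdist.most_common(10))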
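
# A minimal sketch of writing the best candidates to a database, also flagged in the header
# comment as a later step - the sqlite3 file, table and column names here are made up for
# illustration only.
import sqlite3
conn = sqlite3.connect('summaries.db')
conn.execute('CREATE TABLE IF NOT EXISTS summaries (source TEXT, sentence TEXT)')
for index in sorted(final_list):
    # the score has already been removed from each selected sentence above
    conn.execute('INSERT INTO summaries VALUES (?, ?)', (path, untokenize(final_list[index])))
conn.commit()
conn.close()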