forked from yask123/Summarize-it
-
Notifications
You must be signed in to change notification settings - Fork 11
/
lsa.py
172 lines (147 loc) · 6.98 KB
/
lsa.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
# -*- coding: utf-8 -*-
from __future__ import absolute_import
from __future__ import division, print_function, unicode_literals

import logging
import math
import re
from warnings import warn

try:
    import numpy
except ImportError:
    numpy = None
try:
    from numpy.linalg import svd as singular_value_decomposition
except ImportError:
    singular_value_decomposition = None

import spacy.en
from spacy.en import STOPWORDS
from spacy.parts_of_speech import VERB, NOUN, PROPN, PRON, PUNCT

from base_summarizer import BaseSummarizer

logging.basicConfig(level=logging.DEBUG)
logger = logging.getLogger(__name__)
class LsaSummarizer(BaseSummarizer):
    """Summarizer based on Latent Semantic Analysis (LSA).

    Builds a term/sentence occurrence matrix from a spacy-parsed document,
    applies smoothed term-frequency weighting, decomposes the matrix with
    SVD and ranks sentences by their weight in the latent topic space.
    Sentences that look like well-formed questions are preferred in the
    final selection.
    """

    # Keep at least this many singular values when computing ranks.
    MIN_DIMENSIONS = 3
    # Fraction of singular values retained (1/1 == keep all of them).
    REDUCTION_RATIO = 1 / 1
    _stop_words = frozenset()

    def __init__(self):
        BaseSummarizer.__init__(self)
        # Entity recognition and the rule matcher are not needed here;
        # disabling them makes the spacy pipeline cheaper to load.
        self.nlp = spacy.en.English(entity=False, matcher=False)
        self.nlp_doc = None

    @property
    def stop_words(self):
        return self._stop_words

    @stop_words.setter
    def stop_words(self, words):
        self._stop_words = frozenset(map(self.normalize_word, words))

    def __call__(self, document, sentences_count, user_dict):
        """Summarize *document* into at most ``sentences_count`` sentences.

        :param document: raw text to summarize.
        :param sentences_count: desired number of summary sentences.
        :param user_dict: mapping of sentence text -> user name, used to
            boost sentences attributed to known users.
        :returns: tuple () for an empty document, otherwise a list of
            sentence strings.
        :raises ValueError: if NumPy is not installed.
        """
        self._ensure_dependecies_installed()
        self.nlp_doc = self.nlp(document)
        self.user_dict = user_dict
        logger.info("Created doc")
        dictionary = self._create_dictionary()
        # empty document
        if not dictionary:
            return ()
        matrix = self._create_matrix(dictionary)
        matrix = self._compute_term_frequency(matrix)
        u, sigma, v = singular_value_decomposition(matrix, full_matrices=False)
        # _get_best_sentences invokes the rate function once per sentence,
        # in document order, so an iterator hands out the matching rank.
        ranks = iter(self._compute_ranks(sigma, v))
        sents = [s.text for s in self.nlp_doc.sents]
        logger.info("Sentences generated by spacy are %s, count %s", sents, len(sents))
        # Over-select (2x) so the question filter below has candidates left.
        new_sents = self._get_best_sentences(sents, sentences_count * 2,
                                             lambda s: next(ranks))
        filt_sents = [sent for sent in new_sents if self.better_question(sent)]
        additional_sents = set(new_sents) - set(filt_sents)
        to_add = sentences_count - len(filt_sents)
        final_sents = filt_sents
        if to_add > 0:
            # BUG FIX: sort the leftovers by length first, THEN take the top
            # `to_add`; the original sliced an arbitrary subset of the set
            # before sorting it.
            final_sents += sorted(additional_sents, key=len, reverse=True)[:to_add]
        logger.info("Filtered sentences %s", filt_sents)
        logger.info("Final recommendations are %s", final_sents[:sentences_count])
        # BUG FIX: return the capped list that the log line above declares
        # as the final recommendations (the original returned everything).
        return final_sents[:sentences_count]

    def better_question(self, txt):
        """Heuristically decide whether *txt* reads as a well-formed question.

        True only when a sentence longer than five tokens contains a
        modal/question word that is later followed by both the syntactic
        ROOT and a question mark. Implicitly returns None (falsy) when no
        decision is reached — callers use this as a filter predicate.
        """
        if len(txt.split()) > 5:
            parse = self.nlp(txt)
            for sent in parse.sents:
                if len(sent) > 5:
                    p2 = self.nlp(sent.text)
                    for i, wd in enumerate(p2):
                        if wd.lemma_ in (u'can', u'should', u'will', u'could',
                                         u'why', u'what', u'how', u'is'):
                            rest = p2[i + 1:]
                            return (u'ROOT' in [x.dep_ for x in rest]
                                    and u'?' in [x.orth_ for x in rest])

    def _ensure_dependecies_installed(self):
        # NOTE(review): the name keeps the original (misspelled) spelling so
        # any external callers continue to work.
        if numpy is None:
            raise ValueError("LSA summarizer requires NumPy. Please, install it by command 'pip install numpy'.")

    def _create_dictionary(self):
        """Create mapping key = word, value = row index.

        Rows cover the lemmas of non-stopword, non-pronoun nouns/verbs plus
        every user name from ``self.user_dict`` (so user mentions can be
        boosted in ``_create_matrix``).
        """
        unique_words = frozenset(
            w.lemma_ for w in self.nlp_doc
            # BUG FIX: the original tested `w not in STOPWORDS`, comparing a
            # Token object against a set of strings — always true, so
            # stopwords were never excluded. Test the token text instead.
            if w.lower_ not in STOPWORDS and w.tag_ != "PRP"
            and (w.pos == VERB or w.pos == NOUN))
        unique_users = frozenset(self.user_dict.values())
        logger.info("Have %s unique words", len(unique_words))
        logger.info("Have %s unique users", len(unique_users))
        return dict((w, i) for i, w in enumerate(unique_words | unique_users))

    def collect_bow(self, txt):
        """Return the non-empty bag-of-words strings for each sentence of *txt*."""
        # BUG FIX: the original referenced an undefined global `nlp`;
        # use the instance's pipeline.
        sents = self.nlp(txt).sents
        return [bag for bag in (retrieve_main_bow(sent) for sent in sents) if bag]

    def _create_matrix(self, dictionary):
        """
        Creates matrix of shape |unique words|×|sentences| where cells
        contains number of occurences of words (rows) in senteces (cols).
        """
        sentences = list(self.nlp_doc.sents)
        words_count = len(dictionary)
        sentences_count = len(sentences)
        logger.info("Have %s sentences " % sentences_count)
        if words_count < sentences_count:
            # LSA needs more terms than sentences for a meaningful SVD.
            logger.warning(
                "Number of words (%d) is lower than number of sentences (%d). "
                "LSA algorithm may not work properly.",
                words_count, sentences_count)
        # create matrix |unique words|×|sentences| filled with zeroes
        matrix = numpy.zeros((words_count, sentences_count))
        for col, sentence in enumerate(sentences):
            for word in (wd.lemma_ for wd in sentence if wd.lemma_ in dictionary):
                matrix[dictionary[word], col] += 1
            # Boost sentences attributed to a known user by counting the
            # user's name as one extra occurrence in that sentence.
            user = self.user_dict.get(sentence.text)
            if user is not None and len(user) > 1:
                logger.info("Matching sentence %s with user %s", sentence.text, user)
                matrix[dictionary[user], col] += 1
        return matrix

    def _compute_term_frequency(self, matrix, smooth=0.4):
        """
        Computes TF metrics for each sentence (column) in the given matrix.
        You can read more about smoothing parameter at URL below:
        http://nlp.stanford.edu/IR-book/html/htmledition/maximum-tf-normalization-1.html
        """
        assert 0.0 <= smooth < 1.0
        max_word_frequencies = numpy.max(matrix, axis=0)
        # Vectorized form of the original per-cell loop: columns whose max
        # frequency is zero are left untouched, all others are rescaled to
        # smooth + (1 - smooth) * tf.
        nonzero = max_word_frequencies != 0
        matrix[:, nonzero] = (smooth + (1.0 - smooth)
                              * (matrix[:, nonzero] / max_word_frequencies[nonzero]))
        return matrix

    def _compute_ranks(self, sigma, v_matrix):
        """Return one rank per sentence from the SVD factors.

        Each sentence's rank is sqrt(sum(sigma_i^2 * v_i^2)) over the
        retained dimensions (with REDUCTION_RATIO == 1 all of them).
        """
        assert len(sigma) == v_matrix.shape[0], "Matrices should be multiplicable"
        dimensions = max(LsaSummarizer.MIN_DIMENSIONS,
                         int(len(sigma) * LsaSummarizer.REDUCTION_RATIO))
        powered_sigma = tuple(s ** 2 if i < dimensions else 0.0
                              for i, s in enumerate(sigma))
        ranks = []
        # iterate over columns of matrix (rows of transposed matrix)
        for column_vector in v_matrix.T:
            rank = sum(s * v ** 2 for s, v in zip(powered_sigma, column_vector))
            ranks.append(math.sqrt(rank))
        return ranks
def retrieve_main_bow(tokens):
    """Build a short bag-of-words string from a sequence of spacy tokens.

    Collects lowercased non-stopword nouns/verbs plus the heads and
    (filtered) children of adverbial-clause / open-complement tokens, joins
    them into a single '.'-terminated string with newlines/tabs stripped.

    :param tokens: iterable of spacy tokens (e.g. a Span or Doc).
    :returns: the joined string, or None when fewer than three words remain.
    """
    bow = set()
    for tok in tokens:
        if tok.pos == PUNCT:
            continue
        # BUG FIX: the original compared the *integer* attribute `tok.dep`
        # with the string 'xcomp' (always False); the string label lives
        # in `tok.dep_`.
        if tok.dep_ in (u'advcl', u'xcomp'):
            # NOTE(review): the child filter checks `tok.tag_`, not
            # `ti.tag_` — possibly intended to filter pronoun children;
            # behavior kept as-is, confirm with the author.
            bow.add(' '.join(ti.lower_ for ti in tok.children
                             if tok.tag_ != "PRP" and ti.lower_ not in STOPWORDS))
            bow.add(tok.lower_)
        if tok.pos == NOUN or tok.pos == VERB:
            if tok.tag_ != "PRP" and tok.lower_ not in STOPWORDS:
                bow.add(tok.lower_)
    # Strip embedded newlines/tabs (the original class '[\n\t\n]' duplicated
    # \n; also `re` was never imported at module level).
    mt = re.sub(r'[\n\t]', u'', u' '.join(bow) + u'.')
    return mt if len(mt.strip().split()) > 2 else None