# text_model.py
import re
import collections
import copy
import nltk.tokenize
import common
import link_lexemes
import const_phrase


# calculating the critical frequency for an arbitrarily built frequency dictionary
# (the values may be raw frequencies as well as associative powers)
def calculate_critical_frequency(in_freq_dictionary):
    distinct_frequencies = set(in_freq_dictionary.values())
    max_rank = len(distinct_frequencies)
    return 1.0 + 0.5 * max_rank
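# Worked example (illustrative, not from the original code): for
# in_freq_dictionary = {'a': 3, 'b': 1, 'c': 1, 'd': 2} the distinct values are
# {1, 2, 3}, so max_rank = 3 and the critical frequency is 1.0 + 0.5 * 3 = 2.5.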


# returns a list of sentences, each one being a list of lowercased words, free of punctuation marks
def tokenize_sentences(in_raw_text):
    # splitting sentences
    sentences = nltk.tokenize.sent_tokenize(in_raw_text)
    # splitting words, removing punctuation-only tokens
    punct_free_sentences = []
    for sentence in sentences:
        tokens = nltk.tokenize.wordpunct_tokenize(sentence)
        punct_free_sentences.append([word.lower() for word in tokens if re.findall(r'\w+', word)])
    return punct_free_sentences
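# For example (output is approximate and depends on the NLTK tokenizers):
# tokenize_sentences('Mom met Dad. The dog barked!') would yield roughly
# [['mom', 'met', 'dad'], ['the', 'dog', 'barked']].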


def write_text(in_stream, in_sentences):
    for sentence in in_sentences:
        print >>in_stream, ' '.join(sentence)


# every group of lexemes with equal existence areas forms a constant phrase
def extract_constant_phrases(in_existence_areas):
    ids_sorted = sorted(in_existence_areas.keys())
    id_indices_used = set([])
    constant_phrases = []
    for head_index in xrange(len(ids_sorted)):
        if head_index in id_indices_used:
            continue
        head = ids_sorted[head_index]
        constant_phrase_context = []
        for context_index in xrange(head_index + 1, len(ids_sorted)):
            context = ids_sorted[context_index]
            if in_existence_areas[head] == in_existence_areas[context] \
                    and context_index not in id_indices_used:
                constant_phrase_context.append(context)
                id_indices_used.add(context_index)
        if len(constant_phrase_context):
            constant_phrases.append([head] + constant_phrase_context)
        id_indices_used.add(head_index)
    return constant_phrases
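# Worked example (illustrative, not from the original code): given
# {10: [0, 2], 20: [0, 2], 30: [1]}, lexemes 10 and 20 occur in exactly the
# same sentences, so the result is [[10, 20]]; lexeme 30 forms no phrase.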


def refresh_text_index(in_lexemes, in_text_index):
    new_text = {}
    # removing links to non-existent lexemes
    for id in in_text_index:
        new_text[id] = \
            [word_id for word_id in in_text_index[id] if word_id in in_lexemes]
    # removing empty sentences
    text_dict = {id: words for (id, words) in new_text.iteritems() if len(words)}
    return text_dict


# builds a frequency dictionary mapping each phrase of the model to its associative power
def get_freq_dict(in_text_model):
    phrases = {}
    for phrase in in_text_model.existence_areas.keys():
        phrase_string = in_text_model.get_phrase(phrase)
        power = in_text_model.get_phrase_weight(phrase)
        phrases[phrase_string] = power
    return phrases


class AssociativeModel(object):
    def __init__(self, in_sentences, in_language=common.DEFAULT_LANGUAGE):
        self.language = in_language
        self.sentences = copy.deepcopy(in_sentences)
        '''
        builds:
        self.fls - the full link set, a list of ['word', start_position] records
        self.pos_to_lex_mapping for words from the FLS: {start_position -> 'word'}
        self.lex_to_pos_mapping for words from the FLS: {'word' -> start_position}
        self.indexed_text: sentences represented by the positions of the FLS lexemes they contain
        '''
        self.build_lexicon(self.sentences)
        self.initial_preprocessing(self.sentences)
        self.existence_areas = self.calculate_existence_areas(self.indexed_text,
                                                              self.pos_to_lex_mapping)
        self.associative_power_dict = self.calculate_associative_power(self.indexed_text,
                                                                       self.existence_areas)
        self.critical_associative_power = calculate_critical_frequency(self.associative_power_dict)
        # removing attributive lexemes, merging constant phrases
        self.existence_areas = self.perform_existence_areas_analysis(self.existence_areas)

    # extracting the full link set
    def initial_preprocessing(self, in_sentences):
        # just the words, in their original order
        self.fls = link_lexemes.extract_full_link_set(in_sentences,
                                                      self.language,
                                                      keep_positions=True)
        self.pos_to_lex_mapping = {record[1]: record[0] for record in self.fls}
        self.lex_to_pos_mapping = {record[0]: record[1] for record in self.fls}
        # representing sentences as lists of FLS indices, not the words themselves
        self.indexed_text = self.build_text_index(in_sentences, self.lex_to_pos_mapping)

    def build_lexicon(self, in_sentences):
        unique_words = set([])
        for sentence in in_sentences:
            unique_words |= set(sentence)
        self.lexicon = sorted(unique_words)

    def build_text_index(self, in_sentences, in_mapping):
        indexed_sentences = []
        for sentence in in_sentences:
            new_sentence = [in_mapping[word] for word in sentence if word in in_mapping]
            if len(new_sentence):
                indexed_sentences.append(new_sentence)
        return indexed_sentences

    # NOTE: this method looks like an unfinished stub - it detects the constant
    # phrases but does not use them; merge_const_phrases below performs the actual merge
    def merge_constant_phrases(self, in_existence_areas):
        const_phrases = const_phrase.extract_constant_phrases(in_existence_areas)

    # a lexeme is attributive if its existence area is a proper subset of some other lexeme's area
    def is_attributive_lexeme(self, in_id):
        target_area = set(self.existence_areas[in_id])
        for lexeme_id in self.existence_areas.keys():
            if not lexeme_id == in_id:
                other_area = set(self.existence_areas[lexeme_id])
                if not len(target_area.difference(other_area)) \
                        and len(other_area.difference(target_area)):
                    return True
        return False
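    # Worked example (illustrative, not from the original code): if lexeme A occurs
    # only in sentences {1, 3} and lexeme B occurs in sentences {1, 3, 5}, then A's
    # area is a proper subset of B's, so A is attributive.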

    # filtering out attributive lexemes, then merging constant phrases
    def perform_existence_areas_analysis(self, in_existence_areas):
        nonattributive = self.extract_nonattributive_lexemes(in_existence_areas)
        merged = self.merge_const_phrases(nonattributive)
        return merged

    def merge_const_phrases(self, in_existence_areas):
        # detecting constant phrases
        phrases_merged = const_phrase.extract_constant_phrases(in_existence_areas)
        lexemes_to_remove = set([])
        replace_map = {}
        for phrase in phrases_merged:
            lexemes_to_remove |= set(phrase[1:])
            replace_map[phrase[0]] = phrase
        merged_existence_areas = {}
        for (lex_id, area) in in_existence_areas.iteritems():
            if lex_id in replace_map:
                merged_existence_areas[tuple(replace_map[lex_id])] = area
            elif lex_id not in lexemes_to_remove:
                merged_existence_areas[tuple([lex_id])] = area
        return merged_existence_areas
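    # Worked example (illustrative, not from the original code): with input areas
    # {10: [0, 2], 20: [0, 2], 30: [1]}, lexemes 10 and 20 form a constant phrase,
    # so the result keys become tuples: {(10, 20): [0, 2], (30,): [1]}.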

    def extract_nonattributive_lexemes(self, in_existence_areas):
        return {lexeme_id: area for (lexeme_id, area) in in_existence_areas.iteritems()
                if not self.is_attributive_lexeme(lexeme_id)}

    def calculate_existence_areas(self, in_indexed_text, in_pos_to_lex_mapping):
        '''
        returns a mapping: lexeme ID -> sorted list of the ids of the sentences it occurs in
        '''
        existence_areas = collections.defaultdict(lambda: set([]))
        for index in xrange(len(in_indexed_text)):
            for word_id in in_indexed_text[index]:
                if word_id in in_pos_to_lex_mapping:
                    existence_areas[word_id].add(index)
        # sorting the sentence id sets into lists
        result_areas = {word_id: sorted(sent_ids)
                        for (word_id, sent_ids) in sorted(existence_areas.iteritems())}
        return result_areas
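    # Worked example (illustrative, not from the original code): if lexeme 7
    # appears in sentences 0 and 4 and lexeme 12 appears only in sentence 2,
    # the result is {7: [0, 4], 12: [2]}.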

    def calculate_associative_power(self, in_indexed_text, in_existence_areas):
        '''
        returns a mapping: lexeme_id -> the number of sentences in its existence
        area other than the first ("generative") one
        '''
        result = {}
        for lexeme_id in in_existence_areas:
            # the generative sentence is not counted towards the associative power
            existence_area = in_existence_areas[lexeme_id]
            result[lexeme_id] = len(existence_area[1:])
        return result

    def text_as_string(self):
        # NOTE: this method appears to be stale - it expects self.sentences to be a
        # dict and relies on self.active_lexemes, which is never set in this class
        result_text = []
        for sentence_id in sorted(self.sentences.keys()):
            for word_id in self.sentences[sentence_id]:
                result_text.append(self.active_lexemes[word_id])
            result_text.append('.')
        return '\t'.join(result_text)

    def get_phrase(self, in_phrase):
        return ' '.join([self.pos_to_lex_mapping[lex_id] for lex_id in in_phrase])

    def get_phrase_weight(self, in_phrase):
        return sum([self.associative_power_dict[lex_id] for lex_id in in_phrase])
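

# A minimal usage sketch (not part of the original file; it assumes the project's
# common, link_lexemes and const_phrase modules are importable): build the model
# from raw text and dump the phrase weights. The name build_phrase_weights is
# illustrative only.
def build_phrase_weights(in_raw_text):
    sentences = tokenize_sentences(in_raw_text)
    model = AssociativeModel(sentences)
    # {'phrase string': associative power of the phrase}
    return get_freq_dict(model)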


def test():
    lexemes = [['mom', 'dad', 'dog', 'cat']]
    filtering_set = ['dad', 'cat']
    # NOTE: 'stop' is not defined in this module; it presumably refers to a
    # stop-word filtering helper from elsewhere in the project
    print stop(lexemes, in_filtering_set=filtering_set)


if __name__ == '__main__':
    test()