-
Notifications
You must be signed in to change notification settings - Fork 0
/
replacing.py
150 lines (131 loc) · 7.6 KB
/
replacing.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
import random
from typing import List, Tuple, Dict
from os.path import isfile
import grammar_category
import morphosyntactic
import tokenization
# Module-wide debug switch. When true, Replacing prints the joined input text on
# construction and per-noun diagnostics (word, selected meaning and declension)
# while replacing. The script entry point of this module turns it on.
DEBUG = False
class Replacing:
    """Manages replacing nouns in copypasta with given words.

    The instance walks over the tokens of a copypasta, looks every alphanumeric
    token up in the morphosyntactic dictionary and, for tokens that are certainly
    nouns, substitutes the matching inflected form of a replacement word.
    """

    def __init__(self,
                 copypasta: List[str],
                 replacement_words: "List[Tuple[Dict, grammar_category.Gender, float]]",
                 morphosyntactic_dictionary: "morphosyntactic.Morphosyntactic"):
        """Stores the inputs and loads the list of ignored words.

        Args:
            copypasta: Tokens of the text. The list is stored as-is (not copied),
                so replace() modifies it in place.
            replacement_words: Tuples of ``(forms, gender, probability)`` where
                ``forms[number][case]`` is the inflected form of the replacement.
            morphosyntactic_dictionary: Object whose ``morphosyntactic_dictionary``
                attribute maps a lower-cased token to its raw dictionary entry.
        """
        # Annotations naming project modules are quoted so that they are not
        # evaluated when the method is defined.
        self.current_word = None  # type: morphosyntactic.AmbiguousWord
        self.previous_word = None  # TODO: Don't replace if previous word was replaced or undo replacement of previous word
        self.next_word = None  # TODO: update this field
        self.pasta = copypasta
        self.replacement_words = replacement_words
        self.morph = morphosyntactic_dictionary
        self.ignored_words = []
        self.load_ignored_words()
        self.selected_meaning = None  # type: morphosyntactic.Noun
        self.selected_declension = None
        if DEBUG:
            print("".join(self.pasta))

    def load_ignored_words(self, path="ignored_words.txt"):
        """Loads words which should never be replaced.

        Every non-blank line of the file is stripped, lower-cased and appended to
        ``self.ignored_words``. A missing file is not an error: the list is then
        simply left unchanged.

        Args:
            path: Location of the UTF-8 text file with one word per line.
        """
        if not isfile(path):
            return
        with open(path, encoding="utf-8") as file:
            stripped_lines = (line.strip() for line in file)
            # Blank lines carry no word, so they are not stored
            self.ignored_words.extend(word.lower() for word in stripped_lines if word)

    def replace(self) -> List[str]:
        """Replaces every noun in copypasta with matching form of one of replacement words.

        Returns:
            The token list held in ``self.pasta``, modified in place.
        """
        dictionary = self.morph.morphosyntactic_dictionary
        # Only single items are reassigned below, so the list length never changes
        # and it is safe to iterate over the list itself.
        for token_idx, token in enumerate(self.pasta):
            if not token.isalnum():
                continue
            key = token.lower()
            if key not in dictionary:
                continue
            self.current_word = morphosyntactic.AmbiguousWord(token, dictionary[key])
            if self.current_word.certain_noun():
                self.select_meaning()
                self.select_declension()
                self.print_debug_info()
                word_after_replace = self.replace_single_noun()
                self.pasta[token_idx] = self.lower_or_uppercase(word_after_replace, token)
            self.update_iteration_data()  # TODO: maybe it should be updated even if word is not in dictionary
        return self.pasta

    def lower_or_uppercase(self, replaced_word: str, original_word: str) -> str:
        """Changes replaced word to use the same letter-case style as the original word.

        An ALL UPPERCASE original (longer than one character) gives an all
        uppercase result. A Title-case original whose selected base word is not
        itself title-case gets its first letter capitalised. Otherwise the
        replaced word is returned unchanged.
        """
        if original_word.isupper() and len(original_word) > 1:
            return replaced_word.upper()
        if original_word.istitle() and not self.selected_meaning.base_word.istitle():
            # Slicing instead of indexing keeps an empty replacement from raising IndexError
            return replaced_word[:1].upper() + replaced_word[1:]
        return replaced_word

    def update_iteration_data(self):
        """Remembers the current word as previous and clears per-iteration state."""
        self.previous_word = self.current_word
        self.current_word = None
        self.selected_meaning = None
        self.selected_declension = None

    def replace_single_noun(self) -> str:
        """Replaces one word in copypasta with an inflected form of a replacement word.

        Returns:
            The form of the replacement word for the selected number and case, or
            the original word when should_not_replace() decides against replacing.

        Raises:
            NotImplementedError: If more than one replacement matches the gender.
        """
        candidates = self.filter_replacements_by_gender()
        if self.should_not_replace(candidates):
            return self.current_word.word
        if len(candidates) > 1:
            raise NotImplementedError()
        forms = candidates[0][0]  # type: Dict[grammar_category.Number, Dict[grammar_category.Case, str]]
        declension = self.selected_declension
        return forms[declension.number][declension.case]

    def filter_replacements_by_gender(self):
        """Returns list of possible replacements with gender matching current word."""
        wanted_gender = self.selected_meaning.gender
        return [replacement for replacement in self.replacement_words
                if replacement[1] == wanted_gender]

    def should_not_replace(self, replacement_words) -> bool:  # TODO: Detecting acronyms (by large quantity of meanings?)
        """Checks various conditions under which the current word should not be replaced.

        Args:
            replacement_words: Replacement candidates already filtered for the
                current word.

        Returns:
            True when the base word is on the ignore list, when there is no
            candidate, or when the random draw exceeds the summed probability.
        """
        base_word = self.selected_meaning.base_word
        # Loaded ignore entries are lower-cased, so a title-case base word has to
        # be compared lower-cased too; the exact form is still accepted.
        word_in_ignored = (base_word in self.ignored_words
                           or base_word.lower() in self.ignored_words)  # TODO: Detect common bigrams "w ogóle"
        no_word_to_replace = not replacement_words
        # NOTE(review): this sums over all configured replacements, not only over
        # the candidates passed in - confirm that this is intended.
        probability_sum = sum(entry[2] for entry in self.replacement_words)
        # Drawn on every call, also when another condition already decides
        random_not_replacing = random.random() > probability_sum
        return word_in_ignored or no_word_to_replace or random_not_replacing

    def select_meaning(self):  # TODO: use unigrams
        """Selects the meaning to use from the meanings of the current AmbiguousWord.

        Currently the first meaning that is a Noun is taken.
        """
        self.selected_meaning = next(
            meaning for meaning in self.current_word.meanings
            if isinstance(meaning, morphosyntactic.Noun))

    def select_declension(self):  # TODO: create tagged bigrams and use them OR use previous and (maybe) next word in simpler way
        """Selects the declension to use from the declensions of the selected meaning.

        Currently the declension with the lowest (number value, case value) pair
        is taken.
        """
        def by_number_then_case(declension):
            return declension.number.value, declension.case.value

        self.selected_declension = sorted(self.selected_meaning.declensions,
                                          key=by_number_then_case)[0]

    def print_debug_info(self):
        """In debug mode prints additional info about the selected meaning."""
        if not DEBUG:
            return
        print("NOUN ", self.current_word.word)
        print(self.current_word)
        print(self.selected_meaning)
        print(self.selected_declension)
if __name__ == "__main__":
    # Manual demo run: replaces the nouns of a sample copypasta with forms of
    # "mamut" and prints the result, with debug output switched on.
    DEBUG = True

    # Inflection table of the replacement word, keyed by case
    singular_forms = {
        grammar_category.Case.GENITIVE: 'mamuta',
        grammar_category.Case.VOCATIVE: 'mamucie',
        grammar_category.Case.NOMINATIVE: 'mamut',
        grammar_category.Case.ACCUSATIVE: 'mamuta',
        grammar_category.Case.LOCATIVE: 'mamucie',
        grammar_category.Case.DATIVE: 'mamutowi',
        grammar_category.Case.INSTRUMENTAL: 'mamutem',
    }
    plural_forms = {
        grammar_category.Case.GENITIVE: 'mamutów',
        grammar_category.Case.VOCATIVE: 'mamuty',
        grammar_category.Case.NOMINATIVE: 'mamuty',
        grammar_category.Case.ACCUSATIVE: 'mamuty',
        grammar_category.Case.LOCATIVE: 'mamutach',
        grammar_category.Case.DATIVE: 'mamutom',
        grammar_category.Case.INSTRUMENTAL: 'mamutami',
    }
    mamut_forms = {
        grammar_category.Number.SINGULAR: singular_forms,
        grammar_category.Number.PLURAL: plural_forms,
    }
    # One replacement entry: (forms, gender, probability)
    demo_replacements = [
        (mamut_forms, grammar_category.Gender.MASCULINE_INANIMATE, 1.0),
    ]

    sample_text = (
        "Mój stary to fanatyk wędkarstwa. Pół mieszkania zajebane wędkami najgorsze. Średnio raz w miesiącu ktoś "
        "wdepnie w leżący na ziemi haczyk czy kotwicę i trzeba wyciągać w szpitalu bo mają zadziory na końcu. W "
        "swoim 22 letnim życiu już z 10 razy byłem na takim zabiegu. Tydzień temu poszedłem na jakieś losowe "
        "badania to baba z recepcji jak mnie tylko zobaczyła to kazała buta ściągać xD bo myślała, że "
        "znowu hak w nodze."
    )
    sample_tokens = tokenization.tokenize(sample_text)

    dictionary = morphosyntactic.Morphosyntactic("polimorfologik-2.1.txt")
    dictionary.create_morphosyntactic_dictionary()

    demo_replacer = Replacing(sample_tokens, demo_replacements, dictionary)
    # Sanity checks that the ignored-words list was loaded and holds these entries
    assert "raz" in demo_replacer.ignored_words
    assert "możliwość" in demo_replacer.ignored_words

    replaced_tokens = demo_replacer.replace()
    print("".join(replaced_tokens))