def get_list_sim(self, word):
    # Look up all accented candidates for a word, falling back through
    # progressively more aggressive normalizations.
    if word not in self.simwords:
        word = Sentence(word).remove_continue()
    if word not in self.simwords:
        # Strip digits and tone/modifier keystrokes (w, j, f, z; likely
        # Telex-style input), then drop a trailing tone key (r, s, x)
        # that follows a letter.
        word = re.sub(r'(\d+)|([wjfz]+)', '', word)
        word = re.sub(r'([a-z])(r|s|x)', r'\1', word)
    if word not in self.simwords:
        # Unknown word: fall back to the rare-word sentinel's index.
        return {self.index[Sentence().R_S]}
    return self.simwords[word]
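# Example of the fallback chain above (hypothetical vocabulary): for the input
# 'hocj', the keystroke regex strips the 'j' to yield 'hoc', whose entry in
# self.simwords might be the index set for {'hoc', 'học', 'hóc'}. If no
# normalization produces a known key, the rare-word sentinel index is returned.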
def fast_predict(self, testdata, list_of_indices=None, output_size=5):
    # Predict accented candidates for each raw input sentence, timing each one.
    if list_of_indices is None:
        list_of_indices = [10]  # avoid a mutable default argument
    print('Predicting...')
    y = []
    for sentence in testdata:
        start = time.time()
        sentence = ' '.join(word_tokenize(sentence))
        old_words = sentence.split()
        sentence = Sentence(sentence).remove_continue()
        new_words = sentence.split()
        res = self.get_result_continue(old_words, new_words,
                                       list_of_indices, output_size)
        end = time.time()
        print('Done %f (ms)' % ((end - start) * 1000))
        self.maxtime = max(self.maxtime, end - start)
        y.append(res)
    print('Done!')
    return y
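# Usage sketch (hypothetical data; `model` is a fitted instance of this class,
# and the constructor name below is assumed):
#
#     model = NgramModel()
#     model.separate_sentences(corpus_paragraphs)
#     model.fit()
#     preds = model.fast_predict(['toi di hoc o ha noi'],
#                                list_of_indices=[10, 5], output_size=3)
#
# Each element of `preds` then holds up to `output_size` accented candidates
# for the corresponding input sentence.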
def get_words_continue_prev(self, words, lim_per_index):
    # Recursively build candidate accented suffixes for `words`: score each
    # candidate for the current word against every configured n-gram order,
    # prune scores below self.eta, and keep at most lim_per_index[0] beams.
    if len(words) == 0:
        return tuple()
    if len(words) == 1:
        return tuple(tuple([w]) for w in self.get_list_sim(words[0]))
    cur_word = words[0]
    # If the word already carries accents, it is not necessary to guess.
    if cur_word != Sentence().remove_accents(cur_word):
        if cur_word in self.index:
            cur_simwords = {self.index[cur_word]}
        else:
            cur_simwords = {self.index[Sentence().R_S]}
    else:
        cur_simwords = self.get_list_sim(cur_word)
    list_next_words = self.get_words_continue_prev(words[1:], lim_per_index[1:])
    list_choices = {}
    for next_words in list_next_words:
        size_next = len(next_words)
        prop_prev = {}
        for n in self.list_ngrams:
            if n - 1 > size_next:
                continue
            check_words = next_words[:n - 1]
            for word in cur_simwords:
                tp = (word,) + check_words
                # Accumulate the smoothed n-gram score for this candidate.
                prop = self.prop[n][tp] / self.cnt[n] if tp in self.prop[n] else 0
                prop_prev[word] = prop_prev.get(word, 0) + prop
        cur_choices = [(k, prop_prev[k])
                       for k in sorted(prop_prev, key=prop_prev.get, reverse=True)]
        for choice in cur_choices[:lim_per_index[0]]:
            # Candidates are sorted, so stop once the score drops below eta.
            if choice[1] < self.eta:
                break
            prev_sentence = choice[:1] + tuple(next_words)
            list_choices[prev_sentence] = self.max_prop_indices(prev_sentence)
    res = tuple(sorted(list_choices, key=list_choices.get, reverse=True))
    return res[:lim_per_index[0]]
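# Sketch of the recursion above on a two-word input (hypothetical scores):
# for words = ['hoc', 'sinh'], the base case returns one 1-tuple per candidate
# index of 'sinh'; the outer call then pairs every candidate of 'hoc' with each
# suffix, sums the per-order n-gram scores over self.list_ngrams, drops pairs
# scoring below self.eta, and returns the lim_per_index[0] best sequences.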
def convert_to_indices(self, words):
    # Map each word to its vocabulary index via the trie; unknown words
    # fall back to the rare-word sentinel.
    lst = []
    for word in words:
        if isinstance(word, list):
            word = word[0]
        info = self.trie.searchWord(word)
        if info is False:
            info = self.trie.searchWord(Sentence().R_S)
        lst.append(info[0])
    return tuple(lst)
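# For example (hypothetical trie contents): convert_to_indices(['học', 'sinh'])
# would return a tuple such as (12, 47), taking the first field of each trie
# payload; a word missing from the trie contributes the sentinel's index instead.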
def separate_sentences(self, list_paragraphs):
    # Split raw paragraphs into normalized sentences.
    print('Separating sentences...')
    self.list_sentences = []
    for paragraph in list_paragraphs:
        for sentence in sent_tokenize(paragraph):
            self.list_sentences.append(Sentence(sentence).remove_continue())
    print(self.list_sentences[:10])
    print('Done!')
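# Usage sketch (hypothetical corpus):
#
#     model.separate_sentences(['Tôi đi học. Trời mưa to.'])
#
# leaves self.list_sentences holding two normalized sentences, ready for
# fit() to count n-grams over.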
def fit(self):
    print('Fitting model...')
    start_time = time.time()
    self.prop = {}      # n-gram counts: prop[n][tuple_of_indices] -> frequency
    self.cnt = {}       # total number of n-grams observed, per order n
    self.simwords = {}  # non-accented form -> set of accented word indices
    self.word = {}      # index -> word
    self.index = {}     # word -> index
    for i in self.list_ngrams:
        self.prop[i] = {}
        self.cnt[i] = 0
    print('Extract n-gram ', self.list_ngrams)
    cnt, full_size = 1, len(self.list_sentences)
    for sentence in self.list_sentences:
        print('Processing at %d/%d (%.2f)' % (cnt, full_size, (cnt / full_size) * 100))
        list_indices = []
        words = word_tokenize(sentence)
        for word in words:
            if word not in self.index:
                self.index[word] = len(self.word) + 1
            index = self.index[word]
            self.word[index] = word
            # Count every n-gram that ends at the current word.
            list_indices.append(index)
            for ngram in self.list_ngrams:
                if ngram > len(list_indices):
                    continue
                word_encodes = tuple(list_indices[-ngram:])
                if word_encodes not in self.prop[ngram]:
                    self.prop[ngram][word_encodes] = 1
                else:
                    self.prop[ngram][word_encodes] += 1
        # Increase the per-order totals for this sentence.
        for ngram in self.list_ngrams:
            self.cnt[ngram] += max(0, len(words) - ngram + 1)
        cnt += 1
    ################# add __object__ #####################
    R_S = Sentence().R_S
    if R_S not in self.index:
        index = len(self.index) + 1
        self.index[R_S] = index
        self.word[index] = R_S
        self.prop[1][(index,)] = 0  # unigram keys are 1-tuples, not bare ints
    ######################################################
    print('Done!')
    print('Extract similar non-accent words')
    cnt, full_size = 1, len(self.index)
    # Group accented vocabulary entries by their non-accented form.
    for word in self.index:
        print('Processing at %d/%d (%.2f)' % (cnt, full_size, (cnt / full_size) * 100))
        index = self.index[word]
        non_accent = Sentence().remove_accents(word)
        if non_accent not in self.simwords:
            self.simwords[non_accent] = {index}
        else:
            self.simwords[non_accent].add(index)
        cnt += 1
    print('Done!')
    end_time = time.time()
    print('Finished! Elapsed time: %f (ms)' % ((end_time - start_time) * 1000))
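# After fit(), the model state looks like this (hypothetical values):
#
#     self.index    == {'học': 3, ...}
#     self.word     == {3: 'học', ...}
#     self.prop[2]  == {(3, 7): 5, ...}   # bigram (index, index) -> count
#     self.cnt[2]   == 1042               # total bigrams observed
#     self.simwords == {'hoc': {3, 9}, ...}
#
# so a bigram probability is estimated as self.prop[2][key] / self.cnt[2].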