def Parser(self, txt):
    """Read a text file and add the syllables of each word to ``verbs_collection``.

    The file content is lower-cased, the characters ``, ' [ ] . ( ) ; : -``
    are deleted, and what remains is split on whitespace.  Every resulting
    word is passed on its own to ``rusyllab.split_words`` and the returned
    pieces are appended to the externally defined ``verbs_collection``.

    Args:
        txt: Path of the text file to read.  It is opened in text mode with
            the platform's default encoding.
    """
    # A single translation table deletes all unwanted characters in one pass
    # instead of ten chained ``replace`` calls.
    strip_punctuation = str.maketrans("", "", ",'[].();:-")

    # The context manager guarantees the handle is released even when
    # reading fails part-way through.
    with open(txt) as f:
        words = f.read().lower().translate(strip_punctuation).split()

    for word in words:
        # Each token is already whitespace-free, so it is handed over as a
        # one-element word list.
        verbs_collection.extend(rusyllab.split_words([word]))
def count_of_syllables():
    """Count the syllables found in ``comments/clean_comments.txt``.

    Every line of the file is stripped, lower-cased and split on whitespace;
    the resulting word list is passed to ``rusyllab.split_words`` and each
    item it returns is tallied.

    Returns:
        dict: Maps each item returned by ``rusyllab.split_words`` to the
        number of times it occurred across the whole file.
    """
    counts = Counter()
    # The context manager closes the file; previously the handle was never
    # closed explicitly.
    with open("comments/clean_comments.txt", "r") as file:
        for line in file:
            counts.update(rusyllab.split_words(line.strip().lower().split()))
    return dict(counts)
def get_rhyme_ending(word):
    """Return the lower-cased tail of ``word`` from its marked syllable on.

    ``accent.put_stress`` is called on the word and the index of the first
    apostrophe in its result is looked up.  The character of ``word`` just
    before that index is upper-cased, the word is split with
    ``rusyllab.split_words``, and the pieces from the first one that is not
    entirely lower-case through the end are joined and lower-cased.

    Args:
        word: The word to analyse.

    Returns:
        ``word`` unchanged when no apostrophe is found; otherwise the joined,
        lower-cased tail, or ``None`` when every piece is lower-case.
    """
    mark_index = accent.put_stress(word).find("'")
    if mark_index < 0:
        return word

    # Upper-case the character preceding the mark so the piece containing it
    # can be recognised after splitting.
    letters = list(word)
    target = mark_index - 1
    letters[target] = letters[target].upper()

    pieces = rusyllab.split_words(["".join(letters)])
    for start, piece in enumerate(pieces):
        if not piece.islower():
            return "".join(pieces[start:]).lower()
    return None
def Parser_input(self):
    """Add the syllables of the words typed into ``self.plainTextEdit``.

    The widget's plain text is lower-cased, the characters
    ``, ' [ ] . ( ) ; : -`` are deleted, and the rest is split on
    whitespace.  Each word is passed separately to ``rusyllab.split_words``
    and the returned pieces are appended to the externally defined
    ``verbs_collection``.
    """
    raw_text = self.plainTextEdit.toPlainText()
    # Delete the punctuation characters in a single pass.
    cleaned = raw_text.lower().translate(str.maketrans("", "", ",'[].();:-"))
    for token in cleaned.split():
        pieces = rusyllab.split_words(token.split())
        verbs_collection.extend(pieces)
def check(word):
    """Validate ``word`` and dispatch it to the matching handler.

    The checks run in this order:

    1. more than one whitespace-separated token -> fixed reply;
    2. a single character contained in ``cons`` -> fixed reply;
    3. the literal ``"/start"`` -> fixed reply;
    4. 20 characters or more -> fixed reply;
    5. otherwise the word is split with ``rusyllab.split_words`` and passed
       to ``check_cons`` when the first character of the first syllable is
       in ``cons``, or to ``check_vow`` when it is not.

    Args:
        word: The text to process.

    Returns:
        One of the fixed reply strings, or whatever the selected handler
        returns.
    """
    if len(word.split()) > 1:
        return "Слишком много слов"
    if len(word) == 1 and word in cons:
        return 'чё'
    if word == "/start":
        return "Здаров, епт"
    if len(word) >= 20:
        return "Браток, помедленней"

    syllables = rusyllab.split_words([word])
    handler = check_cons if syllables[0][0] in cons else check_vow
    return handler(syllables, word)
def check_cons(syllables, word):
    """Blank leading consonants of the first syllable, then call ``check_vow``.

    The letters of ``syllables[0]`` are scanned in order.  For every letter
    contained in ``cons`` one slot of a working copy is blanked (slots are
    blanked from the front, one per consonant seen).  Scanning stops at the
    first letter contained in ``conc`` or ``vow``.

    If a third consonant is counted while ``syllables`` has fewer than two
    items, the function returns ``"хуе"`` followed by the joined syllables
    right away.  Otherwise the working copy is joined with the remaining
    syllables, re-split with ``rusyllab.split_words`` and handed to
    ``check_vow`` together with ``word``.

    Args:
        syllables: The syllables of the word; the first one is inspected.
        word: Passed through unchanged to ``check_vow``.

    Returns:
        The early-exit string described above, or the result of ``check_vow``.
    """
    first_syllable = syllables[0]
    letters = list(first_syllable)
    blanked = 0

    for letter in first_syllable:
        if letter in cons:
            letters[blanked] = ""
            blanked += 1
            if blanked >= 3 and len(syllables) < 2:
                return "хуе" + "".join(syllables)
        if letter in conc or letter in vow:
            break

    remainder = "".join(letters) + "".join(syllables[1:])
    return check_vow(rusyllab.split_words([remainder]), word)
def answer2pieces(answer_str, max_answer_len):
    """Convert an answer string into a padded sequence of pieces.

    The splitting mode is chosen by the global ``answer_representation``:

    * ``'chars'`` - the string wrapped in ``BEG_CHAR``/``END_CHAR`` is
      delegated to ``rpad_chars`` and its result is returned as is;
    * ``'syllables'`` - the whitespace-separated words are split with
      ``rusyllab.split_words``;
    * ``'sentencepiece'`` - the string is encoded with
      ``spm_encoder.EncodeAsPieces``.

    In the last two modes the pieces are wrapped in ``BEG_CHAR`` and
    ``END_CHAR`` and right-padded with ``PAD_CHAR`` up to
    ``max_answer_len``; longer sequences are returned untruncated.

    Args:
        answer_str: The answer text.
        max_answer_len: Minimum length of the returned sequence.

    Raises:
        NotImplementedError: For any other ``answer_representation`` value.
    """
    if answer_representation == 'chars':
        # Character-level split.
        return rpad_chars(BEG_CHAR + answer_str + END_CHAR, max_answer_len)

    if answer_representation == 'syllables':
        # Syllable-level split.
        seq = [BEG_CHAR] + rusyllab.split_words(answer_str.split()) + [END_CHAR]
    elif answer_representation == 'sentencepiece':
        seq = [BEG_CHAR] + spm_encoder.EncodeAsPieces(answer_str) + [END_CHAR]
    else:
        raise NotImplementedError()

    # Both list-based modes share the same right-padding step.
    missing = max_answer_len - len(seq)
    if missing > 0:
        seq = seq + [PAD_CHAR] * missing
    return seq
def tokenize(self, text, use_preproc=False, use_stem=False, use_lemm=False,
             check_length=True, check_stopwords=True):
    """Split ``text`` into syllables, optionally preprocessing it first.

    Args:
        text: The input string.
        use_preproc: When true, ``text`` is first run through
            ``self.preprocessor.preproc`` and the first element of its
            result is used instead of the raw text.
        use_stem: Forwarded to ``preproc``.
        use_lemm: Forwarded to ``preproc``.
        check_length: Forwarded to ``preproc``.
        check_stopwords: Forwarded to ``preproc``.

    Returns:
        list: The items returned by ``rusyllab.split_words`` for the
        whitespace-separated words, with every ``' '`` item removed.
    """
    if use_preproc:
        text, _ = self.preprocessor.preproc(
            text,
            use_lemm=use_lemm,
            use_stem=use_stem,
            check_stopwords=check_stopwords,
            check_length=check_length,
        )
    pieces = rusyllab.split_words(text.split())
    # Drop the single-space items from the result.
    return [piece for piece in pieces if piece != ' ']
def split_word(text):
    """Return the result of ``rusyllab.split_words`` for the words in ``text``.

    The text is stripped, lower-cased and split on whitespace before it is
    handed over.

    Args:
        text: The input string.
    """
    words = text.strip().lower().split()
    return rusyllab.split_words(words)