def parse(self, text):
    # Lazy load model file to speed up startup
    if not self.model:
        self.model = self.load_model(self.language)
    text = text.strip()
    # Adding a period improves detection on especially short sentences
    period_added = False
    last_character = text.strip()[-1]
    if re.match(r"\w", last_character, flags=re.UNICODE):
        text += "."
        period_added = True
    pipeline = Pipeline(self.model, "tokenize", Pipeline.DEFAULT, Pipeline.DEFAULT, "conllu")
    error = ProcessingError()
    processed = pipeline.process(text, error)
    if error.occurred():
        raise ParserException(error.message)
    # Remove the period to make sure input corresponds to output
    if period_added:
        processed = "\n".join(processed.rstrip().split("\n")[:-1]) + "\n\n"
    return processed
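# --- Usage sketch (not from the original sources): a minimal end-to-end run of
# the ufal.udpipe API that the parse() method above wraps. The model filename
# is hypothetical; any UDPipe 1.x model file works.
from ufal.udpipe import Model, Pipeline, ProcessingError

model = Model.load("english-ud-2.0-170801.udpipe")  # hypothetical path
if not model:
    raise RuntimeError("Cannot load model")
pipeline = Pipeline(model, "tokenize", Pipeline.DEFAULT, Pipeline.DEFAULT, "conllu")
error = ProcessingError()
conllu_output = pipeline.process("A short example sentence.", error)
if error.occurred():
    raise RuntimeError(error.message)
print(conllu_output)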
def process(self, text: 'str') -> 'Scene':
    """
    Processes the description and builds a scene based on it.

    Parameters
    ----------
    text : str
        The description of the scene.

    Returns
    -------
    Scene
        The scene described by the text.
    """
    text_preprocessed = self._preprocess(text)
    pipeline = Pipeline(self.model, "tokenize", Pipeline.DEFAULT, Pipeline.DEFAULT, "conllu")
    error = ProcessingError()
    processed = pipeline.process(text_preprocessed, error)
    parsed = conllu.parse(processed)
    scene = self._traverse_tree(parsed)
    return scene
def parse(self, text):
    # Lazy load model file to speed up startup
    if not self.model:
        self.model = self.load_model()
    text = text.strip()
    # Adding a period improves detection on especially short sentences
    period_added = False
    last_character = text.strip()[-1]
    if re.match(r"\w", last_character, flags=re.UNICODE):
        text += "."
        period_added = True
    pipeline = Pipeline(
        self.model, "tokenize", Pipeline.DEFAULT, Pipeline.DEFAULT, "conllu"
    )
    error = ProcessingError()
    processed = pipeline.process(text, error)
    if error.occurred():
        raise ParserException(error.message)
    # Remove the period to make sure input corresponds to output
    if period_added:
        processed = "\n".join(processed.rstrip().split("\n")[:-1]) + "\n\n"
    return processed
def udpipeS(pathmodel, sourcepath, pathdestination):
    model = Model.load(pathmodel)
    if not model:
        sys.stderr.write("Cannot load model from file '%s'\n" % pathmodel)
        sys.exit(1)
    sys.stderr.write('done\n')
    pipeline = Pipeline(model, "horizontal", Pipeline.DEFAULT, Pipeline.DEFAULT, "conllu")
    error = ProcessingError()
    i = 1
    for filename in os.listdir(sourcepath):
        f = open(pathdestination + filename[:-3] + "conllu", "a")
        f.truncate(0)
        text = io.open(sourcepath + filename, "r", encoding="utf-8")
        string = "".join(text.readlines())
        # Process data
        processed = pipeline.process(string, error)
        if error.occurred():
            sys.stderr.write("An error occurred when running run_udpipe: ")
            sys.stderr.write(error.message)
            sys.stderr.write("\n")
            sys.exit(1)
        f.write(processed)
        f.close()
        print("File n ", i, " processed of ", len(os.listdir(sourcepath)))
        i += 1
def tag_ud(text='The text must be passed to this function as a string!',
           modelfile='udpipe_syntagrus.model'):
    model = Model.load(modelfile)
    pipeline = Pipeline(model, 'tokenize', Pipeline.DEFAULT, Pipeline.DEFAULT, 'conllu')
    # Process the text; the result comes back in CoNLL-U format
    processed = pipeline.process(text)
    # Skip comment lines carrying metadata
    output = [l for l in processed.split('\n') if not l.startswith('#')]
    # Extract the lemma and the tag from the processed text
    tagged = [w.split('\t')[2].lower() + '_' + w.split('\t')[3] for w in output if w]
    # tagged_propn = []
    # propn = []
    # for t in tagged:
    #     if t.endswith('PROPN'):
    #         if propn:
    #             propn.append(t)
    #         else:
    #             propn = [t]
    #     else:
    #         if len(propn) > 1:
    #             for x in propn:
    #                 # name = '::'.join([x.split('_')[0] for x in propn]) + '_PROPN'
    #                 tagged_propn.append(x)
    #         elif len(propn) == 1:
    #             tagged_propn.append(propn[0])
    #         tagged_propn.append(t)
    #         propn = []
    return tagged
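# --- Usage sketch (assumed, not from the original sources): with the SynTagRus
# model on disk, tag_ud() yields one "lemma_UPOS" string per token.
lemmas = tag_ud('Мама мыла раму.')
print(lemmas)  # expected shape, not exact values: ['мама_NOUN', 'мыть_VERB', 'рама_NOUN', '._PUNCT']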
def udpipeG(pathmodel):
    path = "/home/guido/Progetto Unitexto/textdata/cleanedTxt/"
    model = Model.load(pathmodel)
    if not model:
        sys.stderr.write("Cannot load model from file '%s'\n" % pathmodel)
        sys.exit(1)
    sys.stderr.write('done\n')
    pipeline = Pipeline(model, "horizontal", Pipeline.DEFAULT, Pipeline.DEFAULT, "conllu")
    error = ProcessingError()
    # corp = io.open("/home/guido/Progetto Unitexto/textdata/corpus.txt", "r", encoding="utf-8")
    # Read whole input
    # string = "".join(corp.readlines())
    # Process data
    # processed = pipeline.process(string, error)
    f = open("/home/guido/Progetto Unitexto/textdata/corpus.conllu", "a")
    f.truncate(0)
    i = 1
    for filename in os.listdir(path):
        text = io.open(path + filename, "r", encoding="utf-8")
        string = "".join(text.readlines())
        # Process data
        processed = pipeline.process(string, error)
        if error.occurred():
            sys.stderr.write("An error occurred when running run_udpipe: ")
            sys.stderr.write(error.message)
            sys.stderr.write("\n")
            sys.exit(1)
        f.write(processed)
        print("File n ", i, " processed of ", len(os.listdir(path)))
        i += 1
def load_file(self, name, filename, lang):
    if filename not in tronco_special_files:
        filename_dir = os.path.join(root_path, "corpora", name, filename)
        if lang not in self.models:
            self.models[lang] = Model.load(
                os.path.join(root_path, "udpipe", udpipe_models[lang]['path']))
        pipeline = Pipeline(self.models[lang], "tokenize", Pipeline.DEFAULT,
                            Pipeline.DEFAULT, "conllu")
        with open(filename_dir) as f:
            try:
                text = f.read().splitlines()
            except:
                return False
        raw_text = []
        metadata = {'filename': filename}
        # Lines of the form "# key = value" become metadata; everything else is raw text.
        for x in text:
            if x.strip().startswith("# ") and " = " in x:
                metadata[x.split(" = ", 1)[0].split("# ", 1)[1]] = x.split(" = ", 1)[1]
            else:
                raw_text.append(x)
        if name not in self.files:
            self.files[name] = {}
        self.files[name][filename] = pipeline.process(
            "\n".join(raw_text)).replace("# newdoc\n", "").replace("# newpar\n", "")
        if name not in self.metadata:
            self.metadata[name] = {}
        self.metadata[name][filename] = metadata
def udpipe(sentences, model_name, verbose=False):
    """
    Parse text to Universal Dependencies using UDPipe.
    :param sentences: iterable of iterables of strings (one string per line)
    :param model_name: filename containing UDPipe model to load
    :param verbose: print extra information
    :return: iterable of lines containing parsed output
    """
    from ufal.udpipe import Model, Pipeline, ProcessingError
    model = Model.load(model_name)
    if not model:
        raise ValueError("Invalid model: '%s'" % model_name)
    pipeline = Pipeline(model, "conllu", Pipeline.DEFAULT, Pipeline.DEFAULT, "conllu")
    lines1, lines2 = tee(l for s in sentences for l in s)
    text = "\n".join(lines1)
    error = ProcessingError()
    num_tokens = sum(1 for l in lines2 if l)
    with ioutil.external_write_mode():
        print("Running %s on %d tokens... " % (model_name, num_tokens), end="", flush=True)
    start = time()
    processed = pipeline.process(text, error)
    duration = time() - start
    with ioutil.external_write_mode():
        print("Done (%.3fs, %.0f tokens/s)" % (duration, num_tokens / duration if duration else 0))
        if verbose:
            print(processed)
    if error.occurred():
        raise RuntimeError(error.message)
    return processed.splitlines()
def run(model_file, text_file):
    print('Loading model...')
    model = Model.load(model_file)
    pipeline = Pipeline(model, 'tokenize', Pipeline.DEFAULT, Pipeline.DEFAULT, 'conllu')
    print('Reading corpus...')
    with open(text_file) as f:
        text = f.read()
    print('Analyzing text...')
    processed = pipeline.process(text)
    print('Extracting phrases...')
    phrases = []
    sent = []
    for line in tqdm((processed + '#').splitlines()):
        if line.startswith('#') and len(sent):
            preps = get_preps(sent)
            for prep, dep_id in preps.values():
                pphrase = get_phrase(prep, dep_id, sent)
                phrases.append(pphrase)
            sent.clear()
        elif len(line) > 1:
            try:
                sent.append(Token(line.split('\t')))
            except ValueError:
                continue
    print('Done!')
    return phrases
def tokenize_and_tag_texts(dict_texts):
    eng_model = Model.load('english-partut-ud-2.5-191206.udpipe')
    fr_model = Model.load('french-partut-ud-2.5-191206.udpipe')
    eng_pipeline = Pipeline(eng_model, 'generic_tokenizer', '', '', '')
    fr_pipeline = Pipeline(fr_model, 'generic_tokenizer', '', '', '')
    for language_key, primal_texts in dict_texts.items():
        tokenized_tagged_eng_text = eng_pipeline.process(primal_texts[1])
        tokenized_tagged_fr_text = fr_pipeline.process(primal_texts[2])
        dict_tokenized_tagged_texts = {
            'eng': tokenized_tagged_eng_text,
            'fr': tokenized_tagged_fr_text
        }
        # print(tokenized_tagged_eng_text)
        # print(tokenized_tagged_fr_text)
        # print(dict_tokenized_tagged_texts)
    return dict_tokenized_tagged_texts
def make_conll_with_udpipe(text):
    # Specify the path to the model here
    model_path = os.path.join(os.getcwd(), 'udparsers',
                              'russian-syntagrus-ud-2.5-191206.udpipe')
    model = Model.load(model_path)
    pipeline = Pipeline(model, 'tokenizer=ranges', Pipeline.DEFAULT, Pipeline.DEFAULT, 'conllu')
    return pipeline.process(text)
def make_conll_with_udpipe(text, language='german'):
    if language == 'german':
        model_path = path.join('..', '..', 'udpipe', 'german-ud-2.0-170801.udpipe')
    else:
        # Fail fast instead of hitting an undefined model_path below
        raise ValueError("Unsupported language: %r" % language)
    model = Model.load(model_path)
    pipeline = Pipeline(model, 'tokenize', Pipeline.DEFAULT, Pipeline.DEFAULT, 'conllu')
    return pipeline.process(text)
def wordToInf(self, text):
    process_pipeline = Pipeline(self.modelForInfinitive, 'tokenize',
                                Pipeline.DEFAULT, Pipeline.DEFAULT, 'conllu')
    # Line 4 of the CoNLL-U output is the first token line (after the
    # newdoc/newpar/sent_id/text comment lines); split it into its fields.
    wordInfo = process_pipeline.process(text).split('\n')[4].split('\t')
    if (wordInfo[3] == 'NUM'):
        return ('_NUM_' + ('x' * len(wordInfo[2])))
    else:
        return wordInfo[2]
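# --- Reference sketch (not from the original sources): the ten tab-separated
# CoNLL-U columns, which is why wordInfo[2] above is the lemma and wordInfo[3]
# the universal POS tag. The helper below is hypothetical.
CONLLU_FIELDS = ['ID', 'FORM', 'LEMMA', 'UPOS', 'XPOS', 'FEATS',
                 'HEAD', 'DEPREL', 'DEPS', 'MISC']

def conllu_field(token_line, name):
    # Look up a named column in a single CoNLL-U token line.
    return token_line.split('\t')[CONLLU_FIELDS.index(name)]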
class SyntaxParser(PreProcesser):
    def __init__(self, model_path):
        self.parser_model = Model.load(model_path)
        self.parser_pipeline = Pipeline(self.parser_model, 'conllu',
                                        Pipeline.NONE, Pipeline.DEFAULT, 'conllu')

    def transform_item(self, x):
        return self.parser_pipeline.process(x, ProcessingError())
def get(modelAdd, text):
    from ufal.udpipe import Model, Pipeline, ProcessingError
    error = ProcessingError()
    model = Model.load(modelAdd)
    pipeline = Pipeline(model, 'tokenize', Pipeline.DEFAULT, Pipeline.DEFAULT, "conllu")
    parsedArticle = pipeline.process(text, error)
    return parsedArticle
def run_udpipe(self, path_to_model, sents=None):
    if sents is None:
        sents = self.sents
    verticals = self._to_vertical(sents)
    model = Model.load(path_to_model)
    pipeline = Pipeline(model, "vertical", Pipeline.DEFAULT, Pipeline.DEFAULT, "conllu")
    error = ProcessingError()
    conllu = pipeline.process(verticals, error)
    return conllu
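# --- Format sketch (assumed, not from the original sources): UDPipe's
# "vertical" input format is one token per line, with a blank line closing
# each sentence, so _to_vertical() presumably emits something shaped like:
verticals_example = "Hello\nworld\n!\n\nAnother\nsentence\n.\n"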
def extract_sentences(input_file: str, output_file: str, logger) -> None:
    logger.info("==== Now performing sentence extraction from paragraphs file ====")
    # UDPipe initialization
    lang_model = 'lang_models/czech-ud-2.0-170801.udpipe'
    model = Model.load(lang_model)
    if not model:
        logger.error('Could not load UDPipe language model: ' + lang_model)
    ud_pipeline = Pipeline(model, 'tokenize', Pipeline.DEFAULT, Pipeline.DEFAULT, '')
    ud_error = ProcessingError()
    sentences_file = open(output_file, "w")
    # Reopen paragraphs for reading
    paragraphs_file = open(input_file, "r")
    sentences_count = 0
    for p_line in paragraphs_file:
        page_first_sentence = ""
        page_first_paragraph = p_line.split('\t', 1)  # use the variable as a temporary list
        # If there is paragraph content
        if len(page_first_paragraph) == 2:
            page_uri = page_first_paragraph[0]
            page_first_paragraph = page_first_paragraph[1]
            # Extract first sentence from paragraph using UDPipe:
            ud_output = ud_pipeline.process(page_first_paragraph, ud_error)
            if ud_error.occurred():
                logger.error('Error occurred while extracting sentence using UDPipe: '
                             + ud_error.message)
                page_first_sentence = ""
            else:
                ud_output = ud_output.split('\n')
                if len(ud_output) >= 4:
                    page_first_sentence = ud_output[3][9:]  # assumption about the output format
                else:
                    page_first_sentence = ""
            # Write sentence to the file
            sentences_file.write(page_uri + '\t' + page_first_sentence + '\n')
            sentences_count += 1
            if sentences_count % 2000 == 0:
                logger.info("Extracted {} sentences.".format(sentences_count))
    logger.info("Finished extraction of {} sentences.".format(sentences_count))
    paragraphs_file.close()
    sentences_file.close()
def process_sentence(self, sen, field_names=None):
    pipeline = Pipeline(self._model, self._inp_format, self._pos_settings,
                        self._parse_settings, 'conllu')
    error = ProcessingError()  # For catching errors...
    inp_sen = ''.join(self._encode_sentence(sen, field_names))
    # Do the processing... + write the output in CoNLL-U
    processed = pipeline.process(inp_sen, error)
    if error.occurred():
        raise UDPipeError(error.message)
    ret_sen = self._decode_sentence(processed, sen, field_names)
    return ret_sen
def parse(sentence):
    input_format = 'tokenize'
    output_format = 'conllu'
    model_path = 'russian-syntagrus-ud-2.4-190531.udpipe'
    model = Model.load(model_path)
    pipeline = Pipeline(model, input_format, Pipeline.DEFAULT, Pipeline.DEFAULT,
                        output_format)
    error = ProcessingError()
    # Small preprocessing step: detach guillemets from the adjacent words
    sentence = re.sub('«', '« ', sentence)
    sentence = re.sub('»', '» ', sentence)
    parsed = pipeline.process(sentence, error)
    print(parsed)
    return parsed
def tag_parse_tree(self, root):
    """Tag (+lemmatize, fill FEATS) and parse a tree (already tokenized)."""
    pipeline = Pipeline(self.tool, 'horizontal', Pipeline.DEFAULT, Pipeline.DEFAULT, 'conllu')
    in_data = " ".join([n.form for n in root.descendants])
    out_data = pipeline.process(in_data, self.error)
    if self.error.occurred():
        raise IOError("UDPipe error " + self.error.message)
    self.conllu_reader.files.filehandle = io.StringIO(out_data)
    parsed_root = self.conllu_reader.read_tree()
    nodes = [root] + root.descendants
    for parsed_node in parsed_root.descendants:
        node = nodes[parsed_node.ord]
        node.parent = nodes[parsed_node.parent.ord]
        for attr in 'upos xpos lemma feats'.split():
            setattr(node, attr, getattr(parsed_node, attr))
class HabrPostagging:
    def __init__(self):
        # https://github.com/jwijffels/udpipe.models.ud.2.0/blob/master/inst/udpipe-ud-2.0-170801/russian-ud-2.0-170801.udpipe
        self.model = Model.load("russian-ud-2.0-170801.udpipe")
        self.pipeline = Pipeline(self.model, 'generic_tokenizer', '', '', '')
        self.reset_counter()

    def reset_counter(self):
        self.__pos_counter = {}

    def get_counter(self):
        return self.__pos_counter

    def __update_counter(self, pos):
        if pos in self.__pos_counter:
            self.__pos_counter[pos] += 1
        else:
            self.__pos_counter[pos] = 1

    def tag_file(self, input_file, output_file):
        text = pickle.load(open(input_file, 'rb'))['text']
        # The Python wrapper for udpipe is actually rather buggy, but it has
        # handled all of our data correctly. As far as I know, there are no
        # other parsers that can work with Russian.
        parsed = self.pipeline.process(text)
        parsed = parse(parsed)
        with open(output_file, 'w', encoding='utf-8') as f:
            for sentence in parsed:
                for word in sentence:
                    self.__update_counter(word['upos'])
                    f.write('\t'.join([word['form'], word['lemma'], word['upos'],
                                       str(word['feats'])]) + '\n')
                f.write('\n')

    def tag_files(self, files, input_dir, output_dir, log=False):
        for filename in files:
            input_file = os.path.join(input_dir, filename)
            if os.path.isfile(input_file):
                if log:
                    print(input_file)
                output_file = os.path.join(output_dir, filename)
                output_file = os.path.splitext(output_file)[0] + '.tsv'
                self.tag_file(input_file, output_file)

    def tag_dir(self, input_dir, output_dir, log=False):
        Path(output_dir).mkdir(parents=True, exist_ok=True)
        self.tag_files(os.listdir(input_dir), input_dir, output_dir, log)
def process_text(path_to_file, input_format, model, out_file):
    """
    Apply NLP processing to text (tokenize, tag, parse) and save output
    in CoNLL-U format.
    """
    pipeline = Pipeline(model, input_format, Pipeline.DEFAULT, Pipeline.DEFAULT, 'conllu')
    error = ProcessingError()
    with codecs.open(path_to_file) as f:
        text = f.read()
    processed = pipeline.process(text, error)
    if error.occurred():
        print("Error when running UDPipe: ")
        print(error.message)
        print("\n")
        sys.exit(1)
    with codecs.open(out_file, 'w', 'utf-8') as of:
        of.write(processed)
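# --- Usage sketch (assumed, not from the original sources): model and file
# names are hypothetical.
from ufal.udpipe import Model

model = Model.load('english-ud-2.0-170801.udpipe')
process_text('input.txt', 'tokenize', model, 'output.conllu')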
class PredPattArgumentExtractor(ArgumentExtractor):
    def __init__(
        self,
        path_to_udpipe: str,
        resolve_relcl: bool = True,
        resolve_appos: bool = True,
        resolve_amod: bool = True,
        resolve_conj: bool = True,
        resolve_poss: bool = True,
        ud=dep_v2.VERSION,
    ):
        super().__init__()
        self.model = Model.load(path_to_udpipe)
        self.pipeline = Pipeline(
            self.model, "tokenize", Pipeline.DEFAULT, Pipeline.DEFAULT, "conllu"
        )
        self._error = ProcessingError()
        self._opts = PredPattOpts(
            resolve_relcl=resolve_relcl,
            resolve_appos=resolve_appos,
            resolve_amod=resolve_amod,
            resolve_conj=resolve_conj,
            resolve_poss=resolve_poss,
            ud=ud,
        )

    @lru_cache(maxsize=100000)
    def extract(self, sentence: str) -> List[Dict[str, Any]]:
        processed = self.pipeline.process(sentence, self._error)
        if self._error.occurred():
            print(f"=== Error occurred: {self._error.message}")
            self._error = ProcessingError()
            return None
        else:
            conll_example = [ud_parse for sent_id, ud_parse in load_conllu(processed)][0]
            ppatt = PredPatt(conll_example, opts=self._opts)
            result = []
            for predicate in ppatt.instances:
                structure = {
                    "predicate": predicate.tokens,
                    "arguments": [x.tokens for x in predicate.arguments],
                }
                result.append(structure)
            return result
def parse_udpipe(passages, model_name, verbose=False):
    from ufal.udpipe import Model, Pipeline, ProcessingError
    model = Model.load(model_name)
    if not model:
        raise ValueError("Invalid model: '%s'" % model_name)
    pipeline = Pipeline(model, "conllu", Pipeline.DEFAULT, Pipeline.DEFAULT, "conllu")
    passages1, passages2 = tee(passages)
    text = "\n".join(l for p in passages1 for l in to_conllu(p, tree=True))
    error = ProcessingError()
    print("Running UDPipe on %d characters... " % len(text), end="", flush=True)
    start = time()
    processed = pipeline.process(text, error)
    print("Done (%.3fs)" % (time() - start))
    if verbose:
        print(processed)
    if error.occurred():
        raise RuntimeError(error.message)
    return zip(passages2, from_conllu(processed.splitlines(), passage_id=None))
def tag_parse_tree(self, root):
    """Tag (+lemmatize, fill FEATS) and parse a tree (already tokenized)."""
    descendants = root.descendants
    if not descendants:
        return
    pipeline = Pipeline(self.tool, 'horizontal', Pipeline.DEFAULT, Pipeline.DEFAULT, 'conllu')
    in_data = " ".join([n.form for n in descendants])
    out_data = pipeline.process(in_data, self.error)
    if self.error.occurred():
        raise IOError("UDPipe error " + self.error.message)
    self.conllu_reader.files.filehandle = io.StringIO(out_data)
    parsed_root = self.conllu_reader.read_tree()
    nodes = [root] + descendants
    for parsed_node in parsed_root.descendants:
        node = nodes[parsed_node.ord]
        node.parent = nodes[parsed_node.parent.ord]
        for attr in 'upos xpos lemma feats'.split():
            setattr(node, attr, getattr(parsed_node, attr))
class UDPipe:
    def __init__(self):
        print('Loading model: ')
        model_path = r"D:\py_projects\IWonnaBook\udpipe_syntagrus.model"
        self.model = Model.load(model_path)
        if not self.model:
            print('Model failed to load :(')
            sys.exit(1)
        self.pipeline = Pipeline(self.model, "tokenize", Pipeline.DEFAULT,
                                 Pipeline.DEFAULT, "conllu")
        print('done\n')

    def get_sintax(self, text):
        error = ProcessingError()
        processed = self.pipeline.process(text, error)
        if error.occurred():
            print("An error occurred when running run_udpipe: ")
            print(error.message)
            print("\n")
        return processed
def process_user_text_task(input_text=''):
    if input_text:
        from conllu import parse
        from ufal.udpipe import Model, Pipeline
        from error_search.process_text import process_text
        import os
        if not os.path.exists('error_search/russian-syntagrus-ud-2.0-170801.udpipe'):
            boilerplate.fget_file('upload/russian-syntagrus-ud-2.0-170801.udpipe',
                                  'error_search/russian-syntagrus-ud-2.0-170801.udpipe')
        ud_model = Model.load('error_search/russian-syntagrus-ud-2.0-170801.udpipe')
        pipeline = Pipeline(ud_model, 'tokenize', Pipeline.DEFAULT, Pipeline.DEFAULT, 'conllu')
        out = pipeline.process(input_text)
        tree = parse(out)
        return process_text(tree)
def udpipe(conllu_in, model_path):
    model = Model.load(model_path)
    if not model:
        sys.stderr.write("Cannot load model from file '%s'\n" % model_path)
        sys.exit(1)
    pipeline = Pipeline(model, "conllu", Pipeline.DEFAULT, Pipeline.DEFAULT, "conllu")
    error = ProcessingError()
    # Process data
    processed = pipeline.process(conllu_in, error)
    if error.occurred():
        sys.stderr.write("An error occurred when running run_udpipe: ")
        sys.stderr.write(error.message)
        sys.stderr.write("\n")
        sys.exit(1)
    return processed
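# --- Input sketch (assumed, not from the original sources): with "conllu" as
# the input format, the pipeline above re-tags and re-parses already tokenized
# data, e.g. a minimal two-token document like this:
conllu_in = (
    "# text = Hello world\n"
    "1\tHello\t_\t_\t_\t_\t_\t_\t_\t_\n"
    "2\tworld\t_\t_\t_\t_\t_\t_\t_\t_\n"
    "\n"
)
# processed = udpipe(conllu_in, 'english-ud-2.0-170801.udpipe')  # hypothetical model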
def main(args):
    model = Model.load(args.model)
    if not model:
        raise ValueError("Invalid model: '%s'" % args.model)
    os.makedirs(args.out_dir, exist_ok=True)
    pipeline = Pipeline(model, "tokenize" if args.txt else "conllu",
                        Pipeline.DEFAULT, Pipeline.DEFAULT, "conllu")
    for pattern in args.filenames:
        for in_file in glob(pattern) or [pattern]:
            basename = os.path.basename(in_file)
            out_file = os.path.join(args.out_dir, os.path.splitext(basename)[0] + ".conllu")
            error = ProcessingError()
            with open(in_file, encoding="utf-8") as f:
                processed = pipeline.process(f.read(), error)
            if error.occurred():
                raise RuntimeError(error.message)
            with open(out_file, "w", encoding="utf-8") as f:
                f.write(processed)
            if not args.quiet:
                print("Wrote '%s'" % out_file)
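# --- Invocation sketch (assumed, not from the original sources): main() only
# reads these attributes, so a bare namespace with hypothetical paths is
# enough to drive it.
import argparse

args = argparse.Namespace(model="english-ud-2.0-170801.udpipe",
                          out_dir="parsed", txt=True,
                          filenames=["texts/*.txt"], quiet=False)
main(args)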
class SyntaxParser:
    def __init__(self, speller: Optional[Speller] = None):
        self.udpipe_model = Model.load(
            sber_encode(sber_path('/var/sberbank/udpipe_syntagrus.model')))
        self.process_pipeline = Pipeline(self.udpipe_model, sber_encode('tokenize'),
                                         Pipeline.DEFAULT, Pipeline.DEFAULT,
                                         sber_encode('conllu'))
        if speller is None:
            speller = Speller()
        self.speller: Speller = speller

    def get_syntax(self, text):
        processed = self.process_pipeline.process(sber_encode(text))
        content = [l for l in sber_decode(processed).split('\n')
                   if not l.startswith('#')]
        tagged = [w.split('\t') for w in content if w]
        return SyntaxTree(tagged, self.speller)
class UDParser(object):
    models = {
        "en": "UniversalPetrarch/preprocessing/udpipe-1.2.0/model/english-ud-2.0-170801.udpipe",
        "es": "UniversalPetrarch/preprocessing/udpipe-1.2.0/model/spanish-ancora-ud-2.0-170801.udpipe",
        "ar": ""
    }
    pipeline = None
    error = ProcessingError()
    model = None

    def __init__(self, lang="en"):
        model_file = "/Users/sxs149331/PycharmProjects/UniversalPetrarch-master/" + self.models[lang]
        print(model_file)
        self.model = Model.load(model_file)
        if not self.model:
            sys.stderr.write("Model Loading Failed")
            sys.exit(1)
        sys.stderr.write('done\n')
        self.pipeline = Pipeline(self.model, "tokenize", Pipeline.DEFAULT,
                                 Pipeline.DEFAULT, "conllu")

    def parse(self, text):
        # print(self.pipeline)
        processed = self.pipeline.process(text.strip(), self.error)
        if self.error.occurred():
            raise ValueError(self.error.message)
        lines = processed.split("\n")
        result = []
        for line in lines:
            if line.startswith("#"):
                continue
            result.append(line)
        return "\n".join(result)
class Solver(object):
    def __init__(self, seed=42):
        self.morph = pymorphy2.MorphAnalyzer()
        self.model = Model.load("data/udpipe_syntagrus.model".encode())
        self.process_pipeline = Pipeline(self.model, 'tokenize'.encode(),
                                         Pipeline.DEFAULT, Pipeline.DEFAULT,
                                         'conllu'.encode())
        self.seed = seed
        self.init_seed()
        self.paronyms = self.get_paronyms()
        self.freq_bigrams = self.open_freq_grams()

    def init_seed(self):
        return random.seed(self.seed)

    def open_freq_grams(self):
        with open('data/bigrams_lemmas.pickle', 'rb') as inputfile:
            counts = pickle.load(inputfile)
        return counts

    def get_paronyms(self):
        paronyms = []
        with open('data/paronyms.csv', 'r', encoding='utf-8') as in_file:
            for line in in_file.readlines():
                pair = line.strip(punctuation).split('\t')
                paronyms.append(pair)
        return paronyms

    def lemmatize(self, token):
        token_all = self.morph.parse(token.lower().rstrip('.,/;!:?'))[0]
        lemma = token_all.normal_form
        return lemma

    def find_closest_paronym(self, par):
        paronyms = set()
        for par1, par2 in self.paronyms:
            paronyms.add(par1)
            paronyms.add(par2)
        try:
            closest = get_close_matches(par, list(paronyms))[0]
        except IndexError:
            closest = None
        return closest

    def check_pair(self, token_norm):
        paronym = None
        for p1, p2 in self.paronyms:
            if token_norm == p1:
                paronym = p2
                break
            if token_norm == p2:
                paronym = p1
                break
        return paronym

    def find_paronyms(self, token):
        token_all = self.morph.parse(token.lower().rstrip('.,/;!:?'))[0]
        token_norm = token_all.normal_form
        paronym = self.check_pair(token_norm)
        if paronym is None:
            paronym_close = self.find_closest_paronym(token_norm)
            paronym = self.check_pair(paronym_close)
        if paronym is not None:
            paronym_parse = self.morph.parse(paronym)[0]
            try:
                str_grammar = str(token_all.tag).split()[1]
            except IndexError:
                str_grammar = str(token_all.tag)
            gr = set(str_grammar.replace("Qual ", "").replace(' ', ',').split(','))
            try:
                final_paronym = paronym_parse.inflect(gr).word
            except AttributeError:
                final_paronym = paronym
        else:
            final_paronym = ''
        return final_paronym

    def syntax_parse(self, some_text, token):
        processed = self.process_pipeline.process(some_text.lower().encode())
        content = [l for l in processed.decode().split('\n') if not l.startswith('#')]
        tagged = [w.split('\t') for w in content if w]
        linked_word = ''
        for analysis in tagged:
            if analysis[1] == token:
                head = analysis[6]
                if head == '0':
                    root_id = analysis[0]
                    for analysis in tagged:
                        if analysis[6] == root_id:
                            linked_word = analysis[1]
                            break
                else:
                    for analysis in tagged:
                        if analysis[0] == head:
                            linked_word = analysis[1]
                            break
        return linked_word

    def check_frequencies(self, sentences):
        examples = []
        for token, second_tok, line in sentences:
            token = token.lower().rstrip('.,;:!?')
            token_lemma = self.lemmatize(token)
            second_lemma = self.lemmatize(second_tok)
            collocation_word = self.syntax_parse(line, token)
            collocation_lemma = self.lemmatize(collocation_word)
            first = (collocation_lemma, token_lemma)
            second = (collocation_lemma, second_lemma)
            freq1 = self.freq_bigrams[first]
            freq2 = self.freq_bigrams[second]
            first = (token_lemma, collocation_lemma)
            second = (second_lemma, collocation_lemma)
            freq3 = self.freq_bigrams[first]
            freq4 = self.freq_bigrams[second]
            freq_first = freq1 + freq3
            freq_second = freq2 + freq4
            if freq_second > freq_first:
                return second_tok
            if freq_first == freq_second:
                examples.append((0, freq_first, freq_second, token, second_tok))
        good_paronym = ''
        if examples:
            good_paronym = examples[0][4]
        return good_paronym

    def predict(self, task):
        return self.predict_from_model(task)

    def fit(self, tasks):
        pass

    def load(self, path="data/models/solver5.pkl"):
        pass

    def save(self, path="data/models/solver5.pkl"):
        pass

    def predict_from_model(self, task):
        description = task["text"].replace('НЕВЕРНО ', "неверно ")
        sents = []
        for line in description.split("\n"):
            for token in line.split():
                if token.isupper() and len(token) > 2:  # get CAPS paronyms
                    second_pair = self.find_paronyms(token)
                    sents.append((token, second_pair, line))
        result = self.check_frequencies(sents)
        return result.strip(punctuation + '\n')