def _get_surface_form_taggedlemmas(self, forms_in):
    """Given a deque of tokens, return the form & list of tagged lemmas (analyses)
    for the proper name at the current position, if applicable. If there is no
    proper name at the beginning of the tokens deque, return (None, None).

    @param forms_in: a deque of form tokens
    @return: (form, tagged lemmas list) or (None, None)
    """
    for test_len in range(min(self._sf_max_len, len(forms_in)), 0, -1):
        # test the string, handle number placeholders
        full_substr = [form for form in islice(forms_in, 0, test_len)]
        test_substr = tuple(['_' if re.match(r'^[0-9]+$', form) else form.lower()
                             for form in full_substr])
        if test_substr in self._sf_dict:
            tls = TaggedLemmas()
            nums = [num for num in full_substr if re.match(r'^[0-9]+$', num)]
            for lemma, tag in self._sf_dict[test_substr]:
                tls.push_back(TaggedLemma())
                for num in nums:  # replace number placeholders by actual values
                    lemma = re.sub(r'_', num, lemma, count=1)
                tls[-1].lemma = lemma
                tls[-1].tag = tag
            for _ in range(len(test_substr)):  # move on in the sentence
                forms_in.popleft()
            return " ".join(full_substr), tls
    return None, None
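# A minimal, self-contained sketch of the longest-match lookup performed by
# _get_surface_form_taggedlemmas() above, using plain tuples instead of
# MorphoDiTa's TaggedLemmas.  The dictionary contents, the length limit, and
# the tag string below are made-up examples, not data from this project.
from collections import deque
from itertools import islice
import re

_toy_sf_dict = {('hotel', '_'): [('hotel _', 'NNIS1-----A----')]}
_toy_max_len = 3

def _toy_surface_form_lookup(forms_in):
    for test_len in range(min(_toy_max_len, len(forms_in)), 0, -1):
        full = list(islice(forms_in, 0, test_len))
        key = tuple('_' if re.match(r'^[0-9]+$', f) else f.lower() for f in full)
        if key in _toy_sf_dict:
            nums = [f for f in full if re.match(r'^[0-9]+$', f)]
            analyses = []
            for lemma, tag in _toy_sf_dict[key]:
                for num in nums:  # substitute placeholders left-to-right
                    lemma = re.sub(r'_', num, lemma, count=1)
                analyses.append((lemma, tag))
            for _ in range(test_len):  # consume the matched tokens
                forms_in.popleft()
            return " ".join(full), analyses
    return None, None

# _toy_surface_form_lookup(deque(['Hotel', '21', 'je', 'drahý']))
# -> ('Hotel 21', [('hotel 21', 'NNIS1-----A----')]), leaving ['je', 'drahý'] in the deque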
def lemmatize(file, output_file):
    morphodita_model = os.path.join(
        dir_cur, 'czech-morfflex-pdt-131112-raw_lemmas.tagger-best_accuracy')
    tagger = Tagger.load(morphodita_model)
    assert tagger
    forms = Forms()
    lemmas = TaggedLemmas()
    tokens = TokenRanges()
    tokenizer = tagger.newTokenizer()
    assert tokenizer
    with open_gz(output_file, 'w') as out, open_gz(file) as f:
        for line in f:
            tokenizer.setText(line)
            while tokenizer.nextSentence(forms, tokens):
                tagger.tag(forms, lemmas)
                # write each token as "lemma___tag", one output line per sentence
                out.write(" ".join(
                    str(l.lemma).strip() + '___' + str(l.tag).strip()
                    for l in lemmas))
                out.write('\n')
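# Usage sketch for lemmatize() above (the file names are hypothetical examples;
# the MorphoDiTa model named in the function is expected next to this script in
# `dir_cur`).  Input may be plain or gzipped text, and the output gets one line
# of space-separated "lemma___tag" tokens per sentence:
#
#     lemmatize('reviews.txt.gz', 'reviews.lemmatized.txt.gz')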
def analyze(self, sent):
    """Perform morphological analysis on the given sentence, preferring analyses
    from the list of surface forms. Return a list of tuples (form, lemma, tag)."""
    self._tokenizer.setText(sent)
    analyzed = []
    while self._tokenizer.nextSentence(self._forms_buf, self._tokens_buf):
        forms_in = deque(self._forms_buf)
        self._forms_buf.resize(0)
        self._analyses_buf.resize(0)  # reset previous analyses
        while forms_in:
            form, analyses = self._get_surface_form_taggedlemmas(forms_in)
            if form:
                # our custom analysis
                self._analyses_buf.push_back(analyses)
            else:
                # Morphodita analysis
                form = forms_in.popleft()
                analyses = TaggedLemmas()
                self._analyzer.analyze(form, 1, analyses)
                # shorten lemmas (must access the vector directly)
                for i in range(len(analyses)):
                    analyses[i].lemma = self._analyzer.rawLemma(analyses[i].lemma)
                self._analyses_buf.push_back(analyses)
            self._forms_buf.push_back(form)
        # tag according to the given analysis
        self._tagger.tagAnalyzed(self._forms_buf, self._analyses_buf, self._indices_buf)
        analyzed.extend([(f, a[idx].lemma, a[idx].tag)
                         for (f, a, idx) in zip(self._forms_buf, self._analyses_buf,
                                                self._indices_buf)])
    return analyzed
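# Usage sketch (hypothetical names): assuming `MorphoAnalyzer` is the class this
# method belongs to, constructed from a MorphoDiTa tagger model path and the
# surface-form dictionary used above, analyze() yields one (form, lemma, tag)
# triple per token:
#
#     analyzer = MorphoAnalyzer('czech-morfflex-pdt.tagger', surface_forms)
#     for form, lemma, tag in analyzer.analyze('Hotel U Krále je levný.'):
#         print(form, lemma, tag)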
def __init__(self):
    self.morphodita_model = os.path.join(
        dir_cur, 'czech-morfflex-131112.tagger-fast')
    self.tagger = Tagger.load(self.morphodita_model)
    self.forms = Forms()
    self.lemmas = TaggedLemmas()
    self.tokens = TokenRanges()
    self.tokenizer = self.tagger.newTokenizer()
def __init__(self, tagger_model):
    if not os.path.isfile(tagger_model):
        raise IOError('File %s does not exist' % tagger_model)
    self._tagger = Tagger.load(tagger_model)
    self._tokenizer = self._tagger.newTokenizer()
    self._forms_buf = Forms()
    self._tokens_buf = TokenRanges()
    self._tags_buf = TaggedLemmas()
def lemmatize(self, token):
    from ufal.morphodita import TaggedLemmas
    lemmas = TaggedLemmas()  # container for the result
    # analyze() returns an int (the guesser mode that was used); a non-zero
    # value means the form was not found in the dictionary proper
    result = morpho.analyze(token, morpho.GUESSER, lemmas)
    if result != 0:
        # sometimes uppercasing the first character helps
        result = morpho.analyze(token.title(), morpho.GUESSER, lemmas)
    return morpho.rawLemma(lemmas[0].lemma).lower()
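# A note on the `morpho` object used in lemmatize() above: in the MorphoDiTa
# Python bindings a morphology instance is obtained either directly from a
# dictionary file or from an already loaded tagger.  A minimal sketch (the
# model file names are examples only, not files shipped with this code):
from ufal.morphodita import Morpho, Tagger, TaggedLemmas

morpho = Morpho.load('czech-morfflex.dict')
# ... or, when a tagger is already loaded:
# morpho = Tagger.load('czech-morfflex-pdt.tagger').getMorpho()

lemmas = TaggedLemmas()
morpho.analyze('ženami', morpho.GUESSER, lemmas)
print(morpho.rawLemma(lemmas[0].lemma))  # expected to print 'žena'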
def analyze_form(self, form, guesser=True):
    """Return all lemma-tag analyses (a list of FormInfo tuples) of a given form."""
    use_guesser = 1 if guesser else 0
    tagged_lemmas = TaggedLemmas()
    used_guesser = self.tool.analyze(form, use_guesser, tagged_lemmas)
    result = []
    for tl in tagged_lemmas:
        result.append(FormInfo(form, tl.lemma, tl.tag, used_guesser))
    return result
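# Usage sketch (hypothetical instance name): assuming `ma` is an instance of the
# surrounding class with `ma.tool` holding a loaded ufal.morphodita.Morpho
# object, each possible reading of the form comes back as one FormInfo tuple:
#
#     for info in ma.analyze_form('stát'):        # ambiguous: noun and verb readings
#         print(info)
#     ma.analyze_form('stát', guesser=False)       # dictionary-only analysis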
def __init__(self, derinet_file_name, morfflex_file_name, morpho_file_name):
    logger.info("Loading derivations.")
    derinet_db = DeriNetDatabase(derinet_file_name)
    logger.info("Derivations loaded.")

    if morfflex_file_name is not None:
        logger.info("Loading inflections.")
        db = MorfFlexDatabase(morfflex_file_name, derinet_db)
        logger.info("Inflections loaded.")
    else:
        logger.info("Not loading inflections.")
        db = derinet_db

    logger.info("Detecting stem bounds.")
    for node in db.iter():
        node.detect_stems()
    logger.info("Stem bounds detected.")

    logger.info("Propagating morph bounds.")
    for root in db.iter_trees():
        root.propagate_morph_bounds()
    logger.info("Morph bounds propagated.")

    lemmas = []
    tagger = None
    if morpho_file_name is not None:
        logger.info("Loading morphology")
        if morphodita_available:
            tagger = Tagger.load(morpho_file_name)
        else:
            logger.error("You need to install the MorphoDiTa Python bindings!")
        if not tagger:
            logger.critical("Cannot load morphological dictionary from file '%s'.",
                            morpho_file_name)
            sys.exit(1)
        lemmas = TaggedLemmas()
        logger.info("Morphology loaded.")
    else:
        logger.info("No morphological dictionary specified. "
                    "Inflectional morphology will not be available.")
        tagger = None

    self.db = db
    self.tagger = tagger
    self.lemmas = lemmas
def __init__(self, model_file):
    """Instantiate Morphodita from a provided model file.

    :param model_file: Path to the model file.
    :type model_file: str
    """
    from ufal.morphodita import Tagger, Forms, TaggedLemmas, TokenRanges
    self.tagger = Tagger.load(model_file)
    self.forms = Forms()
    self.lemmas = TaggedLemmas()
    self.tokens = TokenRanges()
    self.tokenizer = self.tagger.newTokenizer()
def create_lemmas(self, text):
    _forms = Forms()
    _lemmas = TaggedLemmas()
    _tokens = TokenRanges()
    self.tokenizer.setText(text)
    lemmas = []
    while self.tokenizer.nextSentence(_forms, _tokens):
        self.tagger.tag(_forms, _lemmas)
        for i in range(len(_lemmas)):
            lemma = _lemmas[i]
            token = _tokens[i]
            form = _forms[i]
            lemmas.append(Lemma(lemma.lemma, lemma.tag, form))
    return lemmas
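# Usage sketch: assuming create_lemmas() belongs to the Morphodita wrapper class
# whose constructor is shown above (the model path below is a made-up example),
# it runs the sentence loop over the whole text and returns a flat list of
# Lemma(lemma, tag, form) objects:
#
#     m = Morphodita('czech-morfflex-pdt.tagger')
#     for l in m.create_lemmas('Nemocnice byla přeplněná.'):
#         print(l)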
def lemmatize_and_replace_entities(file, output_file):
    nametag_model = os.path.join(dir_cur, 'czech-cnec2.0-140304.ner')
    morphodita_model = os.path.join(dir_cur, 'czech-morfflex-131112.tagger-fast')
    tagger = Tagger.load(morphodita_model)
    assert tagger
    ner = Ner.load(nametag_model)
    assert ner
    forms = Forms()
    lemmas = TaggedLemmas()
    tokens = TokenRanges()
    entities = NamedEntities()
    tokenizer = ner.newTokenizer()
    assert tokenizer
    with open_gz(output_file, 'w') as out, open_gz(file) as f:
        for line in f:
            tokenizer.setText(line)
            while tokenizer.nextSentence(forms, tokens):
                tagger.tag(forms, lemmas)
                ner.recognize(forms, entities)
                sorted_entities = sort_entities(entities)
                open_entities = []
                open_entities_type = []
                e = 0
                for i in range(len(tokens)):
                    lemma = lemmas[i]
                    token = tokens[i]
                    word = line[token.start:token.start + token.length]
                    # collect entities that start at the current token
                    while e < len(sorted_entities) and sorted_entities[e].start == i:
                        open_entities.append(sorted_entities[e].start
                                             + sorted_entities[e].length - 1)
                        open_entities_type.append(sorted_entities[e].type)
                        e += 1
                    if len(open_entities) == 0:
                        out.write(str(lemma.lemma) + ' ')
                    else:
                        # token is covered by an entity: write a type placeholder instead
                        out.write("@!ENT!%s " % ('!'.join(open_entities_type)))
                    # close entities that end at the current token
                    while open_entities and open_entities[-1] == i:
                        open_entities.pop()
                        open_entities_type.pop()
                out.write('\n')
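# Usage sketch for lemmatize_and_replace_entities() above (file names are
# hypothetical; the NameTag and MorphoDiTa models named in the function must be
# available in `dir_cur`).  Tokens covered by a named entity come out as an
# "@!ENT!<types>" placeholder, everything else as its lemma:
#
#     lemmatize_and_replace_entities('articles.txt.gz', 'articles.lemmas.txt.gz')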
def __init__(self, tagger_model):
    self.__tagger = Tagger.load(tagger_model)
    self.__tokenizer = self.__tagger.newTokenizer()
    self.__forms_buf = Forms()
    self.__tokens_buf = TokenRanges()
    self.__lemmas_buf = TaggedLemmas()
def pos_tagging(self, text: str, stem=False, preprocess=True):
    """Perform POS tagging of the given text.

    :param text: input text
    :param stem: use the stem of each word instead of just the lemma
    :param preprocess: apply preprocessing (stop-word removal, lowercasing, emoji replacement)
    :return: list of sentences, each a list of tagged words: List[List[WordPos]]
    """
    lemmas = TaggedLemmas()
    tokens = TokenRanges()
    forms = Forms()
    sentences = []
    vanilla_text = text

    # remove diacritics
    text = unidecode(text)

    if preprocess:
        # remove stop words
        text = " ".join([w if w not in self.preprocesor.stop_words else ""
                         for w in text.split()])
        # lowercase all text
        text = text.lower()
        # replace smileys
        text = self.preprocesor.replace_emoji(text)
        vanilla_text = text

    # POS tagging
    self.tokenizer.setText(text)
    while self.tokenizer.nextSentence(forms, tokens):
        sentence = []
        self.tagger.tag(forms, lemmas)
        for i in range(len(lemmas)):
            lemma = lemmas[i].lemma
            tag = lemmas[i].tag
            token = tokens[i]
            token_text = vanilla_text[token.start:token.start + token.length]
            # remove diacritics
            lemma = unidecode(lemma)
            # English-word flag
            eng_word = False

            # boundary token ('-' is not a boundary token)
            if tag[0] == "Z" and lemma != "-":
                if not preprocess:
                    sentence.append(WordPos(lemma, tag, token_text))
                if sentence:
                    sentences.append(sentence)
                    sentence = []
                continue

            # don't stem English words
            if lemma.find("angl") != -1:
                eng_word = True
            # remove additional information from the lemma
            lemma = lemma.split("_")[0]
            lemma = re.sub(r'-\d*$', '', lemma)

            # stem
            if stem and not eng_word:
                lemma = cz_stem(lemma)
            if lemma and not preprocess or len(lemma) > 2:
                sentence.append(WordPos(lemma, tag, token_text))
        if sentence:
            sentences.append(sentence)
    return sentences
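# Usage sketch (hypothetical instance): assuming `tg` is an instance of the
# surrounding class with a loaded MorphoDiTa tagger, tokenizer, and preprocessor,
# pos_tagging() returns one list of WordPos(lemma, tag, token_text) items per
# sentence:
#
#     for sentence in tg.pos_tagging('Jídlo bylo skvělé, ale obsluha pomalá.', stem=True):
#         for wp in sentence:
#             print(wp)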