Example #1
def lemmatize(file, output_file):
    morphodita_model = os.path.join(
        dir_cur, 'czech-morfflex-pdt-131112-raw_lemmas.tagger-best_accuracy')
    tagger = Tagger.load(morphodita_model)
    assert tagger
    forms = Forms()
    lemmas = TaggedLemmas()
    tokens = TokenRanges()
    tokenizer = tagger.newTokenizer()
    assert tokenizer
    with open_gz(output_file, 'w') as out, open_gz(file) as f:
        for line in f:
            tokenizer.setText(line)
            while tokenizer.nextSentence(forms, tokens):
                tagger.tag(forms, lemmas)
                # Write each token as "lemma___tag", space-separated.
                out.write(" ".join(
                    str(l.lemma).strip() + '___' + str(l.tag).strip()
                    for l in lemmas))
            out.write('\n')
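
For reference, a minimal self-contained sketch of the same tokenize-and-tag loop on an in-memory string; the model path and the input sentence are placeholders, not part of the original example:

from ufal.morphodita import Tagger, Forms, TaggedLemmas, TokenRanges

# Placeholder path to any MorphoDiTa tagger model.
tagger = Tagger.load('czech-morfflex-pdt.tagger')
assert tagger

forms = Forms()
lemmas = TaggedLemmas()
tokens = TokenRanges()

tokenizer = tagger.newTokenizer()
tokenizer.setText("Toto je krátká věta.")
while tokenizer.nextSentence(forms, tokens):
    tagger.tag(forms, lemmas)
    # Print each token as lemma___tag, mirroring the output format above.
    print(" ".join(str(l.lemma) + '___' + str(l.tag) for l in lemmas))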
Example #2
 def __init__(self):
     self.morphodita_model = os.path.join(dir_cur, 'czech-morfflex-131112.tagger-fast')
     self.tagger = Tagger.load(self.morphodita_model)
     self.forms = Forms()
     self.lemmas = TaggedLemmas()
     self.tokens = TokenRanges()
     self.tokenizer = self.tagger.newTokenizer()
Example #3
 def __init__(self):
     self.morphodita_model = os.path.join(
         dir_cur, 'czech-morfflex-131112.tagger-fast')
     self.tagger = Tagger.load(self.morphodita_model)
     self.forms = Forms()
     self.lemmas = TaggedLemmas()
     self.tokens = TokenRanges()
     self.tokenizer = self.tagger.newTokenizer()
Example #4
 def __init__(self, tagger_model):
     if not os.path.isfile(tagger_model):
         raise IOError('File %s does not exist' % tagger_model)
     self._tagger = Tagger.load(tagger_model)
     self._tokenizer = self._tagger.newTokenizer()
     self._forms_buf = Forms()
     self._tokens_buf = TokenRanges()
     self._tags_buf = TaggedLemmas()
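
These buffers are typically reused across calls. A hedged sketch of how such a class might expose tagging, with a hypothetical analyze_sentence method that also maps each token back to its surface span via TokenRanges:

 def analyze_sentence(self, text):
     """Return (surface, lemma, tag) triples for the first sentence of text."""
     self._tokenizer.setText(text)
     triples = []
     if self._tokenizer.nextSentence(self._forms_buf, self._tokens_buf):
         self._tagger.tag(self._forms_buf, self._tags_buf)
         for token, tagged in zip(self._tokens_buf, self._tags_buf):
             # token.start and token.length index characters of the original text
             surface = text[token.start:token.start + token.length]
             triples.append((surface, tagged.lemma, tagged.tag))
     return triples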
Example #5
    def __init__(self, derinet_file_name, morfflex_file_name,
                 morpho_file_name):
        logger.info("Loading derivations.")
        derinet_db = DeriNetDatabase(derinet_file_name)
        logger.info("Derivations loaded.")

        if morfflex_file_name is not None:
            logger.info("Loading inflections.")
            db = MorfFlexDatabase(morfflex_file_name, derinet_db)
            logger.info("Inflections loaded.")
        else:
            logger.info("Not loading inflections.")
            db = derinet_db

        logger.info("Detecting stem bounds.")

        for node in db.iter():
            node.detect_stems()

        logger.info("Stem bounds detected.")
        logger.info("Propagating morph bounds.")

        for root in db.iter_trees():
            root.propagate_morph_bounds()

        logger.info("Morph bounds propagated.")

        lemmas = []
        tagger = None
        if morpho_file_name is not None:
            logger.info("Loading morphology")
            if morphodita_available:
                tagger = Tagger.load(morpho_file_name)
            else:
                logger.error(
                    "You need to install the MorphoDiTa Python bindings!")

            if not tagger:
                logger.critical(
                    "Cannot load morphological dictionary from file '%s'.",
                    morpho_file_name)
                sys.exit(1)

            lemmas = TaggedLemmas()
            logger.info("Morphology loaded.")
        else:
            logger.info(
                "No morphological dictionary specified. Inflectional morphology will not be available."
            )
            tagger = None

        self.db = db
        self.tagger = tagger
        self.lemmas = lemmas
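
The morphodita_available flag checked above is defined elsewhere in that project; it presumably comes from a guarded import along these lines (a sketch, not the project's actual code):

try:
    from ufal.morphodita import Tagger, TaggedLemmas
    morphodita_available = True
except ImportError:
    morphodita_available = False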
Example #6
File: keywords.py Project: vidiecan/ker
    def __init__(self, model_file):
        """
        Instantiates Morphodita from a provided model file.

        :param model_file: Path to the model file.
        :type model_file: str
        """
        from ufal.morphodita import Tagger, Forms, TaggedLemmas, TokenRanges
        self.tagger = Tagger.load(model_file)
        self.forms = Forms()
        self.lemmas = TaggedLemmas()
        self.tokens = TokenRanges()
        self.tokenizer = self.tagger.newTokenizer()
Example #7
    def __init__(self, tagger_model, abst_slots):
        self._tagger = Tagger.load(tagger_model)
        self._analyzer = self._tagger.getMorpho()
        self._tokenizer = self._tagger.newTokenizer()
        self._abst_slots = set(abst_slots.split(','))

        self._forms_buf = Forms()
        self._tokens_buf = TokenRanges()
        self._analyses_buf = Analyses()
        self._indices_buf = Indices()

        self._sf_dict = {}
        self._sf_max_len = 0
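
A minimal sketch of what the analyzer obtained via getMorpho() is used for: looking up all (lemma, tag) readings of a single form. The model path and the word are placeholders:

from ufal.morphodita import Tagger, TaggedLemmas

tagger = Tagger.load('czech-morfflex.tagger')  # placeholder model path
morpho = tagger.getMorpho()

analyses = TaggedLemmas()
# Fills analyses with every (lemma, tag) reading of the form,
# falling back to the guesser for unknown words.
morpho.analyze("domech", morpho.GUESSER, analyses)
for analysis in analyses:
    print(analysis.lemma, analysis.tag)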
Example #8
File: keywords.py Project: ufal/ker
    def __init__(self, model_file):
        """
        Instantiates Morphodita from a provided model file.

        :param model_file: Path to the model file.
        :type model_file: str
        """
        from ufal.morphodita import Tagger, Forms, TaggedLemmas, TokenRanges
        self.tagger = Tagger.load(model_file)
        self.forms = Forms()
        self.lemmas = TaggedLemmas()
        self.tokens = TokenRanges()
        self.tokenizer = self.tagger.newTokenizer()
Example #9
 def load_tagger(self, path: str):
     """
     Load a MorphoDiTa tagger from the given path.
     :param path: path to the MorphoDiTa tagger model file
     """
     self.tagger = Tagger.load(path)
     if self.tagger is None:
         raise Exception("[morpho_tagger] Wrong path in tagger")
     # create tokenizer
     self.tokenizer = self.tagger.newTokenizer()
     if self.tokenizer is None:
         raise Exception("[morpho_tagger] tokenizer not created")
Example #10
File: convert.py Project: UFAL-DSG/tgen
    def __init__(self, tagger_model, abst_slots):
        self._tagger = Tagger.load(tagger_model)
        self._analyzer = self._tagger.getMorpho()
        self._tokenizer = self._tagger.newTokenizer()
        self._abst_slots = set(abst_slots.split(','))

        self._forms_buf = Forms()
        self._tokens_buf = TokenRanges()
        self._analyses_buf = Analyses()
        self._indices_buf = Indices()

        self._sf_dict = {}
        self._rev_sf_dict = {}
        self._sf_max_len = 0
Example #11
def lemmatize_and_replace_entities(file, output_file):
    nametag_model = os.path.join(dir_cur, 'czech-cnec2.0-140304.ner')
    morphodita_model = os.path.join(dir_cur,
                                    'czech-morfflex-131112.tagger-fast')
    tagger = Tagger.load(morphodita_model)
    assert tagger
    ner = Ner.load(nametag_model)
    assert ner
    forms = Forms()
    lemmas = TaggedLemmas()
    tokens = TokenRanges()
    entities = NamedEntities()
    tokenizer = ner.newTokenizer()
    assert tokenizer
    with open_gz(output_file, 'w') as out, open_gz(file) as f:
        for line in f:
            tokenizer.setText(line)
            while tokenizer.nextSentence(forms, tokens):
                tagger.tag(forms, lemmas)
                ner.recognize(forms, entities)
                sorted_entities = sort_entities(entities)
                open_entities = []
                open_entities_type = []
                e = 0
                for i in range(len(tokens)):
                    lemma = lemmas[i]
                    token = tokens[i]
                    word = line[token.start:token.start + token.length]
                    while e < len(
                            sorted_entities) and sorted_entities[e].start == i:
                        open_entities.append(sorted_entities[e].start +
                                             sorted_entities[e].length - 1)
                        open_entities_type.append(sorted_entities[e].type)
                        e += 1
                    if len(open_entities) == 0:
                        out.write(str(lemma.lemma) + ' ')
                    else:
                        out.write("@!ENT!%s " % ('!'.join(open_entities_type)))
                    while open_entities and open_entities[-1] == i:
                        open_entities.pop()
                        open_entities_type.pop()
            out.write('\n')
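
For comparison, a stripped-down sketch of the NameTag part alone, built only from ufal.nametag types; the model path and the sentence are placeholders:

from ufal.nametag import Ner, Forms, TokenRanges, NamedEntities

ner = Ner.load('czech-cnec2.0-140304.ner')  # placeholder path, as in the example above
assert ner

forms = Forms()
tokens = TokenRanges()
entities = NamedEntities()

tokenizer = ner.newTokenizer()
tokenizer.setText("Václav Havel se narodil v Praze.")
while tokenizer.nextSentence(forms, tokens):
    ner.recognize(forms, entities)
    for entity in entities:
        # entity.start and entity.length index tokens of the current sentence
        span = " ".join(forms[i] for i in range(entity.start, entity.start + entity.length))
        print(entity.type, span)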
Example #12
def lemmatize_and_replace_entities(file, output_file):
    nametag_model = os.path.join(dir_cur, 'czech-cnec2.0-140304.ner')
    morphodita_model = os.path.join(dir_cur, 'czech-morfflex-131112.tagger-fast')
    tagger = Tagger.load(morphodita_model)
    assert tagger
    ner = Ner.load(nametag_model)
    assert ner
    forms = Forms()
    lemmas = TaggedLemmas()
    tokens = TokenRanges()
    entities = NamedEntities()
    tokenizer = ner.newTokenizer()
    assert tokenizer
    with open_gz(output_file, 'w') as out, open_gz(file) as f:
        for line in f:
            tokenizer.setText(line)
            while tokenizer.nextSentence(forms, tokens):
                tagger.tag(forms, lemmas)
                ner.recognize(forms, entities)
                sorted_entities = sort_entities(entities)
                open_entities = []
                open_entities_type = []
                e = 0
                for i in range(len(tokens)):
                    lemma = lemmas[i]
                    token = tokens[i]
                    word = line[token.start:token.start + token.length]
                    while e < len(sorted_entities) and sorted_entities[e].start == i:
                        open_entities.append(sorted_entities[e].start + sorted_entities[e].length - 1)
                        open_entities_type.append(sorted_entities[e].type)
                        e += 1
                    if len(open_entities) == 0:
                        out.write(str(lemma.lemma) + ' ')
                    else:
                        out.write("@!ENT!%s " % ('!'.join(open_entities_type)))
                    while open_entities and open_entities[-1] == i:
                        open_entities.pop()
                        open_entities_type.pop()
            out.write('\n')
Example #13
def lemmatize(file, output_file):
    morphodita_model = os.path.join(dir_cur, 'czech-morfflex-pdt-131112-raw_lemmas.tagger-best_accuracy')
    tagger = Tagger.load(morphodita_model)
    assert tagger
    forms = Forms()
    lemmas = TaggedLemmas()
    tokens = TokenRanges()
    tokenizer = tagger.newTokenizer()
    assert tokenizer
    with open_gz(output_file, 'w') as out, open_gz(file) as f:
        for line in f:
            tokenizer.setText(line)
            while tokenizer.nextSentence(forms, tokens):
                tagger.tag(forms, lemmas)
                # Write each token as "lemma___tag", space-separated.
                out.write(" ".join(str(l.lemma).strip() + '___' + str(l.tag).strip() for l in lemmas))
            out.write('\n')
Example #14
File: expand_stops.py Project: AoJ/alex
 def __init__(self, tagger_model):
     self.__tagger = Tagger.load(tagger_model)
     self.__tokenizer = self.__tagger.newTokenizer()
     self.__forms_buf = Forms()
     self.__tokens_buf = TokenRanges()
     self.__lemmas_buf = TaggedLemmas()
Example #15
File: cs_morpho.py Project: madkote/alex
 def __init__(self, tagger_model):
     self.__tagger = Tagger.load(tagger_model)
     self.__tokenizer = self.__tagger.newTokenizer()
     self.__forms_buf = Forms()
     self.__tokens_buf = TokenRanges()
     self.__lemmas_buf = TaggedLemmas()
Example #16
 def load(self):
     self.tagger = Tagger.load("app/LM/czech-morfflex-pdt-161115.tagger")
     self.tokenizer = self.tagger.newTokenizer()