Example No. 1
    def __init__(self, pipeline, language):
        self.tagger = pipeline.turbo_interface.create_tagger()
        self.parser = pipeline.turbo_interface.create_parser()
        self.lemmatizer = None
        if language == 'PT':
            # Portuguese (Floresta) models.
            self.sent_tokenizer = nltk.data.load(
                'tokenizers/punkt/portuguese.pickle')
            self.word_tokenizer = tokenizer_PT.PortugueseFlorestaWordTokenizer()
            self.tagger.load_tagger_model(
                '/home/atm/workspace/CPP/TurboParser/models/portuguese_floresta_v2.0_nomwe_auto/portuguese_floresta_v2.0_nomwe_auto_tagger.model')
            self.parser.load_parser_model(
                '/home/atm/workspace/CPP/TurboParser/models/portuguese_floresta_v2.0_nomwe_auto/portuguese_floresta_v2.0_nomwe_auto_parser_pruned-true_model-standard.model')
            self.lemmatizer = lemmatizer.BasicLemmatizer()
            self.lemmatizer.load_lemmatizer_model(
                '/home/atm/workspace/CPP/TurboParser/models/portuguese_floresta_v2.0_nomwe_auto/portuguese_floresta_v2.0_nomwe_auto_lemmatizer.model')
        elif language == 'PT-Cintil':
            # Portuguese with CINTIL-style tokenization (cetem-depbank models); no lemmatizer is loaded.
            self.sent_tokenizer = nltk.data.load(
                'tokenizers/punkt/portuguese.pickle')
            self.word_tokenizer = tokenizer_PT.PortugueseCintilWordTokenizer()
            self.tagger.load_tagger_model(
                '/home/atm/workspace/CPP/TurboParser/models/portuguese_cetem-depbank/portuguese_cetem-depbank_tagger.model')
            self.parser.load_parser_model(
                '/home/atm/workspace/CPP/TurboParser/models/portuguese_cetem-depbank/portuguese_cetem-depbank_parser_pruned-true_model-standard.model')
        elif language == 'ES':
            # Spanish (CoNLL-2009) models.
            self.sent_tokenizer = nltk.data.load(
                'tokenizers/punkt/spanish.pickle')
            self.word_tokenizer = nltk.TreebankWordTokenizer()  # For now...
            self.tagger.load_tagger_model(
                '/home/atm/workspace/CPP/TurboParser/models/spanish_conll2009_v2.0_nomwe_auto/spanish_conll2009_v2.0_nomwe_auto_tagger.model')
            self.parser.load_parser_model(
                '/home/atm/workspace/CPP/TurboParser/models/spanish_conll2009_v2.0_nomwe_auto/spanish_conll2009_v2.0_nomwe_auto_parser_pruned-true_model-standard.model')
            self.lemmatizer = lemmatizer.BasicLemmatizer()
            self.lemmatizer.load_lemmatizer_model(
                '/home/atm/workspace/CPP/TurboParser/models/spanish_conll2009_v2.0_nomwe_auto/spanish_conll2009_v2.0_nomwe_auto_lemmatizer.model')
        elif language == 'EN':
            # English (english_proj) models.
            self.sent_tokenizer = nltk.data.load(
                'tokenizers/punkt/english.pickle')
            self.word_tokenizer = nltk.TreebankWordTokenizer()
            self.tagger.load_tagger_model(
                '/home/atm/workspace/CPP/TurboParser/models/english_proj/english_proj_tagger.model')
            self.parser.load_parser_model(
                '/home/atm/workspace/CPP/TurboParser/models/english_proj/english_proj_parser_pruned-true_model-standard.model')
        else:
            raise NotImplementedError
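
Example No. 1 hard-codes the model paths for each supported language directly in the constructor. A minimal usage sketch, assuming the method belongs to a class named Worker and that pipeline already exposes the turbo_interface factory used above (both names are assumptions, not shown in the snippet):

    # Hypothetical construction; 'Worker' is an assumed class name, since the
    # snippet only shows the __init__ body.
    worker = Worker(pipeline, 'EN')
    sentences = worker.sent_tokenizer.tokenize('A sentence. Another one.')
    tokens = [worker.word_tokenizer.tokenize(s) for s in sentences]
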
Example No. 2
    def __init__(self, pipeline, language):
        self.tagger = None
        self.parser = None
        self.semantic_parser = None
        self.lemmatizer = None

        if language not in pipeline.models:
            print('Error: no model for language %s.' % language)
            raise NotImplementedError

        if 'splitter' in pipeline.models[language]:
            self.sent_tokenizer = nltk.data.load(pipeline.models[language]['splitter'])
        else:
            # If no splitter is specified, use the English model.
            self.sent_tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')

        if language == 'PT':
            self.word_tokenizer = tokenizer_PT.PortugueseFlorestaWordTokenizer()
        elif language == 'PT-Cintil':
            self.word_tokenizer = tokenizer_PT.PortugueseCintilWordTokenizer()
        else:
            self.word_tokenizer = nltk.TreebankWordTokenizer() # For now...

        if 'tagger' in pipeline.models[language]:
            self.tagger = pipeline.turbo_interface.create_tagger()
            self.tagger.load_tagger_model(pipeline.models[language]['tagger'])
        if 'parser' in pipeline.models[language]:
            self.parser = pipeline.turbo_interface.create_parser()
            self.parser.load_parser_model(pipeline.models[language]['parser'])
        if 'lemmatizer' in pipeline.models[language]:
            self.lemmatizer = lemmatizer.BasicLemmatizer()
            self.lemmatizer.load_lemmatizer_model(pipeline.models[language]['lemmatizer'])
        if 'semantic_parser' in pipeline.models[language]:
            self.semantic_parser = pipeline.turbo_interface.create_semantic_parser()
            self.semantic_parser.load_semantic_parser_model(pipeline.models[language]['semantic_parser'])
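
Examples No. 2 and No. 3 replace the hard-coded paths with a lookup into pipeline.models, a dictionary keyed by language code whose values map component names to model files. A minimal sketch of a configuration that would satisfy Example No. 2 (all paths are placeholders, not real model locations):

    # Hypothetical pipeline.models layout assumed by Example No. 2.
    # Only 'splitter' has a fallback (the English Punkt model); every other
    # component is created only when its key is present.
    pipeline.models = {
        'EN': {
            'splitter': 'tokenizers/punkt/english.pickle',
            'tagger': '/path/to/english_tagger.model',
            'parser': '/path/to/english_parser.model',
        },
        'PT': {
            'splitter': 'tokenizers/punkt/portuguese.pickle',
            'tagger': '/path/to/portuguese_tagger.model',
            'parser': '/path/to/portuguese_parser.model',
            'lemmatizer': '/path/to/portuguese_lemmatizer.model',
        },
    }
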
Example No. 3
    def __init__(self, pipeline, language):
        self.tagger = None
        self.morphological_tagger = None
        self.entity_recognizer = None
        self.parser = None
        self.semantic_parser = None
        self.lemmatizer = None
        self.coreference_resolver = None

        if language not in pipeline.models:
            print('Error: no model for language %s.' % language)
            raise NotImplementedError

        if 'splitter' in pipeline.models[language]:
            self.sent_tokenizer = nltk.data.load(
                pipeline.models[language]['splitter'])
        else:
            # If no splitter is specified, use the English model.
            self.sent_tokenizer = nltk.data.load(
                'tokenizers/punkt/english.pickle')
        if 'tokenizer' in pipeline.models[language]:
            tokenizer_language = pipeline.models[language]['tokenizer']
            self.word_tokenizer = \
                UniversalWordTokenizer(language=tokenizer_language)
        else:
            self.word_tokenizer = UniversalWordTokenizer(language='none')
        if 'tagger' in pipeline.models[language]:
            self.tagger = pipeline.turbo_interface.create_tagger()
            self.tagger.load_tagger_model(pipeline.models[language]['tagger'])
        if 'morphological_tagger' in pipeline.models[language]:
            self.morphological_tagger = pipeline.turbo_interface.create_morphological_tagger()
            self.morphological_tagger.load_morphological_tagger_model(
                pipeline.models[language]['morphological_tagger'])
        if 'entity_recognizer' in pipeline.models[language]:
            self.entity_recognizer = pipeline.turbo_interface.create_entity_recognizer()
            self.entity_recognizer.load_entity_recognizer_model(
                pipeline.models[language]['entity_recognizer'])
        if 'parser' in pipeline.models[language]:
            self.parser = pipeline.turbo_interface.create_parser()
            self.parser.load_parser_model(pipeline.models[language]['parser'])
        if 'lemmatizer' in pipeline.models[language]:
            self.lemmatizer = lemmatizer.BasicLemmatizer()
            self.lemmatizer.load_lemmatizer_model(
                pipeline.models[language]['lemmatizer'])
        if 'semantic_parser' in pipeline.models[language]:
            self.semantic_parser = pipeline.turbo_interface.create_semantic_parser()
            self.semantic_parser.load_semantic_parser_model(
                pipeline.models[language]['semantic_parser'])
        if 'coreference_resolver' in pipeline.models[language]:
            self.coreference_resolver = pipeline.turbo_interface.create_coreference_resolver()
            self.coreference_resolver.load_coreference_resolver_model(
                pipeline.models[language]['coreference_resolver'])
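
Example No. 3 follows the same scheme but makes every annotator optional: a component is instantiated only when its key appears in the language's entry, and the corresponding attribute otherwise stays None, so callers must check before use. A sketch of a single-language entry covering every key Example No. 3 recognizes (all values are placeholders; the format of the 'tokenizer' value is assumed to be a language name accepted by UniversalWordTokenizer):

    # Hypothetical entry listing every optional component key read by Example No. 3.
    pipeline.models['EN'] = {
        'splitter': 'tokenizers/punkt/english.pickle',
        'tokenizer': 'english',  # passed as UniversalWordTokenizer(language=...)
        'tagger': '/path/to/tagger.model',
        'morphological_tagger': '/path/to/morph_tagger.model',
        'entity_recognizer': '/path/to/ner.model',
        'parser': '/path/to/parser.model',
        'lemmatizer': '/path/to/lemmatizer.model',
        'semantic_parser': '/path/to/semantic_parser.model',
        'coreference_resolver': '/path/to/coref.model',
    }
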