Example #1
def load_parser(chunker):
    # Load a spaCy pipeline; when 'nlp_arch' is requested, swap spaCy's
    # parser-based noun chunking for the NLP Architect NP chunker.
    logger.info('loading spacy. chunker=%s', chunker)
    if 'nlp_arch' in chunker:
        parser = SpacyInstance(model='en_core_web_sm',
                               disable=['textcat', 'ner', 'parser']).parser
        # With the dependency parser disabled, sentence boundaries must
        # come from the rule-based sentencizer.
        parser.add_pipe(parser.create_pipe('sentencizer'), first=True)
        _path_to_model = path.join(chunker_path, chunker_model_file)
        _path_to_params = path.join(chunker_path, chunker_model_dat_file)
        if not path.exists(chunker_path):
            makedirs(chunker_path)
        # Fetch the pre-trained model and its parameter file on first use.
        if not path.exists(_path_to_model):
            logger.info(
                'The pre-trained model to be downloaded for NLP Architect'
                ' word chunker model is licensed under Apache 2.0')
            download_unlicensed_file(nlp_chunker_url, chunker_model_file,
                                     _path_to_model)
        if not path.exists(_path_to_params):
            download_unlicensed_file(nlp_chunker_url, chunker_model_dat_file,
                                     _path_to_params)
        # Run the NP annotator last so it sees sentence boundaries.
        parser.add_pipe(NPAnnotator.load(_path_to_model, _path_to_params),
                        last=True)
    else:
        parser = SpacyInstance(model='en_core_web_sm',
                               disable=['textcat', 'ner']).parser
    logger.info('spacy loaded')
    return parser
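
The constants referenced above (chunker_path, chunker_model_file, chunker_model_dat_file, nlp_chunker_url) are module-level names in the original source and are not shown in the snippet. A minimal usage sketch, with assumed placeholder values for those names:

# Usage sketch only; every value below is an illustrative assumption,
# not the original module's configuration. Assumes load_parser and
# get_noun_phrases from the surrounding module are importable.
import logging
from os import path, makedirs

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

chunker_path = path.join(path.expanduser('~'), 'chunker_cache')  # assumed cache dir
chunker_model_file = 'model.h5'                                  # assumed file name
chunker_model_dat_file = 'model_info.dat.params'                 # assumed file name
nlp_chunker_url = 'https://example.com/models/chunker/'          # placeholder URL

parser = load_parser('nlp_arch')
doc = parser('The quick brown fox jumped over the lazy dog.')
print([np.text for np in get_noun_phrases(doc)])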
Example #2
def load_parser(chunker):
    # load spacy parser
    logger.info("loading spacy. chunker=%s", chunker)
    if "nlp_arch" in chunker:
        parser = SpacyInstance(model="en_core_web_sm",
                               disable=["textcat", "ner", "parser"]).parser
        parser.add_pipe(parser.create_pipe("sentencizer"), first=True)
        _path_to_model = path.join(chunker_path, chunker_model_file)
        _path_to_params = path.join(chunker_path, chunker_model_dat_file)
        if not path.exists(chunker_path):
            makedirs(chunker_path)
        if not path.exists(_path_to_model):
            logger.info(
                "The pre-trained model to be downloaded for NLP Architect"
                " word chunker model is licensed under Apache 2.0")
            download_unlicensed_file(nlp_chunker_url, chunker_model_file,
                                     _path_to_model)
        if not path.exists(_path_to_params):
            download_unlicensed_file(nlp_chunker_url, chunker_model_dat_file,
                                     _path_to_params)
        parser.add_pipe(NPAnnotator.load(_path_to_model, _path_to_params),
                        last=True)
    else:
        parser = SpacyInstance(model="en_core_web_sm",
                               disable=["textcat", "ner"]).parser
    logger.info("spacy loaded")
    return parser
Example #3
def test_np_annotator_linked(model_path, settings_path, text, phrases):
    annotator = SpacyInstance(model="en", disable=["textcat", "ner", "parser"]).parser
    annotator.add_pipe(annotator.create_pipe("sentencizer"), first=True)
    annotator.add_pipe(NPAnnotator.load(model_path, settings_path), last=True)
    doc = annotator(text)
    noun_phrases = [p.text for p in get_noun_phrases(doc)]
    for p in phrases:
        assert p in noun_phrases
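
The test takes its model location and expectations as parameters (in the original suite they presumably arrive as pytest fixtures). A hedged sketch of calling it directly, with paths and expected phrases as assumptions:

# Direct-call sketch; the paths and expected phrases below are
# illustrative assumptions, not values from the original test suite.
test_np_annotator_linked(
    model_path='chunker/model.h5',           # assumed model path
    settings_path='chunker/model_info.dat',  # assumed settings path
    text='The quick brown fox jumped over the fence',
    phrases=['The quick brown fox', 'the fence'],
)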
Example #4
    def __init__(self, parser=None):
        # Reuse a caller-supplied spaCy pipeline, or build a minimal one
        # with NER, parsing, vectors and text classification disabled.
        if parser is None:
            self.nlp = SpacyInstance(
                disable=['ner', 'parser', 'vectors', 'textcat']).parser
        else:
            self.nlp = parser

        # Sentence boundaries come from the rule-based sentencizer since
        # the dependency parser is disabled.
        self.nlp.add_pipe(self.nlp.create_pipe('sentencizer'), first=True)
        _path_to_model = path.join(chunker_local_path, chunker_model_file)
        if not path.exists(chunker_local_path):
            makedirs(chunker_local_path)
        # Download the pre-trained chunker model and parameters on first use.
        if not path.exists(_path_to_model):
            logger.info(
                'The pre-trained model to be downloaded for NLP Architect word'
                ' chunker model is licensed under Apache 2.0')
            download_unlicensed_file(nlp_chunker_url, chunker_model_file, _path_to_model)
        _path_to_params = path.join(chunker_local_path, chunker_model_dat_file)
        if not path.exists(_path_to_params):
            download_unlicensed_file(nlp_chunker_url, chunker_model_dat_file, _path_to_params)
        self.nlp.add_pipe(NPAnnotator.load(_path_to_model, _path_to_params), last=True)
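
Only the constructor is shown here; it most likely belongs to a noun-phrase annotator wrapper class. Assuming that (the class name below is a stand-in, not confirmed by the snippet), usage could look like:

# 'SpacyNPAnnotator' is a stand-in name for whichever class owns the
# __init__ above; the input sentence is arbitrary.
annotator = SpacyNPAnnotator()
doc = annotator.nlp('Fast-growing technology companies attract experienced engineers.')
print([span.text for span in get_noun_phrases(doc)])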
Example #5
        logger.info('loading spacy')
        if 'nlp_arch' in args.chunker:
            nlp = SpacyInstance(model='en_core_web_sm',
                                disable=['textcat', 'ner', 'parser']).parser
            nlp.add_pipe(nlp.create_pipe('sentencizer'), first=True)
            logger.info(
                'The pre-trained model to be downloaded for NLP Architect word'
                ' chunker model is licensed under Apache 2.0')
            _path_to_model = path.join(cur_dir, chunker_model_file)
            download_unlicensed_file(nlp_chunker_url, chunker_model_file,
                                     _path_to_model)
            _path_to_params = path.join(cur_dir, chunker_model_dat_file)
            download_unlicensed_file(nlp_chunker_url, chunker_model_dat_file,
                                     _path_to_params)
            logger.info('Done.')
            nlp.add_pipe(NPAnnotator.load(_path_to_model, _path_to_params),
                         last=True)
        else:
            nlp = SpacyInstance(model='en_core_web_sm',
                                disable=['textcat', 'ner']).parser
        logger.info('spacy loaded')

        # Count lines to size the progress bar, then rewind the file.
        num_lines = sum(1 for _ in corpus_file)
        corpus_file.seek(0)
        logger.info('%i lines in corpus', num_lines)
        i = 0

        with tqdm(total=num_lines) as pbar:
            # Stream the corpus through the pipeline, one document per line.
            for doc in nlp.pipe(corpus_file, n_threads=-1):
                if 'nlp_arch' in args.chunker:
                    spans = get_noun_phrases(doc)
Example #6
def test_np_annotator_load(model_path, settings_path):
    assert NPAnnotator.load(model_path, settings_path)
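
test_np_annotator_load only asserts that the component constructs. A hedged sketch of a slightly stronger check under the same fixtures, verifying the loaded component can join a pipeline (not part of the original suite):

def test_np_annotator_in_pipeline(model_path, settings_path):
    # Hedged extension of the test above: the loaded NPAnnotator should
    # slot into a spaCy pipeline alongside the sentencizer.
    annotator = SpacyInstance(model='en', disable=['textcat', 'ner', 'parser']).parser
    annotator.add_pipe(annotator.create_pipe('sentencizer'), first=True)
    annotator.add_pipe(NPAnnotator.load(model_path, settings_path), last=True)
    assert 'sentencizer' in annotator.pipe_names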