Example #1
def load_parser(chunker):
    # load spacy parser
    logger.info('loading spacy. chunker=%s', chunker)
    if 'nlp_arch' in chunker:
        parser = SpacyInstance(model='en_core_web_sm',
                               disable=['textcat', 'ner', 'parser']).parser
        parser.add_pipe(parser.create_pipe('sentencizer'), first=True)
        _path_to_model = path.join(chunker_path, chunker_model_file)
        _path_to_params = path.join(chunker_path, chunker_model_dat_file)
        if not path.exists(chunker_path):
            makedirs(chunker_path)
        if not path.exists(_path_to_model):
            logger.info(
                'The pre-trained model to be downloaded for NLP Architect'
                ' word chunker model is licensed under Apache 2.0')
            download_unlicensed_file(nlp_chunker_url, chunker_model_file,
                                     _path_to_model)
        if not path.exists(_path_to_params):
            download_unlicensed_file(nlp_chunker_url, chunker_model_dat_file,
                                     _path_to_params)
        parser.add_pipe(NPAnnotator.load(_path_to_model, _path_to_params),
                        last=True)
    else:
        parser = SpacyInstance(model='en_core_web_sm',
                               disable=['textcat', 'ner']).parser
    logger.info('spacy loaded')
    return parser
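A minimal usage sketch for this loader (assuming nlp_architect is installed and that get_noun_phrases is imported as in the later examples; the chunker value and the input sentence are made up):

# usage sketch: 'nlp_arch' in the chunker string selects the NPAnnotator branch above
parser = load_parser(chunker='nlp_arch')
doc = parser('The quick brown fox jumped over the lazy dog near the old oak tree.')
for np in get_noun_phrases(doc):  # noun phrase spans produced by NPAnnotator
    print(np.text)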
Example #2
def load_parser(chunker):
    # load spacy parser
    logger.info("loading spacy. chunker=%s", chunker)
    if "nlp_arch" in chunker:
        parser = SpacyInstance(model="en_core_web_sm",
                               disable=["textcat", "ner", "parser"]).parser
        parser.add_pipe(parser.create_pipe("sentencizer"), first=True)
        _path_to_model = path.join(chunker_path, chunker_model_file)
        _path_to_params = path.join(chunker_path, chunker_model_dat_file)
        if not path.exists(chunker_path):
            makedirs(chunker_path)
        if not path.exists(_path_to_model):
            logger.info(
                "The pre-trained model to be downloaded for NLP Architect"
                " word chunker model is licensed under Apache 2.0")
            download_unlicensed_file(nlp_chunker_url, chunker_model_file,
                                     _path_to_model)
        if not path.exists(_path_to_params):
            download_unlicensed_file(nlp_chunker_url, chunker_model_dat_file,
                                     _path_to_params)
        parser.add_pipe(NPAnnotator.load(_path_to_model, _path_to_params),
                        last=True)
    else:
        parser = SpacyInstance(model="en_core_web_sm",
                               disable=["textcat", "ner"]).parser
    logger.info("spacy loaded")
    return parser
Example #3
def test_np_annotator_linked(model_path, settings_path, text, phrases):
    # build a spaCy pipeline with the NPAnnotator and check that the expected noun phrases are extracted
    annotator = SpacyInstance(model="en", disable=["textcat", "ner", "parser"]).parser
    annotator.add_pipe(annotator.create_pipe("sentencizer"), first=True)
    annotator.add_pipe(NPAnnotator.load(model_path, settings_path), last=True)
    doc = annotator(text)
    noun_phrases = [p.text for p in get_noun_phrases(doc)]
    for p in phrases:
        assert p in noun_phrases
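One way to drive this test is a direct call with explicit arguments; the model and settings paths and the expected phrases below are placeholders, not the repository's real test data, and the assertions only pass if the downloaded chunker actually emits those phrases:

# hypothetical invocation with placeholder paths and expected phrases
test_np_annotator_linked(
    model_path="model.h5",            # assumed: locally stored chunker model file
    settings_path="model_info.dat",   # assumed: its accompanying parameter file
    text="The quick brown fox jumped over the lazy dog",
    phrases=["The quick brown fox", "the lazy dog"])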
Example #4
class NPScorer(object):
    def __init__(self, parser=None):
        if parser is None:
            self.nlp = SpacyInstance(
                disable=["ner", "parser", "vectors", "textcat"]).parser
        else:
            self.nlp = parser

        self.nlp.add_pipe(self.nlp.create_pipe("sentencizer"), first=True)
        # download the pre-trained NP chunker model files if they are not cached locally
        _path_to_model = path.join(chunker_local_path, chunker_model_file)
        if not path.exists(chunker_local_path):
            makedirs(chunker_local_path)
        if not path.exists(_path_to_model):
            logger.info(
                "The pre-trained model to be downloaded for NLP Architect word"
                " chunker model is licensed under Apache 2.0")
            download_unlicensed_file(nlp_chunker_url, chunker_model_file,
                                     _path_to_model)
        _path_to_params = path.join(chunker_local_path, chunker_model_dat_file)
        if not path.exists(_path_to_params):
            download_unlicensed_file(nlp_chunker_url, chunker_model_dat_file,
                                     _path_to_params)
        self.nlp.add_pipe(NPAnnotator.load(_path_to_model, _path_to_params),
                          last=True)

    def score_documents(self,
                        texts: list,
                        limit=-1,
                        return_all=False,
                        min_tf=5):
        documents = []
        assert len(texts) > 0, "texts should contain at least 1 document"
        assert min_tf > 0, "min_tf should be at least 1"
        # run all texts through the spaCy pipeline, keeping only non-empty docs
        with tqdm(total=len(texts),
                  desc="documents scoring progress",
                  unit="docs") as pbar:
            for doc in self.nlp.pipe(texts, n_threads=-1):
                if len(doc) > 0:
                    documents.append(doc)
                pbar.update(1)

        # collect the noun phrase spans extracted for each parsed document
        corpus = []
        for doc in documents:
            spans = get_noun_phrases(doc)
            if len(spans) > 0:
                corpus.append((doc, spans))

        if len(corpus) < 1:
            return []

        documents, doc_phrases = list(zip(*corpus))
        # score the candidate phrases with TF-IDF, C-Value and frequency metrics
        scorer = TextSpanScoring(documents=documents,
                                 spans=doc_phrases,
                                 min_tf=min_tf)
        tfidf_scored_list = scorer.get_tfidf_scores()
        if len(tfidf_scored_list) < 1:
            return []
        cvalue_scored_list = scorer.get_cvalue_scores()
        freq_scored_list = scorer.get_freq_scores()

        if limit > 0:
            # truncate to the first `limit` phrases of the TF-IDF ranking,
            # keeping the matching C-Value and frequency scores
            tf = {tuple(k[0]): k[1] for k in tfidf_scored_list}
            cv = {tuple(k[0]): k[1] for k in cvalue_scored_list}
            fr = {tuple(k[0]): k[1] for k in freq_scored_list}
            tfidf_scored_list_limit = []
            cvalue_scored_list_limit = []
            freq_scored_list_limit = []
            for phrase in list(zip(*tfidf_scored_list))[0][:limit]:
                tfidf_scored_list_limit.append((phrase, tf[tuple(phrase)]))
                cvalue_scored_list_limit.append((phrase, cv[tuple(phrase)]))
                freq_scored_list_limit.append((phrase, fr[tuple(phrase)]))
            tfidf_scored_list = tfidf_scored_list_limit
            cvalue_scored_list = cvalue_scored_list_limit
            freq_scored_list = freq_scored_list_limit

        # rescale each metric before merging (frequency is min-max normalized and inverted)
        tfidf_scored_list = scorer.normalize_l2(tfidf_scored_list)
        cvalue_scored_list = scorer.normalize_l2(cvalue_scored_list)
        freq_scored_list = scorer.normalize_minmax(freq_scored_list,
                                                   invert=True)
        tfidf_scored_list = scorer.normalize_minmax(tfidf_scored_list)
        cvalue_scored_list = scorer.normalize_minmax(cvalue_scored_list)
        if return_all:
            # return (phrase forms, tfidf, cvalue, freq) per phrase without merging the metrics
            tf = {tuple(k[0]): k[1] for k in tfidf_scored_list}
            cv = {tuple(k[0]): k[1] for k in cvalue_scored_list}
            fr = {tuple(k[0]): k[1] for k in freq_scored_list}
            final_list = []
            for phrases in tf.keys():
                final_list.append(([p for p in phrases], tf[phrases],
                                   cv[phrases], fr[phrases]))
            return final_list
        # otherwise interpolate TF-IDF and C-Value equally, multiply by the
        # frequency score and rescale to produce a single merged score
        merged_list = scorer.interpolate_scores(
            [tfidf_scored_list, cvalue_scored_list], [0.5, 0.5])
        merged_list = scorer.multiply_scores([merged_list, freq_scored_list])
        merged_list = scorer.normalize_minmax(merged_list)
        final_list = []
        for phrases, score in merged_list:
            if any([len(p) > 1 for p in phrases]):
                final_list.append(([p for p in phrases], score))
        return final_list
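A short usage sketch for the scorer (assuming nlp_architect and the default spaCy English model are installed; the documents are made up and min_tf is lowered because the corpus is tiny):

# usage sketch with a made-up three-document corpus
scorer = NPScorer()
docs = ["Noun phrase chunking extracts candidate terms from raw text.",
        "Good word representations improve noun phrase extraction.",
        "Candidate terms are ranked by corpus-level statistics."]
for phrase_forms, score in scorer.score_documents(docs, limit=20, min_tf=1):
    # phrase_forms is the list of surface forms grouped for one phrase
    print(round(score, 3), phrase_forms)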
Example #5
    args = arg_parser.parse_args()
    if args.corpus.endswith('gz'):
        corpus_file = gzip.open(args.corpus,
                                'rt',
                                encoding='utf8',
                                errors='ignore')
    else:
        corpus_file = open(args.corpus, 'r', encoding='utf8', errors='ignore')

    with open(args.marked_corpus, 'w', encoding='utf8') as marked_corpus_file:
        # load spacy parser
        logger.info('loading spacy')
        if 'nlp_arch' in args.chunker:
            nlp = SpacyInstance(model='en_core_web_sm',
                                disable=['textcat', 'ner', 'parser']).parser
            nlp.add_pipe(nlp.create_pipe('sentencizer'), first=True)
            logger.info(
                'The pre-trained model to be downloaded for NLP Architect word'
                ' chunker model is licensed under Apache 2.0')
            _path_to_model = path.join(cur_dir, chunker_model_file)
            download_unlicensed_file(nlp_chunker_url, chunker_model_file,
                                     _path_to_model)
            _path_to_params = path.join(cur_dir, chunker_model_dat_file)
            download_unlicensed_file(nlp_chunker_url, chunker_model_dat_file,
                                     _path_to_params)
            logger.info('Done.')
            nlp.add_pipe(NPAnnotator.load(_path_to_model, _path_to_params),
                         last=True)
        else:
            nlp = SpacyInstance(model='en_core_web_sm',
                                disable=['textcat', 'ner']).parser
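The excerpt stops after the pipeline is constructed; a plausible continuation inside the same with block (a sketch under the assumption, not shown in the original, that the goal is to write out a corpus with each noun phrase joined into a single token) might look like:

        # sketch of a possible continuation (not the original script): join each
        # noun phrase's tokens with '_' and write the marked sentence back out
        for doc in nlp.pipe(line.strip() for line in corpus_file):
            spans = (get_noun_phrases(doc) if 'nlp_arch' in args.chunker
                     else list(doc.noun_chunks))
            marked_tokens = []
            i = 0
            while i < len(doc):
                span = next((s for s in spans if s.start == i), None)
                if span is not None:
                    marked_tokens.append('_'.join(t.text for t in span))
                    i = span.end
                else:
                    marked_tokens.append(doc[i].text)
                    i += 1
            marked_corpus_file.write(' '.join(marked_tokens) + '\n')
        corpus_file.close()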