import logging

from tqdm import tqdm

from nlp_architect.pipelines.spacy_np_annotator import NPAnnotator, get_noun_phrases
from nlp_architect.utils.text import SpacyInstance

logger = logging.getLogger(__name__)

# TextSpanScoring and get_group_norm are used below but defined elsewhere in the
# library; their exact import paths are not shown in this excerpt.


def test_np_annotator_linked(model_path, settings_path, text, phrases):
    annotator = SpacyInstance(model="en", disable=["textcat", "ner", "parser"]).parser
    annotator.add_pipe(annotator.create_pipe("sentencizer"), first=True)
    annotator.add_pipe(NPAnnotator.load(model_path, settings_path), last=True)
    doc = annotator(text)
    noun_phrases = [p.text for p in get_noun_phrases(doc)]
    for p in phrases:
        assert p in noun_phrases
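
# A hedged invocation sketch for the test above; the model/settings paths and the
# expected phrases are hypothetical and depend entirely on the trained NP chunker.
test_np_annotator_linked("chunker_model.h5", "chunker_settings.json",
                         "The quick brown fox jumped over the lazy dog.",
                         ["The quick brown fox", "the lazy dog"])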
def extract_noun_phrases(docs, nlp_parser, chunker):
    logger.info('extract nps from: %s', docs)
    spans = []
    for doc in nlp_parser.pipe(docs, n_threads=-1):
        if 'nlp_arch' in chunker:
            spans.extend(get_noun_phrases(doc))
        else:
            spans.extend(list(doc.noun_chunks))
    logger.info('nps= %s', str(spans))
    return spans
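
# Minimal usage sketch for extract_noun_phrases, assuming an NP-annotated
# pipeline built the same way as in the test above; 'chunker_model.h5' and
# 'chunker_settings.json' are hypothetical paths, not files shipped with the library.
nlp = SpacyInstance(model='en', disable=['textcat', 'ner', 'parser']).parser
nlp.add_pipe(nlp.create_pipe('sentencizer'), first=True)
nlp.add_pipe(NPAnnotator.load('chunker_model.h5', 'chunker_settings.json'), last=True)
noun_phrase_spans = extract_noun_phrases(
    ['The quick brown fox jumped over the lazy dog.'], nlp, chunker='nlp_arch')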
def mark_noun_phrases(corpus_file, marked_corpus_file, nlp_parser, lines_count, chunker,
                      mark_char='_', grouping=False):
    with tqdm(total=lines_count) as pbar:
        for doc in nlp_parser.pipe(corpus_file, n_threads=-1):
            if 'nlp_arch' in chunker:
                spans = get_noun_phrases(doc)
            else:
                spans = list(doc.noun_chunks)
            span = spans.pop(0) if len(spans) > 0 else None
            span_written = False
            for token in doc:
                if span is None:
                    if len(token.text.strip()) > 0:
                        marked_corpus_file.write(token.text + ' ')
                elif token.idx < span.start_char or token.idx >= span.end_char:
                    # token falls outside the current span
                    if len(token.text.strip()) > 0:
                        marked_corpus_file.write(token.text + ' ')
                else:
                    if not span_written:
                        # mark multi-character, non-pronoun NPs by joining their
                        # tokens with mark_char
                        if len(span.text) > 1 and span.lemma_ != '-PRON-':
                            text = get_group_norm(span) if grouping else span.text
                            text = text.replace(' ', mark_char) + mark_char
                            marked_corpus_file.write(text + ' ')
                        else:
                            marked_corpus_file.write(span.text + ' ')
                        span_written = True
                    if token.idx + len(token.text) == span.end_char:
                        # the span ends on this token; advance to the next span
                        span = spans.pop(0) if len(spans) > 0 else None
                        span_written = False
            marked_corpus_file.write('\n')
            pbar.update(1)
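
# Sketch of marking a corpus file with the pipeline built above; the file names
# are hypothetical. Multi-word noun phrases come out joined by the mark
# character, e.g. 'the_quick_brown_fox_'.
with open('corpus.txt', encoding='utf-8') as corpus_file, \
        open('corpus_marked.txt', 'w', encoding='utf-8') as marked_corpus_file:
    num_lines = sum(1 for _ in corpus_file)
    corpus_file.seek(0)
    mark_noun_phrases(corpus_file, marked_corpus_file, nlp, num_lines,
                      chunker='nlp_arch', mark_char='_')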
def score_documents(self, texts: list, limit=-1, return_all=False, min_tf=5):
    assert len(texts) > 0, 'texts should contain at least 1 document'
    assert min_tf > 0, 'min_tf should be at least 1'
    documents = []
    with tqdm(total=len(texts), desc='documents scoring progress', unit='docs') as pbar:
        for doc in self.nlp.pipe(texts, n_threads=-1):
            if len(doc) > 0:
                documents.append(doc)
            pbar.update(1)
    # keep only documents that yielded at least one noun phrase
    corpus = []
    for doc in documents:
        spans = get_noun_phrases(doc)
        if len(spans) > 0:
            corpus.append((doc, spans))
    if len(corpus) < 1:
        return []
    documents, doc_phrases = list(zip(*corpus))
    scorer = TextSpanScoring(documents=documents, spans=doc_phrases, min_tf=min_tf)
    tfidf_scored_list = scorer.get_tfidf_scores()
    if len(tfidf_scored_list) < 1:
        return []
    cvalue_scored_list = scorer.get_cvalue_scores()
    freq_scored_list = scorer.get_freq_scores()
    if limit > 0:
        # truncate all three lists to the top `limit` phrases of the tf-idf ranking
        tf = {tuple(k[0]): k[1] for k in tfidf_scored_list}
        cv = {tuple(k[0]): k[1] for k in cvalue_scored_list}
        fr = {tuple(k[0]): k[1] for k in freq_scored_list}
        tfidf_scored_list_limit = []
        cvalue_scored_list_limit = []
        freq_scored_list_limit = []
        for phrase in list(zip(*tfidf_scored_list))[0][:limit]:
            tfidf_scored_list_limit.append((phrase, tf[tuple(phrase)]))
            cvalue_scored_list_limit.append((phrase, cv[tuple(phrase)]))
            freq_scored_list_limit.append((phrase, fr[tuple(phrase)]))
        tfidf_scored_list = tfidf_scored_list_limit
        cvalue_scored_list = cvalue_scored_list_limit
        freq_scored_list = freq_scored_list_limit
    # normalize: L2 then min-max for tf-idf and c-value; inverted min-max for frequency
    tfidf_scored_list = scorer.normalize_l2(tfidf_scored_list)
    cvalue_scored_list = scorer.normalize_l2(cvalue_scored_list)
    freq_scored_list = scorer.normalize_minmax(freq_scored_list, invert=True)
    tfidf_scored_list = scorer.normalize_minmax(tfidf_scored_list)
    cvalue_scored_list = scorer.normalize_minmax(cvalue_scored_list)
    if return_all:
        # return each phrase group with all three normalized scores
        tf = {tuple(k[0]): k[1] for k in tfidf_scored_list}
        cv = {tuple(k[0]): k[1] for k in cvalue_scored_list}
        fr = {tuple(k[0]): k[1] for k in freq_scored_list}
        final_list = []
        for phrases in tf.keys():
            final_list.append(([p for p in phrases], tf[phrases], cv[phrases], fr[phrases]))
        return final_list
    # interpolate tf-idf with c-value, weight by inverted frequency, re-normalize
    merged_list = scorer.interpolate_scores(
        [tfidf_scored_list, cvalue_scored_list], [0.5, 0.5])
    merged_list = scorer.multiply_scores([merged_list, freq_scored_list])
    merged_list = scorer.normalize_minmax(merged_list)
    final_list = []
    for phrases, score in merged_list:
        if any(len(p) > 1 for p in phrases):
            final_list.append(([p for p in phrases], score))
    return final_list
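
# Sketch of scoring a small collection; `np_scorer` stands in for an instance of
# the class that owns score_documents (the class name is not shown in this
# excerpt). min_tf=1 keeps rare phrases, which a two-document example needs.
texts = ['Deep learning models need large data sets.',
         'Large data sets help deep learning models generalize.']
for phrase_group, score in np_scorer.score_documents(texts, limit=10, min_tf=1):
    print(phrase_group, score)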
# excerpt from the corpus-preparation script: build the spaCy pipeline for the
# selected chunker, count the corpus lines, then mark the noun phrases
if 'nlp_arch' in args.chunker:
    nlp = SpacyInstance(model='en_core_web_sm',
                        disable=['textcat', 'ner', 'parser']).parser
    nlp.add_pipe(nlp.create_pipe('sentencizer'), first=True)
    nlp.add_pipe(NPAnnotator.load(_path_to_model, _path_to_params), last=True)
else:
    nlp = SpacyInstance(model='en_core_web_sm', disable=['textcat', 'ner']).parser
logger.info('spacy loaded')
num_lines = sum(1 for line in corpus_file)
corpus_file.seek(0)
logger.info('%i lines in corpus', num_lines)
# the token-marking loop is identical to mark_noun_phrases above, so delegate
# to it instead of keeping a second copy of the logic
mark_noun_phrases(corpus_file, marked_corpus_file, nlp, num_lines, args.chunker)