def get_textacy_name_entities(text, article_id, drop_determiners=True, exclude_types='numeric'):
    '''Get Named Entities using textacy  ## NOT USED IN THE PROJECT
    text: full_text or summary
    article_id: string, article id (names of json files)
    Return a pd dataframe with two columns: named entities and entities category
    '''
    en = textacy.load_spacy_lang("en_core_web_sm", disable=("parser",))
    if isinstance(text, str):  # if raw string
        doc = textacy.make_spacy_doc(text, lang=en)
    elif isinstance(text, Doc):  # if pre-created spacy doc
        doc = text
    else:
        doc = textacy.make_spacy_doc("NA", lang=en)

    # nes is a generator
    nes = textacy.extract.entities(
        doc, drop_determiners=drop_determiners, exclude_types=exclude_types)
    ne_list = []
    ne_label_list = []
    for ne in nes:
        ne_list.append(ne)
        ne_label_list.append(ne.label_)

    data = pd.DataFrame(data={'text': ne_list, 'label': ne_label_list})
    data = data.drop_duplicates(keep='first')
    if article_id is not None:  # store article ID for csv
        data['article_id'] = article_id
    return data
def textacy_featurize(transcript):
    features = list()
    labels = list()

    # use spaCy doc; download the English model and retry if it is missing
    try:
        doc = textacy.make_spacy_doc(transcript)
    except OSError:
        os.system('python3 -m spacy download en')
        doc = textacy.make_spacy_doc(transcript)
    ts = textacy.TextStats(doc)

    uniquewords = ts.n_unique_words
    features.append(uniquewords)
    labels.append('uniquewords')

    mfeatures = ts.basic_counts
    features = features + list(mfeatures.values())
    labels = labels + list(mfeatures)

    kincaid = ts.flesch_kincaid_grade_level
    features.append(kincaid)
    labels.append('flesch_kincaid_grade_level')

    readability = ts.readability_stats
    features = features + list(readability.values())
    labels = labels + list(readability)

    return features, labels
def test_chunk_size(self, text, chunk_size, en_core_web_sm):
    doc_full = make_spacy_doc(text, en_core_web_sm)
    doc_chunked = make_spacy_doc(text, en_core_web_sm, chunk_size=chunk_size)
    assert isinstance(doc_chunked, spacy.tokens.Doc)
    assert len(doc_full.text) == len(doc_chunked.text)
    assert len(doc_full) == len(doc_chunked)
def p_complexity(args):
    correct_meta_labels = create_meta_labels(
        os.path.join(args.meta_outputs_dir, CORRECT_META_FILE))
    incorrect_meta_labels = create_meta_labels(
        os.path.join(args.meta_outputs_dir, INCORRECT_META_FILE))
    correct_outputs = torch.load(os.path.join(args.test_outputs_dir, CORRECT + LAYER_NAME),
                                 map_location='cpu')
    incorrect_outputs = torch.load(os.path.join(args.test_outputs_dir, INCORRECT + LAYER_NAME),
                                   map_location='cpu')

    correct_cmplx = []
    incorrect_cmplx = []
    for output in progressbar.progressbar(correct_outputs):
        psg = output['metadata'][0]['original_passage']
        psg = " ".join(w for w in nltk.wordpunct_tokenize(psg)
                       if w.lower() in WORDS or not w.isalpha())
        try:
            doc = textacy.make_spacy_doc(psg)
            ts = TextStats(doc)
            cur_cmplx = ts.readability_stats['flesch_kincaid_grade_level']
        except Exception:
            cur_cmplx = 0
        correct_cmplx.append(cur_cmplx)

    for output in progressbar.progressbar(incorrect_outputs):
        psg = output['metadata'][0]['original_passage']
        try:
            doc = textacy.make_spacy_doc(psg)
            ts = TextStats(doc)
            cur_cmplx = ts.readability_stats['flesch_kincaid_grade_level']
        except Exception:
            cur_cmplx = 0
        incorrect_cmplx.append(cur_cmplx)

    correct_cmplx_dict = {
        'Complexity': correct_cmplx,
        'Meta Prediction': correct_meta_labels,
        'Base Network Prediction': [1] * len(correct_cmplx)
    }
    incorrect_cmplx_dict = {
        'Complexity': incorrect_cmplx,
        'Meta Prediction': incorrect_meta_labels,
        'Base Network Prediction': [0] * len(incorrect_cmplx)
    }
    correct_cmplx_df = pd.DataFrame.from_dict(correct_cmplx_dict)
    incorrect_cmplx_df = pd.DataFrame.from_dict(incorrect_cmplx_dict)
    return correct_cmplx_df, incorrect_cmplx_df
def _exec_pipeline_for_sub_corpus(self, normalize_texts, batch_id, docs):
    # Internal function to enable multi-threaded pipeline execution
    sub_corpus = textacy.Corpus(self.nlp)
    for doc in docs:
        if doc['text']:
            if normalize_texts:
                spacy_doc = textacy.make_spacy_doc(
                    (normalize(self.language, doc['text']), {'celex': doc['celex']}),
                    self.nlp)
            else:
                # pass (text, metadata) as a single tuple, matching the branch above
                spacy_doc = textacy.make_spacy_doc(
                    (doc['text'], {'celex': doc['celex']}), self.nlp)
            sub_corpus.add_doc(spacy_doc)
    return sub_corpus
def get_sentiment(data, tags, sentence, verbose):
    if len(tags) > 0:
        ct = 0
        for title in data.titles:
            for tag in tags:
                if title == tag:
                    click.echo(tag)
                    content = data.documents[ct]
            ct += 1
        interview = Content(content)
        doc = textacy.make_spacy_doc(interview.doc)

        ## Sentiment
        s = Sentiment()
        if sentence is True:
            for sentence in doc.sents:
                if len(sentence) > 3:
                    sent = s.sentiment_analyzer_scores(sentence.text)
                    if verbose:
                        click.echo("{:-<40} {}\n".format(
                            sent["sentence"], str(sent["score"])))
            click.echo(s.sentiment())
        else:
            sent = s.sentiment_analyzer_scores(doc.text)
            if verbose:
                click.echo("{:-<40} {}\n".format(sent["sentence"],
                                                 str(sent["score"])))
            click.echo(s.sentiment())
        return s.sentiment()
    else:
        all_interviews = Content(data.content)
        doc = textacy.make_spacy_doc(all_interviews.doc)

        ## Sentiment
        s = Sentiment()
        if sentence is True:
            for sentence in doc.sents:
                if len(sentence) > 3:
                    sent = s.sentiment_analyzer_scores(sentence.text)
                    if verbose:
                        click.echo("{:-<40} {}\n".format(
                            sent["sentence"], str(sent["score"])))
            click.echo(s.sentiment())
        else:
            sent = s.sentiment_analyzer_scores(doc.text)
            if verbose:
                click.echo("{:-<40} {}\n".format(sent["sentence"],
                                                 str(sent["score"])))
            click.echo(s.sentiment())
        return s.sentiment()
def q_complexity(args):
    correct_meta_labels = create_meta_labels(
        os.path.join(args.meta_outputs_dir, CORRECT_META_FILE))
    incorrect_meta_labels = create_meta_labels(
        os.path.join(args.meta_outputs_dir, INCORRECT_META_FILE))
    correct_outputs = torch.load(os.path.join(args.test_outputs_dir, CORRECT + LAYER_NAME),
                                 map_location='cpu')
    incorrect_outputs = torch.load(os.path.join(args.test_outputs_dir, INCORRECT + LAYER_NAME),
                                   map_location='cpu')

    correct_cmplx = []
    incorrect_cmplx = []
    for output in progressbar.progressbar(correct_outputs):
        q = ' '.join(output['metadata'][0]['question_tokens'])
        try:
            doc = textacy.make_spacy_doc(q)
            ts = TextStats(doc)
            cur_cmplx = ts.readability_stats['flesch_kincaid_grade_level']
        except Exception:
            cur_cmplx = 0
        correct_cmplx.append(cur_cmplx)

    for output in progressbar.progressbar(incorrect_outputs):
        q = ' '.join(output['metadata'][0]['question_tokens'])
        try:
            doc = textacy.make_spacy_doc(q)
            ts = TextStats(doc)
            cur_cmplx = ts.readability_stats['flesch_kincaid_grade_level']
        except Exception:
            cur_cmplx = 0
        incorrect_cmplx.append(cur_cmplx)

    correct_cmplx_dict = {
        'Complexity': correct_cmplx,
        'Meta Prediction': correct_meta_labels,
        'Base Network Prediction': [1] * len(correct_cmplx)
    }
    incorrect_cmplx_dict = {
        'Complexity': incorrect_cmplx,
        'Meta Prediction': incorrect_meta_labels,
        'Base Network Prediction': [0] * len(incorrect_cmplx)
    }
    correct_cmplx_df = pd.DataFrame.from_dict(correct_cmplx_dict)
    incorrect_cmplx_df = pd.DataFrame.from_dict(incorrect_cmplx_dict)
    return correct_cmplx_df, incorrect_cmplx_df
def analyzeSubLevel(input_text: str):
    """
    :Returns: highest CEFR of word in input_text, flesh_kincade_level, number of words
    """
    if not isinstance(input_text, str) or len(input_text) <= 0:
        return ""

    if this.cefr_data is None:
        this.cefr_data = loadCefrList()  # TBD make static
    if this.spacy_en is None:
        this.spacy_en = loadSpacyLangEn()

    # normalize text with NLP
    input_text = processText(input_text)
    doc = textacy.make_spacy_doc(input_text, lang=this.spacy_en)
    ts = textacy.TextStats(doc)
    flesh_kincade_level = calcFleshKincadeGrade(ts.n_words, ts.n_syllables / ts.n_words)

    # store words of text lowercase in list
    words: list = [item.lower() for item in input_text.split()]
    max_level = getMaxWordLevelForWordsSet(set(words), this.cefr_data)
    return max_level, flesh_kincade_level, ts.n_words
def test_empty_stats():
    text = ""
    doc = make_spacy_doc(text, lang=SPACY_MODEL)
    stats = nlp.compute_stats(doc)
    assert stats.counts.sentences == 0
    assert stats.counts.words == 0
    assert stats.readibility is None
def get_flesch(text):
    doc = textacy.make_spacy_doc(text, lang=en)
    ts = TextStats(doc)
    try:
        return ts.flesch_kincaid_grade_level
    except ZeroDivisionError:
        return (11.8 * ts.n_syllables) + (0.39 * ts.n_words) - 15.59
def txt_to_df(txt_lst, captions_clm_name="captions"):
    """
    Transform a list of texts to a df with some stats to reshape the number of captions
    """
    # Load the language model for textacy
    en = en_core_web_sm.load()

    captions_lst = list()
    for txt in txt_lst:
        doc = textacy.make_spacy_doc(txt, lang=en)
        ts = textacy.text_stats.TextStats(doc)
        df = pd.DataFrame({
            "n_chars": [ts.n_chars],
            captions_clm_name: doc.text
        })
        captions_lst.append(df)

    # Concat all df's into captions df for easy sorting and manipulation
    captions_df = pd.concat(captions_lst, ignore_index=True)
    captions_df['sent_order'] = captions_df.index
    return captions_df
def augment_document(self, doc):
    try:
        doc = textacy.make_spacy_doc(doc, lang="en_core_web_sm")
        doc = self.augmenter.apply_transforms(doc)
        return str(doc)
    except Exception:
        # fall back to the original document if augmentation fails
        return str(doc)
def main(text, dmodels, snormalize='lemma', sngrams=(1, 2, 3, 4, 5, 6),
         sinclude_pos=('NOUN', 'PROPN', 'ADJ'), swindow_size=1500, stopn=1.,
         sidf=None, verbose=False):
    # identify language
    language = textacy.lang_utils.identify_lang(text)
    if verbose:
        print('[info] language = "%s"' % language)

    # load language model
    nlp = textacy.load_spacy_lang(dmodels[language], disable=("parser",))

    # create document
    doc = textacy.make_spacy_doc(text, lang=nlp)

    # model launch
    keywords = textacy.ke.sgrank(
        doc,
        normalize=snormalize,
        # normalize=None,
        # normalize='lower',
        ngrams=sngrams,
        include_pos=sinclude_pos,
        window_size=swindow_size,
        topn=stopn,
        idf=sidf)

    return keywords
def analyze_post(post, debug=False):
    "Perform NLP analysis"
    counters = PerfCounters()
    nlp = create_objdict()

    # clean fields
    counters.start('cleanup')
    clean_fields = generate_clean_fields(post)
    nlp.clean_fields = clean_fields
    counters.stop('cleanup')

    # creating spacy docs
    counters.start('make_spacy_docs')
    all_cleaned_content = ' '.join([clean_fields.title, clean_fields.category,
                                    " ".join(clean_fields.tags),
                                    clean_fields.abstract, clean_fields.text])

    # overall terms
    cleaned_doc = make_spacy_doc(all_cleaned_content, lang=SPACY_MODEL)

    # title terms
    title_doc = make_spacy_doc(clean_fields.title, lang=SPACY_MODEL)

    # for statistics
    text_doc = make_spacy_doc(post.text, lang=SPACY_MODEL)
    counters.stop('make_spacy_docs')

    # terms extraction
    counters.start('extract_key_terms')
    nlp.terms = extract_key_terms(cleaned_doc, num_terms=NUM_TERMS,
                                  algo=TERM_EXTRACTOR_ALGO, ngrams=NGRAMS)

    # note: we restrict ngrams to one as we only want the lemmatized top terms.
    nlp.title_terms = extract_key_terms(title_doc, num_terms=NUM_TERMS,
                                        algo=TERM_EXTRACTOR_ALGO, ngrams=1)
    counters.stop('extract_key_terms')

    # text stats
    counters.start('text_stats')
    nlp.stats = compute_stats(text_doc)
    counters.stop('text_stats')

    if debug:
        counters.report()

    return nlp
def spacy_doc():
    text = (
        "Democrats might know that they stand against Trump's policies, but coming up "
        "with their own plan is harder than you think. "
        "For a long time, the party's top echelon has been captive to free trade orthodoxy. "
        "Since Bill Clinton, the theory of the case among the Democratic Party's elite has "
        "been the more globalization, the better — with mostly a deaf ear turned to the "
        "people and places most badly affected. "
        "Worse, their response to globalization's excesses has been: "
        "Here's a new trade deal, much better than the last one.")
    return make_spacy_doc(text, lang="en")
def get_noun_phrases(text, nlp):
    doc = textacy.make_spacy_doc(text, lang='en_core_web_sm')
    # print([chunk.text for chunk in doc.noun_chunks])
    noun_phrase = [chunk.text for chunk in doc.noun_chunks]  # noun phrase
    single_noun = [word.text for word in doc
                   if (word.pos_ in ["NOUN", "ADJ", "ADV"]
                       and not word.is_stop)]  # single NOUN, ADJ, ADV
    all_noun_phrases = list(set(noun_phrase + single_noun))
    return all_noun_phrases
def detect_verb_phrases(sentence, return_as_string: bool = True):
    pattern = r"(<VERB>?<ADV>*<VERB>+)"
    doc = make_spacy_doc(data=sentence, lang="en_core_web_sm")
    verb_phrases = pos_regex_matches(doc=doc, pattern=pattern)
    if return_as_string:
        return " ".join([c.text for c in verb_phrases])
    return verb_phrases
def test_single_sentence_doc():
    doc = textacy.make_spacy_doc(
        "This is a document with a single sentence.",
        lang="en_core_web_sm",
    )
    result = kt.scake(doc)
    assert isinstance(result, list)
    assert len(result) > 0
def make_corpus(df: pd.DataFrame, col_name: str,
                min_token_count: int) -> textacy.Corpus:
    spacy_records = df[col_name].apply(
        lambda x: textacy.make_spacy_doc(x, lang="en"))
    long_records = [
        record for record in spacy_records if len(record) >= min_token_count
    ]
    corpus = textacy.Corpus("en", data=list(long_records))
    return corpus
def ts():
    text = """
    Mr. Speaker, 480,000 Federal employees are working without pay, a form of involuntary servitude;
    280,000 Federal employees are not working, and they will be paid. Virtually all of these workers
    have mortgages to pay, children to feed, and financial obligations to meet.
    Mr. Speaker, what is happening to these workers is immoral, is wrong, and must be rectified immediately.
    Newt Gingrich and the Republican leadership must not continue to hold the House and the American people
    hostage while they push their disastrous 7-year balanced budget plan.
    The gentleman from Georgia, Mr. Gingrich, and the Republican leadership must join Senator Dole and the
    entire Senate and pass a continuing resolution now, now to reopen Government.
    Mr. Speaker, that is what the American people want, that is what they need, and that is what this body must do.
    """.strip()
    doc = make_spacy_doc(text, lang="en")
    ts_ = text_stats.TextStats(doc)
    return ts_
def test_terms():
    text = "the quick fox and the cat. The turtle and the rabbit."
    doc = make_spacy_doc(text, lang=SPACY_MODEL)
    terms = nlp.extract_key_terms(doc, num_terms=5)
    terms = [t[0] for t in terms]  # remove scores
    assert 'fox' in terms
    assert 'cat' in terms
    assert 'turtle' in terms
    assert 'rabbit' in terms
def ts_en():
    text = (
        "Many years later, as he faced the firing squad, Colonel Aureliano Buendía was "
        "to remember that distant afternoon when his father took him to discover ice. "
        "At that time Macondo was a village of twenty adobe houses, built on the bank "
        "of a river of clear water that ran along a bed of polished stones, which were "
        "white and enormous, like prehistoric eggs. The world was so recent that many "
        "things lacked names, and in order to indicate them it was necessary to point."
    )
    return textacy.TextStats(textacy.make_spacy_doc(text, lang="en"))
def generate_categories(data, tags, num):
    q = Qrmine()
    if len(tags) > 0:
        ct = 0
        for title in data.titles:
            for tag in tags:
                if title == tag:
                    click.echo(tag)
                    content = data.documents[ct]
            ct += 1
        interview = Content(content)
        doc = textacy.make_spacy_doc(interview.doc)
        return q.print_categories(doc, num)
    else:
        all_interviews = Content(data.content)
        doc = textacy.make_spacy_doc(all_interviews.doc)
        return q.print_categories(doc, num)
def test_two_term_behavior():
    """Case when there are fewer than 3 words and the rank algo can't be used."""
    text = "search page"
    doc = make_spacy_doc(text, lang=SPACY_MODEL)
    terms = nlp.extract_key_terms(doc, num_terms=5)
    assert 'search' == terms[0][0]
    assert 0.5 == terms[0][1]
    assert 'page' == terms[1][0]
    assert 0.5 == terms[1][1]
def ts_es():
    text = (
        "Muchos años después, frente al pelotón de fusilamiento, el coronel Aureliano "
        "Buendía había de recordar aquella tarde remota en que su padre lo llevó a "
        "conocer el hielo. Macondo era entonces una aldea de veinte casas de barro y "
        "cañabrava construidas a la orilla de un río de aguas diáfanas que se precipitaban "
        "por un lecho de piedras pulidas, blancas y enormes como huevos prehistóricos. "
        "El mundo era tan reciente, que muchas cosas carecían de nombre, y para "
        "mencionarlas había que señalarlas con el dedo.")
    return textacy.TextStats(textacy.make_spacy_doc(text, lang="es"))
def preText(text, pos_bow, neg_bow):
    # parameter:
    #   text: takes a sentence, string
    #   pos_bow: positive bag of words, list
    #   neg_bow: negative bag of words, list
    # return:
    #   score_word_sim: similarity score for all verbs, float
    #   score_bow: score for bag of words implementation, float

    # recognize verb pattern
    pattern = [{"POS": "VERB", "OP": "*"},
               {"POS": "ADV", "OP": "*"},
               {"POS": "VERB", "OP": "+"},
               {"POS": "PART", "OP": "*"}]

    # extract verb pattern
    doc = textacy.make_spacy_doc(text, lang='en_core_web_lg')
    verbs = textacy.extract.matches(doc, pattern)

    score_word_sim = 0.0
    score_bow = 0.0
    for verb in verbs:
        # singularize verb, e.g. "likes" to "like"
        singularized_verb = singularize(verb.text)
        score_word_sim += wordSimilarity(pos_bow, neg_bow, singularized_verb)
        # apply bag of words to the singularized verb
        score_bow += pos_bow.count(str(singularized_verb))
        score_bow -= neg_bow.count(str(singularized_verb))

    # aggregate all verb similarity
    if score_word_sim > 0.5:
        score_word_sim = 1.0
    elif score_word_sim < -0.5:
        score_word_sim = -1.0
    else:
        score_word_sim = 0.0

    # aggregate the count with bag of words
    if score_bow > 0.5:
        score_bow = 1.0
    elif score_bow < -0.5:
        score_bow = -1.0
    else:
        score_bow = 0.0

    return score_word_sim, score_bow
def get_nouns(sent):
    about_talk_text = get_useCase(sent)[0]
    about_talk_doc = textacy.make_spacy_doc(about_talk_text, lang='en_core_web_sm')

    # Extract noun phrases to explain what nouns are involved
    chunks = []
    for chunk in about_talk_doc.noun_chunks:
        print(chunk)
        chunks.append(chunk)
    # print(chunks)
    return chunks
def test_invalid_data(self):
    invalid_contents = [
        b"This is an English sentence in bytes.",
        {"content": "This is an English sentence as dict value."},
        True,
    ]
    for invalid_content in invalid_contents:
        with pytest.raises(TypeError):
            _ = make_spacy_doc(invalid_content)
def remove_det(phrase, nlp):
    pattern = [
        {'POS': 'DET', 'OP': '+'},
        {'POS': 'ADJ', 'OP': '*'},
        {'POS': 'NOUN', 'OP': '+'}
    ]
    doc = textacy.make_spacy_doc(phrase, lang='en_core_web_sm')
    # if phrase.startswith("an ") or phrase.startswith("a ") or phrase.startswith("the "):
    matches = [phs.text for phs in textacy.extract.matches(doc, pattern)]
    if len(matches) != 0 and phrase.find(matches[0]) == 0:
        # matched phrase should be at the beginning
        return phrase.split(" ", 1)[1]
    return phrase
def doc():
    lang = textacy.load_spacy_lang("en_core_web_sm")
    text = (
        "Many years later, as he faced the firing squad, Colonel Aureliano Buendía was "
        "to remember that distant afternoon when his father took him to discover ice. "
        "At that time Macondo was a village of twenty adobe houses, built on the bank "
        "of a river of clear water that ran along a bed of polished stones, which were "
        "white and enormous, like prehistoric eggs. The world was so recent that many "
        "things lacked names, and in order to indicate them it was necessary to point."
    )
    meta = {"author": "Gabriel García Márquez", "title": "Cien años de soledad"}
    return textacy.make_spacy_doc((text, meta), lang=lang)