def preprocess(self, lowercase=False, no_punct=False, no_urls=False, no_stop_words=False):
    """Preprocess text for matching."""
    # Break into lines and remove leading and trailing space on each
    lines = (line.strip() for line in self.text.splitlines())
    # Break multi-headlines (separated by runs of spaces) into a line each
    chunks = (phrase.strip() for line in lines for phrase in line.split('  '))
    # Drop blank lines
    self.text = '\n'.join(chunk for chunk in chunks if chunk)
    # Handle lowercase, no_punct and no_urls
    self.text = textacy.preprocess_text(self.text,
                                        lowercase=lowercase,
                                        no_punct=no_punct,
                                        no_urls=no_urls)
    # Handle stop words
    if no_stop_words:
        stop_words = set(stopwords.words('english'))
        word_tokens = nltk.word_tokenize(self.text)
        filtered_sentence = [w for w in word_tokens if w not in stop_words]
        self.text = ' '.join(filtered_sentence)
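# Hedged usage sketch (not from the original source): `preprocess` above is an instance
# method, so this attaches it to a minimal, invented TextHolder class. It assumes the
# module has imported textacy (< 0.8, where preprocess_text exists), nltk, and
# nltk.corpus.stopwords, with the 'punkt' and 'stopwords' data downloaded.
class TextHolder(object):
    def __init__(self, text):
        self.text = text

TextHolder.preprocess = preprocess  # reuse the method defined above

holder = TextHolder('Visit https://example.com today!\nThe  quick   brown fox.')
holder.preprocess(lowercase=True, no_punct=True, no_urls=True, no_stop_words=True)
print(holder.text)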
def retrieve_paragraphs():
    import textacy

    logging.info('Obtaining paragraphs')
    db = CederMT()
    training_collection = db.SynthesisTypeFilteringData
    syn_20170926 = db.syn_20170926

    paragraphs = list(
        training_collection.find(
            {'is_training_data': True, 'human_validated': True},
            {'_id': 0, 'time_created': 0}))

    for paragraph in tqdm(paragraphs):
        paper = syn_20170926.find_one(
            {'doi': paragraph['doi']},
            {'_id': 1, 'paragraphs': {'$slice': [paragraph['paragraph_id'], 1]}})
        text = paper['paragraphs'][0]['text']
        paragraph['text'] = textacy.preprocess_text(text, fix_unicode=True)

    return paragraphs
def prepare_text(document):
    """
    textacy / spaCy wrapper that cleans up text in one go:
      - lowercases
      - removes numbers
      - removes punctuation
      - undoes contractions
      - removes accents
      - fixes garbled unicode
      - removes currency symbols
      - also string-replaces '\n' with ''
    """
    text_processing = textacy.preprocess_text(
        nlp(document).text.replace('-', ' ').replace('\n', ''),
        fix_unicode=True,
        lowercase=True,
        transliterate=False,
        no_urls=False,
        no_emails=False,
        no_phone_numbers=False,
        no_numbers=True,
        no_currency_symbols=True,
        no_punct=True,
        no_contractions=True,
        no_accents=True
    )
    prepared_text = nlp(text_processing)
    print('cleaning text...')
    return prepared_text
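# Hedged usage sketch (not from the original source): assumes `nlp` is a loaded spaCy
# English pipeline at module level and a textacy version that still ships
# preprocess_text. The sample sentence is invented.
sample = "The U.S. GDP grew 2.3% in 2017 -- isn't that remarkable?\n"
doc = prepare_text(sample)
print([token.text for token in doc])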
def build_comp_termlist():
    import os

    spacy_lang = en_core_web_sm.load()
    texts = []
    for kag_path in glob.glob(KAG_BASE_PATH + '/*'):
        for comp_path in glob.glob(kag_path + '/*'):
            _, comp_name = os.path.split(comp_path)
            # print('=Handling competence ' + comp_name)
            comp_text = ''
            for filename in glob.glob(comp_path + '/*.txt'):
                clean_text = preprocess_text(
                    open(filename, 'r').read().decode('utf-8'),
                    no_urls=True, no_emails=True, no_phone_numbers=True,
                    no_numbers=True, no_currency_symbols=True, no_punct=True,
                    no_contractions=True, no_accents=True)
                comp_text = ' '.join(text for text in (comp_text, clean_text))
                # comp_text += open(filename, 'r').read().decode('utf-8')
            doc = textacy.Doc(comp_text, lang=spacy_lang)
            texts.append(
                doc.to_terms_list(named_entities=False, as_strings=True))
    return texts
def test_plaintext_functionality(self):
    expected_1 = 'mr president i ask to have printed in the record copies of some of the'
    observed_1 = textacy.preprocess_text(
        self.text, lowercase=True, no_punct=True)[:70]
    expected_2 = [
        ('ed States of America is an amazing ', 'nation', ' that continues to lead the world t'),
        ('come the role model for developing ', 'nation', 's attempting to give their people t'),
        ('ve before to better ourselves as a ', 'nation', ', because what we change will set a'),
        ('nd education. Fortunately, we as a ', 'nation', ' have the opportunity to fix the in'),
        (' sentences. Judges from across the ', 'nation', ' have said for decades that they do'),
        ('reopened many racial wounds in our ', 'nation', '. The war on drugs also put addicts')]
    observed_2 = list(textacy.text_utils.keyword_in_context(
        self.text, 'nation', window_width=35, print_only=False))
    self.assertEqual(observed_1, expected_1)
    self.assertEqual(observed_2, expected_2)
def preprocess_text(from_article):
    return textacy.preprocess_text(
        from_article.get('title', '') + '. ' + from_article.get('abstract', ''),
        lowercase=True,
        transliterate=True,
        no_punct=True,
        no_numbers=True)
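# Hedged usage sketch (not from the original source): assumes textacy (< 0.8) is imported
# in this module; the article dict below is invented and mirrors the keys the function
# expects ('title' and 'abstract').
article = {
    'title': 'Graphene-based supercapacitors',
    'abstract': 'We report a 2x increase in capacitance over prior designs.',
}
print(preprocess_text(article))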
def preprocess(self, lowercase=False, no_punct=False, no_urls=False, no_stop_words=False):
    """Preprocess text for matching."""
    # Break into lines and remove leading and trailing space on each
    lines = (line.strip() for line in self.text.splitlines())
    # Break multi-headlines (separated by runs of spaces) into a line each
    chunks = (phrase.strip() for line in lines for phrase in line.split('  '))
    # Drop blank lines
    self.text = '\n'.join(chunk for chunk in chunks if chunk)
    # Handle lowercase, no_punct and no_urls
    self.text = textacy.preprocess_text(self.text,
                                        lowercase=lowercase,
                                        no_punct=no_punct,
                                        no_urls=no_urls)
    # Handle stop words
    if no_stop_words:
        self.text = self._delete_stop_words(self.text)
def soft_preprocess(df):
    df.iloc[:, 1] = df.iloc[:, 1].swifter.apply(ftfy.fix_text)
    df.iloc[:, 1] = df.iloc[:, 1].swifter.apply(clean_text)
    df.iloc[:, 1] = df.iloc[:, 1].swifter.apply(
        lambda x: x.replace('"', "'").replace("\n", " "))
    df.iloc[:, 1] = df.iloc[:, 1].swifter.apply(
        lambda text: textacy.preprocess_text(
            text, no_currency_symbols=True, no_urls=True, no_emails=True,
            no_phone_numbers=True, no_numbers=True))
    df.iloc[:, 1] = df.iloc[:, 1].swifter.apply(fix_contractions)
    return df
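# Hedged usage sketch (not from the original source): assumes swifter, ftfy and textacy
# (< 0.8) are installed and that clean_text / fix_contractions are importable from the
# same module. The one-row DataFrame is invented; column 1 holds the raw text.
import pandas as pd

sample_df = pd.DataFrame({'id': [1],
                          'text': ['Call +1 555-0100 or visit https://example.com, it costs $5!\n']})
print(soft_preprocess(sample_df).iloc[0, 1])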
def extract_keyphrases_from_text(text, spacy_en):
    cleaned = preprocess_text(text, fix_unicode=True, lowercase=True, transliterate=True)
    noun_phrases = [np.text for np in spacy_en(cleaned).noun_chunks]
    # Remove phrases that are too short, lemmatize, etc.
    cleankeys = regularise_keys(noun_phrases)
    return cleankeys
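# Hedged usage sketch (not from the original source): assumes preprocess_text is imported
# from an older textacy, spaCy's small English model is installed, and regularise_keys is
# defined in the same module.
import spacy

spacy_en = spacy.load('en_core_web_sm')
print(extract_keyphrases_from_text('Convolutional neural networks excel at image classification.',
                                   spacy_en))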
def hard_preprocess(df):
    df.iloc[:, 1] = df.iloc[:, 1].swifter.apply(ftfy.fix_text)
    df.iloc[:, 1] = df.iloc[:, 1].apply(fix_contractions)
    df.iloc[:, 1] = df.iloc[:, 1].swifter.apply(clean_text)
    df.iloc[:, 1] = df.iloc[:, 1].swifter.apply(
        lambda x: x.replace('"', '').replace("\n", " ").replace("\\", ""))
    df.iloc[:, 1] = df.iloc[:, 1].swifter.apply(
        lambda text: textacy.preprocess_text(
            text, no_currency_symbols=True, no_urls=True, no_emails=True,
            no_phone_numbers=True, no_numbers=True))
    df.iloc[:, 1] = df.iloc[:, 1].swifter.apply(
        lambda text: " ".join(
            [word for word in simple_preprocess(text) if word not in stop_words]).strip())
    return df
def read_corpus(corpus_path):
    # spacy_lang = data.load_spacy('en')
    spacy_lang = en_core_web_sm.load()
    corpus = textacy.corpus.Corpus(u'en')
    for filename in glob.glob(corpus_path + '/*.txt'):
        content = open(filename, 'r').read().decode('utf-8')
        spacy_doc = spacy_lang(preprocess_text(content), parse=False)
        corpus.add_doc(spacy_doc)
    return corpus
def preprocess_text_settings(string_in):
    string_in = preprocess_text(string_in,
                                fix_unicode=True,
                                lowercase=True,
                                no_urls=True,
                                no_emails=True,
                                no_numbers=True,
                                no_accents=True,
                                no_punct=True)
    return string_in
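# Hedged usage sketch (not from the original source): assumes preprocess_text is imported
# from an older textacy; the exact placeholder tokens that replace URLs, e-mails and
# numbers depend on the textacy version in use.
print(preprocess_text_settings('Émail me at jane@example.com or read https://example.com (42 pages).'))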
def get_textacy_doc(text):
    """
    Builds a textacy document for the given text.

    :param text: Text for which to build the textacy doc
    :return: tuple of (textacy doc, processed text)
    """
    en = textacy.load_spacy_lang(NLPService._WORD_MODEL_NAME, disable=('parser',))
    processed_text = textacy.preprocess_text(text, lowercase=True, no_punct=True)
    return textacy.make_spacy_doc(processed_text, lang=en), processed_text
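# Hedged usage sketch (not from the original source): assumes NLPService._WORD_MODEL_NAME
# names an installed spaCy model and a textacy version in which load_spacy_lang,
# make_spacy_doc and the older preprocess_text all coexist.
doc, processed = get_textacy_doc('Natural Language Processing, explained in 2 minutes!')
print(processed)
print(len(doc))  # number of tokens in the resulting spaCy Doc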
def process_content(self):
    if self._content is not None:
        ct = 0
        for document in self._content.documents:
            metadata = {}
            try:
                metadata['title'] = self._content.titles[ct]
            except IndexError:
                metadata['title'] = 'Empty'
            self._corpus.add_text(
                textacy.preprocess_text(document, lowercase=True, no_punct=True, no_numbers=True),
                metadata=metadata)
            ct += 1
        self.load_matrix()
def preprocess_corpus_textacy(corpus):
    """Remove numbers, normalize whitespace, strip punctuation and lowercase the text."""
    preprocessed = corpus.copy()
    for index, row in preprocessed.iteritems():
        # Remove tokens that contain digits
        preprocessed[index] = re.sub(r'\w*\d\w*', '', preprocessed[index]).strip()
        preprocessed[index] = textacy.preprocess.normalize_whitespace(preprocessed[index])
        preprocessed[index] = textacy.preprocess_text(preprocessed[index],
                                                      lowercase=True,
                                                      no_punct=True)
    print('preprocessed corpus')
    return preprocessed
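# Hedged usage sketch (not from the original source): assumes textacy < 0.8, the re module
# imported at module level, and a pandas version that still provides Series.iteritems.
# The two-document Series is invented.
import pandas as pd

docs = pd.Series(['First DOC, with 3 numbers   and   extra spaces.',
                  'Second doc: punctuation, too!'])
print(preprocess_corpus_textacy(docs).tolist())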
def read_clean_text(file):
    f = open('./data/FixedJudgements/' + file + '.txt')
    raw = f.read()
    text = drop_html(raw)
    text = textacy.preprocess.unpack_contractions(text)
    # Replace all punctuation characters with spaces
    regex = re.compile('[%s]' % re.escape(string.punctuation))
    text = regex.sub(' ', text)
    text = text.replace('\n', ' ')
    text = preprocess_text(text, lowercase=True, fix_unicode=True, no_numbers=True)
    return text
def test_plaintext_functionality(self):
    preprocessed_text = textacy.preprocess_text(
        self.text, lowercase=True, no_punct=True)[:100]
    self.assertTrue(
        all(char.islower() for char in preprocessed_text if char.isalpha()))
    self.assertTrue(
        all(char.isalnum() or char.isspace() for char in preprocessed_text))
    keyword = 'America'
    kwics = textacy.text_utils.keyword_in_context(
        self.text, keyword, window_width=35, print_only=False)
    for pre, kw, post in kwics:
        self.assertEqual(kw, keyword)
        self.assertIsInstance(pre, textacy.compat.unicode_)
        self.assertIsInstance(post, textacy.compat.unicode_)
def test_plaintext_functionality(text):
    preprocessed_text = preprocess_text(text, lowercase=True, no_punct=True)[:100]
    assert all(char.islower() for char in preprocessed_text if char.isalpha())
    assert all(char.isalnum() or char.isspace() for char in preprocessed_text)
    keyword = 'America'
    kwics = text_utils.keyword_in_context(text, keyword, window_width=35, print_only=False)
    for pre, kw, post in kwics:
        assert kw == keyword
        assert isinstance(pre, compat.unicode_)
        assert isinstance(post, compat.unicode_)
def read_texts(corpus_path):
    # spacy_lang = data.load_spacy('en')
    spacy_lang = en_core_web_sm.load()
    # corpus = textacy.corpus.Corpus(u'en')
    texts = []
    for filename in glob.glob(corpus_path + '/*.txt'):
        print "Keyterms for " + filename
        content = open(filename, 'r').read().decode('utf-8')
        texts.append(content)
        # break  # Just for one doc for testing purposes
    text = '\n'.join(content for content in texts)
    return spacy_lang(preprocess_text(text), parse=False)
def show_plot(self, pos, neg, plot_type='semantic'):
    for group in [('Positive', pos), ('Negative', neg)]:
        name, group = group
        if plot_type == 'semantic':
            # if name == 'Positive' and group.shape[0] > 150:
            #     group = group.sample(155)
            corpus = [self.prep_tokens(tweet) for tweet in group]
            corpus = ' '.join(word for word in corpus)
            cleaned_text = textacy.preprocess_text(corpus, fix_unicode=True, no_accents=True)
            doc = textacy.Doc(cleaned_text, lang='en')
            graph = doc.to_semantic_network(nodes='words',
                                            edge_weighting='cooc_freq',
                                            window_width=10)
            drop_nodes = ['pron']
            for node in drop_nodes:
                try:
                    graph.remove_node(node)
                except:
                    pass
            node_weights = nx.pagerank_scipy(graph)
            ax = textacy.viz.network.draw_semantic_network(
                graph, node_weights=node_weights, spread=50.0)
            plt.suptitle(name + ' Sentiment Topics:' +
                         '\n{} {} tweets\n{}'.format(group.shape[0], name, self.hashtag))
            # plt.savefig('../images/plots/' + name)
        else:
            corpus = [self.prep_tokens(tweet) for tweet in group]
            tf = TfidfVectorizer().fit(corpus)
            doc_term_matrix = tf.transform(corpus)
            vocab = tf.get_feature_names()
            vocab = [word for word in vocab if word != 'pron']
            model = textacy.tm.TopicModel('nmf', n_topics=3)
            model.fit(doc_term_matrix)
            model.termite_plot(doc_term_matrix, vocab,
                               topics=-1,
                               n_terms=25,
                               sort_terms_by='seriation',
                               rank_terms_by='topic_weight',
                               highlight_topics=range(3))
            plt.suptitle(name + ' Sentiment Topics:')
            # plt.savefig('semantic_plot')
        plt.show(block=False)
def normalize_text(text):
    text = textacy.preprocess.normalize_whitespace(
        textacy.preprocess.transliterate_unicode(str(text)))
    return textacy.preprocess_text(text,
                                   fix_unicode=False,
                                   lowercase=False,
                                   transliterate=False,
                                   no_urls=True,
                                   no_emails=True,
                                   no_phone_numbers=True,
                                   no_numbers=False,
                                   no_currency_symbols=True,
                                   no_punct=False,
                                   no_contractions=True,
                                   no_accents=True)
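# Hedged usage sketch (not from the original source): assumes textacy < 0.8 is imported at
# module level; shows the kind of input the function is meant to scrub (URLs, e-mails,
# phone numbers, currency symbols, accents).
print(normalize_text("Contáct jane@example.com, call 555-0100, or pay $10 at https://example.com"))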
def hard_preprocess(df):
    df.iloc[:, 1] = df.iloc[:, 1].swifter.apply(ftfy.fix_text)
    df.iloc[:, 1] = df.iloc[:, 1].swifter.apply(clean_text)
    df.iloc[:, 1] = df.iloc[:, 1].swifter.apply(
        lambda x: x.replace('"', '').replace("\n", " ").replace("\\", ""))
    df.iloc[:, 1] = df.iloc[:, 1].swifter.apply(
        lambda text: textacy.preprocess_text(
            text, no_currency_symbols=True, no_urls=True, no_emails=True,
            no_phone_numbers=True, no_numbers=True))
    df.iloc[:, 1] = df.iloc[:, 1].apply(
        lambda x: x.replace("`", "'")
                   .replace("& amp ;", " and ")
                   .replace("@ USER", "[USER]")
                   .replace("@ URL", "[URL]")
                   .replace("@ HASHTAG", "[HASHTAG]")
                   .replace("*NUMBER*", "[NUMBER]"))
    df.iloc[:, 1] = df.iloc[:, 1].apply(fix_contractions)
    return df
def setUp(self):
    spacy_lang = data.load_spacy('en_core_web_sm')
    text = """
        Friedman joined the London bureau of United Press International after completing his master's degree.
        He was dispatched a year later to Beirut, where he lived from June 1979 to May 1981 while covering
        the Lebanon Civil War. He was hired by The New York Times as a reporter in 1981 and re-dispatched to
        Beirut at the start of the 1982 Israeli invasion of Lebanon. His coverage of the war, particularly
        the Sabra and Shatila massacre, won him the Pulitzer Prize for International Reporting (shared with
        Loren Jenkins of The Washington Post). Alongside David K. Shipler he also won the George Polk Award
        for foreign reporting. In June 1984, Friedman was transferred to Jerusalem, where he served as the
        New York Times Jerusalem Bureau Chief until February 1988. That year he received a second Pulitzer
        Prize for International Reporting, which cited his coverage of the First Palestinian Intifada. He
        wrote a book, From Beirut to Jerusalem, describing his experiences in the Middle East, which won the
        1989 U.S. National Book Award for Nonfiction. Friedman covered Secretary of State James Baker during
        the administration of President George H. W. Bush. Following the election of Bill Clinton in 1992,
        Friedman became the White House correspondent for the New York Times. In 1994, he began to write
        more about foreign policy and economics, and moved to the op-ed page of The New York Times the
        following year as a foreign affairs columnist. In 2002, Friedman won the Pulitzer Prize for
        Commentary for his "clarity of vision, based on extensive reporting, in commenting on the worldwide
        impact of the terrorist threat." In February 2002, Friedman met Saudi Crown Prince Abdullah and
        encouraged him to make a comprehensive attempt to end the Arab-Israeli conflict by normalizing Arab
        relations with Israel in exchange for the return of refugees alongside an end to the Israel
        territorial occupations. Abdullah proposed the Arab Peace Initiative at the Beirut Summit that
        March, which Friedman has since strongly supported. Friedman received the 2004 Overseas Press Club
        Award for lifetime achievement and was named to the Order of the British Empire by Queen Elizabeth
        II. In May 2011, The New York Times reported that President Barack Obama "has sounded out" Friedman
        concerning Middle East issues.
        """
    self.spacy_doc = spacy_lang(preprocess_text(text), parse=False)
def process_content(self):
    if self._content is not None:
        ct = 0
        for document in self._content.documents:
            metadata = {}
            try:
                metadata['title'] = self._content.titles[ct]
            except IndexError:
                metadata['title'] = 'Empty'
            # self._corpus.add_text(
            #     textacy.preprocess_text(document, lowercase=True, no_punct=True, no_numbers=True),
            #     metadata=metadata)
            doc_text = textacy.preprocess_text(document, lowercase=True, no_punct=True, no_numbers=True)
            doc = textacy.make_spacy_doc((doc_text, metadata), lang=self._en)
            self._corpus.add_doc(doc)
            ct += 1
        self.load_matrix()
def build_document(df):
    '''
    Rebuild a representation of a document from the sentences_nlp352 format.

    Parameters
    ----------
    df : pandas DataFrame
    '''
    # Find unique documents
    document_ids = pd.Series(df.docid.unique())
    for docid in document_ids[:1]:
        doc = df[df.docid == docid]['word']
        cleaned_doc = textacy.preprocess_text(' '.join(
            doc.str.cat().replace('{', '').replace('}', '').replace('.', ' . ').split(',')))
        return cleaned_doc.replace('" "', '')
def read_cv(cv_id):
    import json

    spacy_lang = en_core_web_sm.load()
    corpus = textacy.corpus.Corpus(spacy_lang)
    filename = 'cv_{0}.json'.format(cv_id)
    cv_path = CV_PATH + filename
    content = ''
    with open(cv_path) as cv_file:
        content = json.load(cv_file)
    corpus_text = content['description']
    # corpus.add_text(preprocess_text(corpus_text, no_urls=True, no_emails=True,
    #                                 no_phone_numbers=True, no_numbers=True, no_currency_symbols=True,
    #                                 no_punct=True, no_contractions=True, no_accents=True))
    corpus.add_text(
        preprocess_text(corpus_text, no_punct=True, no_contractions=True, no_accents=True))
    return corpus
def read_cv2(cv_path):
    import json

    spacy_lang = en_core_web_sm.load()
    corpus = textacy.corpus.Corpus(spacy_lang)
    content = ''
    with open(cv_path) as cv_file:
        content = json.load(cv_file)
    # corpus_text = content['description']
    corpus_text = content.get('description', u'')
    # corpus.add_text(preprocess_text(corpus_text, no_urls=True, no_emails=True,
    #                                 no_phone_numbers=True, no_numbers=True, no_currency_symbols=True,
    #                                 no_punct=True, no_contractions=True, no_accents=True))
    corpus.add_text(
        preprocess_text(corpus_text, no_punct=True, no_contractions=True, no_accents=True))
    return corpus
def show_plot(self, pos, neg, plot_type='semantic'):
    for group in [('Positive', pos), ('Negative', neg)]:
        name, group = group
        if plot_type == 'semantic':
            corpus = [self.prep_tokens(tweet) for tweet in group]
            corpus = ' '.join(word for word in corpus)
            cleaned_text = textacy.preprocess_text(corpus, fix_unicode=True, no_accents=True)
            doc = textacy.Doc(cleaned_text, lang='en')
            graph = doc.to_semantic_network(nodes='words',
                                            edge_weighting='cooc_freq',
                                            window_width=10)
            # drop_nodes = [textacy.spacy_utils.normalized_str(tok)
            #               for tok in doc.tokens]
            # import pdb; pdb.set_trace()
            drop_nodes = ['pron']
            for node in drop_nodes:
                try:
                    graph.remove_node(node)
                except:
                    pass
            node_weights = nx.pagerank_scipy(graph)
            ax = textacy.viz.network.draw_semantic_network(
                graph, node_weights=node_weights, spread=75.0)
            plt.suptitle(name + ' Sentiment Topics:')
        else:
            corpus = [self.prep_tokens(tweet) for tweet in group]
            tf = TfidfVectorizer().fit(corpus)
            doc_term_matrix = tf.transform(corpus)
            vocab = tf.get_feature_names()
            vocab = [word for word in vocab if word != 'pron']
            model = textacy.tm.TopicModel('nmf', n_topics=3)
            model.fit(doc_term_matrix)
            model.termite_plot(doc_term_matrix, vocab,
                               topics=-1,
                               n_terms=25,
                               sort_terms_by='seriation',
                               rank_terms_by='topic_weight',
                               highlight_topics=range(3))
            plt.suptitle(name + ' Sentiment Topics:')
        plt.show(block=False)
def read_corpus_in_doc(corpus_path):
    spacy_lang = en_core_web_sm.load()
    corpus = textacy.corpus.Corpus(spacy_lang)
    texts = []
    for filename in glob.glob(corpus_path + '/*.txt'):
        content = open(filename, 'r').read().decode('utf-8')
        # testing preprocess
        clean_text = preprocess_text(content, no_punct=True, no_contractions=True, no_accents=True)
        texts.append(clean_text)
        # texts.append(open(filename, 'r').read().decode('utf-8'))
        # break
    corpus_text = '\n'.join(text for text in texts)
    corpus.add_text(corpus_text)
    # corpus.add_text(spacy_lang(preprocess_text(content), parse=False))
    # break  # Just for one doc for testing purposes
    return corpus
def dependecy_parsing():
    """Dependency parsing.

    The parser also powers the sentence boundary detection, and lets you iterate over
    base noun phrases, or "chunks". You can check whether a Doc object has been parsed
    with the doc.is_parsed attribute, which returns a boolean value. If this attribute
    is False, the default sentence iterator will raise an exception.
    """
    data = request.get_json()
    result = data["data"]
    result = textacy.preprocess_text(result, fix_unicode=True, no_accents=True)
    result = textacy.preprocess.fix_bad_unicode(result, normalization="NFC")
    doc = ner_model(result)
    dependency = [{
        "text": token.text,
        "dependency": token.dep_,
        "tokenHead": token.head.text,
        "tokenHeadPartOfSpeach": token.head.pos_,
        "children": [str(child) for child in token.children],
    } for token in doc]
    result = jsonify({"dependency": dependency})
    return result
def iter_id_with_text(filename, chunk_size=1000):
    log.info('Loading spaCy model...')
    nlp = spacy.load('en', disable=['ner', 'parser', 'tagger'])
    log.info('SpaCy model was loaded...')
    with open(filename) as file:
        for index, article in enumerate(map(json.loads, file), 1):
            abstract = article.get('abstract', '')
            title = article.get('title', '')
            text = textacy.preprocess_text(title + '. ' + abstract,
                                           lowercase=True,
                                           transliterate=True,
                                           no_punct=True,
                                           no_numbers=True)
            terms_list = list(
                textacy.Doc(text, lang=nlp).to_terms_list(as_strings=True,
                                                          named_entities=False,
                                                          normalize='lemma',
                                                          ngrams=(1)))
            if index % chunk_size == 0:
                log.info('%d articles were loaded...', index)
            id = article['id']
            yield id, [term for term in terms_list if term not in STOPWORDS]
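# Hedged usage sketch (not from the original source): writes a tiny JSON-lines file and
# streams (id, terms) pairs back out of it; assumes the spaCy 'en' model, an older textacy,
# and the module-level log and STOPWORDS objects are available.
import json

with open('sample_articles.jsonl', 'w') as fh:
    fh.write(json.dumps({'id': 'a1',
                         'title': 'Graphene batteries',
                         'abstract': 'We report a 2x capacity gain over lithium-ion cells.'}) + '\n')

for article_id, terms in iter_id_with_text('sample_articles.jsonl'):
    print(article_id, terms)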
def run_custom_task(self, temp_file, mongo_client: MongoClient):
    print('run custom task')
    n_num = self.get_integer('n', default=2)
    filter_stops = self.get_boolean('filter_stops', default=True)
    filter_punct = self.get_boolean('filter_punct', default=True)
    filter_nums = self.get_boolean('filter_nums', default=False)
    lemmas = self.get_boolean('lemmas', default=True)
    limit_to_termset = self.get_boolean('limit_to_termset', default=False)
    termset = self.pipeline_config.terms
    if not termset:
        termset = list()
    lower_termset = [x.lower() for x in termset]

    for doc in self.docs:
        ngrams = list()
        cln_txt = self.get_document_text(doc, clean=True)
        t_doc = Doc(preprocess_text(cln_txt, lowercase=True))
        res = extract.ngrams(t_doc, n_num,
                             filter_stops=filter_stops,
                             filter_punct=filter_punct,
                             filter_nums=filter_nums)
        for r in res:
            if lemmas:
                text = r.lemma_
            else:
                text = r.text
            if limit_to_termset:
                for t in lower_termset:
                    if text == t or t in text:
                        ngrams.append({'text': text, 'count': 1})
            else:
                ngrams.append({'text': text, 'count': 1})
        self.write_multiple_result_data(temp_file, mongo_client, doc, ngrams)