import textacy
import textacy.keyterms


# `rootify` is defined elsewhere in the original module; it resolves a
# graph entry to plain text.
def wordTuples(graph, textEntry):
    text = rootify(graph, textEntry)
    pt = textacy.load_spacy('pt')
    doc = textacy.Doc(text, lang=pt)
    ts = textacy.TextStats(doc)
    # rank every unique word with TextRank; each result is a (term, score)
    # pair, wrapped here as a one-entry dict
    words = [{w[0]: w[1]} for w in textacy.keyterms.textrank(
        doc, normalize='lower', n_keyterms=ts.n_unique_words)]
    return words
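# A minimal runnable sketch of the same keyterm extraction without the
# graph/rootify plumbing (the Portuguese sentence is made up; the 'pt'
# model must be installed, e.g. `python -m spacy download pt`):
_pt = textacy.load_spacy('pt')
_doc = textacy.Doc('O futebol é o esporte mais popular do Brasil.', lang=_pt)
_stats = textacy.TextStats(_doc)
for term, score in textacy.keyterms.textrank(
        _doc, normalize='lower', n_keyterms=_stats.n_unique_words):
    print(term, score)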
def __init__(self):
    self._min_occurrence_for_topic = 2
    self._common_verbs = 10
    # create an empty corpus
    en = textacy.load_spacy('en_core_web_sm', disable=('parser',))
    self._corpus = textacy.Corpus(lang=en)
    self._content = None
    self._model = None
    self._numdocs = 0
    self._numtopics = 0
    self._terms = None
    self._doc_term_matrix = None
    self._doc_topic_matrix = None
    self._vectorizer = Vectorizer(
        tf_type='linear', apply_idf=True, idf_type='smooth', norm='l2',
        min_df=3, max_df=0.95, max_n_terms=100000)
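# Sketch of how this __init__ plausibly fits its enclosing class; the class
# name `TopicModeler` and the calls below are assumptions, not taken from
# the original source:
#
#   modeler = TopicModeler()
#   # the empty corpus is filled later; the configured Vectorizer then
#   # builds the doc-term matrix from tokenized corpus terms, e.g.:
#   #   self._doc_term_matrix = self._vectorizer.fit_transform(terms)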
import logging

import textacy
from spacy.language import Language

logger = logging.getLogger(__name__)


# LANGUAGE_MODEL_MAP and keep_hyphen_tokenizer are defined elsewhere in
# the original module.
def setup_nlp_language_model(language, **nlp_args):
    # drop an empty 'disable' option so it is not forwarded to spaCy;
    # pop with a default so a missing key does not raise KeyError
    if len(nlp_args.get('disable', [])) == 0:
        nlp_args.pop('disable', None)

    def remove_whitespace_entities(doc):
        doc.ents = [e for e in doc.ents if not e.text.isspace()]
        return doc

    logger.info('Loading model: %s...', language)
    Language.factories['remove_whitespace_entities'] = (
        lambda nlp, **cfg: remove_whitespace_entities)
    model_name = LANGUAGE_MODEL_MAP[language]
    # if not model_name.endswith('lg'):
    #     logger.warning('Selected model is not the largest available.')
    nlp = textacy.load_spacy(model_name, **nlp_args)
    nlp.tokenizer = keep_hyphen_tokenizer(nlp)
    pipeline = lambda: [x[0] for x in nlp.pipeline]
    logger.info('Using pipeline: ' + ' '.join(pipeline()))
    return nlp
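# Usage sketch (assumes LANGUAGE_MODEL_MAP has an 'en' key mapping to an
# installed spaCy model; the key name is a guess):
#
#   nlp = setup_nlp_language_model('en', disable=('ner',))
#   doc = nlp('A sentence with a hyphen-ated word survives tokenization.')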
# coding: utf-8

# In[65]:

from cassandra.cluster import Cluster
import spacy
import pandas as pd
import operator
from datetime import datetime
import textacy

nlp = spacy.load('en_core_web_lg')
cluster = Cluster(['127.0.0.1'])
session = cluster.connect('sharelock')
en = textacy.load_spacy('en_core_web_lg')


# Clean text rows, lemmatize, remove stop words.

# In[66]:

from emoji import UNICODE_EMOJI


def is_emoji(s):
    # True only when the string contains exactly one known emoji occurrence
    count = 0
    for emoji in UNICODE_EMOJI:
        count += s.count(emoji)
        if count > 1:
            return False
    return bool(count)
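# Traced behaviour of is_emoji: it accepts strings containing exactly one
# emoji occurrence and rejects everything else.
print(is_emoji('🎉'))     # True  - exactly one emoji
print(is_emoji('🎉🎉'))   # False - more than one occurrence
print(is_emoji('party'))  # False - no emoji at all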
# pip install textblob
# python -m textblob.download_corpora
# pip install spacy textacy
# python -m spacy download en

import json
import re
from collections import defaultdict, Counter

import textacy
from textblob import TextBlob

en = textacy.load_spacy("en_core_web_sm")
# my_stop_words = {"the", "to"}
my_stop_words = en.Defaults.stop_words - {"not"}

with open("only_cellphones_brands.json") as json_data:
    data = json.load(json_data)


# re-implemented from https://github.com/chartbeat-labs/textacy/blob/26abafae7d745614b68d2f90310741b5a8de24d7/textacy/extract.py#L327
# to ignore stop words in matching
def pos_regex_matches(doc, pattern):
    # standardize and transform the regular expression pattern...
    pattern = re.sub(r"\s", "", pattern)
    pattern = re.sub(r"<([A-Z]+)\|([A-Z]+)>", r"( (\1|\2))", pattern)
    pattern = re.sub(r"<([A-Z]+)>", r"( \1)", pattern)
    filtered_tokens = [
        tok for tok in doc
        if tok.text not in my_stop_words and tok.is_alpha
    ]
    # (completion following the upstream textacy implementation linked
    # above) match the pattern against the space-joined POS tags of the
    # kept tokens, then map match offsets back to token positions
    tags = ' ' + ' '.join(tok.pos_ for tok in filtered_tokens)
    for m in re.finditer(pattern, tags):
        yield filtered_tokens[tags[0:m.start()].count(' '):
                              tags[0:m.end()].count(' ')]
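# Usage sketch: extract adjective/noun phrases from a made-up review
# sentence; the pattern uses textacy's POS-regex syntax, and stop words
# ("is", "but", ...) are skipped by the filter inside the function:
doc = en("The battery life is not great but the camera quality is amazing")
for phrase in pos_regex_matches(doc, r'<ADJ>* <NOUN>+'):
    print([tok.text for tok in phrase])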
import pickle as pkl
import re

import numpy as np
import pandas as pd
from gensim.models.phrases import Phrases, Phraser
import textacy
from textacy import preprocess_text, Doc, Corpus
from textacy.vsm import Vectorizer, GroupVectorizer
from textacy.tm import TopicModel

# `disable` expects an iterable of pipe names, not a bare string
en = textacy.load_spacy("en_core_web_sm", disable=('parser',))

data = pd.read_csv('data/qaData.csv', parse_dates=['Date'])
ec_data = data.loc[data['EventType'] == "Earnings call", [
    'Date', 'Company', 'Participants', 'AnalystName', 'AnalystCompany',
    'EventName', 'EarningTag2', 'Question'
]].copy()
ec_data['Quarter'] = ec_data['EventName'].str.split("Q").str[0]
# reset_index() inside the groupby lifts each row's original index into an
# 'index' column, which is renamed to 'QuestionOrder' below
ec_data = ec_data.groupby(
    ['Date', 'Company', 'Participants', 'EventName', 'Quarter']
).apply(lambda x: x.reset_index()).reset_index(drop=True)
ec_data.columns = [
    'QuestionOrder', 'Date', 'Company', 'Participants', 'AnalystName',
    'AnalystCompany', 'EventName', 'Tag', 'Question', 'Quarter'
]
ec_data = ec_data[[
    'Date', 'Quarter', 'Company', 'Participants', 'AnalystCompany',
    'AnalystName', 'QuestionOrder', 'Tag', 'Question'
]]
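# A plausible next step given the imports above (not taken from the
# original source): clean each question with preprocess_text and load the
# results into a textacy Corpus for vectorization and topic modelling.
ec_data['QuestionClean'] = ec_data['Question'].apply(
    lambda q: preprocess_text(q, lowercase=True, no_punct=True))
corpus = Corpus(en, texts=ec_data['QuestionClean'].tolist())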
import spacy
import textacy
from spacy import displacy, tokens
from textacy import extract

# Common global objects used for computing similarity, plus some
# module-private helpers to make it work.
nlp = spacy.load('en_core_web_sm')
lang_en = textacy.load_spacy('en_core_web_sm')

possible_docs = {'textacy', 'spacy'}
subjects = ["nsubj", "nsubjpass", "csubj", "csubjpass", "agent", "expl"]
objects = ["dobj", "dative", "attr", "oprd"]
conjunctions = ["or", "and"]


def render_pos_html(list_of_docs):
    return displacy.render(
        [get_spacy_doc(x) for x in list_of_docs], style='dep', page=True)


def get_new_doc(phrase, doc_type='textacy'):
    assert isinstance(phrase, str)  # was `basestring`, which is Python 2 only
    assert isinstance(doc_type, str)
    assert doc_type in possible_docs, "Only {} doc types are supported".format(
        possible_docs)
    if doc_type == 'textacy':
        return textacy.Doc(phrase, lang=lang_en)
    elif doc_type == 'spacy':
        return nlp(phrase)
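# Usage sketch for the helpers above (the sentence is made up;
# render_pos_html additionally depends on get_spacy_doc, which is defined
# elsewhere in the original module):
t_doc = get_new_doc('The company acquired a startup.', doc_type='textacy')
s_doc = get_new_doc('The company acquired a startup.', doc_type='spacy')
# dependency labels, to be matched against the subjects/objects lists above
print([(tok.text, tok.dep_) for tok in s_doc])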