def combine_stopwords(dataframe_in, stopword_dict):
    """
    Please use the stopwords() function and input that into the stopword_dict parameter.
    Returns filtered tokens.
    :param dataframe_in:
    :param stopword_dict:
    :return:
    """
    nlp = spacy.load("en_core_web_lg")
    # Tokenizer
    tokenizer = Tokenizer(nlp.vocab)

    tokens = []
    for doc in tokenizer.pipe(dataframe_in, batch_size=500):
        doc_tokens = []
        for token in doc:
            if token.text.lower() not in stopword_dict:
                doc_tokens.append(token.text.lower())
        tokens.append(doc_tokens)
    return tokens
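# Usage sketch for combine_stopwords(): it takes an iterable of raw strings and a
# stopword collection; `example_stopwords` below is a made-up stand-in for whatever
# the stopwords() helper returns.
example_docs = ["The quick brown fox", "A lazy dog and a cat"]
example_stopwords = {"the", "a", "and"}
print(combine_stopwords(example_docs, example_stopwords))
# expected: [['quick', 'brown', 'fox'], ['lazy', 'dog', 'cat']]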
def predict():
    # Define a prediction function.
    # This endpoint assumes `nlp`, `df`, `model`, and the request-derived
    # `user_input` are available at module level (e.g. loaded at app startup).
    # body = body.str.replace(...)  # left incomplete in the original: replacement arguments missing
    tokenizer = Tokenizer(nlp.vocab)

    # Stop words
    STOP_WORDS = nlp.Defaults.stop_words.union(
        ['', ' ', '-', 'reddit', 'post'])

    # Make them tokens
    tokens = []
    for doc in tokenizer.pipe(df['combo'], batch_size=500):
        doc_tokens = []
        for token in doc:
            if ((token.text.lower() not in STOP_WORDS)
                    and (token.is_stop == False)
                    and (token.is_punct == False)
                    and (token.pos_ != 'PRON')):
                doc_tokens.append(token.lemma_.lower())
        tokens.append(' '.join(doc_tokens))
    df['tokens'] = tokens

    tfidf = TfidfVectorizer(min_df=0.025, max_df=.98, ngram_range=(1, 2))
    tfidf.fit(df['tokens'])  # the vectorizer must be fitted before it can transform new text
    vec_text = tfidf.transform(user_input)

    output = model.predict(vec_text.todense())

    # Give output to sender (numpy arrays are not JSON-serializable, so convert to a list).
    return jsonify({"response": output.tolist()})
def tokenize_data(input_data):
    nlp = spacy.load("en")
    tokenizer = Tokenizer(nlp.vocab)
    string_data = [str(data) for data in input_data]
    tokenized_data = [[str(w) for w in doc]
                      for doc in tokenizer.pipe(string_data, batch_size=50)]
    return tokenized_data
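# Usage sketch: tokenize_data() casts every element to str before tokenizing, so it
# also copes with non-string values; the sample list is illustrative.
print(tokenize_data(["First document.", "Second one", 42]))
# e.g. [['First', 'document.'], ['Second', 'one'], ['42']]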
def _tokenizer(df):
    nlp = English()
    tokenizer = Tokenizer(nlp.vocab)
    for doc in tokenizer.pipe(df.values.tolist(), batch_size=50):
        for token in doc:
            yield token
def transform(self, data):
    tokenizer = Tokenizer(nlp.vocab)
    return np.array([
        np.mean([
            self.model[w.text.lower()] * self.word2weight[w.text.lower()]
            for w in words if w.text.lower() in self.model
        ] or [np.zeros(self.dim)], axis=0)
        for words in tokenizer.pipe(data)
    ])
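# Hedged sketch of the fit() counterpart this transform() appears to assume:
# `self.model` maps words to vectors (a plain dict here), `self.word2weight` maps
# words to IDF weights, and `self.dim` is the embedding size. The fit() name and the
# `word2vec` argument are illustrative, not taken from the original class.
from collections import defaultdict
from sklearn.feature_extraction.text import TfidfVectorizer

def fit(self, data, word2vec):
    self.model = word2vec                              # word -> vector lookup
    self.dim = len(next(iter(word2vec.values())))      # embedding dimensionality
    tfidf = TfidfVectorizer()
    tfidf.fit(data)
    max_idf = max(tfidf.idf_)                          # fallback weight for out-of-vocabulary words
    self.word2weight = defaultdict(
        lambda: max_idf,
        [(w, tfidf.idf_[i]) for w, i in tfidf.vocabulary_.items()])
    return self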
def get_lemmas(text):
    # nlp = spacy.load("en_core_web_sm-2.2.5", path="airbnb_api/")
    nlp = spacy.load("en_core_web_sm-2.2.5", path="./")
    # nlp = spacy.load("en_core_web_sm")
    # nlp = en_core_web_sm.load()
    tokenizer = Tokenizer(nlp.vocab)

    STOP_WORDS = nlp.Defaults.stop_words.union([
        ' ', 'und', '-', 'die', 'der', 'berlin', 'ein', 'das', 'mit', 'ist',
        'im', 'zu', 'eine', 'es', 'für', 'berlin.', 'zum', 'sind', 'für',
        'Berlin.', '-pron-', 's', 'u', '', "'", ' ', '-PRON-'
    ])

    # Lemmatize the raw text
    lemmas = []
    doc = nlp(text)
    for token in doc:
        lemmas.append(token.lemma_)

    # Join the lemmas back into a single string
    lemma_summary = []
    working_set = ""
    for lemma in lemmas:
        working_set += lemma + ' '
    lemma_summary.append(working_set)

    description = [lemma_summary[0]]

    # Tokenize the lemmatized text and drop stop words / punctuation
    tokens = []
    for doc in tokenizer.pipe(description, batch_size=500):
        doc_tokens = []
        for token in doc:
            if ((token.is_stop == False) and (token.is_punct == False)
                    and (token.pos_ != 'PRON')):
                if token.text.lower() not in STOP_WORDS:
                    doc_tokens.append(token.text.lower())
        tokens.append(doc_tokens)

    # Join the filtered tokens back into one string per document
    token_summary = []
    for set_of_tokens in tokens:
        working_set = ""
        for variable in set_of_tokens:
            working_set += variable + ' '
        token_summary.append(working_set)

    return token_summary[0]
class SpacyTokenize(Transformer):
    def __init__(self):
        nlp = spacy.load('en')
        self.tok = Tokenizer(nlp.vocab)

    def transform(self, xx):
        rrr = []
        for doc in self.tok.pipe(xx):
            rr = []
            for token in doc:
                rr.append(token.text.lower())
            rrr.append(rr)
        return rrr
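# Usage sketch (assumes the legacy "en" shortcut model used above is installed);
# the sample sentences are made up.
st_tok = SpacyTokenize()
print(st_tok.transform(["Hello World", "Another Sentence"]))
# e.g. [['hello', 'world'], ['another', 'sentence']]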
def test_spacy_tokenizer_pipe(nlp):
    tokenizer = Tokenizer(nlp.vocab)

    token_sets = []
    for doc in tokenizer.pipe(DOCUMENTS, batch_size=2):
        doc_tokens = [token.text for token in doc]
        token_sets.append(doc_tokens)

    assert token_sets == [
        ['all', 'the', 'kings', 'men'],
        ['ate', 'all', 'the', 'kings', 'hens'],
        ['until', 'they', 'all', 'got', 'tired', 'and', 'went', 'to', 'sleep', 'zzz'],
    ]
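# The test assumes a module-level DOCUMENTS list and an `nlp` pytest fixture; a
# plausible conftest-style setup is sketched below (document strings are inferred
# from the expected token sets, and the model name is an assumption).
import pytest
import spacy

DOCUMENTS = [
    "all the kings men",
    "ate all the kings hens",
    "until they all got tired and went to sleep zzz",
]

@pytest.fixture
def nlp():
    return spacy.load("en_core_web_sm")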
def tokenize(df_in):
    """
    Tokenize by inputting a dataframe. Outputs a tokenized list.
    :param df_in:
    :return:
    """
    nlp = spacy.load("en_core_web_lg")
    # Tokenizer
    tokenizer = Tokenizer(nlp.vocab)

    tokens = []
    for doc in tokenizer.pipe(df_in, batch_size=500):
        doc_tokens = [token.text for token in doc]
        tokens.append(doc_tokens)
    return tokens
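# Usage sketch for tokenize(): any iterable of strings works, e.g. a dataframe column
# or a plain list (the sample docs below are made up). Note that a bare Tokenizer only
# splits on whitespace, so punctuation stays attached to its neighbouring word.
print(tokenize(["Earthy aroma, very relaxing", "Sweet citrus flavor"]))
# e.g. [['Earthy', 'aroma,', 'very', 'relaxing'], ['Sweet', 'citrus', 'flavor']]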
def __init__(self):
    """
    Copied from notebook at app/ml/Build_week_IsaacGrove.ipynb
    """
    self.PICKLE_PATH = path.join(path.dirname(__file__), '..', 'pickles', '')

    # For now I'm loading data from a static link; will try to pull live data
    # in future iterations.
    leafly = pd.read_csv(
        'https://raw.githubusercontent.com/Build-Week-Med-Cabinet-6/DS/mark-dev/data/cannabis.csv'
    )

    # Set up spacy tokenizer
    nlp = English()
    tokenizer = Tokenizer(nlp.vocab)
    # work around for pickle
    self.nlp = nlp

    # Clean some missing info
    leafly.replace('None', np.NaN, inplace=True)
    leafly = leafly.dropna()

    # Make tokens out of descriptions
    tokens = []
    for desc in tokenizer.pipe(leafly['Description'], batch_size=500):
        desc_tokens = [token.text for token in desc]
        tokens.append(desc_tokens)
    leafly['tokens'] = tokens
    leafly['tokens'].head()

    # Instantiate vectorizer object
    tfidf = TfidfVectorizer(ngram_range=(1, 2),
                            max_df=.7,
                            min_df=.001,
                            tokenizer=self.tokenize)

    # Create a vocabulary and get word counts per listing
    dtm = tfidf.fit_transform(leafly['Description'])

    # Get feature names to use as dataframe column headers
    dtm = pd.DataFrame(dtm.todense(), columns=tfidf.get_feature_names())

    # Fit nearest neighbors on the document-term matrix
    nn = NearestNeighbors(n_neighbors=20, algorithm='kd_tree')
    nn.fit(dtm)

    self.model = nn
    self.transform = tfidf
    return
def transform(self, data):
    tokenizer = Tokenizer(nlp.vocab)
    return np.array([
        np.mean([
            self.model[w.text.lower()]
            for w in words if w.text.lower() in self.model
        ] or [np.zeros(self.dim)], axis=0)
        for words in tokenizer.pipe(data)
    ])
def tokenizing_text(text):
    tokenizer = Tokenizer(nlp.vocab)
    custom_stopwords = ['hi', '\n', '\n\n', '&', ' ', '.', '-', 'got', "it's",
                        'it’s', "i'm", 'i’m', 'im', 'want', 'like', '$', '@']
    STOP_WORDS = nlp.Defaults.stop_words.union(custom_stopwords)
    ALL_STOP_WORDS = STOP_WORDS.union(stopwords)

    tokens = []
    for doc in tokenizer.pipe(text):
        doc_tokens = []
        for token in doc:
            # Filter against the combined stopword set (spaCy defaults + custom + external).
            if token.text.lower() not in ALL_STOP_WORDS:
                doc_tokens.append(token.text.lower())
        tokens.append(doc_tokens)
    # Makes tokens column
    return tokens
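# Hedged sketch of the external names tokenizing_text() expects: `nlp` is a loaded
# spaCy pipeline and `stopwords` is an extra stopword collection; NLTK's English list
# is one plausible source (the import below is an assumption and requires
# nltk.download('stopwords') to have been run).
import spacy
from nltk.corpus import stopwords as nltk_stopwords

nlp = spacy.load("en_core_web_sm")
stopwords = set(nltk_stopwords.words("english"))

print(tokenizing_text(["Hi, I'm looking for something like this $5 deal"]))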
def tokenize_v5(my_docs, my_nlp=NLP, batch_size=200):
    """
    Uses a tokenizer pipeline for performance gains (JK still very slow).

    Params:
        my_docs (list of str, or dataframe column of str) the documents to tokenize
        my_nlp (spacy.lang.en.English) one of spacy's natural language models

    Returns: a token set (list of token lists)
    """
    # print("TOKENIZING (v5)...")
    tokenizer = Tokenizer(my_nlp.vocab)

    token_sets = []
    for doc in tokenizer.pipe(my_docs, batch_size=batch_size):
        # tokens = [token.lemma_.lower() for token in doc
        #           if token.is_stop == False and token.is_punct == False and token.is_space == False]
        # ... for some reason there are special characters, so maybe...
        clean_text = re.sub(ALPHANUMERIC_PATTERN, "", doc.text)
        clean_doc = my_nlp(clean_text)
        tokens = [token.lemma_.lower() for token in clean_doc
                  if token.is_stop == False
                  and token.is_punct == False
                  and token.is_space == False]
        # ... hmm stopwords are still making their way through if the lemma is a stopword
        token_sets.append(tokens)
    return token_sets
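# Hedged sketch of the module-level constants tokenize_v5() relies on. Both must be
# defined before the function (NLP is its default argument); the model name and the
# regex value here are assumptions, not taken from the original module.
import re
import spacy

ALPHANUMERIC_PATTERN = r"[^a-zA-Z0-9\s]"  # assumed: matches anything that is not a letter, digit, or space
NLP = spacy.load("en_core_web_md")        # assumed model; any English pipeline with .vocab works

token_sets = tokenize_v5(["Here's some text!! With 123 special characters..."])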
def ValuePredictor(yelp_url, from_isbn=False):
    '''Takes a url, scrapes the site for reviews, and calculates term frequencies;
    returns the top-scoring terms as a dict containing term, highratingscore,
    and poorratingscore.'''
    base_url = "https://www.yelp.com/biz/"  # add business id
    api_url = "/review_feed?sort_by=date_desc&start="

    bid = yelp_url.replace('https://www.yelp.com/biz/', '')
    if '?' in yelp_url:
        # deletes everything after "?" in the url
        bid = bid.split('?')[0]

    class Scraper():
        def __init__(self):
            self.data = pd.DataFrame()

        def get_data(self, n, bid=bid):
            with Session() as s:
                # makes an http get request to the given url and returns the response as json
                with s.get(base_url + bid + api_url + str(n * 20)) as resp:
                    r = dict(resp.json())  # converts json response into a dictionary
                    _html = html.fromstring(r['review_list'])  # loads html from the dictionary
                    dates = _html.xpath(
                        "//div[@class='review-content']/descendant::span[@class='rating-qualifier']/text()"
                    )
                    reviews = [
                        el.text for el in _html.xpath("//div[@class='review-content']/p")
                    ]
                    ratings = _html.xpath(
                        "//div[@class='review-content']/descendant::div[@class='biz-rating__stars']/div/@title"
                    )
                    df = pd.DataFrame([dates, reviews, ratings]).T
                    self.data = pd.concat([self.data, df])

        def scrape(self):
            # multithreaded looping makes it faster
            with Executor(max_workers=40) as e:
                list(e.map(self.get_data, range(10)))

    s = Scraper()
    s.scrape()
    df = s.data  # converts scraped data into a dataframe
    df.columns = ['date', 'review', 'rating']

    # Split each review into sentences, one row per sentence
    df = (df.set_index(df.columns.drop('review', 1).tolist())
            .review.str.split('.', expand=True)
            .stack()
            .reset_index()
            .rename(columns={0: 'review'})
            .loc[:, df.columns])

    df = df.replace(',', '')
    df = df.replace('!', '')
    df = df.replace('#', '')
    df = df.replace('.', '')

    tokenizer = Tokenizer(nlp.vocab)
    STOP_WORDS = nlp.Defaults.stop_words.union([
        'gets', 'incredible', 'disappoint', 'from', 'perfection', 'loved',
        'definitely', 'happy', 'find', 'found', 'simply', 'fantastic',
        'recommend', 'feel', 'little', 'i', 'wow', 'absolute', 'favorite',
        'excellent', 'delicious', 'great', 'maybe', 'very', 'enjoy', 'list',
        'gave', 'date', 'went', 'disappointed', 'nyc', 'got', '#', 'crazy',
        'other', 'fairness', 'fair', 'mid', 'from', 'highly', 'perfect',
        'perfectly', 'come', 'lovely', 'visit', 'ny', 'nyc', 'best', 'amazing',
        'love', 'absolutely', 'like', 'good', 'other', 'from', 'ny',
        'restaurant', 'we', 'will', 'because', 'not', 'friends', 'amazing',
        'awesome', 'first', 'he', 'check-in', '=', '= =', 'male', 'u', 'want',
        'u want', 'cuz', 'him', "i've", 'deaf', 'on', 'her', 'told',
        'told him', 'ins', 'check-ins', 'check-in', 'check', 'I', 'i"m', 'i',
        'it', "it's", 'it.', 'they', 'coffee', 'place', 'they', 'the', 'this',
        'its', 'l', '-', 'they', 'this', 'don"t', 'the ', ' the', 'it', 'i"ve',
        'i"m', '!', '1', '2', '3', '4', '5', '6', '7', '8', '9', '0', '(', ')',
        '/', '.', ',', '!'
    ])  # STOP_WORDS

    # Drop rows with missing review text
    df = df[df['review'].notnull()]

    tokens = []
    for doc in tokenizer.pipe(df['review'], batch_size=500):
        doc_tokens = []
        for token in doc:
            if (token.text not in STOP_WORDS) & (token.is_punct == False):
                doc_tokens.append(token.text.lower())
        tokens.append(doc_tokens)
    df['review'] = tokens

    # Join the token lists back into strings and drop empty reviews
    jointty = lambda x: ' '.join(map(lambda x: str(x), x['review']))
    df['review'] = df.apply(jointty, axis=1)
    df['review'].replace(' ', np.nan, inplace=True)
    df = df.dropna()

    corpus = (st.CorpusFromPandas(df,
                                  category_col='rating',
                                  text_col='review',
                                  nlp=nlp).build().remove_terms(
                                      STOP_WORDS, ignore_absences=True))

    term_freq_df = corpus.get_term_freq_df()
    term_freq_df['highratingscore'] = corpus.get_scaled_f_scores(
        '5.0 star rating')
    term_freq_df['poorratingscore'] = corpus.get_scaled_f_scores(
        '1.0 star rating')
    # term_freq_df = term_freq_df[term_freq_df['1.0 star rating freq'] > 3]

    dp = term_freq_df.sort_values(by='poorratingscore', ascending=False)
    dp = dp[~dp.index.str.contains('-')]
    dp = dp[~dp.index.str.contains("'")]
    dp = dp[~dp.index.str.contains('/')]

    dh = term_freq_df.sort_values(by='highratingscore', ascending=False)
    dh = dh[~dh.index.str.contains('-')]
    dh = dh[~dh.index.str.contains("'")]
    dh = dh[~dh.index.str.contains('/')]

    dhi = dh.head(75)
    dpo = dh.tail(75)
    dfinal = pd.concat([dhi, dpo])
    # dh = dh.reset_index(drop=False)
    # return dh.to_dict('index')
    return dfinal.to_dict('index')
# Read data from URL
url = "https://raw.githubusercontent.com/LambdaSchool/DS-Unit-4-Sprint-1-NLP/master/module1-text-data/data/yelp_coffeeshop_review_data.csv"
shops = pd.read_csv(url)

# Clean up data: the first whitespace-delimited chunk is the date, the rest is the review text
shops['date'] = shops['full_review_text'].apply(lambda x: x.split()[0])
shops['review'] = shops['full_review_text'].apply(
    lambda x: " ".join(x.split()[1:]))

# Tokenizer
STOP_WORDS = nlp.Defaults.stop_words.union(
    ["it's", '1', "i'm", "i've", 'place', "-"])
tokenizer = Tokenizer(nlp.vocab)

# Tokens w/o stopwords
tokens = []
for doc in tokenizer.pipe(shops['full_review_text'], batch_size=500):
    doc_tokens = []
    for token in doc:
        if (token.text.lower() not in STOP_WORDS) & (token.is_punct == False):
            doc_tokens.append(token.text.lower())
    tokens.append(doc_tokens)
shops['tokens'] = tokens

# View counts by rating: bucket star ratings into good (4-5 stars) and bad (1-3 stars)
shops.loc[(shops.star_rating == ' 5.0 star rating ') |
          (shops.star_rating == ' 4.0 star rating '), 'rating'] = 'good'
shops.loc[(shops.star_rating == ' 3.0 star rating ') |
          (shops.star_rating == ' 2.0 star rating ') |
          (shops.star_rating == ' 1.0 star rating '), 'rating'] = 'bad'
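# With the good/bad labels in place, token counts per rating can be compared; a small
# hedged sketch using collections.Counter (column names follow the code above).
from collections import Counter

good_counts = Counter(t for toks in shops.loc[shops['rating'] == 'good', 'tokens'] for t in toks)
bad_counts = Counter(t for toks in shops.loc[shops['rating'] == 'bad', 'tokens'] for t in toks)
print(good_counts.most_common(10))
print(bad_counts.most_common(10))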
from sklearn.neighbors import NearestNeighbors
from sklearn.feature_extraction.text import TfidfVectorizer

# Instantiate nlp with the pretrained statistical model for the English language
# (first install it with "python -m spacy download en_core_web_lg")
nlp = spacy.load("en_core_web_lg")

df.head()

# The Tokenizer
tokenizer = Tokenizer(nlp.vocab)

# Make the tokens for description
description_tokens = []
for txt in tokenizer.pipe(df['Description'], batch_size=500):
    txt_tokens = [token.text for token in txt]
    description_tokens.append(txt_tokens)
df['description_tokens'] = description_tokens
# print(df['description_tokens'].head())

# Make the tokens for flavor
flavor_tokens = []
for txt in tokenizer.pipe(df['Flavor'], batch_size=500):
    txt_tokens = [token.text for token in txt]
    flavor_tokens.append(txt_tokens)
df['flavor_tokens'] = flavor_tokens
# print(df['flavor_tokens'].head())

# Make the tokens for effects