def get_list_tuples(read_file):
    stop = set(stopwords.words('english'))
    list_tuples = []
    with open(read_file, 'r') as r:
        reader = csv.reader(r, delimiter=',')
        x = 0
        for line in reader:
            #tabsep = line.strip().split('\r')
            #tabsep = line.strip().split('')
            msg = TextBlob(line[1])
            msg.ngrams(n=2)  ######################### BI-Grams #######################
            try:
                words = msg.words
            except:
                continue
            for word in words:
                #if word not in stopwords.words() and not word.isdigit():
                if word not in stop and not word.isdigit():
                    word = word.lower()
                    #list_tuples.append((word.lower(),line[0]))
                    list_tuples.append((stemmer.stem(word), line[0]))
                    #list_tuples.append((lmtzr.lemmatize(word),line[0]))
            x += 1
    return list_tuples
def _get_detailed_stats(no_code_text):
    """
    Returns detailed stats on text
    :param no_code_text: String to analyse
    :return: list of details
    """
    results = []
    group_by = 'Detailed Text Statistics'
    tb = TextBlob(no_code_text)
    # Spell check here...it's very slow
    results.append(TextFeature('Number of sentences', textstat.sentence_count(no_code_text), group_by))
    results.append(TextFeature('Number of sentences (again)', len(tb.sentences), group_by))
    results.append(TextFeature('Number of words', len(tb.words), group_by))
    results.append(TextFeature('Sentiment Polarity', tb.sentiment.polarity, group_by))
    results.append(TextFeature('Sentiment Subjectivity', tb.sentiment.subjectivity, group_by))
    results.append(TextFeature('Detected Language', tb.detect_language(), group_by))
    results.append(TextFeature('Number of important phrases', len(tb.noun_phrases), group_by))
    results.append(TextFeature('Number of word bi-grams', len(tb.ngrams(2)), group_by))
    results.append(TextFeature('Number of word tri-grams', len(tb.ngrams(3)), group_by))
    results.append(TextFeature('Number of word 4-grams', len(tb.ngrams(4)), group_by))
    return results
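# A minimal usage sketch for _get_detailed_stats() above, not part of the original
# module. It assumes the textblob and textstat imports are in scope and stands in a
# hypothetical namedtuple for the module's real TextFeature type. Note that
# TextBlob.detect_language() relies on an online service and has been deprecated or
# removed in recent textblob releases, so that line may need to be dropped.
from collections import namedtuple
TextFeature = namedtuple('TextFeature', ['name', 'value', 'group'])  # hypothetical stand-in

stats = _get_detailed_stats("TextBlob makes text processing simple. It is built on NLTK.")
for feature in stats:
    print(feature.name, feature.value)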
def get_blob_messages(message):
    blob = TextBlob(message)
    if len(blob.words) > 1:
        messages = blob.ngrams(n=2) + blob.ngrams(n=3)
    else:
        messages = blob.ngrams(n=1)
    return messages
def topwords(renoted_id):
    default_stopwords = set(nltk.corpus.stopwords.words('english'))
    # We're adding some on our own - could be done inline like this...
    # ... but let's read them from a file instead (one stopword per line, UTF-8)
    #stopwords_file = './stopwords.txt'
    #custom_stopwords = set(codecs.open(stopwords_file, 'r', 'utf-8').read().splitlines())
    #all_stopwords = default_stopwords | custom_stopwords
    all_stopwords = default_stopwords

    filepath = "alldocs/processed/" + renoted_id + ".txt"
    fp = codecs.open(filepath, 'r', 'utf-8')
    text = fp.read()  # read once and reuse; a second fp.read() would return an empty string
    wiki = TextBlob(text)
    print wiki.noun_phrases
    print wiki.ngrams(n=2)

    words = nltk.word_tokenize(text)
    # Remove single-character tokens (mostly punctuation)
    words = [word for word in words if len(word) > 1]
    # Remove numbers
    words = [word for word in words if not word.isnumeric()]
    # Lowercase all words (default_stopwords are lowercase too)
    words = [word.lower() for word in words]
    # Stemming words seems to make matters worse, disabled
    # stemmer = nltk.stem.snowball.SnowballStemmer('german')
    # words = [stemmer.stem(word) for word in words]
    # Remove stopwords
    words = [word for word in words if word not in all_stopwords]
    ### lemmatize words
    lmtzr = WordNetLemmatizer()
    # port = PorterStemmer()
    words = [lmtzr.lemmatize(word) for word in words]
    # words = [port.stem(word) for word in words]
    # Calculate frequency distribution
    fdist = nltk.FreqDist(words)
    retdict = []
    topwords = {}
    # Output top 50 words
    for word, frequency in fdist.most_common(50):
        if len(word) > 3:
            value = {"word": word, "frequency": frequency}
            retdict.append(value)
    topwords["topwords"] = retdict
    return topwords
def get_joke_location(caption, full=False):
    """
    Parameters
    ----------
    caption : str

    Returns
    -------
    stats : dict
        Dictionary with describing stats. Includes key ``joke_location in range(4)``
        which describes the quarter the joke is in.
    """
    blob = TextBlob(caption)
    ngrams = [ngram for ngram in blob.ngrams(n=4)]
    if len(ngrams) <= 2:
        ngrams = [ngram for ngram in blob.ngrams(n=2)]
    if len(ngrams) == 0:
        return {}
    perplexities = [perplexity(" ".join(ngram)) for ngram in ngrams]
    perplexities = np.array(perplexities)
    idx = np.argmin(perplexities)
    # Between 1 and 4
    phrase = " ".join(ngrams[idx])
    word = str(ngrams[idx][-1])
    joke_words = 1
    # idx += 3  # because 4-gram
    frac = idx / (len(ngrams))
    joke_quarter = int(frac * 4) + 1
    doc = nlp(caption, disable=["tagger", "ner", "entityrecognizer", "textcat"])
    noun_phrases = [str(x) for x in doc.noun_chunks]
    if any(word in bnp for bnp in noun_phrases):
        idx = [i for i, bnp in enumerate(noun_phrases) if word in bnp][0]
        joke_words = len(noun_phrases[idx].split(" "))
        phrase = noun_phrases[idx]
    kwargs = ({
        "word": "".join(word),
        "phrase": phrase,
        "noun_phrases": noun_phrases
    } if full else {})
    return {
        "joke_quarter": joke_quarter,
        "joke_words": joke_words,
        "min_perplexity": perplexities.min().item(),
        "max_perplexity": perplexities.max().item(),
        "mean_perplexity": perplexities.mean().item(),
        "median_perplexity": np.median(perplexities).item(),
        **kwargs,
    }
def headingsdata(tagtype, soup):
    for element in soup.select(tagtype):
        value = {}
        text = element.get_text()
        lines = (line.strip() for line in text.splitlines())
        chunks = (phrase.strip() for line in lines for phrase in line.split(" "))
        text = '\n'.join(chunk for chunk in chunks if chunk)
        wiki = TextBlob(text)
        value["partofspeech"] = wiki.tags
        value["bigrams"] = wiki.ngrams(n=2)
        value["trigrams"] = wiki.ngrams(n=3)
    value1 = {}
    value1[tagtype] = value
    return value1
def check_speech_patterns(text):
    PATTERNS = {
        ("PRP", "DT"),
        ("CC", "VBD"),
        ("VB", "RB"),
        ("VB", "PRP$"),
        ("NN", "POS"),
        ("NN", "MD", "VB"),
        ("VB", "PRP$", "NN"),
        ("MD", "VB", "VBN"),
        ("NN", "IN", "PRP$"),
        ("IN", "PRP$", "JJ"),
        ("VB", "PRP", "DT", "NN"),
        ("VBD", "RB", "JJ", "NNS"),
        ("NNP", "NNP", "NNP", "NNP"),
        ("PRP$", "NN", "CC", "PRP"),
        ("NNP", "NNP", "NNP", "NNP", "NNP"),
        ("NN", "IN", "DT", "NNS", "IN"),
        ("PRP$", "NN", "IN", "DT", "NN"),
        ("IN", "DT", "NN", "WDT", "VBZ"),
        ("NN", "IN", "PRP$", "JJ", "NN"),
        ("DT", "NN", "IN", "NN", "NN"),
    }
    blob = TextBlob(text)
    for i in range(2, 6):
        ngrams = blob.ngrams(n=i)
        for gram in ngrams:
            str_gram = " ".join(gram)
            gram_blob = TextBlob(str_gram)
            tags = gram_blob.tags
            lst1, lst2 = zip(*tags)
            if lst2 in PATTERNS:
                return True
    return False
def main():
    """."""
    args = parse_arguments()
    # read the string from the playbook to get the actual value of the argument
    string = tcex.playbook.read(args.string)
    n_gram_number = int(tcex.playbook.read(args.n_gram))
    tcex.log.info('String value: {}'.format(string))
    tcex.log.info('n-gram number: {}'.format(n_gram_number))
    blob = TextBlob(string)

    tags = dict()
    for tag in blob.tags:
        tags[tag[0]] = tag[1]

    tcex.playbook.create_output('json', blob.json)
    tcex.playbook.create_output('nGrams', [str(n_gram) for n_gram in blob.ngrams(n=n_gram_number)])
    tcex.playbook.create_output('nounPhrases', blob.noun_phrases)
    tcex.playbook.create_output('npCounts', blob.np_counts[1])
    tcex.playbook.create_output('polarity', blob.polarity)
    tcex.playbook.create_output('sentences', [str(sentence) for sentence in blob.sentences])
    tcex.playbook.create_output('subjectivity', blob.subjectivity)
    tcex.playbook.create_output('tags', tags)
    tcex.playbook.create_output('tokens', blob.tokens)
    tcex.playbook.create_output('wordCounts', blob.word_counts[1])
    tcex.playbook.create_output('words', blob.words)
    tcex.exit(0)
def sentiment_pattern(text, gram_n=6):
    blob = TextBlob(text)
    ngrams = blob.ngrams(n=gram_n)
    sentiment_list = []
    datalist = []
    for gram in ngrams:
        str_gram = " ".join(gram)
        print str_gram
        data = (0, 0, str_gram, None)
        datalist.append(Datapoint(*data))
        #gram_blob=TextBlob(str_gram)
        #sentiment=gram_blob.sentiment[0]
        #if sentiment>0:
        #    sentiment=1
        #elif sentiment<0:
        #    sentiment=-1
        #sentiment_list.append(sentiment)
    predictor = pickle.load(open("predictor.pickle", "rb"))
    prediction = predictor.predict(datalist)
    for sentiment in prediction:
        sentiment = int(sentiment)
        if sentiment < 2:
            sentiment_list.append(-1)
        if sentiment == 2:
            sentiment_list.append(0)
        if sentiment > 2:
            sentiment_list.append(1)
    print sentiment_list
    return sentiment_list
def get_tupels(text):
    lower = text.lower()
    blob = TextBlob(lower)
    ngrams = blob.ngrams(n=2)
    # assumption: don't is two words (do n't), as in "do not"
    # this can be easily changed by modifying the tokenizer
    # http://stackoverflow.com/questions/30550411
    tuples = map(tuple, map(tuple, ngrams))
    return tuples
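# A small usage sketch for get_tupels() above (assumes `from textblob import TextBlob`
# is in scope, as in the original module). With TextBlob's default tokenizer, "don't"
# is split into "do" / "n't", which is exactly the behaviour the comment above describes.
bigrams = list(get_tupels("Don't stop now"))
print(bigrams)  # expected roughly: [('do', "n't"), ("n't", 'stop'), ('stop', 'now')]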
def build_ngrams(text, language="en"):
    blob = TextBlob(lower(text, language))
    ngrams = [blob.ngrams(n=n) for n in (3, 2, 1)]
    wordlists = reduce(operator.add, ngrams)
    tokenized = (tokenize(wordlist, language, stem=True) for wordlist in wordlists)
    pure = (tokenize(wordlist, language, stem=False) for wordlist in wordlists)
    return itertools.chain(tokenized, pure)
def get_ngrams(sent_annotations, is_test):
    # tokens = [w.token for w in sent_annotations]
    tokens = [w.lemma for w in sent_annotations]
    sentence = ' '.join(tokens).decode('utf-8', 'ignore')
    blob = TextBlob(sentence)
    unigrams = tokens
    bigrams = blob.ngrams(n=2)
    trigrams = blob.ngrams(n=3)
    unigram_dict = defaultdict(int)
    bigram_dict = defaultdict(int)
    trigram_dict = defaultdict(int)
    for unigram in unigrams:
        unigram_dict[unigram] = 1
    for bigram in bigrams:
        bigram_dict['_'.join(bigram)] = 1
    for trigram in trigrams:
        trigram_dict['_'.join(trigram)] = 1
    return unigram_dict, bigram_dict, trigram_dict
def generate_ngram(self, text, max_ngram):
    result = []
    word = ''
    for i in range(1, max_ngram):
        blob = TextBlob(text)
        ngram_var = blob.ngrams(n=i)
        word = ' '.join(ngram_var[0])
        result.append(word)
    return result
def Ngrams(strSeq, n=2):
    strSeq_blob = TextBlob(" ".join([word for word in strSeq]))
    seq_grams = strSeq_blob.ngrams(n)
    grammed_words = ["".join([w for w in sentence]) for sentence in seq_grams]
    # grammed_strSeq = [strSeq[i] + strSeq[i+1] for i in range(0, len(strSeq)-1)]
    # print(grammed_strSeq)
    return seq_grams, grammed_words
def sentiment_reviews(reviews, gram_n=5, predictor=None):
    datalist = []
    tag = []
    counttag = [0] * len(reviews)
    for (i, review) in enumerate(reviews):
        blob = TextBlob(review)
        ngrams = blob.ngrams(n=min(gram_n, len(blob.words)))
        for gram in ngrams:
            str_gram = " ".join(gram)
            data = (0, 0, str_gram, None)
            datalist.append(Datapoint(*data))
            tag.append(i)
            counttag[i] += 1
    print "start prediction"
    prediction = predictor.predict(datalist)
    cstm = [[0] * 5 for x in reviews]
    for (i, sentiment) in enumerate(prediction):
        sentiment = int(sentiment)
        cstm[tag[i]][sentiment] += 1.0 / counttag[tag[i]]
    trating = 0.0
    tcount = 0.0
    for i in range(len(reviews)):
        if counttag[i] == 0:
            continue
        cstm[i][2] = cstm[i][2] / math.pow(counttag[i], 0.44)
        cstm[i][0] = cstm[i][0] * math.pow(counttag[i], 0.22)
        cstm[i][3] = cstm[i][3] * math.pow(counttag[i], 0.22)
        rating = 0.0
        count = 0.0
        for j in range(5):
            rating += (j + 1) * cstm[i][j]
            count += cstm[i][j]
        print cstm[i], " ", counttag[i]
        t = 1 / (1 + math.exp(-(cstm[i][2] / count - 0.45) * 15))
        print cstm[i][2] / count
        trating += rating / count * (1 - t)
        tcount += 1 - t
    trating = trating / tcount
    if trating > 3:
        x = trating - 3
        x = math.pow(x, 0.4647) * 1.4492
        return x + 3
    else:
        x = 3 - trating
        x = math.pow(x, 0.4647) * 1.4492
        return 3 - x
def concept_extr(temp):
    temp1 = TextBlob(temp)
    sample = []
    for i in range(5):
        for ngram in temp1.ngrams(i):
            sample.append(" ".join(ngram))
            sample.append(" ".join(ngram.lemmatize()))
    sample = list(dict.fromkeys(sample))
    return (sample)
def GetBigrams(text):
    blob = TextBlob(text)
    WordLists = blob.ngrams(n=2)
    Bigrams = []
    for wordlist in WordLists:
        cstr = ''
        for word in wordlist:
            cstr = cstr + word + "_"
        Bigrams.append(cstr)
    return Bigrams
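# A brief usage sketch for GetBigrams() above (assumes `from textblob import TextBlob`
# is in scope, as in the original module). Each bigram is joined with trailing
# underscores, which is how the original code builds its feature strings.
print(GetBigrams("Now is better than never"))
# expected roughly: ['Now_is_', 'is_better_', 'better_than_', 'than_never_']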
def get_ngrams(string, size=3):
    blob = TextBlob(string)
    sentences = []
    ngrams = blob.ngrams(n=size)
    for ngram in ngrams:
        sentences.append([x for x in ngram])
    return ngrams
def GetBigrams(text):
    blob = TextBlob(text)
    WordLists = blob.ngrams(n=2)
    Bigrams = []
    for wordlist in WordLists:
        cstr = ''
        for word in wordlist:
            cstr = cstr + word + "_"
        Bigrams.append(cstr)
    return Bigrams
def build_ngrams(text, language='en'):
    blob = TextBlob(lower(text, language))
    ngrams = [blob.ngrams(n=n) for n in (3, 2, 1)]
    wordlists = reduce(operator.add, ngrams)
    tokenized = (tokenize(wordlist, language, stem=True) for wordlist in wordlists)
    pure = (tokenize(wordlist, language, stem=False) for wordlist in wordlists)
    return itertools.chain(tokenized, pure)
def get_ngrams(sent_annotations, is_test):
    tokens = [w.token for w in sent_annotations]
    sentence = ' '.join(tokens).decode('utf-8', 'ignore')
    blob = TextBlob(sentence)
    unigrams = tokens
    bigrams = blob.ngrams(n=2)
    trigrams = blob.ngrams(n=3)
    unigram_dict = defaultdict(int)
    bigram_dict = defaultdict(int)
    trigram_dict = defaultdict(int)
    global UNIGRAM_DICT, BIGRAM_DICT, TRIGRAM_DICT
    for unigram in unigrams:
        unigram_dict[unigram] = 1
        UNIGRAM_DICT[unigram] += 1
    for bigram in bigrams:
        bigram_dict['_'.join(bigram)] = 1
        BIGRAM_DICT['_'.join(bigram)] += 1
    for trigram in trigrams:
        trigram_dict['_'.join(trigram)] = 1
        TRIGRAM_DICT['_'.join(trigram)] += 1
    return unigram_dict, bigram_dict, trigram_dict
def _get_detailed_stats(no_code_text):
    """
    Returns detailed stats on text
    :param no_code_text: String to analyse
    :return: list of details
    """
    results = []
    group_by = 'Detailed Text Statistics'
    tb = TextBlob(no_code_text)
    # Spell check here...it's very slow
    results.append(TextFeature('Number of sentences', textstat.sentence_count(no_code_text), group_by))
    results.append(TextFeature('Number of sentences (again)', len(tb.sentences), group_by))
    results.append(TextFeature('Number of words', len(tb.words), group_by))
    results.append(TextFeature('Sentiment Polarity', tb.sentiment.polarity, group_by))
    results.append(TextFeature('Sentiment Subjectivity', tb.sentiment.subjectivity, group_by))
    results.append(TextFeature('Detected Language', tb.detect_language(), group_by))
    results.append(TextFeature('Number of important phrases', len(tb.noun_phrases), group_by))
    results.append(TextFeature('Number of word bi-grams', len(tb.ngrams(2)), group_by))
    results.append(TextFeature('Number of word tri-grams', len(tb.ngrams(3)), group_by))
    results.append(TextFeature('Number of word 4-grams', len(tb.ngrams(4)), group_by))
    return results
def extract_trigrams(client):
    documents = client['cornell']['documents']
    for doc in documents.find():
        blob = TextBlob(doc['text'])
        valid_trigrams = []
        for s in blob.sentences:
            sentence = TextBlob(s.dict['raw'])
            sentence = TextBlob(sentence.parse())
            trigrams = sentence.ngrams(n=3)
            valid_trigrams = valid_trigrams + get_valid_trigrams(trigrams)
        documents.update({'name': doc['name']}, {'$set': {'trigrams': valid_trigrams}})
def n_gram(tweets, n, stop_words):
    """
    Produces a list of highest frequency N-grams along with their count

    Arguments:
        tweets (DataFrame): tweets DataFrame for a certain candidate
        n (int): number of words for n gram
        stop_words (set): set of words that are not used for making the N-gram

    Returns:
        max_grams (list): list of highest frequency N-grams
    """
    assert isinstance(tweets, pd.DataFrame)
    assert isinstance(n, int)
    assert isinstance(stop_words, set)

    tweet_list = ""
    for tweet in tweets.loc[:, 'text']:
        tweet_list += tweet
        tweet_list += " "
    tweet_list = remove_punctuation(tweet_list)
    tweets_analysis = TextBlob(tweet_list)
    grams = tweets_analysis.ngrams(n=n)
    gram_counter = collections.Counter()
    for words in grams:
        words_list = list(words)
        word = " ".join(words_list)
        gram_counter[word] += 1
    gram_dict = dict(gram_counter)
    num_grams = 100
    max_grams_list = []
    for i in range(num_grams):
        current_max = max(gram_dict.items(), key=operator.itemgetter(1))
        max_grams_list.append(current_max)
        del gram_dict[current_max[0]]
    max_grams = copy.deepcopy(max_grams_list)
    for gram in max_grams_list:
        words = gram[0].split(" ")
        for word in words:
            if word in stop_words and gram in max_grams:
                max_grams.remove(gram)
    return max_grams
def getNGram(text, n, polarity):
    key_words = []
    text = TextBlob(text)
    for word in text.ngrams(n):
        word = ' '.join(word)
        wordBlob = TextBlob(word)
        if wordBlob.sentiment.polarity > 0.5 and polarity > 0.2:
            if str(wordBlob) not in key_words:
                key_words.append(str(wordBlob))
        if wordBlob.sentiment.polarity < -0.5 and polarity < -0.2:
            if str(wordBlob) not in key_words:
                key_words.append(str(wordBlob))
    return key_words
def analyze_sentiment(filename, verbose=False):
    """Performs sentiment analysis using textblob.

    filename -- txt file containing story transcript (first line is assumed to be author's name)
    """
    with open(filename, "r") as f:
        author = f.readline().rstrip()
        transcript = TextBlob(f.read())

    # Keep a list of polarity and subjectivity
    polx, poly = [], []
    suby = []

    num_words = len(transcript.words)  # Number of words in transcript
    num_windows = 1000  # Number of windows to analyze sentiment from
    seg_length = num_words - num_windows

    # Print out variables
    if verbose:
        print(f"""Number of Words: {num_words} \n Segment Length: {seg_length} \n Number of Windows: {num_windows}""")

    # Conduct the sentiment analysis!! We do this according to Reagan's method of gathering all the
    # words in a sliding window of the text. Each window is analyzed as a whole for sentiment.
    print(f"Conducting SC Analysis on {author}...")

    # Let's keep track of how long this takes; bigger files may become a problem
    startTime = time.time()

    # We use the TextBlob ngrams() function to retrieve all the possible windows of our
    # specified segLength in the transcript
    for index, window in enumerate(transcript.ngrams(seg_length)):
        poly.append(TextBlob(" ".join(window)).sentiment.polarity)
        polx.append(len(poly))
        suby.append(TextBlob(" ".join(window)).sentiment.subjectivity)
        # Report progress during analysis
        if verbose:
            if index % 250 == 0:
                print("Finished window {}/{} [{:.2f}%]".format(index, num_windows, index / num_windows))

    # storing as np array is easier to write to csv
    results = np.stack([polx, poly, suby]).transpose()

    # Report how long the analysis took
    print("SC Analysis Runtime:", round(time.time() - startTime, 2), "seconds")
    return author, results
def check_for_name(text):
    b = TextBlob(text)
    pairs = b.ngrams(n=2)
    for pair in pairs:
        composed_sentence = ''
        for word in pair:
            composed_sentence += word.lower()
            composed_sentence += ' '
        composed_sentence_trimmed = composed_sentence.strip()
        if composed_sentence_trimmed == 'my name':
            for word, part_of_speech in b.pos_tags:
                if part_of_speech == 'NNP':
                    return 0.91, 'Hello {0}'.format(word), 'giggling'
    return 0, None, None
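# A hedged usage sketch for check_for_name() above (assumes the textblob import from
# the original module). The exact POS tags depend on TextBlob's default tagger, so the
# greeting below is the typical outcome for a capitalised name, not a guarantee.
score, reply, mood = check_for_name("Hi, my name is Alice")
print(score, reply, mood)  # typically: 0.91 Hello Alice giggling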
def check_for_suicide(text):
    b = TextBlob(text.replace("'", ""))
    # iterating through n-grams of 2,3,4,5,6,7,8,9
    for (key, value) in NGRAM_DICT.items():
        ngrams = b.ngrams(n=value)
        for ngram in ngrams:
            composed_sentence = ''
            for word in ngram:
                composed_sentence += word.lower()
                composed_sentence += ' '
            composed_sentence_trimmed = composed_sentence.strip()
            if composed_sentence_trimmed in eval(key):
                return 1, EMERGENCY, 'afraid'
    return 0, None, None
def getNGrams(text, n):
    blob = TextBlob(text)
    listofBlobs = blob.ngrams(n)
    listofBigrams = []
    for wordList in listofBlobs:
        flag = True
        for item in wordList:
            if flag:
                bigram = unicode(item)
                flag = False
            else:
                bigram = bigram + " " + unicode(item)
        # print type(bigram)
        listofBigrams.append(bigram)
    return listofBigrams
def ngram(text, gram=2):
    '''Convert text to n-grams

    Args:
        text(str): Text to convert in n-gram
        gram(int): Number of n-gram

    Results:
        __gram_model(list(list)): return list of list depends on n-gram input

    Defaults:
        gram = 2
    '''
    __analyzer = TextBlob(str(text))
    return __analyzer.ngrams(n=gram)
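# A small usage sketch for ngram() above (assumes `from textblob import TextBlob` is in
# scope, as in the original module). Each element of the result is a WordList of `gram`
# consecutive words.
bigrams = ngram("Now is better than never", gram=2)
print(bigrams)  # e.g. [WordList(['Now', 'is']), WordList(['is', 'better']), ...]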
def encode(self, text):
    blob = TextBlob(text)
    ngram = self.args.ngram
    out_parts = []
    for n in range(1, ngram + 1):
        ng_vec_all = []
        # collect all ngram vectors
        for ng in blob.ngrams(n=n):
            ng_vec = np.ones(self.model.vector_size)
            for tok in ng:
                ng_vec *= self._get_vector(tok)
            ng_vec_all.append(self._normalize_vector(ng_vec))
        # normalize ngram vectors
        ng_vec_all = self._normalize_vector(np.sum(ng_vec_all, axis=0))
        out_parts.append(ng_vec_all)
    return np.concatenate(out_parts, axis=None)
def check_problematic(text):
    blob = TextBlob(text)
    subjective = blob.sentiment.subjectivity
    polarity = blob.sentiment.polarity
    if subjective > 0.33 or polarity < 0:
        for word in blob.words:
            for word1 in problem_words:
                if word.lower() == word1:
                    if word1 == 'criminal' or word1 == 'felon' or word1 == 'criminals':
                        if check_noun(blob, word1):
                            return True, word1
                    else:
                        return True, word1
        for digram in blob.ngrams(2):
            for digram1 in problem_digrams:
                if digram[0].lower() == digram1[0] and digram[1].lower() == digram1[1]:
                    return True, digram1
    return False, None
def get_grams(comment, n=2, keep_emoji_words=False):
    '''
    Returns n-grams for a sentence, optionally cleaning the string in the process.

    Parameters:
    -----------
    comment: str: sentence for which n-grams will be made
    n: int: number of tokens to include in n-gram
    keep_emoji_words: bool: whether emoji will be removed or substituted with text descriptions

    Returns:
    --------
    list: list of n-grams
    '''
    blob = TextBlob(clean_text(comment, keep_emoji_words=keep_emoji_words))
    return list([' '.join(wordlist) for wordlist in blob.ngrams(n)])
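# A minimal usage sketch for get_grams() above. `clean_text` is defined elsewhere in
# the original module; it is stubbed here as a pass-through purely so the call can be
# exercised in isolation.
def clean_text(comment, keep_emoji_words=False):
    return comment  # hypothetical stand-in for the real cleaner

print(get_grams("textblob makes ngrams easy", n=2))
# expected roughly: ['textblob makes', 'makes ngrams', 'ngrams easy']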
def names_ext(sentence):
    '''
    Extracts Names using first_name_search and last_name_search
    '''
    sentence = TextBlob(sentence)
    possible_names = sentence.noun_phrases
    print "NOUN PHRASES: ", possible_names
    sentence = sentence.ngrams(n=2)
    names = []
    female_first = open('./Names_db/Females_Firsts.txt').read().strip().split("\n")
    male_first = open('./Names_db/Males_Firsts.txt').read().strip().split("\n")
    all_last = open('./Names_db/Last_Namess.txt').read().strip().split("\n")
    for phrases in sentence:
        female_names = first_name_search(phrases[0], female_first)
        male_names = first_name_search(phrases[0], male_first)
        last_names = last_name_search(phrases[1], all_last)
        if female_names and male_names and last_names != "None":
            print female_names
            print male_names
            print last_names, "\n"
    return "None"
def dict_unique_words(str_list, n):
    c = Counter()
    for i in str_list:
        blob = TextBlob(i)
        x = list(blob.words.lemmatize())
        phrases = list(blob.noun_phrases)
        for phr in phrases:
            if phr not in x:
                phrase = ' '.join(phr)
                x.append(phrase)
        ngrams = blob.ngrams(n)
        for gr in ngrams:
            if gr not in x:
                ngram = ' '.join(gr)
                x.append(ngram)
        for w in x:
            c[w] += 1
    unique_words = dict(c)
    return unique_words
def blogWords(self):
    regex1 = '[^a-zA-Z0-9-/]'
    regex2 = '[^a-zA-Z0-9-\'\"/]'
    filename = 'blogwords.txt'
    i = 0
    textblob = TextBlob(" ".join(self.listOfWords))
    # load blog words text file
    blogWords_file = open(filename, 'r')
    # line represents a blog word
    for line in blogWords_file:
        # Remove non-alphanumeric characters in sequence
        line = re.sub(regex2, ' ', line)
        # array of words in line
        lineArray = [x.lower() for x in line.split()]
        # entry represents an n-gram instance of the input text
        for entry in textblob.ngrams(n=len(lineArray)):
            entry = [re.sub(regex1, '', x).lower() for x in entry]
            if lineArray == entry:
                i += 1
    return i
def sentiment_pattern(text, gram_n=6, predictor=None):
    blob = TextBlob(text)
    ngrams = blob.ngrams(n=gram_n)
    sentiment_list = []
    datalist = []
    for gram in ngrams:
        str_gram = " ".join(gram)
        data = (0, 0, str_gram, None)
        datalist.append(Datapoint(*data))
    prediction = predictor.predict(datalist)
    for sentiment in prediction:
        sentiment = int(sentiment)
        if sentiment < 2:
            sentiment_list.append(-1)
        if sentiment == 2:
            sentiment_list.append(0)
        if sentiment > 2:
            sentiment_list.append(1)
    return sentiment_list
def update_reviews(attr, old, new):
    data_set = pd.read_csv('Outputs/data_set.csv')
    i = int(star_rating.value)
    #output_review_list = extract_ngrams(str(data_set[(data_set['Star_count'] == i)]['Review'].values),2,3)
    data_set_blob = data_set.copy()
    data_set_blob['Noun_sentences'] = data_set_blob['Review'].apply(lambda x: get_nouns(x))
    n_gram_blob = TextBlob(str(data_set_blob[(data_set_blob['Star_count'] == i)]['Noun_sentences'].values))

    # Styling the paragraph element
    text1 = Paragraph(style={'font-variant': 'small-caps', 'font-family': "Tahoma"})
    text1.text = ""
    #review1 = text_cleaner(str(n_gram_blob.ngrams(1)[0]))
    #review2 = text_cleaner(str(n_gram_blob.ngrams(1)[1]))
    review1 = text_cleaner(n_gram_blob.ngrams(1)[1][0])
    review2 = text_cleaner(n_gram_blob.ngrams(1)[2][0])
    text1.text = "Top " + str(i) + " star reviews feel: " + review1 + ", followed by " + review2
    curdoc().add_root(Row(text1))
def index():
    response.content_type = 'text/text; charset=utf-8'
    ret = 'Hi there, I\'m process {0}!\n\n'.format(os.getpid())

    sentence = 'Now is better than never.'
    ret += 'Testing TextBlob ngram (n=3) with sentence: \n "{0}" \n'.format(sentence)
    blob = TextBlob(sentence)
    for word_list in blob.ngrams(n=3):
        ret += (' '.join(word_list) + '\n')

    data = pd.DataFrame({'A': np.random.randn(3), 'B': np.random.randn(3)})
    func = "pd.DataFrame({'A': np.random.randn(3), 'B': np.random.randn(3)})"
    ret += '\nTesting Numpy and Pandas with command: \n {0} \n{1} \n'.format(func, data.to_json())

    ret += '\nCode at: \n https://github.com/alyssaq/bottle-heroku-skeleton \n'
    ret += '\nEnvironment vars:\n'
    for k, v in env.iteritems():
        if 'bottle.' in k:
            continue
        ret += '%s=%s\n' % (k, v)
    return ret
attack_text = """ A drone attack that failed to kill President Nicolás Maduro of Venezuela unfolded on live TV and in front of many witnesses """ attack_blob = TextBlob(attack_text) print(attack_blob.noun_phrases) print(attack_blob.words) # toNote: pluralize & singularize! print(attack_blob.words.singularize()) print(attack_blob.words.pluralize()) print(attack_blob.word_counts['of']) print(attack_blob.ngrams(n=2)) print(attack_blob.ngrams(n=4)) from textblob import Word for word in attack_blob.words: print(Word(word).correct() == word) #%% Example from https://www.analyticsvidhya.com/blog/2018/02/natural-language-processing-for-beginners-using-textblob/ av_blob = TextBlob("Analytics Vidhya is a great platform to learn data science. \n It helps community through blogs, hackathons, discussions,etc.") print(av_blob.tokenize()) print(av_blob.sentences, av_blob.sentences[0]) for phrase in av_blob.noun_phrases: print(phrase) # analytics vidhya; great platform; data science # toNote: part-of-speech tagging
print(monty.words.count('ekki', case_sensitive=True))  # specify case sensitivity
print(wiki.noun_phrases.count('python'))

# translation and language detection
# en_blob = TextBlob(u'Simple is better than complex.')
# print(en_blob.translate(to='es'))
# chinese_blob = TextBlob(u"美丽优于丑陋")
# print(chinese_blob.translate(from_lang="zh-CN", to='en'))
# b = TextBlob(u"بسيط هو أفضل من مجمع")
# print(b.detect_language())

# parsing
b = TextBlob("And now for something completely different.")
print(b.parse())

# textblobs are like python strings!
print(zen[0:19])
print(zen.upper())
print(zen.find("Simple"))
apple_blob = TextBlob('apples')
banana_blob = TextBlob('bananas')
print(apple_blob < banana_blob)
print(apple_blob == 'apples')
apple_blob + ' and ' + banana_blob  # TextBlob("apples and bananas")
print("{0} and {1}".format(apple_blob, banana_blob))

# n-grams
blob = TextBlob("Now is better than never.")
print(blob.ngrams(n=3))

# getting start and end indices of sentences
for s in zen.sentences:
    print(s)
    print("---- Starts at index {}, Ends at index {}".format(s.start, s.end))
import pip
#!pip install textblob
#!python -m textblob.download_corpora
from textblob import TextBlob
import numpy as np
import pandas as pd

tx = df.loc[0, 'full_text']
blob = TextBlob(tx)
blob.tags
blob.sentences[0].words
blob.noun_phrases
blob.ngrams(3)
blob.correct()
blob.words[3].spellcheck()
blob.detect_language()
blob.translate(to='ar')

verbs = []
for word, tag in blob.tags:
    if tag == 'VB':
        verbs.append(word.lemmatize())

nouns = []
for word, tag in blob.tags:
    if tag == 'NN':
        nouns.append(word.lemmatize())

nounsp = []
def tweet_content():
    """Generate tweet string (140 characters or less)"""
    # with open('basho.txt', 'r') as content_file:
    #     content = content_file.read()
    r = requests.get("http://novicevagabond.com/projects/haiku/basho.txt")
    content = r.content
    nltk.data.path.append("nltk_data/")
    nltk.data.path.append("nltk_data/punkt")
    nltk.data.path.append("fizzle_dizzle/")
    # nltk.download()
    #print content
    tokenizer = BlanklineTokenizer()
    cleaned_content = content.lower()
    corpus = TextBlob(cleaned_content, tokenizer=tokenizer)
    haiku = corpus.sentences
    #print haiku
    bigrams = corpus.ngrams(n=2)
    trigrams = corpus.ngrams(n=3)
    #print bigrams
    dict = {}
    for bigram in bigrams:
        k = bigram[0]
        v = bigram[1]
        if k in dict:
            if v in dict[k]:
                dict[k][v] = dict[k][v] + 1
            else:
                dict[k][v] = 1
        else:
            dict[k] = {v: 1}
    #print dict

    def weighted_choice(map):
        choices = []
        for k in map:
            #print k
            for n in range(1, map[k] + 1):
                choices.append(k)
        #print choices
        choice = random.choice(choices)
        #print choice
        return choice

    seed = random.choice(dict.keys())
    length = random.randint(11, 15)
    output = [seed]
    #print output
    for i in range(length):
        output.append(weighted_choice(dict[output[i]]))
    whitespace = " "
    line1 = whitespace.join(output[0:4])
    line2 = whitespace.join(output[4:9])
    line3 = whitespace.join(output[9:])
    line4 = "-- #markov_basho_haiku"
    sep = "\n"
    tweet = sep.join([line1, line2, line3, line4])
    # print tweet
    return tweet
def get_ngrams(doc, n):
    blob = TextBlob(doc)
    ngrams = blob.ngrams(n=n)
    return ngrams
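# A short usage note for the wrapper above (assumes `from textblob import TextBlob` is
# in scope). ngrams() yields WordList objects, so they are typically joined back into
# plain strings before counting or writing out.
trigrams = get_ngrams("the quick brown fox jumps", 3)
print([" ".join(g) for g in trigrams])
# expected roughly: ['the quick brown', 'quick brown fox', 'brown fox jumps']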
for i in main_list:
    xy = i[2].split("-")
    if len(xy) > 1:
        #print xy[1]
        a[mystr[k]] = xy[1]
    else:
        a[mystr[k]] = xy[0]
    k = k + 1

myset = Set()
total_words = 0
n1 = b.ngrams(n=1)
for i in n1:
    total_words += 1
    if a[i[0]] == "VP":
        myset.add(i[0])
        #print "this " , i[0]

n2 = b.ngrams(n=2)
for i in n2:
    if i[0] == "ADVP" and i[1] == "VP":
        myset.add(i[0])
        myset.add(i[1])
    elif i[0] == "VP" and i[1] == "ADVP":
# WordLists (A WordList is just a Python list with additional methods.)
animals = TextBlob("cat dog octopus")
print animals.words
print animals.words.pluralize()

# Spelling Correction (Use the correct() method to attempt spelling correction.)
b = TextBlob("I havv goood speling!")
print(b.correct())
w = Word('falibility')
print w.spellcheck()

# Get Word and Noun Phrase Frequencies
monty = TextBlob("We are no longer the Knights who say Ni. "
                 "We are now the Knights who say Ekki ekki ekki PTANG.")
print monty.word_counts['ekki']
# The second way is to use the count() method.
print monty.words.count('ekki')
print monty.words.count('Ekki', case_sensitive=True)

# TextBlobs Are Like Python Strings
print zen.upper()

# You can make comparisons between TextBlobs and strings.
apple_blob = TextBlob('apples')
banana_blob = TextBlob('bananas')
print apple_blob < banana_blob

# You can concatenate and interpolate TextBlobs and strings.
print apple_blob + ' and ' + banana_blob
print "{0} and {1}".format(apple_blob, banana_blob)

# n-grams ( The TextBlob.ngrams() method returns a list of tuples of n successive words. )
blob = TextBlob("Now is better than never.")
print blob.ngrams(n=3)
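# A hedged follow-up, not part of the original tutorial: joining each WordList back
# into a string is a common next step when the n-grams feed a counter or a file.
for gram in blob.ngrams(n=3):
    print " ".join(gram)  # "Now is better", "is better than", "better than never"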
def find_component_match(self, title, body, template_data):
    '''Make a list of matching files for arbitrary text in an issue'''

    # DistributionNotFound: The 'jinja2<2.9' distribution was not found and
    # is required by ansible
    # File
    # "/usr/lib/python2.7/site-packages/ansible/plugins/callback/foreman.py",
    # line 30, in <module>

    STOPWORDS = [u'ansible', u'core', u'plugin']
    STOPCHARS = [u'"', u"'", u'(', u')', u'?', u'*', u'`', u',']
    matches = []

    if u'Traceback (most recent call last)' in body:
        lines = body.split(u'\n')
        for line in lines:
            line = line.strip()
            if line.startswith(u'DistributionNotFound'):
                matches = [u'setup.py']
                break
            elif line.startswith(u'File'):
                fn = line.split()[1]
                for SC in STOPCHARS:
                    fn = fn.replace(SC, u'')
                if u'ansible_module_' in fn:
                    fn = os.path.basename(fn)
                    fn = fn.replace(u'ansible_module_', u'')
                    matches = [fn]
                elif u'cli/playbook.py' in fn:
                    fn = u'lib/ansible/cli/playbook.py'
                elif u'module_utils' in fn:
                    idx = fn.find(u'module_utils/')
                    fn = u'lib/ansible/' + fn[idx:]
                elif u'ansible/' in fn:
                    idx = fn.find(u'ansible/')
                    fn1 = fn[idx:]
                    if u'bin/' in fn1:
                        if not fn1.startswith(u'bin'):
                            idx = fn1.find(u'bin/')
                            fn1 = fn1[idx:]
                            if fn1.endswith(u'.py'):
                                fn1 = fn1.rstrip(u'.py')
                    elif u'cli/' in fn1:
                        idx = fn1.find(u'cli/')
                        fn1 = fn1[idx:]
                        fn1 = u'lib/ansible/' + fn1
                    elif u'lib' not in fn1:
                        fn1 = u'lib/' + fn1
                    if fn1 not in self.files:
                        if C.DEFAULT_BREAKPOINTS:
                            logging.error(u'breakpoint!')
                            import epdb; epdb.st()
    if matches:
        return matches

    craws = template_data.get(u'component_raw')
    if craws is None:
        return matches

    # compare to component mapping
    matches = self._string_to_cmap_key(craws)
    if matches:
        return matches

    # do not re-process the same strings over and over again
    if craws.lower() in self.match_cache:
        return self.match_cache[craws.lower()]

    # make ngrams from largest to smallest and recheck
    blob = TextBlob(craws.lower())
    wordcount = len(blob.tokens) + 1
    for ng_size in reversed(xrange(2, wordcount)):
        ngrams = [u' '.join(x) for x in blob.ngrams(ng_size)]
        for ng in ngrams:
            matches = self._string_to_cmap_key(ng)
            if matches:
                self.match_cache[craws.lower()] = matches
                return matches

    # https://pypi.python.org/pypi/fuzzywuzzy
    matches = []
    for cr in craws.lower().split(u'\n'):
        ratios = []
        for k in self.CMAP.keys():
            ratio = fw_fuzz.ratio(cr, k)
            ratios.append((ratio, k))
        ratios = sorted(ratios, key=lambda tup: tup[0])
        if ratios[-1][0] >= 90:
            cnames = self.CMAP[ratios[-1][1]]
            matches += cnames
    if matches:
        self.match_cache[craws.lower()] = matches
        return matches

    # try to match to repo files
    if craws:
        clines = craws.split(u'\n')
        for craw in clines:
            cparts = craw.replace(u'-', u' ')
            cparts = cparts.split()

            for idx, x in enumerate(cparts):
                for SC in STOPCHARS:
                    if SC in x:
                        x = x.replace(SC, u'')
                for SW in STOPWORDS:
                    if x == SW:
                        x = u''
                if x and u'/' not in x:
                    x = u'/' + x
                cparts[idx] = x

            cparts = [x.strip() for x in cparts if x.strip()]

            for x in cparts:
                for f in self.files:
                    if u'/modules/' in f:
                        continue
                    if u'test/' in f and u'test' not in craw:
                        continue
                    if u'galaxy' in f and u'galaxy' not in body:
                        continue
                    if u'dynamic inv' in body.lower() and u'contrib' not in f:
                        continue
                    if u'inventory' in f and u'inventory' not in body.lower():
                        continue
                    if u'contrib' in f and u'inventory' not in body.lower():
                        continue

                    try:
                        f.endswith(x)
                    except UnicodeDecodeError:
                        continue

                    fname = os.path.basename(f).split(u'.')[0]

                    if f.endswith(x):
                        if fname.lower() in body.lower():
                            matches.append(f)
                            break
                    if f.endswith(x + u'.py'):
                        if fname.lower() in body.lower():
                            matches.append(f)
                            break
                    if f.endswith(x + u'.ps1'):
                        if fname.lower() in body.lower():
                            matches.append(f)
                            break
                    if os.path.dirname(f).endswith(x):
                        if fname.lower() in body.lower():
                            matches.append(f)
                            break

    logging.info(u'%s --> %s' % (craws, sorted(set(matches))))
    self.match_cache[craws.lower()] = matches
    return matches
def text_to_ngrams(text):
    blob = TextBlob(text)
    ngrams = blob.ngrams(NGRAM_SIZE)
    parse_ngrams(ngrams)
    return
# Section 12.2.14 snippets
from textblob import TextBlob

text = 'Today is a beautiful day. Tomorrow looks like bad weather.'
blob = TextBlob(text)
blob.ngrams()
blob.ngrams(n=5)

##########################################################################
# (C) Copyright 2019 by Deitel & Associates, Inc. and                    #
# Pearson Education, Inc. All Rights Reserved.                           #
#                                                                        #
# DISCLAIMER: The authors and publisher of this book have used their    #
# best efforts in preparing the book. These efforts include the         #
# development, research, and testing of the theories and programs       #
# to determine their effectiveness. The authors and publisher make      #
# no warranty of any kind, expressed or implied, with regard to these   #
# programs or to the documentation contained in these books. The authors#
# and publisher shall not be liable in any event for incidental or      #
# consequential damages in connection with, or arising out of, the      #
# furnishing, performance, or use of these programs.                    #
##########################################################################
def getBigramCount(inputFileName, state, cat, rating):
    # create the six category hashmaps to store the bigram and the frequency from the csv file
    service = {}
    value = {}
    variety = {}
    ambience = {}
    taste = {}
    accessibility = {}

    cr = csvReader("bigramCombined.csv")
    for r in cr:
        if r[2] == "value":
            k = r[0]
            if k not in value:
                polarity = r[3]
                score = int(r[4])
                if polarity == "Neg":
                    score *= -1
                value[k] = score
        elif r[2] == "service":
            k = r[0]
            if k not in service:
                polarity = r[3]
                score = int(r[4])
                if polarity == "Neg":
                    score *= -1
                service[k] = score
        elif r[2] == "ambience":
            k = r[0]
            if k not in ambience:
                polarity = r[3]
                score = int(r[4])
                if polarity == "Neg":
                    score *= -1
                ambience[k] = score
        elif r[2] == "taste":
            k = r[0]
            if k not in taste:
                polarity = r[3]
                score = int(r[4])
                if polarity == "Neg":
                    score *= -1
                taste[k] = score
        elif r[2] == "variety":
            k = r[0]
            if k not in variety:
                polarity = r[3]
                score = int(r[4])
                if polarity == "Neg":
                    score *= -1
                variety[k] = score
        else:
            k = r[0]
            if k not in accessibility:
                polarity = r[3]
                score = int(r[4])
                if polarity == "Neg":
                    score *= -1
                accessibility[k] = score

    cr = csvReader(inputFileName)
    inputStr = " "
    for r in cr:
        print r[0].split("|")[1]
        print r[0].split("|")[0]
        text = r[0].split("|")[14]
        try:
            text = unicode(text)
            inputStr += text + " "
        except UnicodeDecodeError:
            pass

    blob = TextBlob(inputStr)
    wordsArray = blob.words
    bigrams = blob.ngrams(2)
    bigramsList = []
    for i in bigrams:
        bigramsList.append(i[0] + " " + i[1])

    cw1 = csvWriter("tableau_db_3.csv")
    #headers1 = ["State","Business_Category","Business_Rating","Bigram","Bigram_Cat","Bigram_Freq","Bigram_Sentiment","Bigram_Importance"]
    #cw1.writerow(headers1)

    serviceList = {}
    for k in service:
        # check whether the word in the service category exists in the textreview
        if k in bigramsList:
            # store the frequency of the word in the serviceList hashmap
            if k not in serviceList:
                serviceList[k] = 1
            else:
                serviceList[k] += 1
    # write into the csv output file used for tableau visualisation
    for k in serviceList:
        d = []
        d.append(state)
        d.append(cat)
        d.append(rating)
        d.append(k)
        d.append("Service")
        freq = serviceList[k]
        d.append(freq)
        sentiment = service[k]
        d.append(sentiment)
        # importance of word is calculated by the number of times it appears in the textreview
        # and the sentiment score assigned to the word
        # for example, happy hour will have 25 importance when it appears 25 times in the
        # textreview and is assigned a sentiment score of 1
        importance = freq * sentiment
        d.append(importance)
        cw1.writerow(d)

    ambienceList = {}
    for k in ambience:
        if k in bigramsList:
            print k
            if k not in ambienceList:
                ambienceList[k] = 1
            else:
                ambienceList[k] += 1
    for k in ambienceList:
        d = []
        d.append(state)
        d.append(cat)
        d.append(rating)
        d.append(k)
        d.append("Ambience")
        freq = ambienceList[k]
        d.append(freq)
        sentiment = ambience[k]
        d.append(sentiment)
        importance = freq * sentiment
        d.append(importance)
        cw1.writerow(d)

    varietyList = {}
    for k in variety:
        if k in bigramsList:
            print k
            if k not in varietyList:
                varietyList[k] = 1
            else:
                varietyList[k] += 1
    for k in varietyList:
        d = []
        d.append(state)
        d.append(cat)
        d.append(rating)
        d.append(k)
        d.append("Variety")
        freq = varietyList[k]
        d.append(freq)
        sentiment = variety[k]
        d.append(sentiment)
        importance = freq * sentiment
        d.append(importance)
        cw1.writerow(d)

    tasteList = {}
    for k in taste:
        if k in bigramsList:
            print k
            if k not in tasteList:
                tasteList[k] = 1
            else:
                tasteList[k] += 1
    for k in tasteList:
        d = []
        d.append(state)
        d.append(cat)
        d.append(rating)
        d.append(k)
        d.append("Taste")
        freq = tasteList[k]
        d.append(freq)
        sentiment = taste[k]
        d.append(sentiment)
        importance = freq * sentiment
        d.append(importance)
        cw1.writerow(d)

    accessibilityList = {}
    for k in accessibility:
        if k in bigramsList:
            print k
            if k not in accessibilityList:
                accessibilityList[k] = 1
            else:
                accessibilityList[k] += 1
    for k in accessibilityList:
        d = []
        d.append(state)
        d.append(cat)
        d.append(rating)
        d.append(k)
        d.append("Accessibility")
        freq = accessibilityList[k]
        d.append(freq)
        sentiment = accessibility[k]
        d.append(sentiment)
        importance = freq * sentiment
        d.append(importance)
        cw1.writerow(d)
def on_success(self, data):
    # Digest
    if 'text' in data:
        line = data['text']

        # Do n-grams
        blob = TextBlob(line)
        ngrams = list(blob.ngrams(n=2))
        for ng in ngrams:
            for word in list(ng):
                word = word.lower()
                if word in self.stemmer.stems.keys():
                    word = self.stemmer.stems[word]
                match = re.search('\w+', word)
                if match:
                    word = match.group()
                if word in self.stop_words:
                    word = ''
            if ng[0] and ng[1]:
                if ' '.join(ng) in self.bookshelf.keys():
                    self.bookshelf[' '.join(ng)] += 1
                else:
                    self.bookshelf[' '.join(ng)] = 1

        '''
        # Do Unigrams
        for word in line.split(' '):
            word = word.lower()
            # Stemming
            if word in self.stemmer.stems.keys():
                word = self.stemmer.stems[word]
            # Removing punctuation
            match = re.search('\w+', word)
            if match:
                word = match.group()
            # Stop words
            if word not in self.stop_words:
                if word in self.bookshelf.keys():
                    self.bookshelf[word] += 1
                else:
                    self.bookshelf[word] = 1
        '''

        self.count += 1

        # How often to update? Framerate will depend on the number of relevant
        # tweets, so one size does not necessarily fit all
        it = 5

        # Move forward
        if self.count % it == 0:
            sorted_words = sorted(self.bookshelf.items(), key=operator.itemgetter(1), reverse=True)
            clear()
            print('=== Update ==='.format(self.count))
            for i in range(0, 20):
                print('{}): {} [{}]'.format(str(i + 1), sorted_words[i][0], sorted_words[i][1]))
from nltk.tokenize import BlanklineTokenizer
import random

with open('basho.txt', 'r') as content_file:
    content = content_file.read()
#print content

tokenizer = BlanklineTokenizer()
cleaned_content = content.lower()
corpus = TextBlob(cleaned_content, tokenizer=tokenizer)
haiku = corpus.sentences
#print haiku
bigrams = corpus.ngrams(n=2)
trigrams = corpus.ngrams(n=3)
#print bigrams

dict = {}
for bigram in bigrams:
    k = bigram[0]
    v = bigram[1]
    if k in dict:
        if v in dict[k]:
            dict[k][v] = dict[k][v] + 1
        else:
            dict[k][v] = 1
    else:
        dict[k] = {v: 1}