def get_similar_words(cleaned_query_words, doc):
    global expanded_query_terms
    try:
        command = "/home/sneha/phoenix/galago/galago-3.6-bin/bin/galago doc --index=/phoenix/ir_code/galago-index-rb4/ --id=" + doc + " --text=true --metadata=false --tokenize=true | sed -n '/Term vector:/,/<TEXT>/p'"
        outl = subprocess.check_output(command, shell=True)
        out = outl.split('\n')
        word_array = list()
        for i in range(1, len(out) - 2):
            words = out[i].split()
            if len(words) > 0:
                word_array.append(words[len(words) - 1])
        q_syn_list = list()
        for q in cleaned_query_words:
            q_wn = Word(q)
            q_syn_list.append(q_wn.get_synsets(NOUN))
        for w in word_array:
            if w not in cleaned_query_words:
                w_wn = Word(w)
                w_syn = w_wn.get_synsets(NOUN)
                for q_syn in q_syn_list:
                    max_syn = 0
                    for i in range(0, min(2, len(q_syn))):
                        for j in range(0, min(2, len(w_syn))):
                            syn = q_syn[i].path_similarity(w_syn[j])
                            max_syn = max(max_syn, syn)
                    if max_syn > 0.3:
                        expanded_query_terms[w] = max_syn
    except:
        pass
    print "Done processing " + doc
def default_adv_xxx_bigram_polarity(bigram, negation=None, prior_polarity_score=False, linear_score=None):
    """Calculate the bigram polarity from an empirical factor for each adverb
    group and the SentiWordNet word polarity.
    """
    second_word_polarity = word_polarity(bigram['second_word'],
                                         bigram['second_word']['tag'],
                                         prior_polarity_score=prior_polarity_score,
                                         linear_score=linear_score)

    # If it is a verb, try again with the lemmatized form
    if bigram['second_word']['tag'] in util.PENN_VERBS_TAGS and \
       (second_word_polarity is None or second_word_polarity[0] == 0):
        w = Word(bigram['second_word']['raw'])
        bigram['second_word']['lemma'] = w.lemmatize("v")
        second_word_polarity = word_polarity(bigram['second_word'],
                                             bigram['second_word']['tag'],
                                             prior_polarity_score=prior_polarity_score,
                                             linear_score=linear_score)

    # If the second word still has no polarity, stop here
    if second_word_polarity is None:
        return None

    return apply_adverb_factor(bigram['first_word']['raw'], second_word_polarity[0], negation)
def preprocess(tagged):
    word = Word(tagged[0])
    if word.isalpha() and word not in stopwords:
        tag = penn_to_wn(tagged[1])
        l = word.lemmatize(tag)
    else:
        l = ''
    return l
def getword(word):
    w = Word(word[0])
    result = word[0]
    if word[1] == 'JJR' or word[1] == 'JJS':
        result = w.lemmatize('a')
    elif word[1] == 'NNS' or word[1] == 'NNP' or word[1] == 'NNPS':
        result = w.lemmatize('n')
    return result
def get_verbs(self, words_tags):
    result = []
    for idx, (word, tag) in enumerate(words_tags):
        if tag in self.verb_tags:
            verbz = Word(word)
            if verbz.lemmatize("v") not in self.neglect and word not in self.neglect:
                src_verb = verbz.lemmatize("v")
                result.append((idx, src_verb))
    return result
def getTags(string):
    tb = TextBlob(string)
    # tb = TextBlob(str(tb.correct()))
    nouns = tb.noun_phrases
    real_nouns_of_NY = list()
    sentiment = tb.sentiment.polarity
    for noun in nouns:
        n = Word(noun)
        real_nouns_of_NY.append(n.lemmatize())
    return sentiment, real_nouns_of_NY
def extractWordPosPredicate(word, line, after):
    numofwords = len(word.split())
    i = 0
    # print "WordPOSPREDICATE", word, line
    w = Word(word.split()[0])
    result = list()
    found = 0
    while i <= (len(line.split()) - numofwords):
        if re.match("[A-Za-z]+$", line.split()[i]):
            w = Word(word.split()[0])
            w1 = Word(line.split()[i])
            if w.lemmatize("v") == w1.lemmatize("v"):
                j = 1
                flag = 0
                for j in xrange(numofwords):
                    # print line.split()[i+j], word.split()[j], numofwords
                    if re.match("^[A-Za-z]+$", line.split()[i + j]):
                        w = Word(word.split()[j])
                        w1 = Word(line.split()[i + j])
                        if w.lemmatize("v") != w1.lemmatize("v"):
                            flag = 1
                            break
                if flag == 0:
                    found = 1
                    if after == 1:
                        result.append(i + numofwords)  # position from which to take the n-gram after the match
                    else:
                        result.append(i - 1)  # position from which to take the n-gram before the match
        i += 1
    if found == 1:
        return result
    else:
        return [-1]
def extractWordPosPredicate(word, line, after):
    numofwords = len(word.split())
    i = 0
    w = Word(word.split()[0])
    while i <= (len(line.split()) - numofwords):
        if re.match("[A-Za-z]+$", line.split()[i]):
            w = Word(word.split()[0])
            w1 = Word(line.split()[i])
            if w.lemmatize("v") == w1.lemmatize("v"):
                j = 1
                flag = 0
                for j in xrange(numofwords):
                    # print line.split()[i+j], word.split()[j], numofwords
                    if re.match("^[A-Za-z]+$", line.split()[i + j]):
                        w = Word(word.split()[j])
                        w1 = Word(line.split()[i + j])
                        if w.lemmatize("v") != w1.lemmatize("v"):
                            flag = 1
                            break
                if flag == 0:
                    if after == 1:
                        return i + numofwords  # position from which to take the n-gram after the match
                    else:
                        return i - 1  # position from which to take the n-gram before the match
        i += 1
    return -1
def transformToWords(self, x):
    '''Return the list of lemmatized words for this document.'''
    words = []
    for word in x.textBlob.words:
        myWord = Word(word.lemmatize('v').encode('utf-8'))
        myWord = Word(myWord.lemma)
        myWord = Word(myWord.singularize().upper().encode('utf-8'))
        words.append(myWord)
    return words
def lemmatize(textblob):
    '''Return the list of lemmatized words for this TextBlob.'''
    words = []
    for word in textblob.words:
        myWord = Word(word.lemmatize('v').encode('utf-8'))
        myWord = Word(myWord.lemma)
        myWord = Word(myWord.singularize().upper().encode('utf-8'))
        words.append(myWord)
    return words
def sentenceToFeatures(self, sentence):
    feat = []
    for word in sentence:
        werd = Word(word)
        syns = [w.lemma_names for w in werd.get_synsets()]
        for syn in syns:
            try:
                feat.append(self.word_to_idx[syn])
            except KeyError:
                continue
    return list(set(feat))
def imprimir_resto(clase, puesto, descrip, req):
    # lineaTotal = filter(lambda x: x in string.printable, lineaTotal)
    archEscritura.write(clase)
    archEscritura.write(",")
    blobPuesto = TextBlob(puesto.decode('utf-8'))
    blobDescrip = TextBlob(descrip.decode('utf-8'))
    blobReq = TextBlob(req.decode('utf8', 'ignore'))
    wordsPuesto = blobPuesto.words
    wordsDescrip = blobDescrip.words
    wordsReq = blobReq.words
    for wordP in wordsPuesto:
        nword = strip_accents(wordP)
        exclude = set(string.punctuation)
        nword = ''.join(ch for ch in nword if ch not in exclude)
        nword = nword.lower()
        nword = filter(lambda x: x in string.printable, nword)
        archEscritura.write(nword)
        archEscritura.write(" ")
    archEscritura.write(",")
    stemmer = SnowballStemmer("spanish")
    cad = ""
    for wordD in wordsDescrip:
        nwordD = strip_accents(wordD)
        exclude = set(string.punctuation)
        nwordD = ''.join(ch for ch in nwordD if ch not in exclude)
        nwordD = filter(lambda x: x in string.printable, nwordD)
        if nwordD not in (stopwords.words('spanish')):  # remove stop words
            w = Word(nwordD)
            # comentarios.append(w)
            word2 = stemmer.stem(w.lower())
            archEscritura.write(word2)
            archEscritura.write(" ")
    archEscritura.write(",")
    lista = []
    for wordP in wordsReq:
        nwordP = strip_accents(wordP)
        exclude = set(string.punctuation)
        nwordP = ''.join(ch for ch in nwordP if ch not in exclude)
        nwordP = filter(lambda x: x in string.printable, nwordP)
        if nwordP not in (stopwords.words('spanish')):  # remove stop words
            w = Word(nwordP)
            word3 = stemmer.stem(w.lower())
            if word3 not in lista:
                lista.append(word3)
    for pal in lista:
        archEscritura.write(pal)
        archEscritura.write(" ")
    archEscritura.write("\n")
def initialize(self, sentences):
    self.max_feat_len = 0
    self.word_to_idx = {}
    idx = self.index_offset
    for sentence in sentences:
        syn_count = 0
        for word in sentence:
            werd = Word(word)
            syns = [w.lemma_names for w in werd.get_synsets()]
            for syn in syns:
                syn_count += 1
                if syn not in self.word_to_idx:
                    self.word_to_idx[syn] = idx
                    idx += 1
        self.max_feat_len = max(self.max_feat_len, syn_count)
def loadData(self, filename):
    with open(filename, "r") as reviewfile:
        review = TextBlob("")
        for line in reviewfile:
            review += TextBlob(line).lower()
        for sentence in review.sentences:
            tmp = []
            for word in sentence.words:
                if word not in self.stopwords:
                    w = Word(word)
                    tmp.append(w.lemmatize())
                    if word not in self.wordfrequency:
                        self.wordfrequency[word] = review.word_counts[word]
            self.sentences.append(tmp)
def split_sentences(self, line):
    correction_line = ""
    tokens = line.split()
    for token in tokens:
        if token.lower() in self.features:
            correction_line = correction_line + str(" ") + token
            continue
        b = Word(token)
        possible_values = b.spellcheck()
        result = possible_values[0][0]
        for word in possible_values:
            if word[0].lower() in self.features:
                result = word[0]
                break
        correction_line = correction_line + str(" ") + result
    return correction_line
def spell_check(line):
    modified_line = line
    word_list = word_tokenize(line)
    for word in word_list:
        word = word.lower()
        if word in spell_dict.keys():
            modified_line = re.sub(word, spell_dict[word], modified_line)
        elif word.isalnum():
            search = open('English_words.txt', 'r')
            if word not in english_dict and word not in search.read():
                w = Word(word)
                suggestion = w.spellcheck()
                # pick the candidate with the highest confidence, not the
                # lexicographically largest (word, score) tuple
                best_word, best_score = max(suggestion, key=lambda s: s[1])
                if best_score > 0.9:
                    spell_dict[word] = best_word
                    modified_line = re.sub(word, spell_dict[word], modified_line)
    return modified_line
def stopWordStem(linea):
    blob = TextBlob(linea.decode('utf-8'))
    words = blob.words
    comentarios = ""
    stemmer = SnowballStemmer("spanish")
    primero = True
    for word in words:
        if word not in (stopwords.words('spanish')):  # remove stop words
            w = Word(word)
            if primero:
                comentarios += stemmer.stem(w.lower())
                primero = False
            else:
                comentarios += " "
                comentarios += stemmer.stem(w.lower())
    return comentarios
def preposition(line):
    first_letter = line[0]
    for word in line.split():
        tb = TextBlob(word)
        for w, t in tb.tags:
            if t == 'NN':
                b = Word(word)
                if word == str(b.singularize()):
                    # print word + " is probably singular like " + b.singularize()
                    if not_a_vowel(first_letter):
                        return random.choice(['The ', 'A ', '']) + line
                    else:
                        return random.choice(['The ', 'An ', '']) + line
                elif word == str(b.pluralize()):
                    return random.choice(['The ', 'Some ', 'Many ', 'Of ', 'For all of the ', '']) + line
    # If it gets to this point, we don't know whether it is plural, so just decide between 'a' and 'an'
    if not_a_vowel(first_letter):
        return random.choice(['A ', 'The ', '']) + line
    else:
        return random.choice(['An ', 'The ', '']) + line
def imprimir_resto(lineaTotal):
    blob = TextBlob(lineaTotal.decode('utf-8'))
    # split into words
    words = blob.words
    for word in words:
        nword = strip_accents(word)
        exclude = set(string.punctuation)
        nword = ''.join(ch for ch in nword if ch not in exclude)
        ncomments.append(nword)
    comentarios = []
    stemmer = SnowballStemmer("spanish")
    count = 0
    for word in ncomments:
        if count < 2:
            exclude = set(string.punctuation)
            word = ''.join(ch for ch in word if ch not in exclude)
            word = word.lower()
            # word = strip_accents(word)
            comentarios.append(word)
        else:
            if word not in (stopwords.words('spanish')):  # remove stop words
                # comentarios.append(w)
                exclude = set(string.punctuation)
                word = ''.join(ch for ch in word if ch not in exclude)
                word = word.lower()
                word = strip_accents(word)
                w = Word(word)
                comentarios.append(stemmer.stem(w.lower()))
        count += 1
    ulist = []
    for com in comentarios:
        if com not in ulist:
            ulist.append(com)
    for com in ulist:
        com = filter(lambda x: x in string.printable, com)
        archEscritura.write(com)
        archEscritura.write(",")
    archEscritura.write("\n")
def processRow(row):
    import re
    import nltk
    from textblob import TextBlob
    from nltk.corpus import stopwords
    from nltk.stem import PorterStemmer
    from textblob import Word
    from nltk.util import ngrams
    from wordcloud import WordCloud, STOPWORDS
    from nltk.tokenize import word_tokenize

    tweet = row
    # Lower case
    tweet = tweet.lower()
    # Remove unicode strings like "\u002c" and "x96"
    tweet = re.sub(r'(\\u[0-9A-Fa-f]+)', r'', tweet)
    tweet = re.sub(r'[^\x00-\x7f]', r'', tweet)
    # Convert any url to URL
    tweet = re.sub('((www\.[^\s]+)|(https?://[^\s]+))', 'URL', tweet)
    # Convert any @Username to "AT_USER"
    tweet = re.sub('@[^\s]+', 'AT_USER', tweet)
    # Remove additional white spaces
    tweet = re.sub('[\s]+', ' ', tweet)
    tweet = re.sub('[\n]+', ' ', tweet)
    # Remove non-alphanumeric symbols
    tweet = re.sub(r'[^\w]', ' ', tweet)
    # Remove the hashtag in front of a word (replace #word with word)
    tweet = re.sub(r'#([^\s]+)', r'\1', tweet)
    # Remove :( or :)
    tweet = tweet.replace(':)', '')
    tweet = tweet.replace(':(', '')
    # Remove numbers
    tweet = ''.join([i for i in tweet if not i.isdigit()])
    # Remove multiple exclamation marks
    tweet = re.sub(r"(\!)\1+", ' ', tweet)
    # Remove multiple question marks
    tweet = re.sub(r"(\?)\1+", ' ', tweet)
    # Remove multiple full stops
    tweet = re.sub(r"(\.)\1+", ' ', tweet)
    # Lemmatize
    tweet = " ".join([Word(word).lemmatize() for word in tweet.split()])
    # Stemmer
    # st = PorterStemmer()
    # tweet = " ".join([st.stem(word) for word in tweet.split()])
    # Remove emoticons from text
    tweet = re.sub(':\)|;\)|:-\)|\(-:|:-D|=D|:P|xD|X-p|\^\^|:-*|\^\.\^|\^\-\^|\^\_\^|\,-\)|\)-:|:\'\(|:\(|:-\(|:\S|T\.T|\.\_\.|:<|:-\S|:-<|\*\-\*|:O|=O|=\-O|O\.o|XO|O\_O|:-\@|=/|:/|X\-\(|>\.<|>=\(|D:', '', tweet)
    # Trim
    tweet = tweet.strip('\'"')
    row = tweet
    return row
def tokenize_tweet(document):
    global uselessTerm
    document = document.lower()
    a = document.index("username")
    b = document.index("clusterno")
    c = document.rindex("tweetid") - 1
    d = document.rindex("errorcode")
    e = document.index("text")
    f = document.index("timestr") - 3
    # Keep the three main pieces of information: username, tweet text and tweetid
    document = document[c:d] + document[a:b] + document[e:f]
    terms = TextBlob(document).words.singularize()
    result = []
    for word in terms:
        expected_str = Word(word)
        expected_str = expected_str.lemmatize("v")
        if expected_str not in uselessTerm:
            result.append(expected_str)
    return result
def nb_of_speliing_errors(essay):
    """Feature 4: use the textblob library to count spelling errors."""
    essay = re.sub('@\S+', '', essay)  # strip ASAP placeholder names starting with @
    vector = CountVectorizer()
    vector.fit_transform([essay])
    list_tokens = vector.get_feature_names()
    spell_errors = 0
    for i in list_tokens:
        __result = None
        __result = re.search('@\S+', i)  # strip ASAP placeholder names such as @Location
        if __result is None:
            w = Word(i)  # from the textblob library
            # spellcheck() returns a list of (suggestion, confidence) tuples
            result_tuple = w.spellcheck()
            # only count corrections the checker is 100% sure about;
            # this avoids suggesting singular forms for valid plurals
            if i != result_tuple[0][0] and result_tuple[0][1] == 1.0:
                spell_errors += 1
    nb_of_spell_errors = spell_errors
    return nb_of_spell_errors
def standardize_note(note, abbrDict):
    '''
    Standardize a note (collection of words).
    Assumes a first pass of abbreviation approximation has been performed.
    '''
    noteLw = note.lower()
    noteLw = noteLw.replace("'s", "")
    blob = TextBlob(noteLw)
    lematBlob = []
    for k, (word, pos) in enumerate(blob.tags):
        # clean the word
        word = word.strip("/")
        word = word.strip(".")
        # check once more to see if there is something
        word = Word(_replace_abbr(word, abbrDict))
        if "." in word:
            splitMore = word.split('.')
            for spWord in splitMore:
                fixedWord = _replace_abbr(spWord, abbrDict)
                for w in fixedWord.split():
                    lematBlob.append(_word_std(Word(w), pos))
        else:
            for w in word.split():
                lematBlob.append(_word_std(Word(w), pos))
    return " ".join(filter(lambda x: x != None, lematBlob))
def textblob_adj(filepath, outfilepath, countfilepath, minpos):
    file = open(filepath)
    t = file.read()
    blobed = TextBlob(t)
    # counts = Counter(tag for word, tag in blobed.tags)
    adj_list = []
    adv_list = []
    adj_tag_list = ['JJ', 'JJR', 'JJS']
    adv_tag_list = ['RB', 'RBR', 'RBS']
    for (a, b) in blobed.tags:
        if b in adj_tag_list:
            expected_str = Word(a)
            expected_str = expected_str.lemmatize('a')
            adj_list.append(expected_str)
        elif b in adv_tag_list:
            expected_str = Word(a)
            expected_str = expected_str.lemmatize('r')
            adv_list.append(expected_str)
        else:
            pass
    with open(outfilepath, "w") as txt_file:
        for line in adj_list:
            txt_file.write(line + " ")
        for line in adv_list:
            txt_file.write(line + " ")
    # return adj_list, adv_list, counts['JJ']+counts['JJR']+counts['JJS'], counts['RB']+counts['RBR']+counts['RBS']
    count_from_text_file(outfilepath, countfilepath, minpos)
def spell_check(query):
    # split the query
    splitted_query = query.split()
    # empty list for the spell-checked query
    corrected_query = []
    # look up freq_dict in the db
    dict_collection = mongo.db["dict_collection"]
    freq_dict = dict_collection.find_one({"name": "freq_dict"})["freq_dict"]
    for word in splitted_query:
        # convert to a textblob Word
        blob_word = Word(word)
        # all the possible corrections of the word
        possible_corrections = blob_word.spellcheck()
        # initial counter
        freq_counter = 1
        # covers the case where the spelling is wrong but no document word can correct it
        at_least_one = False
        # in case the spelling is already correct
        corrected_word = blob_word
        for p in possible_corrections:
            # p[0] is the correction, p[1] its score
            if p[0] in freq_dict.keys():
                # at least one correction is present in the dictionary, so use frequency-based correction
                at_least_one = True
                # frequency of p[0]
                frequency = freq_dict[p[0]]
            else:
                frequency = 0
            # keep track of the highest frequency and the corresponding word
            if frequency >= freq_counter:
                freq_counter = frequency
                corrected_word = p[0]
        # no correction was present in the dictionary
        if at_least_one is False:
            # fall back to the correction with the highest score
            corrected_word = blob_word.correct()
        corrected_query.append(corrected_word)
    return " ".join(corrected_query)
def save_file_dict(file_path, save_file_dict_path):
    # file_dict = {}
    # file_dict_keys = []
    stop_words = stopwords.words('english')
    count = 0
    for fname in file_path:
        with open(fname, 'r', encoding='ISO-8859-1') as f:
            # tokenization / normalization
            s = f.read().lower()
            s = s.replace('/', ' ')
            s = s.replace('-', ' ')
            w = TextBlob(s).words
            clean_w_list = []
            for word in w:
                # stemming (w = w.stem() another way)
                word = Word(word)
                word = word.lemmatize()
                word = Word(word)
                word = word.lemmatize("v")
                # stopwords (nltk.download("stopwords") downloads the dataset)
                if word not in stop_words and word not in ['\'s', '\'ll', '\'t']:
                    clean_w_list.append(word)
            clean_w_dict = dict(collections.Counter(clean_w_list))
            path_list = fname.split('\\')
            save_dict_name = path_list[-2] + '_' + path_list[-1]
            save_dict_path = save_file_dict_path + '/' + save_dict_name + '.txt'
            save_dict(clean_w_dict, save_dict_path)
            # file_dict[save_dict_name] = clean_w_dict
            # file_dict_keys.append(save_dict_name)
        print("The %dth file finished." % (count))
        count += 1
def process_data_cleaning(data):
    print("start data cleaning")
    # remove undesired unicode characters
    data["message"] = data["message"].apply(lambda x: x.replace(
        "\\/", "/").encode("ascii", "ignore").decode("ascii"))
    # remove html tags, urls etc.
    tag_cleaner = re.compile('<.*?>|&([a-z0-9]+|#[0-9]{1,6}|#x[0-9a-f]{1,6});')
    url_cleaner = re.compile('http\S+|http\S+|www\S+')
    data["message"] = [re.sub(tag_cleaner, " ", x) for x in data["message"]]
    data["message"] = [re.sub(url_cleaner, " ", x) for x in data["message"]]
    # transform into lowercase
    data["message"] = data["message"].apply(lambda x: x.lower())
    # remove smileys, symbols and other clutter
    data["message"] = [remove_emoji(x) for x in data["message"]]
    # remove all numbers
    data["message"] = data["message"].apply(lambda x: " ".join(
        x for x in x.split() if check_if_number(x) is not True))
    # remove any special characters that might be left
    spec = set(string.punctuation)
    data["message"] = data["message"].apply(
        lambda x: x.translate({ord(i): None for i in spec}))
    # remove stopwords - check on lowercase level
    stop = nltk.corpus.stopwords.words('english')
    data["message"] = data["message"].apply(
        lambda x: " ".join(x for x in x.split() if x not in stop))
    # remove rows which contain empty messages
    empty_string_filter = data["message"] != ""
    data = data[empty_string_filter]
    print("end data cleaning")

    print("start spelling correction")
    # spelling correction
    data["message"] = data["message"].apply(lambda x: (str(TextBlob(x).correct())))
    print("end spelling correction")

    print("start lemmatisation")
    # perform lemmatisation
    data["message"] = data["message"].apply(
        lambda x: " ".join([Word(word).lemmatize() for word in x.split()]))
    print("end lemmatisation")
    return data
def Clean(self):
    data = pd.read_csv(self.path)
    data = data.drop('author', axis=1)
    data = data.drop(data[data.sentiment == 'anger'].index)
    data = data.drop(data[data.sentiment == 'boredom'].index)
    data = data.drop(data[data.sentiment == 'enthusiasm'].index)
    data = data.drop(data[data.sentiment == 'empty'].index)
    data = data.drop(data[data.sentiment == 'fun'].index)
    data = data.drop(data[data.sentiment == 'relief'].index)
    data = data.drop(data[data.sentiment == 'surprise'].index)
    data = data.drop(data[data.sentiment == 'love'].index)
    data = data.drop(data[data.sentiment == 'hate'].index)
    data = data.drop(data[data.sentiment == 'neutral'].index)
    data = data.drop(data[data.sentiment == 'worry'].index)

    data['content'] = data['content'].apply(lambda x: " ".join(x.lower() for x in x.split()))
    data['content'] = data['content'].str.replace('[^\w\s]', ' ')

    stop = stopwords.words('english')
    data['content'] = data['content'].apply(lambda x: " ".join(x for x in x.split() if x not in stop))

    from textblob import Word
    data['content'] = data['content'].apply(lambda x: " ".join([Word(word).lemmatize() for word in x.split()]))

    import re

    def del_repeat(text):
        pattern = re.compile(r"(.)\1{2,}")
        return pattern.sub(r"\1\1", text)

    data['content'] = data['content'].apply(lambda x: " ".join(del_repeat(x) for x in x.split()))

    freq = pd.Series(' '.join(data['content']).split()).value_counts()[-10000:]
    freq = list(freq.index)
    data['content'] = data['content'].apply(lambda x: " ".join(x for x in x.split() if x not in freq))

    lbl_enc = preprocessing.LabelEncoder()
    y = lbl_enc.fit_transform(data.sentiment.values)
    return data
def transformToWords(self, x):
    '''Return the list of lemmatized words for this document.'''
    words = []
    for word in x.textBlob.words:
        myWord = Word(word.lemmatize('v').encode('utf-8'))
        myWord = Word(myWord.lemma)
        myWord = Word(myWord.singularize().upper().encode('utf-8'))
        words.append(myWord)
    return words
def get_hypernym(askedfor):
    # put in a word, get a list of other words
    word = Word(askedfor)
    if word.synsets:
        generalsenses = word.synsets[0]
        next_level = [h.name().split(".")[0] for h in generalsenses.hypernyms()]
        print('A {} is a/an {}'.format(askedfor, next_level[0]))
        return next_level[0]
    else:
        print('{} is NOT a word'.format(askedfor))
        return '{} is NOT a word'.format(askedfor)
def defineWord(self, word):
    with humanfriendly.AutomaticSpinner("Loading, this can take awhile for the first time, but repeating the command again will be considerably faster..."):
        try:
            blob = Word(word)
            defs = blob.definitions
            s = ""
            if len(defs) > 0:
                for item in defs[0:4]:
                    s += item.capitalize() + ".\n"
                return s
            else:
                return "No result found."
        except RuntimeError:
            blob = Word(word)
            defs = blob.definitions
            s = ""
            if len(defs) > 0:
                for item in defs[0:4]:
                    s += item.capitalize() + ".\n"
                return s
            else:
                return "No result found."
def main():
    f = open('relativity.txt', 'r')
    content = f.read()
    wiki = TextBlob("My namee is John!")
    # wiki.tags
    sentiment = wiki.sentiment
    w = Word('hullo')
    print(wiki.correct())
    to_esp = wiki.translate(to='fr')
    print(to_esp)
    input()
def Input_pipeline(Tweet, filename):
    # str.replace does not interpret regex patterns, so use re.sub for the pattern-based cleanup
    Tweet = str(Tweet).lower()
    Tweet = re.sub(r'http\S+', '', Tweet)
    Tweet = re.sub(r'[^\w\s]', '', Tweet)
    Tweet = re.sub(r'\s+', ' ', Tweet).strip()
    Tweet = ' '.join(
        [Word(item).lemmatize() for item in Tweet.split() if item not in stop])
    TFIDF = Vectorizer.transform([Tweet]).toarray()
    Classifier_ = pickle.load(open(filename, 'rb'))
    Prediction = Classifier_.predict(TFIDF)
    if Prediction[0] == 0:
        return 'Real'
    elif Prediction[0] == 1:
        return 'Fake'
    return Prediction
def predict():
    if request.method == 'POST':
        message = request.form['message']
        if len(message) > 2:
            text = message
            pre_processed_reviews = []
            data = gensim.utils.simple_preprocess(text, min_len=2)
            review = ' '.join(WordNetLemmatizer().lemmatize(word)
                              for word in data if word not in stop_words)
            pre_processed_reviews.append(review.strip())
            tfidf_model = joblib.load(MODEL_tfidf)
            vect = tfidf_model.transform(pre_processed_reviews)
            lr_model = joblib.load(MODEL_lr)
            my_prediction = lr_model.predict(vect)
        else:
            # message too short to classify: return early, without a summary
            my_prediction = 3
            return render_template('home.html', prediction=my_prediction)

        # build a short summary from the lemmatized nouns of the message
        blob = TextBlob(text)
        nouns = list()
        for word, tag in blob.tags:
            if tag == 'NN':
                nouns.append(word.lemmatize())
        display = []
        output = ""
        for item in random.sample(nouns, len(nouns)):
            word = Word(item)
            if word not in display:
                display.append(word.capitalize())
        for i in display:
            if len(i) > 2:
                output = output + " " + i
            else:
                output = ""
        return render_template('home.html', prediction=my_prediction, summary=output)
def findLemmas(adj):
    synonyms = []
    antonyms = []
    word = Word(adj)
    for syn in word.synsets[:]:
        # for syn in list(wn.senti_synsets(adj)):
        for l in syn.lemmas():
            # for l in syn.synset.lemmas():
            synonyms.append(l.name())
            if l.antonyms():
                antonyms.append(l.antonyms()[0].name())
    return synonyms, antonyms
def index(request):
    Note = "Not Found"
    if request.method == 'POST':
        text = request.POST['text']
        lower = text.lower()
        defination = Word(lower).definitions
        final = ' '.join(map(str, defination))
        if final == "":
            messages.error(request, 'Either spelling mistake or can\'t find your word')
        context = {"defination": final}
        return render(request, 'index.html', context)
    return render(request, 'index.html')
def answer_yes_no_question(self, qcorpus, tk, relevant_sentences):
    relevant_sentence = nlp(str(relevant_sentences[0]))
    q_tags = tk.tags
    keyword = ""
    key_tag = ""
    for (token, tag) in q_tags:
        if tag[0] == 'N' or tag[0] == "V":
            keyword = token
            key_tag = tag
    keyword_found = False
    neg = False
    for token in relevant_sentence:
        if str(token) == keyword or (key_tag[0] == "V" and
                Word(str(token)).lemmatize("v") == Word(keyword).lemmatize("v")):
            keyword_found = True
        if token.dep_ == 'neg':
            neg = not neg
    if keyword_found:
        if neg:
            return "No"
        else:
            return "Yes"
    return "No"
def recuperer_mots_tweet(data, seuil):
    # Return the unique, lemmatized words of a set of tweets, keeping only the infrequent ones.
    # Build the word list
    L = []
    # Collect the words
    for tweet in data['tweet_textual_content']:
        t = TextBlob(tweet)
        L.extend(t.words)
    # Drop words that are too frequent
    L = [word for word in L if L.count(word) <= seuil]
    # Lemmatization
    lemmas = []
    for word in L:
        w = Word(word)
        lemmas.append(w.lemmatize())
    return lemmas
def synonyms(word, maxSyns):
    syns, ants = [], []
    for syn in Word(word).synsets:
        for l in syn.lemmas():
            syns.append(l.name())
            if l.antonyms():
                ants.append(l.antonyms()[0].name())
    final = [
        syns[i]
        for i in sample(range(0, len(syns)), min(maxSyns, len(syns)))
    ]
    for ant in ants:
        final.append(ant)
    return final
def place_mark(self, word):
    if word in self.previous_words:
        self.status = 2
        return self.letters
    elif len(word) <= 2 or len(word) > 6:
        self.status = 0
        return self.letters
    elif self.dictionary.meaning(word) is not None:
        self.previous_words.append(word)
        self.calc_score(len(word))
        self.status = 1
    else:
        w = WordDict(word)
        check = w.spellcheck()
        if (check[0][1] == 1.0) and (word == check[0][0]):
            self.previous_words.append(word)
            self.calc_score(len(word))
            self.status = 1
        else:
            self.calc_score(0)
            self.status = 0
    return self.letters
def normalization(array):
    sw = stopwords.words("english")
    tempArray = []
    for i in range(0, len(array)):
        comment = array[i]  # the words of this comment
        sentence = ""  # stores the words after processing
        for word in comment.split():  # each word
            if word in sw:  # if the word is a stopword, skip the processing below
                continue
            word = Word(word).lemmatize()  # lemmatize / stem
            word = word.lower()  # lowercase
            # keep only letters, spaces and apostrophes, so stopwords are not
            # missed because of "'" or stray spaces
            word = "".join(
                char for char in word
                if char.isalpha() or char == " " or char == "'"
            )
            sentence += word + " "
        tempArray.append(sentence)
        sentiment_score = TextBlob(sentence).sentiment
        print(sentiment_score)
    return tempArray
def find_synms(word, c=None, pos=None):
    from textblob import Word
    from itertools import chain

    synonyms = Word(word).get_synsets(pos)
    # for wl in synonyms:
    #     print(synonyms[0], wl.path_similarity(synonyms[0]), wl.lemma_names())
    lemmas = chain.from_iterable([word.lemma_names() for word in synonyms])
    lemmas = mm.remove_dup_list(lemmas, case=True)
    return lemmas[0:c]
def paragraph_lemma(txt):
    """
    Lemmatize a paragraph.

    Parameters
    ----------
    txt : str
        Text after removing numbers and punctuation.

    Returns
    -------
    lemma_txt : str
        Lemmatized paragraph with each word lowercased.
    """
    token_word = word_tokenize(txt)
    lemma_txt = ' '.join([
        Word(word.lower()).lemmatize() for word in token_word
        if len(Word(word.lower()).lemmatize()) > 1
    ])
    return lemma_txt
def get_correct(text: TextBlob) -> Dict[str, Union[str, Dict]]:
    """Return the corrected text and the suggested correct spelling for each word."""
    corrected_word = text.correct()
    correctly_vars: Dict = {
        x: str(Word(x).spellcheck()[0][0]) for x in text.words
    }
    return {
        "corrected": str(corrected_word),
        "correctly words": correctly_vars
    }
def definition(text='hate'):
    e = ''
    mean = []
    try:
        if request.method == 'POST':
            text = request.form['mean']
            mean = Word(text).definitions
        else:
            e = 'Sorry, I dont have anything about this.'
    except:
        e = 'Sorry, I dont have anything about this.'
    return render_template('home.html', e=e, mean=mean[:5])
def __to_singular(row, ne):
    if 'NIW' not in ne.izen_lexikografikoa.values:
        return row
    word = Word(row.word).singularize()
    try:
        singular = ne[ne.word == word].reset_index().loc[0]
        row.izen_lexikografikoa = singular.izen_lexikografikoa
    except:
        pass
    return row
def spell_checker(deduped_text, final_text):
    temp = list()
    for text in deduped_text:
        zen = text.split(' ')
        num_words = len(zen)
        crt_words = 0
        empty_words = 0
        for word in zen:
            if word == '':
                empty_words += 1
            else:
                w = Word(word)
                if w.spellcheck()[0][1] > 0.9:
                    crt_words += 1
        num_words -= empty_words
        if crt_words / num_words >= 0.6:
            temp.append(text)
    if len(temp) > 0:
        final_text.extend(temp)
def preprocess(sentence, use_stemmer=False, use_lowercase=False, use_stopwords=False,
               remove_nonalpha=False, use_lemma=False):
    # We always tokenize
    blob = TextBlob(sentence)
    # words = blob.words
    words_and_tags = blob.tags
    words_and_tags_list = [list(x) for x in words_and_tags]

    if use_stopwords:
        stopwords = list(set(nltk_stopwords.words('english')))
    else:
        stopwords = []

    # Checking stop words without lowercasing first would miss some of them, so we lowercase
    # the words only for the stopword lookup and still return the unlowered version of a word
    # that is not a stopword.
    # words = [word for word in words if word.lower() not in stopwords]
    # The comprehension takes each [word, pos] pair, drops the word if it appears in the
    # stopwords list, and then removes pairs that have only the pos left.
    words_and_tags_list = [[word for word in wordpos if word.lower() not in stopwords]
                           for wordpos in words_and_tags_list]
    words_and_tags_list = [x for x in words_and_tags_list if len(x) > 1]

    if use_lowercase:
        # words = [word.lower() for word in words if word.lower() not in stopwords]
        words_and_tags_list = [[word[0].lower(), word[1]] for word in words_and_tags_list]

    # Note that this also removes all words which contain a digit
    if remove_nonalpha:
        # words = [word for word in words if word.isalpha() and word not in stopwords]
        words_and_tags_list = [[word for word in wordpos if word.isalpha()]
                               for wordpos in words_and_tags_list]
        words_and_tags_list = [x for x in words_and_tags_list if len(x) > 1]

    # We stem if asked to, although it will cause problems with synset searches.
    # The stemmer also lowercases words by default.
    if use_stemmer:
        stemmer = SnowballStemmer("english")
        # words = [stemmer.stem(word) for word in words]
        words_and_tags_list = [[stemmer.stem(word[0]), word[1]] for word in words_and_tags_list]

    if use_lemma:
        words_and_tags_list = [[Word(word[0]).lemmatize(pos=posTb2Wn.get(word[1])), word[1]]
                               for word in words_and_tags_list]

    return words_and_tags_list
def Sim2(text1, text2):
    stop = stopwords.words('english')
    text1 = regexpProcessing(text1)
    text2 = regexpProcessing(text2)
    # lowercase both texts
    TEXT1 = text1.strip()
    TEXT2 = text2.strip()
    TEXT1 = TEXT1.lower()
    TEXT2 = TEXT2.lower()
    token1 = generateTokens(TEXT1)
    token2 = generateTokens(TEXT2)
    # spell-correct the tokens, keeping only confident, non-stopword corrections
    t1List = []
    for tok1 in token1:
        word1 = Word(tok1)
        w1 = word1.spellcheck()
        correctw = w1[0][0]
        confidence = w1[0][1]
        if (confidence > 0.8) and (correctw not in stop):
            t1List.append(correctw)
    t2List = []
    for tok2 in token2:
        word2 = Word(tok2)
        w2 = word2.spellcheck()
        correctw = w2[0][0]
        confidence = w2[0][1]
        if (confidence > 0.8) and (correctw not in stop):
            t2List.append(correctw)
    for i in range(len(TextItems)):
        token = generateTokens(TextItems[i])
        tokenList.append(token)
        token = []
    # spell correction
    # POS tagging
    word1 = wn.synset('dog.n.01')
    word2 = wn.synset('cat.n.01')
    word1.path_similarity(word2)
    return CosineSimilarity
def processSentence(sentence):
    result_list = list()
    wiki = TextBlob(sentence)
    for word_tuple in wiki.tags:
        # print(word_tuple)
        try:
            w = Word(word_tuple[0])
            if word_tuple[1].startswith('JJ'):
                k = 'a'
            else:
                k = word_tuple[1][0].lower()
            if k in {'c', 'p', 'i', 'w', 'd', 't', 'm', 'e', 'u', 'f', 's'}:
                norm = w.lemmatize()
            else:
                norm = w.lemmatize(k)
            # print(k, norm)
        except:
            # print word_tuple
            norm = ''
        if norm == word_tuple[0]:
            norm = ''
        result_list.append(word_tuple[0] + '/' + norm + '/' + word_tuple[1])
    return ' '.join(result_list)
print()
# sentiment analysis
testimonial = TextBlob("Textblob is amazingly simple to use. What great fun!")
print("Textblob is amazingly simple to use. What great fun!")
a = testimonial.sentiment
print(a)  # returns polarity (-1 for worst to 1 for best) and subjectivity

print()
# word inflection
sentence = TextBlob('Use 4 spaces per indentation level.')
print(sentence.words[2].singularize())  # similarly you can use pluralize

print()
# word lemmatization
w = Word("octopi")
print("octopi -> ", w.lemmatize())
w = Word("went")
print("went -> ", w.lemmatize("v"))

print()
# definition
print("Octopus : ", Word("octopus").definitions)

print()
# translation and language detection
en_blob = TextBlob(u'Simple is better than complex.')
print('Simple is better than complex.')
print("SPANISH : ", en_blob.translate(to='es'))
en_blob = TextBlob(u'Comment allez vous?')
print('Comment allez vous?')
def asprate(review):
    food_selected_sent = []
    service_selected_sent = []
    price_selected_sent = []
    ambience_selected_sent = []
    category = []
    zen = TextBlob(review)
    sentences = zen.sentences
    for sentence in sentences:
        words = sentence.words
        for i in words:
            w = Word(i)
            i = w.lemmatize()
            if i in food:
                food_selected_sent.append(sentence)
                break
            elif i in service:
                service_selected_sent.append(sentence)
                break
            elif i in price:
                price_selected_sent.append(sentence)
                break
            elif i in ambience:
                ambience_selected_sent.append(sentence)
                break
    # print (food_selected_sent, service_selected_sent, price_selected_sent, ambience_selected_sent)
    food_polarity = []
    service_polarity = []
    price_polarity = []
    ambience_polarity = []
    for i in food_selected_sent:
        food_polarity.append(i.sentiment.polarity)
    for i in service_selected_sent:
        service_polarity.append(i.sentiment.polarity)
    for i in price_selected_sent:
        price_polarity.append(i.sentiment.polarity)
    for i in ambience_selected_sent:
        ambience_polarity.append(i.sentiment.polarity)
    # print (food_polarity, service_polarity, price_polarity, ambience_polarity)
    if food_polarity:
        sum_food = 0
        for i in food_polarity:
            sum_food += i
        print "food", scale_rating(sum_food / len(food_polarity))
    if service_polarity:
        sum_service = 0
        for i in service_polarity:
            sum_service += i
        print "service", scale_rating(sum_service / len(service_polarity))
    if price_polarity:
        sum_price = 0
        for i in price_polarity:
            sum_price += i
        print "price", scale_rating(sum_price / len(price_polarity))
    if ambience_polarity:
        sum_ambience = 0
        for i in ambience_polarity:
            sum_ambience += i
        print "ambience", scale_rating(sum_ambience / len(ambience_polarity))
        im = enhancer.enhance(5)
        im = im.convert('1')
        # Saving the image
        im.save('final.jpg')
        # Reading the image: get the text from the image using pytesseract
        text = pytesseract.image_to_string(Image.open('final.jpg'))
        if len(text) != 0:
            print(text)
            token = nltk.word_tokenize(text)
            l = len(token)
            list_sugg = []
            for i in range(0, l):
                print("...................")
                t_line = TextBlob(token[i])
                w_line = Word(token[i])
                suggestions = w_line.spellcheck()
                length = len(suggestions)
                print("are you looking for")
                for j in range(0, length):
                    print(str(j + 1) + "->" + str(suggestions[j][0]))
                print("according to me :" + str(t_line.correct()))
                list_sugg.append(str(t_line.correct()))
            print("according to me......")
            print(" ".join(list_sugg))
        # 'q' to exit
        if cv2.waitKey(1) & 0xFF == ord('q'):
            break
    except:
        break
import pandas as pd
from textblob import TextBlob
from textblob import Word

mon_ami_photos = pd.read_pickle(r'C:\Users\LauraM\Desktop\mon_ami_gabi_photos.pkl')
mon_ami_reviews = pd.read_pickle(r'C:\Users\LauraM\Desktop\mon_ami_gabi_reviews.pkl')
# print(mon_ami_photos['caption'])
# print(mon_ami_reviews['text'])

reviewsEntities = []
counter = 0
totalRev = len(mon_ami_reviews['text'])
for review in mon_ami_reviews['text']:
    print '%d : %s' % (len(reviewsEntities), review)
    textTB = TextBlob(review)
    textTB = textTB.correct()
    print textTB.sentiment
    entities = []
    for word in textTB.noun_phrases:
        w = Word(word)
        w = w.singularize()
        w = Word(w).lemmatize()
        entities.append(w)
        print(w)
    reviewsEntities.append(entities)
    print '%d/%d : %s' % (len(reviewsEntities), totalRev, entities)
def decodeWeatherData(data):
    status = Word(data['status'].lower())
    status = status.lemmatize()

    # Set origin
    setStructureOrigin(status)

    if status == 'clear':
        pass
    elif status == 'rain':
        status = 'rainy'
    elif status == 'cloud':
        status = 'cloudy'

    poemStructure['weather_status'] = [status]
    poemStructure['actual_temp'] = [str(data['temp'])]

    '''
    Above 120F: Torrid (R34,G0,B0)
    110 to 120F: Extremely hot (R58,G0,B0)
    100 to 110F: Excessively hot (R88,G0,B0)
    90 to 100F: Very hot (R192,G0,B0)
    80 to 90F: Hot (R255,B0,G0)
    70 to 80F: Very warm (R255,G192,B0)
    60 to 70F: Warm (R255,G255,B0)
    50 to 60F: Mild (R204,G102,B0)
    40 to 50F: Cool (R146,G208,B80)
    30 to 40F: Chilly (R115,G190,B211)
    10 to 30F: Cold (R0,G112,B192)
    10 to -20F: Very cold (R112,G48,B160)
    -20 to -40F: Bitterly cold (R214,G0,B147)
    Below -40F: Brutally cold (R255,G102,B153)

    Read more: http://www.city-data.com/forum/weather/1620160-your-personal-temperature-colors-descriptors-climate.html#ixzz47Ga7mIFW
    '''
    if data['temp'] < 20:
        poemStructure['temp_descriptor'] = ['biting', 'frigid', 'frosty', 'glacial', 'icy', 'numbing', 'polar', 'wintry', 'arctic', 'bitter', 'chill', 'chilled', 'cutting']
    elif data['temp'] < 40:
        poemStructure['temp_descriptor'] = ['breezy', 'brisk', 'cool', 'crisp', 'freezing', 'frosty', 'icy', 'wintry', 'arctic', 'icebox', 'sharp', 'biting', 'blowy', 'drafty', 'fresh', 'glacial', 'hawkish', 'nippy', 'penetrating', 'snappy']
    elif data['temp'] < 60:
        poemStructure['temp_descriptor'] = ['mild', 'moderate', 'pleasant', 'refreshing', 'summerlike', 'summery', 'temperate']
    elif data['temp'] < 80:
        poemStructure['temp_descriptor'] = ['balmy', 'broiling', 'clement', 'flushed', 'glowing', 'heated', 'hot', 'lukewarm', 'pleasant', 'snug', 'summery', 'sweaty', 'temperate', 'thermal', 'warmish']
    elif data['temp'] < 100:
        poemStructure['temp_descriptor'] = ['baking', 'blistering', 'broiling', 'burning', 'fiery', 'hot', 'red-hot', 'roasting', 'scalding', 'scorching', 'sizzling', 'torrid', 'tropical', 'warm']

    # Wind descriptors found here:
    # http://gyre.umeoce.maine.edu/data/gomoos/buoy/php/variable_description.php?variable=wind_speed
    wind = data['wind']
    if wind < 3:
        poemStructure['wind_descriptor'] = ['smoke rises vertically', 'the air is calm']
        wind = ['calmly', 'stilly']
    elif wind < 7:
        poemStructure['wind_descriptor'] = ['weather vanes are quiet', 'smoke drifts calmly']
        wind = ['lightly']
    elif wind < 12:
        poemStructure['wind_descriptor'] = ['small twigs move', 'light flags extend']
        wind = ['gently']
    elif wind < 18:
        poemStructure['wind_descriptor'] = ['small branches sway', 'paper blows about']
        wind = ['moderately']
    elif wind < 24:
        poemStructure['wind_descriptor'] = ['trees lazily sway', 'waves are breaking']
        wind = ['freshly']
    elif wind < 31:
        poemStructure['wind_descriptor'] = ['the wind tugs', 'wind rushes', 'umbrellas revolt']
        wind = ['strongly']
    elif wind < 38:
        poemStructure['wind_descriptor'] = ['people walk at acute angles', 'twigs break']
        wind = ['gusting']
    else:
        poemStructure['wind_descriptor'] = ['feels like a hurricane', 'trees are falling']
        wind = ['severely', 'violently']

    return {}
for i in range(20):
    random_ballet = story_ballets[randint(0, len(story_ballets) - 1)]
    section = len(random_ballet) / 20
    random_line = random_ballet[randint(i * section, (i + 1) * section)]
    random_line = random_line.decode('utf-8')
    blob = TextBlob(random_line)
    nouns = []
    verbs = []
    adjs = []
    advs = []
    prps = []
    for word, pos in blob.tags:
        # print word, pos
        if pos == "NNS" or pos == "NNP" or pos == "NN" or pos == "PRP":
            w = Word(word.lower())
            nouns.append(w.lemmatize())
        elif pos == "VBZ" or pos == "VBG" or pos == "VBD":
            w = Word(word)
            verbs.append(w.lemmatize())
            # print w.lemmatize(word)
        elif pos == "JJ":
            adjs.append(word)
        elif pos == "RB":
            advs.append(word)
        elif pos == "PRP$":
            prps.append(word)
    loop = True
    search_for = []
    print random_line
#!/usr/bin/python
from textblob import Word
from textblob.wordnet import NOUN

word = Word("plant")
print word.get_synsets(NOUN)
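# The synsets returned by Word.get_synsets() are plain NLTK Synset objects, so they can be
# compared with WordNet similarity measures, as the query-expansion function near the top of
# this page does. A minimal standalone sketch along those lines (the 0.3 threshold is only an
# illustrative value, not taken from any of the repos above):
from textblob import Word
from textblob.wordnet import NOUN

plant_syns = Word("plant").get_synsets(NOUN)
tree_syns = Word("tree").get_synsets(NOUN)
if plant_syns and tree_syns:
    score = plant_syns[0].path_similarity(tree_syns[0])
    # path_similarity may return None for senses in unrelated parts of the hierarchy
    if score is not None and score > 0.3:
        print(score)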
from textblob import Word
import sys

string_tocheck = Word(sys.argv[1])
print string_tocheck.spellcheck()[0][0]
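# spellcheck() returns the whole candidate list as (word, confidence) tuples with the most
# likely correction first, so a script like the one above can also show alternatives or apply
# a confidence cutoff. A small sketch assuming that shape (the 0.8 cutoff is only an example):
from textblob import Word

candidates = Word("fianlly").spellcheck()
for suggestion, confidence in candidates:
    print(suggestion + " " + str(confidence))
best, score = candidates[0]
if score > 0.8:
    print(best)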
# archFreq = open('Frecuencia.txt', 'w')
# archTF_IDF = open('TF_IDF.txt', 'w')
# archDoc = open('Documentos.txt', 'w')
lineas = archLectura.readlines()
# print "Read Line: %s" % (lineas)
lineaTotal = ""
for linea in lineas:
    lineaTotal += linea
blob = TextBlob(lineaTotal.decode('utf-8'))
# split into words
words = blob.words
# print words
# remove the stop words and apply Snowball stemming
comentarios = []
stemmer = SnowballStemmer("spanish")
for word in words:
    if word not in (stopwords.words('spanish')):  # remove stop words
        w = Word(word)
        comentarios.append(stemmer.stem(w.lower()))
print "Nuevo: %s " % (comentarios)
# print stopwords.words('spanish')