def removestop(text):
    # Keep alphanumerics and the listed punctuation (the '-' is escaped so it is a
    # literal character, not a range); replace everything else with a space.
    text = re.sub(r"[^A-Za-z0-9^,!.\/'+\-=]", " ", text)
    text = text.lower().split()
    stops = ['so', 'his', 't', 'y', 'ours', 'herself', 'your', 'all', 'some', 'they', 'i',
             'of', 'didn', 'them', 'when', 'will', 'that', 'its', 'because', 'while', 'those',
             'my', 'don', 'again', 'her', 'if', 'further', 'now', 'does', 'against', 'won',
             'same', 'a', 'during', 'who', 'here', 'have', 'in', 'being', 'it', 'other',
             'once', 'itself', 'hers', 'after', 're', 'just', 'their', 'himself', 'theirs',
             'whom', 'then', 'd', 'out', 'm', 'mustn', 'where', 'below', 'about', 'isn',
             'shouldn', 'wouldn', 'these', 'me', 'to', 'doesn', 'into', 'the', 'until',
             'she', 'am', 'under', 'how', 'yourself', 'couldn', 'ma', 'up', 'than', 'from',
             'themselves', 'yourselves', 'off', 'above', 'yours', 'having', 'mightn', 'needn',
             'on', 'too', 'there', 'an', 'and', 'down', 'ourselves', 'each', 'hadn', 'ain',
             'such', 've', 'did', 'be', 'or', 'aren', 'he', 'should', 'for', 'both', 'doing',
             'this', 'through', 'do', 'had', 'own', 'but', 'were', 'over', 'not', 'are',
             'few', 'by', 'been', 'most', 'no', 'as', 'was', 'what', 's', 'is', 'you',
             'shan', 'between', 'wasn', 'has', 'more', 'him', 'nor', 'can', 'why', 'any',
             'at', 'myself', 'very', 'with', 'we', 'which', 'hasn', 'weren', 'haven', 'our',
             'll', 'only', 'o', 'before']
    text = [w for w in text if w not in stops]
    text = " ".join(text)
    text = text.replace(".", " ").replace(",", " ")
    return text
def transform_review_text(text):
    text = text.lower()
    text = re.sub('[^a-z ]', '', text)
    text_array = [
        stemmer.stem(word) for word in text.split() if word not in stopwords
    ]
    return ' '.join(text_array)
def clean_text_round1(text):
    '''Make text lowercase, remove punctuation, mentions, hashtags and words containing numbers.'''
    # make text lowercase
    text = text.lower()
    # removing text within brackets
    text = re.sub('\[.*?\]', '', text)
    # removing text within parentheses
    text = re.sub('\(.*?\)', '', text)
    # removing numbers
    text = re.sub('\w*\d\w*', '', text)
    # if there's more than 1 whitespace, then make it just 1
    text = re.sub('\s+', ' ', text)
    # if there's a new line, then make it a whitespace
    text = re.sub('\n', ' ', text)
    # removing any quotes
    text = re.sub('\"+', '', text)
    # removing &
    text = re.sub('(\&\;)', '', text)
    # removing any usernames
    text = re.sub('(@[^\s]+)', '', text)
    # removing any hashtags
    text = re.sub('(#[^\s]+)', '', text)
    # remove `rt` for retweet
    text = re.sub('(rt)', '', text)
    # string.punctuation is a string of all punctuation marks
    # so this gets rid of all punctuation
    text = re.sub('[%s]' % re.escape(string.punctuation), '', text)
    # getting rid of `httptco`
    text = re.sub('(httptco)', '', text)
    return text
def predict():
    # Prediction text
    text = request.args.get('text') if request.args.get('text') else ""
    # Split text using non alpha numeric, then remove stopwords
    text = ' '.join(
        [i for i in re.split('[^a-z0-9]', text.lower()) if i not in stop])
    # Number of predictions
    predictions = int(
        request.args.get('num')) if request.args.get('num') else 3
    retval = dict()
    slicing_value = -1 * (predictions + 2)
    cat_train.insert(0, text)
    tfidf = TfidfVectorizer(max_df=0.4,
                            min_df=2,
                            stop_words='english',
                            norm='l1',
                            use_idf=True).fit_transform(cat_train)
    cosine_similarities = linear_kernel(tfidf[0:1], tfidf).flatten()
    related_docs_indices = cosine_similarities.argsort()[-2:slicing_value:-1]
    for index, x in enumerate(list(related_docs_indices)):
        cos_value = list(cosine_similarities)[x]
        if cos_value > 0:
            retval[category[x - 1]] = str(cos_value)
    del cat_train[0]
    response = jsonify(retval)
    response.headers.add('Access-Control-Allow-Origin', CORS)
    #response.headers.add('Access-Control-Allow-Origin', 'https://dashboard.heroku.com/apps/bazarr-web')
    return response
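
# A self-contained sketch of the TF-IDF / cosine-similarity ranking that predict()
# performs, using a toy corpus in place of the module-level `cat_train` list and a
# simplified slice (the real endpoint limits results with `slicing_value`).
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel

docs = ["query about python web apps",             # the query text is prepended, as in predict()
        "python flask web development",
        "gardening tips for spring"]
tfidf = TfidfVectorizer().fit_transform(docs)
sims = linear_kernel(tfidf[0:1], tfidf).flatten()  # cosine similarity of the query vs. every doc
print(sims.argsort()[-2::-1])                      # remaining doc indices, most similar first
# typically [1 2]: doc 1 shares terms with the query, doc 2 shares none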
def preProcess(text):
    text = text.lower()
    tokenizer = RegexpTokenizer(r'\w+')
    tokens = tokenizer.tokenize(text)
    filtered_words = [w for w in tokens
                      if w not in stopwords.words('english') and not w.isdigit()]
    print("preprocess:" + " ".join(filtered_words))
    return " ".join(filtered_words)
def clean_text_round1(text):
    '''Make text lowercase, remove text in square brackets, remove punctuation
    and remove words containing numbers.'''
    text = text.lower()
    text = re.sub('\[.*?\]', '', text)
    text = re.sub('[%s]' % re.escape(string.punctuation), '', text)
    text = re.sub('\w*\d\w*', '', text)
    return text
def clean_text(self):
    text = re.sub('\[.*?\]', ' ', str(self.doc))
    text = re.sub('[^a-zA-Z]', ' ', text)
    text = re.sub('\d', ' ', text)
    text = " ".join(text.split())
    text = text.lower()
    return text
def clean_lowercase(text):
    # remove nan values
    if str(text) == 'nan':
        text = re.sub('nan', '', str(text))
    else:
        text = text.lower()
    return text
def MyNormalize(text):
    tokens = nltk.word_tokenize(text.lower())
    tokens = [t for t in tokens if t not in string.punctuation]
    toks = [
        WNL.lemmatize(t) if t not in exceptions else t for t in tokens
    ]
    return toks
def ProcessText(text):
    """ processes either a string or list of tokenized words """
    if isinstance(text, str):
        # remove non-word characters and lower the letters
        text = re.sub(r"<[^>]*>", r"", text)
        text = re.sub(r"#(\w+)", "", text)
        text = re.sub(r"@", "", text)
        text = re.sub(r"[^a-zA-Z0-9\s]+", r"", text.lower())
        # remove url strings from text
        text = RemoveUrl(text)
        # remove extra spaces
        text = re.sub(r"^ +", r"", text)
        text = re.sub(r" +", r" ", text)
        return text
    if isinstance(text, list):
        # apply the same cleaning to every item in the list
        return [ProcessText(item) for item in text]
    return None
def textClean(text):
    text = re.sub(r"[^A-Za-z0-9^,!.\/'+\-=]", " ", text)
    text = text.lower().split()
    # May change here (remove all the stop words)
    stops = set(my_stop_words)
    text = [w for w in text if w not in stops]
    text = " ".join(text)
    return text
def clean_text(text):
    '''Make text lowercase, remove text in square brackets, remove punctuation
    and remove words containing numbers.'''
    text = text.lower()
    text = re.sub('\[.*?\]', '', text)
    text = re.sub('[%s]' % re.escape(string.punctuation), '', text)
    text = re.sub('\w*\d\w*', '', text)
    '''Get rid of some additional punctuation and nonsensical text that was
    missed the first time around.'''
    text = re.sub('[‘’“”…]', '', text)
    text = re.sub('\n', '', text)
    return text
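
# A minimal before/after sketch for the clean_text function above (it assumes
# `re` and `string` are imported at module level, as the function itself requires).
sample = 'He said [sic] "Hello"... call me at 5pm!\n'
print(clean_text(sample))
# roughly: 'he said  hello call me at '  -- the bracketed text, punctuation,
# the digit-containing word '5pm' and the newline are all removed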
def generate_spelling_graph(text):
    text = text.lower()
    text = remove_punct(text)
    text = tokenizing(text)
    text = remove_stopwords(text)
    misspelled = spell.unknown(text)
    sizes = [len(text) - len(misspelled), len(misspelled)]
    labels = 'correctly spelled words', 'misspelled words'
    generate_pie_chart(sizes, labels)
    generate_bar_chart(text, misspelled, "Occurrence")
def sentimentAFINN(text):
    words = pattern_split.split(text.lower())
    sentiments = [afinn.get(word, 0) for word in words]
    if sentiments:
        sentiment = float(sum(sentiments)) / math.sqrt(len(sentiments))
    else:
        sentiment = 0
    return sentiment
def initial_clean(text):
    """
    Function to clean text-remove punctuations, lowercase text etc.
    """
    # remove digits and special chars
    text = re.sub("[^a-zA-Z ]", "", text)
    text = text.lower()  # lower case text
    text = nltk.word_tokenize(text)
    return text
def clean_text_round1(text):
    '''Make text lowercase, remove text in square brackets, remove punctuation
    and remove words containing numbers.'''
    text = text.lower()
    #print('text after changing to lower case:- %s' %(text))
    text = re.sub('\[.*?\]', '', text)  # removes text inside square brackets.
    text = re.sub('[%s]' % re.escape(string.punctuation), '',
                  text)  # removes punctuation marks from the string.
    text = re.sub('\w*\d\w*', '',
                  text)  # \w*\d\w* matches any word that contains a digit.
    return text
def clean_text_round1(text):
    '''Make text lowercase, remove text in square brackets, remove punctuation
    and remove words containing numbers.'''
    text = text.lower()
    text = text.replace('-', ' ')
    text = text.replace("’s ", " ").replace("' ", " ")
    text = text.replace("s’ ", " ").replace("s' ", " ")  # remove proper possessives
    text = re.sub('\[.*?\]', '', text)
    text = re.sub('[%s]' % re.escape(string.punctuation), '', text)
    text = re.sub('\w*\d\w*', '', text)
    return text
def process_text(text):
    doc = nlp(text.lower())
    result = []
    for token in doc:
        if token.text in nlp.Defaults.stop_words:
            continue
        if token.is_punct:
            continue
        if token.lemma_ == '-PRON-':
            continue
        result.append(token.lemma_)
    return " ".join(result)
def clean_data(text):
    """
    Clean data: lowercase, strip punctuation, digit-containing words,
    smart quotes and extra whitespace.
    """
    text = text.lower()
    text = re.sub('[%s]' % re.escape(string.punctuation), '', text)
    text = re.sub('\w*\d\w*', '', text)
    text = re.sub('[‘’“”…]', '', text)
    text = re.sub('\n', ' ', text)
    text = re.sub('\t', '', text)
    text = re.sub('\b', '', text)
    text = re.sub('[^a-z ]+', '', text)
    text = re.sub('\s\s+', ' ', text)
    return text
def process(text, tokenizer=TweetTokenizer(), stopwords=[]):
    """Process the text of a tweet:
    - Lowercase
    - Tokenize
    - Stopword removal
    - Digits removal

    Return: list of strings
    """
    text = text.lower()
    tokens = tokenizer.tokenize(text)
    return [tok for tok in tokens
            if tok not in stopwords
            and not any(i.isdigit() for i in tok)
            and len(tok) > 2
            and tok == tok.strip('#')        # drop hashtags
            and tok == tok.strip('@')        # drop mentions
            and not re.search('http', tok)]  # drop URLs
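
# A minimal usage sketch for process() above, assuming NLTK's English stop word
# list; the TweetTokenizer default and the `re` import come from the function itself.
from nltk.corpus import stopwords as nltk_stopwords

tweet = "RT @user: Loving the new #Python3 release!!! http://t.co/xyz"
print(process(tweet, stopwords=nltk_stopwords.words('english')))
# expected roughly: ['loving', 'new', 'release'] -- the mention, hashtag,
# short tokens, punctuation and the URL are all filtered out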
def prepare_text(text):
    tweet_text = []
    X_test = None
    try:
        tokens = nltk.word_tokenize(text.lower())
        clean_tokens = [word for word in tokens if word not in stopwords]
        stem_words = [stemmer.stem(token) for token in clean_tokens]
        tweet_text.append(" ".join(stem_words))
        X_test = tf_vect.transform(tweet_text)
    except Exception:
        pass
    return X_test
def sentimentAFINN(text):
    """
    Returns a float for sentiment strength based on the input text.
    Positive values are positive valence, negative value are negative valence.
    """
    words = pattern_split.split(text.lower())
    sentiments = [afinn.get(word, 0) for word in words]
    if sentiments:
        # How should you weight the individual word sentiments?
        # You could do N, sqrt(N) or 1 for example. Here I use sqrt(N)
        sentiment = float(sum(sentiments)) / math.sqrt(len(sentiments))
    else:
        sentiment = 0
    return sentiment
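
# A self-contained sketch of the sqrt(N) weighting used above, with a tiny
# hand-made AFINN-style lexicon; the real function relies on module-level
# `afinn` and `pattern_split` objects, which are assumptions here.
import math
import re

afinn_demo = {'good': 3, 'bad': -3}              # hypothetical lexicon entries
pattern_split_demo = re.compile(r'\W+')          # same idea as `pattern_split`

words = pattern_split_demo.split("a good movie, not bad".lower())
scores = [afinn_demo.get(w, 0) for w in words]   # [0, 3, 0, 0, -3]
print(sum(scores) / math.sqrt(len(scores)))      # 0.0 -- the +3 and -3 cancel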
def preprocessing(text):
    if type(text) == pd.Series:
        text = [entry.lower() for entry in text]
        text = [remove_punct(entry) for entry in text]
        text = [tokenizing(entry) for entry in text]
        text = [remove_stopwords(entry) for entry in text]
        text = [lemmatizing(entry) for entry in text]
        text = [" ".join(entry) for entry in text]
    elif type(text) == str:
        text = text.lower()
        text = remove_punct(text)
        text = tokenizing(text)
        text = remove_stopwords(text)
        text = lemmatizing(text)
        text = " ".join(text)
    return text
def cleaner(
    text,
):
    filters_regex = r'(?!\')(?:\W|_)'
    clean_text = re.sub(
        pattern=filters_regex,
        repl=' ',
        string=text.lower(),
    )
    clean_text = re.sub(
        pattern=r'\s+',
        repl=' ',
        string=clean_text,
    )
    return clean_text
def clean_text(text):
    '''
    Parameters: String
    Returns: String

    This function does the following process on the text:
        convert to lowercase
        removes punctuation
        removes special characters
    '''
    '''
    New implementation to remove the punctuation and replace with space
    Ref: https://stackoverflow.com/questions/42614458/how-to-replace-punctuation-with-whitespace
    '''
    punc_list = list(string.punctuation)
    translator = text.maketrans(dict.fromkeys(punc_list, " "))
    cleantext = text.lower().translate(translator)

    ## clear off numbers and normalize spaces between words
    ## and lowercase it
    cleantext = " ".join(
        [s for s in cleantext.split(" ") if s.strip() != ""]).lower()

    ## remove any non-printable (non-ascii) characters in the text
    printable = set(string.printable)
    cleantext = list(filter(lambda x: x in printable, cleantext))
    cleantext = "".join(cleantext)

    ## remove roman numerals from the string which
    ## are not in brackets
    toremove = [
        ' ii ', ' iii ', ' iv ', ' v ', ' vi ', ' vii ', ' viii ', ' ix ',
        ' x ', '!', '@', '#', '$', '%', '^', '&', '*', '$.'
    ]
    toremove = [item.strip() for item in toremove]
    text_array = cleantext.split()
    cleantext = [word.strip() for word in text_array if word not in toremove]
    cleantext = " ".join(cleantext)

    ## clear off all arabic numerals / digits in the text which are attached
    ## together with text
    numbers = [1]
    while len(numbers) != 0:
        numbers = re.findall('\d+', cleantext)
        for number in numbers:
            cleantext = cleantext.replace(number, " ")
    cleantext = re.sub(' +', ' ', cleantext)

    return cleantext.strip()
def get_parts(text):
    nouns = set()
    descriptives = set()
    text = text.lower().split()
    text = [i for i in text if i not in stop]
    text = [i for i in text if i not in punctuation]
    text = [i for i in text if len(i) > 1]
    # remove the call to nltk.pos_tag if `sentence` is a list of tuples as described above
    for word, pos in nltk.pos_tag(text):
        if pos in ['NN', 'NNP']:  # feel free to add any other noun tags
            nouns.add(word)
        elif pos in ['JJ', 'JJR']:
            descriptives.add(word)
    return list(nouns), list(descriptives)
def tokenize_text(text):
    text = text.lower()
    text = re.sub(r"http\S+", "", text)
    # strip punctuation using a translation table
    text = text.translate(str.maketrans("", "", string.punctuation))
    words = nltk.word_tokenize(text)
    words = [re.sub("[^A-Za-z0-9]", "", word) for word in words]
    final_words = []
    for word in words:
        if not word:
            continue
        if word in nltk.corpus.stopwords.words("english"):
            continue
        if word.startswith("@") or word.startswith("#"):
            continue
        if word.isnumeric():
            continue
        final_words.append(word)
    return final_words
def clean_text(text, remove_stop_words=False):
    text = text.lower()
    replace_punctuation = str.maketrans(string.punctuation,
                                        ' ' * len(string.punctuation))
    text = text.translate(replace_punctuation)
    text = re.sub(r'\s+', ' ', text)
    text = re.sub('[\n\r]', '', text)
    if remove_stop_words == True:
        text = text.split()
        new_text = []
        stemmer = PorterStemmer()
        for word in text:
            if word not in STOPWORDS:
                new_text.append(stemmer.stem(word))
        text = ' '.join(new_text)
    return text
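
# A minimal usage sketch for the clean_text function above, assuming STOPWORDS is
# NLTK's English stop word list (the original module-level STOPWORDS is not shown
# here) and that `re` and `string` are imported as the function requires.
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer

STOPWORDS = set(stopwords.words('english'))
print(clean_text("The cats were running quickly!", remove_stop_words=True))
# roughly: 'cat run quickli' -- punctuation becomes spaces, stop words are
# dropped and each remaining word is Porter-stemmed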
def clean_text(text, country):
    text = reduce(lambda a, kv: a.replace(*kv), contractions.items(),
                  text.lower())
    text = text.replace('\t', ' ').replace('\n', ' ').replace('\r', ' ')
    text = strip_accents(text)
    text = text.translate(
        str.maketrans(string.punctuation, ' ' * len(string.punctuation)))
    tokens = tk.tokenize(text)
    if country == 'USA':
        stopwords = usa_stopwords
    elif country == 'Canada':
        stopwords = canada_stopwords
    elif country == 'UK':
        stopwords = britain_stopwords
    else:
        raise ValueError("Country is invalid.")
    tokens = [
        w for w in tokens
        if w not in stopwords and len(w) > 2 and w != ' ' and not w.isdigit()
    ]
    return ' '.join(tokens)
def postTags():
    retval = {'tags': []}
    text = re.sub('%20', ' ', request.args.get('text')) if request.args.get('text') else ""
    # Remove characters that are not punctuation, numbers or alphabets
    text = re.sub('[^a-zA-Z0-9\s.?!-]', '', text)
    # Collapse extra spaces and tabs into a single space
    text = re.sub('\s+', ' ', text)
    print(text)
    blob = TextBlob(text.lower())
    temp = blob.tags
    for i in range(len(temp)):
        if temp[i][1] == 'JJ':
            k = 1
            found = ''
            while i + k < len(temp) and k < 2:
                if temp[i + k][1] == 'NN' or temp[i + k][1] == 'NNP':
                    retval['tags'].append(
                        string.capwords(temp[i][0] + ' ' + temp[i + k][0]))
                    temp[i + k] = (temp[i + k][0], 'DONE')
                    break
                k += 1
    for i in range(len(temp)):
        if temp[i][1] == 'NNP' or (temp[i][1] == 'VBN'
                                   and temp[i][0] not in stop):
            retval['tags'].append(string.capwords(temp[i][0].lemmatize()))
    retval['tags'] = list(set(retval['tags']))
    response = jsonify(retval)
    response.headers.add('Access-Control-Allow-Origin', CORS)
    return response
def tokenize_and_stem(text):
    return stem(text.lower().split(" "))
def normalizeWords(self, text):
    return re.compile(r'\W+', re.UNICODE).split(text.lower())
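
# Quick illustration of the \W+ split performed by normalizeWords above
# (shown outside the class for brevity):
import re

print(re.compile(r'\W+', re.UNICODE).split("Hello, world! It's 2020.".lower()))
# ['hello', 'world', 'it', 's', '2020', ''] -- note the trailing empty string
# when the text ends with a non-word character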