def txt2words(self, txt, remove_stopwords=True):
    txt = BeautifulSoup(txt).get_text()
    txt = ftfy.fix_text(txt)
    txt = txt.replace("\\n", '')
    txt = re.sub("[^0-9a-zA-Z]", " ", txt)
    if remove_stopwords:
        words = [self.save_stem(w) for w in txt.lower().split()
                 if w not in self.stopwords and len(w) > 2 and not w.isdigit()]
    else:
        words = [self.save_stem(w) for w in txt.lower().split()
                 if len(w) > 2 and not w.isdigit()]
    return words
def report_to_wordlist(report):
    # Function to convert document text to a sequence of words,
    # optionally removing stop words. Returns a list of [word, count] pairs.
    # Remove HTML tags and related
    report_text = BeautifulSoup(report).get_text()
    # Remove non-letters
    report_text = re.sub("[^a-zA-Z]", " ", report_text)
    # Convert words to lower case and split them
    words = report_text.lower().split()
    myStops = ["any", "my", "like", "another", "one", "two", "else", "bras",
               "ago", "cos", "get", "yet", "k", "go", "every", "sort",
               "push", "pull"]
    stoplist = set(stopwords.words("english") + myStops)
    words = [w for w in words if w not in stoplist and len(w) > 3]
    wordListTuple = Counter(words).most_common()
    listofWords = [[pair[0], pair[1]] for pair in wordListTuple]
    # Return a list of [word, count] pairs
    return listofWords
def process_song(song, remove_stopwords=True):
    # Function to convert raw song lyrics to a sequence of words,
    # optionally removing stop words. Returns a list of words.
    #
    # 1. Remove HTML
    song_text = BeautifulSoup(song).get_text()
    #
    # 2. Remove literal "\n" sequences, separate out comma and ! symbols
    #    from words, and remove the rest of the characters.
    song_text = re.sub(r"\\n", " ", song_text)
    # TODO: Should we keep comma and ! ??
    song_text = re.sub(r"(,|!)", r" \1", song_text)
    song_text = re.sub("[^a-zA-Z',!]", " ", song_text)
    #
    # 3. Convert words to lower case and split them
    words = song_text.lower().split()
    #
    # 4. Optionally remove stop words (true by default)
    if remove_stopwords:
        stops = set(stopwords.words("english"))
        words = [w for w in words if w not in stops]
    #
    # 5. Return a list of words
    return words
    # 6. (Alternative) Join the words back into one string separated by
    #    space, and return the result:
    # return " ".join(words)
def clean_review(raw_review, remove_stopwords=False, output_format="string"):
    """
    Input:
        raw_review: raw text of a movie review
        remove_stopwords: a boolean variable to indicate whether to remove stop words
        output_format: if "string", return a cleaned string;
                       if "list", return a list of words extracted from the cleaned string
    Output:
        Cleaned string or list.
    """
    # Remove HTML markup
    text = BeautifulSoup(raw_review)
    # Keep only characters
    text = re.sub("[^a-zA-Z]", " ", text.get_text())
    # Split words and store to list
    text = text.lower().split()
    if remove_stopwords:
        # Use set as it has O(1) lookup time
        stops = set(stopwords.words("english"))
        words = [w for w in text if w not in stops]
    else:
        words = text
    # Return a cleaned string or list
    if output_format == "string":
        return " ".join(words)
    elif output_format == "list":
        return words
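# Hedged usage sketch for clean_review above (assumes the NLTK stopword
# corpus is downloaded; `sample_review` is illustrative, not corpus data):
sample_review = "<br />This movie was <b>great</b>, 10/10!"
print(clean_review(sample_review, remove_stopwords=True, output_format="string"))
# expected: "movie great"
print(clean_review(sample_review, remove_stopwords=True, output_format="list"))
# expected: ['movie', 'great']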
def review_to_words(raw_review, remove_stopwords=False):
    # BeautifulSoup pulls data out of html file;
    # here it removes html tags and markups
    text = BeautifulSoup(raw_review).get_text()
    # replace numbers by the word "number"
    text = re.sub(r'[0-9]+', 'number', text)
    # remove punctuation (it can be analyzed for better results)
    text = re.sub(r'[^a-zA-Z]', ' ', text)
    text = text.lower()
    # make a list of words
    words_list = text.split()
    # download nltk text data sets, including stop words
    # nltk.download()
    if remove_stopwords:
        # get stopwords; searching a set is faster than searching a list
        stops = set(stopwords.words('english'))
        # remove stopwords
        words_list = [word for word in words_list if word not in stops]
        # reduce words to their stems
        stemmer = PorterStemmer()
        words_list = [stemmer.stem(word) for word in words_list]
    # return the list of words
    return words_list
def reviewToWordList(rawReview, removeStopWords=False):
    """
    Converts a document to a sequence of words, optionally removing stop
    words; will later extend to optionally remove numbers.
    I/O
    -Input: raw html in string form
    -Output: list of words
    """
    # Remove HTML
    cleanedReview = BeautifulSoup(rawReview).get_text()
    # Remove non-letters
    cleanedReview = re.sub("[^a-zA-Z]", " ", cleanedReview)
    # Convert words to lower case
    cleanedReview = cleanedReview.lower()
    # Split words
    wordList = cleanedReview.split()
    # Optionally remove stop words
    if removeStopWords:
        stops = set(stopwords.words('english'))
        wordList = [word for word in wordList if word not in stops]
    # Return list of words
    return wordList
def process_strings(string):
    # 1. Remove HTML
    words = BeautifulSoup(string).get_text()
    # separate joint words (e.g. "fooBar" -> "foo Bar")
    words = re.sub(r'(\w+)([A-Z][a-z]+)',
                   lambda m: " " + m.group(1) + " " + m.group(2), words)
    # 3. Convert to lower case
    words = words.lower()
    # remove unwanted characters
    ddd = re.sub(r'[^a-zA-Z0-9\s]', " ", words)
    # split dimension patterns like "4x8", "4x ", " x8", " x "
    ddd2 = re.sub(r"(\d+)x(\d+)", lambda m: m.group(1) + " " + m.group(2), ddd)
    ddd3 = re.sub(r"(\d+)x\s", lambda m: m.group(1) + " ", ddd2)
    ddd4 = re.sub(r"\sx(\d+)", lambda m: " " + m.group(1), ddd3)
    ddd5 = re.sub(r"\sx\s", " ", ddd4)
    # split digits from letters
    fff = re.sub(r"(\D+)(\d+)", lambda m: m.group(1) + " " + m.group(2), ddd5)
    fff2 = re.sub(r"(\d+)(\D+)", lambda m: m.group(1) + " " + m.group(2), fff)
    words = re.sub(r"(\d+)(\D+)(\d+)",
                   lambda m: m.group(1) + " " + m.group(2) + " " + m.group(3),
                   fff2)
    # repeatedly strip unit abbreviations; the alternation is built by
    # adjacent-string concatenation so no stray whitespace enters the pattern
    units = (r"\s(ft|sq|in|gal|cu|h|oz|dia|yd|yds|a|p|qt|ah|amp|gpm|mp"
             r"|quart|watt|cc|d|inc|incl|lb|lbs|lin|ln|mil|mm|no|n|oc"
             r"|od|pc|pal|pt|s|sch|cs|case|pallet|w)\s")
    for i in range(1, 10):
        words = re.sub(units, " ", words)
    # Return the list of words
    return words.split()
def get_flickr_image_title(url):
    def meta(tag):
        return (tag.name == 'meta' and 'name' in tag.attrs
                and tag['name'] == 'title')
    html = HTTP.request('GET', url + '/sizes/o/')
    title = BeautifulSoup(html.data, 'html5lib').find(meta)['content'].split('|')[0]
    return title.lower()
def review_to_wordlist(review, remove_stopwords=False):
    # Function to convert a document to a sequence of words,
    # optionally removing stop words. Returns a list of stemmed words.
    #
    # 1. Remove HTML
    review_text = BeautifulSoup(review).get_text()
    #
    # 2. Remove non-letters
    review_text = re.sub("[^a-zA-Z]", " ", review_text)
    #
    # 3. Convert words to lower case and split them
    words = review_text.lower().split()
    #
    # 4. Optionally remove stop words (false by default)
    if remove_stopwords:
        stops = set(stopwords.words("english"))
        words = [w for w in words if w not in stops]
    # Stem each word and collect the results
    b = []
    stemmer = english_stemmer  # PorterStemmer()
    for word in words:
        b.append(stemmer.stem(word))
    # 5. Return the list of stemmed words
    return b
def review_to_words(raw_review):
    # function to convert a raw review to a string of words
    # the input is a single string (a raw movie review), and
    # the output is a single string (a preprocessed movie review)
    # 1. remove html
    review_text = BeautifulSoup(raw_review).get_text()
    #
    # 2. remove non-letters
    # letters_only = re.sub("[^a-zA-Z]", " ", review_text)
    #
    # 3. convert to lower case, split into individual words
    # words = letters_only.lower().split()
    words = review_text.lower().split()
    #
    # 4. in python, searching a set is much faster than searching
    #    a list, so convert the stop words to a set
    # stops = set(stopwords.words("english"))
    #
    # 5. remove stop words
    # meaningful_words = [w for w in words if not w in stops]
    #
    # 6. join the words back into one string separated by space,
    #    and return the result.
    # return " ".join(meaningful_words)
    return " ".join(words)
def reviewToWords(rawReview):
    """
    Converts raw review to a string of words
    -Input is single html string
    -Output is preprocessed single string
    """
    # Remove HTML
    cleanedReview = BeautifulSoup(rawReview)
    # Remove numbers and punctuation
    cleanedReview = re.sub("[^a-zA-Z]", " ", cleanedReview.get_text())
    # Make all words lowercase
    cleanedReview = cleanedReview.lower()
    # Split into individual words
    cleanedReviewWords = cleanedReview.split()
    # Convert to set instead of list for efficiency
    stops = set(stopwords.words("english"))
    # Remove stop words
    meaningfulWords = [word for word in cleanedReviewWords if word not in stops]
    # Join words back into one string
    return " ".join(meaningfulWords)
def _extract_date(tag: str, el: bs4.element.Tag, verbose: bool = False) -> list:
    result = []
    if len(el) > 300:
        return []
    # if verbose:
    #     print(el)
    if tag == 'meta' and el.has_attr('content'):
        result.append(el['content'])
    if tag == 'abbr' and all([el.has_attr('itemprop'), el.has_attr('title')]):
        result.append(el['title'])
    # if tag == 'time' and el.has_attr('datetime'):
    #     result.append(el['datetime'])
    _ = el.prettify()
    _ = BeautifulSoup(_, "lxml").getText()
    _ = _[:300]
    if _:
        result.append(_.lower().strip())
    # if verbose:
    #     pprint.pprint(result)
    return result
def review_str_to_wordlist(raw_review, clean_method, remove_numbers=True,
                           remove_punct=True, remove_stopwords=True):
    """Clean one single review item (string) and return it as a list of words

    :param raw_review: the unprocessed raw review string
    :param clean_method: the method to clean review, e.g., BeautifulSoup
    :param remove_numbers: boolean, if remove numbers
    :param remove_punct: boolean, if remove punctuation
    :param remove_stopwords: boolean, if remove stopwords
    :returns: cleaned review
    :rtype: list of words
    """
    if clean_method == 'BeautifulSoup':
        word_list = BeautifulSoup(raw_review, 'lxml').get_text()
    else:
        sys.exit('review_str_to_wordlist: The clean method is not supported yet!')
    if remove_numbers and remove_punct:
        word_list = re.sub('[^a-zA-Z]', ' ', word_list).lower().split()
    elif remove_numbers and not remove_punct:
        word_list = re.sub('[0-9]', ' ', word_list).lower().split()
    elif not remove_numbers and remove_punct:
        word_list = re.sub('[^a-zA-Z0-9]', ' ', word_list).lower().split()
    else:
        word_list = word_list.lower().split()
    if remove_stopwords:
        stops = set(stopwords.words('english'))
        word_list = [word for word in word_list if word not in stops]
    return word_list
class MovieReview(object):
    def __init__(self, mreview):
        self.mreview = mreview
        self.mreview_clean = None
        self.mreview_word_list = []
        self.mreview_sentence_list = []

    def clean_review(self):
        # strip html from the review text body
        self.mreview_clean = BeautifulSoup(self.mreview).get_text()

    def remove_punctuation_and_nums(self):
        self.mreview_clean = re.sub("[^a-zA-Z]", " ", self.mreview_clean)

    def split_review_into_words(self):
        # split the review text into a list of words
        self.mreview_word_list = self.mreview_clean.lower().split()

    def remove_stop_words(self):
        stops = set(stopwords.words("english"))
        self.mreview_word_list = [word for word in self.mreview_word_list
                                  if word not in stops]
        self.mreview_clean = " ".join(self.mreview_word_list)

    def split_review_into_sentences(self):
        # split the review into a list of sentences,
        # where each sentence is a list of words
        extracted_sentences = TOKENIZER.tokenize(self.mreview_clean.strip())
        for extracted_sentence in extracted_sentences:
            if len(extracted_sentence) > 0:
                # extracted_sentence needs to be operated on if stopword or
                # punctuation removal is required eventually (not required
                # for word2vec)
                self.mreview_sentence_list.append(extracted_sentence.lower().split())
def review_to_wordlist(review, remove_stopwords=False, generate_bigrams=False):
    # Function to convert a document to a sequence of words,
    # optionally removing stop words. Returns a list of words.
    #
    # 1. Remove HTML
    review_text = BeautifulSoup(review).get_text()
    #
    # 2. Remove non-letters
    review_text = re.sub("[^a-zA-Z\'\"]", " ", review_text)
    #
    # 3. Convert words to lower case and split them
    words = review_text.lower().split()
    if generate_bigrams:
        bigrams = []
        for gram in KaggleWord2VecUtility.generate_ngrams(words, 2):
            bigrams.append('{0} {1}'.format(gram[0], gram[1]))
        words.extend(bigrams)
    #
    # 4. Optionally remove stop words (false by default)
    if remove_stopwords:
        stops = set(stopwords.words("english"))
        words = [w for w in words if w not in stops]
    #
    # 5. Return a list of words
    return words
def review_to_wordlist(review, remove_stopwords=False):
    # Function to convert a document to a sequence of words, optionally
    # removing stop words (the stopword corpus is needed here).
    # Returns a list of words.
    #
    # 1. Remove HTML
    review_text = BeautifulSoup(review).get_text()
    #
    # 2. Remove non-letters
    review_text = re.sub("[^a-zA-Z]", " ", review_text)
    #
    # 3. Convert words to lower case and split them
    words = review_text.lower().split()
    # 3.5 remove more words
    # review_text = re.sub(filter_words, " ", review_text)
    #
    # 4. Optionally remove stop words (false by default)
    if remove_stopwords:
        stops = set(stopwords.words("english"))
        words = [w for w in words if w not in stops]
    #
    # 5. Return a list of words
    return words
def text_to_wordlist(review, remove_stopwords=False):
    # Function to convert a document to a sequence of words,
    # optionally removing stop words. Returns a list of words.
    #
    # 1. Remove HTML
    text = BeautifulSoup(review, 'html.parser').get_text()
    #
    # 2. Remove non-letters and expand contractions
    text = re.sub(r"[^A-Za-z0-9^,?!.\/'+\-=]", " ", text)
    text = re.sub(r"what's", "what is ", text)
    text = re.sub(r"\'s", " ", text)
    text = re.sub(r"\'ve", " have ", text)
    text = re.sub(r"can't", "cannot ", text)
    text = re.sub(r"n't", " not ", text)
    text = re.sub(r"i'm", "i am ", text)
    text = re.sub(r"\'re", " are ", text)
    text = re.sub(r"\'d", " would ", text)
    text = re.sub(r"\'ll", " will ", text)
    text = re.sub(r"\'scuse", " excuse ", text)
    text = re.sub(r",", " ", text)
    text = re.sub(r"\.", " ", text)
    text = re.sub(r"!", " ! ", text)
    text = re.sub(r"\?", " ? ", text)
    #
    # 3. Convert words to lower case and split them
    words = text.lower().split()
    #
    # 4. Optionally remove stop words (false by default)
    if remove_stopwords:
        stops = set(stopwords.words("english"))
        words = [w for w in words if w not in stops]
    # 5. Return a list
    return words
def retrieve_from_url(url):
    """
    Retrieves text from url, removing the string formatting \n and \t
    """
    soup = BeautifulSoup(requests.get(url).text)
    text = soup.text.replace("\n", " ").replace("\t", " ")
    return text.lower()
def sentimentToWordlist(rawReview, removeStopwords=False, removeNumbers=False,
                        removeSmileys=False):
    # use BeautifulSoup library to remove the HTML/XML tags (e.g., <br />)
    reviewText = BeautifulSoup(rawReview).get_text()
    # Emotional symbols may affect the meaning of the review
    smileys = """:-) :) :o) :] :3 :c) :> =] 8) =) :} :^) :D 8-D 8D x-D xD
                 X-D XD =-D =D =-3 =3 B^D :( :/ :-( :'( :D :P""".split()
    smiley_pattern = "|".join(map(re.escape, smileys))
    # [^...] matches a single character not contained within the brackets;
    # alternation ("|") has no meaning inside a character class, so the
    # smiley pattern is applied as a separate substitution
    keep = "[^a-zA-Z]" if removeNumbers else "[^a-zA-Z0-9]"
    if removeSmileys:
        reviewText = re.sub(smiley_pattern, " ", reviewText)
        reviewText = re.sub(keep, " ", reviewText)
    else:
        # clean token-by-token so smiley tokens survive the character strip
        tokens = reviewText.split()
        tokens = [t if re.fullmatch(smiley_pattern, t) else re.sub(keep, " ", t)
                  for t in tokens]
        reviewText = " ".join(tokens)
    # split into a list of words
    words = reviewText.lower().split()
    if removeStopwords:
        # create a set of all stop words
        stops = set(stopwords.words("english"))
        # remove stop words from the list
        words = [w for w in words if w not in stops]
    return words
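# Hedged usage sketch for sentimentToWordlist above; the review string is
# illustrative only. With removeSmileys=False, emoticon tokens survive the
# character strip.
print(sentimentToWordlist("Great movie :) 10/10"))
# expected: ['great', 'movie', ':)', '10', '10']
print(sentimentToWordlist("Great movie :) 10/10", removeNumbers=True,
                          removeSmileys=True))
# expected: ['great', 'movie']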
def review_to_wordlist(review, remove_stopwords=False):
    # Function to convert a document to a sequence of words,
    # optionally removing stop words. Returns a list of words.
    #
    # 1. Remove HTML
    review_text = BeautifulSoup(review).get_text()
    #
    # 2. Remove non-letters
    review_text = re.sub("[^a-zA-Z]", " ", review_text)
    #
    # 3. Convert words to lower case and split them
    words = review_text.lower().split()
    #
    # 4. Optionally remove stop words (false by default)
    if remove_stopwords:
        stops = set(stopwords.words("english"))
        words = [w for w in words if w not in stops]
    # * Keep only the most common words; seems to make results worse
    # common_words = nltk.FreqDist(words).most_common(50)
    # words = [w[0] for w in common_words]
    # * Morphological processing, no clear improvement
    # words = filter(lambda w: w != None, [wn.morphy(w) for w in words])
    # 5. Return a list of words
    return words
def processing(raw_review):
    # 1. Remove HTML
    review_text = BeautifulSoup(raw_review).get_text()
    # 2. Convert all to lower case
    review_text = review_text.lower()
    # 3. Remove punctuation
    letters_only = remove_punctuations(review_text)
    return letters_only
def review_to_wordlist(review, remove_stopwords=False):
    review_text = BeautifulSoup(review).get_text()
    review_text = re.sub("[^a-zA-Z]", " ", review_text)
    words = review_text.lower().split()
    if remove_stopwords:
        stops = set(stopwords.words("english"))
        words = [w for w in words if w not in stops]
    return words
def sentenceToWordList(self, review, remove_stopwords=False):
    review_text = BeautifulSoup(review).getText()
    review_text = re.sub("[^a-zA-Z]", " ", review_text)
    words = review_text.lower().split()
    if remove_stopwords:
        stops = set(stopwords.words("english"))
        words = [w for w in words if w not in stops]
    return words
def review_to_words(review, stopwords):
    # Function to convert a review into a sequence of words
    review_text = BeautifulSoup(review).get_text()
    review_text = re.sub("[^a-zA-Z]", ' ', review_text)
    words = review_text.lower().split()
    words = [w for w in words if w not in stopwords]
    return " ".join(words)
def review_to_words(raw_review):
    review_words_only = BeautifulSoup(raw_review)
    review_words_only = re.sub("[^0-9a-zA-Z]", " ", review_words_only.get_text())
    review_words_only = review_words_only.lower()
    # words_in_lower_case = only_letters_lower.split()
    # words_without_stopwords = [w for w in words_in_lower_case
    #                            if not w in stopwords.words("english")]
    # return " ".join(words_without_stopwords)
    return review_words_only
def preprocess_post(text):
    """Preprocessor for MSE questions, to be applied before TFIDF.

    Strips out HTML, converts everything to lowercase, and removes digits.
    """
    result = BeautifulSoup(text).get_text()
    result = result.lower()
    result = ''.join(c for c in result if not c.isdigit())
    return result
def get_restaurants(url):
    try:
        urls = get_page_urls(url)
        for url in urls:
            data = get_text_from_url(url)
            search_div = BeautifulSoup(str(data)).find('div', class_='search-results-content')
            uls = BeautifulSoup(str(search_div)).findAll('ul', class_='ylist ylist-bordered search-results')
            for restaurant in BeautifulSoup(str(uls[1])).findAll('li', class_='regular-search-result'):
                main_attrs = BeautifulSoup(str(restaurant)).find('div', class_='main-attributes')
                rating = BeautifulSoup(str(main_attrs)).find('div', class_='rating-large')
                rating_data = str(BeautifulSoup(str(rating)).find('i').attrs['title'])
                rating_data = rating_data.replace('star rating', '')
                review_count = str(BeautifulSoup(str(main_attrs)).find(
                    'span', class_='review-count rating-qualifier').text.strip())
                review_count = review_count.replace(' reviews', '')
                sub_url = BeautifulSoup(str(main_attrs)).find('a').attrs['href']
                url = 'http://www.yelp.com' + sub_url
                category_data = BeautifulSoup(str(main_attrs)).find('div', class_='price-category')
                category_str_list = BeautifulSoup(str(category_data)).findAll('span', class_='category-str-list')
                categories = ''
                for a in BeautifulSoup(str(category_str_list)).findAll('a'):
                    categories = categories + a.text.strip() + ','
                expensive_level = BeautifulSoup(str(category_data)).find(
                    'span', 'business-attribute price-range').text
                h3 = BeautifulSoup(str(restaurant)).find('h3', class_='search-result-title')
                h3_a = BeautifulSoup(str(h3)).find('a').text
                name = h3_a.strip()
                sec_attrs = BeautifulSoup(str(restaurant)).find('div', class_='secondary-attributes')
                address = BeautifulSoup(str(sec_attrs)).find('address')
                if '<br/>' in str(address):
                    address = str(address).replace('<br/>', ' ')
                address = BeautifulSoup(str(address)).find('address').text.strip()
                city = get_city_from_address(address)
                if not str(city).lower() in address.lower():
                    print('Invalid city detected')
                RestaurantModel.objects.create(
                    name=name,
                    expensivelevel=expensive_level,
                    city=city,
                    current_rating=float(rating_data),
                    url=url,
                    category=categories,
                    address=address,
                    reviewcount=review_count
                )
        set_db_status(False)
    except Exception as e:
        print(str(e) + ' get_restaurants')
        set_db_status(False)
def review_to_wordlist(review, remove_stopwords=False):
    review_text = BeautifulSoup(review).get_text()
    review_text = re.sub("[^a-zA-Z]", " ", review_text)
    # collapse runs of a repeated character down to two (e.g. "soooo" -> "soo")
    review_text = re.sub(r'(.)\1+', r'\1\1', review_text)
    words = review_text.lower().split()
    if remove_stopwords:
        stops = set(stopwords.words("english"))
        words = [w for w in words if w not in stops]
    return words
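# Hedged usage sketch for review_to_wordlist above: runs of a repeated
# character collapse to two, so exaggerated spellings normalize partially.
# The input string is illustrative only.
print(review_to_wordlist("This was sooooo goooood!!!"))
# expected: ['this', 'was', 'soo', 'good']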
def preproc(review, use_stopwords=False):
    review_text = BeautifulSoup(review, "lxml").get_text()
    review_text = re.sub("[^a-zA-Z]", " ", review_text)
    if use_stopwords:
        stops = set(nltk.corpus.stopwords.words("english"))
        words = [w for w in review_text.lower().split() if w not in stops]
        return " ".join(words)
    return review_text.lower()
def clean_text(text):
    # Remove HTML
    review_text = BeautifulSoup(text, 'lxml').get_text()
    # Remove non-letters
    review_text = re.sub("[^a-zA-Z]", " ", review_text)
    # Convert words to lower case and split them
    words = review_text.lower().split()
    # Remove stopwords
    stops = set(stopwords.words('english'))
    words = [w for w in words if w not in stops]
    return words
def tweet_cleaning_for_sentiment_analysis(tweet):
    # Escaping HTML characters
    tweet = BeautifulSoup(tweet).get_text()
    # Special cases not handled previously.
    tweet = tweet.replace('\x92', "'")
    tweet = tweet.replace('"', "'")
    tweet = tweet.replace("…", ".")
    tweet = tweet.replace("\\'", "'")
    tweet = tweet.replace("#", "")
    tweet = tweet.replace("—", "")
    # Removal of hashtags/accounts
    tweet = ' '.join(re.sub("(@[A-Za-z0-9]+)|(#[A-Za-z0-9]+)", " ", tweet).split())
    # Removal of addresses
    tweet = ' '.join(re.sub(r"(\w+:\/\/\S+)", " ", tweet).split())
    # Removal of punctuation
    tweet = ' '.join(re.sub(r"[\[\]\'\\\.\,\!\?\:\;\-\=]", " ", tweet).split())
    # Lower case
    tweet = tweet.lower()
    # CONTRACTIONS source: https://en.wikipedia.org/wiki/Contraction_%28grammar%29
    CONTRACTIONS = load_dict_contractions()
    tweet = tweet.replace("’", "'")
    # normalize unicode spaces (non-breaking and thin space)
    tweet = tweet.replace("\u00a0", " ")
    tweet = tweet.replace("\u2009", " ")
    words = tweet.split()
    reformed = [CONTRACTIONS[word] if word in CONTRACTIONS else word
                for word in words]
    tweet = " ".join(reformed)
    # Standardizing words (collapse character runs to at most two)
    tweet = ''.join(''.join(s)[:2] for _, s in itertools.groupby(tweet))
    # Deal with emoticons source: https://en.wikipedia.org/wiki/List_of_emoticons
    SMILEY = load_dict_smileys()
    words = tweet.split()
    reformed = [SMILEY[word] if word in SMILEY else word for word in words]
    tweet = " ".join(reformed)
    # Deal with emojis
    tweet = emoji.demojize(tweet)
    tweet = tweet.replace(":", " ")
    tweet = ' '.join(tweet.split())
    return tweet
def cleanReview(review):
    # 1. Remove HTML
    review_text = BeautifulSoup(review).get_text()
    # 2. Remove non-letters
    review_text = re.sub("[^a-zA-Z]", " ", review_text)
    # 3. Convert words to lower case
    review_text = review_text.lower()
    # 4. Remove stop words (tokenize the cleaned text, not the raw review)
    review_words = word_tokenize(review_text)
    stops = set(stopwords.words("english"))
    words = [w for w in review_words if w not in stops]
    return ' '.join(words)
def install(name):
    """
    Usage: dash.py install <name>...

    Options:
        -h --help    Show this screen and exit.
    """
    if isinstance(name, list):
        return [install(n) for n in name]
    content = ""
    name = name.lower()
    if os.path.exists(name):
        content = open(name, "r").read()
    else:
        if '//' in name:
            url = name
        else:
            url = "https://raw.github.com/whtsky/Dash.py/" \
                  "master/dash_py/packages/%s.yaml" % name
        if resource_exist(url):
            r = requests.get(url)
            content = r.content
    if content:
        package = yaml.safe_load(content)
        install_package(package)
        return
    # Try to download the document from rtfd
    r = requests.get("https://readthedocs.org/projects/%s/downloads/" % name)
    if r.status_code != 200:
        logger.error("Can't find package %s" % name)
        return
    name = BeautifulSoup(r.content).title.string.split("|")[0].strip()
    for branch in ['stable', 'master', 'latest']:
        if branch not in r.text:
            continue
        docset_url = "https://media.readthedocs.org/dash/" \
                     "{0}/{1}/{2}.tgz".format(name.lower(), branch, name)
        if resource_exist(docset_url):
            install_package({
                "name": name,
                "type": "docset",
                "url": docset_url,
                "format": "tar"
            })
            return
    logger.error("Can't find package %s" % name)
    return -1
def headline_to_words(headline):
    stemmer = WordNetLemmatizer()
    text = BeautifulSoup(headline, "html.parser").get_text()  # Remove HTML tags
    text = re.sub(r"[^a-zA-Z0-9]", " ", text.lower())  # Convert to lower case
    words = text.split()  # Split string into words
    stops = set(stopwords.words("english"))
    words = [w for w in words if w not in stops]  # Remove stopwords
    words = [stemmer.lemmatize(w) for w in words]  # lemmatize
    return words
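# Hedged usage sketch for headline_to_words above (assumes the NLTK
# stopwords and wordnet corpora are downloaded; the headline is illustrative):
print(headline_to_words("Stocks <b>rallied</b> as markets rose 3%"))
# expected: ['stock', 'rallied', 'market', 'rose', '3']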
def review_to_words(review):
    nltk.download("stopwords", quiet=True)
    stemmer = PorterStemmer()
    text = BeautifulSoup(review, "html.parser").get_text()  # Remove HTML tags
    text = re.sub(r"[^a-zA-Z0-9]", " ", text.lower())  # Convert to lower case
    words = text.split()  # Split string into words
    stops = set(stopwords.words("english"))
    words = [w for w in words if w not in stops]  # Remove stopwords
    words = [stemmer.stem(w) for w in words]  # stem
    return words
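# Hedged usage sketch for review_to_words above (requires nltk and
# beautifulsoup4; the review text is illustrative only). Note how the
# Porter stemmer truncates words rather than producing dictionary forms:
print(review_to_words("<br />The actors were amazingly convincing!"))
# expected: ['actor', 'amazingli', 'convinc']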
def review_to_wordlist(review, remove_stopwords=False):
    review_text = BeautifulSoup(review).get_text()
    review_text = re.sub("[^a-zA-Z]", " ", review_text)
    words = review_text.lower().split()
    if remove_stopwords:
        stops = set(stopwords.words("english"))
        words = [w for w in words if w not in stops]
    return words
def clean_sentences(df):
    reviews = []
    for sent in tqdm(df['Phrase']):
        review_text = BeautifulSoup(sent).get_text()
        review_text = re.sub("[^a-zA-Z]", " ", review_text)
        words = word_tokenize(review_text.lower())
        lemma_words = [lemmatizer.lemmatize(i) for i in words]
        reviews.append(lemma_words)
    return reviews
def clean_review(raw):
    # remove HTML
    review_text = BeautifulSoup(raw).get_text()
    # remove non-letters
    review_text = re.sub("[^a-zA-Z]", " ", review_text)
    words = review_text.lower().split()
    # remove stopwords
    stops = set(stopwords.words("english"))
    meaningful_words = [w for w in words if w not in stops]
    return " ".join(meaningful_words)
def clean_text(text):
    text = BeautifulSoup(text, "lxml").text  # HTML decoding
    text = text.lower()  # lowercase text
    text = REPLACE_BY_SPACE_RE.sub(' ', text)  # replace REPLACE_BY_SPACE_RE symbols by space in text
    text = BAD_SYMBOLS_RE.sub(' ', text)  # delete symbols which are in BAD_SYMBOLS_RE from text
    text = ' '.join(word for word in text.split()
                    if word not in STOPWORDS)  # delete stopwords from text
    stemmer = PorterStemmer()
    text = ' '.join(stemmer.stem(word) for word in text.split())  # stem each word
    return text
def clean_text(self, text):
    """
    text: a string
    return: modified initial string
    """
    text = BeautifulSoup(text, "lxml").text  # HTML decoding
    text = text.lower()  # lowercase text
    text = REPLACE_BY_SPACE_RE.sub(' ', text)  # replace REPLACE_BY_SPACE_RE symbols by space in text
    text = BAD_SYMBOLS_RE.sub('', text)  # delete symbols which are in BAD_SYMBOLS_RE from text
    text = ' '.join(word for word in text.split()
                    if word not in STOPWORDS)  # delete stopwords from text
    return text
def clean_data(text):
    cleaned = []
    for t in text:
        # strip html markup first, while the tags are still intact
        t = BeautifulSoup(t, 'lxml').get_text()
        t = re.sub(r'@[A-Za-z0-9]+', '', t)  # remove @mentions
        t = re.sub(r'https?://[A-Za-z0-9./]+', '', t)  # remove links
        t = re.sub("[^a-zA-Z]", " ", t)  # remove numbers and punctuation
        t = t.replace("RT", "")
        t = t.lower()
        cleaned.append(t)
    return cleaned
def review_to_wordlist(review, remove_stopwords=False):
    # 1. Remove HTML
    review_text = BeautifulSoup(review, features="html.parser").get_text()
    # 2. Remove non-letters
    review_text = re.sub("[^a-zA-Z]", " ", review_text)
    # 3. Convert words to lower case and split them
    words = review_text.lower().split()
    # 4. Optionally remove stop words (false by default)
    if remove_stopwords:
        stops = set(stopwords.words("english"))
        words = [w for w in words if w not in stops]
    return words
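# Hedged sketch of applying review_to_wordlist above over a batch; `reviews`
# is an illustrative list, not data from the original corpus.
reviews = ["<b>Loved</b> it!", "Dull plot, bad acting."]
wordlists = [review_to_wordlist(r, remove_stopwords=True) for r in reviews]
# expected: [['loved'], ['dull', 'plot', 'bad', 'acting']]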
def review_to_wordlist(review):
    '''
    Convert an IMDB review into a sequence of words
    '''
    # Strip the HTML tags and keep the text content
    review_text = BeautifulSoup(review).get_text()
    # Use a regular expression to keep only the wanted characters
    review_text = re.sub("[^a-zA-Z]", " ", review_text)
    # Lowercase all words and split into a word list
    words = review_text.lower().split()
    # Return words
    return words
def clean_sentences(sentence):
    # Remove HTML tags
    review = BeautifulSoup(sentence, features="html5lib").get_text()
    # Remove punctuation
    review = re.sub("[^a-zA-Z]", " ", review)
    # Lowercase all letters
    review = review.lower()
    # Split the sentence into a list of words
    words_lists = review.split()
    # return like ['word0', 'word1', ...]
    return words_lists
def clean_text(text):
    """
    text: a string
    return: cleaned initial string
    """
    text = BeautifulSoup(text, "lxml").text  # HTML decoding
    text = text.lower()  # lowercase first, since the symbol pattern only keeps a-z
    text = re.compile(r'[/(){}\[\]\|@,;]').sub(' ', text)  # replace matched symbols by space in text
    text = re.compile('[^0-9a-z #+_]').sub('', text)  # delete all other symbols from text
    return text
def simplified_answer(answer):
    # nltk.download("stopwords", quiet=True)
    text = BeautifulSoup(answer, "html.parser").get_text()  # Remove HTML tags
    # text = re.sub(r"[^a-zA-Z0-9\-]", " ", text.lower())  # Convert to lower case
    # Removed "-" to better leverage the pretrained vocabulary
    text = re.sub(r"[^a-zA-Z0-9]", " ", text.lower())  # Convert to lower case
    words = text.split()  # Split string into words
    # words = [w for w in words if w not in stopwords.words("english")]  # Remove stopwords
    # words = [PorterStemmer().stem(w) for w in words]  # stem
    return ' '.join(words)
def clean_text(text):
    # Strip HTML tags
    text = BeautifulSoup(text, 'html.parser').get_text()
    # Strip punctuation
    text = re.sub(r'[^a-zA-Z]', ' ', text)
    # Lowercase
    words = text.lower().split()
    # Remove stopwords (currently disabled)
    stopwords = {}.fromkeys([line.rstrip() for line in open('stopwords.txt')])
    # words = [word for word in words if word not in stopwords]
    return ' '.join(words)
def email_list_action():
    try:
        banner()
        loading_4()
    except KeyboardInterrupt:
        print()
        error_text("Detected Ctrl+C. Shutting down...")
        exit(1)
    email_code()
    info_text("Installing awk if not installed...")
    os.system("apt-get -qq install -y awk 2> /dev/null")
    HASH_FILE = str(sys.argv[2])
    info_text("Changing email:hash to hash...")
    os.system("awk -F: '{print $2}' %s > only_hashes.txt" % HASH_FILE)
    HASH_FILE = "only_hashes.txt"
    os.system('echo "" > results.txt')
    info_text("Checking the hashes. This might take a while...")
    with open(HASH_FILE, "r") as reader:
        while True:
            line = reader.readline()
            if not line:
                break
            HASHED = line.strip()
            URL = ("https://md5decrypt.net/en/Api/api.php"
                   "?hash=%s&hash_type=md5&email=%s&code=%s"
                   % (HASHED, USER_EMAIL, API_CODE))
            try:
                # Use the requests lib to get the content of the page
                PAGE = requests.get(URL, headers=HEADERS)
                PAGE_CONTENT = BeautifulSoup(PAGE.content, "html.parser").get_text()
            except Exception as e:
                error_text("Exception happened while connecting to md5decrypt: " + str(e))
                continue
            if PAGE_CONTENT.strip() != "":
                if "error" not in PAGE_CONTENT.lower():
                    with open("results.txt", "a") as add_text:  # this is better
                        add_text.write("{}:{}".format(HASHED, PAGE_CONTENT))
                elif "ERROR CODE : 002" in PAGE_CONTENT:
                    os.system("rm results.txt")
                    print()
                    print(" %s%s[!] Error. Wrong email / code.%s"
                          % (Style.RESET_ALL, Fore.RED, Style.RESET_ALL))
                    print()
                    exit(1)
    os.system('echo "" >> results.txt')
    success_text("All done! Results sent to results.txt [hash:text]")
    warning_text("Make sure to move results.txt if you are going to run the "
                 "script again. It will delete the actual one!")
    print()
    exit(1)
def tweet_cleaning_for_sentiment_analysis(tweet):
    translator = Translator()
    # Escaping HTML characters
    tweet = BeautifulSoup(tweet).get_text()
    # Deal with emoticons
    words = tweet.split()
    reformed = [SMILEY[word] if word in SMILEY else word for word in words]
    tweet = " ".join(reformed)
    # Special cases not handled previously.
    tweet = tweet.replace('\x92', "")
    tweet = tweet.replace('\x85', "")
    # Removal of hashtags/accounts
    tweet = ' '.join(re.sub("(@[A-Za-z0-9]+)", " user ", tweet).split())
    # check the position in the text
    tweet = ' '.join(re.sub("#", "", tweet).split())  # |(#[A-Za-z0-9]+)
    # Removal of addresses
    # consider dropping the url only when it ends the sentence
    tweet = ' '.join(re.sub(r"(\w+:\/\/\S+)", " url ", tweet).split())
    # Removal of punctuation (also remove << or quotation marks)
    tweet = ' '.join(re.sub(r"[\.\,\!\?\:\;\-\=]", " ", tweet).split())
    # CONTRACTIONS source: https://en.wikipedia.org/wiki/Contraction_%28grammar%29
    tweet = tweet.replace("’", " ")
    # Standardizing words (collapse character runs to at most two)
    tweet = ''.join(''.join(s)[:2] for _, s in itertools.groupby(tweet))
    number = emoji.emoji_count(tweet)
    # Deal with emojis
    tweet = emoji.demojize(tweet, use_aliases=False, delimiters=("", ""))
    tweet = tweet.replace("_", " ")
    tweet = tweet.replace(":", " ")
    if number != 0:
        tweet = translator.translate(tweet, src='en', dest='it').text
    tweet = ' '.join(tweet.split())
    # Lower case
    tweet = tweet.lower()
    return tweet
def cleanText(text):
    text = BeautifulSoup(text, "lxml").text
    text = re.sub(r'\|\|\|', r' ', text)
    text = re.sub(r'http\S+', r'<URL>', text)
    text = text.lower()
    text = text.replace('x', '')
    clean_text = []
    for w in word_tokenize(text):
        if w.lower() not in stop:
            clean_text.append(w)
    return clean_text
def preprocess_text(text):
    # Remove html tags
    processed_text = BeautifulSoup(text, features="html.parser").get_text()
    # Remove capitalization
    processed_text = processed_text.lower()
    # Remove punctuation and numbers
    processed_text = re.sub(r"[^a-z'\s]", "", processed_text)
    # Remove quotation marks (but keep intra-word apostrophes)
    processed_text = re.sub(r"([^a-z])\'|\'([^a-z])", r"\1\2", processed_text)
    # Remove excessive whitespace
    processed_text = re.sub(r"\s+", r" ", processed_text)
    return processed_text
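# Hedged usage sketch for preprocess_text above; the input is illustrative.
# Quotation marks around words are stripped while intra-word apostrophes
# such as the one in "it's" survive.
print(preprocess_text("It's a 'quoted' word <br/> on two lines"))
# expected: "it's a quoted word on two lines"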
def reviewToWordlist(review):
    # First remove the HTML.
    reviewText = BeautifulSoup(review, features="html.parser").get_text()
    # Use regular expressions to only include words.
    reviewText = re.sub("[^a-zA-Z]", " ", reviewText)
    # Convert words to lower case and split them into separate words.
    words = reviewText.lower().split()
    # Return a list of words
    return words
def tweet_to_wordlist(tweet, remove_stopwords=False):
    tweet_text = BeautifulSoup(tweet).get_text()
    tweet_text = re.sub("[^a-zA-Z]", " ", tweet_text)
    words = tweet_text.lower().split()
    if remove_stopwords:
        stops = set(stopwords.words("english"))
        words = [w for w in words if w not in stops]
    return words
def review_to_words(review):
    nltk.download('stopwords', quiet=True)
    stemmer = PorterStemmer()
    text = BeautifulSoup(review, 'html.parser').get_text()  # Remove HTML tags
    text = re.sub(r'[^a-zA-Z0-9]', ' ', text.lower())  # Convert to lower case
    words = text.split()  # Split string into words
    stops = set(stopwords.words('english'))
    words = [w for w in words if w not in stops]  # Remove stopwords
    words = [stemmer.stem(w) for w in words]  # stem
    return words
def review_wordlist(review, remove_stopwords=False):
    # 1. Removing html tags
    review_text = BeautifulSoup(review, "html.parser").get_text()
    # 2. Removing non-letters.
    review_text = re.sub("[^a-zA-Z]", " ", review_text)
    # 3. Converting to lower case and splitting
    words = review_text.lower().split()
    # 4. Optionally remove stopwords
    if remove_stopwords:
        stops = set(stopwords.words("english"))
        words = [w for w in words if w not in stops]
    return words
def singleBook(location):
    myDir = location
    book = epub.read_epub(myDir)
    items = book.get_items_of_type(ebooklib.ITEM_DOCUMENT)
    outputtext = ""
    for item in items:
        cleantext = BeautifulSoup(item.get_content(), "lxml").text
        cleantext = re.sub(r'[^\w\s]', '', cleantext)
        cleantext = cleantext.lower()
        outputtext += cleantext
    print(outputtext)
    return outputtext
def review_to_wordlist(review):
    # Strip tags such as <br /><br />
    review_text = BeautifulSoup(review, "html.parser").get_text()
    # Strip punctuation
    review_text = re.sub("[^a-zA-Z]", " ", review_text)
    # Tokenize
    words = review_text.lower().split()
    # Remove stopwords (currently disabled)
    # words = [w for w in words if not w in stopwords.words("chinese")]
    # Return words
    return words
def clean_text(text):
    REPLACE_BY_SPACE_RE = re.compile(r'[/(){}\[\]\|@,;]')
    BAD_SYMBOLS_RE = re.compile('[^0-9a-z #+_]')
    STOPWORDS = set(stopwords.words('english'))
    text = BeautifulSoup(text, "lxml").text
    text = text.lower()
    text = REPLACE_BY_SPACE_RE.sub(' ', text)
    text = BAD_SYMBOLS_RE.sub('', text)
    text = ' '.join(word for word in text.split() if word not in STOPWORDS)
    return text
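# Hedged usage sketch for clean_text above; the post text is illustrative.
# Note that "#", "+" and "_" survive the symbol strip, while "." is deleted
# outright (it is not in REPLACE_BY_SPACE_RE), so "example.com" fuses.
print(clean_text("<p>C++ and C# differ (a lot); see http://example.com</p>"))
# expected: "c++ c# differ lot see http examplecom"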
def doc_preprocessing(self, doc):
    # Removes HTML tags
    doc = BeautifulSoup(doc, features="lxml").get_text()
    # Lowercase
    doc = doc.lower()
    # Remove accentuation
    doc = unicodedata.normalize('NFKD', doc).encode(
        'ASCII', 'ignore').decode('ASCII')
    # Remove punctuation
    doc = doc.translate(str.maketrans('', '', self.strip_punctuation))
    return doc
def review_to_wordlist(review, remove_stopwords=False):
    review_text = BeautifulSoup(review, 'html.parser').get_text()
    review_text = re.sub('[^a-zA-Z]', ' ', review_text)
    words = review_text.lower().split()
    if remove_stopwords:
        stops = set(stopwords.words('english'))
        words = [w for w in words if w not in stops]
    stemmer = SnowballStemmer('english')
    words = [stemmer.stem(w) for w in words]
    return words