import re
import numpy as np
from numpy import log
from Vector import Vector

# ==== Cleaning Text ===

# Load stopwords
from stopwords import stopwords

# For every stopword that contains an apostrophe, also record the spelling
# with all "'" characters deleted (presumably so that text written without
# apostrophes is filtered too).
new_stop_words = set()
for word in stopwords:
    if "'" in word:
        new_stop_words.add(word.replace("'", ""))
# Rebind the module-level name to the original entries plus the new variants.
# NOTE(review): relies on the imported `stopwords` object providing .union()
# (presumably a set) -- confirm against the stopwords module.
stopwords = stopwords.union(new_stop_words)


# Sanitizer
def sanitize(text):
    """
    clean up:
        1. split a string into a list of words
        2. remove all @ handles
        3. remove all hash tags
        4. remove all links
        5. remove all stopwords
        6. throw away punctuation, except smiley faces
        7. make the final vector of words a set
    @args:
import sys
import re
import math
from Vector import Vector

# ==== Cleaning Text ===

# Load stopwords
from stopwords import stopwords

# For every stopword that contains an apostrophe, also record the spelling
# with all "'" characters deleted (presumably so that text written without
# apostrophes is filtered too).
new_stop_words = set()
for word in stopwords:
    if "'" in word:
        new_stop_words.add(word.replace("'",""))
# Rebind the module-level name to the original entries plus the new variants.
# NOTE(review): relies on the imported `stopwords` object providing .union()
# (presumably a set) -- confirm against the stopwords module.
stopwords = stopwords.union(new_stop_words)


# Sanitizer
def sanitize(text):
    """
    clean up:
        1. split a string into a list of words
        2. remove all @ handles
        3. remove all hash tags
        4. remove all links
        5. remove all stopwords
        6. throw away punctuation, except smiley faces
        7. make the final vector of words a set
    @args: