def preProcessData(tuple):  # NOTE(review): parameter shadows builtin `tuple`; name kept so keyword callers don't break
    """Clean and normalize an iterable of (tweet, sentiment) pairs.

    Each tweet is stripped of hashtag markers, @usernames, URLs, punctuation,
    HTML tags, repeated letters, and stopwords, then lemmatized.

    Returns:
        list: one list of processed tokens per input tweet (fix: the original
        discarded every processed tweet and returned None).
    """
    # Hoisted out of the loop: building the stopword list per tweet and
    # testing membership against a list were both needless O(n) work.
    stopwordSet = set(nltk.corpus.stopwords.words('english'))
    processed = []
    for tweet, sentiment in tuple:
        # Remove hashtag markers (keep the tag text itself)
        tweet = re.sub('#', '', tweet)
        # Remove usernames like @Rahul
        tweet = re.sub(r'@[\w\d_]*', '', tweet)
        # Remove URLs. Fix: original pattern 'http.//...' matched 'http://'
        # only by accident (the '.' was a wildcard) and never 'https://'.
        tweet = re.sub(r'https?://[\w\d\.\\/]*', '', tweet)
        # Remove punctuation
        tweet = re.sub(r'[%\.\'\"\?:,;!-]', ' ', tweet)
        # Remove HTML tags
        tweet = re.sub('<.*?>', '', tweet)
        # Collapse repeated letters (e.g. 'coool' -> 'col')
        tweet = re.sub(r'([a-z])\1+', r'\1', tweet)
        # Strip a leading run of non-letter characters
        tweet = re.sub(r'^[^a-zA-Z]+', ' ', tweet)
        # Split camelCase into space-separated words
        tweet = re.sub("([a-z])([A-Z])", "\g<1> \g<2>", tweet)
        # Collapse runs of whitespace
        tweet = re.sub('[\s]+', ' ', tweet)
        # Remove stopwords. Fix: the original called tweet.remove(word) while
        # iterating tweet, which skips the element after every removal and so
        # leaves adjacent stopwords in place; build a new list instead.
        tweet = [word for word in tweet.split() if word not in stopwordSet]
        # Lemmatize. Fix: one lemmatizer instance, not one per word.
        lemmatizer = wnl()
        tweet = [lemmatizer.lemmatize(word) for word in tweet]
        # (Removed: leftover debug block that printed only the 7th tweet.)
        processed.append(tweet)
    return processed
def __init__(self):
    """Set up the sentence tokenizer, lemmatizer, and token-filter sets."""
    # Pre-trained Punkt sentence tokenizer (English pickle shipped with nltk).
    self.tokenizer = data.load('tokenizers/punkt/english.pickle')
    # WordNet lemmatizer instance, reused for all tokens.
    self.wnl = wnl()
    # Punctuation delimiters and stopwords filtered out during processing.
    self.delims = set('.,!:;')
    self.stopwords = {'the'}
sys.path.append("..")

from Parsers.Stemming.lovins import stem as lovins_stem
from nltk.stem.porter import PorterStemmer as porter
from nltk.stem.lancaster import LancasterStemmer as lancs
from nltk.stem.snowball import EnglishStemmer as snowball
from nltk.stem import WordNetLemmatizer as wnl

# Source : http://stackoverflow.com/questions/446052/python-best-way-to-check-for-python-version-in-a-program-that-uses-new-language
reqVersion = (3, 0)
curVersion = sys.version_info

# Shared stemmer/lemmatizer instances, constructed once at import time.
_lancs = lancs()
_porter = porter()
_snowball = snowball()
_wnl = wnl()


def removeEscapeChars(textString):
    """Round-trip textString through the escape codec to normalize escape sequences."""
    if curVersion < reqVersion:
        # Python 2: the 'string_escape' codec handles str escape sequences.
        return textString.encode('string_escape').decode('string_escape')
    else:
        # Python 3. Fix: the original passed the nonexistent codec name
        # 'encode_escape' to encode(); the codec is 'unicode_escape'.
        return textString.encode('unicode_escape').decode("unicode_escape")


def getASCIIChars(textString):
    """Drop all non-ASCII characters from textString.

    NOTE(review): `unicode` exists only on Python 2; this helper will raise
    NameError on Python 3 — confirm which interpreter is intended.
    """
    # Fix: the original referenced the undefined name `string` (NameError);
    # it clearly meant the parameter.
    return unicode(textString, 'ascii', 'ignore')


# Source: http://www.packtpub.com/article/parsing-specific-data-python-text-processing
def detectEncoding(textString):
    """Best-effort charset detection via chardet.

    Returns chardet's result dict, or None when the bytes cannot be decoded.
    """
    try:
        return chardet.detect(textString)
    except UnicodeDecodeError:
        # NOTE(review): the except body was truncated in the original source;
        # returning None as the best-effort failure value — confirm intent.
        return None
def wordLemmatizer(tweet_words):
    """Return a new list with every word in tweet_words lemmatized via WordNet.

    Fix: the original constructed a fresh WordNetLemmatizer for every single
    word; one instance is hoisted out of the comprehension and reused.
    """
    lemmatizer = wnl()
    return [lemmatizer.lemmatize(word) for word in tweet_words]