def preProcessData(tuple):
	"""Clean and tokenize a sequence of (tweet, sentiment) pairs.

	For each tweet: strips '#' signs, @usernames, URLs, punctuation and
	HTML tags, collapses repeated letters, splits camelCase, normalizes
	whitespace, removes English stopwords and lemmatizes the remaining
	tokens.

	Args:
		tuple: iterable of (tweet_text, sentiment) pairs.
			NOTE(review): the parameter name shadows the builtin
			``tuple``; kept unchanged for caller compatibility.

	Returns:
		list of (token_list, sentiment) pairs. (The original discarded
		its work and only printed the 7th tweet — debug code, removed.)
	"""
	# Hoisted out of the loop: the original rebuilt the stopword list for
	# every word of every tweet, and constructed a lemmatizer per word.
	stop_words = set(nltk.corpus.stopwords.words('english'))
	lemmatizer = wnl()
	processed = []
	for tweet, sentiment in tuple:
		# Remove hash signs (keeps the tag text itself)
		tweet = re.sub('#', '', tweet)
		# Remove usernames like @Rahul
		tweet = re.sub(r'@[\w\d_]*', '', tweet)
		# Remove URLs. The original pattern 'http.//[\w\d\.\\/]*' never
		# matched https:// links ('.' consumed one char before '//').
		tweet = re.sub(r'https?://\S+', '', tweet)
		# Remove punctuation
		tweet = re.sub(r'[%\.\'\"\?:,;!-]', ' ', tweet)
		# Remove HTML tags
		tweet = re.sub('<.*?>', '', tweet)
		# Collapse repeated letters (e.g. "coool" -> "col"; lossy but
		# intentional in the original)
		tweet = re.sub(r'([a-z])\1+', r'\1', tweet)
		# Drop a leading run of non-alphabetic characters
		tweet = re.sub(r'^[^a-zA-Z]+', ' ', tweet)
		# Split camelCase into space-separated words
		tweet = re.sub('([a-z])([A-Z])', r'\g<1> \g<2>', tweet)
		# Collapse runs of whitespace
		tweet = re.sub(r'[\s]+', ' ', tweet)
		# Remove stopwords. Building a new list avoids the original's
		# bug of calling list.remove() while iterating the same list,
		# which silently skipped the word after each removed stopword.
		words = [w for w in tweet.split() if w not in stop_words]
		# Lemmatize the surviving tokens
		words = [lemmatizer.lemmatize(w) for w in words]
		processed.append((words, sentiment))
	return processed
    def __init__(self):
        # NOTE(review): this method appears detached from its class in this
        # chunk (the class header is not visible); `data` is presumably
        # nltk.data and `wnl` is WordNetLemmatizer (imported below) — confirm.

        # Sentence tokenizer loaded from NLTK's bundled punkt model.
        self.tokenizer = data.load('tokenizers/punkt/english.pickle')
        # Shared lemmatizer instance for this object.
        self.wnl       = wnl()
        # Delimiter characters treated as token separators.
        self.delims = {".", ',', "!", ":", ";"}
        # Minimal stopword set (only "the" is filtered here).
        self.stopwords = {"the"}
# ---- Example #3 (scraper artifact "示例#3" / stray "0" converted to a comment) ----
sys.path.append("..")
from Parsers.Stemming.lovins    import stem              as lovins_stem
from nltk.stem.porter           import PorterStemmer     as porter
from nltk.stem.lancaster        import LancasterStemmer  as lancs
from nltk.stem.snowball         import EnglishStemmer    as snowball
from nltk.stem                  import WordNetLemmatizer as wnl

#Source : http://stackoverflow.com/questions/446052/python-best-way-to-check-for-python-version-in-a-program-that-uses-new-language
# Version guard: escape-handling code below branches on Python 2 vs Python 3.
reqVersion = (3,0)
curVersion = sys.version_info

# Module-level stemmer/lemmatizer singletons, constructed once so callers
# do not pay per-call instantiation cost.
_lancs = lancs()
_porter = porter()
_snowball = snowball()
_wnl = wnl()
        
def removeEscapeChars(textString):
        """Round-trip *textString* through the interpreter's escape codec.

        On Python 2 this used the 'string_escape' codec; on Python 3 the
        equivalent is 'unicode_escape'. The original Python 3 branch used
        the nonexistent codec name 'encode_escape', which raised
        LookupError on every call — fixed here.

        Args:
            textString: the string to process.

        Returns:
            The processed string (for ASCII input the encode/decode pair
            is an identity transform).
        """
        # Inlined version check (was module-level curVersion < reqVersion,
        # i.e. sys.version_info < (3, 0) — behavior-identical).
        if sys.version_info < (3, 0):
                return textString.encode('string_escape').decode('string_escape')  #Python 2
        else:
                return textString.encode('unicode_escape').decode('unicode_escape') #Python 3

def getASCIIChars(textString):
        """Return *textString* with all non-ASCII characters dropped.

        Fixes two defects in the original: it referenced an undefined name
        ``string`` instead of the parameter, and it used the Python-2-only
        ``unicode`` builtin (NameError on Python 3). The 'ignore' error
        handler silently discards characters outside the ASCII range.

        Args:
            textString: the string to filter.

        Returns:
            An ASCII-only copy of *textString*.
        """
        return textString.encode('ascii', 'ignore').decode('ascii')

#Source: http://www.packtpub.com/article/parsing-specific-data-python-text-processing
def detectEncoding(textString):
        """Best-effort character-encoding detection via chardet.

        Args:
            textString: the bytes/string to analyze.

        Returns:
            chardet's detection result dict, or None when decoding fails.

        NOTE(review): the ``except`` body was missing in the original,
        making the function a syntax error; returning None is the
        conservative completion — TODO confirm against callers.
        """
        try:
                return chardet.detect(textString)
        except UnicodeDecodeError:
                return None
def wordLemmatizer(tweet_words):
    """Lemmatize each token in *tweet_words* with WordNet.

    The original constructed a brand-new WordNetLemmatizer for every word
    inside the comprehension; one shared instance is sufficient and avoids
    the per-word instantiation cost.

    Args:
        tweet_words: iterable of word tokens.

    Returns:
        list of lemmatized tokens, in input order.
    """
    lemmatizer = wnl()
    return [lemmatizer.lemmatize(word) for word in tweet_words]