def createNewTrainingSet(self, training_data_file):
    """Build training arrays from a CSV of labelled tweets.

    Expected CSV layout (after one header row):
        row[1] tweet text, row[2] latitude, row[3] longitude,
        row[4] tweet frequency, row[5] personality label.

    Args:
        training_data_file: path to the training CSV.

    Returns:
        Tuple of (XTrain, YTrain, XTrainFeatures, XTrainSentiment,
        XTrainFreqTweets, geo_latitude, geo_longitude).  XTrain is
        always empty; it is kept for backward compatibility with
        existing callers that unpack seven values.
    """
    XTrain = []
    YTrain = []
    XTrainFeatures = []
    XTrainSentiment = []
    XTrainFreqTweets = []
    geo_latitude = []
    geo_longitude = []

    objFilterStopWords = FilterStopWords()
    objPreprocessTweets = PreprocessTweets()
    stopWords = objFilterStopWords.getStopWordList(
        '../../TwitterData/StopWords.txt')

    # Hoisted out of the loop: the original called stopwords.words('english')
    # once PER WORD of every tweet (a fresh corpus read each time).  A set
    # also gives O(1) membership tests instead of O(n) list scans.
    englishStopWords = set(stopwords.words('english'))

    # Read the tweets one by one and process them.
    inpTweets = csv.reader(open(training_data_file, 'rb'), delimiter=',')
    next(inpTweets)  # skip the header row; works on Python 2.6+ and 3

    for row in inpTweets:
        personality = row[5]
        tweet = row[1]

        # Strip CSV double-quote artefacts before preprocessing.
        cleanTweet = tweet.replace('"",""', " ").replace('""', " ")
        processedTweet = objPreprocessTweets.processTweet(cleanTweet)

        XTrainFreqTweets.append(int(row[4]))

        # Remove stop words from the processed tweet.
        wordsList = processedTweet.split()
        filtered_words = [w for w in wordsList if w not in englishStopWords]
        filteredTweets = ' '.join(filtered_words)

        # NOTE(review): featureVector is computed but never used; kept in
        # case getFeatureVector has side effects — confirm and remove.
        featureVector = objFilterStopWords.getFeatureVector(
            processedTweet, stopWords)

        geo_latitude.append(float(row[2]))
        geo_longitude.append(float(row[3]))

        # Average sentence polarity; guard against tweets whose processed
        # text yields no sentences (the original raised ZeroDivisionError).
        blob = TextBlob(processedTweet)
        if blob.sentences:
            totSentiment = sum(
                s.sentiment.polarity for s in blob.sentences
            ) / len(blob.sentences)
        else:
            totSentiment = 0
        XTrainSentiment.append(totSentiment)

        XTrainFeatures.append(filteredTweets)
        YTrain.append(
            personality.replace('[', '').replace('"', '').replace(']', ''))

    return (XTrain, YTrain, XTrainFeatures, XTrainSentiment,
            XTrainFreqTweets, geo_latitude, geo_longitude)
def createNewTrainingSet(self, training_data_file):
    """Build training arrays from a CSV of labelled tweets.

    Expected CSV layout (after one header row):
        row[1] tweet text, row[2] latitude, row[3] longitude,
        row[4] tweet frequency, row[5] personality label.

    Args:
        training_data_file: path to the training CSV.

    Returns:
        Tuple of (XTrain, YTrain, XTrainFeatures, XTrainSentiment,
        XTrainFreqTweets, geo_latitude, geo_longitude).  XTrain is
        always empty; it is kept so existing callers that unpack
        seven values keep working.
    """
    XTrain = []
    YTrain = []
    XTrainFeatures = []
    XTrainSentiment = []
    XTrainFreqTweets = []
    geo_latitude = []
    geo_longitude = []

    objFilterStopWords = FilterStopWords()
    objPreprocessTweets = PreprocessTweets()
    stopWords = objFilterStopWords.getStopWordList(
        '../../TwitterData/StopWords.txt')

    # Build the English stop-word set ONCE.  The original re-read
    # stopwords.words('english') for every single word in every tweet,
    # and scanned it linearly; a set makes each test O(1).
    englishStopWords = set(stopwords.words('english'))

    # Read the tweets one by one and process them.
    inpTweets = csv.reader(open(training_data_file, 'rb'), delimiter=',')
    next(inpTweets)  # skip header; next() works on Python 2.6+ and 3

    for row in inpTweets:
        personality = row[5]
        tweet = row[1]

        # Clean up doubled-quote CSV artefacts before preprocessing.
        cleanTweet = tweet.replace('"",""', " ").replace('""', " ")
        processedTweet = objPreprocessTweets.processTweet(cleanTweet)

        XTrainFreqTweets.append(int(row[4]))

        # Drop stop words from the processed tweet text.
        filtered_words = [
            w for w in processedTweet.split() if w not in englishStopWords
        ]
        filteredTweets = ' '.join(filtered_words)

        # NOTE(review): featureVector is never used afterwards; the call is
        # retained in case getFeatureVector has side effects — confirm.
        featureVector = objFilterStopWords.getFeatureVector(
            processedTweet, stopWords)

        geo_latitude.append(float(row[2]))
        geo_longitude.append(float(row[3]))

        # Mean sentence polarity; tweets with no sentences previously
        # crashed with ZeroDivisionError — default them to 0.
        blob = TextBlob(processedTweet)
        if blob.sentences:
            totSentiment = sum(
                s.sentiment.polarity for s in blob.sentences
            ) / len(blob.sentences)
        else:
            totSentiment = 0
        XTrainSentiment.append(totSentiment)

        XTrainFeatures.append(filteredTweets)
        YTrain.append(
            personality.replace('[', '').replace('"', '').replace(']', ''))

    return (XTrain, YTrain, XTrainFeatures, XTrainSentiment,
            XTrainFreqTweets, geo_latitude, geo_longitude)
from mmds.supervised.filter_stop_words import FilterStopWords
from mmds.supervised.preprocess_tweets import PreprocessTweets

# Read the user tweets one by one and print each tweet's feature vector.
objFilterStopWords = FilterStopWords()
objPreprocessTweets = PreprocessTweets()
stopWords = objFilterStopWords.getStopWordList(
    '../../TwitterData/StopWords.txt')

# The original also opened StopWords.txt a second time into an unused,
# never-closed handle (`st`) — removed to fix the file-descriptor leak.
# A context manager guarantees the tweets file is closed even on error.
with open('../../TwitterData/UserTweets.txt', 'r') as fp:
    for line in fp:
        processedTweet = objPreprocessTweets.processTweet(line)
        featureVector = objFilterStopWords.getFeatureVector(
            processedTweet, stopWords)
        print(featureVector)