import csv
import random

import tweet_features


def main():
    # read all tweets and labels
    fp = open('face_tweets.csv', 'rb')
    reader = csv.reader(fp, delimiter=',', quotechar='"', escapechar='\\')
    tweets = []
    for row in reader:
        tweets.append([row[1], row[0]])
    fp.close()

    # # treat neutral and irrelevant the same
    # for t in tweets:
    #     if t[1] == 'irrelevant':
    #         t[1] = 'neutral'

    random.shuffle(tweets)
    fvecs = [(tweet_features.make_tweet_dict(t), s) for (t, s) in tweets]

    # dump tweets for which our feature selector found nothing
    # for i in range(0, len(tweets)):
    #     if tweet_features.is_zero_dict(fvecs[i][0]):
    #         print tweets[i][1] + ': ' + tweets[i][0]

    # apply PCA reduction
    # (v_train, v_test) = \
    #     tweet_pca.tweet_pca_reduce(v_train, v_test, output_dim=1.0)

    basic_analyze(fvecs)
    cross_validate(fvecs, 10, 0.8)
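# --- hypothetical sketch (not from the original project) ----------------------
# basic_analyze() and cross_validate() are defined elsewhere in the repo and are
# not shown here. The call cross_validate(fvecs, 10, 0.8) suggests repeated
# random 80/20 splits; a minimal version under that assumption could look like
# this, using NLTK's NaiveBayesClassifier and nltk.classify.accuracy().
import random

import nltk


def cross_validate(fvecs, folds, train_frac):
    """Average NaiveBayes accuracy over `folds` random train/test splits."""
    data = list(fvecs)
    scores = []
    for _ in range(folds):
        random.shuffle(data)
        cut = int(len(data) * train_frac)
        classifier = nltk.NaiveBayesClassifier.train(data[:cut])
        scores.append(nltk.classify.accuracy(classifier, data[cut:]))
    print('mean accuracy over %d runs: %.3f' % (folds, sum(scores) / len(scores)))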
# debug variant: read rows, inspect labels, and count empty feature vectors
for row in reader:
    tweets.append([row[1], row[0]])
    print(row[1])
print(tweets[0:50])

# treat neutral and irrelevant the same
for t in tweets:
    print(t[0])
    if t[0] == 'irrelevant':
        t[0] = 'neutral'

# split into training and test sets
random.shuffle(tweets)
fvecs = [(tweet_features.make_tweet_dict(s), t) for (t, s) in tweets]
v_train = fvecs[:375]
v_test = fvecs[375:]
# print(str(v_train))
# print(str(v_test))

for i in range(0, 20):
    print(fvecs[i])

# count tweets for which our feature selector found nothing
tot = 0
for i in range(0, len(tweets)):
    if tweet_features.is_zero_dict(fvecs[i][0]):
        # print(tweets[i][1] + ': ' + tweets[i][0])
        tot = tot + 1
print(tot)
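# --- hypothetical sketch (not the project's tweet_features.py) ----------------
# make_tweet_dict() and is_zero_dict() live in tweet_features.py, which is not
# shown in this section. A minimal stand-in, assuming simple keyword-presence
# features, might look like the following; the keyword list is invented purely
# for illustration.
def make_tweet_dict(text):
    """Map a tweet to a binary feature dict over a small keyword list."""
    words = text.lower().split()
    keywords = ['good', 'bad', 'love', 'hate', 'happy', 'sad']  # assumed list
    return {'has(%s)' % w: (w in words) for w in keywords}


def is_zero_dict(features):
    """True if no feature fired, i.e. the selector found nothing."""
    return not any(features.values())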
# read all tweets and labels
fp = open('sentiment.csv', 'rb')
reader = csv.reader(fp, delimiter=',', quotechar='"', escapechar='\\')
tweets = []
for row in reader:
    tweets.append([row[3], row[4]])

# treat neutral and irrelevant the same
for t in tweets:
    if t[1] == 'irrelevant':
        t[1] = 'neutral'

# split into training and test sets
random.shuffle(tweets)
fvecs = [(tweet_features.make_tweet_dict(t), s) for (t, s) in tweets]
v_train = fvecs[:2500]
v_test = fvecs[2500:]

# dump tweets for which our feature selector found nothing
# for i in range(0, len(tweets)):
#     if tweet_features.is_zero_dict(fvecs[i][0]):
#         print tweets[i][1] + ': ' + tweets[i][0]

# apply PCA reduction
# (v_train, v_test) = \
#     tweet_pca.tweet_pca_reduce(v_train, v_test, output_dim=1.0)

# train classifier
classifier = nltk.NaiveBayesClassifier.train(v_train)
# classifier = nltk.classify.maxent.train_maxent_classifier_with_gis(v_train)
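# --- hypothetical follow-up (not in the original file) ------------------------
# Once trained, the held-out split v_test can be scored with NLTK's built-in
# helpers; accuracy() and show_most_informative_features() are standard NLTK
# APIs, but this evaluation step is an assumed addition, not the author's code.
print('accuracy: %.3f' % nltk.classify.accuracy(classifier, v_test))
classifier.show_most_informative_features(20)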
# POS-tagged variant: tokenize each tweet and attach part-of-speech tags
text = nltk.word_tokenize(row[1], language='english')
text = nltk.pos_tag(text)
tweets.append([text, row[4]])  # added: POS tags are included as features
print(tweets)

# treat neutral and irrelevant the same
for t in tweets:
    if t[1] == 'irrelevant':
        t[1] = 'neutral'

# split into training and test sets
random.shuffle(tweets)
# pdb.set_trace()
fvecs = [(tweet_features.make_tweet_dict(t), s) for (t, s) in tweets]
v_train = fvecs[:10]
v_test = fvecs[:10]  # note: same slice as v_train, so this evaluates on the training data

# dump tweets for which our feature selector found nothing
# for i in range(0, len(tweets)):
#     if tweet_features.is_zero_dict(fvecs[i][0]):
#         print tweets[i][1] + ': ' + tweets[i][0]

# apply PCA reduction
(v_train, v_test) = \
    tweet_pca.tweet_pca_reduce(v_train, v_test, output_dim=1.0)
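# --- hypothetical usage example (not in the original file) --------------------
# For reference, nltk.word_tokenize() plus nltk.pos_tag() turn a raw tweet into
# (token, tag) pairs, which is the shape the POS-tagged variant above feeds to
# make_tweet_dict(). Both calls need NLTK's tokenizer and tagger data packages
# ('punkt' and 'averaged_perceptron_tagger'; exact names vary by NLTK version).
sample = "I love my new phone, the camera is great"
tokens = nltk.word_tokenize(sample, language='english')
print(nltk.pos_tag(tokens))
# e.g. [('I', 'PRP'), ('love', 'VBP'), ('my', 'PRP$'), ('new', 'JJ'), ...]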