Example #1
import csv
import random

import tweet_features  # project-local feature extraction module


def main():
    # read all tweets and labels (text in column 1, label in column 0)
    with open('face_tweets.csv', 'r', newline='') as fp:
        reader = csv.reader(fp, delimiter=',', quotechar='"', escapechar='\\')
        tweets = []
        for row in reader:
            tweets.append([row[1], row[0]])

    # treat neutral and irrelevant the same
    # for t in tweets:
    #     if t[1] == 'irrelevant':
    #         t[1] = 'neutral'

    random.shuffle(tweets)
    fvecs = [(tweet_features.make_tweet_dict(t), s) for (t, s) in tweets]

    # dump tweets for which our feature selector found nothing
    # for i in range(0, len(tweets)):
    #     if tweet_features.is_zero_dict(fvecs[i][0]):
    #         print(tweets[i][1] + ': ' + tweets[i][0])

    # apply PCA reduction
    # (v_train, v_test) = \
    #         tweet_pca.tweet_pca_reduce(v_train, v_test, output_dim=1.0)
    
    # summarise and cross-validate; both helpers are defined elsewhere
    # in the project (see the sketch below)
    basic_analyze(fvecs)

    cross_validate(fvecs, 10, 0.8)
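
Neither helper is shown on this page. A minimal sketch of what cross_validate could look like, assuming the arguments mean 10 evaluation rounds with an 80/20 train/test split and that an NLTK Naive Bayes classifier (as in Example #3) scores each round; the real helper may differ:

import random

import nltk


def cross_validate(fvecs, rounds, train_frac):
    # average accuracy over repeated random splits (assumed semantics)
    accuracies = []
    for _ in range(rounds):
        random.shuffle(fvecs)                  # fresh split each round
        cut = int(len(fvecs) * train_frac)
        v_train, v_test = fvecs[:cut], fvecs[cut:]
        classifier = nltk.NaiveBayesClassifier.train(v_train)
        accuracies.append(nltk.classify.accuracy(classifier, v_test))
    return sum(accuracies) / len(accuracies)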
Example #2
import csv
import random

import tweet_features  # project-local feature extraction module

# read all tweets and labels; file name assumed from Example #1
# (here the label sits in column 1 and the text in column 0)
fp = open('face_tweets.csv', 'r', newline='')
reader = csv.reader(fp, delimiter=',', quotechar='"', escapechar='\\')
tweets = []
for row in reader:
    tweets.append([row[1], row[0]])
    print(row[1])
fp.close()
print(tweets[0:50])

# treat neutral and irrelevant the same
for t in tweets:
    print(t[0])
    if t[0] == 'irrelevant':
        t[0] = 'neutral'


# split into training and test sets
random.shuffle(tweets)

fvecs = [(tweet_features.make_tweet_dict(s), t) for (t, s) in tweets]
v_train = fvecs[:375]
v_test  = fvecs[375:]
#print(str(v_train))
#print(str(v_test))
for i in range(0, 20):
    print(fvecs[i])

# count tweets for which our feature selector found nothing
tot = 0
for i in range(0, len(tweets)):
    if tweet_features.is_zero_dict(fvecs[i][0]):
        # print(tweets[i][1] + ': ' + tweets[i][0])
        tot += 1
print(tot)
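
is_zero_dict lives in the project's tweet_features module and is not shown on this page. Judging from the surrounding comments, its job is to flag feature dicts where nothing fired; a one-line sketch under that assumption:

def is_zero_dict(d):
    # true when every feature value is falsy (0, False, empty); assumed behaviour
    return not any(d.values())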
Example #3
import csv
import random

import nltk

import tweet_features  # project-local feature extraction module

# read all tweets and labels (text in column 3, label in column 4)
with open('sentiment.csv', 'r', newline='') as fp:
    reader = csv.reader(fp, delimiter=',', quotechar='"', escapechar='\\')
    tweets = []
    for row in reader:
        tweets.append([row[3], row[4]])

# treat neutral and irrelevant the same
for t in tweets:
    if t[1] == 'irrelevant':
        t[1] = 'neutral'

# split into training and test sets
random.shuffle(tweets)

fvecs = [(tweet_features.make_tweet_dict(t), s) for (t, s) in tweets]
v_train = fvecs[:2500]
v_test = fvecs[2500:]

# dump tweets for which our feature selector found nothing
# for i in range(0, len(tweets)):
#     if tweet_features.is_zero_dict(fvecs[i][0]):
#         print(tweets[i][1] + ': ' + tweets[i][0])

# apply PCA reduction
# (v_train, v_test) = \
#         tweet_pca.tweet_pca_reduce(v_train, v_test, output_dim=1.0)

# train a Naive Bayes classifier (a maxent alternative is left commented out)
classifier = nltk.NaiveBayesClassifier.train(v_train)
# classifier = nltk.classify.maxent.train_maxent_classifier_with_gis(v_train)
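
The snippet stops after training. The natural next step, not shown here, is to score the model on the held-out v_test slice; nltk.classify.accuracy and show_most_informative_features are the stock NLTK calls for that:

# evaluate on the held-out set and inspect the strongest features
print('accuracy:', nltk.classify.accuracy(classifier, v_test))
classifier.show_most_informative_features(10)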
Example #4
import csv
import random

import nltk

import tweet_features  # project-local feature extraction module
import tweet_pca       # project-local PCA helper

# setup assumed (the original snippet starts mid-loop): read rows from a CSV
# with the tweet text in column 1 and the label in column 4
fp = open('sentiment.csv', 'r', newline='')
reader = csv.reader(fp, delimiter=',', quotechar='"', escapechar='\\')
tweets = []
for row in reader:
    # added: POS tags are attached to the text as extra features
    text = nltk.word_tokenize(row[1], language='english')
    text = nltk.pos_tag(text)
    tweets.append([text, row[4]])
fp.close()
print(tweets)

# treat neutral and irrelevant the same
for t in tweets:
    if t[1] == 'irrelevant':
        t[1] = 'neutral'


# split into training and test sets
random.shuffle(tweets)
#pdb.set_trace()
fvecs = [(tweet_features.make_tweet_dict(t), s) for (t, s) in tweets]
v_train = fvecs[:10]
v_test = fvecs[10:]


# dump tweets for which our feature selector found nothing
# for i in range(0, len(tweets)):
#     if tweet_features.is_zero_dict(fvecs[i][0]):
#         print(tweets[i][1] + ': ' + tweets[i][0])


# apply PCA reduction
(v_train, v_test) = \
    tweet_pca.tweet_pca_reduce(v_train, v_test, output_dim=1.0)
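
tweet_pca is another project-local module whose code is not shown on this page. A rough sketch of what tweet_pca_reduce might do, assuming output_dim is the fraction of variance to keep and standing in with scikit-learn's DictVectorizer and PCA; the project's real implementation may differ:

from sklearn.decomposition import PCA
from sklearn.feature_extraction import DictVectorizer


def tweet_pca_reduce(v_train, v_test, output_dim=1.0):
    # unpack (feature_dict, label) pairs into dense matrices
    vec = DictVectorizer(sparse=False)
    X_train = vec.fit_transform([fv for fv, _ in v_train])
    X_test = vec.transform([fv for fv, _ in v_test])

    # fit PCA on the training data only; a float below 1.0 keeps that
    # fraction of the variance, 1.0 keeps every component
    pca = PCA(n_components=output_dim if output_dim < 1.0 else None)
    X_train = pca.fit_transform(X_train)
    X_test = pca.transform(X_test)

    # repackage rows as (feature_dict, label) pairs for the NLTK classifiers
    def repack(X, pairs):
        return [({'pc%d' % j: row[j] for j in range(len(row))}, label)
                for row, (_, label) in zip(X, pairs)]

    return repack(X_train, v_train), repack(X_test, v_test)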