def main():
    """Run the bag-of-words baseline and print its score on the test set.

    Loads the SemEval train/test TSV files from the ``dataset`` directory
    located next to this script, fits the vectorizer on the first 1000
    training rows, trains a classifier and prints the final test score.
    """
    # FIELDS: ID, SENTIMENT, TEXT
    script_dir = os.path.dirname(os.path.realpath(__file__))
    dataset_dir = os.path.join(script_dir, "dataset")
    train_file = os.path.join(dataset_dir, "SemevalTrainB.tsv")
    test_file = os.path.join(dataset_dir, "SemevalTestB2013.tsv")

    tweetsTrain_ALL = pandas.read_csv(
        train_file, header=0, delimiter="\t", index_col=False)
    tweetsTest_ALL = pandas.read_csv(
        test_file, header=0, delimiter="\t", index_col=False)

    # Only the first 1000 training rows are used (annotated "0459" in the
    # original; the earlier [0:500] variant was annotated "0403" --
    # presumably the scores obtained, unverified).
    tweetsTrain = tweetsTrain_ALL[0:1000]
    tweetsTest = tweetsTest_ALL

    # Bag of words: the vectorizer turns raw tweet text into feature vectors.
    vectorizer = createVectorizer()
    train_texts = tweetsTrain['TEXT'].tolist()
    featuresOfTrainData = vectorizer.fit_transform(train_texts)
    labelsOfTrainData = tweetsTrain['SENTIMENT'].tolist()

    test_texts = tweetsTest['TEXT'].tolist()
    featuresOfTestData = vectorizer.transform(test_texts)
    labelsOfTestData = tweetsTest['SENTIMENT'].tolist()

    classifier = trainClassifier(featuresOfTrainData, labelsOfTrainData)
    final_score = calculateFinalScore(
        classifier, featuresOfTestData, labelsOfTestData)
    print("BOW:Final score on test set: " + str(final_score))
    # NOTE(review): the source line ended with a dangling triple-quote
    # delimiter whose counterpart is not visible here; it is omitted.
def main():
    """Train on the first 500 training tweets and print the test-set score.

    The TSV files are read from the relative ``dataset/`` directory, so the
    result depends on the current working directory.
    """
    # FIELDS: ID, SENTIMENT, TEXT
    read_options = dict(header=0, delimiter="\t", index_col=False)
    tweetsTrain_ALL = pandas.read_csv('dataset/SemevalTrainB.tsv',
                                      **read_options)
    tweetsTest_ALL = pandas.read_csv('dataset/SemevalTestB2013.tsv',
                                     **read_options)

    # Only the first 500 training entries are used.
    tweetsTrain = tweetsTrain_ALL[0:500]
    tweetsTest = tweetsTest_ALL

    # The vectorizer creates the feature vectors from the raw tweet text.
    vectorizer = createVectorizer()
    train_texts = tweetsTrain['TEXT'].tolist()
    featuresOfTrainData = vectorizer.fit_transform(train_texts)
    labelsOfTrainData = tweetsTrain['SENTIMENT'].tolist()

    test_texts = tweetsTest['TEXT'].tolist()
    featuresOfTestData = vectorizer.transform(test_texts)
    labelsOfTestData = tweetsTest['SENTIMENT'].tolist()

    classifier = trainClassifier(featuresOfTrainData, labelsOfTrainData)
    final_score = calculateFinalScore(
        classifier, featuresOfTestData, labelsOfTestData)
    print("Final score on test set: " + str(final_score))
def evaluate_base(train_file, test_file, desc):
    """Evaluate the base vectorizer on a train/test pair of TSV files.

    Args:
        train_file: path of the tab-separated training file; it must have a
            header row with ``TEXT`` and ``SENTIMENT`` columns.
        test_file: path of the tab-separated test file, same layout.
        desc: label prefixed to the printed result line.

    Returns:
        ``[final_score, cross_validation]`` exactly as returned by
        ``evaluate()`` (both formatted as strings with five decimals).
    """
    tweetsTrain = pd.read_csv(train_file, header=0, delimiter="\t",
                              index_col=False)
    tweetsTest = pd.read_csv(test_file, header=0, delimiter="\t",
                             index_col=False)
    vectorizer = createVectorizer("")
    # The fit / transform / train / score / print sequence used to be copied
    # here verbatim from evaluate(); delegating keeps a single implementation.
    return evaluate(vectorizer, tweetsTrain, tweetsTest, desc)
def evaluate(vectorizer, tweetsTrain, tweetsTest, typ):
    """Fit, train and score one vectorizer; print and return both scores.

    Args:
        vectorizer: object providing ``fit_transform`` and ``transform``.
        tweetsTrain: table with ``TEXT`` and ``SENTIMENT`` columns used for
            fitting the vectorizer and training the classifier.
        tweetsTest: table with the same columns used for the final score.
        typ: label prefixed to the printed result line.

    Returns:
        ``[final_score, cross_validation]``, both strings formatted with
        five decimals.
    """
    train_texts = tweetsTrain['TEXT'].tolist()
    # NOTE(review): "bow" is handed to fit_transform as its second positional
    # argument; what it means depends on the vectorizer implementation, which
    # is not visible here -- confirm it is intended.
    train_features = vectorizer.fit_transform(train_texts, "bow")
    train_labels = tweetsTrain['SENTIMENT'].tolist()

    test_texts = tweetsTest['TEXT'].tolist()
    test_features = vectorizer.transform(test_texts)
    test_labels = tweetsTest['SENTIMENT'].tolist()

    classifier = trainClassifier(train_features, train_labels)
    cross_validation = '%.5f' % getCrossValidationScore(classifier)
    final_score = '%.5f' % calculateFinalScore(
        classifier, test_features, test_labels)

    # Space-joined to reproduce the output of the comma-separated print.
    report = " ".join([
        typ + ":Final score: " + final_score,
        " CrossValidationScore:",
        cross_validation,
    ])
    print(report)
    return [final_score, cross_validation]
def evaluate(vectorizer, tweetsTrain, tweetsTest, typ):
    """Score a vectorizer on the given train/test tweets.

    The vectorizer is fitted on the training ``TEXT`` column, a classifier
    is trained on the resulting features and the ``SENTIMENT`` labels, and
    both the cross-validation score and the final test score are printed
    (prefixed with ``typ``) and returned.

    Returns:
        A two-element list ``[final_score, cross_validation]`` of strings,
        each formatted with ``'%.5f'``.
    """
    # NOTE(review): the extra "bow" positional argument is passed through
    # unchanged; its effect depends on the vectorizer -- confirm.
    featuresOfTrainData = vectorizer.fit_transform(
        tweetsTrain['TEXT'].tolist(), "bow")
    labelsOfTrainData = tweetsTrain['SENTIMENT'].tolist()
    featuresOfTestData = vectorizer.transform(tweetsTest['TEXT'].tolist())
    labelsOfTestData = tweetsTest['SENTIMENT'].tolist()

    model = trainClassifier(featuresOfTrainData, labelsOfTrainData)
    cv_text = '%.5f' % getCrossValidationScore(model)
    score_text = '%.5f' % calculateFinalScore(
        model, featuresOfTestData, labelsOfTestData)

    # Pieces are joined with single spaces, matching what a comma-separated
    # print statement would emit.
    pieces = [typ + ":Final score: " + score_text,
              " CrossValidationScore:",
              cv_text]
    print(" ".join(pieces))
    return [score_text, cv_text]
def main():
    """Fit on the first 500 training tweets and report the test-set score.

    Input files are read from the relative ``dataset/`` directory (resolved
    against the current working directory).
    """
    # FIELDS: ID, SENTIMENT, TEXT
    tweetsTrain_ALL = pandas.read_csv(
        'dataset/SemevalTrainB.tsv',
        header=0, delimiter="\t", index_col=False)
    tweetsTest_ALL = pandas.read_csv(
        'dataset/SemevalTestB2013.tsv',
        header=0, delimiter="\t", index_col=False)

    # Only the first 500 training entries are used.
    tweetsTrain = tweetsTrain_ALL[0:500]
    tweetsTest = tweetsTest_ALL

    # The vectorizer is used to create the feature vectors.
    vectorizer = createVectorizer()
    featuresOfTrainData = vectorizer.fit_transform(
        tweetsTrain['TEXT'].tolist())
    labelsOfTrainData = tweetsTrain['SENTIMENT'].tolist()
    featuresOfTestData = vectorizer.transform(tweetsTest['TEXT'].tolist())
    labelsOfTestData = tweetsTest['SENTIMENT'].tolist()

    classifier = trainClassifier(featuresOfTrainData, labelsOfTrainData)
    score = calculateFinalScore(
        classifier, featuresOfTestData, labelsOfTestData)
    print("Final score on test set: " + str(score))
def evaluate_base(train_file, test_file, desc):
    """Load a train/test TSV pair and evaluate the base vectorizer on it.

    Args:
        train_file: path of the tab-separated training file (header row,
            ``TEXT`` and ``SENTIMENT`` columns).
        test_file: path of the tab-separated test file, same layout.
        desc: label prefixed to the printed result line.

    Returns:
        The ``[final_score, cross_validation]`` list produced by
        ``evaluate()``; both entries are strings with five decimals.
    """
    tweetsTrain = pd.read_csv(train_file, header=0, delimiter="\t",
                              index_col=False)
    tweetsTest = pd.read_csv(test_file, header=0, delimiter="\t",
                             index_col=False)
    # Everything after loading was a verbatim copy of evaluate(); call it
    # instead so both entry points share one scoring/printing path.
    return evaluate(createVectorizer(""), tweetsTrain, tweetsTest, desc)
def main():
    """Bag-of-words baseline: train on 1000 tweets, print the test score.

    The SemEval TSV files are looked up in the ``dataset`` directory that
    sits beside this script, independent of the working directory.
    """
    # FIELDS: ID, SENTIMENT, TEXT
    here = os.path.dirname(os.path.realpath(__file__))
    train_file = os.path.join(here, "dataset", "SemevalTrainB.tsv")
    test_file = os.path.join(here, "dataset", "SemevalTestB2013.tsv")

    csv_options = dict(header=0, delimiter="\t", index_col=False)
    tweetsTrain_ALL = pandas.read_csv(train_file, **csv_options)
    tweetsTest_ALL = pandas.read_csv(test_file, **csv_options)

    # First 1000 training rows (original annotation "0459"; the earlier
    # [0:500] variant carried "0403" -- presumably scores, unverified).
    tweetsTrain = tweetsTrain_ALL[0:1000]
    tweetsTest = tweetsTest_ALL

    # Bag of words: the vectorizer creates the feature vectors.
    vectorizer = createVectorizer()
    featuresOfTrainData = vectorizer.fit_transform(
        tweetsTrain['TEXT'].tolist())
    labelsOfTrainData = tweetsTrain['SENTIMENT'].tolist()
    featuresOfTestData = vectorizer.transform(tweetsTest['TEXT'].tolist())
    labelsOfTestData = tweetsTest['SENTIMENT'].tolist()

    classifier = trainClassifier(featuresOfTrainData, labelsOfTrainData)
    score = calculateFinalScore(
        classifier, featuresOfTestData, labelsOfTestData)
    print("BOW:Final score on test set: " + str(score))
    # NOTE(review): the source line ended with a dangling triple-quote
    # delimiter whose counterpart is not visible here; it is omitted.
def main():
    """Evaluate feature sets for tweet sentiment and plot a learning curve.

    Steps, in order:
      1. Load the SemEval train/test TSV files (fields: ID, SENTIMENT, TEXT)
         from the ``dataset`` directory next to this script.
      2. Evaluate every combination of three or more feature names and
         print the best scores seen ("CHAMPS").
      3. Evaluate each feature on its own, then the grouped vectorizers.
      4. Train on train+test together as a deliberate overfitting check.
      5. Train on growing prefixes of the training set and plot the final
         score against the training-set size.
    """
    # Feature ideas noted by the author: length, final character,
    # punctuation mark, caps chars / length, feat1 count of positive words.
    base_dir = os.path.dirname(os.path.realpath(__file__))
    train_file = os.path.join(base_dir, "dataset", "SemevalTrainB.tsv")
    test_file = os.path.join(base_dir, "dataset", "SemevalTestB2013.tsv")
    tweetsTrain_ALL = pandas.read_csv(train_file, header=0, delimiter="\t",
                                      index_col=False)
    tweetsTest_ALL = pandas.read_csv(test_file, header=0, delimiter="\t",
                                     index_col=False)

    # The whole training set is used here. Earlier subset runs were
    # annotated "0403" ([0:500]) and "0459" in the original comments
    # (presumably scores, unverified).
    tweetsTrain = tweetsTrain_ALL
    tweetsTest = tweetsTest_ALL

    # ---- all combinations of at least three features ----
    stuff = ["bow", "hashtag", "smile", "feat1", "feat2", "feat3",
             "capslock", "Punctuationmark"]
    max_fs = 0
    max_cv = 0
    max_r = []
    # Subsets with fewer than three features were generated and then
    # discarded; starting at size 3 visits exactly the same subsets.
    for size in range(3, len(stuff) + 1):
        for subset in itertools.combinations(stuff, size):
            vectorizer = createVectorizer_dynamic(subset)
            w = "".join("-" + s for s in subset)
            r = evaluate(vectorizer, tweetsTrain, tweetsTest, w)
            # evaluate() returns formatted strings. Compare them as numbers:
            # str-vs-int comparison is always true on Python 2 and raises
            # TypeError on Python 3.
            improved_fs = float(r[0]) > float(max_fs)
            improved_cv = float(r[1]) > float(max_cv)
            if improved_fs or improved_cv:
                print("*" * 33)
                print(subset)
                print(type(subset))
                print(len(subset))
                print(r)
                # Update each maximum only when it actually improved, so a
                # recorded best value can never decrease.
                if improved_fs:
                    max_fs = r[0]
                if improved_cv:
                    max_cv = r[1]
                max_r = r
    print(" ".join(["CHAMPS", str(max_r), str(max_fs), str(max_cv)]))

    # ---- single features ----
    print("Single features evaluation.......")
    single_features = [
        ("bow", "BAGWOR"),
        ("hashtag", "HASHTA"),
        ("smile", "SMILE "),
        ("feat1", "SCORE "),
        ("feat2", "COUNT+"),
        ("feat3", "LENGTH"),
        ("capslock", "CAPSLK"),
        ("Punctuationmark", "PUMARK"),
    ]
    for feature, label in single_features:
        vectorizer = createVectorizer(feature)
        evaluate(vectorizer, tweetsTrain, tweetsTest, label)

    # ---- grouped features ----
    print("Grouped features evaluation.......")
    # "" was commented as "all" in the original and "all" as
    # "all (feat1,feat2,feat3)"; the exact feature sets are defined by
    # createVectorizer, which is not visible here.
    for feature, label in (("", "BASE_3"), ("all", "ALL ")):
        vectorizer = createVectorizer(feature)
        evaluate(vectorizer, tweetsTrain, tweetsTest, label)

    # Results recorded by the author on all training data:
    #   Single features evaluation.......
    #   BAGWOR:Final score: 0.57997 CrossValidationScore: 0.56994
    #   HASHTA:Final score: 0.04242 CrossValidationScore: 0.03448
    #   SMILE :Final score: 0.08459 CrossValidationScore: 0.10531
    #   SCORE :Final score: 0.44349 CrossValidationScore: 0.42490
    #   COUNT+:Final score: 0.42304 CrossValidationScore: 0.41968
    #   LENGTH:Final score: 0.23442 CrossValidationScore: 0.23512
    #   CAPSLK:Final score: 0.27601 CrossValidationScore: 0.27802
    #   PUMARK:Final score: 0.11608 CrossValidationScore: 0.13662
    #   Grouped features evaluation.......
    #   BASE_3:Final score: 0.58561 CrossValidationScore: 0.58662
    #   ALL :Final score: 0.60424 CrossValidationScore: 0.60120

    # ---- "smart aleck" run: train on train + test together ----
    # The test rows are part of the training data, so this score is
    # inflated by construction (original annotation: 0.934980197195).
    all_tweets = pandas.concat([tweetsTrain_ALL, tweetsTest])
    tweetsTrain = all_tweets
    tweetsTest = tweetsTest_ALL
    vectorizer = createVectorizer("")
    evaluate(vectorizer, tweetsTrain, tweetsTest,
             "OVERFITTING:schlaumeier train all")

    # ---- learning curve: final score vs. training-set size ----
    x = [100, 200, 500, 1000, 2000, 4000, 8000]
    y = []
    for dim_train in x:
        tweetsTrain = tweetsTrain_ALL[0:dim_train]
        tweetsTest = tweetsTest_ALL
        vectorizer = createVectorizer("")
        featuresOfTrainData = vectorizer.fit_transform(
            tweetsTrain['TEXT'].tolist())
        labelsOfTrainData = tweetsTrain['SENTIMENT'].tolist()
        featuresOfTestData = vectorizer.transform(
            tweetsTest['TEXT'].tolist())
        labelsOfTestData = tweetsTest['SENTIMENT'].tolist()
        classifier = trainClassifier(featuresOfTrainData, labelsOfTrainData)
        fs = calculateFinalScore(classifier, featuresOfTestData,
                                 labelsOfTestData)
        print("Final score on test set (dim_train=" + str(dim_train)
              + "):" + str(fs))
        y.append(fs)

    # The figure
    fig = plt.figure()
    ax = fig.add_subplot(1, 1, 1)
    ax.set_title('training data size VS final score')
    ax.set_ylabel('final score')
    ax.set_xlabel('training data size')
    # Scatter points plus a connecting line
    plt.scatter(x, y)
    plt.plot(x, y)
    # Show
    plt.show()
def main():
    """Run the full feature study: combinations, singles, groups, plot.

    Loads the SemEval train/test TSV files (fields: ID, SENTIMENT, TEXT)
    from the ``dataset`` directory beside this script, then:

      * evaluates every combination of three or more feature names and
        prints the best scores seen ("CHAMPS");
      * evaluates each feature alone and the grouped vectorizers;
      * trains on train+test together as a deliberate overfitting check;
      * plots the final score against the number of training rows.
    """
    # Feature ideas noted by the author: length, final character,
    # punctuation mark, caps chars / length, feat1 count of positive words.
    base_dir = os.path.dirname(os.path.realpath(__file__))
    dataset_dir = os.path.join(base_dir, "dataset")
    train_file = os.path.join(dataset_dir, "SemevalTrainB.tsv")
    test_file = os.path.join(dataset_dir, "SemevalTestB2013.tsv")
    tweetsTrain_ALL = pandas.read_csv(train_file, header=0, delimiter="\t",
                                      index_col=False)
    tweetsTest_ALL = pandas.read_csv(test_file, header=0, delimiter="\t",
                                     index_col=False)

    # All training rows are used. The original comments annotate earlier
    # subset runs with "0403" ([0:500]) and "0459" (presumably scores,
    # unverified).
    tweetsTrain = tweetsTrain_ALL
    tweetsTest = tweetsTest_ALL

    # ---- all combinations of at least three features ----
    stuff = [
        "bow", "hashtag", "smile", "feat1", "feat2", "feat3", "capslock",
        "Punctuationmark"
    ]
    max_fs = 0
    max_cv = 0
    max_r = []
    # Only subsets with more than two features were ever evaluated, so the
    # smaller sizes are skipped up front.
    for size in range(3, len(stuff) + 1):
        for subset in itertools.combinations(stuff, size):
            vectorizer = createVectorizer_dynamic(subset)
            w = "".join("-" + s for s in subset)
            r = evaluate(vectorizer, tweetsTrain, tweetsTest, w)
            # Scores come back as formatted strings; convert before
            # comparing. Comparing a str with the initial int 0 is always
            # true on Python 2 and a TypeError on Python 3.
            improved_fs = float(r[0]) > float(max_fs)
            improved_cv = float(r[1]) > float(max_cv)
            if improved_fs or improved_cv:
                print("*" * 33)
                print(subset)
                print(type(subset))
                print(len(subset))
                print(r)
                # Each maximum is raised independently so that improving
                # one metric cannot lower the recorded best of the other.
                if improved_fs:
                    max_fs = r[0]
                if improved_cv:
                    max_cv = r[1]
                max_r = r
    print(" ".join(["CHAMPS", str(max_r), str(max_fs), str(max_cv)]))

    # ---- single features ----
    print("Single features evaluation.......")
    single_features = (
        ("bow", "BAGWOR"),
        ("hashtag", "HASHTA"),
        ("smile", "SMILE "),
        ("feat1", "SCORE "),
        ("feat2", "COUNT+"),
        ("feat3", "LENGTH"),
        ("capslock", "CAPSLK"),
        ("Punctuationmark", "PUMARK"),
    )
    for feature, label in single_features:
        vectorizer = createVectorizer(feature)
        evaluate(vectorizer, tweetsTrain, tweetsTest, label)

    # ---- grouped features ----
    print("Grouped features evaluation.......")
    # Original comments: "" -> "all", "all" -> "all (feat1,feat2,feat3)".
    # What each selects is decided by createVectorizer (not visible here).
    grouped_features = (("", "BASE_3"), ("all", "ALL "))
    for feature, label in grouped_features:
        vectorizer = createVectorizer(feature)
        evaluate(vectorizer, tweetsTrain, tweetsTest, label)

    # Results recorded by the author on all training data:
    #   Single features evaluation.......
    #   BAGWOR:Final score: 0.57997 CrossValidationScore: 0.56994
    #   HASHTA:Final score: 0.04242 CrossValidationScore: 0.03448
    #   SMILE :Final score: 0.08459 CrossValidationScore: 0.10531
    #   SCORE :Final score: 0.44349 CrossValidationScore: 0.42490
    #   COUNT+:Final score: 0.42304 CrossValidationScore: 0.41968
    #   LENGTH:Final score: 0.23442 CrossValidationScore: 0.23512
    #   CAPSLK:Final score: 0.27601 CrossValidationScore: 0.27802
    #   PUMARK:Final score: 0.11608 CrossValidationScore: 0.13662
    #   Grouped features evaluation.......
    #   BASE_3:Final score: 0.58561 CrossValidationScore: 0.58662
    #   ALL :Final score: 0.60424 CrossValidationScore: 0.60120

    # ---- "smart aleck" run: train on train + test together ----
    # Test rows are included in the training data, so the score is inflated
    # by construction (original annotation: 0.934980197195).
    all_tweets = pandas.concat([tweetsTrain_ALL, tweetsTest])
    tweetsTrain = all_tweets
    tweetsTest = tweetsTest_ALL
    vectorizer = createVectorizer("")
    evaluate(vectorizer, tweetsTrain, tweetsTest,
             "OVERFITTING:schlaumeier train all")

    # ---- learning curve: final score vs. training-set size ----
    x = [100, 200, 500, 1000, 2000, 4000, 8000]
    y = []
    for dim_train in x:
        tweetsTrain = tweetsTrain_ALL[0:dim_train]
        tweetsTest = tweetsTest_ALL
        vectorizer = createVectorizer("")
        featuresOfTrainData = vectorizer.fit_transform(
            tweetsTrain['TEXT'].tolist())
        labelsOfTrainData = tweetsTrain['SENTIMENT'].tolist()
        featuresOfTestData = vectorizer.transform(
            tweetsTest['TEXT'].tolist())
        labelsOfTestData = tweetsTest['SENTIMENT'].tolist()
        classifier = trainClassifier(featuresOfTrainData, labelsOfTrainData)
        fs = calculateFinalScore(classifier, featuresOfTestData,
                                 labelsOfTestData)
        print("Final score on test set (dim_train=" + str(dim_train)
              + "):" + str(fs))
        y.append(fs)

    # The figure
    fig = plt.figure()
    ax = fig.add_subplot(1, 1, 1)
    ax.set_title('training data size VS final score')
    ax.set_ylabel('final score')
    ax.set_xlabel('training data size')
    # Scatter points plus a connecting line
    plt.scatter(x, y)
    plt.plot(x, y)
    # Show
    plt.show()