def C_comparison(length, features_train, labels_train, features_test, labels_test):
    """Plot the test-set average F1 of LogisticRegression for a grid of C values.

    One model is trained on the full training set per candidate value, scored
    on the test set with ``measures.avgF1``, and the resulting curve is shown
    in a blocking matplotlib window.

    Args:
        length: Unused; kept so existing callers keep working.
        features_train: Training feature matrix.
        labels_train: Training labels.
        features_test: Test feature matrix.
        labels_test: Test labels.
    """
    # Candidate values handed to LogisticRegression.train as its third
    # argument (presumably the regularisation parameter -- confirm in
    # LogisticRegression).
    c_grid = [0.001, 0.05, 0.1, 0.3, 0.5, 0.8, 1, 10, 100, 350, 500, 1000,
              3500, 5000, 10000, 50000, 100000]

    def f1_for(c_value):
        # Train on the whole training set, evaluate on the held-out test set.
        fitted = LogisticRegression.train(features_train, labels_train, c_value)
        predicted = LogisticRegression.predict(features_test, fitted)
        # The trailing 0, 1 are presumably the two class labels whose F1
        # scores are averaged -- verify against measures.avgF1.
        return measures.avgF1(labels_test, predicted, 0, 1)

    f1_scores = [f1_for(c_value) for c_value in c_grid]

    plt.plot(c_grid, f1_scores, color="blue", linewidth="2.0")
    plt.xticks(c_grid)
    plt.ylabel("F1")
    plt.xlabel("C")
    plt.show()
def plot_learning_curve(features_train, labels_train, features_test, labels_test, C=1):
    """Plot training, testing and majority-baseline error against training size.

    The classifier is retrained on the first 10%, 20%, ..., 100% of the
    training set (in steps of ``len(features_train) // 10`` samples). For each
    subset the error on that subset, the error on the test set, and the error
    of ``MajorityClassifier.predictSubj`` on the test set are recorded, then
    all three curves are drawn in a blocking matplotlib window.

    Args:
        features_train: Training feature matrix; must support ``len()`` and
            slicing along its first axis.
        labels_train: Training labels, sliceable in step with the features.
        features_test: Test feature matrix.
        labels_test: Test labels.
        C: Currently unused by the LogisticRegression path; it was only
            consumed by an alternative SVM classifier
            (``SVM.train(f, l, c=C, k="linear")``) that is not active.
    """
    n_steps = 10
    # Floor division: the step is used as a slice bound and must stay an int
    # even under true division (Python 3 / ``from __future__ import division``).
    step = len(features_train) // n_steps

    train = []
    test = []
    maj_clas = []

    for i in range(n_steps):
        # Progress indicator; print(i) emits the same text on Python 2 and 3.
        print(i)

        # First (i + 1) * 10 percent of the training set.
        end = (i + 1) * step
        f = features_train[0:end]
        l = labels_train[0:end]

        # Train the classifier on this subset only.
        model = LogisticRegression.train(f, l)

        # Training error: evaluated on the same subset the model was fit on.
        prediction = LogisticRegression.predict(f, model)
        train.append(measures.error(l, prediction))

        # Testing error on the full test set.
        prediction = LogisticRegression.predict(features_test, model)
        test.append(measures.error(labels_test, prediction))

        # Baseline error of the majority classifier on the test set.
        # NOTE(review): this does not depend on the training subset; it could
        # be computed once if predictSubj is deterministic -- confirm.
        prediction = MajorityClassifier.predictSubj(features_test)
        maj_clas.append(measures.error(labels_test, prediction))

    # Point i was trained on (i + 1) * 10 percent of the data, so the axis
    # runs 10..100 rather than 0..90.
    x = np.arange(1, len(train) + 1) * 10

    plt.plot(x, train, color="blue", linewidth="2.0", label="Training Error")
    plt.plot(x, test, color="blue", linestyle="dashed", linewidth="2.0", label="Testing Error")
    plt.plot(x, maj_clas, color="red", linewidth="2.0", label="Majority Classifier Error")
    plt.ylim(0, 1)
    plt.ylabel('Error')
    plt.xlabel("% of messages")
    plt.legend(loc="lower left")
    plt.show()
def plotFeaturesF1(features_train, labels_train, features_test, labels_test):
    """Plot the test-set average F1 obtained from each feature on its own.

    For every column index of ``features_train`` a LogisticRegression model is
    trained on that single column, evaluated on the matching column of
    ``features_test`` with ``measures.avgF1``, and the per-feature scores are
    drawn in a blocking matplotlib window.

    Args:
        features_train: 2-D training feature array (indexed as ``[:, i]``).
        labels_train: Training labels.
        features_test: 2-D test feature array with the same columns.
        labels_test: Test labels.
    """
    n_features = len(features_train[0])
    feature_ids = list(np.arange(n_features))

    def single_column(matrix, index):
        # Pull out one feature and restore the (n_samples, 1) 2-D shape.
        column = matrix[:, index]
        return column.reshape(column.shape[0], 1)

    f1_scores = []
    for index in range(n_features):
        train_column = single_column(features_train, index)
        test_column = single_column(features_test, index)

        fitted = LogisticRegression.train(train_column, labels_train)
        predicted = LogisticRegression.predict(test_column, fitted)
        # The trailing 0, 1 are presumably the two class labels whose F1
        # scores are averaged -- verify against measures.avgF1.
        f1_scores.append(measures.avgF1(labels_test, predicted, 0, 1))

    plt.plot(feature_ids, f1_scores, color="blue", linewidth="2.0")
    plt.ylabel("F1")
    plt.xlabel("# of Feature")
    plt.xticks(feature_ids)
    plt.show()
def plot_recall_precision(length, features_train, labels_train, features_test, labels_test):
    """Plot precision against recall for class 0 at three training-set sizes.

    A LogisticRegression model is trained on the first one, two and three
    thirds of the first ``length`` training samples. For each model the
    decision threshold is swept from 0.0 to 1.0 in steps of 0.001, and recall
    and precision for label 0 on the test set are recorded. One curve per
    model is drawn in a blocking matplotlib window.

    Args:
        length: Number of training samples to consider; a third of it is the
            subset step. Expected to be an integer.
        features_train: Training feature matrix, sliceable along axis 0.
        labels_train: Training labels, sliceable in step with the features.
        features_test: Test feature matrix.
        labels_test: Test labels.
    """
    # 1001 evenly spaced thresholds: 0.000, 0.001, ..., 1.000.
    threshold = [x / 1000.0 for x in range(0, 1001, 1)]

    # Floor division: the step is used as a slice bound and must stay an int
    # even under true division (Python 3 / ``from __future__ import division``).
    step = length // 3
    colors = ['b', 'r', 'g']

    for i in range(0, 3):
        # First (i + 1) thirds of the training data.
        end = (i + 1) * step
        f = features_train[0:end]
        l = labels_train[0:end]

        # Train the classifier on this subset only.
        model = LogisticRegression.train(f, l)

        # Recall / precision of label 0 for every threshold value.
        recall = []
        precision = []
        for t in threshold:
            prediction = LogisticRegression.predict(features_test, model, t)
            recall.append(measures.recall(labels_test, prediction, 0))
            precision.append(measures.precision(labels_test, prediction, 0))

        # Legend labels read 33%, 66%, 99% for the three subsets.
        plt.plot(recall, precision, linewidth="2.0",
                 label=str((i + 1) * 33) + "% of train data", color=colors[i])

    plt.xlim(0, 1)
    plt.ylim(0, 1)
    plt.xlabel('Recall')
    plt.ylabel('Precision')
    plt.title('Negative tweets')
    plt.legend()
    plt.show()