def featureSelection() :
    """Run 10-fold cross-validation for several SelectKBest feature counts
    (250/500/1000) and print per-fold train/test accuracy of a
    PassiveAggressiveClassifier. Prints only; returns None."""
    #load Dataset
    X_0, y, biomarkerNames = loadDataset()
    #use K-Fold
    kf = KFold(n_splits=10)
    kf.get_n_splits(X_0)
    for i in (250,500,1000):
        print("Number of Features "+str(i))
        fold=0
        for train_index, test_index in kf.split(X_0):
            print("Fold "+str(fold))
            fold=fold+1
            #declare selector keeping the k=i best features by ANOVA F-score
            selector=SelectKBest(f_classif, k=i)
            #Normalize Data — scaler is fit on the train fold only, so the
            #test fold does not leak into the normalisation statistics
            scaler = StandardScaler()
            X_train, X_test = X_0[train_index], X_0[test_index]
            y_train, y_test = y[train_index], y[test_index]
            X_train = scaler.fit_transform(X_train)
            X_test=scaler.transform(X_test)
            #Calculate Scores (selector is likewise fit on the train fold only)
            X_train = selector.fit_transform(X_train, y_train)
            #Get positions of Best Scores
            selected=selector.get_support(indices=True)
            X_test=selector.transform(X_test)
            ##Print ANOVA F-Values
            #print("ANOVA F-value")
            #print(selector.scores_[selected])
            ##Print P-values
            #print("p values")
            #print(selector.pvalues_[selected])
            ##Print Resulting Features
            #print("features names")
            #print(biomarkerNames[selected])
            #print("features index")
            ##Print Features Index
            #print(selected)
            #Declare Classifier
            clf = PassiveAggressiveClassifier(max_iter=1000, random_state=0,tol=1e-3)
            #Train Classifier
            clf.fit(X_train, y_train)
            #Print Accuracy
            accuracy_train=clf.score(X_train,y_train)
            accuracy_test=clf.score(X_test,y_test)
            print("Accuracy Train " + str(accuracy_train))
            print("Accuracy Test " + str(accuracy_test))
            ## create folder
            #folderName ="./results/"
            #if not os.path.exists(folderName) : os.makedirs(folderName)
            ##Print reduce Dataset
            #pd.DataFrame(X_new).to_csv(folderName+"data_"+str(0)+".csv", header=None, index =None)
            #pd.DataFrame(biomarkerNames[selected]).to_csv(folderName+"features_"+str(0)+".csv", header=None, index =None)
            #pd.DataFrame(y).to_csv(folderName+"labels.csv", header=None, index =None)
    return
def PassiveAggressive_clf(training_set_np, validation_set_np, testing_set_np, training_label, validation_label, testing_label):
    """Fit a PassiveAggressiveClassifier on the training split and print the
    percentage accuracy on the training, validation, and testing splits."""
    clf = PassiveAggressiveClassifier(max_iter=50)
    clf.fit(training_set_np, training_label)
    print("Passive Aggressive Classifier")
    # One (label, features, targets) triple per split keeps the reporting DRY.
    splits = (
        ("Training Set Accuracy : ", training_set_np, training_label),
        ("Validation Set Accuracy: ", validation_set_np, validation_label),
        ("Testing Set Accuracy : ", testing_set_np, testing_label),
    )
    for caption, features, targets in splits:
        print(caption + str(100 * clf.score(features, targets)))
    print("\n")
def model_PassiveAggressive(train_x, train_y, test_x, test_y, n_est=100):
    """Train a PassiveAggressiveClassifier and evaluate it on the test split.

    Returns a 4-tuple (test accuracy, mean absolute error of the predictions,
    predictions, fitted model). ``n_est`` is accepted for signature
    compatibility but is not used by this estimator.
    """
    estimator = PassiveAggressiveClassifier()
    estimator.fit(train_x, train_y)
    accuracy = estimator.score(test_x, test_y)
    predicted = estimator.predict(test_x)
    abs_error = mean_absolute_error(test_y, predicted)
    return (accuracy, abs_error, predicted, estimator)
def linear_models(x_train, y_train):
    """Fit five sklearn linear classifiers on (x_train, y_train), print each
    model's training accuracy, and return the five fitted models in order:
    LogisticRegression, PassiveAggressiveClassifier, RidgeClassifierCV,
    SGDClassifier, Perceptron."""
    from sklearn.linear_model import LogisticRegression
    from sklearn.linear_model import PassiveAggressiveClassifier
    from sklearn.linear_model import RidgeClassifierCV
    from sklearn.linear_model import SGDClassifier
    from sklearn.linear_model import Perceptron

    # (display name, unfitted estimator) pairs, in the order they are returned.
    specs = [
        ('LogisticRegression', LogisticRegression(C=1.2, random_state=0, max_iter=1500)),
        ('PassiveAggressiveClassifier', PassiveAggressiveClassifier()),
        ('RidgeClassifierCV', RidgeClassifierCV()),
        ('SGDClassifier', SGDClassifier()),
        ('Perceptron', Perceptron()),
    ]
    for _, estimator in specs:
        estimator.fit(x_train, y_train)
    for name, estimator in specs:
        print(name + ' training accuracy: ', estimator.score(x_train, y_train))
    return tuple(estimator for _, estimator in specs)
def training():
    """Train a Passive-Aggressive fake-news classifier on TF-IDF features,
    print its evaluation metrics, and persist the fitted model via
    makePickleFile()."""
    features, targets = get_data()
    X_train, X_test, y_train, y_test = train_test_split(features, targets, test_size=0.2, random_state=7)
    tfidf_Xtrain, tfidf_Xtest = Vectorize(X_train, X_test)

    model = PassiveAggressiveClassifier(C=0.5, random_state=5)
    model.fit(tfidf_Xtrain, y_train)

    # score() and accuracy_score() agree; both are printed as in the original.
    print(model.score(tfidf_Xtest, y_test))
    predictions = model.predict(tfidf_Xtest)
    print(accuracy_score(y_test, predictions))
    print(confusion_matrix(y_test, predictions, labels=['FAKE', 'REAL']))
    print(classification_report(y_test, predictions))
    makePickleFile(model)
def passiveAggresive(train, test, Y_train, Y_test, column):
    """Fit a class-balanced Passive-Aggressive classifier on one label column
    and return its accuracy on the test split."""
    model = PassiveAggressiveClassifier(C=.1, max_iter=1000,
                                        class_weight='balanced', tol=1e-3)
    model.fit(train, Y_train[column])
    model.predict(test)  # kept from the original; the result is discarded
    return model.score(test, Y_test[column])
def test_classifier_accuracy():
    """Training accuracy must exceed 0.79 on both dense and sparse inputs,
    with and without an intercept."""
    for data in (X, X_csr):
        for fit_intercept in (True, False):
            # `n_iter` was deprecated and then removed from scikit-learn;
            # max_iter with tol=None reproduces the fixed-epoch behaviour
            # and matches the sibling copies of this test in this file.
            clf = PassiveAggressiveClassifier(C=1.0, max_iter=30, tol=None,
                                              fit_intercept=fit_intercept,
                                              random_state=0)
            clf.fit(data, y)
            score = clf.score(data, y)
            assert_greater(score, 0.79)
def test_classifier_partial_fit():
    """30 epochs of partial_fit should reach >0.79 training accuracy on both
    dense and sparse inputs."""
    classes = np.unique(y)
    for data in (X, X_csr):
        clf = PassiveAggressiveClassifier(C=1.0, fit_intercept=True,
                                          random_state=0)
        # `xrange` is Python 2 only; `range` is the Python 3 equivalent and
        # matches the sibling copy of this test in this file.
        for t in range(30):
            clf.partial_fit(data, y, classes)
        score = clf.score(data, y)
        assert_greater(score, 0.79)
def test_classifier_partial_fit():
    """Incremental training via partial_fit must reach >0.79 training
    accuracy on dense and sparse inputs alike."""
    classes = np.unique(y)
    for data in (X, X_csr):
        clf = PassiveAggressiveClassifier(C=1.0, fit_intercept=True,
                                          random_state=0)
        for _ in range(30):
            clf.partial_fit(data, y, classes)
        assert_greater(clf.score(data, y), 0.79)
def get_baseline_pa(dataset, train_label_list, test_label_list, verbose=True):
    """Fit a Passive-Aggressive baseline on the train split of ``dataset``
    and return its accuracy on the test split. Optionally print it."""
    (X_train, Y_train), (X_test, Y_test) = dataset
    baseline = PassiveAggressiveClassifier(n_jobs=-1, fit_intercept=True)
    baseline.fit(X_train, train_label_list)
    accuracy = baseline.score(X_test, test_label_list)
    if verbose:
        print('Got baseline of %f with Passive Aggressive classifier' % accuracy)
    return accuracy
def get_baseline_pa(dataset_info, verbose=True):
    """Fit a Passive-Aggressive baseline on the dataset described by
    ``dataset_info`` and return its test accuracy. Optionally print it."""
    (X_train, Y_train), (X_test, Y_test) = dataset_info.ds.get_dataset(
        to_categorical=True, num_labels=num_labels)
    baseline = PassiveAggressiveClassifier(n_jobs=-1, fit_intercept=True)
    baseline.fit(X_train, dataset_info.ds.get_Y_train(X_train))
    # NOTE(review): get_Y_test is called with X_train while X_test is being
    # scored — this looks like a copy-paste slip; confirm against the ds API
    # before changing. Behaviour preserved here.
    accuracy = baseline.score(X_test, dataset_info.ds.get_Y_test(X_train))
    if verbose:
        print('Got baseline of %f with Passive Aggressive classifier' % accuracy)
    return accuracy
def featureSelection():
    """Standardise the full dataset, fit a Passive-Aggressive classifier on
    all of it, and print the (in-sample, hence optimistic) accuracy."""
    # load Dataset
    features, labels, biomarkerNames = loadDataset()
    # Normalize Data
    features = StandardScaler().fit_transform(features)
    # Declare, train, and score the classifier on the same data.
    model = PassiveAggressiveClassifier(max_iter=1000, random_state=0, tol=1e-3)
    model.fit(features, labels)
    print(model.score(features, labels))
    return
def test_classifier_accuracy():
    """Training accuracy must exceed 0.79 for every combination of input
    sparsity, intercept fitting, and weight averaging; with averaging on,
    the averaged/standard coefficient attributes must exist."""
    averaged_attrs = ('average_coef_', 'average_intercept_',
                      'standard_intercept_', 'standard_coef_')
    for data in (X, X_csr):
        for fit_intercept in (True, False):
            for average in (False, True):
                clf = PassiveAggressiveClassifier(
                    C=1.0, max_iter=30, fit_intercept=fit_intercept,
                    random_state=1, average=average, tol=None)
                clf.fit(data, y)
                assert clf.score(data, y) > 0.79
                if average:
                    for attr in averaged_attrs:
                        assert hasattr(clf, attr)
def test_classifier_partial_fit():
    """partial_fit over 30 epochs must reach >0.79 training accuracy for
    dense/sparse inputs with and without averaging; averaging must expose
    the averaged/standard coefficient attributes."""
    averaged_attrs = ('average_coef_', 'average_intercept_',
                      'standard_intercept_', 'standard_coef_')
    classes = np.unique(y)
    for data in (X, X_csr):
        for average in (False, True):
            clf = PassiveAggressiveClassifier(random_state=0, average=average,
                                              max_iter=5)
            for _ in range(30):
                clf.partial_fit(data, y, classes)
            assert clf.score(data, y) > 0.79
            if average:
                for attr in averaged_attrs:
                    assert hasattr(clf, attr)
def pac(x, y, x_t, y_t, y_pred):
    """Fit one PassiveAggressiveClassifier per group (48 groups), fill
    ``y_pred`` in place with each group's test predictions, and return the
    mean test accuracy over the groups that trained successfully.

    Groups that fail get a zero vector of length 17 in ``y_pred``. Returns
    0.0 if no group could be fitted (the original divided by zero here).
    """
    score = 0
    fitted = 0
    for i in range(48):
        classifier = PassiveAggressiveClassifier(max_iter=len(x[i]))
        try:
            classifier.fit(np.array(x[i]), np.array(y[i]))
            y_pred[i] = classifier.predict(x_t[i])
            score += classifier.score(x_t[i], y_t[i])
            fitted += 1
        # Narrowed from a bare `except:` (which also swallowed
        # KeyboardInterrupt/SystemExit) while keeping best-effort behaviour.
        except Exception:
            print('error in ' + str(i))
            y_pred[i] = np.zeros(17)
            continue
    return score / fitted if fitted else 0.0
def test_classifier_accuracy():
    """Training accuracy must exceed 0.79 across sparsity, intercept, and
    averaging settings; averaging must expose the averaged/standard
    coefficient attributes."""
    averaged_attrs = ('average_coef_', 'average_intercept_',
                      'standard_intercept_', 'standard_coef_')
    for data in (X, X_csr):
        for fit_intercept in (True, False):
            for average in (False, True):
                clf = PassiveAggressiveClassifier(
                    C=1.0, max_iter=30, fit_intercept=fit_intercept,
                    random_state=1, average=average, tol=None)
                clf.fit(data, y)
                assert_greater(clf.score(data, y), 0.79)
                if average:
                    for attr in averaged_attrs:
                        assert hasattr(clf, attr)
def test_classifier_partial_fit():
    """partial_fit over 30 epochs must reach >0.79 training accuracy with
    and without averaging; averaging must expose the averaged/standard
    coefficient attributes."""
    averaged_attrs = ('average_coef_', 'average_intercept_',
                      'standard_intercept_', 'standard_coef_')
    classes = np.unique(y)
    for data in (X, X_csr):
        for average in (False, True):
            clf = PassiveAggressiveClassifier(
                C=1.0, fit_intercept=True, random_state=0,
                average=average, max_iter=5)
            for _ in range(30):
                clf.partial_fit(data, y, classes)
            assert_greater(clf.score(data, y), 0.79)
            if average:
                for attr in averaged_attrs:
                    assert hasattr(clf, attr)
def test_classifier_accuracy():
    """Training accuracy must exceed 0.79 across sparsity, intercept, and
    averaging settings; averaging must expose the averaged/standard
    coefficient attributes."""
    for data in (X, X_csr):
        for fit_intercept in (True, False):
            for average in (False, True):
                # `n_iter` was deprecated and then removed from scikit-learn;
                # max_iter with tol=None reproduces the fixed-epoch behaviour
                # and matches the sibling copies of this test in this file.
                clf = PassiveAggressiveClassifier(C=1.0, max_iter=30,
                                                  tol=None,
                                                  fit_intercept=fit_intercept,
                                                  random_state=0,
                                                  average=average)
                clf.fit(data, y)
                score = clf.score(data, y)
                assert_greater(score, 0.79)
                if average:
                    assert_true(hasattr(clf, 'average_coef_'))
                    assert_true(hasattr(clf, 'average_intercept_'))
                    assert_true(hasattr(clf, 'standard_intercept_'))
                    assert_true(hasattr(clf, 'standard_coef_'))
def test_classifier_partial_fit():
    """partial_fit over 30 epochs must reach >0.79 training accuracy with
    and without averaging; averaging must expose the averaged/standard
    coefficient attributes."""
    classes = np.unique(y)
    for data in (X, X_csr):
        for average in (False, True):
            clf = PassiveAggressiveClassifier(C=1.0, fit_intercept=True,
                                              random_state=0,
                                              average=average)
            for t in range(30):
                # `classes` is mandatory on the first partial_fit call; the
                # original computed it but never passed it (every sibling
                # copy of this test in this file does).
                clf.partial_fit(data, y, classes)
            score = clf.score(data, y)
            assert_greater(score, 0.79)
            if average:
                assert_true(hasattr(clf, 'average_coef_'))
                assert_true(hasattr(clf, 'average_intercept_'))
                assert_true(hasattr(clf, 'standard_intercept_'))
                assert_true(hasattr(clf, 'standard_coef_'))
def featureSelection():
    """For k in (2, 4, 8, 16): select the k best features by ANOVA F-score,
    fit a PassiveAggressiveClassifier on the reduced data, and print k and
    the in-sample accuracy. Prints only; returns None."""
    #load Dataset
    X_0, y, biomarkerNames = loadDataset()
    for i in (2, 4, 8, 16):
        #declare selector keeping the k=i best features by ANOVA F-score
        selector = SelectKBest(f_classif, k=i)
        #Normalize Data (on the full dataset — no train/test split here)
        scaler = StandardScaler()
        X = scaler.fit_transform(X_0)
        #Calculate Scores
        X_new = selector.fit_transform(X, y)
        #Get positions of Best Scores
        selected = selector.get_support(indices=True)
        ##Print ANOVA F-Values
        #print("ANOVA F-value")
        #print(selector.scores_[selected])
        ##Print P-values
        #print("p values")
        #print(selector.pvalues_[selected])
        ##Print Resulting Features
        #print("features names")
        #print(biomarkerNames[selected])
        #print("features index")
        ##Print Features Index
        #print(selected)
        print(i)
        #Declare Classifier
        clf = PassiveAggressiveClassifier(max_iter=1000, random_state=0, tol=1e-3)
        #Train Classifier
        clf.fit(X_new, y)
        #Print Accuracy (in-sample: scored on the training data)
        print(clf.score(X_new, y))
        ## create folder
        #folderName ="./results/"
        #if not os.path.exists(folderName) : os.makedirs(folderName)
        ##Print reduce Dataset
        #pd.DataFrame(X_new).to_csv(folderName+"data_"+str(0)+".csv", header=None, index =None)
        #pd.DataFrame(biomarkerNames[selected]).to_csv(folderName+"features_"+str(0)+".csv", header=None, index =None)
        #pd.DataFrame(y).to_csv(folderName+"labels.csv", header=None, index =None)
    return
# Vectorise the text column with bag-of-words counts, train/evaluate a
# Passive-Aggressive fake-news classifier, and round-trip the fitted model
# through a pickle file.
print(df.shape)
df.head()  # no-op outside a notebook; value is discarded
labels = df.label
labels.head()  # no-op outside a notebook; value is discarded
X = df['text']
y = df['label']
cv = CountVectorizer()
X = cv.fit_transform(X)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.24, random_state=4)
pac = PassiveAggressiveClassifier(max_iter=50)
pac.fit(X_train, y_train)
pac.score(X_test, y_test)  # value is discarded; accuracy_score below is printed
y_pred = pac.predict(X_test)
score = accuracy_score(y_test, y_pred)
print(f'Accuracy: {round(score*100,2)}%')
confusion_matrix(y_test, y_pred, labels=['FAKE', 'REAL'])  # value is discarded
# NOTE(review): file handles from open() are never closed here; a `with`
# block would be safer.
pickle.dump(pac, open('model.pkl', 'wb'))
model = pickle.load(open('model.pkl', 'rb'))
print(classification_report(y_test, y_pred))
# Evaluate the previously fitted classifier `ac` (defined outside this view)
# on the TF-IDF test split, sweep max_iter, then fit a logistic baseline.
y_pred=ac.predict(tfidf_test)
from sklearn.metrics import accuracy_score, confusion_matrix
# Note the argument order: predictions first, so rows are predictions.
Confusion=confusion_matrix(y_pred,y_test)
print(Confusion)
Accuracy=accuracy_score(y_pred,y_test)
print("Accuracy :",(Accuracy*100).round(3))
# Sweep max_iter over 20..49 and report the best test accuracy reached.
ACC=[]
for i in range(20,50):
    acc=PassiveAggressiveClassifier(max_iter=i).fit(tfidf_train,y_train)
    ACC.append((acc.score(tfidf_test,y_test)*100).round(2))
print(max(ACC))
############################################################################
# Logistic-regression baseline on the same features.
from sklearn.linear_model import LogisticRegression
log=LogisticRegression().fit(tfidf_train,y_train)
L_pred=log.predict(tfidf_test)
# Fit the gradient-boosting member, combine five previously constructed
# classifiers into a soft-voting ensemble, score everything on the test set,
# and compute a VADER sentiment label per test document.
clf7.fit(X_train, y_train)
vc = VotingClassifier(estimators=[
    ('mlp', clf), ('dt', clf3), ('et', clf6), ('bag', clf5), ('grad', clf7)
], voting='soft', weights=[0.3, 0.1, 0.2, 0.1, 0.3])
vc.fit(X_train, y_train)
# Per-model test predictions (clf4–clf7 predictions are not collected here).
predicted = clf.predict(X_test)
predicted2 = clf2.predict(X_test)
predicted3 = clf3.predict(X_test)
predicted_vc = vc.predict(X_test)
# Test-set accuracy of each individual model and of the ensemble.
score1 = clf.score(X_test, y_test)
score2 = clf2.score(X_test, y_test)
score3 = clf3.score(X_test, y_test)
score4 = clf4.score(X_test, y_test)
score5 = clf5.score(X_test, y_test)
score6 = clf6.score(X_test, y_test)
score7 = clf7.score(X_test, y_test)
score_vc = vc.score(X_test, y_test)
# Map each VADER compound score in [-1, 1] onto an integer label in 0..4.
sia = SIA()
pol_scores = [0]*len(y_test)
for i in range(0,len(y_test)):
    pol_score = sia.polarity_scores(X_test1.values[i])['compound']
    pol_scores[i] = int(round(2*pol_score + 2))
print('Diff')
print('(MLP SVC DT) (POL, VC)')
# Average train/dev accuracy of the perceptron and passive-aggressive models
# over 10 refits. Converted from Python 2 print statements to the print()
# function used by the rest of this file; the printed text is unchanged.
percScoresTrain = []
percScoresDev = []
for i in range(10):
    perceptron.fit(trainX, trainY)
    percScoresDev.append(perceptron.score(devX, devY))
    percScoresTrain.append(perceptron.score(trainX, trainY))
print("Perceptron Train:", np.mean(percScoresTrain))
print("Perceptron Dev:", np.mean(percScoresDev))

passAggScoresTrain = []
passAggScoresDev = []
for i in range(10):
    passAgg.fit(trainX, trainY)
    passAggScoresDev.append(passAgg.score(devX, devY))
    passAggScoresTrain.append(passAgg.score(trainX, trainY))
print("Passive Aggressive Train:", np.mean(passAggScoresTrain))
print("Passive Aggressive Dev:", np.mean(passAggScoresDev))

passAggScoresSmallTrain = []
passAggScoresSmallDev = []
for i in range(10):
    # NOTE(review): this loop is labelled "Small Dataset" and scores
    # trainXSmall below, but it fits on the FULL training set — likely meant
    # passAgg.fit(trainXSmall, trainYSmall). Confirm before changing;
    # behaviour preserved here.
    passAgg.fit(trainX, trainY)
    passAggScoresSmallDev.append(passAgg.score(devX, devY))
    passAggScoresSmallTrain.append(passAgg.score(trainXSmall,trainYSmall))
print("Passive Aggressive (Small Dataset)) Train:", np.mean(passAggScoresSmallTrain))
# Train Passive-Aggressive and SGD baselines on precomputed features and
# persist both fitted models. Converted from Python 2 print statements to
# the print() function used by the rest of this file.
from sklearn import svm
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.linear_model import SGDClassifier, PassiveAggressiveClassifier
# NOTE(review): sklearn.externals.joblib is deprecated/removed in modern
# scikit-learn; the standalone `joblib` package is the replacement.
from sklearn.externals import joblib
import numpy as np
import pickle

#clf = svm.SVC()
clf1 = PassiveAggressiveClassifier()
clf2 = SGDClassifier()
scaler = StandardScaler()

X = np.loadtxt('features.txt')
# Hard-coded class sizes: first 4192 rows are class 0, next 3317 are class 1.
y = [0] * 4192 + [1] * 3317
#X = SelectKBest(chi2, k=10).fit_transform(X, y)
X = scaler.fit_transform(X)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

#clf.fit(X_train, y_train)
# NOTE(review): both models are fitted on ALL data, so the "test" scores
# printed below are in-sample and optimistic.
clf1.fit(X, y)
clf2.fit(X, y)
print(clf1.score(X_test, y_test))
print(clf2.score(X_test, y_test))

joblib.dump(clf1, 'passive_aggressive.pkl')
joblib.dump(clf2, 'sgd.pkl')
#joblib.dump(scaler, 'scaler.pkl')
# NOTE(review): this chunk begins mid-statement — the first two lines are the
# tail of SVM result printing whose opening is outside this view.
per(svm_bow_test) + ' testing accuracy')
print('Bigram Results: ' + per(svm_bigram_train) + ' training accuracy, ' + per(svm_bigram_test) + ' testing accuracy')

# Now lets try using passive aggressive classifier:
from sklearn.linear_model import PassiveAggressiveClassifier, PassiveAggressiveRegressor
pac = PassiveAggressiveClassifier()
pac2 = PassiveAggressiveClassifier()
par = PassiveAggressiveRegressor()
par2 = PassiveAggressiveRegressor()
# Now fit (classifier/regressor on BOW features, then on bigram features)
pac.fit(train_bow, train_ratings)
par.fit(train_bow, train_ratings)
pac2.fit(train_bigram, train_ratings)
par2.fit(train_bigram, train_ratings)
# Record and display results
pac_bow_train = pac.score(train_bow, train_ratings)
pac_bow_test = pac.score(test_bow, test_ratings)
pac_bigram_train = pac2.score(train_bigram, train_ratings)
pac_bigram_test = pac2.score(test_bigram, test_ratings)
par_bow_train = par.score(train_bow, train_ratings)
par_bow_test = par.score(test_bow, test_ratings)
par_bigram_train = par2.score(train_bigram, train_ratings)
par_bigram_test = par2.score(test_bigram, test_ratings)
# pac = par = pac2 = par2 = 1
# Free the fitted models once their scores have been recorded.
del pac, par, pac2, par2
# Results
print('Passive Aggressive Classifier')
print('BOW Results: ' + per(pac_bow_train) + ' training accuracy, ' + per(pac_bow_test) + ' testing accuracy')
print('Bigram Results: ' + per(pac_bigram_train) + ' training accuracy, ' + per(pac_bigram_test) + ' testing accuracy')
# Baseline-vs-GhCore experiment over stratified CV splits.
# NOTE(review): this chunk is truncated — the final Ghcore(...) call
# continues past the end of the visible source.
height_list = []
logging.info("GhCore on '%s' database", db_name)
logging.info("#samples = %d; #features = %d" % (X.shape[0], X.shape[1]))
logging.info("Creting train/test split...")
skf = StratifiedKFold(n_splits=N_SPLITS, shuffle=True, random_state=SEED)
for trainval_index, test_index in skf.split(X, y):
    X_trainval, X_test = X[trainval_index], X[test_index]
    y_trainval, y_test = y[trainval_index], y[test_index]
    # accuracy baseline — toggle: linear baseline vs. Keras CNN baseline
    if True:
        model = PassiveAggressiveClassifier()
        model.fit(X_trainval, y_trainval)
        base_accuracy_test = model.score(X_test, y_test)
    else:
        model, base_accuracy_test = keras_cnn_model(
            X_trainval, y_trainval, X_test, y_test)
    print("Baseline accuracy: %.4f" % (base_accuracy_test))
    base_accuracy_list.append(base_accuracy_test)
    # try with GhCore
    # NOTE(review): skf2 is created but the split below uses skf, not skf2 —
    # confirm which was intended.
    skf2 = StratifiedKFold(n_splits=3, shuffle=True, random_state=SEED)
    list_of_splits = [split for split in skf.split(X_trainval, y_trainval)]
    train_index, val_index = list_of_splits[0]
    X_train, X_val = X_trainval[train_index], X_trainval[val_index]
    y_train, y_val = y_trainval[train_index], y_trainval[val_index]
    root, _, model, accuracy_train, X_arch_core, y_arch_core, outliers, pruned_nodes = Ghcore(X_train, y_train, X_val, y_val, \
        max_height=20, min_epochs=10, \
def runLearner(printStages = True, useSelector = False, discreteHelpfulness = True, useRST = True, useFew = False):
    """Train and evaluate review-helpfulness predictors (Python 2 code).

    With useRST=True, compares several feature extractors (random baseline,
    text length, full/limited RST, RST+tfidf union, tfidf alone) on a fixed
    80/20 split. With useRST=False, streams data and trains incrementally
    via partial_fit in stages. All results are printed; nothing is returned.
    """
    # Classifier for discrete labels, regressor otherwise.
    learner = PassiveAggressiveClassifier() if discreteHelpfulness else PassiveAggressiveRegressor()
    #bestwords = getBestWords(instances,num=1000)
    tfidvec = TfidfVectorizer(sublinear_tf=True,stop_words='english', ngram_range=(1,3), decode_error='replace')
    selector = SelectKBest(chi2, k=50000) if useSelector else None
    encoder = LabelEncoder() if discreteHelpfulness else None
    if discreteHelpfulness:
        classlabels = encoder.fit_transform(labels)
    newData = False
    count = 0
    if useRST:
        print 'Getting RST data'
        nums, texts, ilabels = getPickledRSTSciKitDataLists(True) if newData else getRSTSciKitDataLists(True)
        random = RandomFeatureExtractor()
        lengthBaseline = LenFeatureExtractor()
        fullRST = FullPickledRSTFeatureExtractor(nums) if newData else FullTextRSTFeatureExtractor(nums)
        limitedRST = LimitedPickledRSTFeatureExtractor(nums) if newData else LimitedTextRSTFeatureExtractor(nums)
        vectorizer = FeatureUnion([('extra',limitedRST),('tfid',tfidvec)])
        print 'Fitting random features baseline'
        random.fit(texts)
        print 'Fitting text length baseline'
        lengthBaseline.fit(texts)
        print 'Fitting full RST features'
        fullRST.fit(texts)
        print 'Fitting limited RST features'
        limitedRST.fit(texts)
        print 'Fitting limited RST with tfidvec features'
        vectorizer.fit(texts)
        print 'Fitting tfidvec features'
        tfidvec.fit(texts)
        # 80/20 split by position.
        split = int(0.8*len(ilabels))
        trainData = (texts[:split],ilabels[:split])
        testData = (texts[split:],ilabels[split:])
        X,y = getAsSciKit(trainData[0],trainData[1],random,encoder,selector)
        learner.fit(X,y)
        # NOTE(review): the random baseline is scored on TRAIN data here
        # (trainData twice), unlike the other baselines below which score on
        # testData — confirm whether that was intended.
        X,y = getAsSciKit(trainData[0],trainData[1],random,encoder,selector)
        print 'random features baseline trained on %d instances has accuracy %f'%(len(trainData[0]),learner.score(X,y))
        dummy = DummyClassifier()
        X,y = getAsSciKit(trainData[0],trainData[1],random,encoder,selector)
        dummy.fit(X,y)
        X,y = getAsSciKit(testData[0],testData[1],random,encoder,selector)
        print 'Dummy label distribution baseline trained on %d instances has accuracy %f'%(len(trainData[0]),dummy.score(X,y))
        X,y = getAsSciKit(trainData[0],trainData[1],lengthBaseline,encoder,selector)
        learner.fit(X,y)
        X,y = getAsSciKit(testData[0],testData[1],lengthBaseline,encoder,selector)
        print 'text length baseline trained on %d instances has accuracy %f'%(len(trainData[0]),learner.score(X,y))
        X,y = getAsSciKit(trainData[0],trainData[1],fullRST,encoder,selector)
        learner.fit(X,y)
        X,y = getAsSciKit(testData[0],testData[1],fullRST,encoder,selector)
        print 'Full RST learner trained on %d instances has accuracy %f'%(len(trainData[0]),learner.score(X,y))
        X,y = getAsSciKit(trainData[0],trainData[1],limitedRST,encoder,selector)
        learner.fit(X,y)
        X,y = getAsSciKit(testData[0],testData[1],limitedRST,encoder,selector)
        print 'Limited RST learner trained on %d instances has accuracy %f'%(len(trainData[0]),learner.score(X,y))
        X,y = getAsSciKit(trainData[0],trainData[1],vectorizer,encoder,selector)
        learner.fit(X,y)
        X,y = getAsSciKit(testData[0],testData[1],vectorizer,encoder,selector)
        print 'Limited RST with ngram learner trained on %d instances has accuracy %f'%(len(trainData[0]),learner.score(X,y))
        X,y = getAsSciKit(trainData[0],trainData[1],tfidvec,encoder,selector)
        learner = learner.fit(X,y)
        X,y = getAsSciKit(testData[0],testData[1],tfidvec,encoder,selector)
        print 'ngram learner trained on %d instances has accuracy %f'%(len(trainData[0]),learner.score(X,y))
    else:
        # Streaming mode: accumulate instances until each stage threshold is
        # reached (vocabulary fit, test-set capture, then repeated training).
        vectorizer = tfidvec
        testData = None
        vocabGotten = False
        instances = ([],[])
        numVocab = 50000
        numTest = 50000
        numTrain = 100000
        maxTrainStages = 20
        for text,label in getSciKitData(stateProgress = False, discreteLabels=discreteHelpfulness):
            if label!='few' or useFew:
                instances[0].append(text)
                instances[1].append(label)
                if not vocabGotten and len(instances[0]) == numVocab:
                    if printStages:
                        print 'Fitting vocabulary with %d instances'%numVocab
                    vectorizer.fit(instances[0],None)
                    if selector is not None:
                        X,y = getSciKitInstance(instances[0],instances[1],vectorizer,encoder,None)
                        selector.fit(X,y)
                    vocabGotten = True
                    instances = ([],[])
                elif vocabGotten and testData is None and len(instances[0]) == numTest:
                    if printStages:
                        print 'Getting test data with %d instances'%numTest
                    testData = getSciKitInstance(instances[0],instances[1],vectorizer,encoder,selector)
                    instances = ([],[])
                elif vocabGotten and testData is not None and len(instances[0]) == numTrain:
                    X,y = getSciKitInstance(instances[0],instances[1],vectorizer,encoder,selector)
                    if discreteHelpfulness:
                        learner = learner.partial_fit(X,y, classes = classlabels)
                    else:
                        learner = learner.partial_fit(X,y)
                    instances = ([],[])
                    count = count + 1
                    # NOTE(review): the `elif` below attaches to
                    # `if printStages`, so the maxTrainStages break can only
                    # fire when printStages is False — confirm intent.
                    if printStages:
                        print 'Baseline trained on %d instances has accuracy %f'%(count*numTrain,learner.score(testData[0],testData[1]))
                    elif count == maxTrainStages:
                        break
        print 'Final learner trained on %d instances has accuracy %f'%(maxTrainStages*numTrain,learner.score(testData[0],testData[1]))
#y is a categorical variable so will encode it from sklearn.preprocessing import LabelEncoder le = LabelEncoder() y = le.fit_transform(y) #now splittin the model into train and test set from sklearn.model_selection import train_test_split x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2) #training the model from sklearn.linear_model import PassiveAggressiveClassifier model = PassiveAggressiveClassifier() model.fit(x_train, y_train) #predicting the values y_pred = model.predict(x_test) #score of the model model.score(x_test, y_test) from sklearn.metrics import confusion_matrix cm = confusion_matrix(y_test, y_pred) print(f"Classification Report : \n\n{classification_report(y_test, y_pred)}") '''Classification Report : precision recall f1-score support 0 0.98 0.99 0.99 965 1 0.96 0.88 0.92 150 accuracy 0.98 1115 macro avg 0.97 0.94 0.95 1115 weighted avg 0.98 0.98 0.98 1115 '''
# Create the model pac = PassiveAggressiveClassifier(C=0.05, loss='squared_hinge', max_iter=2000, random_state=1000) # Train with the start-up samples nb_initial_samples = int(X_train.shape[0] / 1.5) pac.fit(X_train[0:nb_initial_samples], Y_train[0:nb_initial_samples]) # Continue with the incremental samples validation_accuracies = [] for (x, y) in zip(X_train[nb_initial_samples:], Y_train[nb_initial_samples:]): pac.partial_fit(x.reshape(1, -1), y.ravel(), classes=np.unique(iris['target'])) validation_accuracies.append(pac.score(X_test, Y_test)) # Show the validation plot fig, ax = plt.subplots(figsize=(18, 8)) ax.plot(validation_accuracies) ax.set_xlabel('Online sample') ax.set_ylabel('Validation accuracy') ax.grid() plt.show()
def main():
    """Stream Reddit comment dumps chunk-by-chunk, hash-vectorise the comment
    bodies, and train a PassiveAggressiveClassifier online to predict
    bucketed comment scores. The first chunk is held out as the test set;
    each later chunk is fed to partial_fit. Stage timings go through log()."""
    # Vectorizer with 2^19 hashing buckets.
    chunkSize = 300000
    n_buckets = 2**19
    vectorizer = HashingVectorizer(decode_error='ignore', n_features=n_buckets, non_negative=True)
    classifier = PassiveAggressiveClassifier()
    #JSONGenerator = readChunk("data/dataSampleFile",chunkSize)
    #JSONGenerator = readChunk("data/RC_2007-10",chunkSize)
    #JSONGenerator = readChunk("data/RC_2008-01",chunkSize)
    JSONGenerator = readChunk("data/RC_2008-12", chunkSize)
    #JSONGenerator = readChunk("data/RC_2009-12",chunkSize)
    #JSONGenerator = readChunk("data/RC_2012-01",chunkSize)
    # The first chunk becomes the fixed test set.
    JSONArrayTestSet = next(JSONGenerator)
    X_test_text = []
    Y_test = []
    for JSONString in JSONArrayTestSet:
        JSONObject = json.loads(JSONString)
        # Don't care about deleted content.
        if JSONObject["body"] == "[deleted]":
            continue
        X_test_text.append(JSONObject["body"])
        Y_test.append(rangifyScore(int(JSONObject["score"])))
    X_test = vectorizer.transform(X_test_text)
    log("Start till MainLoop timer: " + str(time.time() - startTick))
    generatorTimeTick = time.time()
    # For loop for generators. Smart!
    for i, JSONArray in enumerate(JSONGenerator):
        log("readChunkTimer: " + str(time.time() - generatorTimeTick))
        X_train_text = []
        Y_train = []
        extractFeatureTimeTick = time.time()
        for JSONString in JSONArray:
            JSONObject = json.loads(JSONString)
            # Don't care about deleted content.
            if JSONObject["body"] == "[deleted]":
                continue
            X_train_text.append(JSONObject["body"])
            Y_train.append(rangifyScore(int(JSONObject["score"])))
        log("Feature Extract timer: " + str(time.time() - extractFeatureTimeTick))
        tick = time.time()
        X_train = vectorizer.transform(X_train_text)
        log("Vectorize timer:" + str(time.time() - tick))
        tick = time.time()
        # 41 score buckets; classes must be given because a single chunk may
        # not contain every label.
        classifier.partial_fit(X_train, Y_train, classes=[i for i in range(41)])
        log("Partial fit timer:" + str(time.time() - tick))
        generatorTimeTick = time.time()
    log("Total Time: " + str(time.time() - startTick))
    print(classifier.score(X_test, Y_test))
# Tail of a previous plot, then fit a Passive-Aggressive classifier and begin
# its ROC curve. NOTE(review): this chunk is truncated — the final plt.plot
# call continues past the end of the visible source.
plt.legend(loc="lower right")
plt.show()
#Passive Aggressive Classifier Algorithm
from sklearn.linear_model import PassiveAggressiveClassifier
PC = PassiveAggressiveClassifier()
PC = PC.fit(X_train, y_train)
PC  # bare expression: only echoes the estimator in a notebook/REPL
#accuracy of Passive Aggressive Classifier Algorithm
y_pred1 = PC.predict(X_test)
print('Accuracy score= {:.2f}'.format(PC.score(X_test, y_test)))
#ROC curve of Passive Aggressive Classifier Algorithm
# NOTE(review): the ROC is built from hard 0/1 predictions rather than a
# continuous decision_function() score, which yields a degenerate curve.
from sklearn.metrics import roc_curve, auc
import matplotlib.pyplot as plt
fpr, tpr, thresholds = roc_curve(y_test, y_pred1)
roc_auc = auc(fpr, tpr)
plt.figure()
plt.plot(fpr, tpr, color='darkorange',
from sklearn.linear_model import PassiveAggressiveClassifier
# NOTE(review): n_iter was deprecated and later removed from scikit-learn;
# modern versions use max_iter instead.
P_estimator = PassiveAggressiveClassifier(C=1.0, fit_intercept=True, shuffle=True, verbose=0, loss='hinge', n_jobs=1, random_state=None, warm_start=False, class_weight=None, n_iter=5)
P_estimator.fit(X_train, y_train)
P_estimator.predict(X_test)  # predictions are computed but discarded
print("Accuracy:{}".format(P_estimator.score(X_test, y_test)))

# # Comparing Different Classifiers

# In[136]:

import numpy as np
import matplotlib.pyplot as plt
from sklearn import datasets
from sklearn.model_selection import train_test_split
from sklearn.linear_model import SGDClassifier, Perceptron
from sklearn.linear_model import PassiveAggressiveClassifier
from sklearn.linear_model import LogisticRegression

# Fractions of the data to hold out in the comparison that follows.
heldout = [0.95, 0.90, 0.75, 0.50, 0.01]
# Inspect the vectorised training data, then compare three linear models on
# the train/dev splits.
print("Shape of Train X {}\n".format(trainX.shape))
print("Sample of the vocab:\n {}".format(
    np.random.choice(countVec.get_feature_names(), 20)))

#%% PICK A MODEL AND EXPERIMENT
lr = LogisticRegression()
passAgg = PassiveAggressiveClassifier()
perceptron = Perceptron()

lr.fit(trainX, trainY)
print("Logistic Regression Train:", lr.score(trainX, trainY))
print("Logistic Regression Dev:", lr.score(devX, devY))
print("--")
passAgg.fit(trainX, trainY)
print("Passive Aggressive Train:", passAgg.score(trainX, trainY))
print("Passive Aggressive Dev:", passAgg.score(devX, devY))
print("--")
perceptron.fit(trainX, trainY)
print("Perceptron Train:", perceptron.score(trainX, trainY))
print("Perceptron Dev:", perceptron.score(devX, devY))
print("--")

#%% ANALYSIS AND DEBUGGING
# Refit the logistic baseline for the analysis cell below.
lr = LogisticRegression()
lr.fit(trainX, trainY)
print("Logistic Regression Train:", lr.score(trainX, trainY))
print("Logistic Regression Dev:", lr.score(devX, devY))
print("--")
#以下使用CLASSIFICATION,Passive Aggressive Classifier from sklearn.linear_model import PassiveAggressiveClassifier classifier = PassiveAggressiveClassifier(random_state=None) #classifier.fit(data,t) # training on the iris dataset #print(classifier.predict(data[0])) #print(t[0]) #from sklearn import cross_validation #train, test, t_train, t_test = cross_validation.train_test_split(X, y, test_size=0.4, random_state=0) from sklearn.model_selection import train_test_split train, test, t_train, t_test = train_test_split(X, y, test_size=0.4, random_state=0) classifier.fit(train,t_train) # train #print(classifier.get_params()) print(classifier.score(test,t_test)) # test print() #confusion matrix from sklearn.metrics import confusion_matrix print(confusion_matrix(classifier.predict(test),t_test)) print() #計算f1-score from sklearn.metrics import classification_report print(classification_report(classifier.predict(test), t_test, target_names=['setosa', 'versicolor', 'virginica'])) print() #----------------------------------------------------- #from sklearn.cross_validation import cross_val_score