def test_classifier_refit():
    # Classifier can be retrained on different labels and features.
    clf = PassiveAggressiveClassifier(max_iter=5).fit(X, y)
    assert_array_equal(clf.classes_, np.unique(y))

    # Refit with one fewer feature and string labels: classes_ must be
    # replaced by the new label set.
    clf.fit(X[:, :-1], iris.target_names[y])
    assert_array_equal(clf.classes_, iris.target_names)
def constructPickles(filename):
    """Train a text classifier from *filename* and persist it with joblib.

    Reads a CSV with 'text' and 'label' columns, trains a TF-IDF +
    PassiveAggressiveClassifier pipeline, prints test-set accuracy, and
    dumps the classifier ("testPickle") and vectorizer ("testPickleVector").
    """
    dataDF = pd.read_csv(filename)
    labels = dataDF.label

    # DataFlair - Split the dataset
    x_train, x_test, y_train, y_test = train_test_split(
        dataDF['text'], labels, test_size=0.2, random_state=7)

    # DataFlair - Initialize a TfidfVectorizer; fit on training text only.
    tfidf_vectorizer = TfidfVectorizer(stop_words='english', max_df=0.7)
    tfidf_train = tfidf_vectorizer.fit_transform(x_train)
    tfidf_test = tfidf_vectorizer.transform(x_test)

    # DataFlair - Initialize and train a PassiveAggressiveClassifier
    pac = PassiveAggressiveClassifier(max_iter=50)
    pac.fit(tfidf_train, y_train)

    # BUG FIX: the original reloaded stale pickles here, silently discarding
    # the model and vectorizer it had just trained (and crashing if the
    # pickle files did not exist yet). Evaluate the freshly trained model.
    y_pred = pac.predict(tfidf_test)
    score = accuracy_score(y_test, y_pred)
    print(f'Accuracy: {round(score * 100, 2)}%')

    # Persist the trained artifacts for later reuse.
    joblib.dump(pac, "testPickle")
    joblib.dump(tfidf_vectorizer, "testPickleVector")
def train_and_predict_m7 (train, test, labels) :
    """Stem + TF-IDF the corpora, then predict test labels with a
    Passive-Aggressive classifier (optionally grid-searched)."""
    ## Apply basic concatenation + stemming
    trainData, testData = stemmer_clean (train, test, stemmerEnableM7, stemmer_type = 'snowball')

    ## TF-IDF transform with sub-linear TF and stop-word removal
    tfv = TfidfVectorizer(min_df = 5, max_features = None, strip_accents = 'unicode', analyzer = 'word', token_pattern = r'\w{1,}', ngram_range = (1, 5), smooth_idf = 1, sublinear_tf = 1, stop_words = ML_STOP_WORDS)
    tfv.fit(trainData)
    X = tfv.transform(trainData)
    X_test = tfv.transform(testData)

    ## Create the classifier
    # NOTE(review): 'n_iter' is the pre-0.19 sklearn parameter name
    # ('max_iter' in newer releases); confirm the pinned sklearn version.
    print ("Fitting Passive-Aggressive Classifer...")
    clf = PassiveAggressiveClassifier(random_state = randomState, loss = 'squared_hinge', n_iter = 100, C = 0.01)

    ## Create a parameter grid to search for best parameters for everything in the pipeline
    # Note: minkowski with p > 2 does not work for sparse matrices
    param_grid = {'C' : [0.003, 0.01, 0.03, 0.1], 'loss': ['hinge', 'squared_hinge'], 'n_iter': [5, 10, 30, 100, 300]}
    #param_grid = {'C' : [0.003, 0.01, 0.03, 0.1, 0.3, 1], 'loss': ['hinge'], 'n_iter': [5, 10, 30, 100, 300, 1000]}

    ## Predict model with best parameters optimized for quadratic_weighted_kappa
    if (gridSearch) :
        model = perform_grid_search (clf, param_grid, X, labels)
        pred = model.predict(X_test)
    else :
        clf.fit(X, labels)
        pred = clf.predict(X_test)
    return pred
def train(tfidf_train, y_train, tfidf_test):
    """Fit a Passive-Aggressive classifier on the TF-IDF training matrix
    and return its predictions for the TF-IDF test matrix."""
    classifier = PassiveAggressiveClassifier(max_iter=50)
    classifier.fit(tfidf_train, y_train)
    return classifier.predict(tfidf_test)
def training():
    """Train a Passive-Aggressive fake-news classifier and pickle it.

    Prints test accuracy (via score() and accuracy_score()), the
    FAKE/REAL confusion matrix, and a classification report.
    """
    X, y = get_data()
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=7)
    tfidf_Xtrain, tfidf_Xtest = Vectorize(X_train, X_test)

    model = PassiveAggressiveClassifier(C=0.5, random_state=5)
    model.fit(tfidf_Xtrain, y_train)

    # Two equivalent accuracy readouts, as in the original flow.
    print(model.score(tfidf_Xtest, y_test))
    predictions = model.predict(tfidf_Xtest)
    print(accuracy_score(y_test, predictions))

    print(confusion_matrix(y_test, predictions, labels=['FAKE', 'REAL']))
    print(classification_report(y_test, predictions))

    makePickleFile(model)
def train_models(train_X, train_y, test_X=None, test_y=None, model_prefix='./model/temp'):
    """Train NB, SVM and PA text classifiers, optionally evaluate them on a
    test set, and save all three with joblib (Python 2)."""
    # Train {{{.
    logging.info('Training NB model ...')
    nb_model = MultinomialNB(alpha=0.01)
    nb_model.fit(train_X, train_y)
    logging.info('Training SVM model ...')
    svm_model = LinearSVC(random_state=1)
    svm_model.fit(train_X, train_y)
    logging.info('Training PA model ...')
    pa_model = PassiveAggressiveClassifier()
    pa_model.fit(train_X, train_y)
    # }}}.
    # Test {{{.
    if test_X is not None and test_y is not None:
        logging.info('Evaluating on test set ...')
        # Strip fastText-style '__label__' prefixes before comparison.
        test_y = [l.replace('__label__', '') for l in test_y]
        for model, desp in zip([nb_model, pa_model, svm_model],
                               ['NB_Report', 'PA_Report', 'SVM_report']):
            print >>sys.stderr, (
                '================== %s ==================\n' % desp)
            pred_y = model.predict(test_X)
            pred_y = [l.replace('__label__', '') for l in pred_y]
            print >>sys.stderr, classification_report(test_y, pred_y, digits=4)
    # }}}.
    # Save models {{{.
    joblib.dump(nb_model, model_prefix + '.nb', compress=True)
    joblib.dump(svm_model, model_prefix + '.svm', compress=True)
    joblib.dump(pa_model, model_prefix + '.pa', compress=True)
def Predict():
    """Flask view: train on train.csv and classify the submitted text."""
    # Incoming user input from the form.
    user_input = request.form['text']
    # Read the dataset file.
    df = pd.read_csv('train.csv')
    # Map the numeric label values to text labels.
    conversion_dict = {0: 'HQ', 1: 'LQ_EDIT', 2: 'LQ_CLOSE'}
    # NOTE(review): this overwrites the 'Body' text column with label names
    # derived from 'Y', so the model below trains on label text rather than
    # question bodies. Looks like a bug — confirm intent.
    df['Body'] = df['Y'].replace(conversion_dict)
    # print(df.label.value_counts())
    # Train test split — split the data for training.
    x_train, x_test, y_train, y_test = train_test_split(df['Body'], df['Y'], test_size=0.25, random_state=7, shuffle=True)
    tfidf_vectorizer = TfidfVectorizer(stop_words='english', max_df=0.75)
    # Convert the text to numeric TF-IDF features.
    vec_train = tfidf_vectorizer.fit_transform(x_train.values.astype('U'))
    vec_test = tfidf_vectorizer.transform(x_test.values.astype('U'))
    # Train Model
    pac = PassiveAggressiveClassifier(max_iter=50)
    pac.fit(vec_train, y_train)
    # NOTE(review): this NB model is trained but never used for prediction.
    model = MultinomialNB()
    model.fit(vec_train, y_train)
    # Predict
    user_input_tranform = tfidf_vectorizer.transform([user_input])
    y_predict = pac.predict(user_input_tranform)
    return render_template("Predict.html", text=user_input, predict=y_predict)
def model_PassiveAggressive(train_x, train_y, test_x, test_y, n_est=100):
    """Fit a Passive-Aggressive classifier and return
    (score, MAE, predictions, model).

    Note: ``n_est`` is accepted for interface parity with sibling model
    helpers but is not used by this estimator.
    """
    clf = PassiveAggressiveClassifier()
    clf.fit(train_x, train_y)
    accuracy = clf.score(test_x, test_y)
    predicted = clf.predict(test_x)
    error = mean_absolute_error(test_y, predicted)
    return (accuracy, error, predicted, clf)
def linear_models(x_train, y_train):
    """Fit five sklearn linear classifiers, print each one's training
    accuracy, and return them in a fixed order."""
    from sklearn.linear_model import LogisticRegression
    from sklearn.linear_model import PassiveAggressiveClassifier
    from sklearn.linear_model import RidgeClassifierCV
    from sklearn.linear_model import SGDClassifier
    from sklearn.linear_model import Perceptron

    estimators = [
        ('LogisticRegression', LogisticRegression(C=1.2, random_state=0, max_iter=1500)),
        ('PassiveAggressiveClassifier', PassiveAggressiveClassifier()),
        ('RidgeClassifierCV', RidgeClassifierCV()),
        ('SGDClassifier', SGDClassifier()),
        ('Perceptron', Perceptron()),
    ]
    # Fit everything first, then report, matching the original ordering.
    for _, estimator in estimators:
        estimator.fit(x_train, y_train)
    for label, estimator in estimators:
        print(label + ' training accuracy: ', estimator.score(x_train, y_train))
    return tuple(estimator for _, estimator in estimators)
def test_main(self):
    """End-to-end check: build a term-document matrix, train a PA classifier
    on L1-normalised TF-IDF features, and score an unseen document."""
    categories, documents = get_docs_categories()
    # Blank out bracketed meta lines from the corpus.
    clean_function = lambda text: '' if text.startswith('[') else text
    entity_types = set(['GPE'])
    term_doc_mat = (TermDocMatrixFactory(
        category_text_iter=zip(categories, documents),
        clean_function=clean_function,
        nlp=_testing_nlp,
        feats_from_spacy_doc=FeatsFromSpacyDoc(
            entity_types_to_censor=entity_types)).build())
    # NOTE(review): 'n_iter' is the pre-0.19 sklearn parameter name
    # ('max_iter' in newer releases); confirm the pinned sklearn version.
    clf = PassiveAggressiveClassifier(n_iter=5, C=0.5, n_jobs=-1, random_state=0)
    fdc = FeatsFromDoc(
        term_doc_mat._term_idx_store,
        clean_function=clean_function,
        feats_from_spacy_doc=FeatsFromSpacyDoc(
            entity_types_to_censor=entity_types)).set_nlp(_testing_nlp)
    tfidf = TfidfTransformer(norm='l1')
    X = tfidf.fit_transform(term_doc_mat._X)
    clf.fit(X, term_doc_mat._y)
    X_to_predict = fdc.feats_from_doc('Did sometimes march UNKNOWNWORD')
    pred = clf.predict(tfidf.transform(X_to_predict))
    dec = clf.decision_function(X_to_predict)
def PassiveAggressiveClassifier_1(train_predictors,test_predictors,train_target,test_target):
    """Fit a PassiveAggressiveClassifier, print its test accuracy, and
    return (accuracy, predictions) (Python 2)."""
    clf = PassiveAggressiveClassifier()
    clf.fit(train_predictors,train_target)
    predicted = clf.predict(test_predictors)
    accuracy = accuracy_score(test_target, predicted)
    print "Accuracy for Linear Model PassiveAggressiveClassifier: "+str(accuracy)
    return accuracy,predicted
def get_delay():
    """Flask view: classify a submitted article as fake/real and return a
    results page with related Google search links."""
    result = request.form
    query_title = result['title']
    query_text = result['maintext']
    # print(query_text)
    query = get_all_query(query_title, query_text)
    # query = remove_punctuation_stopwords_lemma(query_text)
    # print(query)
    # user_input = {'query':query}
    toSearch = query_title
    query_text = [query_text]
    query_title = [query_title]
    # Vectorize the cleaned query with the module-level fitted vectorizer.
    tfidf_test_input = tfidf_vectorizer.transform(query)
    # NOTE(review): the classifier is re-trained on every request from the
    # module-level tfidf_train/y_train; consider training once at startup.
    linear_clf = PassiveAggressiveClassifier()
    linear_clf.fit(tfidf_train, y_train)
    pred = linear_clf.predict(tfidf_test_input)
    print(pred)
    try:
        from googlesearch import search
    except ImportError:
        print("No module named 'google' found")
    # to search: collect the top 10 result links for the article title.
    links = []
    for j in search(toSearch, tld="co.in", num=10, stop=10, pause=2):
        links.append(j)
    # <style>body{text-align: center;font-family: Arial, Helvetica, sans-serif;}</style>
    # return f'<html><style>body{text-align: center;font-family: Arial, Helvetica, sans-serif;}</style><body><h1>It is a {pred[0]} news. </h1><h2>You may refer to the following articles for more details.</h2> <a href={links[0]}> Article 1 </a><br> <a href={links[1]}> Article 2 </a> <form action="/"> <button type="submit">back </button> </form></body></html>'
    return render_template('result.html', links=links, pred=pred[0])
def passiveAgressive(train_x, train_y, test_x): from sklearn.linear_model import PassiveAggressiveClassifier # apply Linear Regression: model = PassiveAggressiveClassifier() model.fit(train_x, train_y) y_prediction = model.predict(test_x) return y_prediction
def test_wrong_class_weight_label():
    """fit() must raise ValueError when class_weight names an absent label."""
    features = np.array([[-1.0, -1.0], [-1.0, 0], [-0.8, -1.0],
                         [1.0, 1.0], [1.0, 0.0]])
    targets = [1, 1, 1, -1, -1]

    # Label 0 never occurs in `targets`, so fitting must fail.
    model = PassiveAggressiveClassifier(class_weight={0: 0.5}, max_iter=100)
    with pytest.raises(ValueError):
        model.fit(features, targets)
def train_passve_aggresive_classifier(self, tfidf_train, b_train, tfidf_test, b_test):
    """Fit a Passive-Aggressive classifier, print its test accuracy as a
    percentage, and save the trained model via self.save_model."""
    model = PassiveAggressiveClassifier(max_iter=60)
    model.fit(tfidf_train, b_train)
    predictions = model.predict(tfidf_test)
    factcheckscore = accuracy_score(b_test, predictions)
    print(f"Accuracy Is {round(factcheckscore*100,2)}%")
    return self.save_model(model)
def passive_aggressive(sample_data, test_percentage): """ Implement Naive Bayes and perform accuracy_tests """ # Bag of words for first x words X_train, X_test, y_train, y_test = data_models.split_test_train_data(sample_data, test_percentage) linear_clf = PassiveAggressiveClassifier() linear_clf.fit(X_train, y_train) pred = linear_clf.predict(X_test) return (y_test == pred).sum() * 100 / len(y_test)
def passiveAggresive(train, test, Y_train, Y_test, column):
    """Fit a balanced Passive-Aggressive classifier on one label column and
    return its accuracy on the test split."""
    model = PassiveAggressiveClassifier(C=.1, max_iter=1000,
                                        class_weight='balanced', tol=1e-3)
    model.fit(train, Y_train[column])
    model.predict(test)  # kept for parity with the original call sequence
    return model.score(test, Y_test[column])
def paclassifier(train_X, train_Y, test_X):
    """Train a Passive-Aggressive classifier and return predicted labels
    for test_X."""
    print("Training model.....")
    model = PassiveAggressiveClassifier(random_state=0)
    model.fit(train_X, train_Y)
    return model.predict(test_X)
def test_classifier_accuracy():
    # Dense and sparse inputs, with and without an intercept, should all
    # reach at least ~0.79 training accuracy on the shared X/y fixture.
    for data in (X, X_csr):
        for fit_intercept in (True, False):
            # NOTE(review): 'n_iter' is the pre-0.19 sklearn parameter name
            # ('max_iter' in newer releases); confirm the pinned version.
            clf = PassiveAggressiveClassifier(C=1.0, n_iter=30,
                                              fit_intercept=fit_intercept,
                                              random_state=0)
            clf.fit(data, y)
            score = clf.score(data, y)
            assert_greater(score, 0.79)
def featureSelection():
    """K-fold ANOVA feature selection + Passive-Aggressive evaluation.

    For each feature count in (250, 500, 1000), runs 10-fold CV:
    standardises each training fold, keeps the k best features by ANOVA
    F-score, trains a PassiveAggressiveClassifier, and prints the train and
    test accuracy per fold.
    """
    # Load dataset.
    # BUG FIX: the original line ended with a stray line-continuation
    # backslash ("loadDataset()\" followed by a comment), which is a
    # syntax error in Python.
    X_0, y, biomarkerNames = loadDataset()

    # 10-fold cross-validation over the full dataset.
    kf = KFold(n_splits=10)
    kf.get_n_splits(X_0)

    for i in (250, 500, 1000):
        print("Number of Features " + str(i))
        fold = 0
        for train_index, test_index in kf.split(X_0):
            print("Fold " + str(fold))
            fold = fold + 1
            # Selector keeping the i best features by ANOVA F-score.
            selector = SelectKBest(f_classif, k=i)
            # Normalise data: fit the scaler on the training fold only.
            scaler = StandardScaler()
            X_train, X_test = X_0[train_index], X_0[test_index]
            y_train, y_test = y[train_index], y[test_index]
            X_train = scaler.fit_transform(X_train)
            X_test = scaler.transform(X_test)
            # Score features on the training fold, then reduce both folds.
            X_train = selector.fit_transform(X_train, y_train)
            selected = selector.get_support(indices=True)
            X_test = selector.transform(X_test)
            # Train the classifier and report accuracy on both folds.
            clf = PassiveAggressiveClassifier(max_iter=1000, random_state=0, tol=1e-3)
            clf.fit(X_train, y_train)
            accuracy_train = clf.score(X_train, y_train)
            accuracy_test = clf.score(X_test, y_test)
            print("Accuracy Train " + str(accuracy_train))
            print("Accuracy Test " + str(accuracy_test))
    return
def paClassify(X, Y, Xt, Yt, class_weight):
    """Train a Passive-Aggressive classifier on (X, Y) and report the
    accuracy of its predictions on (Xt, Yt) via printAccuracy."""
    title = "Passive Aggressive Classifier"
    # NOTE(review): 'n_iter' is the pre-0.19 sklearn parameter name
    # ('max_iter' since 0.19); confirm the pinned sklearn version.
    classifier = PassiveAggressiveClassifier(n_iter=10, class_weight=class_weight)
    classifier.fit(X, Y)
    YPredict = classifier.predict(Xt)
    printAccuracy(YPredict, Yt, title)
def _passiveaggressiveclassifier(*, train, test, x_predict=None, metrics, C=1.0, fit_intercept=True, max_iter=1000, tol=0.001, early_stopping=False, validation_fraction=0.1, n_iter_no_change=5, shuffle=True, verbose=0, loss='hinge', n_jobs=None, random_state=None, warm_start=False, class_weight=None, average=False):
    """Train a PassiveAggressiveClassifier and score it with the requested metric.

    Parameters mirror sklearn's estimator; `train`/`test` are (X, y) pairs and
    `metrics` is one of 'f1_score', 'jaccard_score' or 'accuracy_score'.
    Returns (model_name, accuracy, predictions-or-None).

    For more info visit :
    https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.PassiveAggressiveClassifier.html#sklearn.linear_model.PassiveAggressiveClassifier
    """
    model = PassiveAggressiveClassifier(
        C=C, fit_intercept=fit_intercept, max_iter=max_iter, tol=tol,
        early_stopping=early_stopping, validation_fraction=validation_fraction,
        n_iter_no_change=n_iter_no_change, shuffle=shuffle, verbose=verbose,
        loss=loss, n_jobs=n_jobs, random_state=random_state,
        warm_start=warm_start, class_weight=class_weight, average=average)
    model.fit(train[0], train[1])
    model_name = 'PassiveAggressiveClassifier'
    y_hat = model.predict(test[0])

    # BUG FIX: an unrecognised `metrics` value previously left `accuracy`
    # unbound and raised a confusing NameError below; fail fast instead.
    scorers = {'f1_score': f1_score,
               'jaccard_score': jaccard_score,
               'accuracy_score': accuracy_score}
    try:
        accuracy = scorers[metrics](test[1], y_hat)
    except KeyError:
        raise ValueError("unknown metrics: %r" % (metrics,)) from None

    if x_predict is None:
        return (model_name, accuracy, None)
    y_predict = model.predict(x_predict)
    return (model_name, accuracy, y_predict)
class DeployedClassifierFactory:
    def __init__(self, term_doc_matrix, term_doc_matrix_factory, category, nlp=None):
        '''This is a class that enables one to train and save a classification model.

        Parameters
        ----------
        term_doc_matrix : TermDocMatrix
        term_doc_matrix_factory : TermDocMatrixFactory
        category : str
            Category name
        nlp : spacy.en.English
        '''
        self._term_doc_matrix = term_doc_matrix
        self._term_doc_matrix_factory = term_doc_matrix_factory
        # The factory must not carry its own nlp or corpus iterator; the
        # matrix supplied above is the single source of training data.
        assert term_doc_matrix_factory._nlp is None
        assert term_doc_matrix_factory.category_text_iter is None
        self._category = category
        self._clf = None    # trained classifier, set by passive_aggressive_train
        self._proba = None  # distance -> probability mapper, set alongside _clf

    def passive_aggressive_train(self):
        '''Trains passive aggressive classifier
        '''
        # NOTE(review): 'n_iter' is the pre-0.19 sklearn parameter name.
        self._clf = PassiveAggressiveClassifier(n_iter=50, C=0.2, n_jobs=-1, random_state=0)
        self._clf.fit(self._term_doc_matrix._X, self._term_doc_matrix._y)
        y_dist = self._clf.decision_function(self._term_doc_matrix._X)
        # Empirical CDFs of decision values on each side of the hyperplane.
        pos_ecdf = ECDF(y_dist[y_dist >= 0])
        neg_ecdf = ECDF(y_dist[y_dist <= 0])

        def proba_function(distance_from_hyperplane):
            # Map a decision value into [0, 1] via the empirical CDFs.
            # NOTE(review): the negative branch also uses pos_ecdf, and
            # neg_ecdf is computed but never used — possibly a bug; confirm.
            if distance_from_hyperplane > 0:
                return pos_ecdf(distance_from_hyperplane) / 2. + 0.5
            elif distance_from_hyperplane < 0:
                return pos_ecdf(distance_from_hyperplane) / 2.
            return 0.5

        self._proba = proba_function
        return self

    def build(self):
        '''Builds Depoyed Classifier
        '''
        # Refuse to deploy an untrained model.
        if self._clf is None:
            raise NeedToTrainExceptionBeforeDeployingException()
        return DeployedClassifier(self._category,
                                  self._term_doc_matrix._category_idx_store,
                                  self._term_doc_matrix._term_idx_store,
                                  self._term_doc_matrix_factory)
def train_online_model(xtr, ytr, model=None):
    """Fit a new PassiveAggressiveClassifier, or incrementally update an
    existing one with partial_fit, and return it (Python 2)."""
    # Train classifier
    t0 = time.time()
    if model is None:
        model = PassiveAggressiveClassifier()
        model.fit(xtr, ytr)
    else:
        # Incremental update keeps previously learned weights.
        model.partial_fit(xtr, ytr)
    print "Training took %.2f seconds" % (time.time()-t0)
    return model
def train_online_model(xtr, ytr, model=None):
    """Fit a new PassiveAggressiveClassifier, or incrementally update an
    existing one with partial_fit, and return it (Python 2)."""
    # Train classifier
    t0 = time.time()
    if model is None:
        model = PassiveAggressiveClassifier()
        model.fit(xtr, ytr)
    else:
        # Incremental update keeps previously learned weights.
        model.partial_fit(xtr, ytr)
    print "Training took %.2f seconds" % (time.time() - t0)
    return model
def classify():
    """Train a Passive-Aggressive classifier on the module-level TF-IDF
    features, print test-set accuracy, and return the test predictions."""
    # Initialize a PassiveAggressiveClassifier
    pac = PassiveAggressiveClassifier(max_iter=50)
    pac.fit(tfidf_train, y_train)

    # Predict on the test set and calculate accuracy
    y_pred_data = pac.predict(tfidf_test)
    # BUG FIX: the original scored `y_pred`, a name never defined in this
    # function (NameError at runtime); score the predictions computed above.
    score = accuracy_score(y_test, y_pred_data)
    print(f'Accuracy: {round(score * 100, 2)}%')
    return y_pred_data
class BinaryClassifier(object):
    """Binary classifier wrapper supporting LR, PA or SVM back-ends, with
    optional global feature scaling by a remembered training std."""

    def __init__(self, classifier_type, scale_features=True):
        self.scale_features = scale_features
        self.classifier_type = classifier_type
        self.clear()
        self.train_std = 0
        # Fixed seed so instance shuffling in train() is reproducible.
        self.random_gen = np.random.RandomState(136543785)

    def clear(self, remember_train_std_if_supported=False):
        """Drop accumulated instances and rebuild the underlying estimator;
        optionally keep the previously computed scaling std."""
        self.positive_instances = []
        self.negative_instances = []
        # self.classifier = svm.SVC(kernel='linear')
        if self.classifier_type == CLASSIFIER_TYPE.LR:
            #print (CLASSIFIER_TYPE.LR)
            self.classifier = LogisticRegression(C=1.0)
        elif self.classifier_type == CLASSIFIER_TYPE.PA:
            #print (CLASSIFIER_TYPE.PA)
            self.classifier = PassiveAggressiveClassifier(loss='hinge', C=1.0)
        elif self.classifier_type == CLASSIFIER_TYPE.SVM:
            self.classifier = svm.SVC(kernel='linear')
        if not remember_train_std_if_supported:
            self.train_std = 0

    def add_positive_instances(self, positive_instances):
        self.positive_instances.extend(positive_instances)

    def add_negative_instances(self, negative_instances):
        self.negative_instances.extend(negative_instances)

    def train(self):
        """Fit the estimator on the accumulated positive (label 1) and
        negative (label 0) instances."""
        X = self.positive_instances + self.negative_instances
        y = np.asarray([1] * len(self.positive_instances) + [0] * len(self.negative_instances))
        # shuffling the train instances in case classifier is sensitive to this order
        Xy = list(zip(X, y))
        self.random_gen.shuffle(Xy)
        X[:], y[:] = zip(*Xy)
        X = mat_concat(X)
        if self.scale_features:
            if self.train_std == 0:
                # Global std estimate over all matrix entries:
                # sqrt(E[x^2] - E[x]^2).
                self.train_std = (pointwise_mult(X, X).mean() - X.mean()**2)**0.5
            X = X / self.train_std
            # X = X/self.train_std
        self.classifier.fit(X, y)

    def predict(self, instances):
        """Predict labels for *instances*, applying the stored scaling."""
        # scaled_instances = [inst/self.train_std for inst in instances]
        instances = mat_concat(instances)
        if self.scale_features and self.train_std > 0:
            instances = instances / self.train_std
        return self.classifier.predict(instances)
def test_passive_aggressive_2():
    """Ensure that the TPOT PassiveAggressiveClassifier outputs the same as the
    sklearn classifier when C == 0.0"""
    tpot_obj = TPOT()
    result = tpot_obj._passive_aggressive(training_testing_data, 0.0, 0)
    result = result[result['group'] == 'testing']
    # NOTE(review): C=0.0001 here while the docstring says C == 0.0 —
    # presumably TPOT clamps non-positive C to this minimum; confirm.
    pagg = PassiveAggressiveClassifier(C=0.0001, loss='hinge', fit_intercept=True, random_state=42)
    pagg.fit(training_features, training_classes)
    assert np.array_equal(result['guess'].values, pagg.predict(testing_features))
def train_model():
    """Train a fake-news classifier on train.csv and classify a sample article.

    Prints the classifier's accuracy on the held-out test split; the trained
    model is also applied to the article extracted from test.txt.
    """
    # Dummy article text to classify.
    article = extract("/home/david/2019-ca400-taland2/src/dataset/test.txt")
    dftrain = pd.read_csv('/home/david/2019-ca400-taland2/src/dataset/train.csv')
    # Drop rows that have null values.
    dftrain = dftrain.dropna()
    # Set column names to variables.
    df_x = dftrain['text']
    df_y = dftrain['label']
    # Split training data.
    x_train, x_test, y_train, y_test = train_test_split(df_x, df_y, test_size=0.33, random_state=53)

    tfv = TfidfVectorizer(stop_words='english', max_df=0.7, max_features=1000)
    x_traintf = tfv.fit_transform(x_train)
    article_testtf = tfv.transform(article)
    tfv_test = tfv.transform(x_test)

    pac = PassiveAggressiveClassifier(n_iter_no_change=5, max_iter=10, early_stopping=True)
    pac.fit(x_traintf, y_train)

    # BUG FIX: the original compared y_test with predictions for the single
    # article (length mismatch -> ValueError in accuracy_score). Score the
    # held-out split, and predict the article separately.
    test_pred = pac.predict(tfv_test)
    accuracy = metrics.accuracy_score(y_test, test_pred)
    article_pred = pac.predict(article_testtf)  # prediction for test.txt

    # BUG FIX: the label said "MultinomialNB" but the model evaluated is a
    # PassiveAggressiveClassifier.
    print("PassiveAggressiveClassifier accuracy: %0.3f" % accuracy)
def get_baseline_pa(dataset, train_label_list, test_label_list, verbose=True):
    """Fit a Passive-Aggressive baseline on the dataset's train split and
    return its accuracy on the test split."""
    (X_train, Y_train), (X_test, Y_test) = dataset
    model = PassiveAggressiveClassifier(n_jobs=-1, fit_intercept=True)
    model.fit(X_train, train_label_list)
    accuracy = model.score(X_test, test_label_list)
    if verbose:
        print('Got baseline of %f with Passive Aggressive classifier' % accuracy)
    return accuracy
def mainworker(limit1,limit2):
    """Read pdata.txt, drop feature columns limit1..limit2, and report the
    mean k-fold CV accuracy of a Passive-Aggressive classifier for
    k in {2, 3, 4, 5} (Python 2)."""
    N=10
    l=[]
    w1=[] # +1 class
    w2=[] #-1 class
    temp=[]
    classlist=[]
    f=open("pdata.txt")
    for line in f:
        x=(line.strip("\n")).split(",")
        temp=[]
        for i in xrange(len(x)):
            x[i]=int(x[i])
            temp.append(x[i])
        # The last value on each row is the class label.
        clas=temp.pop()
        # Remove the feature columns in the inclusive range [limit1, limit2].
        temp=temp[:limit1]+temp[limit2+1:]
        l.append(temp)
        classlist.append(clas)
        """if(temp[-1]==-1): w2.append(temp) else: w1.append(temp)"""
    f.close()
    X=np.array(l)
    y=np.array(classlist)
    X=np.array(l)
    y=np.array(classlist)
    karray=[2,3,4,5]
    for k in karray:
        # NOTE(review): the sample count 11054 is hard-coded; presumably the
        # number of rows in pdata.txt — confirm it matches the file.
        kf = cross_validation.KFold(11054, n_folds=k)
        averager=[]
        for train_index,test_index in kf:
            #print("TRAIN:", train_index, "TEST:", test_index)
            X_train, X_test = X[train_index], X[test_index]
            y_train, y_test = y[train_index], y[test_index]
            #print X_train, len(X_test), len(y_train), len(y_test)
            train_data=[]
            test_data=[]
            train_label=[]
            test_label=[]
            X1 = X_train #train_data
            Y1 = y_train #train_label
            clf = PassiveAggressiveClassifier()
            #clf = svm.SVC(kernel='linear')
            clf.fit(X1,Y1)
            Z = X_test #test_data
            predicted = clf.predict(Z)
            accuracy = getAccuracy(predicted, y_test) #test_label)
            averager.append(accuracy)
        answer=np.mean(averager)
        print "The mean for",k,"fold is:"
        print answer
def test_wrong_class_weight_format():
    """fit() must reject class_weight values that are neither dicts keyed by
    label nor the recognised string options."""
    features = np.array([[-1.0, -1.0], [-1.0, 0], [-.8, -1.0],
                         [1.0, 1.0], [1.0, 0.0]])
    targets = [1, 1, 1, -1, -1]

    # A list is not a valid class_weight; neither is an unknown string.
    for bad_weight in ([0.5], "the larch"):
        model = PassiveAggressiveClassifier(class_weight=bad_weight, max_iter=100)
        with pytest.raises(ValueError):
            model.fit(features, targets)
def PassiveAggressive_clf(training_set_np, validation_set_np, testing_set_np, training_label, validation_label, testing_label):
    """Fit a Passive-Aggressive classifier on the training split and print
    its accuracy (in percent) on all three splits."""
    model = PassiveAggressiveClassifier(max_iter=50)
    model.fit(training_set_np, training_label)
    print("Passive Aggressive Classifier")
    for prefix, data, target in (
            ("Training Set Accuracy : ", training_set_np, training_label),
            ("Validation Set Accuracy: ", validation_set_np, validation_label),
            ("Testing Set Accuracy : ", testing_set_np, testing_label)):
        print(prefix + str(100 * model.score(data, target)))
    print("\n")
def test_classifier_correctness(loss):
    """The reference PA implementation and sklearn's classifier must learn
    (nearly) identical weights after two unshuffled epochs."""
    binary_targets = y.copy()
    binary_targets[y != 1] = -1

    reference = MyPassiveAggressive(loss=loss, n_iter=2)
    reference.fit(X, binary_targets)

    for features in (X, X_csr):
        estimator = PassiveAggressiveClassifier(loss=loss, max_iter=2,
                                                shuffle=False, tol=None)
        estimator.fit(features, binary_targets)
        assert_array_almost_equal(reference.w, estimator.coef_.ravel(), decimal=2)
def TrainSVM(data,labels):
    """Fit and return a classifier on (data, labels).

    The hard-coded `usealgo` switch selects the back-end; with the current
    value of 1 only the linear-kernel SVC branch is live, the PA and
    LinearSVC branches are dead code.
    """
    usealgo = 1
    if usealgo == 0:
        from sklearn.linear_model import PassiveAggressiveClassifier
        # NOTE(review): 'n_iter' is the pre-0.19 sklearn parameter name.
        clf=PassiveAggressiveClassifier(class_weight='balanced',n_jobs=-1,n_iter=15,fit_intercept=True)
    elif usealgo ==1:
        clf = SVC(probability= True,decision_function_shape='ovr',random_state=np.random.randint(1000),kernel="linear")
    elif usealgo ==2:
        from sklearn.svm import LinearSVC
        clf = LinearSVC()
    clf.fit(data,labels)
    return clf
class DeployedClassifierFactory:
    def __init__(self, term_doc_matrix, term_doc_matrix_factory, category, nlp=None):
        '''This is a class that enables one to train and save a classification model.

        Parameters
        ----------
        term_doc_matrix : TermDocMatrix
        term_doc_matrix_factory : TermDocMatrixFactory
        category : str
            Category name
        nlp : spacy parser
        '''
        self._term_doc_matrix = term_doc_matrix
        self._term_doc_matrix_factory = term_doc_matrix_factory
        # The factory must not carry its own nlp or corpus iterator; the
        # matrix supplied above is the single source of training data.
        assert term_doc_matrix_factory._nlp is None
        assert term_doc_matrix_factory.category_text_iter is None
        self._category = category
        self._clf = None    # trained classifier, set by passive_aggressive_train
        self._proba = None  # distance -> probability mapper, set alongside _clf

    def passive_aggressive_train(self):
        '''Trains passive aggressive classifier
        '''
        # NOTE(review): 'n_iter' is the pre-0.19 sklearn parameter name.
        self._clf = PassiveAggressiveClassifier(n_iter=50, C=0.2, n_jobs=-1, random_state=0)
        self._clf.fit(self._term_doc_matrix._X, self._term_doc_matrix._y)
        y_dist = self._clf.decision_function(self._term_doc_matrix._X)
        # Empirical CDFs of decision values on each side of the hyperplane.
        pos_ecdf = ECDF(y_dist[y_dist >= 0])
        neg_ecdf = ECDF(y_dist[y_dist <= 0])

        def proba_function(distance_from_hyperplane):
            # Map a decision value into [0, 1] via the empirical CDFs.
            # NOTE(review): the negative branch also uses pos_ecdf, and
            # neg_ecdf is computed but never used — possibly a bug; confirm.
            if distance_from_hyperplane > 0:
                return pos_ecdf(distance_from_hyperplane) / 2. + 0.5
            elif distance_from_hyperplane < 0:
                return pos_ecdf(distance_from_hyperplane) / 2.
            return 0.5

        self._proba = proba_function
        return self

    def build(self):
        '''Builds Depoyed Classifier
        '''
        # Refuse to deploy an untrained model.
        if self._clf is None:
            raise NeedToTrainExceptionBeforeDeployingException()
        return DeployedClassifier(self._category,
                                  self._term_doc_matrix._category_idx_store,
                                  self._term_doc_matrix._term_idx_store,
                                  self._term_doc_matrix_factory)
def test_classifier_correctness(loss):
    """Reference implementation and sklearn PA classifier must agree on the
    learned weights after two unshuffled epochs (dense and sparse input)."""
    binary_targets = y.copy()
    binary_targets[y != 1] = -1

    reference = MyPassiveAggressive(
        C=1.0, loss=loss, fit_intercept=True, n_iter=2)
    reference.fit(X, binary_targets)

    for features in (X, X_csr):
        estimator = PassiveAggressiveClassifier(
            C=1.0, loss=loss, fit_intercept=True, max_iter=2,
            shuffle=False, tol=None)
        estimator.fit(features, binary_targets)
        assert_array_almost_equal(reference.w, estimator.coef_.ravel(), decimal=2)
def test_classifier_accuracy():
    """PA classifier reaches > 0.79 training accuracy for dense and sparse
    input, with and without intercept, and with and without averaging."""
    for features in (X, X_csr):
        for fit_intercept in (True, False):
            for average in (False, True):
                model = PassiveAggressiveClassifier(
                    C=1.0, max_iter=30, fit_intercept=fit_intercept,
                    random_state=1, average=average, tol=None)
                model.fit(features, y)
                assert_greater(model.score(features, y), 0.79)
                if average:
                    # Averaged runs expose both averaged and standard params.
                    for attr in ('average_coef_', 'average_intercept_',
                                 'standard_intercept_', 'standard_coef_'):
                        assert hasattr(model, attr)
def test_classifier_correctness():
    """For both hinge losses, the reference implementation and sklearn's PA
    classifier must learn the same weights after two epochs."""
    binary_targets = y.copy()
    binary_targets[y != 1] = -1

    for loss_name in ("hinge", "squared_hinge"):
        reference = MyPassiveAggressive(C=1.0, loss=loss_name,
                                        fit_intercept=True, n_iter=2)
        reference.fit(X, binary_targets)

        estimator = PassiveAggressiveClassifier(C=1.0, loss=loss_name,
                                                fit_intercept=True, n_iter=2)
        estimator.fit(X, binary_targets)
        assert_array_almost_equal(reference.w, estimator.coef_.ravel())
def test_class_weights():
    """Down-weighting class 1 should flip the prediction near the boundary."""
    features = np.array([[-1.0, -1.0], [-1.0, 0], [-.8, -1.0],
                         [1.0, 1.0], [1.0, 0.0]])
    targets = [1, 1, 1, -1, -1]

    unweighted = PassiveAggressiveClassifier(C=0.1, max_iter=100,
                                             class_weight=None,
                                             random_state=100)
    unweighted.fit(features, targets)
    assert_array_equal(unweighted.predict([[0.2, -1.0]]), np.array([1]))

    # we give a small weights to class 1: the hyperplane should rotate
    # clock-wise and the prediction on this point should shift
    weighted = PassiveAggressiveClassifier(C=0.1, max_iter=100,
                                           class_weight={1: 0.001},
                                           random_state=100)
    weighted.fit(features, targets)
    assert_array_equal(weighted.predict([[0.2, -1.0]]), np.array([-1]))
def main():
    """Read labelled training docs from trainingdata.txt, fit a TF-IDF +
    Passive-Aggressive pipeline, then classify documents from stdin and
    print one predicted label per line (Python 2)."""
    #stemmer = SnowballStemmer('english')
    #stemmer = EnglishStemmer()
    training_data=open('trainingdata.txt', 'rU')
    # First line holds the number of training documents.
    n = int(training_data.readline().strip())
    train_data = []
    class_data = []
    for i in range(n):
        # Each line: single-digit class label followed by the document text.
        line = training_data.readline().strip()
        train_data.append(line[1:].strip())
        class_data.append(int(line[0]))
    train_data = np.array(train_data)
    class_data = np.array(class_data)
    # 2) Vectorize bag of words
    vectorizer = TfidfVectorizer(stop_words="english", max_df=0.5, sublinear_tf=True )
    vectorizer.fit(train_data)
    X_train = vectorizer.transform(train_data)
    # Read test data from input
    X_test = np.array([raw_input().strip() for i in range(int(raw_input().strip()))])
    X_test = vectorizer.transform(X_test)
    # NOTE(review): 'n_iter' is the pre-0.19 sklearn parameter name.
    clf = PassiveAggressiveClassifier(n_iter=9)
    clf.fit(X_train, class_data)
    pred = clf.predict(X_test)
    for i in pred:
        print i
def test_main(self):
    """End-to-end check: build a term-document matrix, train a PA classifier
    on L1-normalised TF-IDF features, and score an unseen document."""
    categories, documents = get_docs_categories()
    # Blank out bracketed meta lines from the corpus.
    clean_function = lambda text: '' if text.startswith('[') else text
    entity_types = set(['GPE'])
    term_doc_mat = (
        TermDocMatrixFactory(
            category_text_iter=zip(categories, documents),
            clean_function=clean_function,
            nlp=_testing_nlp,
            feats_from_spacy_doc=FeatsFromSpacyDoc(entity_types_to_censor=entity_types)
        ).build()
    )
    # NOTE(review): 'n_iter' is the pre-0.19 sklearn parameter name
    # ('max_iter' in newer releases); confirm the pinned sklearn version.
    clf = PassiveAggressiveClassifier(n_iter=5, C=0.5, n_jobs=-1, random_state=0)
    fdc = FeatsFromDoc(term_doc_mat._term_idx_store,
                       clean_function=clean_function,
                       feats_from_spacy_doc=FeatsFromSpacyDoc(
                           entity_types_to_censor=entity_types)).set_nlp(_testing_nlp)
    tfidf = TfidfTransformer(norm='l1')
    X = tfidf.fit_transform(term_doc_mat._X)
    clf.fit(X, term_doc_mat._y)
    X_to_predict = fdc.feats_from_doc('Did sometimes march UNKNOWNWORD')
    pred = clf.predict(tfidf.transform(X_to_predict))
    dec = clf.decision_function(X_to_predict)
class PassiveAgressiveClassifier(Classifier):
    """Cuisine classifier backed by a Passive-Aggressive model (OCC),
    fitted lazily on the first classify() call (Python 2)."""

    def __init__(self, matrixdatabase):
        self._matrix_database = matrixdatabase
        self._has_fit = False  # set True after the lazy fit in classify()
        # NOTE(review): 'n_iter' is the pre-0.19 sklearn parameter name.
        self._occ = OCC(C=0.0083, n_iter=27, loss="hinge")

    def learn(self, ingredients, cuisine):
        # Training data comes from the matrix database; nothing to do here.
        return

    def classify(self, ingredients):
        """Predict the cuisine for a recipe's ingredient list."""
        # Fit once, on first use.
        if not self._has_fit:
            matrix, classes = self._matrix_database.make_train_matrix()
            self._occ = self._occ.fit(matrix, classes)
            print "Fitting complete..."
            self._has_fit = True
        output = self._occ.predict(self._matrix_database.make_row_from_recipe(ingredients))
        return output[0]
def test_equal_class_weight():
    """With already-balanced data, class_weight None, 'balanced', and explicit
    equal weights should yield (nearly) identical coefficients."""
    X2 = [[1, 0], [1, 0], [0, 1], [0, 1]]
    y2 = [0, 0, 1, 1]
    # NOTE(review): 'n_iter' is the pre-0.19 sklearn parameter name.
    clf = PassiveAggressiveClassifier(C=0.1, n_iter=1000, class_weight=None)
    clf.fit(X2, y2)

    # Already balanced, so "balanced" weights should have no effect
    clf_balanced = PassiveAggressiveClassifier(C=0.1, n_iter=1000,
                                               class_weight="balanced")
    clf_balanced.fit(X2, y2)

    clf_weighted = PassiveAggressiveClassifier(C=0.1, n_iter=1000,
                                               class_weight={0: 0.5, 1: 0.5})
    clf_weighted.fit(X2, y2)

    # should be similar up to some epsilon due to learning rate schedule
    assert_almost_equal(clf.coef_, clf_weighted.coef_, decimal=2)
    assert_almost_equal(clf.coef_, clf_balanced.coef_, decimal=2)
# Evaluate MultinomialNB on the count features.
plot_confusion_matrix(cm, classes=['FAKE', 'REAL'])
clf = MultinomialNB()
clf.fit(count_train, y_train)
pred = clf.predict(count_test)
score = metrics.accuracy_score(y_test, pred)
print("accuracy: %0.3f" % score)
cm = metrics.confusion_matrix(y_test, pred, labels=['FAKE', 'REAL'])
plot_confusion_matrix(cm, classes=['FAKE', 'REAL'])

# Testing: Passive-Aggressive classifier on the TF-IDF features.
from sklearn.linear_model import PassiveAggressiveClassifier
# NOTE(review): 'n_iter' is the pre-0.19 sklearn parameter name.
linear_clf = PassiveAggressiveClassifier(n_iter=50)
linear_clf.fit(tfidf_train, y_train)
pred = linear_clf.predict(tfidf_test)
score = metrics.accuracy_score(y_test, pred)
print("accuracy: %0.3f" % score)
cm = metrics.confusion_matrix(y_test, pred, labels=['FAKE', 'REAL'])
plot_confusion_matrix(cm, classes=['FAKE', 'REAL'])

# Sweep MultinomialNB smoothing strength alpha over [0, 1).
clf = MultinomialNB(alpha=0.1)
last_score = 0
for alpha in np.arange(0,1,.1):
    nb_classifier = MultinomialNB(alpha=alpha)
    nb_classifier.fit(tfidf_train, y_train)
    pred = nb_classifier.predict(tfidf_test)
    score = metrics.accuracy_score(y_test, pred)
testX = getFeatures(testTweets, countVecNGram, dictVec)

# Average Perceptron train/dev accuracy over 10 refits.
percScoresTrain = []
percScoresDev = []
for i in range(10):
    perceptron.fit(trainX, trainY)
    percScoresDev.append(perceptron.score(devX, devY))
    percScoresTrain.append(perceptron.score(trainX, trainY))
print "Perceptron Train:", np.mean(percScoresTrain)
print "Perceptron Dev:", np.mean(percScoresDev)

# Average Passive-Aggressive train/dev accuracy over 10 refits.
passAggScoresTrain = []
passAggScoresDev = []
for i in range(10):
    passAgg.fit(trainX, trainY)
    passAggScoresDev.append( passAgg.score(devX, devY))
    passAggScoresTrain.append( passAgg.score(trainX, trainY))
print "Passive Aggressive Train:", np.mean(passAggScoresTrain)
print "Passive Aggressive Dev:", np.mean(passAggScoresDev)

# NOTE(review): this "Small" run still fits on the full trainX/trainY but
# scores trainXSmall — presumably it should fit on the small split; confirm.
passAggScoresSmallTrain = []
passAggScoresSmallDev = []
for i in range(10):
    passAgg.fit(trainX, trainY)
    passAggScoresSmallDev.append( passAgg.score(devX, devY))
    passAggScoresSmallTrain.append( passAgg.score(trainXSmall,trainYSmall))
targets.append(int(line[0])) docs.append(' '.join([i for i in line[1:] if not is_stopword(i)])) count_vect = CountVectorizer(input='content',ngram_range=(1,2)) X_train_counts = count_vect.fit_transform(docs) tf_transformer = TfidfTransformer(use_idf=True).fit(X_train_counts) X_train_tf = tf_transformer.transform(X_train_counts) #svd = TruncatedSVD(n_components=55, random_state=7) #X_train = svd.fit_transform(X_train_tf) #clf = KNeighborsClassifier(n_neighbors=8).fit(X_train, targets) #clf = BernoulliNB(alpha=.01) #clf = LinearSVC() clf=PassiveAggressiveClassifier(n_iter=9) clf.fit(X_train_tf, targets) def classify(content): global count_vect global tf_transformer global svd global clf X_new_counts = count_vect.transform(content) X_new_tfidf = tf_transformer.transform(X_new_counts) #X_new = svd.transform(X_new_tfidf) return clf.predict(X_new_tfidf) tc = int(raw_input()) inp = [] for tcc in range(tc): x = raw_input()
#print X_train_tfidf.shape ntest = input() testdoc = [] for t in range(0, ntest): doc = raw_input() testdoc.append(doc) X_new_counts = count_vect.transform(testdoc) X_new_tfidf = tfidf_transformer.transform(X_new_counts) """" #Naive bayes from sklearn.naive_bayes import MultinomialNB clf = MultinomialNB().fit(X_train_tfidf, trainlabel) predicted = clf.predict(X_new_tfidf) #test random forest from sklearn.ensemble import RandomForestClassifier clf = RandomForestClassifier(n_estimators=10) clf = clf.fit(X_train_tfidf, trainlabel) predicted = clf.predict(X_new_tfidf) """ from sklearn.linear_model import PassiveAggressiveClassifier clf = PassiveAggressiveClassifier(n_iter=50) clf = clf.fit(X_train_tfidf, trainlabel) predicted = clf.predict(X_new_tfidf) for t in range(0, ntest): print predicted[t]
#https://www.hackerrank.com/challenges/document-classification/submissions/code/10577787 # Enter your code here. Read input from STDIN. Print output to STDOUT documents=[] target=[] cnt=0 from sklearn.linear_model import PassiveAggressiveClassifier with open("trainingdata.txt","rb") as infile: for line in infile: if cnt==0: cnt=1 continue category=int(line[0:2]) doc=line[2:] target.append(category) documents.append(doc) from sklearn.feature_extraction.text import TfidfVectorizer transformer = TfidfVectorizer(sublinear_tf=True, max_df=0.5, analyzer='word',stop_words='english') X = transformer.fit_transform(documents) from sklearn.naive_bayes import MultinomialNB clf = PassiveAggressiveClassifier(n_iter=50) clf.fit(X, target) n=int(raw_input()) for i in range(0,n): X=transformer.transform([raw_input()]) print(clf.predict(X))[0]
def runLearner(printStages = True, useSelector = False, discreteHelpfulness = True, useRST = True, useFew = False): learner = PassiveAggressiveClassifier() if discreteHelpfulness else PassiveAggressiveRegressor() #bestwords = getBestWords(instances,num=1000) tfidvec = TfidfVectorizer(sublinear_tf=True,stop_words='english', ngram_range=(1,3), decode_error='replace') selector = SelectKBest(chi2, k=50000) if useSelector else None encoder = LabelEncoder() if discreteHelpfulness else None if discreteHelpfulness: classlabels = encoder.fit_transform(labels) newData = False count = 0 if useRST: print 'Getting RST data' nums, texts, ilabels = getPickledRSTSciKitDataLists(True) if newData else getRSTSciKitDataLists(True) random = RandomFeatureExtractor() lengthBaseline = LenFeatureExtractor() fullRST = FullPickledRSTFeatureExtractor(nums) if newData else FullTextRSTFeatureExtractor(nums) limitedRST = LimitedPickledRSTFeatureExtractor(nums) if newData else LimitedTextRSTFeatureExtractor(nums) vectorizer = FeatureUnion([('extra',limitedRST),('tfid',tfidvec)]) print 'Fitting random features baseline' random.fit(texts) print 'Fitting text length baseline' lengthBaseline.fit(texts) print 'Fitting full RST features' fullRST.fit(texts) print 'Fitting limited RST features' limitedRST.fit(texts) print 'Fitting limited RST with tfidvec features' vectorizer.fit(texts) print 'Fitting tfidvec features' tfidvec.fit(texts) split = int(0.8*len(ilabels)) trainData = (texts[:split],ilabels[:split]) testData = (texts[split:],ilabels[split:]) X,y = getAsSciKit(trainData[0],trainData[1],random,encoder,selector) learner.fit(X,y) X,y = getAsSciKit(trainData[0],trainData[1],random,encoder,selector) print 'random features baseline trained on %d instances has accuracy %f'%(len(trainData[0]),learner.score(X,y)) dummy = DummyClassifier() X,y = getAsSciKit(trainData[0],trainData[1],random,encoder,selector) dummy.fit(X,y) X,y = getAsSciKit(testData[0],testData[1],random,encoder,selector) print 'Dummy 
label distribution baseline trained on %d instances has accuracy %f'%(len(trainData[0]),dummy.score(X,y)) X,y = getAsSciKit(trainData[0],trainData[1],lengthBaseline,encoder,selector) learner.fit(X,y) X,y = getAsSciKit(testData[0],testData[1],lengthBaseline,encoder,selector) print 'text length baseline trained on %d instances has accuracy %f'%(len(trainData[0]),learner.score(X,y)) X,y = getAsSciKit(trainData[0],trainData[1],fullRST,encoder,selector) learner.fit(X,y) X,y = getAsSciKit(testData[0],testData[1],fullRST,encoder,selector) print 'Full RST learner trained on %d instances has accuracy %f'%(len(trainData[0]),learner.score(X,y)) X,y = getAsSciKit(trainData[0],trainData[1],limitedRST,encoder,selector) learner.fit(X,y) X,y = getAsSciKit(testData[0],testData[1],limitedRST,encoder,selector) print 'Limited RST learner trained on %d instances has accuracy %f'%(len(trainData[0]),learner.score(X,y)) X,y = getAsSciKit(trainData[0],trainData[1],vectorizer,encoder,selector) learner.fit(X,y) X,y = getAsSciKit(testData[0],testData[1],vectorizer,encoder,selector) print 'Limited RST with ngram learner trained on %d instances has accuracy %f'%(len(trainData[0]),learner.score(X,y)) X,y = getAsSciKit(trainData[0],trainData[1],tfidvec,encoder,selector) learner = learner.fit(X,y) X,y = getAsSciKit(testData[0],testData[1],tfidvec,encoder,selector) print 'ngram learner trained on %d instances has accuracy %f'%(len(trainData[0]),learner.score(X,y)) else: vectorizer = tfidvec testData = None vocabGotten = False instances = ([],[]) numVocab = 50000 numTest = 50000 numTrain = 100000 maxTrainStages = 20 for text,label in getSciKitData(stateProgress = False, discreteLabels=discreteHelpfulness): if label!='few' or useFew: instances[0].append(text) instances[1].append(label) if not vocabGotten and len(instances[0]) == numVocab: if printStages: print 'Fitting vocabulary with %d instances'%numVocab vectorizer.fit(instances[0],None) if selector is not None: X,y = 
getSciKitInstance(instances[0],instances[1],vectorizer,encoder,None) selector.fit(X,y) vocabGotten = True instances = ([],[]) elif vocabGotten and testData is None and len(instances[0]) == numTest: if printStages: print 'Getting test data with %d instances'%numTest testData = getSciKitInstance(instances[0],instances[1],vectorizer,encoder,selector) instances = ([],[]) elif vocabGotten and testData is not None and len(instances[0]) == numTrain: X,y = getSciKitInstance(instances[0],instances[1],vectorizer,encoder,selector) if discreteHelpfulness: learner = learner.partial_fit(X,y, classes = classlabels) else: learner = learner.partial_fit(X,y) instances = ([],[]) count = count + 1 if printStages: print 'Baseline trained on %d instances has accuracy %f'%(count*numTrain,learner.score(testData[0],testData[1])) elif count == maxTrainStages: break print 'Final learner trained on %d instances has accuracy %f'%(maxTrainStages*numTrain,learner.score(testData[0],testData[1]))
#y = [1,1,1,1] trans = vectorizer.fit_transform(x) #print vectorizer.transform(["I am in a tree tree"]).toarray() #print vectorizer.get_feature_names() #print trans.toarray() #print sorted(vectorizer.vocabulary_) print len(vectorizer.vocabulary_) K = 1 from sklearn import neighbors #clf = neighbors.KNeighborsClassifier(K,weights = 'distance', leaf_size= 30) from sklearn.linear_model import PassiveAggressiveClassifier clf = PassiveAggressiveClassifier(n_iter=50) clf.fit(trans, y) #f = open("testDatatextClassification.txt",'r') f = open("input01.txt",'r') f2 = open("output01.txt","r") d = f.readlines() d = d[1:] ans = map(int,f2.readlines()) t0= time.clock() summing = 0; for j,i in enumerate(d): sol = int(clf.predict(vectorizer.transform([i]).toarray())[0]) #print sol, ans[j] if (sol==ans[j]): summing = summing + 1
def run(self, nFold=3, loss='hinge', iter=10, verbose=1):
    """Score the protein network with a Passive-Aggressive classifier and
    estimate AUROC by nFold cross-validation.

    First fits a model on the full annotation to produce self._scores.
    Then, for each fold, zeroes the annotations of the held-out proteins,
    refits, and collects the held-out decision-function scores to compute
    per-fold AUROCs (averaged into self._auroc) and a pooled ROC.

    NOTE(review): `iter` shadows the builtin and feeds n_iter, which was
    removed in scikit-learn 0.21 in favour of max_iter. Python 2 (print).
    """
    log.debug("PA: run")
    (numx, numy) = self._network.shape
    pp = permutation(numx)  # random protein order shared by all folds
    # Full-data model: source of the reported per-protein scores.
    model = PassiveAggressiveClassifier(loss=loss, n_iter=iter, verbose=verbose)
    model.fit(self._network, self._annotation.ravel())
    scores = model.decision_function(self._network)
    self._scores = self._convertScore(scores)
    fold = 0
    offset = 0
    meanroc = []  # per-fold AUROC values
    labelIx = range(numx)  # NOTE(review): unused in this method
    while fold < nFold:
        log.debug("NV: ___ fold= %d ___" % fold)
        lastelem = int(min(numx, offset+floor(numx/nFold)))
        # Indices held out in this fold.
        # NOTE(review): pp[offset+1:lastelem] skips pp[offset], so one
        # protein per fold is never held out -- looks like an off-by-one
        # (pp[offset:lastelem]); confirm before changing, as it alters the
        # reported AUROC.
        ix = []
        for index in pp[offset+1:lastelem]:
            ix.append(index)
        print lastelem
        offset = lastelem
        # Zero the held-out annotations and refit on the masked labels.
        labeltmp = []
        for value in self._annotation:
            labeltmp.append(float(value))
        for index in ix:
            labeltmp[index] = 0
        model = PassiveAggressiveClassifier(loss=loss, n_iter=iter, verbose=verbose)
        model.fit(self._network, labeltmp)
        scores = model.decision_function(self._network)
        scores = self._convertScore(scores)
        # Collect held-out scores/labels for this fold plus the pooled lists.
        score = []
        label = []
        protein = []
        for index in ix:
            score.append(float(scores[index]))
            label.append(int(self._annotation[index]))
            protein.append(int(self._proteinid[index]))
            self._foldlabels.append(int(self._annotation[index]))
            self._foldscores.append(float(scores[index]))
            self._foldproteins.append(int(self._proteinid[index]))
        auroc = self.AUROC(label, score)
        log.debug("AUROC= %.4f" % auroc)
        meanroc.append(auroc)
        fold += 1
    # Mean of the per-fold AUROCs.
    self._auroc = reduce(lambda x, y: x + y / float(len(meanroc)), meanroc, 0)
    # Pooled AUROC / ROC curve over all folds.
    auroc = self.AUROC(self._foldlabels, self._foldscores)
    self._TPR_FPR(self._foldlabels, self._foldscores)
# Benchmarks a Passive-Aggressive classifier on features reduced by the
# fitted selector `sfm`, then grid-searches C over a log-spaced range.
# NOTE(review): sfm, benchmark, X_train/X_test, y_train/y_test, np and cv
# are defined earlier in the file.
X_train_select = sfm.transform(X_train)
X_test_select = sfm.transform(X_test)
# test with new clf
clf1 = PassiveAggressiveClassifier(C=0.5, n_iter=200, loss='hinge',random_state = 42)
benchmark(clf1, X_train_select, y_train, X_test_select, y_test)
# GridSearch for C
# Set the parameters by cross-validation
tuned_parameters = [{'C': np.logspace(-6, 0, 1000)}]
score = 'accuracy'
print("# Tuning hyper-parameters for %s" % score)
print()
# NOTE(review): the grid search fits on the UNSELECTED X_train while the
# benchmark above used X_train_select -- confirm this is intended.
clf = GridSearchCV(clf1, tuned_parameters, cv=cv)
clf.fit(X_train, y_train)
print("Best parameters set found on development set:")
print()
print(clf.best_params_)
print()
print("Grid scores on development set:")
print()
# NOTE(review): grid_scores_ was deprecated in scikit-learn 0.18 and removed
# in 0.20; modern versions expose cv_results_ instead.
for params, mean_score, scores in clf.grid_scores_:
    print("%0.3f (+/-%0.03f) for %r" % (mean_score, scores.std() * 2, params))
print()
import numpy as np
import pandas as pd
# FIX: sklearn.cross_validation was deprecated in 0.18 and removed in 0.20;
# train_test_split (with its stratify parameter) lives in model_selection.
from sklearn.model_selection import train_test_split
from sklearn.linear_model import PassiveAggressiveClassifier

# NOTE: Make sure that the class is labeled 'class' in the data file
tpot_data = pd.read_csv('PATH/TO/DATA/FILE', delimiter='COLUMN_SEPARATOR')
# Stratified 75/25 split of the row index, preserving class balance.
training_indices, testing_indices = train_test_split(tpot_data.index, stratify = tpot_data['class'].values, train_size=0.75, test_size=0.25)

result1 = tpot_data.copy()

# Perform classification with a passive aggressive classifier
pagr1 = PassiveAggressiveClassifier(C=0.81, loss="squared_hinge", fit_intercept=True, random_state=42)
pagr1.fit(result1.loc[training_indices].drop('class', axis=1).values, result1.loc[training_indices, 'class'].values)
# Predictions for ALL rows (train and test) in a new column.
result1['pagr1-classification'] = pagr1.predict(result1.drop('class', axis=1).values)
ls=[] for _ in range(n): # change this to n in stead of 3 x=f.readline() xs=x[2:] xn=x[0] ls.append(xs) ln.append(xn) #stem the words bag_of_words=vectorizer.fit(ls) bag_of_words=vectorizer.transform(ls) cmax=0 for cc in range(1,100): #sw=stopwords.words() #stopwords are not supported, requires download clf = PassiveAggressiveClassifier(n_iter=9,C=cc/10) # svm=LinearSVC(C=cc/10.0) clf.fit(bag_of_words,ln) #Now get input (test) data lt=[] filename=open("testdata.txt") line = filename.readline() ntests=int(line) for _ in range(ntests): lt.append(filename.readline()) bag_of_test_words=vectorizer.transform(lt) result=clf.predict(bag_of_test_words) actuals=[] filename=open("testresults.txt") z=0 for x in range(len(result)):