class FakeNewDetector:
    """Fake-news detector: TF-IDF features fed to a PassiveAggressiveClassifier."""

    def __init__(self):
        # Vectorizer drops English stop words and terms that appear in >70% of docs.
        self.tf_idf_vectorizer = TfidfVectorizer(stop_words='english', max_df=0.7)
        self.pac_model = None  # fitted classifier; populated by prepare_model()

    def prepare_model(self):
        """Load the dataset, train the classifier, and print evaluation metrics."""
        df = pd.read_csv('data_set/news.csv', encoding='latin-1')
        # (Removed dead `df.head(10)` — its result was never used.)
        x_train, x_test, y_train, y_test = train_test_split(
            df.text, df.label, test_size=0.2, random_state=7)
        tfidf_train = self.tf_idf_vectorizer.fit_transform(x_train)
        tfidf_test = self.tf_idf_vectorizer.transform(x_test)
        self.pac_model = PassiveAggressiveClassifier(max_iter=50)
        result = self.pac_model.fit(tfidf_train, y_train)
        print(result)
        y_pred = self.pac_model.predict(tfidf_test)
        score = accuracy_score(y_test, y_pred)
        print(f'Accuracy: {round(score * 100, 2)}%')
        # Print the confusion matrix instead of discarding it (original bug:
        # it was computed and thrown away).
        print(confusion_matrix(y_test, y_pred, labels=['FAKE', 'REAL']))

    def predict_outcome(self, input_message):
        """Classify a single message; returns an array holding one label."""
        test_array = np.array([input_message])
        tfidf_test = self.tf_idf_vectorizer.transform(test_array)
        return self.pac_model.predict(tfidf_test)

    def predict_outcome_list(self, input_message_list):
        """Classify a list of messages; returns an array of labels."""
        test_array = np.array(input_message_list)
        tfidf_test = self.tf_idf_vectorizer.transform(test_array)
        return self.pac_model.predict(tfidf_test)
def passiveAggresive(train, test, Y_train, Y_test, column):
    """Fit a Passive-Aggressive classifier and return its test accuracy.

    `column` selects the target column from the Y_train / Y_test frames.
    """
    clf = PassiveAggressiveClassifier(C=.1, max_iter=1000,
                                      class_weight='balanced', tol=1e-3)
    clf.fit(train, Y_train[column])
    # Removed dead `clf.predict(test)` — its result was discarded, and
    # clf.score() predicts internally anyway.
    return clf.score(test, Y_test[column])
def _passiveaggressiveclassifier(*, train, test, x_predict=None, metrics, C=1.0,
                                 fit_intercept=True, max_iter=1000, tol=0.001,
                                 early_stopping=False, validation_fraction=0.1,
                                 n_iter_no_change=5, shuffle=True, verbose=0,
                                 loss='hinge', n_jobs=None, random_state=None,
                                 warm_start=False, class_weight=None, average=False):
    """Fit a PassiveAggressiveClassifier and score it with the chosen metric.

    train/test are (features, labels) pairs; `metrics` is one of
    'f1_score', 'jaccard_score', 'accuracy_score'.  Returns
    (model_name, accuracy, predictions-or-None).

    For more info visit :
    https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.PassiveAggressiveClassifier.html#sklearn.linear_model.PassiveAggressiveClassifier
    """
    model = PassiveAggressiveClassifier(
        C=C, fit_intercept=fit_intercept, max_iter=max_iter, tol=tol,
        early_stopping=early_stopping, validation_fraction=validation_fraction,
        n_iter_no_change=n_iter_no_change, shuffle=shuffle, verbose=verbose,
        loss=loss, n_jobs=n_jobs, random_state=random_state,
        warm_start=warm_start, class_weight=class_weight, average=average)
    model.fit(train[0], train[1])
    model_name = 'PassiveAggressiveClassifier'
    y_hat = model.predict(test[0])
    if metrics == 'f1_score':
        accuracy = f1_score(test[1], y_hat)
    elif metrics == 'jaccard_score':
        accuracy = jaccard_score(test[1], y_hat)
    elif metrics == 'accuracy_score':
        accuracy = accuracy_score(test[1], y_hat)
    else:
        # Original fell through with `accuracy` unbound, raising a
        # confusing NameError; fail fast with a clear message instead.
        raise ValueError(f"unknown metrics: {metrics!r}")
    if x_predict is None:
        return (model_name, accuracy, None)
    y_predict = model.predict(x_predict)
    return (model_name, accuracy, y_predict)
def train(tfidf_train, y_train, tfidf_test):
    """Fit a Passive-Aggressive classifier on the TF-IDF training set and
    return its predictions for the TF-IDF test set."""
    model = PassiveAggressiveClassifier(max_iter=50)
    model.fit(tfidf_train, y_train)
    return model.predict(tfidf_test)
def constructPickles(filename):
    """Train a fake-news classifier from `filename` and pickle both the
    fitted model ("testPickle") and vectorizer ("testPickleVector").
    """
    dataDF = pd.read_csv(filename)
    labels = dataDF.label
    # DataFlair - Split the dataset
    x_train, x_test, y_train, y_test = train_test_split(
        dataDF['text'], labels, test_size=0.2, random_state=7)
    # DataFlair - Initialize a TfidfVectorizer
    tfidf_vectorizer = TfidfVectorizer(stop_words='english', max_df=0.7)
    # DataFlair - Fit and transform train set, transform test set
    tfidf_train = tfidf_vectorizer.fit_transform(x_train)
    tfidf_test = tfidf_vectorizer.transform(x_test)
    # DataFlair - Initialize a PassiveAggressiveClassifier
    pac = PassiveAggressiveClassifier(max_iter=50)
    pac.fit(tfidf_train, y_train)
    # Evaluate the freshly-trained model.  (Original bug: it re-loaded
    # "testPickle"/"testPickleVector" *before* ever dumping them, so the
    # first run crashed and later runs evaluated a stale model.)
    y_pred = pac.predict(tfidf_test)
    score = accuracy_score(y_test, y_pred)
    print(f'Accuracy: {round(score * 100, 2)}%')
    joblib.dump(pac, "testPickle")
    joblib.dump(tfidf_vectorizer, "testPickleVector")
def Predict():
    """Flask view: classify the submitted text and render Predict.html."""
    # Incoming user text from the form.
    user_input = request.form['text']
    # Load the training dataset.
    df = pd.read_csv('train.csv')
    # Map the numeric labels to text labels.
    conversion_dict = {0: 'HQ', 1: 'LQ_EDIT', 2: 'LQ_CLOSE'}
    df['Body'] = df['Y'].replace(conversion_dict)
    # print(df.label.value_counts())
    # Train/test split.
    x_train, x_test, y_train, y_test = train_test_split(
        df['Body'], df['Y'], test_size=0.25, random_state=7, shuffle=True)
    tfidf_vectorizer = TfidfVectorizer(stop_words='english', max_df=0.75)
    # Convert text to numeric TF-IDF features.
    vec_train = tfidf_vectorizer.fit_transform(x_train.values.astype('U'))
    # Train the model.  (Removed: an unused `vec_test` transform and a
    # MultinomialNB that was fit but never used for prediction.)
    pac = PassiveAggressiveClassifier(max_iter=50)
    pac.fit(vec_train, y_train)
    # Predict on the user's input.
    user_input_tranform = tfidf_vectorizer.transform([user_input])
    y_predict = pac.predict(user_input_tranform)
    return render_template("Predict.html", text=user_input, predict=y_predict)
def model_PassiveAggressive(train_x, train_y, test_x, test_y, n_est=100):
    """Fit a Passive-Aggressive classifier; return (score, MAE, predictions, model).

    `n_est` is accepted for signature compatibility with the sibling model
    helpers but is not used by this estimator.
    """
    clf = PassiveAggressiveClassifier()
    clf.fit(train_x, train_y)
    test_score = clf.score(test_x, test_y)
    preds = clf.predict(test_x)
    err = mean_absolute_error(test_y, preds)
    return (test_score, err, preds, clf)
def PassiveAggressiveClassifier_1(train_predictors, test_predictors, train_target, test_target):
    """Fit a Passive-Aggressive classifier; print and return (accuracy, predictions)."""
    clf = PassiveAggressiveClassifier()
    clf.fit(train_predictors, train_target)
    predicted = clf.predict(test_predictors)
    accuracy = accuracy_score(test_target, predicted)
    # Python 3 print function — the original used a Python 2 print
    # statement, a SyntaxError in this otherwise Python 3 file.
    print("Accuracy for Linear Model PassiveAggressiveClassifier: " + str(accuracy))
    return accuracy, predicted
def passiveAgressive(train_x, train_y, test_x):
    from sklearn.linear_model import PassiveAggressiveClassifier

    # Fit a Passive-Aggressive classifier and predict labels for test_x.
    clf = PassiveAggressiveClassifier()
    clf.fit(train_x, train_y)
    return clf.predict(test_x)
def get_delay():
    """Flask handler: classify a submitted article and return an HTML page
    with the prediction and two related search-result links.

    NOTE(review): relies on module-level globals `tfidf_vectorizer`,
    `tfidf_train` and `y_train`, and re-trains the classifier on every
    request — consider training once at startup.
    """
    result = request.form
    query_title = result['title']
    query_text = result['maintext']
    # print(query_text)
    query = get_all_query(query_title, query_text)
    # query = remove_punctuation_stopwords_lemma(query_text)
    # print(query)
    # user_input = {'query':query}
    toSearch = query_title
    query_text = [query_text]
    query_title = [query_title]
    tfidf_test_input = tfidf_vectorizer.transform(query)
    linear_clf = PassiveAggressiveClassifier()
    linear_clf.fit(tfidf_train, y_train)
    pred = linear_clf.predict(tfidf_test_input)
    print(pred)
    try:
        from googlesearch import search
    except ImportError:
        print("No module named 'google' found")
    # to search
    links = []
    for j in search(toSearch, tld="co.in", num=10, stop=10, pause=2):
        links.append(j)
    return f'<html><body><h1>{pred[0]}</h1> <a href={links[0]}> Article 1 </a><br> <a href={links[1]}> Article 2 </a> <form action="/"> <button type="submit">back </button> </form></body></html>'
def train_and_predict_m7 (train, test, labels) :
    """Model 7: stemmed text -> word TF-IDF (1-5 grams) -> Passive-Aggressive.

    NOTE(review): depends on module-level `stemmerEnableM7`, `ML_STOP_WORDS`,
    `randomState`, `gridSearch` and helpers `stemmer_clean` /
    `perform_grid_search` defined elsewhere in this project.
    NOTE(review): `n_iter` was renamed `max_iter` in scikit-learn >= 0.21 —
    this code targets an older sklearn.
    """
    ## Apply basic concatenation + stemming
    trainData, testData = stemmer_clean (train, test, stemmerEnableM7, stemmer_type = 'snowball')
    ## TF-IDF transform with sub-linear TF and stop-word removal
    tfv = TfidfVectorizer(min_df = 5, max_features = None, strip_accents = 'unicode', analyzer = 'word', token_pattern = r'\w{1,}', ngram_range = (1, 5), smooth_idf = 1, sublinear_tf = 1, stop_words = ML_STOP_WORDS)
    tfv.fit(trainData)
    X = tfv.transform(trainData)
    X_test = tfv.transform(testData)
    ## Create the classifier
    print ("Fitting Passive-Aggressive Classifer...")
    clf = PassiveAggressiveClassifier(random_state = randomState, loss = 'squared_hinge', n_iter = 100, C = 0.01)
    ## Create a parameter grid to search for best parameters for everything in the pipeline
    # Note: minkowski with p > 2 does not work for sparse matrices
    param_grid = {'C' : [0.003, 0.01, 0.03, 0.1], 'loss': ['hinge', 'squared_hinge'], 'n_iter': [5, 10, 30, 100, 300]}
    #param_grid = {'C' : [0.003, 0.01, 0.03, 0.1, 0.3, 1], 'loss': ['hinge'], 'n_iter': [5, 10, 30, 100, 300, 1000]}
    ## Predict model with best parameters optimized for quadratic_weighted_kappa
    if (gridSearch) :
        model = perform_grid_search (clf, param_grid, X, labels)
        pred = model.predict(X_test)
    else :
        clf.fit(X, labels)
        pred = clf.predict(X_test)
    return pred
def test_main(self):
    """Smoke test: build a term-document matrix (GPE entities censored),
    fit a Passive-Aggressive classifier on L1-normalized TF-IDF features,
    and classify a document containing an out-of-vocabulary token.

    NOTE(review): `n_iter` is a pre-0.21 scikit-learn parameter.
    """
    categories, documents = get_docs_categories()
    clean_function = lambda text: '' if text.startswith('[') else text
    entity_types = set(['GPE'])
    term_doc_mat = (TermDocMatrixFactory(
        category_text_iter=zip(categories, documents),
        clean_function=clean_function,
        nlp=_testing_nlp,
        feats_from_spacy_doc=FeatsFromSpacyDoc(
            entity_types_to_censor=entity_types)).build())
    clf = PassiveAggressiveClassifier(n_iter=5, C=0.5, n_jobs=-1, random_state=0)
    # Feature extractor that maps a new doc into the trained term index.
    fdc = FeatsFromDoc(
        term_doc_mat._term_idx_store,
        clean_function=clean_function,
        feats_from_spacy_doc=FeatsFromSpacyDoc(
            entity_types_to_censor=entity_types)).set_nlp(_testing_nlp)
    tfidf = TfidfTransformer(norm='l1')
    X = tfidf.fit_transform(term_doc_mat._X)
    clf.fit(X, term_doc_mat._y)
    X_to_predict = fdc.feats_from_doc('Did sometimes march UNKNOWNWORD')
    pred = clf.predict(tfidf.transform(X_to_predict))
    dec = clf.decision_function(X_to_predict)
class my_model:
    """Count-vectorizer + TF-IDF + Passive-Aggressive text classifier."""

    def __init__(self):
        # defines the self function used in fit and predict.  The TF-IDF
        # transformer is created once here so fit() and predict() share the
        # same fitted state.
        self.preprocessor = CountVectorizer(stop_words='english')
        self.tfidf = TfidfTransformer(norm='l2', use_idf=False,
                                      smooth_idf=False, sublinear_tf=True)
        self.clf = PassiveAggressiveClassifier(C=0.1, fit_intercept=True,
                                               n_iter_no_change=10,
                                               validation_fraction=0.8)

    def fit(self, X, y):  # do not exceed 29 mins
        X_df = get_train_test_df(X)
        XX = self.preprocessor.fit_transform(X_df)
        X_final = self.tfidf.fit_transform(XX)
        self.clf.fit(X_final, y)
        return

    def predict(self, X):
        # Apply the SAME fitted preprocessing as in fit().  (Original bug:
        # it called fit_transform() on a brand-new TfidfTransformer at
        # predict time, re-fitting normalization on the test data.)
        X_df = get_train_test_df(X)
        XX = self.preprocessor.transform(X_df)
        X_final = self.tfidf.transform(XX)
        predictionsOfModel = self.clf.predict(X_final)
        return predictionsOfModel
def get_delay():
    """Flask handler: classify a submitted article, collect related search
    links, and render result.html.

    NOTE(review): relies on module-level globals `tfidf_vectorizer`,
    `tfidf_train` and `y_train`, and re-trains the classifier on every
    request — consider training once at startup.
    """
    result = request.form
    query_title = result['title']
    query_text = result['maintext']
    # print(query_text)
    query = get_all_query(query_title, query_text)
    # query = remove_punctuation_stopwords_lemma(query_text)
    # print(query)
    # user_input = {'query':query}
    toSearch = query_title
    query_text = [query_text]
    query_title = [query_title]
    tfidf_test_input = tfidf_vectorizer.transform(query)
    linear_clf = PassiveAggressiveClassifier()
    linear_clf.fit(tfidf_train, y_train)
    pred = linear_clf.predict(tfidf_test_input)
    print(pred)
    try:
        from googlesearch import search
    except ImportError:
        print("No module named 'google' found")
    # to search
    links = []
    for j in search(toSearch, tld="co.in", num=10, stop=10, pause=2):
        links.append(j)
    # <style>body{text-align: center;font-family: Arial, Helvetica, sans-serif;}</style>
    # return f'<html><style>body{text-align: center;font-family: Arial, Helvetica, sans-serif;}</style><body><h1>It is a {pred[0]} news. </h1><h2>You may refer to the following articles for more details.</h2> <a href={links[0]}> Article 1 </a><br> <a href={links[1]}> Article 2 </a> <form action="/"> <button type="submit">back </button> </form></body></html>'
    return render_template('result.html', links=links, pred=pred[0])
def training():
    """Train, evaluate, and pickle a Passive-Aggressive fake-news model."""
    features, targets = get_data()
    X_tr, X_te, y_tr, y_te = train_test_split(features, targets,
                                              test_size=0.2, random_state=7)
    vec_train, vec_test = Vectorize(X_tr, X_te)
    model = PassiveAggressiveClassifier(C=0.5, random_state=5)
    model.fit(vec_train, y_tr)
    # Report score twice: once via the estimator, once via accuracy_score.
    print(model.score(vec_test, y_te))
    predictions = model.predict(vec_test)
    print(accuracy_score(y_te, predictions))
    print(confusion_matrix(y_te, predictions, labels=['FAKE', 'REAL']))
    print(classification_report(y_te, predictions))
    makePickleFile(model)
class PassiveAggressiveModel(BaseModel):
    """BaseModel wrapper around sklearn's PassiveAggressiveClassifier."""

    def __init__(self, cached_features):
        BaseModel.__init__(self, cached_features)
        # Squared-hinge loss (PA-II style) with default regularization.
        self.model = PassiveAggressiveClassifier(loss='squared_hinge',
                                                 C=1.0, random_state=1)

    def _predict_internal(self, X_test):
        # Delegate directly to the fitted sklearn estimator.
        return self.model.predict(X_test)
def paclassifier(train_X, train_Y, test_X):
    """Train a Passive-Aggressive classifier and predict labels for test_X."""
    print("Training model.....")
    # Fixed seed for reproducible training.
    model = PassiveAggressiveClassifier(random_state=0)
    model.fit(train_X, train_Y)
    return model.predict(test_X)
def passive_aggressive(sample_data, test_percentage):
    """Train a Passive-Aggressive classifier and return its accuracy in percent."""
    X_train, X_test, y_train, y_test = data_models.split_test_train_data(
        sample_data, test_percentage)
    model = PassiveAggressiveClassifier()
    model.fit(X_train, y_train)
    predictions = model.predict(X_test)
    correct = (y_test == predictions).sum()
    return correct * 100 / len(y_test)
def train_passve_aggresive_classifier(self, tfidf_train, b_train, tfidf_test, b_test):
    """Train a Passive-Aggressive classifier, print its test accuracy, and
    persist it via self.save_model()."""
    model = PassiveAggressiveClassifier(max_iter=60)
    model.fit(tfidf_train, b_train)
    predictions = model.predict(tfidf_test)
    factcheckscore = accuracy_score(b_test, predictions)
    print(f"Accuracy Is {round(factcheckscore*100,2)}%")
    return self.save_model(model)
def paClassify(X, Y, Xt, Yt, class_weight):
    """Fit a Passive-Aggressive classifier on (X, Y) and report accuracy on (Xt, Yt)."""
    title = "Passive Aggressive Classifier"
    # NOTE(review): `n_iter` was renamed `max_iter` in scikit-learn >= 0.21.
    model = PassiveAggressiveClassifier(n_iter=10, class_weight=class_weight)
    model.fit(X, Y)
    predictions = model.predict(Xt)
    printAccuracy(predictions, Yt, title)
def classify():
    """Train a Passive-Aggressive classifier on the module-level TF-IDF
    sets, print test accuracy, and return the predictions."""
    # Initialize a PassiveAggressiveClassifier
    pac = PassiveAggressiveClassifier(max_iter=50)
    pac.fit(tfidf_train, y_train)
    # Predict on the test set and calculate accuracy
    y_pred_data = pac.predict(tfidf_test)
    # Original bug: it scored `y_pred` (undefined in this scope) instead of
    # the freshly-computed `y_pred_data`.
    score = accuracy_score(y_test, y_pred_data)
    print(f'Accuracy: {round(score * 100, 2)}%')
    return y_pred_data
class BinaryClassifier(object):
    """Binary classifier wrapper supporting LR / Passive-Aggressive / SVM,
    with optional feature scaling by a training-set std estimate."""

    def __init__(self, classifier_type, scale_features=True):
        self.scale_features = scale_features
        self.classifier_type = classifier_type
        self.clear()
        # Std of the training features; 0 means "not yet computed".
        self.train_std = 0
        # Fixed seed so instance shuffling in train() is reproducible.
        self.random_gen = np.random.RandomState(136543785)

    def clear(self, remember_train_std_if_supported=False):
        """Reset accumulated instances and rebuild the underlying model."""
        self.positive_instances = []
        self.negative_instances = []
        # self.classifier = svm.SVC(kernel='linear')
        if self.classifier_type == CLASSIFIER_TYPE.LR:
            #print (CLASSIFIER_TYPE.LR)
            self.classifier = LogisticRegression(C=1.0)
        elif self.classifier_type == CLASSIFIER_TYPE.PA:
            #print (CLASSIFIER_TYPE.PA)
            self.classifier = PassiveAggressiveClassifier(loss='hinge', C=1.0)
        elif self.classifier_type == CLASSIFIER_TYPE.SVM:
            self.classifier = svm.SVC(kernel='linear')
        if not remember_train_std_if_supported:
            self.train_std = 0

    def add_positive_instances(self, positive_instances):
        self.positive_instances.extend(positive_instances)

    def add_negative_instances(self, negative_instances):
        self.negative_instances.extend(negative_instances)

    def train(self):
        """Fit on all accumulated instances (positives = 1, negatives = 0)."""
        X = self.positive_instances + self.negative_instances
        y = np.asarray([1] * len(self.positive_instances) + [0] * len(self.negative_instances))
        # shuffling the train instances in case classifier is sensitive to this order
        Xy = list(zip(X, y))
        self.random_gen.shuffle(Xy)
        X[:], y[:] = zip(*Xy)
        X = mat_concat(X)
        if self.scale_features:
            if self.train_std == 0:
                # Std via E[x^2] - E[x]^2; computed once unless clear() resets it.
                self.train_std = (pointwise_mult(X, X).mean() - X.mean()**2)**0.5
            X = X / self.train_std
            # X = X/self.train_std
        self.classifier.fit(X, y)

    def predict(self, instances):
        """Predict labels; applies the remembered training scale if enabled."""
        # scaled_instances = [inst/self.train_std for inst in instances]
        instances = mat_concat(instances)
        if self.scale_features and self.train_std > 0:
            instances = instances / self.train_std
        return self.classifier.predict(instances)
def reanalyze(article_text):
    """Retrain the fake-news model and classify `article_text`.

    Returns a human-readable verdict string; also pickles the fitted
    model and vectorizer via save_pickle().
    """
    df = pd.read_csv(train_file)
    # Change the numeric labels to text labels.
    df.loc[(df['label'] == 1), ['label']] = 'FAKE'
    df.loc[(df['label'] == 0), ['label']] = 'REAL'
    labels = df.label
    x_train, x_test, y_train, y_test = train_test_split(
        df['text'], labels, test_size=0.27, random_state=7, shuffle=True)
    # Initialize a TfidfVectorizer, vectorize the text
    tfidf_vectorizer = TfidfVectorizer(stop_words='english', max_df=0.7)
    # Fit and transform train set, transform test set
    tfidf_train = tfidf_vectorizer.fit_transform(x_train.values.astype('U'))
    tfidf_test = tfidf_vectorizer.transform(x_test.values.astype('U'))
    # Initialize a PassiveAggressiveClassifier and fit training sets
    pac = PassiveAggressiveClassifier(max_iter=50)
    pac.fit(tfidf_train, y_train)
    # Predict on the test set and calculate accuracy
    y_pred = pac.predict(tfidf_test)
    score = accuracy_score(y_test, y_pred)
    vec_new = tfidf_vectorizer.transform([article_text])
    y_pred_new = pac.predict(vec_new)
    rounded_score = round(score * 100, 2)
    # confusion_matrix(y_test, y_pred, labels=['FAKE', 'REAL'])
    print(f'Accuracy: {rounded_score}')
    result = 'reliable content' if y_pred_new[0] == 'REAL' else 'unreliable content'
    # Renamed from `str`, which shadowed the builtin.
    message = f'Brrrrr, calculating... There is a good chance that this is **{result}**.'
    save_pickle(pac, tfidf_vectorizer)
    return message
def train_model():
    """Train a fake-news classifier and classify a sample article.

    Fits a Passive-Aggressive classifier on train.csv, reports its
    held-out accuracy, and predicts a label for the extracted article.
    """
    # just using dummy data from a text
    article = extract("/home/david/2019-ca400-taland2/src/dataset/test.txt")
    dftrain = pd.read_csv(
        '/home/david/2019-ca400-taland2/src/dataset/train.csv')
    # drops rows that have null values
    dftrain = dftrain.dropna()
    # Set column names to variables
    df_x = dftrain['text']
    df_y = dftrain['label']
    # split training data
    x_train, x_test, y_train, y_test = train_test_split(
        df_x, df_y, test_size=0.33, random_state=53)
    tfv = TfidfVectorizer(stop_words='english', max_df=0.7, max_features=1000)
    x_traintf = tfv.fit_transform(x_train)
    article_testtf = tfv.transform(article)
    tfv_test = tfv.transform(x_test)
    pac = PassiveAggressiveClassifier(n_iter_no_change=5, max_iter=10,
                                      early_stopping=True)
    pac.fit(x_traintf, y_train)
    # Accuracy must be measured on the held-out split.  (Original bug: it
    # compared y_test against the single-article prediction, whose length
    # does not match, and the printout was labeled "MultinomialNB".)
    test_pred = pac.predict(tfv_test)
    accuracy = metrics.accuracy_score(y_test, test_pred)
    # Classify the sample article separately.
    article_pred = pac.predict(article_testtf)
    print("PassiveAggressiveClassifier accuracy: %0.3f" % accuracy)
def test_passive_aggressive_2():
    """Ensure that the TPOT PassiveAggressiveClassifier outputs the same as
    the sklearn classifier when C == 0.0

    NOTE(review): the reference model uses C=0.0001 — presumably TPOT
    clamps C=0.0 up to that minimum; confirm against the TPOT source.
    """
    tpot_obj = TPOT()
    result = tpot_obj._passive_aggressive(training_testing_data, 0.0, 0)
    # Compare only on the held-out ('testing') rows.
    result = result[result['group'] == 'testing']
    pagg = PassiveAggressiveClassifier(C=0.0001, loss='hinge', fit_intercept=True, random_state=42)
    pagg.fit(training_features, training_classes)
    assert np.array_equal(result['guess'].values, pagg.predict(testing_features))
def test_class_weights():
    """Class weights should rotate the decision boundary."""
    X2 = np.array([[-1.0, -1.0], [-1.0, 0], [-.8, -1.0],
                   [1.0, 1.0], [1.0, 0.0]])
    y2 = [1, 1, 1, -1, -1]

    # Unweighted baseline: the probe point is predicted as class 1.
    clf = PassiveAggressiveClassifier(C=0.1, max_iter=100,
                                      class_weight=None, random_state=100)
    clf.fit(X2, y2)
    assert_array_equal(clf.predict([[0.2, -1.0]]), np.array([1]))

    # Heavily down-weighting class 1 rotates the hyperplane clock-wise,
    # flipping the prediction at the same probe point.
    clf = PassiveAggressiveClassifier(C=0.1, max_iter=100,
                                      class_weight={1: 0.001},
                                      random_state=100)
    clf.fit(X2, y2)
    assert_array_equal(clf.predict([[0.2, -1.0]]), np.array([-1]))
def dive2(X_train, X_test, y_train, y_test, df, data, print_report=False, print_cm=False,
          print_top10=False, feature_names=False, target_names=False):
    """Fit a Passive-Aggressive classifier, score it, and attach its
    predictions on `data` to df['Integ_Issue_Prediction'].

    Returns (df, clf, accuracy, test predictions, recall, precision).
    The print_*/feature_names/target_names flags are kept for signature
    compatibility and are currently unused.
    """
    # clf = KNeighborsClassifier(n_neighbors=10)
    # clf = RandomForestClassifier(n_estimators=100)
    # NOTE(review): `n_iter` was renamed `max_iter` in scikit-learn >= 0.21.
    clf = PassiveAggressiveClassifier(n_iter=50, random_state=1)
    clf.fit(X_train, y_train)
    pred = clf.predict(X_test)
    score = metrics.accuracy_score(y_test, pred)
    recall = metrics.recall_score(y_test, pred)
    pre_score = metrics.precision_score(y_test, pred)
    print(df.columns)
    # df['Integ_Issue_Probability'] = clf.predict_proba(data)[:,1]
    df['Integ_Issue_Prediction'] = clf.predict(data)
    # Removed dead local `free_text_columns` — it was only consumed by
    # commented-out cleanup code.
    return df, clf, score, pred, recall, pre_score
def mainworker(limit1,limit2):
    """K-fold evaluation (k in 2..5) of a Passive-Aggressive classifier on
    "pdata.txt" (CSV of ints; the last column of each row is the class).

    Python 2 code (xrange, print statements) — will not run under Python 3.
    Columns limit1..limit2 (inclusive) are dropped from every feature row.
    """
    N=10
    l=[]
    w1=[] # +1 class
    w2=[] #-1 class
    temp=[]
    classlist=[]
    f=open("pdata.txt")
    for line in f:
        x=(line.strip("\n")).split(",")
        temp=[]
        for i in xrange(len(x)):
            x[i]=int(x[i])
            temp.append(x[i])
        # Last value on the row is the class label.
        clas=temp.pop()
        # Drop the feature columns in [limit1, limit2].
        temp=temp[:limit1]+temp[limit2+1:]
        l.append(temp)
        classlist.append(clas)
        """if(temp[-1]==-1): w2.append(temp) else: w1.append(temp)"""
    f.close()
    X=np.array(l)
    y=np.array(classlist)
    X=np.array(l)
    y=np.array(classlist)
    karray=[2,3,4,5]
    for k in karray:
        # NOTE(review): 11054 is presumably the dataset row count — confirm.
        kf = cross_validation.KFold(11054, n_folds=k)
        averager=[]
        for train_index,test_index in kf:
            #print("TRAIN:", train_index, "TEST:", test_index)
            X_train, X_test = X[train_index], X[test_index]
            y_train, y_test = y[train_index], y[test_index]
            #print X_train, len(X_test), len(y_train), len(y_test)
            train_data=[]
            test_data=[]
            train_label=[]
            test_label=[]
            X1 = X_train #train_data
            Y1 = y_train #train_label
            clf = PassiveAggressiveClassifier()
            #clf = svm.SVC(kernel='linear')
            clf.fit(X1,Y1)
            Z = X_test #test_data
            predicted = clf.predict(Z)
            accuracy = getAccuracy(predicted, y_test) #test_label)
            averager.append(accuracy)
        answer=np.mean(averager)
        print "The mean for",k,"fold is:"
        print answer
class FakeNews:
    """Fake-news classification workflow over a pandas DataFrame."""

    def __init__(self, db):
        self.db = db
        self.x_train = None
        self.x_test = None
        self.y_train = None
        self.y_test = None
        self.classifier = None
        self.tfidf_vectorizer = None

    def showData(self):
        # Quick peek at the dataset dimensions and first rows.
        print(self.db.shape)
        print(self.db.head())

    def splitData(self, testsize):
        # Deterministic (fixed-seed) split into train and test parts.
        split = train_test_split(self.db.text, self.db.label,
                                 test_size=testsize, random_state=7)
        self.x_train, self.x_test, self.y_train, self.y_test = split

    def solve(self):
        """Vectorize, train the classifier, and print evaluation metrics."""
        self.tfidf_vectorizer = TfidfVectorizer(stop_words='english', max_df=0.7)
        train_vecs = self.tfidf_vectorizer.fit_transform(self.x_train)
        test_vecs = self.tfidf_vectorizer.transform(self.x_test)
        self.classifier = PassiveAggressiveClassifier(max_iter=50)
        self.classifier.fit(train_vecs, self.y_train)
        predictions = self.classifier.predict(test_vecs)
        score = accuracy_score(self.y_test, predictions)
        print('Accuracy of the solved model: {} %'.format(round(
            100 * score, 2)))
        print(confusion_matrix(self.y_test, predictions, labels=['FAKE', 'REAL']))

    def predict(self, news):
        """Classify a single news string."""
        return self.classifier.predict(self.tfidf_vectorizer.transform([news]))

    @classmethod
    def loadData(cls, data):
        """Alternate constructor: build a FakeNews from a CSV path."""
        return cls(db=pd.read_csv(data))
def main():
    """Train and pickle the model/tokenizer."""
    # Load the data and pull out the target labels.
    df = pd.read_csv("./data/data.csv")
    labels = df.label
    # Split into training and testing sets.
    x_train, x_test, y_train, y_test = train_test_split(
        df["text"], labels, test_size=0.3, random_state=7
    )
    # TF-IDF features: English stop words removed, max doc freq of 0.7.
    tfidf_vec = TfidfVectorizer(stop_words="english", max_df=0.7)
    tfidf_train = tfidf_vec.fit_transform(x_train)
    tfidf_test = tfidf_vec.transform(x_test)
    # Fit the Passive-Aggressive classifier.
    pac = PassiveAggressiveClassifier(max_iter=50)
    pac.fit(tfidf_train, y_train)
    # Predict on the test set and report metrics.
    y_pred = pac.predict(tfidf_test)
    score = accuracy_score(y_test, y_pred)
    print(f"Done training model.\n\nAccuracy: {round(score*100, 2)}%")
    print(
        f"\nClassification Report:\n\n{classification_report(y_test, y_pred)}")
    # Pickle the classifier and vectorizer.  (Original leaked the file
    # handles from bare open() calls; `with` guarantees they are closed.)
    with open("./fake-news-app/pac.pickle", "wb") as fh:
        pickle.dump(pac, fh)
    with open("./fake-news-app/tfidf-vectorizer.pickle", "wb") as fh:
        pickle.dump(tfidf_vec, fh)
    # Save testing results.
    test_df = pd.DataFrame({
        "label": y_test,
        "prediction": y_pred
    })
    test_df.to_csv("./data/model-test-results.csv", index=False)
def pac(x, y, x_t, y_t, y_pred):
    """Train one Passive-Aggressive classifier per slot (48 slots), fill
    y_pred[i] with each slot's predictions, and return the mean score over
    the slots that trained successfully (0.0 if none did).
    """
    score = 0
    t = 0
    for i in range(48):
        classifier = PassiveAggressiveClassifier(max_iter=len(x[i]))
        try:
            classifier.fit(np.array(x[i]), np.array(y[i]))
            y_pred[i] = classifier.predict(x_t[i])
            score += classifier.score(x_t[i], y_t[i])
            t += 1
        except Exception:
            # Narrowed from a bare `except:` (which also swallowed
            # KeyboardInterrupt/SystemExit); keeps the best-effort behavior.
            print('error in ' + str(i))
            y_pred[i] = np.zeros(17)
            continue
    # Guard against ZeroDivisionError when every slot failed.
    return score / t if t else 0.0
def Passive_Aggressive(X_train, Y_train, X_test, Y_test):
    ###################### Passive Aggressive ###########################--Code from ASTD
    """Train a Passive-Aggressive classifier and return accuracy in percent.

    NOTE(review): `n_iter` was renamed `max_iter` in scikit-learn >= 0.21;
    this code targets an older sklearn.
    """
    classifier = PassiveAggressiveClassifier(n_iter=100)
    classifier.fit(X_train, Y_train)
    # Predicting the Test set results
    y_pred = classifier.predict(X_test)
    # Making the Confusion Matrix
    from sklearn.metrics import confusion_matrix
    cm = confusion_matrix(Y_test, y_pred)
    # The diagonal of the confusion matrix counts correct predictions;
    # np.trace generalizes the original hard-coded 2- and 3-class cases to
    # any number of classes (identical results for 2 and 3 classes).
    total_correct_predictions = np.trace(cm)
    total_predictions_made = np.sum(cm)
    accuracy = total_correct_predictions / total_predictions_made * 100
    return accuracy
class PassiveAggressiveClassifierImpl:
    """Thin wrapper delegating fit/predict/decision_function to `Op`."""

    def __init__(self, **hyperparams):
        # Keep the raw hyperparameters and build the underlying estimator.
        self._hyperparams = hyperparams
        self._wrapped_model = Op(**self._hyperparams)

    def fit(self, X, y=None):
        # Forward to the wrapped estimator, with or without labels.
        if y is None:
            self._wrapped_model.fit(X)
        else:
            self._wrapped_model.fit(X, y)
        return self

    def predict(self, X):
        return self._wrapped_model.predict(X)

    def decision_function(self, X):
        return self._wrapped_model.decision_function(X)
class PassiveAgressiveClassifier(Classifier):
    """Lazy-fitting Passive-Aggressive recipe classifier (trains on first
    classify() call)."""

    def __init__(self, matrixdatabase):
        self._matrix_database = matrixdatabase
        self._has_fit = False
        self._occ = OCC(C=0.0083, n_iter=27, loss="hinge")

    def learn(self, ingredients, cuisine):
        # Training is deferred to classify(); nothing to do per sample.
        return

    def classify(self, ingredients):
        if not self._has_fit:
            matrix, classes = self._matrix_database.make_train_matrix()
            self._occ = self._occ.fit(matrix, classes)
            # print() function instead of the Python 2 print statement,
            # matching the Python 3 duplicate of this class in the file.
            print("Fitting complete...")
            self._has_fit = True
        output = self._occ.predict(self._matrix_database.make_row_from_recipe(ingredients))
        return output[0]
class PassiveAgressiveClassifier(Classifier):
    """Passive-Aggressive recipe classifier that fits lazily on first use."""

    def __init__(self, matrixdatabase):
        self._matrix_database = matrixdatabase
        self._has_fit = False
        self._occ = OCC(C=0.0083, n_iter=27, loss='hinge')

    def learn(self, ingredients, cuisine):
        # No incremental learning; the full matrix is fit in classify().
        return

    def classify(self, ingredients):
        # Fit once, on demand, from the accumulated training matrix.
        if not self._has_fit:
            matrix, classes = self._matrix_database.make_train_matrix()
            self._occ = self._occ.fit(matrix, classes)
            print('Fitting complete...')
            self._has_fit = True
        row = self._matrix_database.make_row_from_recipe(ingredients)
        return self._occ.predict(row)[0]
def main():
    """HackerRank-style document classifier (Python 2: raw_input, print stmt).

    Reads labeled training docs from trainingdata.txt, vectorizes with
    TF-IDF, fits a Passive-Aggressive classifier, then classifies docs
    read from stdin and prints one predicted label per line.
    """
    #stemmer = SnowballStemmer('english')
    #stemmer = EnglishStemmer()
    training_data=open('trainingdata.txt', 'rU')
    n = int(training_data.readline().strip())
    train_data = []
    class_data = []
    for i in range(n):
        # Each line: a single-digit label followed by the document text.
        line = training_data.readline().strip()
        train_data.append(line[1:].strip())
        class_data.append(int(line[0]))
    train_data = np.array(train_data)
    class_data = np.array(class_data)
    # 2) Vectorize bag of words
    vectorizer = TfidfVectorizer(stop_words="english",
                                 max_df=0.5,
                                 sublinear_tf=True
                                 )
    vectorizer.fit(train_data)
    X_train = vectorizer.transform(train_data)
    # Read test data from input
    X_test = np.array([raw_input().strip() for i in range(int(raw_input().strip()))])
    X_test = vectorizer.transform(X_test)
    clf = PassiveAggressiveClassifier(n_iter=9)
    clf.fit(X_train, class_data)
    pred = clf.predict(X_test)
    for i in pred:
        print i
def test_main(self):
    """Duplicate of the earlier test_main smoke test: term-document matrix
    (GPE entities censored) + L1 TF-IDF + PassiveAggressiveClassifier,
    then predict a document containing an unknown word.

    NOTE(review): `n_iter` is a pre-0.21 scikit-learn parameter.
    """
    categories, documents = get_docs_categories()
    clean_function = lambda text: '' if text.startswith('[') else text
    entity_types = set(['GPE'])
    term_doc_mat = (
        TermDocMatrixFactory(
            category_text_iter=zip(categories, documents),
            clean_function=clean_function,
            nlp=_testing_nlp,
            feats_from_spacy_doc=FeatsFromSpacyDoc(entity_types_to_censor=entity_types)
        ).build()
    )
    clf = PassiveAggressiveClassifier(n_iter=5, C=0.5, n_jobs=-1, random_state=0)
    # Feature extractor that maps a new doc into the trained term index.
    fdc = FeatsFromDoc(term_doc_mat._term_idx_store,
                       clean_function=clean_function,
                       feats_from_spacy_doc=FeatsFromSpacyDoc(
                           entity_types_to_censor=entity_types)).set_nlp(_testing_nlp)
    tfidf = TfidfTransformer(norm='l1')
    X = tfidf.fit_transform(term_doc_mat._X)
    clf.fit(X, term_doc_mat._y)
    X_to_predict = fdc.feats_from_doc('Did sometimes march UNKNOWNWORD')
    pred = clf.predict(tfidf.transform(X_to_predict))
    dec = clf.decision_function(X_to_predict)
# Script fragment: compare MultinomialNB (count features) against a
# Passive-Aggressive classifier (TF-IDF features) on the FAKE/REAL task,
# then sweep MultinomialNB's alpha.  Relies on module-level globals
# count_train/count_test, tfidf_train/tfidf_test, y_train/y_test and
# plot_confusion_matrix.  NOTE(review): the trailing alpha-sweep `if`
# appears truncated in this chunk — its body lies outside this view.
clf = MultinomialNB()
clf.fit(count_train, y_train)
pred = clf.predict(count_test)
score = metrics.accuracy_score(y_test, pred)
print("accuracy: %0.3f" % score)
cm = metrics.confusion_matrix(y_test, pred, labels=['FAKE', 'REAL'])
plot_confusion_matrix(cm, classes=['FAKE', 'REAL'])
# Testing
from sklearn.linear_model import PassiveAggressiveClassifier
linear_clf = PassiveAggressiveClassifier(n_iter=50)
linear_clf.fit(tfidf_train, y_train)
pred = linear_clf.predict(tfidf_test)
score = metrics.accuracy_score(y_test, pred)
print("accuracy: %0.3f" % score)
cm = metrics.confusion_matrix(y_test, pred, labels=['FAKE', 'REAL'])
plot_confusion_matrix(cm, classes=['FAKE', 'REAL'])
clf = MultinomialNB(alpha=0.1)
last_score = 0
# Sweep alpha over [0, 1) keeping track of the best score so far.
for alpha in np.arange(0,1,.1):
    nb_classifier = MultinomialNB(alpha=alpha)
    nb_classifier.fit(tfidf_train, y_train)
    pred = nb_classifier.predict(tfidf_test)
    score = metrics.accuracy_score(y_test, pred)
    if score > last_score:
#https://www.hackerrank.com/challenges/document-classification/submissions/code/10577787 # Enter your code here. Read input from STDIN. Print output to STDOUT documents=[] target=[] cnt=0 from sklearn.linear_model import PassiveAggressiveClassifier with open("trainingdata.txt","rb") as infile: for line in infile: if cnt==0: cnt=1 continue category=int(line[0:2]) doc=line[2:] target.append(category) documents.append(doc) from sklearn.feature_extraction.text import TfidfVectorizer transformer = TfidfVectorizer(sublinear_tf=True, max_df=0.5, analyzer='word',stop_words='english') X = transformer.fit_transform(documents) from sklearn.naive_bayes import MultinomialNB clf = PassiveAggressiveClassifier(n_iter=50) clf.fit(X, target) n=int(raw_input()) for i in range(0,n): X=transformer.transform([raw_input()]) print(clf.predict(X))[0]
#print X_train_tfidf.shape ntest = input() testdoc = [] for t in range(0, ntest): doc = raw_input() testdoc.append(doc) X_new_counts = count_vect.transform(testdoc) X_new_tfidf = tfidf_transformer.transform(X_new_counts) """" #Naive bayes from sklearn.naive_bayes import MultinomialNB clf = MultinomialNB().fit(X_train_tfidf, trainlabel) predicted = clf.predict(X_new_tfidf) #test random forest from sklearn.ensemble import RandomForestClassifier clf = RandomForestClassifier(n_estimators=10) clf = clf.fit(X_train_tfidf, trainlabel) predicted = clf.predict(X_new_tfidf) """ from sklearn.linear_model import PassiveAggressiveClassifier clf = PassiveAggressiveClassifier(n_iter=50) clf = clf.fit(X_train_tfidf, trainlabel) predicted = clf.predict(X_new_tfidf) for t in range(0, ntest): print predicted[t]
def tweetread():
    """Train a TF-IDF + Passive-Aggressive classifier that separates
    traffic tweets (label 0) from non-traffic tweets (label 1), report
    metrics, pickle the model and vectorizer, then sweep a larger tweet
    sample and print the tweets predicted as traffic.

    NOTE(review): relies on module-level MongoDB collections
    (collection_aa, collection_mapped), show_most_informative_features,
    and the traffic/non-traffic label/score globals defined elsewhere
    in the file. The metrics below are computed on the *training* data,
    so they are optimistic.
    """
    data = []
    catagory = []  # keeps the original (misspelled) name used throughout

    # Label-0 corpus: tweets manually typed as traffic.
    results_traffic = collection_aa.find({"manualtype": {"$ne": "/^non*/"}})
    for i, item in enumerate(results_traffic):
        text = unicodedata.normalize('NFKD', item["text"]).encode('ascii', 'ignore').decode('utf-8')
        # Strip @mentions so the model cannot key on specific accounts.
        text = re.sub(r"@([A-Za-z]+[A-Za-z]+[A-Za-z0-9-_\.]+)", "", text)
        print(text)
        data.append(str(text))
        catagory.append(0)

    results_nontraffic = collection_mapped.find({"_id": {"$regex": "2014/04/18/09*"}})
    nontraffic = []
    data = data[:5000]
    catagory = catagory[:5000]
    #docs = [{f["text"]:"TRAFFIC"} for f in results_traffic]
    print(len(data), " TRAFFIC SIZE ")

    # Label-1 corpus: generic tweets, capped so the total stays at 10000.
    for res in results_nontraffic:
        for i in res["item"]:
            if len(data) < 10000:
                text = unicodedata.normalize('NFKD', i["text"]).encode('ascii', 'ignore').decode('utf-8')
                text = re.sub(r"@([A-Za-z]+[A-Za-z0-9-_\.]+)", "", text)
                print(text)
                data.append(text)
                catagory.append(1)
    print(len(data), "SAMPLE SIZE ")

    vectorizer = TfidfVectorizer(
        analyzer='word',            # features made of words
        token_pattern=r'[a-z]{3,}',
        use_idf=True,
        strip_accents='unicode',
        #ngram_range=(2,3),
        sublinear_tf=True,
        max_df=0.95, min_df=0.05, stop_words='english')
    X_train = vectorizer.fit_transform(data)
    X_test = vectorizer.transform(data)

    # FIX: get_feature_names() was removed in scikit-learn >= 1.2;
    # get_feature_names_out() is the replacement.
    feature_names = vectorizer.get_feature_names_out()
    print(feature_names)
    print(X_test)
    print(data[0])
    print(data[1])

    # FIX: n_iter -> max_iter (renamed in scikit-learn >= 0.21).
    nb_classifier = PassiveAggressiveClassifier(max_iter=50).fit(X_train, catagory)
    y_nb_predicted = nb_classifier.predict(X_test)
    print("Dimensionality: %d" % nb_classifier.coef_.shape[0])
    show_most_informative_features(vectorizer, nb_classifier, n=50)
    print("traffic :" + str(traffic_label))
    print("traffic score #:" + str(traffic_scores))
    print("non :" + str(nontraffic_label))
    print("non score #:" + str(nontraffic_scores))
    # FIX: the banner claimed "Multinomial Naive Bayes" while the fitted
    # model is a Passive-Aggressive classifier.
    print("MODEL: Passive Aggressive Classifier\n")
    print('The precision for this classifier is ' + str(metrics.precision_score(catagory, y_nb_predicted)))
    print('The recall for this classifier is ' + str(metrics.recall_score(catagory, y_nb_predicted)))
    print('The f1 for this classifier is ' + str(metrics.f1_score(catagory, y_nb_predicted)))
    print('The accuracy for this classifier is ' + str(metrics.accuracy_score(catagory, y_nb_predicted)))
    print('\nHere is the classification report:')
    print(classification_report(catagory, y_nb_predicted))
    print(metrics.confusion_matrix(catagory, y_nb_predicted, labels=[0, 1]))

    # Re-query a wider time range and keep a smaller labelled sample.
    results_nontraffic = collection_mapped.find({"_id": {"$regex": "2014/04/*"}})
    nontraffic = []
    data = data[:1000]
    catagory = catagory[:1000]
    print(len(data), " TRAFFIC SIZE ")

    # FIX: use context managers so the pickle files are always closed
    # (the originals were opened and closed only for the classifier).
    with open('classifier.pickle', 'wb') as f:
        pickle.dump(nb_classifier, f)
    with open('vector.pickle', 'wb') as v:
        pickle.dump(vectorizer, v)

    # Sweep the wider sample and print tweets predicted as traffic (0).
    for res in results_nontraffic:
        for item in res["item"]:
            text = unicodedata.normalize('NFKD', item["text"]).encode('ascii', 'ignore').decode('utf-8')
            X_test = vectorizer.transform([text])
            y_nb_predicted = nb_classifier.predict(X_test)
            if y_nb_predicted == 0:
                print("", text, "\\\\")
# TPOT-style exported pipeline: stratified split, fit a Passive-Aggressive
# classifier on the training rows, and append its predictions for every row.
import numpy as np
import pandas as pd
# FIX: sklearn.cross_validation was removed in scikit-learn 0.20;
# train_test_split now lives in sklearn.model_selection.
from sklearn.model_selection import train_test_split
from sklearn.linear_model import PassiveAggressiveClassifier

# NOTE: Make sure that the class is labeled 'class' in the data file
tpot_data = pd.read_csv('PATH/TO/DATA/FILE', delimiter='COLUMN_SEPARATOR')
# Stratified 75/25 split of the row index so class balance is preserved.
training_indices, testing_indices = train_test_split(
    tpot_data.index,
    stratify=tpot_data['class'].values,
    train_size=0.75,
    test_size=0.25)
result1 = tpot_data.copy()

# Perform classification with a passive aggressive classifier
pagr1 = PassiveAggressiveClassifier(C=0.81, loss="squared_hinge",
                                    fit_intercept=True, random_state=42)
pagr1.fit(result1.loc[training_indices].drop('class', axis=1).values,
          result1.loc[training_indices, 'class'].values)
# Predictions are stored for ALL rows (train and test alike).
result1['pagr1-classification'] = pagr1.predict(result1.drop('class', axis=1).values)
#clf = neighbors.KNeighborsClassifier(K, weights='distance', leaf_size=30)
# Classify each line of input01.txt with a Passive-Aggressive model and
# score the predictions against the expected labels in output01.txt.
# NOTE(review): trans, y, vectorizer and time are defined earlier in the file.
from sklearn.linear_model import PassiveAggressiveClassifier

# FIX: n_iter -> max_iter (the argument was renamed in scikit-learn >= 0.21).
clf = PassiveAggressiveClassifier(max_iter=50)
clf.fit(trans, y)

#f = open("testDatatextClassification.txt", 'r')
# FIX: context managers so the files are always closed; list(map(...)) is
# required under Python 3 because `ans` is indexed and measured below.
with open("input01.txt") as f, open("output01.txt") as f2:
    d = f.readlines()[1:]          # first line is the sample count
    ans = list(map(int, f2.readlines()))

# FIX: time.clock() was removed in Python 3.8; perf_counter() is the
# documented replacement for elapsed-time measurement.
t0 = time.perf_counter()
summing = 0
for j, i in enumerate(d):
    sol = int(clf.predict(vectorizer.transform([i]).toarray())[0])
    #print(sol, ans[j])
    if sol == ans[j]:
        summing = summing + 1
t = time.perf_counter() - t0
print(t)
# Score: correct minus incorrect, as a percentage of all answers.
print(100 * float(summing - (len(ans) - summing)) / len(ans))
print(len(ans))
#stem the words
# Sweep the aggressiveness parameter C for a Passive-Aggressive classifier
# and report the best (correct minus incorrect) score on the test files.
# NOTE(review): vectorizer, ls (training documents), ln (training labels)
# and PassiveAggressiveClassifier are defined/imported earlier in the file.
bag_of_words = vectorizer.fit(ls)
bag_of_words = vectorizer.transform(ls)
cmax = 0
for cc in range(1, 100):
    #sw = stopwords.words()  # stopwords are not supported, requires download
    # FIX: n_iter -> max_iter (renamed in scikit-learn >= 0.21), and
    # `cc/10` was integer division under Python 2 (C == 0 for cc < 10);
    # the commented-out LinearSVC line shows the intended cc/10.0.
    clf = PassiveAggressiveClassifier(max_iter=9, C=cc / 10.0)
    # svm = LinearSVC(C=cc/10.0)
    clf.fit(bag_of_words, ln)

    # Now get input (test) data; first line of the file is the count.
    # FIX: context managers so the test files are closed every iteration.
    lt = []
    with open("testdata.txt") as testfile:
        ntests = int(testfile.readline())
        for _ in range(ntests):
            lt.append(testfile.readline())
    bag_of_test_words = vectorizer.transform(lt)
    result = clf.predict(bag_of_test_words)

    # Compare predictions against the expected labels, one per line.
    z = 0
    with open("testresults.txt") as resultsfile:
        for x in range(len(result)):
            zz = int(resultsfile.readline())
            if zz == int(result[x]):
                z = z + 1
    # Score: correct minus incorrect, as a fraction of all predictions.
    acc = (float(z) - (len(result) - float(z))) / len(result)
    if cmax < acc:
        cmax = acc
        # NOTE(review): the flattened original makes the placement of these
        # prints ambiguous; printing the improving C here matches intent.
        print(cc)
print(cmax * 100)