def fit_predict(self, X_train, y_train, X_val, X_test, c_weight):
    """Fit a fresh NB classifier and predict on the test and validation data.

    The classifier is stored on ``self.classifier`` and the predictions on
    ``self.test_y_predicted`` and ``self.val_y_predicted``.  ``c_weight`` is
    accepted but not used by this method.

    Returns
    -------
    tuple
        ``(X_train, X_val, X_test, val_predictions, test_predictions)``.
    """
    self.classifier = NB()
    model = self.classifier
    model.fit(X_train, y_train)

    # Test predictions are computed before validation predictions.
    test_pred = model.predict(X_test)
    self.test_y_predicted = test_pred
    val_pred = model.predict(X_val)
    self.val_y_predicted = val_pred

    return X_train, X_val, X_test, self.val_y_predicted, self.test_y_predicted
def instantiate_naive_bayes(self):
    """Return a list of ``self.num_of_classifier`` new, unfitted NB instances."""
    return [NB() for _ in range(self.num_of_classifier)]
def train(topSet, X, Y, test_size=testPercent, sample_weight=None):
    """Fit an NB classifier on a train split of (X, Y) and optionally report test results.

    The data is split with ``train_test_split(..., random_state=0)``.  When
    ``test_size`` is greater than zero the held-out part is predicted and
    handed to ``showTestResult``.

    :param topSet: feature collection; only its length is used, in the report title
    :param X: feature rows (converted with ``np.array``)
    :param Y: labels (converted with ``np.array``)
    :param test_size: forwarded to ``train_test_split``
    :param sample_weight: optional per-sample weights.  When it has one entry
        per row of ``X`` it is split together with the data, so the weights
        passed to ``fit`` line up with the shuffled training rows.  Any other
        length is forwarded to ``fit`` unchanged, as before.
    :return: the fitted classifier
    """
    X_arr = np.array(X)
    Y_arr = np.array(Y)

    train_weight = sample_weight
    if sample_weight is not None and len(sample_weight) == len(X_arr):
        # Split the weights with the same permutation as the samples;
        # otherwise fit() sees weights for the wrong (or too many) rows.
        (X_train, X_test, y_train, y_test,
         train_weight, _unused_test_weight) = train_test_split(
             X_arr, Y_arr, np.array(sample_weight),
             test_size=test_size, random_state=0)
    else:
        X_train, X_test, y_train, y_test = train_test_split(
            X_arr, Y_arr, test_size=test_size, random_state=0)

    print("Training...")
    print("train set:")
    print("X: ", X_train.shape)
    print("Y: ", y_train.shape)

    clf = NB(alpha=1)
    clf.fit(X_train, y_train, sample_weight=train_weight)

    if test_size > 0:
        print("Testing...")
        print("test set:")
        print("X: ", X_test.shape)
        print("Y: ", y_test.shape)
        test_res = clf.predict(X_test)
        detail = "feature num: " + str(len(topSet)) + "\n"
        detail += "testPercent: " + str(test_size)
        showTestResult(test_res, y_test, clType='NB', title=detail)

    return clf
def stat_on_train(model, train_set, val_set, is_using_val_set=True):
    """Train a classifier on the training set and evaluate it on the validation set.

    Every instance is a sequence whose last element is the label; all
    preceding elements are features and are converted to ``float``.

    :param str model: the classification model ("DT", "NB" or "KNN")
    :param list train_set: the training set instances
    :param list val_set: the validation set instances
    :param boolean is_using_val_set: if True, the model is refitted on the
        training and validation instances together before it is returned;
        otherwise the returned model has seen the training set only.
    :return: ``(stats, fitted_model)`` where ``stats`` is
        ``get_stat(y_val, y_pred)``.  The predictions always come from the
        model fitted on the training set alone.
    :raises SystemExit: if ``model`` is not one of the supported names.
    """
    if model == "DT":
        estimator = DT()
    elif model == "KNN":
        estimator = KNN()
    elif model == "NB":
        estimator = NB()
    else:
        # Previously a bare exit(): the process ended with status 0 and no
        # message.  Keep the exception type but report the problem.
        raise SystemExit(
            "stat_on_train: unknown model %r (expected 'DT', 'KNN' or 'NB')"
            % (model,))

    xtrain = np.array([[float(i) for i in v[:-1]] for v in train_set])
    ytrain = np.array([v[-1] for v in train_set])
    xtest = np.array([[float(i) for i in v[:-1]] for v in val_set])
    ytest = np.array([v[-1] for v in val_set])

    clf = estimator.fit(xtrain, ytrain)
    ypred = clf.predict(xtest)

    if is_using_val_set:
        # Refit on everything; the statistics above stay unaffected.
        clf = estimator.fit(np.concatenate((xtrain, xtest), axis=0),
                            np.concatenate((ytrain, ytest), axis=0))
    return get_stat(ytest, ypred), clf
def scoring(train_X, train_Y):
    """Print the 5-fold cross-validation scores of a one-vs-one NB classifier.

    Prints the per-fold score array followed by its mean.
    """
    score = cross_validation.cross_val_score(
        OneVsOneClassifier(NB()), train_X, train_Y, cv=5)
    # Alternative kept from the original for reference:
    # score = cross_validation.cross_val_score(
    #     OneVsOneClassifier(svm.LinearSVC(random_state=0)), train_X, train_Y, cv=5)

    # print() with a single argument is valid on Python 2 and 3 and emits the
    # same text as the former print statements (including the double space).
    print(score)
    # NOTE(review): the label still says "svm" although NB is scored; the
    # output text is left unchanged in case something parses it.
    print("average accuracy of svm  %s" % (score.mean(),))
def test_iht_fit_resample_half():
    """Resampling with three samples requested per class yields six rows."""
    per_class_target = {0: 3, 1: 3}
    sampler = InstanceHardnessThreshold(
        NB(),
        sampling_strategy=per_class_target,
        random_state=RND_SEED,
    )
    resampled = sampler.fit_resample(X, Y)
    X_resampled, y_resampled = resampled
    assert X_resampled.shape == (6, 2)
    assert y_resampled.shape == (6, )
def Loop_for_computataion(my_train_data, my_train_label, model_cnn, status,
                          iris_cifar):
    """Run five stratified 80/20 shuffle splits and process each according to ``status``.

    * ``status == 1`` -- call ``compute_logic_supervised_learning`` with the
      classical classifiers.
    * ``status == 2`` -- carve a 10% validation part out of the split's
      training data, reshape the images to ``(n, 32, 32, 3)``, one-hot encode
      the labels into 10 classes, fit ``model_cnn`` and run a prediction on
      the split's test part (the prediction is not kept).
    * ``status == 3`` -- call ``meshgrid_pca_analysis`` (also receives
      ``iris_cifar``).
    * anything else -- print "No proper selection".
    """
    # One instance of each classical model, shared by all splits.
    lda = LDA()
    qda = QDA()
    nb = NB()
    rf = RandomForestClassifier(n_estimators=10, criterion='entropy',
                                random_state=0)
    svm = SVC(kernel='rbf', random_state=0)
    dt = DecisionTreeClassifier(criterion='entropy', random_state=0)

    # See https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.StratifiedShuffleSplit.html
    outer_splitter = StratifiedShuffleSplit(n_splits=5, train_size=0.8,
                                            test_size=0.2, random_state=0)

    for training_values, testing_values in outer_splitter.split(
            my_train_data, my_train_label):
        X_train = my_train_data[training_values]
        X_test = my_train_data[testing_values]
        y_train = my_train_label[training_values]
        y_test = my_train_label[testing_values]

        print("\n")
        print("TRAINING VALUES:", training_values, "TESTING VALUES:",
              testing_values)
        print("\n")

        if status == 1:
            compute_logic_supervised_learning(X_train, X_test, y_train,
                                              y_test, lda, qda, nb, rf, dt,
                                              svm, 1)
            continue

        if status == 3:
            print("ENABLING PCA")
            meshgrid_pca_analysis(X_train, X_test, y_train, y_test, lda, qda,
                                  nb, rf, dt, svm, 1, iris_cifar)
            continue

        if status != 2:
            print("No proper selection")
            continue

        # status == 2: both inner splits are generated, only the first is used.
        inner_splitter = StratifiedShuffleSplit(n_splits=2, test_size=0.1)
        idx_tr, idx_val = list(inner_splitter.split(X_train, y_train))[0]
        X_val, y_val = X_train[idx_val], y_train[idx_val]
        X_tr, y_tr = X_train[idx_tr], y_train[idx_tr]

        X_val = X_val.reshape(len(X_val), 32, 32, 3)
        X_tr = X_tr.reshape(len(X_tr), 32, 32, 3)
        X_test = X_test.reshape(len(X_test), 32, 32, 3)
        y_val = np_utils.to_categorical(y_val, 10)
        y_tr = np_utils.to_categorical(y_tr, 10)

        model_cnn.fit(X_tr, y_tr, validation_data=(X_val, y_val))
        model_cnn.predict(X_test)
def naiveBayes(self, X_train, y_train, X_test, y_test):
    """Fit an NB classifier, print accuracy and fit time, and pickle the model.

    The elapsed time covers construction and fitting only.  The fitted model
    is written to ``./pkl/NaiveBayes_training.pkl``.
    """
    t1 = time()
    nb = NB()
    nb.fit(X_train, y_train)
    t2 = time()
    elapsed_time = t2 - t1

    accuracy = nb.score(X_test, y_test)
    print("Naive Bayes Classifier:\n\taccuracy score:{0:0.2f}\n\telapsed time:{1:0.2f} sec"
          .format(accuracy, elapsed_time))

    filename = "./pkl/NaiveBayes_training.pkl"
    # Use a context manager so the file is flushed and closed even if
    # pickling fails; the handle was previously never closed.
    with open(filename, "wb") as handle:
        pickle.dump(nb, handle)
def naive_bayes(self):
    """Compute predictions with the naive Bayes algorithm.

    Parameters
    ----------
    None; the method reads ``self._X_train``, ``self._y_train`` and
    ``self._X_val``.

    Returns
    -------
    None.  The validation-set predictions are stored on ``self._prediction``.
    """
    model = NB()
    model.fit(self._X_train, self._y_train)
    # predict() takes only the feature matrix; passing the validation labels
    # as a second positional argument raised a TypeError.
    self._prediction = model.predict(self._X_val)
def naive_bays(x_train, y_train, bagging=False, boosting=False):
    """Fit a Gaussian naive Bayes model, optionally wrapped in an ensemble.

    :param x_train: training features
    :param y_train: training labels
    :param bagging: wrap the model in a ``BaggingClassifier`` that draws half
        of the samples and half of the features per estimator
    :param boosting: wrap the model in an ``AdaBoostClassifier`` (SAMME,
        300 estimators)
    :return: the fitted model
    :raises ValueError: if both ``bagging`` and ``boosting`` are requested
    """
    # Validate first so a bad call fails before any import or allocation.
    if bagging and boosting:
        raise ValueError(
            "Cant have bagging and boosting enabled at the same time")

    from sklearn.naive_bayes import GaussianNB as NB
    nb = NB()

    if bagging:
        from sklearn.ensemble import BaggingClassifier
        model = BaggingClassifier(nb, max_samples=.5, max_features=.5)
    elif boosting:
        from sklearn.ensemble import AdaBoostClassifier
        model = AdaBoostClassifier(nb, algorithm="SAMME", n_estimators=300)
    else:
        # Plain Gaussian naive Bayes, no ensemble wrapper.
        model = nb

    model.fit(x_train, y_train)
    return model
def StratifiedShuffleSplit_cross_validate_func_NaiveBayes(X, y, partitioner) -> (np.array, np.array, np.array):
    """Cross-validate NB four times with ``partitioner`` and plot the results.

    For every run the mean test accuracy and the matching error rate
    (``1 - accuracy``) are printed.  After the runs the error rates and then
    the accuracies are plotted, each in its own figure.

    NOTE(review): nothing is returned even though the annotation names three
    arrays -- confirm what callers expect.
    """
    runs = 4
    mean_accuracy = np.empty([runs])
    accuracies = []
    error_rates = []

    for run_idx in range(runs):
        cv_result = cross_validate(NB(), X, y, scoring="accuracy",
                                   cv=partitioner)
        mean_accuracy[run_idx] = np.mean(cv_result["test_score"])
        run_error = 1 - mean_accuracy[run_idx]

        print("NaiveBayes[i]")
        print(mean_accuracy[run_idx])
        print("error_rate_nb")
        print(run_error)

        accuracies.append(mean_accuracy[run_idx])
        error_rates.append(run_error)

    for series in (error_rates, accuracies):
        plt.plot(series)
        plt.show()
def main(k):
    """Compare several models on the classification data set.

    Four classifiers are fitted through ``k_fold`` with ``k`` folds and a
    linear regression is fitted directly; each one is reported with
    ``print_errors`` before the figures are shown.
    """
    x_train, x_test, y_train, y_test = get_classification_dataset(0.3)

    # (estimator class, report title), handled in this order.
    candidates = (
        (LogisticRegression, 'Logistic Regression'),
        (LDA, 'Linear Discriminant Analysis'),
        (QDA, 'Quadratic Discriminant Analysis'),
        (NB, 'Gaussian Naive Bayes'),
    )
    for estimator_cls, title in candidates:
        fitted = k_fold(estimator_cls(), x_train, y_train, k)
        print_errors(fitted, x_train, y_train, x_test, y_test,
                     msg=title, prf=True)

    # Linear regression is fitted on the whole training set, without k_fold.
    lreg = LinearRegression()
    lreg.fit(x_train, y_train)
    print_errors(lreg, x_train, y_train, x_test, y_test,
                 msg='Linear Regression', prf=True)

    plt.show()
def main(input_file=INPUT_FILE):
    """Cross-validate an NB text classifier on a labelled CSV file.

    The file is read as comma-separated ``label,text`` rows without a header.
    A row is a positive example when its label is ``'bad'``.  The text is
    turned into term counts and then term-frequency features (IDF disabled).
    The classifier, the per-fold scores, their mean and their standard
    deviation are printed.
    """
    data = pd.read_csv(input_file, sep=',', names=['label', 'text'])
    # NOTE: positive record = fraud
    data['bin_target'] = data.label.apply(lambda label: label == 'bad')

    # Pre-processing: counts -> term frequencies.
    # (CountVectorizer(stop_words='english') was the alternative tried.)
    count_vect = CountVectorizer()
    X_train_counts = count_vect.fit_transform(data.text)
    X_train_tf = TfidfTransformer(use_idf=False).fit(
        X_train_counts).transform(X_train_counts)

    # Train and evaluate (scoring='roc_auc' was the alternative tried).
    nb = NB()
    cv_scores = cross_val_score(nb, X_train_tf, data.bin_target, cv=N_FOLDS)

    print(nb)
    print(cv_scores)
    print(cv_scores.mean())
    print(cv_scores.std())
trdf , labels = get_data_frame(args['train']) tedf , truth = get_data_frame(args['test']) #fnames = get_fnames(args['fnames']) rf = RFC(n_estimators=50, criterion='entropy', max_depth=None, min_samples_split=2, max_leaf_nodes=None, class_weight = 'balanced' ) svm_rbf = svm.SVC(C=20,kernel='rbf',gamma = 'auto',class_weight = 'balanced') svm_linear = svm.LinearSVC(C=1,class_weight = 'balanced') nb = NB() clfs = [rf,svm_rbf,svm_linear,nb] if args['trorte'] == 'cross': scoring = ['f1_macro','accuracy','precision_macro','recall_macro'] for clf in clfs: print(clf) scores = cross_validate(clf,trdf,labels,scoring = scoring,cv=10,return_train_score=False) for s in scores.keys(): print(s) for v in scores[s]: print(v) print ('###############################') print ('$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$') elif args['trorte'] == 'test':
# store train test files type III in the dictionary train_test_files_dic[3]=[X_3_train, X_3_test, y_3_train, y_3_test] # %% [markdown] # # VI. Train-Test PipeLine # %% from sklearn.naive_bayes import GaussianNB as NB # import gaussian naive bayes classifier from sklearn.tree import DecisionTreeClassifier as DTC # import decision tree classifier from sklearn.linear_model import LogisticRegression as LR # import logistic regression classifier from sklearn.metrics import accuracy_score as accuracy # import accuracy score from sklearn.metrics import confusion_matrix as cm # import confusion matrix # intialize models Benchmark_model =NB() Clf1=DTC(random_state=337) Clf2=LR(random_state=337) # %% # Define the adpted confusion matrix def full_confusion_matrix(Df): # input: # Df : pandas dataframe, the contingency table resulted from the confusion matrix defined earlier as cm columns=Df.columns # activity names # add new columns containing detailed scores new_columns=list(columns)+['data points number','precision %','sensitivity %','specificity %'] # create the index from the same old columns add an other row called total
y, test_size=.25, random_state=123) # %% # Set the folds index to ensure comparable samples fold_generator = KFold(n_splits=10, shuffle=True, random_state=1234) #%% pipe = Pipeline(steps=[('pre_process', pp.MinMaxScaler()), ('model', None)]) search_space = [ # NaiveBayes { 'model': [NB()] }, # KNN with K tuning param { 'model': [KNN()], 'model__n_neighbors': [5, 10, 15, 20, 25, 30, 35, 40, 45, 50] }, # # Decision Tree with the Max Depth Param { 'model': [DT()], 'model__max_depth': [2, 3, 4, 5, 6, 7, 8, 9, 10] }, # #Random forest with the N Estimators tuning param
def fit(self, X, y): self.clf.fit(X, y) def predict(self, X): m = int(X.shape[0]**(0.5)) pred = [] for I in range(m): pred.extend( self.clf.predict(X[I * X.shape[0] // m:(I + 1) * X.shape[0] // m])) return pred # TODO: clean this lines clfOption = [Boosting(), LR(n_jobs=-1), NB(), LinearSVC(), Neighbors(), RFC()] mrePred = [] # TODO: clean this function def mrc(pred, Y): pred = array(pred) Y = array(Y) TP, FP, TN, FN = 0, 0, 0, 0 for I in range(len(pred)): if pred[I] == Y[I]: if pred[I] == 1: TP += 1
new_sentence += " " data[sess].append(new_sentence) chronology = list(data.keys()) for i in range(len(data.keys())): for j in range(i + 1, len(data.keys())): if chronology[i] > chronology[j]: chronology[i], chronology[j] = chronology[j], chronology[i] date = [chronology[len(chronology) * i // 6 - 1] for i in range(1, 7)] clf_option = [ Boosting(), LR(n_jobs=-1), NB(), LinearSVC(), Neighbors(), RFC() ] mre_pred = [] for iter in tqdm(range(5)): query = "Select * from berita WHERE Date <= " + str( date[iter]) + " AND Title LIKE '%ekono%' " c.execute(query) train_data = c.fetchall() query = "Select * from berita WHERE Date <= " + str( date[iter]) + " AND NOT Title LIKE '%ekono%' " c.execute(query) train_data_unknown = c.fetchall()
data['precip.(mm)'][ind]=1 elif(value>4.0 and value<=8.0): data['precip.(mm)'][ind]=2 elif(value>8.0): data['precip.(mm)'][ind]=3 data['humidity()']=data['humidity()'].fillna(0).astype('int64') X=data[['temp(c)','pressure(mb)','humidity()','wind speed(mph)','wind speed(mph)','wind dir.']] y=data['precip.(mm)'] names=["KNN","SVM","Decision Tree", "Neural Network","Naive Bayesian"] classifiers=[ KNN(3), SVC(kernel="linear",C=0.025), DTC(max_depth=5), MLP(alpha=1,max_iter=1000), NB()] x_train,x_test,y_train,y_test=ttl(X,y,test_size=0.3,random_state=1) model_cols=[] comparison=pd.DataFrame(columns=model_cols) index=0 for name,clf in zip(names,classifiers): clf.fit(x_train,y_train) comparison.loc[index,'Classifiers']=name comparison.loc[index,'Train Accuracy']=clf.score(x_train,y_train) comparison.loc[index,'Test Accuracy']=clf.score(x_test,y_test) comparison.loc[index,'Precision']=precision_score(y_test,clf.predict(x_test),average='macro') comparison.loc[index,'Recall']=recall_score(y_test,clf.predict(x_test),average='macro') comparison.loc[index,'F1 Score'] = f1_score(y_test,clf.predict(x_test),average='macro') index+=1 comparison
def _make_lr(C, intercept_scaling, dual=True):
    """Build an L2-penalised LogisticRegression with the shared fixed settings."""
    return LogisticRegression(penalty='l2', dual=dual, fit_intercept=True,
                              C=C, tol=0.0001, class_weight=None,
                              random_state=None,
                              intercept_scaling=intercept_scaling)


def _lr_for_feature(feature):
    """Pick the logistic-regression hyper-parameters tuned for ``feature``."""
    # Single feature groups: name -> (C, intercept_scaling).
    single = {
        "word": (1, 0.1),
        "length": (0.09, 0.1),
        "struct": (2, 0.1),
        "lsa": (2, 0.1),
    }
    if feature in single:
        return _make_lr(*single[feature])

    names = set(feature.split(','))

    # Exact combinations are checked before the pairwise containment rules.
    exact = (
        ({"word", "length", "struct"}, (1, 0.2, True)),
        ({"word", "length", "lsa"}, (0.8, 0.2, True)),
        ({"struct", "length", "lsa"}, (2, 0.3, True)),
        ({"struct", "length", "lsa", "word"}, (3, 2, False)),
    )
    for combo, (C, scaling, dual) in exact:
        if names == combo:
            return _make_lr(C, scaling, dual=dual)

    # First matching pair wins; the order is significant.
    pairs = (
        ("word", "length", 0.2),
        ("word", "struct", 5),
        ("word", "lsa", 2),
        ("length", "struct", 0.08),
        ("length", "lsa", 0.3),
        ("struct", "lsa", 2.5),
    )
    for first, second, C in pairs:
        if first in names and second in names:
            return _make_lr(C, 0.2)

    return _make_lr(0.09, 0.1)


def _svm_for_feature(feature):
    """Pick the RBF-SVM regularisation strength tuned for ``feature``."""
    if feature in ("word", "length"):
        C = 0.8
    elif feature in ("struct", "structure"):
        # Every other branch names this feature group "struct"; the old test
        # for "structure" alone made this setting unreachable for it.  Both
        # spellings are accepted so existing callers keep working.
        C = 0.1
    else:
        names = feature.split(',')
        C = 0.9 if ("struct" in names and "lsa" in names) else 3
    return svm.SVC(C=C, kernel='rbf', gamma=0.08)


def get_model(model_name, feature):
    """Return an unfitted classifier configured for ``model_name`` and ``feature``.

    :param model_name: one of "lr", "nb", "knn", "rf", "gbdt" or "svm"
    :param feature: a feature-group name ("word", "length", "struct", "lsa")
        or a comma-separated combination of them
    :return: the configured estimator
    :raises SystemExit: (status 1, after printing a message) when
        ``model_name`` is not recognised
    """
    if model_name == "lr":
        return _lr_for_feature(feature)
    if model_name == "nb":
        return NB()
    if model_name == "knn":
        return KNN(n_neighbors=60 if feature == "lsa" else 120)
    if model_name == "rf":
        return RF(n_estimators=1000, max_features="auto", max_depth=8,
                  min_samples_split=10, min_samples_leaf=2)
    if model_name == "gbdt":
        return GBDT(n_estimators=400, max_features="auto", max_depth=8,
                    min_samples_split=10, min_samples_leaf=2)
    if model_name == "svm":
        return _svm_for_feature(feature)

    # Message (Chinese): "You can only choose among the LR, NB, RF models".
    print("你只能从LR,NB,RF几种模型里选择")
    sys.exit(1)
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis as LDA # KNN : sklearn.neighbors.classification.KNeighborsClassifier from sklearn.neighbors.classification import KNeighborsClassifier as KNN # DTC : sklearn.tree.tree.DecisionTreeClassifier # 有时也称为 "分类与回归树", CART (= Classifier And Regressor Tree) from sklearn.tree.tree import DecisionTreeClassifier as DTC # NB : sklearn.naive_bayes.GaussianNB from sklearn.naive_bayes import GaussianNB as NB # SVM : sklearn.svm.classes.SVC from sklearn.svm.classes import SVC as SVM _Models = { "LR": LR(), "LDA": LDA(), "KNN": KNN(), "DTC": DTC(), "NB": NB(), "SVM": SVM() } # 审查结果比较 print("审查结果比较及其可视化...:") _Algorithm_CMP_Results = [] _Algorithm_CMP_Result_List = [] _Result_File.write("模型名称" + " " * 6 + "MEAN(准确度)" + " " * 12 + "STD(应该是标准差)\n") print("模型名称" + " " * 6 + "MEAN(准确度)" + " " * 12 + "STD(应该是标准差)") for _Each in _Models: # KFold : K折叠 : sklearn.model_selection._split.KFold(n_splits = 10, random_state = 7) # LeaveOneOut : 弃一 : sklearn.model_selection._split.LeaveOneOut() # KFold 与 LeaveOneOut 为两种不同的数据集分割策略方案 cv_results = model_selection.cross_val_score( # cross_val_score : 交叉验证 _Models[_Each], X=_X_Train,
# print(y_test) # print(y_pre) # print("Precision: %f" % (cnt / len(y_test))) return cnt / len(y_test) if __name__ == '__main__': X, y = genData(argv[1], False) print("X.shape:", X.shape) print("y.shape:", y.shape) clas = [] clas.append(["KNN", KNN(n_neighbors=6)]) clas.append(["SVC", SVC()]) clas.append(["DT", DT()]) clas.append(["NB", NB()]) # cla = KNN(n_neighbors=6) clfIdx = 0 savedClfs = [None] * 4 bestPre = [0] * 4 for clf in clas: pres = [] for i in range(2000): p = classify(X, y, clf[1]) if p > bestPre[clfIdx]: bestPre[clfIdx] = p savedClfs[clfIdx] = clf[1] pres.append(p) print("%s precision: %f" % (clf[0], np.mean(pres))) joblib.dump(savedClfs[clfIdx], MODEL_DIR + clf[0] + '.pkl') clfIdx += 1
type=str, default=os.environ['SM_CHANNEL_TRAIN']) # args holds all passed-in arguments args = parser.parse_args() # Read in csv training file training_dir = args.data_dir train_data = pd.read_csv(os.path.join(training_dir, "train.csv"), header=None, names=None) # Labels are in the first column train_y = train_data.iloc[:, 0] train_x = train_data.iloc[:, 1:] # Define Naive Bayes Classifier and hyperparameter tuner nbc = NB() model = GridSearchCV( estimator=nbc, n_jobs=3, verbose=10, param_grid={'var_smoothing': [1e-9, 1e-7, 1e-5, 1e-3]}) model.fit(train_x, train_y) print('Best Parameters: ', model.best_params_) print('Best Estimator: ', model.best_estimator_) # Save the trained model joblib.dump(model, os.path.join(args.model_dir, "model.joblib"))
def _ModelSetting(self, model_name, cv_train_p=None):
    """Create the estimator named by ``model_name`` and store it on ``self.clf``.

    ``self.model_p`` is set to a '-'-joined string of the ``pars`` list used
    for the chosen model.  It stays '' for 'PAC', 'PCP' and 'NB', which have
    no ``pars`` list.  If ``model_name`` matches none of the names below,
    ``self.clf`` stays ``None``.

    :param model_name: one of 'K-MEANS', 'K-MINI', 'PAC', 'PCP', 'NB', 'SGD',
        'LSVC', 'CSVC', 'NSVC', 'LR', 'LinR', 'DT', 'RF', 'ADA', 'GBM'
    :param cv_train_p: used as ``n_clusters`` by 'K-MEANS' and 'K-MINI';
        not read by the other models
    """
    # Reset both outputs so a previous call cannot leak into this one.
    self.model_p = ''
    self.clf = None
    if model_name == 'K-MEANS':
        # pars: [n_clusters, max_iter, tol]
        pars = [cv_train_p, 50000, 0.00001]
        self.model_p = '-'.join(str(p) for p in pars)
        self.clf = KMEANS(n_clusters=pars[0], init='k-means++', n_init=10, max_iter=pars[1], tol=pars[2], precompute_distances='auto', verbose=0, random_state=None, copy_x=True, n_jobs=4)
    if model_name == 'K-MINI':
        # pars: [n_clusters, max_iter, tol]
        pars = [cv_train_p, 10000, 0.0]
        self.model_p = '-'.join(str(p) for p in pars)
        self.clf = KMINI(n_clusters=pars[0], init='k-means++', max_iter=pars[1], batch_size=100, verbose=0, compute_labels=True, random_state=None, tol=pars[2], max_no_improvement=10, init_size=None, n_init=3, reassignment_ratio=0.01)
    if model_name == 'PAC':
        self.clf = PAC(C=1.0, fit_intercept=True, n_iter=5, shuffle=True, verbose=0, loss='hinge', n_jobs=1, random_state=None, warm_start=False, class_weight='balanced')
    if model_name == 'PCP':
        self.clf = PCP(penalty=None, alpha=0.0001, fit_intercept=True, n_iter=20, shuffle=False, verbose=0, eta0=1.0, n_jobs=6, random_state=0, class_weight=None, warm_start=False)
    if model_name == 'NB':
        self.clf = NB()
    if model_name == 'SGD':
        # pars: [alpha, class_weight, loss, n_iter]
        pars = [1e-4, None, 'hinge', 200]  # loss = 'modified_huber', 'hinge' n_iter = 5 (presumably earlier alternatives)
        self.model_p = '-'.join(str(p) for p in pars)
        self.clf = SGD(loss=pars[2], penalty='l2', alpha=pars[0], l1_ratio=0.15, fit_intercept=True, n_iter=pars[3], shuffle=True, verbose=0, epsilon=0.1, n_jobs=1, random_state=None, learning_rate='optimal', eta0=0.0, power_t=0.5, class_weight=pars[1], warm_start=False, average=False)
    if model_name == 'LSVC':
        # pars: [tol, C, class_weight, max_iter]
        pars = [1e-5, 1e-2, 'balanced', 2000]  # 'crammer_singer'
        self.model_p = '-'.join(str(p) for p in pars)
        self.clf = LSVC(penalty='l2', loss='squared_hinge', dual=False, tol=pars[0], C=pars[1], multi_class='ovr', fit_intercept=True, intercept_scaling=1, class_weight=pars[2], verbose=0, random_state=None, max_iter=pars[3])
    if model_name == 'CSVC':
        # pars: [C, kernel, gamma, class_weight].  The first assignment is
        # immediately overwritten by the second, so only the linear
        # configuration takes effect.
        pars = [8, 'rbf', 0.00048828125, 'balanced']
        pars = [1e2, 'linear', 1e-3, 'auto']
        self.model_p = '-'.join(str(p) for p in pars)
        self.clf = CSVC(C=pars[0], kernel=pars[1], degree=3, gamma=pars[2], coef0=0.0, shrinking=True, probability=True, tol=1e-3, cache_size=5000, class_weight=pars[3], verbose=False, max_iter=-1, random_state=None)
    if model_name == 'NSVC':
        # pars: [nu, kernel, gamma, class_weight]
        #pars = [0.5, 'rbf', 0.00048828125, 'auto']
        pars = [0.5, 'rbf', 'auto', 'auto']
        self.model_p = '-'.join(str(p) for p in pars)
        self.clf = NSVC(nu=pars[0], kernel=pars[1], degree=3, gamma=pars[2], coef0=0.0, shrinking=True, probability=False, tol=0.001, cache_size=500, class_weight=pars[3], verbose=False, max_iter=-1, decision_function_shape=None, random_state=None)
    if model_name == 'LR':
        # pars: [penalty, C, class_weight, max_iter]
        pars = ['l2', 1e+2, 'balanced', 3000]
        self.model_p = '-'.join(str(p) for p in pars)
        self.clf = LR(penalty=pars[0], dual=False, tol=0.0001, C=pars[1], fit_intercept=True, intercept_scaling=1, class_weight=pars[2], random_state=None, solver='liblinear', max_iter=pars[3], multi_class='ovr', verbose=0, warm_start=False, n_jobs=1)
    if model_name == 'LinR':
        # pars: [normalize]
        pars = [True]
        self.model_p = '-'.join(str(p) for p in pars)
        self.clf = LinR(fit_intercept=True, normalize=pars[0], copy_X=True, n_jobs=1)
    if model_name == 'DT':
        # pars: [max_depth, class_weight]
        pars = [8, 'balanced']
        self.model_p = '-'.join(str(p) for p in pars)
        self.clf = DT(criterion='gini', splitter='best', max_depth=pars[0], min_samples_split=1, min_samples_leaf=1, min_weight_fraction_leaf=0.0, max_features=None, random_state=None, max_leaf_nodes=None, class_weight=pars[1], presort=False)
    if model_name == 'RF':
        # pars: [n_estimators, max_depth, class_weight]
        pars = [5, 7, 'balanced']
        self.model_p = '-'.join(str(p) for p in pars)
        self.clf = RF(n_estimators=pars[0], criterion='gini', max_depth=pars[1], min_samples_split=2, min_samples_leaf=1, min_weight_fraction_leaf=0.0, max_features='auto', max_leaf_nodes=None, bootstrap=True, oob_score=False, n_jobs=2, random_state=None, verbose=0, warm_start=False, class_weight=pars[2])
    if model_name == 'ADA':
        # Only pars[0] (base tree max_depth) is used below; pars[1] and
        # pars[2] appear only in model_p.
        pars = [13, 18, 0.05]
        self.model_p = '-'.join(str(p) for p in pars)
        self.clf = ADA(base_estimator=DT(max_depth=pars[0], class_weight='balanced'), n_estimators=50, learning_rate=1.0, algorithm='SAMME.R', random_state=None)
    if model_name == 'GBM':
        # pars: [n_estimators, learning_rate, max_depth]
        pars = [20, 0.03, 13]
        self.model_p = '-'.join(str(p) for p in pars)
        self.clf = GBM(loss='deviance', learning_rate=pars[1], n_estimators=pars[0], subsample=1.0, min_samples_split=2, min_samples_leaf=1, min_weight_fraction_leaf=0.0, max_depth=pars[2], init=None, random_state=None, max_features=None, verbose=0, max_leaf_nodes=None, warm_start=False, presort='auto')
nonfrauds = train_tr[y == 0] nonfraudresults = predresults[y == 0] fp = nonfrauds[(nonfraudresults["true"] == 0) & (nonfraudresults["cvpredict"] == 1)] plt.scatter(fp[:, xax], fp[:, yax], color="red") plt.title("1,2PCA scores; yellow:fraud; blue:FN; red: FP") print("") print("") # check some models and compare them with respect to F1, Acc and Profit models = [ LogisticRegression(C=10, solver='lbfgs'), SVM.SVC(gamma='auto'), DT(), KNN(5), NB() ] #uses the profit provided by the teachers cv_profits_for_models(models, train, y) #profit for the perceptron learner perc = PerceptronLearner(1000) cv_profits_for_models([perc], train, y) #%% ## Pocket ALgorithm Prototype import random
# Pre-processing: convert every review into a token list and a cleaned
# review string, appended to the existing `tokens` and `reviews` lists.
for rev in range(0, train.shape[0]):
    t, r = review_to_words(train["review"][rev])
    tokens.append(t)  # token list
    reviews.append(r)  # separate reviews

vocabulary = 5000  # max features
vectorizer = cv(analyzer="word",
                tokenizer=None,
                preprocessor=None,
                stop_words=None,
                max_features=vocabulary)
X = vectorizer.fit_transform(reviews).toarray()
Y = train["sentiment"]

validation_size = 0.20
X_train, X_validation, Y_train, Y_validation = model_selection.train_test_split(
    X, Y, test_size=validation_size)

# Alternatives that were tried:
#classifier = DecisionTreeClassifier() #DTC
#classifier = SVC() #SVM
#classifier = KNeighborsClassifier() #KNN
classifier = NB(alpha=2)  # alpha=0 means no laplace smoothing
classifier.fit(X_train, np.array(Y_train))
predictions = classifier.predict(X_validation)

# accuracy_score returns a float and confusion_matrix an ndarray; adding
# either to a str raised an error, so convert explicitly before joining.
print("Accuracy: " + str(accuracy_score(Y_validation, predictions)))
print("Confusion Matrix: " + str(confusion_matrix(Y_validation, predictions)))
print(classification_report(Y_validation, predictions))
def MS_nonoptimised(self, features_after, labels):
    """Plot the mean cross-validated accuracy of four default classifiers per sample size.

    For every sample size in ``self.Tests`` the features are sub-sampled 100
    times; each sub-sample is scored with 5-fold cross-validation for a
    decision tree, an SVM, a random forest and naive Bayes (in that order).
    The mean score per classifier is drawn against the sample size.
    """
    # Evaluation order within one repetition; the plots use the same order.
    estimators = (DT, SVC, RFC, NB)
    line_formats = ('.b-', '.y-', '.g-', '.r-')

    for x in self.Tests:
        score_lists = [[] for _ in estimators]
        for _ in range(100):
            featuresTraining = features_after.sample(int(x))
            labelsTraining = [labels[n]
                              for n in featuresTraining.index.tolist()]
            for make_estimator, bucket in zip(estimators, score_lists):
                bucket.append(
                    cross_val_score(make_estimator(), featuresTraining,
                                    labelsTraining, cv=5).mean())

        means = [np.mean(bucket) for bucket in score_lists]
        for value in means:
            plt.scatter(x, value)
        for value, fmt in zip(means, line_formats):
            plt.plot(x, value, fmt)

    plt.title(
        'Monte Carlo simulation of accuracy on non optimised classifiers')
    plt.xlim([5, 405])
    blue_patch = mpatches.Patch(color='blue', label='Decision Trees')
    red_patch = mpatches.Patch(color='red', label='Naive Bayes')
    y_patch = mpatches.Patch(color='yellow', label='Support Vector Machine')
    g_patch = mpatches.Patch(color='green', label='Random Forest')
    plt.legend(handles=[g_patch, y_patch, blue_patch, red_patch],
               loc='lower right')
    plt.ylim([0.3, 1.1])
    plt.ylabel('Accuracy')
    plt.xlabel('Number of Samples')
    plt.grid()
    plt.show()
def findStopWord(word):
    """Return True if ``word`` is a stop word in any language of stopwords.json.

    The file is expected to map a language name to a collection of stop
    words.  If it cannot be read or parsed, a message is printed and False is
    returned.
    """
    try:
        # The context manager closes the file; it was previously left open.
        with codecs.open('stopwords.json', 'r', 'utf-8-sig') as handle:
            stopwords = json.load(handle)
    except (IOError, ValueError):
        # Previously a bare except fell through to an unbound `stopwords`.
        print("Loading stopwords.json failed")
        return False

    # Check every language, not only the first one iterated.
    return any(word in stopwords[lang] for lang in stopwords)


X_Training, y_Training, X_Test, y_Test = loadBoWTry()
classifier = NB(alpha=0.03)
print("Fitting NB model\n")
classifier.fit(X_Training, y_Training)

# Predict Class
print("Predict class\n")
y_Predicted = classifier.predict(X_Test)

# Accuracy
print(np.shape(X_Test), np.shape(y_Predicted))
accuracy = accuracy_score(y_Test, y_Predicted)
print("Accurcay %f" % accuracy)
np.save("naivebayesclassifier", classifier)
#sentence = "ciao sono dario e ho ventiquattro anni, posto molto bello da vedere"
chronology = list(data.keys()) for i in range(len(data.keys())): for j in range(i + 1, len(data.keys())): if chronology[i] > chronology[j]: chronology[i], chronology[j] = chronology[j], chronology[i] date = [chronology[len(chronology) * i // 6-1] for i in range(1,7)] del data, label gc.collect() clf_option = [ Boosting(), LR(n_jobs = -1), NB(), LinearSVC(), Neighbors(), RFC() ] mre_pred = [] for iter in tqdm(range(5)): if settings.DEBUG_MODE: print("Memulai pengambilan data") mre_total = [] query = "Select * from berita WHERE Date <= "+str(date[iter]) c.execute(query) train_data = c.fetchall()
def run():
    """Process pending trials until ``pull_pending`` returns ``None``.

    For each trial the requested data-set fold is loaded, the training part
    is optionally resampled with the trial's algorithm, the requested
    classifier is fitted and the scores on the test part are submitted.

    Raises ``NotImplementedError`` when the trial names an algorithm that is
    not in the table below.
    """
    while True:
        trial = pull_pending()
        if trial is None:
            break

        # NOTE(review): eval() executes whatever is stored in the trial's
        # 'Parameters' field; if that text can come from an untrusted source
        # this should become ast.literal_eval -- confirm before changing.
        params = eval(trial['Parameters'])

        logging.info(trial)

        fold_data = load(trial['Dataset'])[int(trial['Fold']) - 1]
        (X_train, y_train), (X_test, y_test) = fold_data[0], fold_data[1]

        # Class sizes, largest first.
        class_counts = Counter(y_train).most_common()
        n_majority = class_counts[0][1]
        n_minority = class_counts[1][1]

        imblearn_ratios = []
        for ratio in [0.5, 0.75, 1.0]:
            imblearn_ratios.append(
                ((n_majority - n_minority) * ratio + n_minority) / n_majority)

        classifiers = {
            'NB': NB(),
            'KNN': KNN(),
            'SVM': SVM(gamma='scale'),
            'CART': CART()
        }
        clf = classifiers[params['classifier']]

        algorithm_name = trial['Algorithm']
        if algorithm_name is not None and algorithm_name != 'None':
            neighbor_grid = [1, 3, 5, 7]
            algorithms = {
                'AKNN': ResamplingCV(AKNN, clf, n_neighbors=neighbor_grid),
                'Bord': ResamplingCV(SMOTE, clf, kind=['borderline1'],
                                     k_neighbors=[1, 3, 5, 7, 9],
                                     m_neighbors=[5, 10, 15],
                                     sampling_strategy=imblearn_ratios),
                'CC': ResamplingCV(CC, clf,
                                   sampling_strategy=imblearn_ratios),
                'CNN': ResamplingCV(CNN, clf, n_neighbors=neighbor_grid),
                'ENN': ResamplingCV(ENN, clf, n_neighbors=neighbor_grid),
                'IHT': ResamplingCV(IHT, clf,
                                    sampling_strategy=imblearn_ratios,
                                    cv=[2]),
                'NCL': ResamplingCV(NCL, clf, n_neighbors=neighbor_grid),
                'NM': ResamplingCV(NM, clf, n_neighbors=neighbor_grid),
                'OSS': ResamplingCV(OSS, clf, n_neighbors=neighbor_grid),
                'RBO': ResamplingCV(RBO, clf, gamma=[0.01, 0.1, 1.0, 10.0],
                                    ratio=[0.5, 0.75, 1.0]),
                'RBU': ResamplingCV(RBU, clf, gamma=params.get('gamma'),
                                    ratio=params.get('ratio')),
                'RENN': ResamplingCV(RENN, clf, n_neighbors=neighbor_grid),
                'ROS': ResamplingCV(ROS, clf,
                                    sampling_strategy=imblearn_ratios),
                'RUS': ResamplingCV(RUS, clf,
                                    sampling_strategy=imblearn_ratios),
                'SMOTE': ResamplingCV(SMOTE, clf,
                                      k_neighbors=[1, 3, 5, 7, 9],
                                      sampling_strategy=imblearn_ratios),
                'SMOTE+ENN': ResamplingCV(
                    SMOTEENN, clf,
                    smote=[SMOTE(k_neighbors=k) for k in [1, 3, 5, 7, 9]],
                    sampling_strategy=imblearn_ratios),
                'SMOTE+TL': ResamplingCV(
                    SMOTETomek, clf,
                    smote=[SMOTE(k_neighbors=k) for k in [1, 3, 5, 7, 9]],
                    sampling_strategy=imblearn_ratios),
                'TL': TL(),
            }

            algorithm = algorithms.get(algorithm_name)
            if algorithm is None:
                raise NotImplementedError

            X_train, y_train = algorithm.fit_sample(X_train, y_train)

        clf = clf.fit(X_train, y_train)
        predictions = clf.predict(X_test)

        scores = {
            'Precision': metrics.precision(y_test, predictions),
            'Recall': metrics.recall(y_test, predictions),
            'F-measure': metrics.f_measure(y_test, predictions),
            'AUC': metrics.auc(y_test, predictions),
            'G-mean': metrics.g_mean(y_test, predictions)
        }
        submit_result(trial, scores)