    def fit_predict(self, X_train, y_train, X_val, X_test, c_weight):
        self.classifier = NB()
        self.classifier.fit(X_train, y_train)
        self.test_y_predicted = self.classifier.predict(X_test)
        self.val_y_predicted = self.classifier.predict(X_val)
        return (X_train, X_val, X_test, self.val_y_predicted, self.test_y_predicted)
    def instantiate_naive_bayes(self):
        naive_bayes = []
        for i in range(self.num_of_classifier):
            t_naive_bayes = NB()
            naive_bayes.append(t_naive_bayes)

        return naive_bayes
def train(topSet, X, Y, test_size=testPercent, sample_weight=None):
    X_arr = np.array(X)
    Y_arr = np.array(Y)

    # split into train and test sets
    X_train, X_test, y_train, y_test = train_test_split(X_arr,
                                                        Y_arr,
                                                        test_size=test_size,
                                                        random_state=0)
    print("Training...")
    print("train set:")
    print("X: ", X_train.shape)
    print("Y: ", y_train.shape)
    # print("X[0]: ", X_train[0])
    clf = NB(alpha=1)
    clf.fit(X_train, y_train, sample_weight=sample_weight)

    # print(clf.coef_)
    # print(clf.intercept_)

    # test
    if test_size > 0:
        print("Testing...")
        print("test set:")
        print("X: ", X_test.shape)
        print("Y: ", y_test.shape)
        test_res = clf.predict(X_test)
        detail = "feature num: " + str(len(topSet)) + "\n"
        detail += "testPercent: " + str(test_size)
        showTestResult(test_res, y_test, clType='NB', title=detail)

    return clf
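
# Usage sketch (added, not from the original source): toy non-negative counts
# suit the multinomial NB above; showTestResult is assumed to come from the
# surrounding module.
if __name__ == "__main__":
    toy_X = [[1, 0, 2], [0, 3, 1], [2, 1, 0], [0, 2, 2]]
    toy_Y = [0, 1, 0, 1]
    toy_clf = train(["f1", "f2", "f3"], toy_X, toy_Y, test_size=0.25)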
Example #4
def stat_on_train(model, train_set, val_set, is_using_val_set=True):
    """
    Train a model on the training set, test it on the validation set,
    and return the test results together with the fitted model.
    :param str model: the classification model ("DT", "NB", or "KNN")
    :param list train_set: the training set instances
    :param list val_set: the validation set instances
    :param bool is_using_val_set: if True, the returned model is refit on
    all instances from both the training and validation sets; otherwise
    only the training set instances are used.
    """
    if model == "DT":
        model = DT()
    elif model == "KNN":
        model = KNN()
    elif model == "NB":
        model = NB()
    else:
        raise ValueError("unsupported model: expected 'DT', 'KNN', or 'NB'")
    xtrain = np.array([[float(i) for i in v[:-1]] for v in train_set])
    ytrain = np.array([v[-1] for v in train_set])
    xtest = np.array([[float(i) for i in v[:-1]] for v in val_set])
    ytest = np.array([v[-1] for v in val_set])
    clf = model.fit(xtrain, ytrain)
    ypred = clf.predict(xtest)
    if is_using_val_set:
        clf = model.fit(np.concatenate((xtrain, xtest), axis=0), np.concatenate((ytrain, ytest), axis=0))
    return get_stat(ytest, ypred), clf
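
# Usage sketch (added, not from the original source): instances are rows whose
# last element is the class label, matching the layout the function expects.
demo_train = [[0.1, 0.2, "a"], [0.9, 0.8, "b"], [0.2, 0.1, "a"], [0.8, 0.9, "b"]]
demo_val = [[0.15, 0.25, "a"], [0.85, 0.95, "b"]]
stats, clf = stat_on_train("NB", demo_train, demo_val, is_using_val_set=False)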
def scoring(train_X, train_Y):
    # sklearn.cross_validation was removed in 0.20; current releases host
    # cross_val_score in sklearn.model_selection
    score = cross_validation.cross_val_score(OneVsOneClassifier(NB()),
                                             train_X,
                                             train_Y,
                                             cv=5)
    #score = cross_validation.cross_val_score(OneVsOneClassifier(svm.LinearSVC(random_state=0)), train_X, train_Y, cv=5)
    print(score)
    print("average accuracy of naive Bayes:", score.mean())
Example #6
def test_iht_fit_resample_half():
    sampling_strategy = {0: 3, 1: 3}
    iht = InstanceHardnessThreshold(NB(),
                                    sampling_strategy=sampling_strategy,
                                    random_state=RND_SEED)
    X_resampled, y_resampled = iht.fit_resample(X, Y)
    assert X_resampled.shape == (6, 2)
    assert y_resampled.shape == (6, )
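# Note (added): sampling_strategy={0: 3, 1: 3} asks InstanceHardnessThreshold
# to keep three samples per class, which is why the resampled arrays have
# shapes (6, 2) and (6,).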
Example #7
def loop_for_computation(my_train_data, my_train_label, model_cnn, status,
                         iris_cifar):
    # Apply stratified shuffle splitting with 5 splits, as the question specifies
    lda = LDA()
    qda = QDA()
    nb = NB()
    rf = RandomForestClassifier(n_estimators=10,
                                criterion='entropy',
                                random_state=0)
    svm = SVC(kernel='rbf', random_state=0)
    dt = DecisionTreeClassifier(criterion='entropy', random_state=0)

    # CITATION: https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.StratifiedShuffleSplit.html
    # Even with different train/test sizes (e.g. 80% train / 20% test) the
    # output varies only slightly; I cross-verified by changing the sizes and
    # the fit behaves correctly.
    Kfold_stratified_shuffleop = StratifiedShuffleSplit(n_splits=5,
                                                        train_size=0.8,
                                                        test_size=0.2,
                                                        random_state=0)
    for training_values, testing_values in Kfold_stratified_shuffleop.split(
            my_train_data, my_train_label):

        #using the standard naming convention X_train X_test,y_train,y_test
        X_train, X_test = my_train_data[training_values], my_train_data[
            testing_values]
        y_train, y_test = my_train_label[training_values], my_train_label[
            testing_values]

        print("\n")
        print("TRAINING VALUES:", training_values, "TESTING VALUES:",
              testing_values)
        print("\n")

        if status == 3:
            print("ENABLING PCA")
            meshgrid_pca_analysis(X_train, X_test, y_train, y_test, lda, qda,
                                  nb, rf, dt, svm, 1, iris_cifar)
        elif status == 1:
            compute_logic_supervised_learning(X_train, X_test, y_train, y_test,
                                              lda, qda, nb, rf, dt, svm, 1)
        elif status == 2:
            cnn_split = list(
                StratifiedShuffleSplit(n_splits=2,
                                       test_size=0.1).split(X_train, y_train))
            idx_tr, idx_val = cnn_split[0]
            X_val, y_val = X_train[idx_val], y_train[idx_val]
            X_tr, y_tr = X_train[idx_tr], y_train[idx_tr]

            X_val = X_val.reshape(len(X_val), 32, 32, 3)
            X_tr = X_tr.reshape(len(X_tr), 32, 32, 3)
            X_test = X_test.reshape(len(X_test), 32, 32, 3)

            y_val = np_utils.to_categorical(y_val, 10)
            y_tr = np_utils.to_categorical(y_tr, 10)
            model_cnn.fit(X_tr, y_tr, validation_data=(X_val, y_val))
            model_cnn.predict(X_test)
        else:
            print("No proper selection")
    def naiveBayes(self, X_train, y_train, X_test, y_test):
        t1 = time()
        nb = NB()
        nb.fit(X_train, y_train)
        t2 = time()
        elapsed_time = t2 - t1
        accuracy = nb.score(X_test, y_test)
        print("Naive Bayes Classifier:\n\taccuracy score:{0:0.2f}\n\telapsed time:{1:0.2f} sec"
              .format(accuracy, elapsed_time))
        filename = "./pkl/NaiveBayes_training.pkl"
        pickle.dump(nb, open(filename, "wb"))
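
# Companion sketch (added): reload the pickled model saved above for reuse.
import pickle
with open("./pkl/NaiveBayes_training.pkl", "rb") as f:
    restored_nb = pickle.load(f)
# restored_nb.score(X_test, y_test) reproduces the accuracy printed earlier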
Example #9
    def naive_bayes(self):
        """Compute predictions with the naive Bayes algorithm.

        Parameters
        ----------
        Uses only class attributes.

        Returns
        -------
        None; stores the predictions in the _prediction attribute.
        """

        model = NB()
        model.fit(self._X_train, self._y_train)

        # predict() takes only the feature matrix, not the labels
        self._prediction = model.predict(self._X_val)
Example #10
def naive_bayes(x_train, y_train, bagging=False, boosting=False):
    from sklearn.naive_bayes import GaussianNB as NB
    nb = NB()
    if bagging and boosting:
        raise ValueError(
            "Can't have bagging and boosting enabled at the same time")
    if bagging:
        from sklearn.ensemble import BaggingClassifier
        model = BaggingClassifier(nb, max_samples=.5, max_features=.5)
    elif boosting:
        from sklearn.ensemble import AdaBoostClassifier
        model = AdaBoostClassifier(nb, algorithm="SAMME", n_estimators=300)
    else:  # plain naive Bayes, no ensembling
        model = nb
    model.fit(x_train, y_train)
    return model
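
# Usage sketch (added): the flags are mutually exclusive, so at most one
# ensemble wrapper is applied around the Gaussian NB base estimator.
import numpy as np
demo_X = np.random.rand(100, 4)
demo_y = np.random.randint(0, 2, 100)
plain = naive_bayes(demo_X, demo_y)
bagged = naive_bayes(demo_X, demo_y, bagging=True)
boosted = naive_bayes(demo_X, demo_y, boosting=True)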
Example #11
def StratifiedShuffleSplit_cross_validate_func_NaiveBayes(X, y, partitioner) -> (np.array, np.array, np.array):
    runs = 4
    accuracy_list = []
    error_rate_list = []
    NaiveBayes = np.empty([runs])
    for i in range(runs):
        NaiveBayes_results = cross_validate(NB(), X, y, scoring="accuracy", cv=partitioner)

        NaiveBayes[i] = np.mean(NaiveBayes_results["test_score"])
        error_rate_nb = 1 - NaiveBayes[i]
        print("NaiveBayes[i]:", NaiveBayes[i])
        print("error_rate_nb:", error_rate_nb)
        accuracy_list.append(NaiveBayes[i])
        error_rate_list.append(error_rate_nb)
    plt.plot(error_rate_list)
    plt.show()
    plt.plot(accuracy_list)
    plt.show()
    # the original returned nothing despite the annotation; return the scores
    return NaiveBayes, np.array(accuracy_list), np.array(error_rate_list)
Example #12
def main(k):
    x_train, x_test, y_train, y_test = get_classification_dataset(0.3)

    lr = k_fold(LogisticRegression(), x_train, y_train, k)
    print_errors(lr, x_train, y_train, x_test, y_test, msg='Logistic Regression', prf=True)

    lda = k_fold(LDA(), x_train, y_train, k)
    print_errors(lda, x_train, y_train, x_test, y_test, msg='Linear Discriminant Analysis', prf=True)

    qda = k_fold(QDA(), x_train, y_train, k)
    print_errors(qda, x_train, y_train, x_test, y_test, msg='Quadratic Discriminant Analysis', prf=True)

    gnb = k_fold(NB(), x_train, y_train, k)
    print_errors(gnb, x_train, y_train, x_test, y_test, msg='Gaussian Naive Bayes', prf=True)

    lreg = LinearRegression()
    lreg.fit(x_train, y_train)
    print_errors(lreg, x_train, y_train, x_test, y_test, msg='Linear Regression', prf=True)

    plt.show()
Example #13
def main(input_file=INPUT_FILE):

    # count_vect = CountVectorizer(stop_words='english')
    count_vect = CountVectorizer()

    data = pd.read_csv(input_file, sep=',', names=['label', 'text'])

    # NOTE: positive records (label 'bad') are fraud
    data['bin_target'] = data.label.apply(lambda k: k == 'bad')

    # preproc
    X_train_counts = count_vect.fit_transform(data.text)
    tf_transformer = TfidfTransformer(use_idf=False).fit(X_train_counts)
    X_train_tf = tf_transformer.transform(X_train_counts)

    # train classifier
    nb = NB()
    cv_scores = cross_val_score(nb, X_train_tf, data.bin_target,
                                cv=N_FOLDS)  #, scoring='roc_auc')

    print(nb)
    print(cv_scores)
    print(cv_scores.mean())
    print(cv_scores.std())
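
    # Pipeline sketch (added; an equivalent formulation, not the original
    # code): fitting the vectorizer inside the pipeline keeps each CV fold's
    # test vocabulary out of the training step.
    from sklearn.pipeline import Pipeline
    pipe = Pipeline([('vect', CountVectorizer()),
                     ('tf', TfidfTransformer(use_idf=False)),
                     ('nb', NB())])
    pipe_scores = cross_val_score(pipe, data.text, data.bin_target, cv=N_FOLDS)
    print(pipe_scores.mean())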
Example #14
	trdf , labels = get_data_frame(args['train'])
	tedf , truth = get_data_frame(args['test'])
	#fnames = get_fnames(args['fnames'])


	rf = RFC(n_estimators=50,
	 criterion='entropy', 
	 max_depth=None, 
	 min_samples_split=2, 
	  max_leaf_nodes=None,
	  class_weight = 'balanced'
	)
	svm_rbf = svm.SVC(C=20,kernel='rbf',gamma = 'auto',class_weight = 'balanced')
	svm_linear = svm.LinearSVC(C=1,class_weight = 'balanced')
	nb = NB()
	
	clfs = [rf,svm_rbf,svm_linear,nb]

	if args['trorte'] == 'cross':
		scoring = ['f1_macro','accuracy','precision_macro','recall_macro']
		for clf in clfs:
			print(clf) 
			scores = cross_validate(clf,trdf,labels,scoring = scoring,cv=10,return_train_score=False)
			for s in scores.keys():
				print(s)
				for v in scores[s]:
					print(v)
				print ('###############################')
			print ('$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$')
	elif args['trorte'] == 'test':
# store the type-III train/test files in the dictionary
train_test_files_dic[3]=[X_3_train, X_3_test, y_3_train, y_3_test]

# %% [markdown]
# # VI. Train-Test Pipeline

# %%
from sklearn.naive_bayes import GaussianNB as NB # import gaussian naive bayes classifier
from sklearn.tree import DecisionTreeClassifier as DTC # import decision tree classifier
from sklearn.linear_model import LogisticRegression as LR # import logistic regression classifier

from sklearn.metrics import accuracy_score as accuracy # import accuracy score
from sklearn.metrics import confusion_matrix as cm # import confusion matrix

# initialize models
Benchmark_model = NB()
Clf1 = DTC(random_state=337)
Clf2 = LR(random_state=337)
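
# %%
# Evaluation sketch (added; assumed, not from the original notebook): score the
# three models on each train/test file type stored in train_test_files_dic.
for file_type, (X_tr, X_te, y_tr, y_te) in train_test_files_dic.items():
    for mdl in (Benchmark_model, Clf1, Clf2):
        mdl.fit(X_tr, y_tr)
        preds = mdl.predict(X_te)
        print(file_type, type(mdl).__name__, accuracy(y_te, preds))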


# %%
# Define the adapted confusion matrix
def full_confusion_matrix(Df):
    # input:
    #   Df : pandas DataFrame, the contingency table produced by the confusion
    #        matrix defined earlier as cm

    columns = Df.columns  # activity names
    # add new columns containing detailed scores
    new_columns = list(columns) + ['data points number', 'precision %', 'sensitivity %', 'specificity %']

    # create the index from the same old columns and add another row called 'total'
Example #16
                                                    y,
                                                    test_size=.25,
                                                    random_state=123)

# %%
# Set the folds index to ensure comparable samples
fold_generator = KFold(n_splits=10, shuffle=True, random_state=1234)

#%%
pipe = Pipeline(steps=[('pre_process', pp.MinMaxScaler()), ('model', None)])

search_space = [

    # NaiveBayes
    {
        'model': [NB()]
    },

    # KNN with K tuning param
    {
        'model': [KNN()],
        'model__n_neighbors': [5, 10, 15, 20, 25, 30, 35, 40, 45, 50]
    },

    # Decision Tree with the Max Depth Param
    {
        'model': [DT()],
        'model__max_depth': [2, 3, 4, 5, 6, 7, 8, 9, 10]
    },

    # Random forest with the N Estimators tuning param
Example #17
    def fit(self, X, y):
        self.clf.fit(X, y)

    def predict(self, X):
        # predict in roughly sqrt(n) chunks to keep memory bounded
        m = int(X.shape[0]**(0.5))
        pred = []
        for I in range(m):
            pred.extend(
                self.clf.predict(X[I * X.shape[0] // m:(I + 1) * X.shape[0] // m]))
        return pred


# TODO: clean up these lines
clfOption = [Boosting(), LR(n_jobs=-1), NB(), LinearSVC(), Neighbors(), RFC()]
mrePred = []


# TODO: clean this function
def mrc(pred, Y):

    pred = array(pred)
    Y = array(Y)

    TP, FP, TN, FN = 0, 0, 0, 0

    for I in range(len(pred)):
        if pred[I] == Y[I]:
            if pred[I] == 1:
                TP += 1
Example #18
                new_sentence += " "
        data[sess].append(new_sentence)

    chronology = sorted(data.keys())

    date = [chronology[len(chronology) * i // 6 - 1] for i in range(1, 7)]

    clf_option = [
        Boosting(),
        LR(n_jobs=-1),
        NB(), LinearSVC(),
        Neighbors(),
        RFC()
    ]
    mre_pred = []

    for iter in tqdm(range(5)):
        query = "Select * from berita WHERE Date <= " + str(
            date[iter]) + " AND Title LIKE '%ekono%' "
        c.execute(query)
        train_data = c.fetchall()

        query = "Select * from berita WHERE Date <= " + str(
            date[iter]) + " AND NOT Title LIKE '%ekono%' "
        c.execute(query)
        train_data_unknown = c.fetchall()
Example #19
        data.loc[ind, 'precip.(mm)'] = 1  # .loc avoids pandas chained assignment
    elif (value > 4.0 and value <= 8.0):
        data.loc[ind, 'precip.(mm)'] = 2
    elif (value > 8.0):
        data.loc[ind, 'precip.(mm)'] = 3
data['humidity()'] = data['humidity()'].fillna(0).astype('int64')
# 'wind speed(mph)' appeared twice in the original column selection; once suffices
X = data[['temp(c)', 'pressure(mb)', 'humidity()', 'wind speed(mph)', 'wind dir.']]
y = data['precip.(mm)']
names = ["KNN", "SVM", "Decision Tree",
         "Neural Network", "Naive Bayesian"]
classifiers = [
    KNN(3),
    SVC(kernel="linear", C=0.025),
    DTC(max_depth=5),
    MLP(alpha=1, max_iter=1000),
    NB()]
x_train, x_test, y_train, y_test = ttl(X, y, test_size=0.3, random_state=1)
model_cols = []
comparison = pd.DataFrame(columns=model_cols)
index = 0
for name, clf in zip(names, classifiers):
    clf.fit(x_train, y_train)
    comparison.loc[index, 'Classifiers'] = name
    comparison.loc[index, 'Train Accuracy'] = clf.score(x_train, y_train)
    comparison.loc[index, 'Test Accuracy'] = clf.score(x_test, y_test)
    comparison.loc[index, 'Precision'] = precision_score(y_test, clf.predict(x_test), average='macro')
    comparison.loc[index, 'Recall'] = recall_score(y_test, clf.predict(x_test), average='macro')
    comparison.loc[index, 'F1 Score'] = f1_score(y_test, clf.predict(x_test), average='macro')
    index += 1
print(comparison)  # a bare expression only displays inside a notebook
Example #20
def get_model(model_name, feature):
    clf = None
    if model_name == "lr":
        if feature == "word":
            clf = LogisticRegression(penalty='l2',
                                     dual=True,
                                     fit_intercept=True,
                                     C=1,
                                     tol=0.0001,
                                     class_weight=None,
                                     random_state=None,
                                     intercept_scaling=0.1)
        elif feature == "length":
            clf = LogisticRegression(penalty='l2',
                                     dual=True,
                                     fit_intercept=True,
                                     C=0.09,
                                     tol=0.0001,
                                     class_weight=None,
                                     random_state=None,
                                     intercept_scaling=0.1)
        elif feature == "struct":
            clf = LogisticRegression(penalty='l2',
                                     dual=True,
                                     fit_intercept=True,
                                     C=2,
                                     tol=0.0001,
                                     class_weight=None,
                                     random_state=None,
                                     intercept_scaling=0.1)
        elif feature == "lsa":
            clf = LogisticRegression(penalty='l2',
                                     dual=True,
                                     fit_intercept=True,
                                     C=2,
                                     tol=0.0001,
                                     class_weight=None,
                                     random_state=None,
                                     intercept_scaling=0.1)
        else:
            sp = feature.split(',')
            if set(sp) == set(["word", "length", "struct"]):
                clf = LogisticRegression(penalty='l2',
                                         dual=True,
                                         fit_intercept=True,
                                         C=1,
                                         tol=0.0001,
                                         class_weight=None,
                                         random_state=None,
                                         intercept_scaling=0.2)
            elif set(sp) == set(["word", "length", "lsa"]):
                clf = LogisticRegression(penalty='l2',
                                         dual=True,
                                         fit_intercept=True,
                                         C=0.8,
                                         tol=0.0001,
                                         class_weight=None,
                                         random_state=None,
                                         intercept_scaling=0.2)
            elif set(sp) == set(["struct", "length", "lsa"]):
                clf = LogisticRegression(penalty='l2',
                                         dual=True,
                                         fit_intercept=True,
                                         C=2,
                                         tol=0.0001,
                                         class_weight=None,
                                         random_state=None,
                                         intercept_scaling=0.3)
            elif set(sp) == set(["struct", "length", "lsa", "word"]):
                clf = LogisticRegression(penalty='l2',
                                         dual=False,
                                         fit_intercept=True,
                                         C=3,
                                         tol=0.0001,
                                         class_weight=None,
                                         random_state=None,
                                         intercept_scaling=2)

            elif "word" in sp and "length" in sp:
                clf = LogisticRegression(penalty='l2',
                                         dual=True,
                                         fit_intercept=True,
                                         C=0.2,
                                         tol=0.0001,
                                         class_weight=None,
                                         random_state=None,
                                         intercept_scaling=0.2)
            elif "word" in sp and "struct" in sp:
                clf = LogisticRegression(penalty='l2',
                                         dual=True,
                                         fit_intercept=True,
                                         C=5,
                                         tol=0.0001,
                                         class_weight=None,
                                         random_state=None,
                                         intercept_scaling=0.2)
            elif "word" in sp and "lsa" in sp:
                clf = LogisticRegression(penalty='l2',
                                         dual=True,
                                         fit_intercept=True,
                                         C=2,
                                         tol=0.0001,
                                         class_weight=None,
                                         random_state=None,
                                         intercept_scaling=0.2)
            elif "length" in sp and "struct" in sp:
                clf = LogisticRegression(penalty='l2',
                                         dual=True,
                                         fit_intercept=True,
                                         C=0.08,
                                         tol=0.0001,
                                         class_weight=None,
                                         random_state=None,
                                         intercept_scaling=0.2)
            elif "length" in sp and "lsa" in sp:
                clf = LogisticRegression(penalty='l2',
                                         dual=True,
                                         fit_intercept=True,
                                         C=0.3,
                                         tol=0.0001,
                                         class_weight=None,
                                         random_state=None,
                                         intercept_scaling=0.2)
            elif "struct" in sp and "lsa" in sp:
                clf = LogisticRegression(penalty='l2',
                                         dual=True,
                                         fit_intercept=True,
                                         C=2.5,
                                         tol=0.0001,
                                         class_weight=None,
                                         random_state=None,
                                         intercept_scaling=0.2)
            else:
                clf = LogisticRegression(penalty='l2',
                                         dual=True,
                                         fit_intercept=True,
                                         C=0.09,
                                         tol=0.0001,
                                         class_weight=None,
                                         random_state=None,
                                         intercept_scaling=0.1)

    elif model_name == "nb":
        clf = NB()
    elif model_name == "knn":
        if feature == "lsa":
            clf = KNN(n_neighbors=60)
        else:
            clf = KNN(n_neighbors=120)

    elif model_name == "rf":
        clf = RF(n_estimators=1000,
                 max_features="auto",
                 max_depth=8,
                 min_samples_split=10,
                 min_samples_leaf=2)

    elif model_name == "gbdt":
        clf = GBDT(n_estimators=400,
                   max_features="auto",
                   max_depth=8,
                   min_samples_split=10,
                   min_samples_leaf=2)

    elif model_name == "svm":
        if feature == "word" or feature == "length":
            clf = svm.SVC(C=0.8, kernel='rbf', gamma=0.08)
        elif feature == "structure":
            clf = svm.SVC(C=0.1, kernel='rbf', gamma=0.08)
        else:
            sp = feature.split(',')
            if "struct" in sp and "lsa" in sp:
                clf = svm.SVC(C=0.9, kernel='rbf', gamma=0.08)
            else:
                clf = svm.SVC(C=3, kernel='rbf', gamma=0.08)
    else:
        print("你只能从LR,NB,RF几种模型里选择")
        sys.exit(1)
    return clf
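
# Usage sketch (added): model_name picks the estimator family and feature
# selects the matching hyperparameters, e.g. the KNN neighbor count for "lsa".
nb_clf = get_model("nb", "word")
knn_lsa_clf = get_model("knn", "lsa")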
Example #21
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis as LDA
# KNN : sklearn.neighbors.KNeighborsClassifier
from sklearn.neighbors import KNeighborsClassifier as KNN
# DTC : sklearn.tree.DecisionTreeClassifier
# also known as "classification and regression trees", CART
from sklearn.tree import DecisionTreeClassifier as DTC
# NB : sklearn.naive_bayes.GaussianNB
from sklearn.naive_bayes import GaussianNB as NB
# SVM : sklearn.svm.SVC
from sklearn.svm import SVC as SVM
_Models = {
    "LR": LR(),
    "LDA": LDA(),
    "KNN": KNN(),
    "DTC": DTC(),
    "NB": NB(),
    "SVM": SVM()
}
# Compare the evaluation results
print("Comparing the evaluation results and visualizing them...:")
_Algorithm_CMP_Results = []
_Algorithm_CMP_Result_List = []
_Result_File.write("Model name" + " " * 6 + "MEAN (accuracy)" + " " * 12 + "STD (standard deviation)\n")
print("Model name" + " " * 6 + "MEAN (accuracy)" + " " * 12 + "STD (standard deviation)")
for _Each in _Models:
    # KFold (sklearn.model_selection._split.KFold(n_splits=10, random_state=7)) and
    # LeaveOneOut (sklearn.model_selection._split.LeaveOneOut()) are two
    # alternative dataset-splitting strategies
    cv_results = model_selection.cross_val_score(  # cross_val_score : cross-validation
        _Models[_Each],
        X=_X_Train,
Example #22
    # print(y_test)
    # print(y_pre)
    # print("Accuracy: %f" % (cnt / len(y_test)))  # fraction of correct predictions
    return cnt / len(y_test)


if __name__ == '__main__':
    X, y = genData(argv[1], False)
    print("X.shape:", X.shape)
    print("y.shape:", y.shape)
    clas = []
    clas.append(["KNN", KNN(n_neighbors=6)])
    clas.append(["SVC", SVC()])
    clas.append(["DT", DT()])
    clas.append(["NB", NB()])
    # cla = KNN(n_neighbors=6)
    clfIdx = 0
    savedClfs = [None] * 4
    bestPre = [0] * 4
    for clf in clas:
        pres = []
        for i in range(2000):
            p = classify(X, y, clf[1])
            if p > bestPre[clfIdx]:
                bestPre[clfIdx] = p
                savedClfs[clfIdx] = clf[1]
            pres.append(p)
        print("%s precision: %f" % (clf[0], np.mean(pres)))
        joblib.dump(savedClfs[clfIdx], MODEL_DIR + clf[0] + '.pkl')
        clfIdx += 1
                        type=str,
                        default=os.environ['SM_CHANNEL_TRAIN'])

    # args holds all passed-in arguments
    args = parser.parse_args()

    # Read in csv training file
    training_dir = args.data_dir
    train_data = pd.read_csv(os.path.join(training_dir, "train.csv"),
                             header=None,
                             names=None)

    # Labels are in the first column
    train_y = train_data.iloc[:, 0]
    train_x = train_data.iloc[:, 1:]

    # Define Naive Bayes Classifier and hyperparameter tuner
    nbc = NB()
    model = GridSearchCV(
        estimator=nbc,
        n_jobs=3,
        verbose=10,
        param_grid={'var_smoothing': [1e-9, 1e-7, 1e-5, 1e-3]})

    model.fit(train_x, train_y)

    print('Best Parameters: ', model.best_params_)
    print('Best Estimator: ', model.best_estimator_)

    # Save the trained model
    joblib.dump(model, os.path.join(args.model_dir, "model.joblib"))
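
# Serving-side sketch (added; assumes the SageMaker sklearn model_fn
# convention): reload the tuned estimator saved above.
def model_fn(model_dir):
    return joblib.load(os.path.join(model_dir, "model.joblib"))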
    def _ModelSetting(self, model_name, cv_train_p=None):
        self.model_p = ''
        self.clf = None

        if model_name == 'K-MEANS':
            pars = [cv_train_p, 50000, 0.00001]
            self.model_p = '-'.join(str(p) for p in pars)
            self.clf = KMEANS(n_clusters=pars[0],
                              init='k-means++',
                              n_init=10,
                              max_iter=pars[1],
                              tol=pars[2],
                              precompute_distances='auto',
                              verbose=0,
                              random_state=None,
                              copy_x=True,
                              n_jobs=4)
        if model_name == 'K-MINI':
            pars = [cv_train_p, 10000, 0.0]
            self.model_p = '-'.join(str(p) for p in pars)
            self.clf = KMINI(n_clusters=pars[0],
                             init='k-means++',
                             max_iter=pars[1],
                             batch_size=100,
                             verbose=0,
                             compute_labels=True,
                             random_state=None,
                             tol=pars[2],
                             max_no_improvement=10,
                             init_size=None,
                             n_init=3,
                             reassignment_ratio=0.01)

        if model_name == 'PAC':
            self.clf = PAC(C=1.0,
                           fit_intercept=True,
                           n_iter=5,
                           shuffle=True,
                           verbose=0,
                           loss='hinge',
                           n_jobs=1,
                           random_state=None,
                           warm_start=False,
                           class_weight='balanced')
        if model_name == 'PCP':
            self.clf = PCP(penalty=None,
                           alpha=0.0001,
                           fit_intercept=True,
                           n_iter=20,
                           shuffle=False,
                           verbose=0,
                           eta0=1.0,
                           n_jobs=6,
                           random_state=0,
                           class_weight=None,
                           warm_start=False)
        if model_name == 'NB':
            self.clf = NB()

        if model_name == 'SGD':
            pars = [1e-4, None, 'hinge', 200]
            # loss = 'modified_huber', 'hinge' n_iter = 5
            self.model_p = '-'.join(str(p) for p in pars)
            self.clf = SGD(loss=pars[2],
                           penalty='l2',
                           alpha=pars[0],
                           l1_ratio=0.15,
                           fit_intercept=True,
                           n_iter=pars[3],
                           shuffle=True,
                           verbose=0,
                           epsilon=0.1,
                           n_jobs=1,
                           random_state=None,
                           learning_rate='optimal',
                           eta0=0.0,
                           power_t=0.5,
                           class_weight=pars[1],
                           warm_start=False,
                           average=False)
        if model_name == 'LSVC':
            pars = [1e-5, 1e-2, 'balanced', 2000]
            # 'crammer_singer'
            self.model_p = '-'.join(str(p) for p in pars)
            self.clf = LSVC(penalty='l2',
                            loss='squared_hinge',
                            dual=False,
                            tol=pars[0],
                            C=pars[1],
                            multi_class='ovr',
                            fit_intercept=True,
                            intercept_scaling=1,
                            class_weight=pars[2],
                            verbose=0,
                            random_state=None,
                            max_iter=pars[3])
        if model_name == 'CSVC':
            #pars = [8, 'rbf', 0.00048828125, 'balanced']
            pars = [1e2, 'linear', 1e-3, 'auto']
            self.model_p = '-'.join(str(p) for p in pars)
            self.clf = CSVC(C=pars[0],
                            kernel=pars[1],
                            degree=3,
                            gamma=pars[2],
                            coef0=0.0,
                            shrinking=True,
                            probability=True,
                            tol=1e-3,
                            cache_size=5000,
                            class_weight=pars[3],
                            verbose=False,
                            max_iter=-1,
                            random_state=None)
        if model_name == 'NSVC':
            #pars = [0.5, 'rbf', 0.00048828125, 'auto']
            pars = [0.5, 'rbf', 'auto', 'auto']
            self.model_p = '-'.join(str(p) for p in pars)
            self.clf = NSVC(nu=pars[0],
                            kernel=pars[1],
                            degree=3,
                            gamma=pars[2],
                            coef0=0.0,
                            shrinking=True,
                            probability=False,
                            tol=0.001,
                            cache_size=500,
                            class_weight=pars[3],
                            verbose=False,
                            max_iter=-1,
                            decision_function_shape=None,
                            random_state=None)
        if model_name == 'LR':
            pars = ['l2', 1e+2, 'balanced', 3000]
            self.model_p = '-'.join(str(p) for p in pars)
            self.clf = LR(penalty=pars[0],
                          dual=False,
                          tol=0.0001,
                          C=pars[1],
                          fit_intercept=True,
                          intercept_scaling=1,
                          class_weight=pars[2],
                          random_state=None,
                          solver='liblinear',
                          max_iter=pars[3],
                          multi_class='ovr',
                          verbose=0,
                          warm_start=False,
                          n_jobs=1)
        if model_name == 'LinR':
            pars = [True]
            self.model_p = '-'.join(str(p) for p in pars)
            self.clf = LinR(fit_intercept=True,
                            normalize=pars[0],
                            copy_X=True,
                            n_jobs=1)
        if model_name == 'DT':
            pars = [8, 'balanced']
            self.model_p = '-'.join(str(p) for p in pars)
            self.clf = DT(criterion='gini',
                          splitter='best',
                          max_depth=pars[0],
                          min_samples_split=1,
                          min_samples_leaf=1,
                          min_weight_fraction_leaf=0.0,
                          max_features=None,
                          random_state=None,
                          max_leaf_nodes=None,
                          class_weight=pars[1],
                          presort=False)
        if model_name == 'RF':
            pars = [5, 7, 'balanced']
            self.model_p = '-'.join(str(p) for p in pars)
            self.clf = RF(n_estimators=pars[0],
                          criterion='gini',
                          max_depth=pars[1],
                          min_samples_split=2,
                          min_samples_leaf=1,
                          min_weight_fraction_leaf=0.0,
                          max_features='auto',
                          max_leaf_nodes=None,
                          bootstrap=True,
                          oob_score=False,
                          n_jobs=2,
                          random_state=None,
                          verbose=0,
                          warm_start=False,
                          class_weight=pars[2])
        if model_name == 'ADA':
            pars = [13, 18, 0.05]
            self.model_p = '-'.join(str(p) for p in pars)
            self.clf = ADA(base_estimator=DT(max_depth=pars[0],
                                             class_weight='balanced'),
                           n_estimators=50,
                           learning_rate=1.0,
                           algorithm='SAMME.R',
                           random_state=None)
        if model_name == 'GBM':
            pars = [20, 0.03, 13]
            self.model_p = '-'.join(str(p) for p in pars)
            self.clf = GBM(loss='deviance',
                           learning_rate=pars[1],
                           n_estimators=pars[0],
                           subsample=1.0,
                           min_samples_split=2,
                           min_samples_leaf=1,
                           min_weight_fraction_leaf=0.0,
                           max_depth=pars[2],
                           init=None,
                           random_state=None,
                           max_features=None,
                           verbose=0,
                           max_leaf_nodes=None,
                           warm_start=False,
                           presort='auto')
Example #25
nonfrauds = train_tr[y == 0]
nonfraudresults = predresults[y == 0]
fp = nonfrauds[(nonfraudresults["true"] == 0)
               & (nonfraudresults["cvpredict"] == 1)]
plt.scatter(fp[:, xax], fp[:, yax], color="red")
plt.title("1,2PCA scores; yellow:fraud; blue:FN; red: FP")

print("")
print("")
# check some models and compare them with respect to F1, Acc and Profit
models = [
    LogisticRegression(C=10, solver='lbfgs'),
    SVM.SVC(gamma='auto'),
    DT(),
    KNN(5),
    NB()
]
#uses the profit provided by the teachers
cv_profits_for_models(models, train, y)

#profit for the perceptron learner

perc = PerceptronLearner(1000)
cv_profits_for_models([perc], train, y)

#%%

## Pocket Algorithm Prototype

import random
Example #26
# preprocessing data => converting reviews to token list of words
for rev in range(0, train.shape[0]):
    t, r = review_to_words(train["review"][rev])
    tokens.append(t)  # token list
    reviews.append(r)  # separate reviews

vocabulary = 5000  # max features
vectorizer = cv(analyzer="word",
                tokenizer=None,
                preprocessor=None,
                stop_words=None,
                max_features=vocabulary)

X = vectorizer.fit_transform(reviews).toarray()
Y = train["sentiment"]

validation_size = 0.20
X_train, X_validation, Y_train, Y_validation = model_selection.train_test_split(
    X, Y, test_size=validation_size)

#classifier = DecisionTreeClassifier()		#DTC
#classifier = SVC()				#SVM
#classifier = KNeighborsClassifier()		#KNN
classifier = NB(alpha=2)  # alpha=0 would mean no Laplace smoothing
classifier.fit(X_train, np.array(Y_train))

predictions = classifier.predict(X_validation)
# concatenating str with a float/array raises TypeError; print as arguments
print("Accuracy:", accuracy_score(Y_validation, predictions))
print("Confusion Matrix:\n", confusion_matrix(Y_validation, predictions))
print(classification_report(Y_validation, predictions))
Example #27
    def MS_nonoptimised(self, features_after, labels):
        for x in self.Tests:
            ScoreDT = []
            ScoreSVM = []
            ScoreRF = []
            ScoreNB = []
            for i in range(100):
                featuresTraining = features_after.sample(int(x))
                indexTraining = featuresTraining.index.tolist()
                labelsTraining = []
                for n in indexTraining:
                    labelsTraining.append(labels[n])

                clfTestDT = cross_val_score(DT(),
                                            featuresTraining,
                                            labelsTraining,
                                            cv=5).mean()
                ScoreDT.append(clfTestDT)

                clfTestSVM = cross_val_score(SVC(),
                                             featuresTraining,
                                             labelsTraining,
                                             cv=5).mean()
                ScoreSVM.append(clfTestSVM)

                clfTestRF = cross_val_score(RFC(),
                                            featuresTraining,
                                            labelsTraining,
                                            cv=5).mean()
                ScoreRF.append(clfTestRF)

                clfTestNB = cross_val_score(NB(),
                                            featuresTraining,
                                            labelsTraining,
                                            cv=5).mean()
                ScoreNB.append(clfTestNB)

            DTs = np.mean(ScoreDT)
            SVMs = np.mean(ScoreSVM)
            RFs = np.mean(ScoreRF)
            NBs = np.mean(ScoreNB)

            plt.scatter(x, DTs)
            plt.scatter(x, SVMs)
            plt.scatter(x, RFs)
            plt.scatter(x, NBs)
            plt.plot(x, DTs, '.b-')
            plt.plot(x, SVMs, '.y-')
            plt.plot(x, RFs, '.g-')
            plt.plot(x, NBs, '.r-')

        plt.title(
            'Monte Carlo simulation of accuracy on non optimised classifiers')
        plt.xlim([5, 405])
        blue_patch = mpatches.Patch(color='blue', label='Decision Trees')
        red_patch = mpatches.Patch(color='red', label='Naive Bayes')
        y_patch = mpatches.Patch(color='yellow',
                                 label='Support Vector Machine')
        g_patch = mpatches.Patch(color='green', label='Random Forest')
        plt.legend(handles=[g_patch, y_patch, blue_patch, red_patch],
                   loc='lower right')
        plt.ylim([0.3, 1.1])
        plt.ylabel('Accuracy')
        plt.xlabel('Number of Samples')
        plt.grid()
        plt.show()
def findStopWord(word):
    try:
        stopwords = json.load(codecs.open('stopwords.json', 'r', 'utf-8-sig'))
    except Exception:
        print("Loading stopwords.json failed")
        return False

    # the original returned False after checking only the first language;
    # every language must be inspected before giving up
    for lang in stopwords:
        if word in stopwords[lang]:
            return True
    return False
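
# Variant sketch (added): load the stopword lists once at module level instead
# of re-reading the JSON file on every lookup.
_STOPWORDS = json.load(codecs.open('stopwords.json', 'r', 'utf-8-sig'))

def find_stop_word_cached(word):
    return any(word in _STOPWORDS[lang] for lang in _STOPWORDS)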


X_Training, y_Training, X_Test, y_Test = loadBoWTry()

classifier = NB(alpha=0.03)
print("Fitting NB model\n")
classifier.fit(X_Training, y_Training)

# Predict Class
print("Predict class\n")
y_Predicted = classifier.predict(X_Test)

# Accuracy
print(np.shape(X_Test), np.shape(y_Predicted))
accuracy = accuracy_score(y_Test, y_Predicted)

print("Accuracy %f" % accuracy)

# np.save wraps the estimator in a 0-d object array; joblib.dump is the more
# conventional way to persist a fitted sklearn model
np.save("naivebayesclassifier", classifier)
#sentence = "ciao sono dario e ho ventiquattro anni, posto molto bello da vedere"
    chronology = sorted(data.keys())

    date = [chronology[len(chronology) * i // 6-1] for i in range(1,7)]

    del data, label
    gc.collect()

    clf_option = [
        Boosting(),
        LR(n_jobs = -1),
        NB(),
        LinearSVC(),
        Neighbors(),
        RFC()
    ]

    mre_pred = []

    for iter in tqdm(range(5)):
        if settings.DEBUG_MODE:
            print("Memulai pengambilan data")

        mre_total = []
        query = "Select * from berita WHERE Date <= "+str(date[iter])
        c.execute(query)
        train_data = c.fetchall()
Example #30
def run():
    while True:
        trial = pull_pending()

        if trial is None:
            break

        params = eval(trial['Parameters'])  # trusted input only; eval runs arbitrary code

        logging.info(trial)

        dataset = load(trial['Dataset'])
        fold = int(trial['Fold']) - 1

        (X_train, y_train), (X_test,
                             y_test) = dataset[fold][0], dataset[fold][1]

        n_minority = Counter(y_train).most_common()[1][1]
        n_majority = Counter(y_train).most_common()[0][1]

        imblearn_ratios = [
            ((n_majority - n_minority) * ratio + n_minority) / n_majority
            for ratio in [0.5, 0.75, 1.0]
        ]
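        # Worked example of the ratio formula (added; assumed numbers): with
        # n_majority = 90 and n_minority = 10, ratio 0.5 yields
        # ((90 - 10) * 0.5 + 10) / 90 = 50 / 90 ≈ 0.556, closing half the
        # class-size gap, while ratio 1.0 yields exactly 1.0 (fully balanced).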

        clf = {
            'NB': NB(),
            'KNN': KNN(),
            'SVM': SVM(gamma='scale'),
            'CART': CART()
        }[params['classifier']]

        if (trial['Algorithm'] is None) or (trial['Algorithm'] == 'None'):
            algorithm = None
        else:
            algorithms = {
                'AKNN':
                ResamplingCV(AKNN, clf, n_neighbors=[1, 3, 5, 7]),
                'Bord':
                ResamplingCV(SMOTE,
                             clf,
                             kind=['borderline1'],
                             k_neighbors=[1, 3, 5, 7, 9],
                             m_neighbors=[5, 10, 15],
                             sampling_strategy=imblearn_ratios),
                'CC':
                ResamplingCV(CC, clf, sampling_strategy=imblearn_ratios),
                'CNN':
                ResamplingCV(CNN, clf, n_neighbors=[1, 3, 5, 7]),
                'ENN':
                ResamplingCV(ENN, clf, n_neighbors=[1, 3, 5, 7]),
                'IHT':
                ResamplingCV(IHT,
                             clf,
                             sampling_strategy=imblearn_ratios,
                             cv=[2]),
                'NCL':
                ResamplingCV(NCL, clf, n_neighbors=[1, 3, 5, 7]),
                'NM':
                ResamplingCV(NM, clf, n_neighbors=[1, 3, 5, 7]),
                'OSS':
                ResamplingCV(OSS, clf, n_neighbors=[1, 3, 5, 7]),
                'RBO':
                ResamplingCV(RBO,
                             clf,
                             gamma=[0.01, 0.1, 1.0, 10.0],
                             ratio=[0.5, 0.75, 1.0]),
                'RBU':
                ResamplingCV(RBU,
                             clf,
                             gamma=params.get('gamma'),
                             ratio=params.get('ratio')),
                'RENN':
                ResamplingCV(RENN, clf, n_neighbors=[1, 3, 5, 7]),
                'ROS':
                ResamplingCV(ROS, clf, sampling_strategy=imblearn_ratios),
                'RUS':
                ResamplingCV(RUS, clf, sampling_strategy=imblearn_ratios),
                'SMOTE':
                ResamplingCV(SMOTE,
                             clf,
                             k_neighbors=[1, 3, 5, 7, 9],
                             sampling_strategy=imblearn_ratios),
                'SMOTE+ENN':
                ResamplingCV(
                    SMOTEENN,
                    clf,
                    smote=[SMOTE(k_neighbors=k) for k in [1, 3, 5, 7, 9]],
                    sampling_strategy=imblearn_ratios),
                'SMOTE+TL':
                ResamplingCV(
                    SMOTETomek,
                    clf,
                    smote=[SMOTE(k_neighbors=k) for k in [1, 3, 5, 7, 9]],
                    sampling_strategy=imblearn_ratios),
                'TL':
                TL(),
            }

            algorithm = algorithms.get(trial['Algorithm'])

            if algorithm is None:
                raise NotImplementedError

        if algorithm is not None:
            X_train, y_train = algorithm.fit_sample(X_train, y_train)

        clf = clf.fit(X_train, y_train)
        predictions = clf.predict(X_test)

        scores = {
            'Precision': metrics.precision(y_test, predictions),
            'Recall': metrics.recall(y_test, predictions),
            'F-measure': metrics.f_measure(y_test, predictions),
            'AUC': metrics.auc(y_test, predictions),
            'G-mean': metrics.g_mean(y_test, predictions)
        }

        submit_result(trial, scores)