Example #1
def read_results(data, model_name):
    with open('data.json') as data_json:
        data_params = json.load(data_json)

    # Prepare data
    data_path = os.path.join(DATA_PATH, data_params['data'][data]['file_name'])
    print('Read file: {}'.format(data_path))
    X, y = load_csv(data_path)

    # Apply scaling
    scaler = MinMaxScaler().fit(X)
    X = scaler.transform(X)

    n_test = data_params['data'][data]['n_test']
    random_state = RANDOM_STATE
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=n_test, random_state=random_state)

    model = ExtraTreeClassifier(random_state=RANDOM_STATE)
    model.fit(X_train, y_train)
    acc_train = model.score(X_train, y_train)
    acc_test = model.score(X_test, y_test)
    print('Train Acc: {:.4f}, Test Acc: {:.4f}'.format(acc_train, acc_test))

    df = pd.DataFrame(columns=COLUMNS)
    for attack in ATTACKS_NUM:
        for defence in DEFENCES_NUM:
            try:
                df = get_dataframe_sklearn(df, model, data, model_name, attack,
                                           defence)
            except FileNotFoundError as err:
                print(err)
                continue

    # These attacks have no hyperparameters
    df.loc[(df['Attack'] == 'boundary') | (df['Attack'] == 'tree'),
           'Adv_param'] = np.nan

    output_file = os.path.join(
        OUTPUT_PATH, '{}_{}_{}.csv'.format(data, model_name, VERSION))
    df.to_csv(output_file)
    print('Save to:', output_file)
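
A hedged usage sketch (not from the source): calling read_results assumes the module-level constants (DATA_PATH, OUTPUT_PATH, RANDOM_STATE, COLUMNS, ATTACKS_NUM, DEFENCES_NUM, VERSION), the helpers load_csv and get_dataframe_sklearn, and a data.json with matching keys are all in scope; the key 'iris' and the label 'extra_tree' below are hypothetical.

# Hypothetical invocation: 'iris' must exist under data.json's 'data' section,
# and 'extra_tree' only labels the output CSV.
read_results('iris', 'extra_tree')
# Side effects: prints train/test accuracy, then writes
# OUTPUT_PATH/iris_extra_tree_<VERSION>.csv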
Example #2
def trees_models(x_train, y_train):
    from sklearn.tree import DecisionTreeClassifier
    classifier1 = DecisionTreeClassifier(criterion='entropy', random_state=0)
    classifier1.fit(x_train, y_train)

    from sklearn.tree import ExtraTreeClassifier
    classifier2 = ExtraTreeClassifier()
    classifier2.fit(x_train, y_train)

    print('DecisionTreeClassifier training accuracy: ',
          classifier1.score(x_train, y_train))
    print('ExtraTreeClassifier training accuracy: ',
          classifier2.score(x_train, y_train))

    return classifier1, classifier2
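
A quick sanity-check sketch on a toy dataset, assuming nothing beyond scikit-learn. Note the naming trap the original print label ran into: sklearn.tree.ExtraTreeClassifier is a single randomized tree, distinct from the ensemble sklearn.ensemble.ExtraTreesClassifier.

# Toy run of trees_models() on iris (illustrative only).
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split

X, y = load_iris(return_X_y=True)
x_train, x_test, y_train, y_test = train_test_split(X, y, random_state=0)
dt, et = trees_models(x_train, y_train)
print('DecisionTree test accuracy: ', dt.score(x_test, y_test))
print('ExtraTree test accuracy: ', et.score(x_test, y_test))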
def myclassify(numfiers=5, xtrain=xtrain, ytrain=ytrain, xtest=xtest, ytest=ytest):
    count = 0

    bagging2 = BaggingClassifier(ETC(), bootstrap=False, bootstrap_features=False)
    bagging2.fit(xtrain, ytrain)
    #print(bagging2.score(xtest, ytest))
    count += 1
    classifiers = [bagging2.score(xtest, ytest)]

    if count < numfiers:

        tree2 = ETC()
        tree2.fit(xtrain, ytrain)
        #print(tree2.score(xtest, ytest))
        count += 1
        classifiers = np.append(classifiers, tree2.score(xtest, ytest))
        print("1")
        print(tree2.score(xtest, ytest))

    if count < numfiers:
        bagging1 = BaggingClassifier(ETC())
        bagging1.fit(xtrain, ytrain)
        #print(bagging1.score(xtest, ytest))
        count += 1
        classifiers = np.append(classifiers, bagging1.score(xtest, ytest))
        print("2")
        print(bagging1.score(xtest, ytest))

#     if count < numfiers:
#         # votingClassifiers combine completely different machine learning classifiers and use a majority vote
#         clff1 = SVC()
#         clff2 = RFC(bootstrap=False)
#         clff3 = ETC()
#         clff4 = neighbors.KNeighborsClassifier()
#         clff5 = quadda()
#         print"3"


#         eclf = VotingClassifier(estimators = [('svc',clff1),('rfc',clff2),('etc',clff3),('knn',clff4),('qda',clff5)])
#         eclf = eclf.fit(xtrain,ytrain)
#         #print(eclf.score(xtest,ytest))
#         # for claf, label in zip([clff1,clff2,clff3,clff4,clff5,eclf],['SVC','RFC','ETC','KNN','QDA','Ensemble']):
#         #     cla
#         #     scores = crossvalidation.cross_val_score(claf,xtrain,ytrain,scoring='accuracy')
#         #     print ()
#         count+=1
#         classifiers = np.append(classifiers,eclf.score(xtest,ytest))


#     if count < numfiers:
#         svc1 = SVC()
#         svc1.fit(xtrain,ytrain)
#         dec = svc1.score(xtest,ytest)
#         count+=1
#         classifiers = np.append(classifiers,svc1.score(xtest,ytest))
#         print "3"

    if count < numfiers:
        # Quadratic discriminant analysis: classifier with a quadratic decision boundary
        qda = quadda()
        qda.fit(xtrain, ytrain)
        #print(qda.score(xtest, ytest))
        count += 1
        classifiers = np.append(classifiers, qda.score(xtest, ytest))
        print("4")


    if count < numfiers:

        tree1 = DTC()
        tree1.fit(xtrain,ytrain)
        #print tree1.fit(xtrain,ytrain)
        #print tree1.score(xtest,ytest)
        count+=1
        classifiers = np.append(classifiers,tree1.score(xtest,ytest))

    if count < numfiers:
        # k-nearest neighbours: classifies by majority vote of the k nearest
        # training points, where k is chosen by the user
        knn1 = neighbors.KNeighborsClassifier()
        knn1.fit(xtrain,ytrain)
        #print(knn1.score(xtest,ytest))
        count+=1
        classifiers = np.append(classifiers,knn1.score(xtest,ytest))

    if count < numfiers:
        # Linear discriminant analysis: classifier with a linear decision boundary
        lda = linda()
        lda.fit(xtrain,ytrain)
        #print(lda.score(xtest,ytest))
        count+=1
        classifiers = np.append(classifiers,lda.score(xtest,ytest))

    if count < numfiers:
        tree3 = RFC()
        tree3.fit(xtrain,ytrain)
        #print tree3.score(xtest,ytest)
        count+=1
        classifiers = np.append(classifiers,tree3.score(xtest,ytest))

    if count < numfiers:
        bagging3 = BaggingClassifier(RFC(),bootstrap=False,bootstrap_features=False)
        bagging3.fit(xtrain,ytrain)
        #print bagging3.score(xtest,ytest)
        count+=1
        classifiers = np.append(classifiers,bagging3.score(xtest,ytest))


    if count < numfiers:
        bagging4 = BaggingClassifier(SVC(),bootstrap=False,bootstrap_features=False)
        bagging4.fit(xtrain,ytrain)
        #print bagging4.score(xtest,ytest)
        count+=1
        classifiers = np.append(classifiers,bagging4.score(xtest,ytest))

    if count < numfiers:
        tree4 = RFC(bootstrap=False)
        tree4.fit(xtrain,ytrain)
        #print tree4.score(xtest,ytest)
        count+=1
        classifiers = np.append(classifiers,tree4.score(xtest,ytest))

    if count < numfiers:
        tree6 = GBC()
        tree6.fit(xtrain,ytrain)
        #print(tree6.score(xtest,ytest))
        count+=1
        classifiers = np.append(classifiers,tree6.score(xtest,ytest))

    if count < numfiers:
        knn2 = neighbors.KNeighborsClassifier(n_neighbors = 10)
        knn2.fit(xtrain,ytrain)
        #print(knn2.score(xtest,ytest))
        count+=1
        classifiers = np.append(classifiers,knn2.score(xtest,ytest))

    if count < numfiers:
        knn3 = neighbors.KNeighborsClassifier(n_neighbors = 3)
        knn3.fit(xtrain,ytrain)
        #print(knn3.score(xtest,ytest))
        count+=1
        classifiers = np.append(classifiers,knn3.score(xtest,ytest))

    if count < numfiers:
        knn4 = neighbors.KNeighborsClassifier(algorithm = 'ball_tree')
        knn4.fit(xtrain,ytrain)
        #print(knn4.score(xtest,ytest))
        count+=1
        classifiers = np.append(classifiers,knn4.score(xtest,ytest))

    if count < numfiers:
        knn5 = neighbors.KNeighborsClassifier(algorithm = 'kd_tree')
        knn5.fit(xtrain,ytrain)
        #print(knn5.score(xtest,ytest))
        count+=1
        classifiers = np.append(classifiers,knn5.score(xtest,ytest))

    if count < numfiers:
        ncc1 = NearestCentroid()
        ncc1.fit(xtrain,ytrain)
        #print (ncc1.score(xtest,ytest))
        count+=1
        classifiers = np.append(classifiers,ncc1.score(xtest,ytest))

    if count < numfiers:
        # Nearest shrunken centroid: try several shrink thresholds; only the
        # last model fitted (shrink_threshold=0.5) is scored and recorded.
        for shrinkage in [None, 0.05, 0.1, 0.2, 0.3, 0.4, 0.5]:
            ncc2 = NearestCentroid(shrink_threshold=shrinkage)
            ncc2.fit(xtrain, ytrain)
            #print(ncc2.score(xtest, ytest))

        count += 1
        classifiers = np.append(classifiers, ncc2.score(xtest, ytest))

    if count < numfiers:
        tree5 = ABC()
        tree5.fit(xtrain,ytrain)
        #print(tree5.score(xtest,ytest))
        count+=1
        classifiers = np.append(classifiers,tree5.score(xtest,ytest))

    classifierlabel = ["BaggingETC (with bootstraps set to false)","ETC","BaggingETC","Voting Classifier","svm","QDA","DTC","KNN (default)","LDA","RFC",
                       "BaggingRFC (with bootstraps set to false)","BaggingSVC (with bootstraps set to false)","RFC (bootstrap false)","GBC",
                        "knn (n_neighbors = 10)","knn (n_neighbors = 3)","knn (ball tree algorithm)","knn (kd_tree algorithm)",
                       "Nearest Centroid","Shrunken Centroid?","ABC"]


    classifierlabel = classifierlabel[:len(classifiers)]
    #print(len(classifiers))
    #print(classifiers)
    for i in range(len(classifiers)):
        print("{} classifier has percent correct {}".format(classifierlabel[i], classifiers[i]))
Example #4
def main():
    # filepath: sentence data file path
    # vecfile: word vector file path, pre-generated elsewhere
    # vectype: vectorization method -- 1: average, 2: average + one-line tf-idf,
    #          3: average + whole-data tf-idf
    # vec_path: directory where the generated vector files are saved

    filepath = '/home/junlinux/Desktop/CSCI544_Last/hw7/data/stem_testdata'  # 'data/data_test'
    vecfile = '/home/junlinux/Desktop/CSCI544_Last/hw7/data/glove.6B/glove.6B.50d.txt'

    vec_files = [
        '/home/junlinux/Desktop/CSCI544_Last/hw7/data/glove.6B/glove.6B.50d.txt',
        '/home/junlinux/Desktop/CSCI544_Last/hw7/data/glove.6B/glove.6B.100d.txt',
        '/home/junlinux/Desktop/CSCI544_Last/hw7/data/glove.6B/glove.6B.200d.txt',
        '/home/junlinux/Desktop/CSCI544_Last/hw7/data/glove.6B/glove.6B.300d.txt',
        '/home/junlinux/Desktop/CSCI544_Last/hw7/data/glove.6B/glove.42B.300d.txt',
        '/home/junlinux/Desktop/CSCI544_Last/hw7/data/glove.6B/glove.840B.300d.txt'
    ]
    # Relative paths were hitting permission errors (cause unknown),
    # so we use absolute paths for now.
    vec_path = '/home/junlinux/Desktop/CSCI544_Last/hw7/data/word_vector/'

    # Here we choose the type of vectorization; there are six word vector
    # files downloaded from GloVe.
    """
    vectype = 1
    for v in vec_files:
        start_time = time.time()
        name = v.split('/')[-1][:-4] + '_vec'
        print(name, 'vectorization in process')
        word_vec_gen(filepath, v, vectype, vec_path+name)
        print("--- %s seconds ---" % (time.time() - start_time))

    vectype = 2
    for v in vec_files:
        start_time = time.time()
        name = v.split('/')[-1][:-4] + '_vec_OnelineTF'
        print(name, 'vectorization in process')
        word_vec_gen(filepath, v, vectype, vec_path + name)
        print("--- %s seconds ---" % (time.time() - start_time))

    vectype = 3
    for v in vec_files:
        start_time = time.time()
        name = v.split('/')[-1][:-4] + '_vec_WholeDataTF'
        print(name, 'vectorization in process')
        word_vec_gen(filepath, v, vectype, vec_path + name)
        print("--- %s seconds ---" % (time.time() - start_time))
    """

    # From here on is scratch code that will eventually be erased.

    filepath = '/home/junlinux/Desktop/CSCI544_Last/hw7/data/data_test'  # 'data/stem_testdata'
    #filepath = '/home/junlinux/Desktop/CSCI544_Last/hw7/data/hyp1-hyp2-ref'
    vectype = 1
    start_time = time.time()
    name = vecfile.split('/')[-1][:-4] + '_vec_diffOrder'
    #print(name, 'vectorization in process')
    #word_vec_gen(filepath, vecfile, vectype, vec_path + name)
    #print("--- %s seconds ---" % (time.time() - start_time))

    filepath = '/home/junlinux/Desktop/CSCI544_Last/hw7/data/data_test'  # 'data/stem_testdata'
    vectype = 2
    start_time = time.time()
    name = vecfile.split('/')[-1][:-4] + '_vec_OnelineTF'
    #print(name, 'vectorization in process')
    #word_vec_gen(filepath, vecfile, vectype, vec_path + name)
    #print("--- %s seconds ---" % (time.time() - start_time))

    filepath = '/home/junlinux/Desktop/CSCI544_Last/hw7/data/data_test'  # 'data/stem_testdata'
    vectype = 3
    start_time = time.time()
    name = vecfile.split('/')[-1][:-4] + '_vec_WholeDataTF'
    #print(name, 'vectorization in process')
    #word_vec_gen(filepath, vecfile, vectype, vec_path + name)
    #print("--- %s seconds ---" % (time.time() - start_time))

    vec_path = 'data/word_vector/glove.6B.50d_vec_diffOrder'
    wvec = load_wordvec(vec_path)
    target_path = 'data/dev.answers'
    answer = load_target(target_path)

    from sklearn.model_selection import train_test_split
    from sklearn.naive_bayes import BernoulliNB
    from sklearn.tree import DecisionTreeClassifier
    from sklearn.tree import ExtraTreeClassifier
    from sklearn.ensemble import RandomForestClassifier
    from sklearn.ensemble import GradientBoostingClassifier
    from sklearn.neural_network import MLPClassifier
    from sklearn.neighbors import KNeighborsClassifier
    from sklearn.linear_model import LogisticRegression
    from sklearn.svm import NuSVC
    from sklearn.multiclass import OneVsOneClassifier
    from sklearn.multiclass import OneVsRestClassifier
    from sklearn.svm import LinearSVC

    clf1 = KNeighborsClassifier()
    clf2 = DecisionTreeClassifier()
    clf3 = ExtraTreeClassifier()
    clf4 = MLPClassifier()
    clf5nu = NuSVC()
    clf6lin = LinearSVC()
    # 'sag', 'saga' and 'lbfgs'

    print("Training Starts")
    X_train, X_test, y_train, y_test = train_test_split(wvec,
                                                        answer,
                                                        test_size=0.10,
                                                        random_state=42)
    clf1.fit(X_train, y_train)
    print('KNeighborsClassifier score 50d', clf1.score(X_test, y_test))
    clf2.fit(X_train, y_train)
    print('DecisionTreeClassifier score 50d', clf2.score(X_test, y_test))
    clf3.fit(X_train, y_train)
    print('ExtraTreeClassifier score 50d', clf3.score(X_test, y_test))
    clf4.fit(X_train, y_train)
    print('MLPClassifier score 50d', clf4.score(X_test, y_test))

    clf1 = OneVsRestClassifier(KNeighborsClassifier())
    clf2 = OneVsRestClassifier(DecisionTreeClassifier())
    clf3 = OneVsRestClassifier(ExtraTreeClassifier())
    clf4 = OneVsRestClassifier(MLPClassifier())
    clf5 = OneVsOneClassifier(NuSVC())
    clf6 = OneVsRestClassifier(LinearSVC())

    from sklearn.linear_model import SGDClassifier
    from sklearn.linear_model import Perceptron
    from sklearn.linear_model import PassiveAggressiveClassifier
    clf7 = OneVsRestClassifier(SGDClassifier())
    clf8 = OneVsRestClassifier(Perceptron())
    clf9 = OneVsRestClassifier(PassiveAggressiveClassifier())

    print('One vs Rest methods case::')
    print('KNeighborsClassifier score 50d',
          clf1.fit(X_train, y_train).score(X_test, y_test))
    print('DecisionTreeClassifier score 50d',
          clf2.fit(X_train, y_train).score(X_test, y_test))
    print('ExtraTreeClassifier score 50d',
          clf3.fit(X_train, y_train).score(X_test, y_test))
    print('MLPClassifier score 50d',
          clf4.fit(X_train, y_train).score(X_test, y_test))

    print('SGDClassifier score 50d',
          clf7.fit(X_train, y_train).score(X_test, y_test))
    print('Perceptron score 50d',
          clf8.fit(X_train, y_train).score(X_test, y_test))
    print('PassiveAggressiveClassifier score 50d',
          clf9.fit(X_train, y_train).score(X_test, y_test))

    print('NuSVC score 50d', clf5.fit(X_train, y_train).score(X_test, y_test))
    print('LinearSVC score 50d',
          clf6.fit(X_train, y_train).score(X_test, y_test))

    clf5nu.fit(X_train, y_train)
    print('NuSVC score 50d', clf5nu.score(X_test, y_test))
    clf6lin.fit(X_train, y_train)
    print('LinearSVC score 50d', clf6lin.score(X_test, y_test))

    # Unfinished RFECV feature-selection experiment: only the estimator is set up.
    from sklearn.datasets import make_friedman1
    from sklearn.feature_selection import RFECV
    estimator = DecisionTreeClassifier()
from sklearn.tree import DecisionTreeClassifier as DTC
tree1 = DTC()
print(tree1)
tree1.fit(xtrain, ytrain1)
print(tree1.score(xtest, ytest1))


# In[22]:

from sklearn.tree import ExtraTreeClassifier as ETC
tree2 = ETC()
print(tree2)
tree2.fit(xtrain, ytrain1)
print(tree2.score(xtest, ytest1))


# In[23]:

from sklearn.ensemble import BaggingClassifier
bagging1 = BaggingClassifier(ETC())
bagging1.fit(xtrain, ytrain1)
print(bagging1.score(xtest, ytest1))


# In[24]:

bagging2 = BaggingClassifier(ETC(),bootstrap=False,bootstrap_features=False)
bagging2.fit(xtrain,ytrain1)
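
For context on the bootstrap flags used in these cells: in BaggingClassifier, bootstrap=False makes each base estimator train on samples drawn without replacement, and bootstrap_features controls the same for feature sampling. A self-contained sketch on a toy dataset (toy data only, not the notebook's xtrain):

from sklearn.datasets import load_iris
from sklearn.ensemble import BaggingClassifier
from sklearn.model_selection import train_test_split
from sklearn.tree import ExtraTreeClassifier

X, y = load_iris(return_X_y=True)
X_tr, X_te, y_tr, y_te = train_test_split(X, y, random_state=0)
for bootstrap in (True, False):
    # bootstrap=True resamples with replacement; False draws without replacement.
    bag = BaggingClassifier(ExtraTreeClassifier(), bootstrap=bootstrap, random_state=0)
    bag.fit(X_tr, y_tr)
    print('bootstrap={}: test accuracy {:.3f}'.format(bootstrap, bag.score(X_te, y_te)))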
Example #6
# Split df by row index into train and test sets.
train_set = df.iloc[:train_data_len, :]
test_set = df.iloc[train_data_len:, :]

#print(train_set.head(5))
train_x = train_set.iloc[:, 0:6]
train_y = train_set.iloc[:, 6:]

#print(type(train_y))

#train_y.reshape(len(train_y), )

#print(train_y.head(5))
test_x = test_set.iloc[:, 0:6]
test_y = test_set.iloc[:, 6:]

#test_y.reshape(len(test_y), )

#print(train_x.head(5))
#print(train_y.head(5))

from sklearn.tree import ExtraTreeClassifier
classifier = ExtraTreeClassifier(random_state=0,
                                 criterion="entropy",
                                 splitter="best")

classifier.fit(train_x, train_y.values.ravel())

info = classifier.score(test_x, test_y.values.ravel())

print(info)
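
The .values.ravel() calls above flatten the one-column label DataFrame into the 1-D array scikit-learn expects; a tiny stand-alone illustration (the frame below is made up):

import pandas as pd

labels = pd.DataFrame({'y': [0, 1, 1, 0]})
print(labels.values.shape)          # (4, 1): 2-D, triggers a DataConversionWarning in fit()
print(labels.values.ravel().shape)  # (4,): the 1-D shape fit()/score() expect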
from AppleStore_Milestone2 import *
from sklearn.model_selection import train_test_split
from sklearn.ensemble import BaggingClassifier
from sklearn.tree import ExtraTreeClassifier
import joblib
import time

print('\t\t\t Extra Tree Classifier Model \t\t\t')
print('*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*')

start_t = time.time()
ExtraTreeClassifierModel = ExtraTreeClassifier(random_state=0, max_depth=12)
ExtraTreeClassifierModel = BaggingClassifier(ExtraTreeClassifierModel, random_state=0).fit(X_train, Y_train)
end_t = time.time()

max_d = 12
print('max depth = ', max_d, '\n accuracy of training is : ',
      ExtraTreeClassifierModel.score(X_train, Y_train), '\n training time = ', end_t - start_t)

start_t = time.time()
acc = ExtraTreeClassifierModel.score(X_test, Y_test)
end_t = time.time()

print('accuracy of testing is : ', acc, '\n testing time = ', end_t - start_t)
joblib.dump(ExtraTreeClassifierModel, 'joblib_ExtraTreeClassifierModel.pkl')



# loaded_model = joblib.load('joblib_ExtraTreeClassifierModel.pkl')
# predict = loaded_model.predict(X_test)
# accuracy = loaded_model.score(X_test, Y_test)
# print('Decision tree accuracy test : ' + str(accuracy), '\n')
def run(data, classifications, scoring_data, scoring_classifications):
    classifier = ExtraTreeClassifier()
    classifier.fit(data, classifications)
    accuracy = classifier.score(scoring_data, scoring_classifications)
    return accuracy
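
A minimal sketch of driving run() with a held-out split; the toy dataset and split fraction are arbitrary choices, and run() must be in scope along with its ExtraTreeClassifier import.

from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split

X, y = load_iris(return_X_y=True)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=0)
print('accuracy:', run(X_train, y_train, X_test, y_test))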
Example #9
print("keys:			", iris.keys())
print("data:			", iris["data"])
print("target_names:	", iris["target_names"])
print("target:			", iris["target"])
print("feature_names:	", iris["feature_names"])
"""Divide la data en set de entrenamiento y de test"""
X_train, X_test, Y_train, Y_test = train_test_split(iris["data"],
                                                    iris["target"])
print("X_train:			", X_train.shape)
print("X_test:				", X_test.shape)
print("Y_train:			", Y_train.shape)
print("Y_test:				", Y_test.shape)
"""__________________"""
arbol = ExtraTreeClassifier(max_depth=3)
print("Trained:			", arbol.fit(X_train, Y_train))
print("Test score:			", arbol.score(X_test, Y_test))
print("Train score:		", arbol.score(X_train, Y_train))
"""__________________"""
G = export_graphviz(arbol,
                    out_file='arbol.dot',
                    class_names=iris.target_names,
                    feature_names=iris.feature_names,
                    impurity=False,
                    filled=True)

with open('arbol.dot') as f:
    dot_graph = f.read()

graphviz.Source(dot_graph).render('arbol', view=False, format='png')
# graphviz.Source(dot_graph).view()
# graph=graphviz.Source(dot_graph)
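
If Graphviz is not available, scikit-learn's own plot_tree (added in 0.21) draws the same tree with matplotlib; a sketch of that alternative, reusing arbol and iris from above:

import matplotlib.pyplot as plt
from sklearn.tree import plot_tree

fig, ax = plt.subplots(figsize=(12, 6))
plot_tree(arbol, feature_names=iris.feature_names,
          class_names=list(iris.target_names), impurity=False, filled=True, ax=ax)
fig.savefig('arbol.png')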
Example #10
sgd.score(x_test_3, y_test_3)  # score of an SGD model fitted in an earlier cell (not shown)
sgd = SGDClassifier(loss='log', shuffle=True, random_state=171)
sgd.fit(x_train_3, y_train_3)
sgd.predict(x_train_3)
sgd.score(x_test_3, y_test_3)
sgd = SGDClassifier(shuffle=True, random_state=171)
sgd.fit(x_train_3, y_train_3)
sgd.predict(x_train_3)
sgd.score(x_test_3, y_test_3)
submission = pd.DataFrame({'Id': test.Id, 'Cover_Type': ensemble_test_pred})
submission.head()
submission.to_csv('submission.csv', index=False)
submission_tree = pd.DataFrame({'Id': test.Id, 'Cover_Type': tree_test_pred})
submission_tree.head()
submission_tree.to_csv('submission2.csv', index=False)
# ExtraTreeClassifier is a tree-based model for classification problems
et = ExtraTreeClassifier()
et.fit(x_train_3, y_train_3)
et.predict(x_train_3)
et.score(x_test_3, y_test_3)
from sklearn.semi_supervised import LabelPropagation
lb = LabelPropagation()
lb.fit(x_train_3, y_train_3)
lb.predict(x_train_3)
lb.score(x_test_3, y_test_3)
from sklearn.neighbors import KNeighborsClassifier
knng = KNeighborsClassifier()
knng.fit(x_train_3, y_train_3)
knng.predict(x_train_3)
knng.score(x_test_3, y_test_3)
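
One note on the pattern above: each bare predict(x_train_3) computes predictions that are immediately discarded. A sketch of the usual bookkeeping, assuming the snippet's variables (et, x_train_3, y_train_3, x_test_3, y_test_3) are in scope:

from sklearn.metrics import accuracy_score

train_pred = et.predict(x_train_3)              # keep the predictions this time
print('ExtraTree train accuracy:', accuracy_score(y_train_3, train_pred))
print('ExtraTree test accuracy: ', et.score(x_test_3, y_test_3))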