Exemplo n.º 1
def tree(labels,X,df,i):
  tree = DT(max_depth = 4)
  tree.fit(X,labels)
  impt = tree.feature_importances_
  para = tree.get_params()
  export_graphviz(tree, out_file = OUTPUT_DIRECTORY+str(i)+"_tree.dot", feature_names = df.columns)
  return impt
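A minimal usage sketch for the helper above (not part of the original example): `DT` is assumed to be an alias for `sklearn.tree.DecisionTreeClassifier`, and `OUTPUT_DIRECTORY` an existing folder defined next to the helper.

# Hypothetical usage; DT, export_graphviz and OUTPUT_DIRECTORY are assumed to live in the
# same module as the helper, e.g.:
#   from sklearn.tree import DecisionTreeClassifier as DT, export_graphviz
#   OUTPUT_DIRECTORY = "out/"
import pandas as pd
from sklearn.datasets import load_iris

iris = load_iris()
df = pd.DataFrame(iris.data, columns=iris.feature_names)
impt = tree(iris.target, df.values, df, 0)    # writes out/0_tree.dot
print(dict(zip(df.columns, impt)))            # importance per feature name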
Exemplo n.º 2
def fit_sktree(path, index_filter=None, class_filter=None, feature_filter=None, folds=10,
             inverse=False, max_depth=10, min_samples_split=20, lc_filter=None):

    data = pd.read_csv(path, index_col=0)
    data, y = utils.filter_data(data, index_filter, class_filter, feature_filter, lc_filter)

    skf = cross_validation.StratifiedKFold(y, n_folds=folds)
    
    results = []
    for train_index, test_index in skf:
        if inverse:
            aux = train_index
            train_index = test_index
            test_index = aux

        train_X, test_X = data.iloc[train_index], data.iloc[test_index]
        train_y, test_y = y.iloc[train_index], y.iloc[test_index]

        clf = None
        clf = DecisionTreeClassifier(criterion='entropy', max_depth=max_depth,
                                     min_samples_split=min_samples_split)

        clf.fit(train_X, train_y)
        results.append(metrics.predict_table(clf, test_X, test_y))

    return pd.concat(results)
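The example above relies on `sklearn.cross_validation`, which has long been removed; a sketch of the same fold loop against the current `sklearn.model_selection` API (the project-specific helpers `utils.filter_data` and `metrics.predict_table` are assumed unchanged) could replace the body of `fit_sktree` like this.

# Sketch only: the same cross-validation loop with the modern StratifiedKFold API
from sklearn.model_selection import StratifiedKFold

skf = StratifiedKFold(n_splits=folds)
results = []
for train_index, test_index in skf.split(data, y):
    if inverse:
        train_index, test_index = test_index, train_index

    train_X, test_X = data.iloc[train_index], data.iloc[test_index]
    train_y, test_y = y.iloc[train_index], y.iloc[test_index]

    clf = DecisionTreeClassifier(criterion='entropy', max_depth=max_depth,
                                 min_samples_split=min_samples_split)
    clf.fit(train_X, train_y)
    results.append(metrics.predict_table(clf, test_X, test_y))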
Exemplo n.º 3
def get_clfs(rank, Nfeatures=20, Nscores=10):
    """ Traning decision tree on a chank of data and returns predictions"""

    df = pd.read_csv('data/train_%d.csv'%rank, names=headers)
    print rank, df.shape
    np.random.seed(rank)
    fselect = np.random.choice(range(2, Nscores), Nfeatures, replace = False)
    print rank, fselect

    indexes = np.array(scores_indexes)[fselect]

    Nr, Nc  = df.shape
    Nf = len(indexes)
    X = np.zeros([Nr,Nf+1]) 
    y = np.zeros([Nr]) 

    get_X_y(X, y, df, features_touples, indexes)
    print rank, 'Xy read'
    del df
    
    if rank == 0: print 'Size of numpy array in GB:', X.nbytes/1.e9
    clf = DecisionTreeClassifier(random_state=0)
    clf.fit(X, y)
    y_pred = clf.predict_proba(X)
    etmp = log_loss(y, y_pred)

    del X, y
    print 'IN error on rank:', rank, 'is', etmp
    return (clf, rank, etmp)
def decision_tree_entropy(training_data):
    clf = DecisionTreeClassifier(criterion="entropy",random_state=0)
    clf.fit(training_data[0], training_data[1])
    #with open("/media/deeksha/e/Deeksha/Dropbox/Coursework/MachineLearning/HW3/entropy.dot", 'w') as f:
    #    f = tree.export_graphviz(clf, out_file=f)
    print "entropy:Number of Nodes", clf.tree_.node_count
    return clf
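The commented-out lines hint at exporting the fitted tree; a small sketch with `sklearn.tree.export_graphviz` (the output path and the iris stand-in data are illustrative assumptions):

# Sketch: export the tree fitted by decision_tree_entropy to a .dot file
from sklearn import tree
from sklearn.datasets import load_iris

training_data = load_iris(return_X_y=True)   # stand-in for the real (X, y) pair
clf = decision_tree_entropy(training_data)
with open("entropy.dot", "w") as f:
    tree.export_graphviz(clf, out_file=f)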
Exemplo n.º 5
def quize1(data):
# 1. Load the dataset from titanic.csv using the pandas package.
# 2. Keep four features: passenger class (Pclass), ticket fare (Fare), age (Age) and sex (Sex).
# 3. Note that the Sex feature holds string values.
# 4. Extract the target variable; it is stored in the Survived column.
# 5. The data contains missing values, e.g. the age of some passengers is unknown.
# 6. Such records become NaN when read into pandas.
# Find all objects with missing features and drop them from the sample.
# Train a decision tree with random_state=241 and all other parameters at their defaults.
# Compute the feature importances and find the two features with
# the highest importance. Their names are the answer to this task
# (give the feature names separated by a comma or space; the order does not matter).
    dataF = data[['Pclass', 'Fare', 'Age', 'Sex','Survived']]
    dataF = dataF.dropna()
    Y = dataF['Survived']
    dataF = dataF[['Pclass', 'Fare', 'Age', 'Sex']]
    clf = DecisionTreeClassifier(random_state=241)
    dataF.loc[dataF['Sex'] != 'male', 'Sex'] = 0
    dataF.loc[dataF['Sex'] == 'male', 'Sex'] = 1
    print (dataF)
    clf.fit(dataF, Y)
    importances = clf.feature_importances_
    print(importances)
    # d = zip(dataF.columns, clf.feature_importances_)
    # print(d)
    return
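The commented-out `zip` at the end presumably pairs each column with its importance; a self-contained sketch of that step (the helper name is hypothetical):

# Sketch: pair feature names with importances and report the two most important ones
def top_two_features(columns, importances):
    ranked = sorted(zip(columns, importances), key=lambda t: t[1], reverse=True)
    return [name for name, _ in ranked[:2]]

# e.g. inside quize1: print(top_two_features(dataF.columns, clf.feature_importances_))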
Exemplo n.º 6
def test(train_feature,train_label,test_feature,test_label):
    from sklearn import metrics
    from sklearn.linear_model import LogisticRegression
    from sklearn.naive_bayes import GaussianNB
    from sklearn.tree import DecisionTreeClassifier
    # fit a CART model to the data
    model = DecisionTreeClassifier()
    from sklearn.svm import SVC
    # fit a SVM model to the data
    # model = SVC()
    # model = GaussianNB()
    # model = LogisticRegression()
    from sklearn.neighbors import KNeighborsClassifier
    # fit a k-nearest neighbor model to the data
    import time
    currenttime = time.time()
    # model = KNeighborsClassifier()
    model.fit(train_feature, train_label)
    print(model)
    # make predictions
    expected = test_label
    predicted = model.predict(test_feature)
    # summarize the fit of the model
    print(metrics.classification_report(expected, predicted))
    print(metrics.confusion_matrix(expected, predicted))
    print(metrics.accuracy_score(expected,predicted))
class Transformer:
    def __init__(self, use_PCA=True):
        self._clf = DecisionTreeClassifier(min_samples_leaf=10)
        self._idx = None
        self._scaler = StandardScaler()
        self._trans = PCA('mle')
        self._use_PCA = use_PCA

    def fit(self, X, y):
        X = np.array(X)
        self._clf.fit(X, y)

        self._idx = filter(lambda x: self._clf.feature_importances_[x] > 0, \
                range(len(self._clf.feature_importances_)))

        new_set = [X[i][self._idx] for i in xrange(len(X))]

#        new_set = self._scaler.fit_transform(new_set)

        if self._use_PCA:
            new_set = self._trans.fit_transform(new_set)
        return new_set

    def transform(self, features):
        features = features[self._idx]
#        features = self._scaler.transform(features.astype(float))
        if self._use_PCA:
            features = self._trans.transform(features)
        return features
Exemplo n.º 8
    def Train(self, X, y):
        
        N = X.shape[0]
        New = np.zeros(N)

        for i in self.names:
            
            A = []
            C = []
            
            weight = np.ones(N)/N
            New[np.where(y == i)[0]] =  1
            New[np.where(y != i)[0]] = -1
            
            for j in range(self.itr):
                
                # Input a generic sklearn classifier
                clf = DecisionTreeClassifier(max_depth = self.dep)
                clf.fit(X, New, sample_weight = weight)
                Pre = clf.predict(X)
                err = weight.dot((New != Pre).astype(int))

                if (err != 0):
                    A.append(.5*np.log((1-err)/err))
                else:
                    A.append(1)

                C.append(clf)
                weight *= np.exp(-A[j]*New*Pre)
                weight = weight/np.sum(weight)

            self.C.append(C)
            self.A.append(A)
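`Train` above stores one boosted ensemble (`C`) and its stage weights (`A`) per class in a one-vs-rest fashion; a hedged sketch of a matching `Predict` method (an assumption, since the rest of the class is not shown) picks the class with the largest weighted vote.

    def Predict(self, X):
        # Sketch only: assumes self.names, self.C and self.A were filled by Train above
        scores = np.zeros((X.shape[0], len(self.names)))
        for k in range(len(self.names)):
            for a, clf in zip(self.A[k], self.C[k]):
                scores[:, k] += a * clf.predict(X)   # each weak learner votes in {-1, +1}
        return np.array(self.names)[np.argmax(scores, axis=1)]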
Exemplo n.º 9
def decision_tree(train_bow,train_labels,test_bow,test_labels,bow_indexes):
    print("Training decision tree")
    dt_classifier=DecisionTreeClassifier()

    dt_classifier.fit(train_bow,train_labels)
    print("Testing decision tree")
    test(dt_classifier,"dt",test_bow,test_labels,bow_indexes)
Exemplo n.º 10
def test_scoring():
    X, y = iris_data()
    clf1 = LogisticRegression(random_state=1,
                              solver='liblinear',
                              multi_class='ovr')
    clf2 = DecisionTreeClassifier(random_state=1)

    X_train, X_test, y_train, y_test = \
        train_test_split(X, y, test_size=0.5,
                         random_state=123)

    score1 = clf1.fit(X_train, y_train).score(X_test, y_test)
    score2 = clf2.fit(X_train, y_train).score(X_test, y_test)

    assert round(score1, 2) == 0.96, round(score1, 2)
    assert round(score2, 2) == 0.91, round(score2, 2)

    t, p = paired_ttest_kfold_cv(estimator1=clf1,
                                 estimator2=clf2,
                                 X=X, y=y,
                                 scoring='accuracy',
                                 random_seed=1)

    assert round(t, 3) == -1.861, t
    assert round(p, 3) == 0.096, p

    t, p = paired_ttest_kfold_cv(estimator1=clf1,
                                 estimator2=clf2,
                                 X=X, y=y,
                                 scoring='recall_micro',
                                 random_seed=1)

    assert round(t, 3) == -1.861, t
    assert round(p, 3) == 0.096, p
Exemplo n.º 11
def evaluateDecisionTree(train_x,train_y,test_x,test_y):
    clf = DecisionTreeClassifier(criterion='entropy',min_samples_leaf=5,max_depth=20)
    clf.fit(train_x,train_y)
    p = clf.predict_proba(test_x)[:,1]
    auc = roc_auc_score(test_y,p)
    plotAUC(test_y,clf.predict_proba(test_x)[:,1],'DT')
    return auc
def test_graphviz_errors():
    """Check for errors of export_graphviz"""
    clf = DecisionTreeClassifier(max_depth=3, min_samples_split=1)
    clf.fit(X, y)

    out = StringIO()
    assert_raises(IndexError, export_graphviz, clf, out, feature_names=[])
Exemplo n.º 13
def test_importances():
    """Check variable importances."""
    X, y = datasets.make_classification(n_samples=2000,
                                        n_features=10,
                                        n_informative=3,
                                        n_redundant=0,
                                        n_repeated=0,
                                        shuffle=False,
                                        random_state=0)

    for name, Tree in CLF_TREES.items():
        clf = Tree(random_state=0)

        clf.fit(X, y)
        importances = clf.feature_importances_
        n_important = np.sum(importances > 0.1)

        assert_equal(importances.shape[0], 10, "Failed with {0}".format(name))
        assert_equal(n_important, 3, "Failed with {0}".format(name))

        X_new = clf.transform(X, threshold="mean")
        assert_less(0, X_new.shape[1], "Failed with {0}".format(name))
        assert_less(X_new.shape[1], X.shape[1], "Failed with {0}".format(name))

    # Check on iris that importances are the same for all builders
    clf = DecisionTreeClassifier(random_state=0)
    clf.fit(iris.data, iris.target)
    clf2 = DecisionTreeClassifier(random_state=0,
                                  max_leaf_nodes=len(iris.data))
    clf2.fit(iris.data, iris.target)

    assert_array_equal(clf.feature_importances_,
                       clf2.feature_importances_)
Exemplo n.º 14
def buildTree(options, treefile, dataFile = None):
    dt = loadTree(treefile)
    if dt is not None:
        return dt
    if dataFile is None:
        raise ValueError("No data file specified")

    dt = DecisionTreeClassifier(min_samples_split=20, random_state=99)
    files = []
    featureFrames = []
    targetFrames = []
    if os.path.isdir(dataFile):
        files = getFiles(dataFile, ".csv")
    else:
        files.append(dataFile)
    for _file in files:
        print("Loading data %s" % _file)
        (featureValues, targetValues, features, df) = loadData(_file, options)
        featureFrames.append(featureValues)
        targetFrames.append(targetValues)
    dt.fit(pd.concat(featureFrames), pd.concat(targetFrames))
    saveTree(treefile, dt)
    print("Building graph")
    visualize_tree(treefile, dt, features)
    return dt
Exemplo n.º 15
def train_adaboost(features, labels, learning_rate, n_lab, n_runs, n_estim, n_samples):
    uniqLabels = np.unique(labels)
    print 'Taking ', str(n_lab), ' labels'
    uniqLabels = uniqLabels[:n_lab]
    used_labels = uniqLabels
    pbar = start_progressbar(len(uniqLabels), 'training adaboost for %i labels' %len(uniqLabels))
    allLearners = []
    for yy ,targetLab in enumerate(uniqLabels):
        runs=[]
        for rrr in xrange(n_runs):
            #import ipdb;ipdb.set_trace()
            feats,labs = get_binary_sets(features, labels, targetLab, n_samples)
            #print 'fitting stump'
            #import ipdb;ipdb.set_trace()
            baseClf = DecisionTreeClassifier(max_depth=4, min_samples_leaf=10, min_samples_split=10)
            baseClf.fit(feats, labs)
            ada_real = AdaBoostClassifier( base_estimator=baseClf, learning_rate=learning_rate,
                                      n_estimators=n_estim,
                                      algorithm="SAMME.R")
            #import ipdb;ipdb.set_trace()
            runs.append(ada_real.fit(feats, labs))
        allLearners.append(runs)
        update_progressbar(pbar, yy)
    end_progressbar(pbar)
    
    return allLearners, used_labels
def plot_tree(max_depth=1):
    fig, ax = plt.subplots(1, 2, figsize=(15, 7))
    h = 0.02

    x_min, x_max = X[:, 0].min() - .5, X[:, 0].max() + .5
    y_min, y_max = X[:, 1].min() - .5, X[:, 1].max() + .5
    xx, yy = np.meshgrid(np.arange(x_min, x_max, h), np.arange(y_min, y_max, h))

    if max_depth != 0:
        tree = DecisionTreeClassifier(max_depth=max_depth, random_state=1).fit(X, y)
        Z = tree.predict_proba(np.c_[xx.ravel(), yy.ravel()])[:, 1]
        Z = Z.reshape(xx.shape)
        faces = tree.tree_.apply(np.c_[xx.ravel(), yy.ravel()].astype(np.float32))
        faces = faces.reshape(xx.shape)
        border = ndimage.laplace(faces) != 0
        ax[0].contourf(xx, yy, Z, alpha=.4)
        ax[0].scatter(xx[border], yy[border], marker='.', s=1)
        ax[0].set_title("max_depth = %d" % max_depth)
        ax[1].imshow(tree_image(tree))
        ax[1].axis("off")
    else:
        ax[0].set_title("data set")
        ax[1].set_visible(False)
    ax[0].scatter(X[:, 0], X[:, 1], c=np.array(['b', 'r'])[y], s=60)
    ax[0].set_xlim(x_min, x_max)
    ax[0].set_ylim(y_min, y_max)
    ax[0].set_xticks(())
    ax[0].set_yticks(())
Exemplo n.º 17
def decision_tree_prediction(features_train, labels_train, features_test, ids):

    X_train, X_test, y_train, y_test = cross_validation.train_test_split(features_train, labels_train, random_state=1301, stratify=labels_train, test_size=0.3)

    clf = DecisionTreeClassifier(criterion='gini',
                                 min_samples_split=10,
                                 max_depth=10,
                                 max_leaf_nodes=16,
                                 max_features=2)


    #clf_acc = clf.fit(X_train, y_train)
    # print(clf.best_estimator_)
    #feature_importance = clf.feature_importances_
    #print (feature_importance)

    #pred = clf_acc.predict_proba(X_test)[:,1]
    #print (y_test, pred)
    # acc = accuracy_score(y_test, pred)
    # print ("Acc {}".format(acc))

    clf = clf.fit(features_train, labels_train)

    pred = clf.predict_proba(features_test)[:,1]

    predictions_file = open("data/canivel_decision_tree.csv", "wb")
    predictions_file_object = csv.writer(predictions_file)
    predictions_file_object.writerow(["ID", "TARGET"])
    predictions_file_object.writerows(zip(ids, pred))
    predictions_file.close()
Exemplo n.º 18
def calculate_single_tree(noisy_fold_single_tree, folds_single_tree):
    accuracy = 0.0

    clf = DecisionTreeClassifier(criterion='entropy', splitter='best', min_samples_split=49)

    for k in range(0, len(noisy_fold_single_tree)):
        learn_group_x_single_tree = get_union_of_all_but_i(noisy_fold_single_tree, k)
        learn_group_y_single_tree = []
        for l in learn_group_x_single_tree:
            learn_group_y_single_tree.append(l.pop())

        curr_tree_single_tree = clf.fit(learn_group_x_single_tree, learn_group_y_single_tree)

        num_of_success = 0
        for m in folds_single_tree[k]:
            ans = m.pop()
            tree_ans = curr_tree_single_tree.predict([m])
            m.append(ans)
            if ans == tree_ans:
                num_of_success += 1

        for l in learn_group_x_single_tree:
            l.append(learn_group_y_single_tree.pop(0))
        accuracy += num_of_success / (float(len(folds_single_tree[k])))

    accuracy /= float(len(noisy_fold_single_tree))
    print('single tree. acc: {}'.format(accuracy))
Exemplo n.º 19
class TreeClassifier(Classifier):

    def __init__(self, min_samples_split=20, random_state=99):
        self.classifier = DecisionTreeClassifier(min_samples_split=min_samples_split,
                                                 random_state=random_state)

    def do_train(self, X, y):
        self.classifier.fit(X, y)

    def do_classification(self, X, y):
        self.classifier.predict(X[:, 'age':'thal'])
        print('wtf')

    def visualize_tree(tree, feature_names):
        """Create tree png using graphviz.

        Args
        ----
        tree -- scikit-learn DecisionTree.
        feature_names -- list of feature names.
        """
        with open("dt.dot", 'w') as f:
            export_graphviz(tree, out_file=f, feature_names=feature_names)

        command = ["dot", "-Tpng", "dt.dot", "-o", "dt.png"]
        try:
            subprocess.check_call(command)
        except Exception as e:
            print(e)
            exit("Could not run dot, ie graphviz, to produce visualization")
Exemplo n.º 20
	def train(self, X, Y):
		N, D = X.shape
		for t in xrange(self.boostrap_sample):
			sampleX, sampleY = self.get_sample(X, Y)
			clf = DecisionTreeClassifier(criterion="entropy", max_depth = 1)
			clf.fit(sampleX, sampleY)
			self.weak_clfs.append(clf)
Exemplo n.º 21
def decisionTree(dataTrain,featuresTrain,dataTest,featuresTest,filename='result'):
    criterion=['gini','entropy']
    splitter=['best','random']
    max_features=[0.1,0.2,0.3,0.4,0.5,0.6,0.7,0.8,0.9,None,"log2","sqrt"]
    max_accuracy=0.0
    recall_score=0.0
    best_clf=None
    for param in product(criterion,splitter,max_features):
        print("\n========================================================")
        clf=DecisionTreeClassifier(criterion=param[0],splitter=param[1],max_features=param[2])
        result=clf.fit(dataTrain,featuresTrain)
        print result
        print result.score(dataTest,featuresTest)
        resultFeatures=clf.predict(dataTest)
        accuracy=metrics.accuracy_score(featuresTest,resultFeatures)
        recallu=metrics.recall_score(featuresTest,resultFeatures)
        print accuracy,recallu
        if accuracy > max_accuracy:
            max_accuracy=accuracy
            recall_score=recallu
            best_clf=result
            predict_features=resultFeatures
        print("\n========================================================")
    print ("\n Get result")
    print ("Best accuracy is %0.6f\nBest paramters is %s" % (max_accuracy,best_clf))
    fd = open("DT"+filename+".txt",'a')
    fd.write("Best accuracy and recall is %0.6f\t%0.6f\nBest paramters is %s" % (max_accuracy,recall_score,best_clf))
    fd.close()
Exemplo n.º 22
def main():
    data = run_game()

    clf = DecisionTreeClassifier(criterion='entropy')

    game_data = [[i[0], i[1]] for i in data]
    profits = [i[2] for i in data]

    clf.fit(game_data, profits)

    with open('tree.dot', 'w') as dotfile:
        export_graphviz(
            clf,
            dotfile,
            feature_names=['coin', 'bet']
        )

    predictions_lose1 = [clf.predict([[0, 0]]) for x in xrange(100)]
    predictions_lose2 = [clf.predict([[0, 1]]) for x in xrange(100)]
    predictions_win = [clf.predict([[1, 1]]) for x in xrange(100)]

    print 'All these profit predictions should be zero:'
    print predictions_lose1
    print 'Accuracy was', calculate_accuracy(predictions_lose1, np.array([0]))

    print 'All these profit predictions should be zero:'
    print predictions_lose2
    print 'Accuracy was', calculate_accuracy(predictions_lose2, np.array([0]))

    print 'All these profit predictions should be two:'
    print predictions_win
    print 'Accuracy was', calculate_accuracy(predictions_win, np.array([2]))
class MultEstimator(BaseEstimator):
    def __init__(self, categories):
        self.categories = categories

    def fit(self, X, y, **params):
        self.models = {_: None for _ in self.categories}
        self.tot_model = DecisionTreeClassifier(max_depth=8, min_samples_leaf=100)
        categ = X[:, -1]
        data = X[:, :-1]
        self.tot_model.fit(data, y)
        for c in self.models.keys():
            mask = categ == c
            m = DecisionTreeClassifier(max_depth=8, min_samples_leaf=100)
            m.fit(data[mask], y[mask])
            self.models[c] = m

    def predict(self, X):
        categ = X[:, -1]
        data = X[:, :-1]
        p = self.tot_model.predict(data)
        for c in self.models.keys():
            mask = categ == c
            if mask.any():
                p[mask] = self.models[c].predict(data[mask])
        return p

    def predict_proba(self, X):
        categ = X[:, -1]
        data = X[:, :-1]
        p = self.tot_model.predict_proba(data)
        for c in self.models.keys():
            mask = categ == c
            if mask.any():
                p[mask] = self.models[c].predict_proba(data[mask])
        return p
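`MultEstimator` expects the last column of `X` to carry a category id and the remaining columns the features; a minimal usage sketch on synthetic data (purely illustrative):

# Sketch: the last column of X is the category, the rest are features
import numpy as np

rng = np.random.RandomState(0)
features = rng.rand(200, 3)
categories = rng.randint(0, 2, size=200)
labels = (features[:, 0] > 0.5).astype(int)

X = np.column_stack([features, categories])
est = MultEstimator(categories=[0, 1])
est.fit(X, labels)
print(est.predict(X)[:10])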
def programmer_2():
    datafile = 'data/model.xls'
    data = pd.read_excel(datafile)
    data = data.as_matrix()
    shuffle(data)  # shuffle the data randomly

    # split the data into training and test sets at a ratio of 8:2
    p = 0.8
    train = data[:int(len(data) * p), :]
    test = data[int(len(data) * p):, :]

    # build a CART decision tree model
    treefile = 'tmp/tree.pkl'
    tree = DecisionTreeClassifier()
    tree.fit(train[:, :3], train[:, 3])

    joblib.dump(tree, treefile)

    cm_plot(train[:, 3], tree.predict(train[:, :3])).show()  # show the confusion matrix visualization
    # note that scikit-learn's predict method returns the predicted labels directly

    fpr, tpr, thresholds = roc_curve(
        test[:, 3], tree.predict_proba(test[:, :3])[:, 1], pos_label=1)
    plt.plot(fpr, tpr, linewidth=2, label='ROC of CART', color='green')
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    # set the axis limits
    plt.ylim(0, 1.05)
    plt.xlim(0, 1.05)
    plt.legend(loc=4)
    plt.show()
    print(thresholds)
Exemplo n.º 25
def test_graphviz_errors():
    # Check for errors of export_graphviz
    clf = DecisionTreeClassifier(max_depth=3, min_samples_split=2)

    # Check not-fitted decision tree error
    out = StringIO()
    assert_raises(NotFittedError, export_graphviz, clf, out)

    clf.fit(X, y)

    # Check if it errors when length of feature_names
    # mismatches with number of features
    message = ("Length of feature_names, "
               "1 does not match number of features, 2")
    assert_raise_message(ValueError, message, export_graphviz, clf, None,
                         feature_names=["a"])

    message = ("Length of feature_names, "
               "3 does not match number of features, 2")
    assert_raise_message(ValueError, message, export_graphviz, clf, None,
                         feature_names=["a", "b", "c"])

    # Check class_names error
    out = StringIO()
    assert_raises(IndexError, export_graphviz, clf, out, class_names=[])

    # Check precision error
    out = StringIO()
    assert_raises_regex(ValueError, "should be greater or equal",
                        export_graphviz, clf, out, precision=-1)
    assert_raises_regex(ValueError, "should be an integer",
                        export_graphviz, clf, out, precision="1")
Exemplo n.º 26
def text_learning_experiment(words_to_remove=[]):
    from_sara  = open("../text_learning/from_sara.txt", "r")
    from_chris = open("../text_learning/from_chris.txt", "r")
    word_data, authors = vectorize_emails(from_sara, from_chris, max_emails=300, words_to_remove=words_to_remove)
    features_train, features_test, labels_train, labels_test = \
        cross_validation.train_test_split(word_data, authors, test_size=0.1, random_state=42)
    vectorizer = TfidfVectorizer(sublinear_tf=True, max_df=0.5,
                                 stop_words='english')
    features_train = vectorizer.fit_transform(features_train)
    features_test  = vectorizer.transform(features_test).toarray()

    features_train = features_train[:150].toarray()
    labels_train   = labels_train[:150]

    clf = DecisionTreeClassifier()
    clf.fit(features_train, labels_train)
    predict_train = clf.predict(features_train)
    predict_test = clf.predict(features_test)
    print "train acc:", accuracy_score(labels_train, predict_train)
    print "test acc: ", accuracy_score(labels_test, predict_test)
    feature_index = np.argmax(clf.feature_importances_)
    feature_importance = clf.feature_importances_[feature_index]
    feature_name = vectorizer.get_feature_names()[feature_index]
    print "Most important feature, and relative importance:", feature_name, ":", feature_importance
    return feature_name, feature_importance
Exemplo n.º 27
def decision_trees(features, labels):
    classifier = DecisionTreeClassifier(random_state=0, criterion="entropy")
    classifier.fit(features, labels)
    scores = cross_validation.cross_val_score(
        classifier, features, labels, cv=10, score_func=metrics.precision_recall_fscore_support
    )
    print_table("Decision Trees", numpy.around(numpy.mean(scores, axis=0), 2))
Exemplo n.º 28
def main(percentage):
    """Given a percentage for splitting the dataset, fit the training set and apply the rest as a test set."""
    df = pd.read_csv('cellStrength.log')
    df.drop('SSID', 1, inplace=True)
    processed = preprocess(df)
    location_col = processed[0].shape[1]-4

    hash_to_location = {y:x for x,y in processed[1].items()}

    df2, targets = encode_target(processed[0], location_col)
    msk = np.random.rand(len(df)) < percentage
    test = df2[~msk].copy()
    train = df2[msk].copy()

    open('golden.csv', 'w').write(','.join([hash_to_location[p] for p in test['Target'].tolist()]) + '\n' )

    test.drop(186, 1, inplace=True)
    test.drop('Target', 1, inplace=True)

    features = list(df2.columns[:location_col]) + list(df2.columns[location_col+1:-1])

    y = train['Target']
    X = train[features]

    dt = DecisionTreeClassifier(min_samples_split=3, random_state=99)
    try:
        dt.fit(X, y)
    except ValueError:
        return
    predictions = dt.predict(test).tolist()
    open('golden.csv', 'a').write(','.join([hash_to_location[p] for p in predictions]))

    # get_code(dt, features, targets)
    return get_accuracy('golden.csv')
Exemplo n.º 29
def train_dtc(X, y):
    """
    Create and train the Decision Tree Classifier.
    """
    dtc = DecisionTreeClassifier()
    dtc.fit(X, y)
    return dtc
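A short usage sketch for the helper above, on the iris data (illustrative only):

# Sketch: train and score the classifier returned by train_dtc
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split

X, y = load_iris(return_X_y=True)
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)
dtc = train_dtc(X_train, y_train)
print(dtc.score(X_test, y_test))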
Exemplo n.º 30
def main(args):
    exec "import main.pandas_talib.sig_%s as conf" % args.signame
    build.work2(20, 'sp500Top50', args.signame)
    df = base.get_merged(conf.__name__, yeod.get_sp500Top50())
    df.to_csv("ta.csv")

    tree = DecisionTreeClassifier() 
    
    feat_names = base.get_feat_names(df)

    dfTrain = df[(df.date>='1970-01-01') & (df.date <='2009-12-31')]
    npTrainFeat = dfTrain.loc[:,feat_names].values.copy()
    npTrainLabel = dfTrain.loc[:,"label5"].values.copy()
    npTrainLabel[npTrainLabel >  1.0] = 1
    npTrainLabel[npTrainLabel <  1.0] = 0

    tree.fit(npTrainFeat, npTrainLabel)
    joblib.dump(tree, "tree.pkl", compress = 3)
    
    dfTest = df[(df.date>='2010-01-01') & (df.date <='2099-12-31')]
    npTestFeat = dfTest.loc[:, feat_names].values.copy()
    
    npPred = tree.predict_proba(npTestFeat)

    dfTest.loc[:,"pred"] = npPred[:,1]
    
    print dfTest['pred'].head()

    dfPos = dfTest[ dfTest['pred'] > 0.55 ]
    print 1.0 * len(dfPos[dfPos['label5']>1])  / len(dfPos)
    print 1.0 * len(dfTest[dfTest['label5']>1])  / len(dfTest)
Exemplo n.º 31
x__train, x__test, y__train, y__test = train_test_split(x_train,
                                                        y_train,
                                                        test_size=0.25,
                                                        random_state=0)
x__train.shape
x__test.shape
y__train.shape
y__test.shape

#Decision tree
from sklearn import tree
from sklearn.tree import DecisionTreeClassifier

parameters = {'max_depth': [5], 'min_samples_split': [10, 30, 50]}
decision_tree = DecisionTreeClassifier(criterion='gini')
from sklearn.model_selection import GridSearchCV

grid_search = GridSearchCV(decision_tree, parameters, n_jobs=-1, cv=3)
grid_search.fit(x__train, y__train)
print(grid_search.best_params_)
predicted = grid_search.predict(x__test)
from sklearn.metrics import accuracy_score

accuracy = accuracy_score(y__test, predicted) * 100
print("Accuracy = {}".format(accuracy))

# In[4]:

from sklearn import tree
from sklearn.tree import DecisionTreeClassifier
Exemplo n.º 32
y = iris.target
sns_plot = seaborn.countplot(y)
sns_plot.figure.savefig("rating.png")
plt.close()

x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.33)

sns_plot = seaborn.countplot(y_test)
sns_plot.figure.savefig("rating_test.png")
plt.close()

sns_plot = seaborn.countplot(y_train)
sns_plot.figure.savefig("rating_train.png")
plt.close()

decision_tree = DecisionTreeClassifier()
decision_tree.fit(x_train, y_train)
decision_tree_predictions = decision_tree.predict(x_test)

cnf_matrix = confusion_matrix(y_test, decision_tree_predictions)
sns_plot = seaborn.heatmap(cnf_matrix, annot=True, center=0)
sns_plot.figure.savefig("cnf_matrix.png")
plt.close()

normalized_cnf_matrix = cnf_matrix.astype('float') / cnf_matrix.sum(
    axis=1)[:, np.newaxis]
sns_plot = seaborn.heatmap(normalized_cnf_matrix, annot=True, center=0)
sns_plot.figure.savefig("normalized_cnf_matrix.png")
plt.close()

print("decision tree perecision: ",
Exemplo n.º 33
# In[25]:

# Train,Test Splitting of data

from sklearn.model_selection import train_test_split

X_trainset, X_testset, y_trainset, y_testset = train_test_split(X_del_zero,
                                                                y_del_zero,
                                                                test_size=0.3,
                                                                random_state=3)

# In[26]:

# Prediction Using decision tree Algo

Clf_dt = DecisionTreeClassifier(criterion="entropy", max_depth=4)

print(Clf_dt)  # it shows the default parameters

Clf_dt.fit(X_trainset, y_trainset)
predTree = Clf_dt.predict(X_testset)

# In[27]:

# Metrics and Accuracy

from sklearn import metrics
import matplotlib.pyplot as plt
print("DecisionTrees's Accuracy: ",
      metrics.accuracy_score(y_testset, predTree))
plt.show()

# First split the data, i.e. separate a training set and a test set
from sklearn.model_selection import train_test_split  # import train_test_split from sklearn to split the data
all_inputs = df[['alcohol', 'malic_acid', 'ash', 'alcalinity ash', 'magnesium']].values
all_species = df['species'].values

(X_train,
 X_test,
 Y_train,
 Y_test) = train_test_split(all_inputs, all_species, train_size=0.85, random_state=1)  # 85% of the data is used as the training set

# Train using the decision tree algorithm
from sklearn.tree import DecisionTreeClassifier  # import DecisionTreeClassifier from sklearn to build the tree
# Create a decision tree object
decision_tree_classifier = DecisionTreeClassifier()
# Train the model
model = decision_tree_classifier.fit(X_train, Y_train)
# Print the model's accuracy
print(decision_tree_classifier.score(X_test, Y_test)) 


# ~ print([[13.52,3.17,2.72,23.5,97],[12.42,2.55,2.27,22,90],[13.76,1.53,2.7,19.5,132]])  # test with 3 samples, i.e. use 3 data points as the model's input
result=model.predict([[13.52,3.17,2.72,23.5,97],[12.42,2.55,2.27,22,90],[13.76,1.53,2.7,19.5,132]])
dict1={'Sauvignon':"赤霞珠",'Syrah':"西拉",'Zinfandel':"先粉黛"}
list2=[]
for i in  result:
    word=dict1.get(i)
    list2.append(word)
print(list2)  # print the test results, i.e. the model's predictions
from sklearn.naive_bayes import ComplementNB
clf2 = ComplementNB()
clf2.fit(x_train,y_train)

print("\n","GaussianNB:",nb.score(x_test,y_test),"\n","MultinomialNB:",clf1.score(x_test,y_test),"\n","ComplementNB:",clf2.score(x_test,y_test))
# GaussianNB was chosen as the most suitable model because its accuracy is the highest
predictionnb = nb.predict(x_test)
y_prednb = nb.predict(x_test)

print("Accuracy:",metrics.accuracy_score(y_test, y_prednb))
print( confusion_matrix(y_test,y_prednb))
print("GaussianNB")
print(classification_report(y_test,y_prednb))
#%%  Decision Tree
from sklearn.tree import DecisionTreeClassifier
dt = DecisionTreeClassifier(criterion="entropy", max_depth=None,min_samples_split=10,max_features=18,random_state=0)
dt = dt.fit(x_train,y_train)
predictiondt = dt.predict(x_test)
y_preddt = dt.predict(x_test)
print("Accuracy:",metrics.accuracy_score(y_test, y_preddt))
print( confusion_matrix(y_test,y_preddt))
print("Decision Tree")
print(classification_report(y_test,y_preddt))

#%%  Logistic Regression
from sklearn.linear_model import LogisticRegression
from sklearn import metrics
#logreg = LogisticRegression()
#%%
from sklearn import model_selection
models = []
Exemplo n.º 36
    plt.contourf(xx1, xx2, Z, alpha=0.3, cmap=cmap)
    plt.xlim(xx1.min(), xx1.max())
    plt.ylim(xx2.min(), xx2.max())

    for idx, cl in enumerate(np.unique(y)):
        plt.scatter(x=X[y == cl, 0], 
                    y=X[y == cl, 1],
                    alpha=0.8, 
                    c=colors[idx],
                    marker=markers[idx], 
                    label=cl, 
                    edgecolor='black')

## Building a decision tree
tree = DecisionTreeClassifier(criterion='gini', 
                              max_depth=4, 
                              random_state=1)
tree.fit(X_train1, y_train1)

X_combined = np.vstack((X_train1, X_test1))
y_combined = np.hstack((y_train1, y_test1))
plot_decision_regions(X_combined, y_combined, 
                      classifier=tree, test_idx=range(105, 150))

plt.xlabel('petal length [cm]')
plt.ylabel('petal width [cm]')
plt.legend(loc='upper left')
plt.tight_layout()
#plt.savefig('images/03_20.png', dpi=300)
plt.show()
Exemplo n.º 37
label_set = numpy.loadtxt(
    './ml_datasets/titanic_passengers_dataset_min.csv',
    delimiter=',',
    skiprows=1,
    usecols=(1,)
)

target_names = ["survived", "not survived"]

for pairidx, pair in enumerate([[0, 1], [0, 2], [0, 3], [1, 2], [1, 3], [2, 3]]):

    # We only take the two corresponding features
    feature_set = feature_data[:, pair]

    # Train
    clf = DecisionTreeClassifier().fit(feature_set, label_set)

    # Plot the decision boundary
    plt.subplot(2, 3, pairidx + 1)

    x_min, x_max = feature_set[:, 0].min() - 1, feature_set[:, 0].max() + 1
    y_min, y_max = feature_set[:, 1].min() - 1, feature_set[:, 1].max() + 1
    xx, yy = np.meshgrid(np.arange(x_min, x_max, plot_step),
                         np.arange(y_min, y_max, plot_step))

    plt.xlabel(feature_names[pair[0]])
    plt.ylabel(feature_names[pair[1]])
    plt.axis("tight")

    # Plot the training points
    for i, color in zip(range(n_classes), plot_colors):
	from sklearn.ensemble import AdaBoostClassifier
	from sklearn.tree import DecisionTreeClassifier
	from models.sentence_encoders import HandcraftedEncoder



	#sent_encoder = HandcraftedEncoder()
	sent_encoder = HandcraftedEncoder(precomputed_embeddings=settings.PRECOMPUTED_HANDCRAFTED_EMBEDDINGS_FNAME)
	feature_list = ["Quote_count", "Sent_position", "R_difficult", "POS_PRP", "POS_VB", "A_concreteness"] #HandcraftedEncoder._all_features + "best"
	#feature = "best"


	for feature in feature_list:
		print(feature)
		sent_encoder.set_features(feature)
		model = SimplePQModel(sent_encoder=sent_encoder, clf_type=AdaBoostClassifier, clf_args={'n_estimators':100, 'base_estimator':DecisionTreeClassifier(max_depth=1, class_weight="balanced")})
		print("training {}...".format(feature))
		model.fit(train_articles)
		print("generating...")

		combined_samples[feature] = generate_samples(model, test_articles)



elif model_name == "ngrams":
	from models.sentence_encoders import NGramEncoder

	for mode, n in [('char', 2), ('word', 1)]:
		print(mode, n)
		sent_encoder = NGramEncoder(mode=mode, n=n, store_results=False, vocab_size=1000)
		print("preparing encoder...")
mpl.rcParams['font.sans-serif'] = [u'SimHei']
mpl.rcParams['axes.unicode_minus'] = False

## Create synthetic data
X, y = make_gaussian_quantiles(n_samples=13000,
                               n_features=10,
                               n_classes=3,
                               random_state=1)

n_split = 3000

X_train, X_test = X[:n_split], X[n_split:]
y_train, y_test = y[:n_split], y[n_split:]

# Build two models with different algorithms; bdt_real uses SAMME.R
bdt_real = AdaBoostClassifier(DecisionTreeClassifier(max_depth=2),
                              n_estimators=600,
                              learning_rate=1)

bdt_discrete = AdaBoostClassifier(DecisionTreeClassifier(max_depth=2),
                                  n_estimators=600,
                                  learning_rate=1,
                                  algorithm="SAMME")

bdt_real.fit(X_train, y_train)
bdt_discrete.fit(X_train, y_train)

# Get the prediction accuracy; accuracy_score is the accuracy of a single classifier
# estimator_errors_ holds the prediction error rates
real_test_errors = []  # error rate of each estimator in the first model
discrete_test_errors = []  # error rate of each estimator in the second model
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import Binarizer
from sklearn.decomposition import PCA
from sklearn.preprocessing import PolynomialFeatures
from sklearn.neural_network import MLPRegressor
from sklearn.linear_model import Ridge
from sklearn.linear_model import Lasso
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

tree_classification_pipeline = Pipeline([
    ('tree', DecisionTreeClassifier()),

    # Forest instead of Trees
    # ('forest', RandomForestClassifier())
])

ridge_regression_pipeline = Pipeline([
    # Apply scaling to Ridge Regression
    # ('scale', StandardScaler()),
    ('ridge', Ridge())
])

lasso_regression_pipeline = Pipeline([
    # Apply scaling to Lasso Regression
    # ('scale', StandardScaler()),
    ('lasso', Lasso())
])
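None of the pipelines above are fitted in the snippet; a minimal sketch fitting the decision-tree pipeline on the iris data (an illustrative addition, not part of the original listing):

# Sketch: fit and score the tree pipeline defined above
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split

X, y = load_iris(return_X_y=True)
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)
tree_classification_pipeline.fit(X_train, y_train)
print(tree_classification_pipeline.score(X_test, y_test))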
Exemplo n.º 41
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
import random
import pickle
from sklearn.decomposition import PCA

clf = DecisionTreeClassifier(random_state=0)

feature_vector = []

with open('final_data.csv', 'r') as fp:
    for i, line in enumerate(fp):
        if i == 0:
            pass
        else:
            feature_vector.append([int(x.strip()) for x in line.split(',')])

random.shuffle(feature_vector)

X = [x[:-1] for x in feature_vector]
Y = [y[-1] for y in feature_vector]

pca = PCA(n_components=5)
X = [x[:-1] for x in feature_vector]
Y = [y[-1] for y in feature_vector]
X = pca.fit_transform(X)
pickle.dump(pca, open("pca_decision.p", "wb"))

k_fold = KFold(10)
results = []
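The snippet stops right after preparing `k_fold` and `results`; one plausible completion (an assumption about the intended loop) scores the tree on the ten folds with the already-imported `cross_val_score`:

# Sketch: evaluate the decision tree on the 10 folds prepared above
results = cross_val_score(clf, X, Y, cv=k_fold)
print(results.mean(), results.std())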
Exemplo n.º 42
plt.figure()

colors = rainbow(np.linspace(0, 1, len(kernels)))
plt.bar(kernels, svc_scores, color = colors)
for i in range(len(kernels)):
    plt.text(i, svc_scores[i], svc_scores[i])
plt.xlabel('Kernels')
plt.ylabel('Scores')
plt.title('Support Vector Classifier scores for different kernels')


#Decision Tree Classifier

dt_scores = []
for i in range(1, len(X.columns) + 1):
    dt_classifier = DecisionTreeClassifier(max_features = i, random_state = 0)
    dt_classifier.fit(X_train, y_train)
    dt_scores.append(dt_classifier.score(X_test, y_test))

plt.figure()

plt.plot([i for i in range(1, len(X.columns) + 1)], dt_scores, color = 'green')
for i in range(1, len(X.columns) + 1):
    plt.text(i, dt_scores[i-1], (i, dt_scores[i-1]))
plt.xticks([i for i in range(1, len(X.columns) + 1)])
plt.xlabel('Max features')
plt.ylabel('Scores')
plt.title('Decision Tree Classifier scores for different number of maximum features')


#Random Forest Classifier
Exemplo n.º 43
#df=(df-df.min())/(df.max()-df.min())
#df = df.reset_index()
########## split into training and test sets #############
X= df.iloc[:, :-1].values
y = df.iloc[:, 5].values
X_train, X_test, y_train,y_test = train_test_split(X,y, test_size=0.3, random_state=0)
##### choose at least 3 classifiers #####
######## k-nearest neighbors, support vector machine, decision tree ##########################

scaler = StandardScaler()
scaler.fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

SVC_model = SVC()
DTC_model = DecisionTreeClassifier() 
KNN_model = KNeighborsClassifier(n_neighbors=5)  

SVC_model.fit(X_train, y_train) 
KNN_model.fit(X_train, y_train)
DTC_model.fit(X_train, y_train)

SVC_prediction = SVC_model.predict(X_test) 
KNN_prediction = KNN_model.predict(X_test)
DTC_prediction = DTC_model.predict(X_test)
# Accuracy score is the simplest way to evaluate how a classifier performs
print(accuracy_score(SVC_prediction, y_test))  
print(accuracy_score(KNN_prediction, y_test))  
print(accuracy_score(DTC_prediction, y_test))  
############ Part 2.2 ##############################################
data2= pd.read_csv('6.csv')
# split dataset into training and test set
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(x,
                                                    y,
                                                    test_size=0.2,
                                                    random_state=42)

# feature scaling
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
x_train = sc.fit_transform(x_train)
x_test = sc.fit_transform(x_test)

# start making decision tree classifier
from sklearn.tree import DecisionTreeClassifier
clf = DecisionTreeClassifier(criterion='entropy', random_state=0)
clf.fit(x_train, y_train)

# predict our model
predict = clf.predict(x_test)

#create confusion matrix
from sklearn.metrics import confusion_matrix
cm = confusion_matrix(y_test, predict)
#visualization of train data
from matplotlib.colors import ListedColormap
x_set, y_set = x_train, y_train
x1, x2 = np.meshgrid(
    np.arange(start=x_set[:, 0].min() - 1,
              stop=x_set[:, 0].max() + 1,
              step=0.01),
Exemplo n.º 45
from sklearn.datasets import load_iris
iris = load_iris()
examples = iris.data
truths = iris.target

from sklearn.model_selection import train_test_split
train_examples, test_examples, train_truths, test_truths = train_test_split(examples, truths, test_size=0.33)

from sklearn.tree import DecisionTreeClassifier
decision_tree = DecisionTreeClassifier()
decision_tree.fit(train_examples, train_truths)
prediction = decision_tree.predict(test_examples)

from sklearn.metrics import accuracy_score
print("result with decision tree:", accuracy_score(test_truths, prediction))

from sklearn.tree import export_graphviz
export_graphviz(decision_tree, out_file='decision_tree_iris.dot')
Exemplo n.º 46
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB

import pandas as pd
import sklearn.datasets

def get_iris_df():
    ds = sklearn.datasets.load_iris()
    df = pd.DataFrame(ds['data'], columns=ds['feature_names'])
    code_species_map = dict(zip(range(3), ds['target_names']))
    df['species'] = [code_species_map[c] for c in ds['target']]
    return df

df = get_iris_df()

CLASS_MAP = {
    'Logistic Regression': ('-', LogisticRegression()),
    'Naive Bayes': ('--', GaussianNB()),
    'Decision Tree': ('.-', DecisionTreeClassifier(max_depth=5)),
    'Random Forest': (':', RandomForestClassifier(max_depth=5, n_estimators=10, max_features=1)),
}

X, Y = df[df.columns[:3]], (df['species']=='virginica')
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=.8)

for name, (line_fmt, model) in CLASS_MAP.items():
    model.fit(X_train, Y_train)
    preds = model.predict_proba(X_test)
    pred = pd.Series(preds[:,1])
    fpr, tpr, thresholds = roc_curve(Y_test, pred)
    auc_score = auc(fpr, tpr)
    label = '%s: auc=%f' % (name, auc_score)
    plt.plot(fpr, tpr, line_fmt, linewidth=5, label=label)

plt.legend(loc="lower right")
plt.title("Comparisons among classifiers")
Exemplo n.º 47
# In[23]:


# We can try different combination of variables
predictor_var = ['Credit_History','Education','Married','Self_Employed','Property_Area']
classification_model(model, train, predictor_var,outcome_var)


# The Credit History variable is a relatively dominating predictor since the additional variables seem to have little effect on the scores.

# #### Decision Tree

# In[24]:


model = DecisionTreeClassifier()
predictor_var = ['Credit_History']
classification_model(model, train, predictor_var, outcome_var)


# In[25]:


#We can try different combination of variables:
train.head()
predictor_var = ['Credit_History','Loan_Amount_Term','LoanAmount_log']
classification_model(model, train, predictor_var,outcome_var)


# #### Random Forest
Exemplo n.º 48
import random

data_set = DataSet()
data, label, class_names = data_set.get_train_data_set()

indexs = random.sample(range(len(data)), 50000)
data = data[indexs]
label = label[indexs]
X_train, X_test, y_train, y_test = train_test_split(data,
                                                    label,
                                                    test_size=0.33,
                                                    random_state=42)

est = [('count_vect', CountVectorizer()),
       ('tr', TruncatedSVD(n_components=10, n_iter=100, random_state=42)),
       ('clf_DT', DecisionTreeClassifier())]

pipeline_DT = Pipeline(est)

pipeline_DT = pipeline_DT.fit(X_train, y_train)
y_pred = pipeline_DT.predict(X_test)
print("F1 score - DT:",
      f1_score(y_test, pipeline_DT.predict(X_test), average='micro'))
print("Accuracy Score - DT:",
      accuracy_score(y_test, pipeline_DT.predict(X_test)))
cnf_matrix = confusion_matrix(y_test, y_pred)
plt.figure()
plt = plot_confusion_matrix(cnf_matrix,
                            classes=class_names,
                            normalize=True,
                            title='Normalized confusion matrix DT')
Exemplo n.º 49
            self.baseModels = copy.deepcopy(self.oriBaseModels)
            self.metaModel = copy.deepcopy(self.oriMetaModel)
            self.fit(trainX, trainy)
            print("训练集表现:")
            y_train = self.predict(trainX)
            cm = confusion_matrix(trainy, y_train)
            print(cm)
            y_pred = self.predict(testX)
            cm = confusion_matrix(testy, y_pred)
            cmTotal += np.array(cm)
            print("测试集表现:")
            print(cm)
        print(cmTotal)
        return cmTotal

if __name__ == '__main__':
    clf = StackingClassifier()
    from sklearn import datasets
    from sklearn.tree import DecisionTreeClassifier
    from sklearn.neural_network import MLPClassifier
    from sklearn.neighbors import KNeighborsClassifier
    from sklearn.naive_bayes import GaussianNB, BaseNB

    iris = datasets.load_iris()
    X = iris.data
    y = iris.target
    clf.setBaseModels(baseModels={'desisionTree': DecisionTreeClassifier(),
                                  'mlp': MLPClassifier(hidden_layer_sizes=(50)),
                                  'KNN': KNeighborsClassifier(n_neighbors=10), "NB": GaussianNB()})
    clf.setMetaModel( DecisionTreeClassifier())
    clf.kFoldValidatoin({'desisionTree': X, 'mlp': X, 'KNN': X, 'NB': X}, y, classNum=3)
Exemplo n.º 50
# - KNN
# - Logistic regression
# - Linear Discriminant Analysis

# In[ ]:

# Cross validate model with Kfold stratified cross val
kfold = StratifiedKFold(n_splits=10)

# In[ ]:

# Modeling step: test different algorithms
random_state = 7
classifiers = []
classifiers.append(SVC(random_state=random_state))
classifiers.append(DecisionTreeClassifier(random_state=random_state))
classifiers.append(
    AdaBoostClassifier(DecisionTreeClassifier(random_state=random_state),
                       random_state=random_state,
                       learning_rate=0.1))
classifiers.append(RandomForestClassifier(random_state=random_state))
classifiers.append(ExtraTreesClassifier(random_state=random_state))
classifiers.append(GradientBoostingClassifier(random_state=random_state))
classifiers.append(MLPClassifier(random_state=random_state))
classifiers.append(KNeighborsClassifier())
classifiers.append(LogisticRegression(random_state=random_state))
classifiers.append(LinearDiscriminantAnalysis())

cv_results = []
for classifier in classifiers:
    cv_results.append(
Exemplo n.º 51
data = featureFormat(my_dataset, features_list, sort_keys = True)

labels, features = targetFeatureSplit(data)

from sklearn.cross_validation import train_test_split
features_train, features_test, labels_train, labels_test = train_test_split(features, labels, test_size=0.3, random_state=42)
### Task 4: Try a variety of classifiers
### Please name your classifier clf for easy export below.
### Note that if you want to do PCA or other multi-stage operations,
### you'll need to use Pipelines. For more info:
### http://scikit-learn.org/stable/modules/pipeline.html
from sklearn.tree import DecisionTreeClassifier

t0 = time()

clf = DecisionTreeClassifier()
clf.fit(features_train,labels_train)
score = clf.score(features_test,labels_test)
pred= clf.predict(features_test)
print 'accuracy', score

print "Decision tree algorithm time:", round(time()-t0, 3), "s"



importances = clf.feature_importances_
import numpy as np
indices = np.argsort(importances)[::-1]
print 'Feature Ranking: '
for i in range(11):
    print "{} feature {} ({})".format(i+1,features_list[i+1],importances[indices[i]])
Exemplo n.º 52
import numpy as np
import matplotlib.pyplot as pt
import pandas as pd
import time
from sklearn.tree import DecisionTreeClassifier

x = int(input("Give the index of the test image you would like to predict : "))
#Read from train.csv
data = pd.read_csv("train.csv").as_matrix()

#Declare a Decision Tree Classifier Object
clf = DecisionTreeClassifier()

#training data
xtrain = data[20000:, 1:]
train_label = data[20000:, 0]

#testing data
xtest = data[0:21000, 1:]
actual_label = data[0:21000:, 0]

#training the classifier
print("training classifier.....")
clf.fit(xtrain, train_label)

#Calculating accuracy
print("Predicting the image at index %d ....." % x)
time.sleep(2)
p = clf.predict(xtest)
count = 0
for i in range(0, 21000):
Exemplo n.º 53
forest.fit(X,y)
feature_importances = pd.DataFrame(forest.feature_importances_,
                                   index = xTrain.columns,
                                    columns=['importance']).sort_values('importance',                                                                 ascending=False)
print(feature_importances.head(10))
exit(2)
'''

import warnings
warnings.filterwarnings('ignore')  # "error", "ignore", "always", "default", "module" or "once"
# prepare models
models = []
models.append(('Nearest Neighbors', KNeighborsClassifier(n_neighbors=29,weights='distance')))
models.append(('Linear SVM', SVC(kernel='linear', C=0.025)))
models.append(('Decision Tree',  DecisionTreeClassifier(max_depth=5)))
models.append(('Random Forest', RandomForestClassifier(n_estimators = 1000, random_state = 42)))
models.append(('Neural Net', MLPClassifier(alpha=1, max_iter=1000)))
models.append(('AdaBoost',  AdaBoostClassifier()))
models.append(('Naive Bayes', GaussianNB()))
models.append(('SGD Classifier', linear_model.SGDClassifier(max_iter=1000, tol=1e-3)))
models.append(('LogisticRegressionCV', linear_model.LogisticRegressionCV(cv=5,max_iter=60000)))
models.append(('K-means',KMeans(n_clusters=2)))
models.append(('Gradient-Boost',GradientBoostingClassifier()))

# evaluate each model in turn
results = []
names = []
scoring = 'f1_macro'
for name, model in models:
   cv_results = model_selection.cross_val_score(model, X, y, cv=5, scoring=scoring)
x = x.drop("index", axis=1)
x_test = x_test.drop("index", axis=1)
x.info()


# In[ ]:


y = train.Survived
x_train, x_cv, y_train, y_cv = train_test_split(x, y, test_size=0.25, random_state=0)


# In[ ]:


model = DecisionTreeClassifier(min_samples_split = 10)
model.fit(x_train,y_train)
print("Decision Tree")
print(model.score(x_train,y_train))
print(model.score(x_cv,y_cv))

imp = pd.DataFrame({"Features":x_train.columns})
imp["DecTree"] = model.feature_importances_

print("----------------------------")

rf = RandomForestClassifier(min_samples_split =20, n_estimators=100)
rf.fit(x_train,y_train)
print("Random Forest")
print(rf.score(x_train,y_train))
print(rf.score(x_cv,y_cv))
Exemplo n.º 55
def create_classifiers_features():

    clf_list = []

    params_kbest = {"kbest__k": [1,2, 3,  5, 10, 15]}


    kbest_clf_naive = GaussianNB()
    kbest_params_naive={}
    kbest_params_naive.update(params_kbest)
    kbest = SelectKBest()
    clf_list.append(  (Pipeline([("kbest", kbest), ("naive", kbest_clf_naive)]), kbest_params_naive) )

    #

    kbest_clf_tree = DecisionTreeClassifier()
    kbest_params_tree = {"tree__min_samples_split":[2, 5, 10, 20],
                    "tree__criterion": ('gini', 'entropy'),'tree__random_state':[50]}
    kbest_params_tree.update(params_kbest)
    kbest = SelectKBest()
    clf_list.append((Pipeline([("kbest", kbest), ("tree", kbest_clf_tree)]), kbest_params_tree))

    #

    kbest_clf_linearsvm = LinearSVC()
    kbest_params_linearsvm = {"svm__C": [0.1, 1, 5, 10, 100],
                        "svm__tol": [10**-1,  10**-3, 10**-5],
                        "svm__class_weight": ['balanced']

                        }
    kbest_params_linearsvm.update(params_kbest)
    kbest = SelectKBest()
    clf_list.append((Pipeline([("kbest", kbest), ("svm", kbest_clf_linearsvm)]), kbest_params_linearsvm))

    #

    kbest_clf_adaboost = AdaBoostClassifier()
    kbest_params_adaboost = { "adaboost__n_estimators":[20,  50, 100],
                        'adaboost__learning_rate': [0.4, 0.6, 1]}
    kbest_params_adaboost.update(params_kbest)
    kbest = SelectKBest()
    clf_list.append((Pipeline([("kbest", kbest), ("adaboost", kbest_clf_adaboost)]), kbest_params_adaboost))


    kbest_clf_random_tree  = RandomForestClassifier()
    kbest_params_random_tree  = {  "random_tree__n_estimators":[2, 3, 5,10,15],
                            "random_tree__criterion": ('gini', 'entropy'),
                            'random_tree__min_samples_split': [1, 2, 4]
                            }
    kbest_params_random_tree.update(params_kbest)
    kbest = SelectKBest()
    clf_list.append((Pipeline([("kbest", kbest), ("random_tree", kbest_clf_random_tree )]), kbest_params_random_tree ))

    #


    kbest_clf_log = LogisticRegression()
    kbest_params_log = {  "log__C":[0.05, 0.5, 1, 10, 10**2,10**5,],
                    "log__tol":[10**-1, 10**-5, 10**-10],
                    "log__penalty":['l2','l1'],
                    "log__class_weight":['balanced']
                    }
    kbest_params_log.update(params_kbest)
    kbest = SelectKBest()
    clf_list.append((Pipeline([("kbest", kbest), ("log", kbest_clf_log)]), kbest_params_log))

    return clf_list
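Each entry returned by `create_classifiers_features` is a `(pipeline, param_grid)` pair; a hedged sketch of how such a list is typically consumed with `GridSearchCV` (the scoring choice, the 5-fold CV and the helper name are assumptions):

# Sketch: grid-search every (pipeline, params) pair and rank by best score
from sklearn.model_selection import GridSearchCV

def grid_search_all(clf_list, X, y, scoring="f1"):
    best = []
    for pipeline, params in clf_list:
        search = GridSearchCV(pipeline, params, scoring=scoring, cv=5)
        search.fit(X, y)
        best.append((search.best_score_, search.best_estimator_))
    return sorted(best, key=lambda t: t[0], reverse=True)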
Exemplo n.º 56
X_test_counts = count_vect.fit_transform(test['text'])
X_train_counts = count_vect.transform(train['text'])

X_train, X_test, y_train, y_test = train_test_split(X_train_counts,
                                                    train['sentiment'],
                                                    test_size=0.4,
                                                    random_state=0)

clf = MLPClassifier(alpha=1, random_state=65)
clf.fit(X_train_counts, train['sentiment'])

clf2 = SVC(probability=True, gamma=2, C=1)
clf2.fit(X_train_counts, train['sentiment'])

clf3 = DecisionTreeClassifier(random_state=0)
clf3.fit(X_train_counts, train['sentiment'])

clf5 = BaggingClassifier(random_state=54)
clf5.fit(X_train, y_train)

clf6 = ExtraTreesClassifier(random_state=0)
clf6.fit(X_train, y_train)

clf7 = GradientBoostingClassifier(random_state=32)
clf7.fit(X_train, y_train)

vc = VotingClassifier(estimators=[('mlp', clf), ('dt', clf3), ('et', clf6),
                                  ('bag', clf5), ('grad', clf7)],
                      voting='soft',
                      weights=[0.3, 0.1, 0.2, 0.1, 0.3])
    y_test = pd.read_csv(filePath + "y_test.csv")
    featureList = []
    for i in range(len(x_train.columns)):
        featureList.append("x" + str(i))
    x_train.columns = featureList
    x_test.columns = featureList
    return x_train, x_test, y_train, y_test


x_train, x_test, y_train, y_test = load_dataset()


# y_train = y_train.values.ravel()
# y_test = y_test.values.ravel()

pipeline = Pipeline([('enc', OneHotEncoder(handle_unknown='ignore')), ('clf', DecisionTreeClassifier(criterion='entropy', min_samples_split=200, max_depth=24))])

pipeline.fit(x_train, y_train)

y_pred = pipeline.predict(x_test)



# USE FOR CLASSIFICATION GRID SEARCH
"""

pipeline = Pipeline([('oh', OneHotEncoder(handle_unknown='ignore')), ('dt', DecisionTreeClassifier(criterion='entropy', min_samples_split=200, max_depth=24))])

# pipeline = Pipeline([('enc', OneHotEncoder(handle_unknown='ignore')), ('clf', RandomForestClassifier())])
# Create lists of parameter for Decision Tree Classifier
n_estimators = list(range(200, 1001, 200))
Exemplo n.º 58
def create_classifiers():

    clf_list = []

    params_pca = {"pca__n_components": [2, 3,  5, 10, 15], "pca__whiten": [False]}

    #
    clf_naive = GaussianNB()
    params_naive = {}
    clf_list.append( (clf_naive, params_naive) )
    pca_clf_naive = GaussianNB()
    pca_params_naive={}
    pca_params_naive.update(params_pca)
    pca = PCA()
    clf_list.append(  (Pipeline([("pca", pca), ("naive", pca_clf_naive)]), pca_params_naive) )

    #
    clf_tree = DecisionTreeClassifier()
    params_tree = { "min_samples_split":[2, 5, 10, 20],
                    "criterion": ('gini', 'entropy'),
                    'random_state':[50]
                    }
    clf_list.append( (clf_tree, params_tree) )
    pca_clf_tree = DecisionTreeClassifier()
    pca_params_tree = {"tree__min_samples_split":[2, 5, 10, 20],
                    "tree__criterion": ('gini', 'entropy'),'tree__random_state':[50]}
    pca_params_tree.update(params_pca)
    pca = PCA()
    clf_list.append((Pipeline([("pca", pca), ("tree", pca_clf_tree)]), pca_params_tree))

    #
    clf_linearsvm = LinearSVC()
    params_linearsvm = {"C": [0.1, 1, 5, 10, 100],
                        "tol":[10**-1,  10**-3, 10**-5],
                        "class_weight":['balanced']

                        }
    clf_list.append( (clf_linearsvm, params_linearsvm) )
    pca_clf_linearsvm = LinearSVC()
    pca_params_linearsvm = {"svm__C": [0.1, 1, 5, 10, 100],
                        "svm__tol": [10**-1,  10**-3, 10**-5],
                        "svm__class_weight": ['balanced']

                        }
    pca_params_linearsvm.update(params_pca)
    pca = PCA()
    clf_list.append((Pipeline([("pca", pca), ("svm", pca_clf_linearsvm)]), pca_params_linearsvm))

    #
    clf_adaboost = AdaBoostClassifier()
    params_adaboost = { "n_estimators":[20, 50, 100],
                        'learning_rate': [0.4, 0.6, 1]}
    clf_list.append( (clf_adaboost, params_adaboost) )
    # pca_clf_adaboost = AdaBoostClassifier()
    # pca_params_adaboost = { "adaboost__n_estimators":[20,  50, 100],
    #                     'adaboost__learning_rate': [0.4, 0.6, 1]}
    # pca_params_adaboost.update(params_pca)
    # pca = PCA()
    # clf_list.append((Pipeline([("pca", pca), ("adaboost", pca_clf_adaboost)]), pca_params_adaboost))
    #
    clf_random_tree = RandomForestClassifier()
    params_random_tree = {  "n_estimators":[2, 3, 5,10,15],
                            "criterion": ('gini', 'entropy'),
                            'min_samples_split': [1, 2, 4], 'max_features': [1, 2, 3,'sqrt',5,10]
                            }
    clf_list.append( (clf_random_tree, params_random_tree) )
    pca_clf_random_tree  = RandomForestClassifier()
    pca_params_random_tree  = {  "random_tree__n_estimators":[2, 3, 5,10,15],
                            "random_tree__criterion": ('gini', 'entropy'),
                            'random_tree__min_samples_split': [1, 2, 4]
                            }
    pca_params_random_tree.update(params_pca)
    pca = PCA()
    clf_list.append((Pipeline([("pca", pca), ("random_tree", pca_clf_random_tree )]), pca_params_random_tree ))

    #
    clf_log = LogisticRegression()
    params_log = {  "C":[0.05, 0.5, 1, 10, 10**2,10**5,],
                    "tol":[10**-1, 10**-5, 10**-10],
                    "class_weight":['balanced'],
                    "penalty": ['l2', 'l1']
                    }
    clf_list.append( (clf_log, params_log) )

    pca_clf_log = LogisticRegression()
    pca_params_log = {  "log__C":[0.05, 0.5, 1, 10, 10**2,10**5,],
                    "log__tol":[10**-1, 10**-5, 10**-10],
                    "log__penalty":['l2','l1'],
                    "log__class_weight":['balanced']
                    }
    pca_params_log.update(params_pca)
    pca = PCA()
    clf_list.append((Pipeline([("pca", pca), ("log", pca_clf_log)]), pca_params_log))

    return clf_list
Exemplo n.º 59
0
 def cart(X, y, params):
     clf = CART(n_jobs=params['n_jobs'] if 'n_jobs' in params else 1)
     clf.fit(X, y)
     return clf
Exemplo n.º 60
# import time
# for i, clf in enumerate(clf_list):
#     start_time = time.time()
#     result=evaluate_classifier(clf, labels,features,scv)
#     summary_list1[i]=result
#     summary_list[clf] = result
#     print clf,result
#     print("--- %s seconds ---" % (time.time() - start_time))
#
# ordered_list = sorted(summary_list.keys(), key=lambda k: summary_list[k][3], reverse=True)
# print [(key,summary_list[key]) for key in summary_list.keys() if summary_list[key][1]>0.3 and summary_list[key][2]>0.3]
# print ordered_list
# print "*"*100
# print summary_list
# print "*"*100
#
# clf = ordered_list[0]
# scores = summary_list[clf]
# print "Best classifier is ", clf
# print "With scores of  accuracy,recall, precision,f1,f2: ", scores

clf = DecisionTreeClassifier(class_weight=None, criterion='entropy', max_depth=None,
                             max_features=None, max_leaf_nodes=None, min_samples_leaf=20,
                             min_samples_split=20, min_weight_fraction_leaf=0.0,
                             presort=False, random_state=50, splitter='best')
test_classifier(clf, my_dataset, features_list)
# Example starting point. Try investigating other evaluation techniques!

### Task 6: Dump your classifier, dataset, and features_list so anyone can
### check your results. You do not need to change anything below, but make sure
### that the version of poi_id.py that you submit can be run on its own and
### generates the necessary .pkl files for validating your results.

dump_classifier_and_data(clf, my_dataset, features_list)