Пример #1
0
 def testEntropyOneEl(self):
     """
     Entropy of a list containing a single label is 0 (no uncertainty).
     """
     dt = decisionTree.DecisionTree()
     # assertEqual: assertEquals is a deprecated alias removed in Python 3.12.
     self.assertEqual(dt.entropy(['yes']), 0)
Пример #2
0
 def testRecursiveBuild(self):
     """
     A two-feature tree whose labels disagree must recurse: its repr must
     not collapse to a bare leaf label or the default value.
     """
     dt = decisionTree.DecisionTree()
     tree = dt.makeTree([[1,'a'],[2,'a']], ['yes','no'],['a1','a2'],
                        {'a1': [1,2], 'a2': ['a','b']}, 'default')
     # assertNotEqual: assertNotEquals is a deprecated alias removed in
     # Python 3.12.
     self.assertNotEqual(repr(tree), 'yes')
     self.assertNotEqual(repr(tree), 'no')
     self.assertNotEqual(repr(tree), 'default')
Пример #3
0
def tree_fit_predict(xTrain, yTrain, xTest, index):
    """Fit a decision tree on the rows of (xTrain, yTrain) selected by
    `index`, then return its predictions on xTest as a tensor.

    Relies on a module-level `maxDepth` and the `tf` module being in
    scope — TODO confirm both are defined at module level.
    """
    model = decisionTree.DecisionTree(maxDepth=maxDepth, verbose=True)
    # Chained fit().predict() assumes fit() returns the model itself.
    result = model.fit(xTrain[index], yTrain[index]).predict(xTest)
    return tf.convert_to_tensor(result)
Пример #4
0
 def testTennis(self):
     """
     Test the entire pipeline on the tennis data set: the fitted tree must
     reproduce every training label, both singly and in batch.
     """
     tennis = mu.extract_data('tennis.csv')
     tennis = mu.enhance_data(tennis)
     dt = decisionTree.DecisionTree(tennis['feature_dict'], tennis['feature_names'])
     dt.fit(tennis['data'],tennis['target'])
     # assertEqual: assertEquals is a deprecated alias removed in Python 3.12.
     for x,y in zip(tennis['data'],tennis['target']):
         self.assertEqual(dt.predict([x]), [y])
     self.assertEqual(dt.predict(tennis['data']), tennis['target'])
def run(sc, numTrees=100):
    """Train a random-forest ensemble on the airline data via Spark and
    return the F1 score of the majority vote on the held-out set.

    sc: SparkContext used to parallelize the per-tree index samples.
    numTrees: number of decision trees (one per random sub-sample).
    """
    def zero_matrix(n, m):
        # n x m integer matrix of zeros, used as the vote accumulator.
        return np.zeros(n * m, dtype=int).reshape(n, m)

    def vote_increment(y_est):
        # Test-point x class matrix with a 1 marking each estimator prediction.
        increment = zero_matrix(y_est.size, n_ys)
        increment[np.arange(y_est.size), y_est] = 1
        return increment

    airlineData = pd.read_csv("AirlineReduced")
    # Drop timing columns and the raw delay targets before training.
    airlineData = airlineData.loc[:, ~airlineData.columns.isin([
        'DepTime', 'ArrTime', 'CRSArrTime', 'CRSDepTime', 'ActualElapsedTime',
        'ArrTimeInMins', 'ArrDelay'
    ])]

    # NOTE(review): test_size=0.75 trains on only 25% of the rows —
    # confirm this split is intentional.
    X_train, X_test, y_train, y_test = train_test_split(
        airlineData.loc[:, ~airlineData.columns.isin(['IsDelayed'])],
        airlineData['IsDelayed'],
        test_size=0.75,
        random_state=42)
    X_train, X_test, y_train, y_test = np.array(X_train), np.array(
        X_test), np.array(y_train), np.array(y_test)

    n_test = X_test.shape[0]
    n_ys = np.unique(y_train).size

    # Broadcast the (read-only) arrays to the workers once.
    X_train_BC = sc.broadcast(X_train)
    X_test_BC = sc.broadcast(X_test)
    y_train_BC = sc.broadcast(y_train)
    y_test_BC = sc.broadcast(y_test)

    model = decisionTree.DecisionTree()
    # One random sub-sample of 2/3 of the training indices per tree
    # (random.sample draws without replacement within each sub-sample).
    list_of_random_indexes = [
        random.sample(list(range(y_train.size)), int((2 / 3) * y_train.size))
        for _ in range(numTrees)
    ]
    samples = sc.parallelize(list_of_random_indexes)

    # Train a model per sub-sample, predict the test set, and tally the
    # per-class votes with a fold over element-wise addition.
    vote_tally = samples.map(lambda index: model.fit(X_train_BC.value[
        index], y_train_BC.value[index]).predict(X_test_BC.value)).map(
            vote_increment).fold(zero_matrix(n_test, n_ys),
                                 np.add)  # Take the learner majority vote.

    y_estimate_vote = np.argmax(vote_tally, axis=1)
    return f1_score(y_test_BC.value, y_estimate_vote)
Пример #6
0
    def fit(self, dataset, label):
        """Fit a gradient-boosting ensemble of regression trees.

        Trains `self.iteration` trees sequentially; each tree is fit to
        the current residual target and appended to `self.models`.

        dataset: training features, passed through unchanged to each tree.
        label: training targets; converted to an ndarray for the residual
            arithmetic below.
        """
        label = np.asarray(label)
        
        # yi: current regression target (residual) for the next tree.
        yi = label
        # y_prev: running sum of the raw predictions of all trees so far.
        y_prev = 0
        
        for i in range(self.iteration):
            begin = time.time()
            # Fresh tree per boosting round; 'sse' presumably selects a
            # sum-of-squared-errors split criterion — TODO confirm against
            # decisionTree.DecisionTree.
            model = decisionTree.DecisionTree(self.max_depth_tree, self.min_size, 'sse')
            model.fit(dataset, yi)
            
            y_predict = model.predict(dataset)
            y_prev = y_prev + y_predict
                                    
            # Residual update: the learning rate scales the *accumulated*
            # ensemble prediction before subtracting from the labels.
            # NOTE(review): standard GBM shrinks each tree's contribution
            # individually — confirm this formulation is intended.
            yi = label - self.learning_rate * y_prev
            
            self.models.append(model)
#             self.gammas.append(gamma)
            print("Iteration " + str(i+1) + ": " + str(time.time() - begin) + " s")
Пример #7
0
# Evaluate a random forest and a single decision tree on the validation
# split, printing each model's accuracy.

def _accuracy(predictions, actuals):
    """Fraction of predictions that match the corresponding actual label."""
    return sum(p == a for p, a in zip(predictions, actuals)) / len(predictions)

# random forest

rf = randomForest.RandomForest(10, 10, train_X.shape[0], train_X.shape[1])
rf.train(train_X, train_y)
res = rf.predict(validation_X)
score = _accuracy(res, validation_y)
print(score)

# decision tree

tree = decisionTree.DecisionTree(10, train_X.shape[1])
tree.train(train_X, train_y)
res = tree.predict(validation_X)
score = _accuracy(res, validation_y)
print(score)
Пример #8
0
def main():
    """End-to-end pipeline on the diabetic data set: clean the raw CSV,
    outlier-filter integer columns, then train and apply gradient boosting
    and a decision tree on the pre-processed data.
    """
    # Load all columns except 'payer_code'; '?' is treated as missing.
    temp = lambda col: col not in ['payer_code']
    diabet = pd.read_csv('dataset/diabetic_data.csv',
                         na_values='?',
                         usecols=temp)
    # print(diabet.isnull().sum())
    # print(diabet.dtypes)
    # Per-column fill values: mean for int64 columns, mode for the rest.
    fill = {}
    for column in diabet:
        if diabet[column].dtypes == 'int64':
            fill[column] = diabet[column].mean()
        else:
            temp = diabet[column].mode()
            fill[column] = temp[0]
    # diabet = diabet.fillna(value=fill)
    # print(diabet.isnull().sum())
    # Convert object columns holding numeric strings to numeric dtype;
    # columns whose first value fails float() are left unchanged.
    for column in diabet:
        if diabet[column].dtypes != 'int64':
            try:
                float(diabet[column][0])
                # diabet[column] = diabet[column].apply(lambda x: x.isnumeric())
                diabet[column] = diabet[column].apply(pd.to_numeric,
                                                      errors='coerce')
            except ValueError:
                continue
    # print(diabet['diag_1'])
    # print(diabet['diag_2'])
    # print(diabet.dtypes)
    # Keep int64 values within 2 standard deviations of the column mean.
    # NOTE(review): the filtered lists are re-indexed from 0 by pd.Series,
    # so rows of different columns no longer align — confirm intended.
    temp = pd.DataFrame()
    for column in diabet:
        clean_df = diabet[column]
        elements = np.array(clean_df)
        if diabet[column].dtypes == 'int64':
            mean = np.mean(elements, axis=0)
            sd = np.std(elements, axis=0)
            final_list = [x for x in clean_df if (x > mean - 2 * sd)]
            final_list = [x for x in final_list if (x < mean + 2 * sd)]
            temp[column] = pd.Series(final_list)
        else:
            temp[column] = pd.Series(elements)
    temp = temp.fillna(value=fill)
    # print(temp.isnull().sum())
    # print(temp.head())
    # temp.sort_values(by=[''], ascending = False)

    # temp.to_csv('dataset/pre_processed.csv', encoding='UTF8', index=False)

    # Reload the pre-processed CSV (the write above is commented out, so
    # the file is presumably produced by an earlier run — verify).
    temp = lambda col: col not in ['payer_code']
    df = pd.read_csv('dataset/pre_processed.csv', usecols=temp)
    y = df['diabetesMed'].map({'No': 0, 'Yes': 1})
    # print(df)

    clean_df = featureSelection.feature_engineering(df)
    # print(clean_df)

    # Project the engineered features onto 15 principal components.
    pca = PCA(15)
    pca_arr = pca.fit_transform(clean_df)
    pca_df = pd.DataFrame(data=pca_arr)
    # print(pca_df)

    # clean_df['label'] = y
    # clean_df.to_csv("dataset/clean.csv", index_label=False)
    # pca_df['label'] = y
    # clean_df.to_csv("dataset/clean_pca.csv", index_label=False)

    # Train/evaluate on the first 10k rows of the cleaned data set.
    df_clean = pd.read_csv("dataset/clean.csv")
    df_clean = df_clean[:10000]
    y = df_clean["label"]
    del df_clean["label"]
    # print(df_clean)

    model = gradientBoosting.GradientBoosting(10, 0.1, max_depth_tree=4)
    model.fit(df_clean, y)

    for dt in model.models:
        print(dt.root)

    a = model.predict(df_clean)
    print(a)

    dt = decisionTree.DecisionTree(5, 100)
    dt.fit(df_clean, y)

    b = dt.predict(df_clean)
Пример #9
0
import preprocess as pp
import decisionTree as dt
import numpy as np
import random

# Train a decision tree on data.csv, report its test error, and print a
# couple of ad-hoc predictions.
random.seed()
file = 'data.csv'

rows = pp.readFile(file=file)

X_train, Y_train, X_test, Y_test = pp.splitData(rows)

tree = dt.DecisionTree(X_train, Y_train, 9)
tree.train()
Y_pred, error = tree.test(X_test, Y_test)

print("error in decision tree testing =", error)
# Show each predicted label next to its actual value.
for idx, actual in enumerate(Y_test):
    print(Y_pred[idx], actual)

print("vignesh's prediction")

Xmale = [['3', '9.6', 'M', '28.3', '7', '1']]
Xfemale = [['3', '9.6', 'F', '28.3', '7', '1']]
Y = [10]

# Same sample predicted twice, differing only in the gender feature.
for label, sample in (("vignesh's grade", Xmale),
                      ("chhakka vignesh's grade", Xfemale)):
    yp, e = tree.test(sample, Y)
    print(label, yp[0])
Пример #10
0
 def testFitEmpty(self):
     """Fitting on empty data must fall back to the default value."""
     dt = decisionTree.DecisionTree(default_v='default')
     dt.fit([],[])
     # assertEqual: assertEquals is a deprecated alias removed in Python 3.12.
     self.assertEqual(repr(dt.clf), 'default')
Пример #11
0
 def testEntropyMax(self):
     """A 50/50 two-label split has maximal entropy of 1.0 bit."""
     dt = decisionTree.DecisionTree()
     # assertEqual: assertEquals is a deprecated alias removed in Python 3.12.
     self.assertEqual(dt.entropy(['yes','no']), 1.0)
Пример #12
0
 def testEntropyEmpty(self):
     """
     Entropy of an empty label list is 0.
     """
     dt = decisionTree.DecisionTree()
     # assertEqual: assertEquals is a deprecated alias removed in Python 3.12.
     self.assertEqual(dt.entropy([]), 0)
Пример #13
0
 def testbuildEmpty(self):
     """Building a tree from empty inputs yields the default value."""
     dt = decisionTree.DecisionTree()
     tree = dt.makeTree([],[],[],{},'default')
     # assertEqual: assertEquals is a deprecated alias removed in Python 3.12.
     self.assertEqual(repr(tree), 'default')
Пример #14
0
 def testOneFeature(self):
     """With a single feature available, the tree must split on it."""
     dt = decisionTree.DecisionTree()
     tree = dt.makeTree([[1],[1],[2]], ['no','yes','yes'], ['a1'],
                        {'a1': [1,2], 'a2': ['a','b']}, 'default')
     # assertEqual: assertEquals is a deprecated alias removed in Python 3.12.
     self.assertEqual(tree.attribute, 'a1')
Пример #15
0
 def testSelAttRest(self):
     """Attribute selection on the restaurant data picks attribute 4."""
     dt = decisionTree.DecisionTree()
     rest = mu.extract_data('restaurant.csv')
     attrib = dt.selectAttribute(rest['data'], rest['target'])
     # assertEqual: assertEquals is a deprecated alias removed in Python 3.12.
     self.assertEqual(attrib, 4)
Пример #16
0
 def __init__(self, data):
     """Store the training data and create the decision-tree model.

     data: training data kept on the instance — format defined by caller.
     """
     # Fixed: was misspelled `_init_` (single underscores), so Python never
     # invoked it as the constructor and instances were left uninitialized.
     self.data = data
     self.decisionTree = dt.DecisionTree()
Пример #17
0
 def testSelAttTenn(self):
     """Attribute selection on the tennis data picks attribute 0."""
     dt = decisionTree.DecisionTree()
     tennis = mu.extract_data('tennis.csv')
     attrib = dt.selectAttribute(tennis['data'], tennis['target'])
     # assertEqual: assertEquals is a deprecated alias removed in Python 3.12.
     self.assertEqual(attrib, 0)
Пример #18
0
 def testEntropyUnbal(self):
     """A 2/1 label split has entropy ~0.9183 bits."""
     dt = decisionTree.DecisionTree()
     # assertAlmostEqual: assertAlmostEquals is a deprecated alias removed
     # in Python 3.12.
     self.assertAlmostEqual(dt.entropy(['yes','no','yes']), 0.918296, places=4)
Пример #19
0
 def testEntropyThreeVal(self):
     """Three equally likely labels have entropy log2(3) ~ 1.58496 bits."""
     dt = decisionTree.DecisionTree()
     # assertAlmostEqual: assertAlmostEquals is a deprecated alias removed
     # in Python 3.12.
     self.assertAlmostEqual(dt.entropy(['yes','no','maybe']), 1.58496, places=4)
Пример #20
0
def k_fold_eval(traind, k):
    '''Run k-fold cross-validation of the decision tree on `traind`.

    traind: dict with "data" (list of samples), "target" (list of labels),
        and the feature metadata keys 'feature_dict' / 'feature_names'.
    k: number of folds.

    Returns a dict with test/train precision, recall and accuracy, each
    averaged over the k folds.
    '''
    def confusion_counts(predicted, actual, positive):
        # Tally TP/TN/FP/FN treating `positive` as the positive class.
        TP = TN = FP = FN = 0.0
        for p, a in zip(predicted, actual):
            if p == a:
                if p == positive:
                    TP += 1
                else:
                    TN += 1
            else:
                if p == positive:
                    FP += 1
                else:
                    FN += 1
        return TP, TN, FP, FN

    def safe_ratio(num, denom):
        # Precision/recall denominators can be zero on small folds.
        return 0.0 if denom == 0.0 else float(num / denom)

    test_size = int(len(traind["data"]) / k)
    result_dict = {}
    # Test-set precisions, recalls, accuracies per fold.
    v_precisions = []
    v_recalls = []
    v_accuracys = []
    # Training-set precisions, recalls, accuracies per fold.
    t_precisions = []
    t_recalls = []
    t_accuracys = []
    # Random permutation of the fold ids 0..k-1.
    # Fixed: xrange is Python 2 only — use range.
    id_ls = random.sample(range(0, k), k)
    for i in id_ls:
        # The i-th slice of the data is the held-out test fold ...
        test_set = traind["data"][i * test_size:(i + 1) * test_size]
        # ... and everything else is the training fold.
        train_set = traind["data"][:]
        train_labels = traind["target"][:]
        del (train_labels[i * test_size:(i + 1) * test_size])
        del (train_set[i * test_size:(i + 1) * test_size])
        # Fit on the training fold and predict both folds.
        # Fixed: feature metadata was read from an undefined global `data`;
        # it comes from the `traind` argument.
        clf = decisionTree.DecisionTree(attrib_d=traind['feature_dict'],
                                        attribs=traind['feature_names'],
                                        default_v="default")
        clf.fit(train_set, train_labels)
        train_tars = clf.predict(train_set)
        test_tars = clf.predict(test_set)
        # Actual labels of the held-out fold.
        test_act = traind["target"][i * test_size:(i + 1) * test_size]
        # Assume the most common label is the positive class.
        positive = decisionTree.zeroR(traind['target'])

        # Test-fold performance.
        TP, TN, FP, FN = confusion_counts(test_tars, test_act, positive)
        v_precisions.append(safe_ratio(TP, TP + FP))
        v_recalls.append(safe_ratio(TP, TP + FN))
        v_accuracys.append(float((TP + TN) / len(test_act)))

        # Training-fold performance (same tally, different predictions).
        TP, TN, FP, FN = confusion_counts(train_tars, train_labels, positive)
        t_precisions.append(safe_ratio(TP, TP + FP))
        t_recalls.append(safe_ratio(TP, TP + FN))
        t_accuracys.append((TP + TN) / len(train_labels))

    # Average each metric over the k folds.
    result_dict["test_precision"] = sum(v_precisions) / k
    result_dict["test_recall"] = sum(v_recalls) / k
    result_dict["test_accuracy"] = sum(v_accuracys) / k

    result_dict["train_precision"] = sum(t_precisions) / k
    result_dict["train_recall"] = sum(t_recalls) / k
    result_dict["train_accuracy"] = sum(t_accuracys) / k

    return result_dict
Пример #21
0
 def testNoFeatures(self):
     """With no features left, the tree becomes a majority-label leaf."""
     dt = decisionTree.DecisionTree()
     tree = dt.makeTree([[],[],[]], ['no','yes','yes'], [],
                        {'a1': [1,2], 'a2': ['a','b']}, 'default')
     # assertEqual: assertEquals is a deprecated alias removed in Python 3.12.
     self.assertEqual(repr(tree), 'yes')
Пример #22
0
 def testbuildZeroEnt(self):
     """Zero-entropy labels (all 'yes') collapse the tree to a leaf."""
     dt = decisionTree.DecisionTree()
     tree = dt.makeTree([[1,'a'],[2,'a']], ['yes','yes'],['a1','a2'],
                        {'a1': [1,2], 'a2': ['a','b']}, 'default')
     # assertEqual: assertEquals is a deprecated alias removed in Python 3.12.
     self.assertEqual(repr(tree), 'yes')
Пример #23
0
 def testEntropyAllSame(self):
     """Two identical labels carry zero entropy."""
     dt = decisionTree.DecisionTree()
     # assertEqual: assertEquals is a deprecated alias removed in Python 3.12.
     self.assertEqual(dt.entropy(['yes','yes']), 0)