Пример #1
0
def adaboost(train, test, headers, fullTestData):
    """Train one tree per outcome label, predict the test set and report it.

    For each label in ``['H', 'A', 'D']`` a tree is built on ``train`` with
    ``buildTree`` and scored on ``test`` with ``predict``.  For every test
    row the label with the largest score becomes the prediction.  Each row
    is printed and appended to ``resultdata.csv``; finally the accuracy (in
    percent) and the number of predictions are printed.

    Args:
        train: training rows handed to ``Node``.
        test: test rows; column ``-2`` is compared with the prediction.
        headers: passed through unchanged to ``buildTree``.
        fullTestData: rows whose columns ``-2``, ``-1`` and ``-4`` are
            reported under the "Home", "Away" and "Bookie" headings.

    Returns:
        None.
    """
    ylabels = ['H', 'A', 'D']  # Make predictions for each possible label
    results = []
    for y in ylabels:
        rootNode = Node(train)  # Initialize first decision stump
        # Only the state left on rootNode (``splits``) is used afterwards.
        buildTree(rootNode, y, headers)
        results.append(predict(test, rootNode.splits, y))

    print("Now making predictions")
    prediction = []
    for r in range(len(results[0])):
        scores = list(column(results, r))
        # Take the label with the corresponding max value as final prediction
        prediction.append(ylabels[scores.index(max(scores))])

    print("Now checking predictions")
    corr = 0
    print("Home\tAway\tPrediction\tActual\tBookie")

    # Extract every reported column once instead of once per output row.
    home = column(fullTestData, -2)
    away = column(fullTestData, -1)
    bookie = column(fullTestData, -4)
    actual = column(test, -2)

    # ``with`` guarantees the CSV file is closed even if a row fails.
    with open("resultdata.csv", 'a') as out_file:
        writer = csv.writer(out_file, quoting=csv.QUOTE_ALL)
        for p, guess in enumerate(prediction):
            print('\a')
            row = [home[p], away[p], guess, actual[p], bookie[p]]
            writer.writerow(row)
            print(row)
            if guess == actual[p]:
                corr += 1

    total = len(prediction)
    if total:
        print("%s %s" % (float(corr) * 100 / total, total))
    else:
        # No predictions at all: report 0 rather than dividing by zero.
        print("0 %s" % total)
    print("done")
Пример #2
0
    def predict(self, X):
        """
        Classify the examples in X by a majority vote over the trained trees.

            Input:
            ------
                X: [m x d] a d-dimensional test examples.

            Returns:
            -----------
                pclass: list with one entry per example, each being the
                        first field of ``stats.mode`` applied to that
                        example's votes across all trees.
        """
        if self.scalefeat:
            X = self.applyScaling(X)

        # Stacking gives one row per tree; transposing turns that into one
        # row per example holding every tree's vote for it.
        votes_per_example = np.array(
            [member.predict(X) for member in self.trees]).T

        return [stats.mode(votes)[0] for votes in votes_per_example]
def get_predict(trees_result, trees_fiture, data_train):
    """Sum the per-tree predictions of a forest for every sample.

    Args:
        trees_result: sequence of trained trees.
        trees_fiture: sequence parallel to ``trees_result``; entry ``k`` is
            the feature selection passed to ``split_data`` for tree ``k``.
        data_train: sample matrix; its first dimension is the sample count.
            The last element of each row returned by ``split_data`` is
            dropped before the row is handed to ``predict``.

    Returns:
        ``np.sum(..., axis=0)`` over the per-tree lists of predicted keys,
        i.e. one summed value per sample.
    """
    m_tree = len(trees_result)
    m = np.shape(data_train)[0]

    result = []
    for tree_idx in range(m_tree):
        clf = trees_result[tree_idx]
        feature = trees_fiture[tree_idx]
        data = split_data(data_train, feature)
        # ``predict`` returns a dict; its first key is used as the label.
        # A separate index name keeps the sample loop from shadowing the
        # tree index, and ``list(...)`` keeps this working where
        # ``dict.keys()`` is a view rather than a list.
        result_i = [
            list(predict(data[row_idx][0:-1], clf).keys())[0]
            for row_idx in range(m)
        ]
        result.append(result_i)
    final_predict = np.sum(result, axis=0)
    return final_predict
def get_predict(trees_result, trees_fiture, data_train):
    """Sum the per-tree predictions of a forest for every sample.

    Args:
        trees_result: sequence of trained trees.
        trees_fiture: sequence parallel to ``trees_result``; entry ``k`` is
            the feature selection passed to ``split_data`` for tree ``k``.
        data_train: sample matrix; its first dimension is the sample count.
            The last element of each row returned by ``split_data`` is
            dropped before the row is handed to ``predict``.

    Returns:
        ``np.sum(..., axis=0)`` over the per-tree lists of predicted keys,
        i.e. one summed value per sample.
    """
    m_tree = len(trees_result)
    m = np.shape(data_train)[0]

    result = []
    for tree_idx in range(m_tree):
        clf = trees_result[tree_idx]
        feature = trees_fiture[tree_idx]
        data = split_data(data_train, feature)
        # ``predict`` returns a dict; its first key is used as the label.
        # A separate index name keeps the sample loop from shadowing the
        # tree index, and ``list(...)`` keeps this working where
        # ``dict.keys()`` is a view rather than a list.
        result_i = [
            list(predict(data[row_idx][0:-1], clf).keys())[0]
            for row_idx in range(m)
        ]
        result.append(result_i)
    final_predict = np.sum(result, axis=0)
    return final_predict
Пример #5
0
 def predict(self, data):
     """Predict with the weak model(s) whose weight equals the median weight.

     Models with very low weight tend to underfit and models with very high
     weight tend to overfit, so only the median-weight model(s) are used.

     :param data: iterable of samples; each one is passed to the
         module-level ``predict`` together with ``self.feature_list``
     :return: 1-D int array holding the sign of the weighted prediction sum
     :raises ValueError: if no model weights have been recorded yet
     """
     weights = np.asarray(self.model_weight, dtype=float).reshape(-1)
     if weights.size == 0:
         raise ValueError("no trained models: call fit() before predict()")

     median = np.median(weights)
     # Weights are floats, so "equal to the median" means "within a small
     # tolerance of it".  The absolute difference is required here: a
     # signed difference would also accept every weight below the median.
     distance = np.abs(weights - median)
     # With an even number of models the median lies between two weights
     # and may match none of them; fall back to the nearest weight(s).
     tolerance = max(1e-3, distance.min())
     median_index = np.flatnonzero(distance <= tolerance)

     result = []
     for index in median_index:
         # Each model's predictions are scaled by that model's weight.
         pred = np.asarray([
             predict(self.model_list[index], sample, self.feature_list)
             for sample in data
         ])
         result.append(weights[index] * pred)
     # Summing the weighted predictions gives the final strong model.
     return np.sign(np.sum(result, axis=0)).astype(int).reshape(-1)
Пример #6
0
    def predict(self, X):
        """
        Label every example in X with the class most trees voted for.

            Input:
            ------
                X: [m x d] a d-dimensional test examples.

            Returns:
            -----------
                pclass: list with one entry of ``self.classes`` per example;
                        when several classes share the highest vote count the
                        one listed first in ``self.classes`` is returned.
        """
        if self.scalefeat:
            X = self.applyScaling(X)

        # Unpacking also insists that X is two-dimensional.
        nexamples, _ = X.shape

        # One row per tree, one column per example.
        votes = np.array([member.predict(X) for member in self.trees])

        z = []
        for example in range(nexamples):
            best_class, best_count = -1, -np.inf
            for class_idx in range(len(self.classes)):
                count = np.sum(votes[:, example] == self.classes[class_idx])
                # Strict comparison keeps the earliest class on ties.
                if count > best_count:
                    best_class, best_count = class_idx, count
            z.append(self.classes[best_class])
        return z
    def predict(self, x_pred):
        """Predict a label for every row of ``x_pred`` by majority vote.

        Each of the ``self.n_estimators`` trees sees only the features listed
        in its ``self.tree_feature`` entry.  The votes are cast to int and
        the most frequent value wins; on a tie the smallest label is chosen,
        because ``np.unique`` returns sorted labels and ``np.argmax`` picks
        the first maximum.

        :param x_pred: iterable of feature vectors that can be indexed by
            ``self.tree_feature[index]``
        :return: 1-D float array with one predicted label per row
        """
        labels = []
        for feature_vec in x_pred:  # iterate over every row of features
            # Gather the votes in a list; growing an array with np.append
            # copies it on every call.
            votes = [
                predict(self.tree_list[index],
                        feature_vec[self.tree_feature[index]],
                        self.feature_list[self.tree_feature[index]])
                for index in range(self.n_estimators)
            ]
            vote_array = np.asarray(votes, dtype=float).ravel()
            # The most frequent vote becomes the prediction for this row.
            label_class, counts = np.unique(vote_array.astype(int),
                                            return_counts=True)
            labels.append(label_class[np.argmax(counts)])

        return np.asarray(labels, dtype=float)
Пример #8
0
    def fit(self, data, label):
        """
        Fit the boosted ensemble; see the linked article for the method.

        Every iteration draws a weighted sub-sample, trains one weak tree on
        it, stores the tree and its weight, then re-weights the drawn
        samples and renormalises all sample weights.

        :param data: feature matrix, indexable by an array of row indices
        :param label: labels, indexable the same way; the weight update
            multiplies label by prediction, so presumably labels are +1/-1
            (verify against the caller)
        :return: None
        """
        n_samples = data.shape[0]
        # Start from a uniform sampling distribution over the data.
        self.data_weight = np.ones((n_samples, 1)) / n_samples

        # Row indices of the data set, used for sampling.
        index = np.arange(0, n_samples, 1)
        sample_size = int(n_samples * self.alpha)
        # Bound that keeps the error rate strictly inside (0, 1).
        eps = 1e-10

        for _ in range(self.n_iterates):
            # Sample according to the data weights; bagging samples with
            # replacement, boosting samples without.
            # https://zhuanlan.zhihu.com/p/47922595
            sub_samping = np.random.choice(
                index,
                sample_size,
                replace=False,
                p=self.data_weight.reshape(-1, ).tolist())
            train_x = data[sub_samping]
            train_y = label[sub_samping]
            dt = createTree(train_x, train_y, self.feature_list)  # weak learner

            self.model_list.append(dt)  # keep the weak learner

            pred = np.asarray(
                [predict(dt, sample, self.feature_list) for sample in train_x])
            # Error rate on the sub-sample: the summed weight of the
            # misclassified samples (0 where correct, 1 where wrong).
            pred_error = np.ones((len(pred), 1))
            pred_error[pred == train_y] = 0
            et = pred_error.T.dot(self.data_weight[sub_samping])
            # A perfect (or always-wrong) weak learner would otherwise give
            # an infinite model weight and NaN sample weights.
            et = np.clip(et, eps, 1 - eps)

            # Record the weight of this model.
            at = 0.5 * np.log((1 - et) / et)
            self.model_weight.append(at)

            # Update the weights of the sampled rows.
            self.data_weight[sub_samping] = self.data_weight[
                sub_samping] * np.exp(-at * train_y * pred).reshape(-1, 1)

            # Renormalise so the weights form a distribution again.
            self.data_weight = self.data_weight / self.data_weight.sum()
Пример #9
0
def get_predict(trees_result, trees_feature, data_train):
    '''Predict samples with an already trained random-forest model.

    :param trees_result: sequence of trained trees
    :param trees_feature: per-tree feature selection handed to ``split_data``
    :param data_train: sample matrix; its first dimension is the sample count
    :return: per-sample sum over all trees of each tree's predicted key
    '''
    sample_count = np.shape(data_train)[0]
    per_tree = []
    for tree_no in range(len(trees_result)):
        clf, feature = trees_result[tree_no], trees_feature[tree_no]
        data = split_data(data_train, feature)
        # Run every sample (minus its last element) through the tree and
        # keep the first key of the dict that ``predict`` returns.
        per_tree.append([
            list(predict(data[row_no][:-1], clf).keys())[0]
            for row_no in range(sample_count)
        ])
    return np.sum(per_tree, axis=0)
Пример #10
0
def ValidationMining(target, fn, ln):
    """Predict ``target`` for one player with a decision tree and compare.

    Looks the player up in ``validtree`` by first and last name to obtain
    the recorded value, loads the player's row from ``treesource``, builds a
    decision tree from the data returned by ``AllTables().sql`` and predicts
    the target for that row.

    :param target: name of the column to predict; it is read from both the
        ``validtree`` frame and the training frame
    :param fn: first name of the player
    :param ln: last name of the player
    :return: tuple ``(pred, real)`` of "Y"/"N" strings; either element is
        "Unknown" when the data needed for it is missing
    """
    print("target: %s, fn: %s, ln: %s" % (target, fn, ln))

    # get real value for input
    # NOTE(review): fn and ln are concatenated straight into the SQL text,
    # so this is open to SQL injection if they come from untrusted input.
    # Switch to a parameterized query once databaseconnection() accepts
    # parameters.
    r_sql = "select * from validtree where nameFirst = \'" + fn + "\' and nameLast = \'" + ln + "\' limit 1;"

    realdata = databaseconnection(r_sql)
    if realdata is None or len(realdata) == 0:
        print("No record exists for %s %s" % (fn, ln))
        return "Unknown", "Unknown"

    r_df = pd.DataFrame(list(realdata))
    r_df.columns = [
        'playerID', 'nameFirst', 'nameLast', 'nom', 'hof', 'man'
    ]
    r_df.fillna(value=0, inplace=True)
    real = "Y" if int(r_df[target].iloc[0]) == 1 else "N"
    print("real value is ", real)

    # get corresponding row
    playerid = r_df['playerID'].iloc[0]

    tables = AllTables()
    dfcols = tables.cols

    row_sql = "select * from treesource where playerID = \'" + playerid + "\'"
    rowdata = databaseconnection(row_sql)
    if rowdata is None or len(rowdata) == 0:
        # Without a feature row nothing can be predicted; report that
        # instead of failing on the column assignment below.
        print("No treesource row exists for playerID %s" % playerid)
        return "Unknown", real
    rowdf = pd.DataFrame(list(rowdata))

    rowdf.columns = dfcols
    rowdf.fillna(value=0, inplace=True)

    # decision tree data
    results = databaseconnection(tables.sql)
    print("get result from db..")
    df = pd.DataFrame(list(results))

    df.columns = dfcols
    df.fillna(value=0, inplace=True)
    df = removezero(df)
    # Keep the target aside before the label columns are dropped.
    y = df[target].values.astype(int)
    df.drop(columns=['playerID', 'nom', 'hof', 'man'], inplace=True)

    # Cast the features to integers and round them to the nearest ten, for
    # the training data and for the queried row alike.
    cols = list(df.columns.values)
    df = df[cols].applymap(np.int64)
    df = df[cols].round(decimals=-1)

    rowdf = rowdf[cols].applymap(np.int64)
    rowdf = rowdf[cols].round(decimals=-1)
    row = rowdf.iloc[0]

    df[target] = y.tolist()
    # Only the training part of the split is used here.
    train, _ = tree.train_test_split(df)

    attributes = cols
    print("Generating decision tree..")
    root = tree.build_tree(train, attributes, target)

    print("Start to predict..")
    pred = str(tree.predict(root, row))
    pred = "Y" if int(pred) == 1 else "N"

    return pred, real
Пример #11
0
def predict_forest_predict(forest, x):
    """Return the average of ``tree.predict`` over every member of ``forest``.

    Each member's prediction for ``x`` is divided by the number of members
    and the shares are added up starting from ``0.0``, so an empty forest
    yields ``0.0``.
    """
    member_count = len(forest)
    shares = (tree.predict(forest[idx], x) / member_count
              for idx in range(member_count))
    return sum(shares, 0.0)
Пример #12
0
	['sunny','33','high','FALSE',25],
	['sunny','32','high','TRUE',30],
	['overcast','31','high','FALSE',46],
	['rainy','22','high','FALSE',45],
	['rainy','13','normal','FALSE',52],
	['rainy','15','normal','TRUE',23],
	['overcast','12','normal','TRUE',43],
	['sunny','25','high','FALSE',35],
	['sunny','13','normal','FALSE',35],
	['rainy','23','normal','FALSE',38],
	['sunny','24','normal','TRUE',46],
	['overcast','25','high','TRUE',48],
	['overcast','24','normal','FALSE',52],
	['rainy','21','high','TRUE',44]
]
"""
# Numerically encoded counterpart of the rows listed above: the categorical
# strings are replaced by numeric strings ('sunny' -> '1', 'overcast' -> '50',
# 'rainy' -> '100', 'high' -> '90', 'normal' -> '50', 'FALSE' -> '0',
# 'TRUE' -> '1'); the last column stays an int.
datamatrix = [['1', '33', '90', '0', 25], ['1', '32', '90', '1', 30],
              ['50', '31', '90', '0', 46], ['100', '22', '90', '0', 45],
              ['100', '13', '50', '0', 52], ['100', '15', '50', '1', 23],
              ['50', '12', '50', '1', 43], ['1', '25', '90', '0', 35],
              ['1', '13', '50', '0', 35], ['100', '23', '50', '0', 38],
              ['1', '24', '50', '1', 46], ['50', '25', '90', '1', 48],
              ['50', '24', '50', '0', 52], ['100', '21', '90', '1', 44]]
# Sort the rows in place; rows are lists, so they compare element by element
# and the leading columns compare as strings, not as numbers.
datamatrix.sort()
# Rebinds the name `tree` to the constructed Tree instance; `tableheader` is
# defined outside this excerpt.
tree = tree.Tree(tableheader, datamatrix)  #, mode="between")
tree.showTrees()
print 'PREDICTION'
#tree.predict(['sunny','hot','high','FALSE'])
# NOTE(review): this query uses categorical strings although datamatrix is
# numerically encoded, and the return value is discarded - confirm intent.
tree.predict(['sunny', 'hot', 'high', 'FALSE'])
#tree.predict(['sunny','33','high','FALSE'])
#print tree.predict(['1','24','50','1'])
Пример #13
0
def predict(forest, x):
    """Return the mean of ``tree.predict`` over all members of ``forest``.

    ``forest`` must expose ``size`` and support integer indexing; each
    member's prediction for ``x`` is stored in an object array and the
    array is averaged with ``np.mean``.
    """
    member_count = forest.size
    outputs = np.empty(member_count, dtype=object)
    for slot in range(member_count):
        outputs[slot] = tree.predict(forest[slot], x)
    return np.mean(outputs)