def model_bin_train(self, data_row, truth, num_bins=2):
    """Fit per-column bin cutoffs and bin the rows of each class.

    For every column the cutoffs are derived from that column's own
    minimum, maximum and mean, plus the means of the values strictly below
    and strictly above the column mean.

    Args:
        data_row: training data as a list of rows.
        truth: labels aligned with ``data_row``. They are summed to estimate
            P(y = 1) and only the labels 0 and 1 are trained.
        num_bins: 4 selects four cutoffs per column; any other value
            (including the default) selects the six-cutoff layout.

    Returns:
        dict mapping each label (0 and 1) to whatever
        ``hw3.bins_per_column_by_col`` returns for the rows with that label.

    Side effects:
        Sets ``self.y_prob`` (fraction of ``truth`` that sums as 1) and
        ``self.cutoffs`` (the per-column cutoff lists).
    """
    # TODO add epsilon
    model = {}
    data_col = hw3.transpose_array(data_row)

    cutoffsc = []
    for column in data_col:
        values = np.asarray(column)
        # Use this column's own extremes. Calling min()/max() on the list of
        # columns would compare whole columns lexicographically instead.
        col_min = values.min()
        col_max = values.max()
        mu = values.mean()
        below = values[values < mu]
        above = values[values > mu]
        # A constant column has nothing on either side of its mean; fall
        # back to the mean so the cutoff is not NaN.
        low_mu = below.mean() if below.size else mu
        high_mu = above.mean() if above.size else mu
        if num_bins == 4:
            cutoffsc.append([col_min, low_mu, mu, high_mu])
        else:
            # NOTE(review): these intermediate entries are half-widths, not
            # midpoints, and the list is not sorted -- formulas are kept
            # as they were; confirm the intended layout.
            cutoffsc.append([
                col_min,
                (low_mu - col_min) / 2,
                mu,
                (high_mu - mu) / 2,
                high_mu,
                (col_max - high_mu) / 2,
            ])

    for label in [0, 1]:
        # transpose so the binning helper walks the class subset by column
        sub_data = hw3.transpose_array(
            hw3.get_sub_at_value(data_row, truth, label))
        model[label] = hw3.bins_per_column_by_col(sub_data, cutoffsc)

    self.y_prob = float(sum(truth)) / len(truth)
    self.cutoffs = cutoffsc
    return model
def testTransposeArray():
    """Manual check: print a loaded data subset and its transpose.

    Loads data through ``hw3.load_and_normalize_spambase``, restricts it to
    the first three columns via ``utils.train_subset``, converts it with
    ``hw3.pandas_to_data`` and prints it before and after
    ``hw3.transpose_array``.  Nothing is asserted or returned.
    """
    dfup = hw3.load_and_normalize_spambase()
    cols = dfup.columns[0:3]
    sub = utils.train_subset(dfup, cols, 5)
    up = hw3.pandas_to_data(sub)
    # A parenthesised single argument prints identically under the Python 2
    # print statement and the Python 3 print function.
    print(up)
    trans = hw3.transpose_array(up)
    print(trans)
def get_prob_over(data_by_row, mus):
    """Return, per column, the fraction of rows strictly above a threshold.

    Args:
        data_by_row: data as a list of rows.
        mus: one threshold per column (indexed by column position).

    Returns:
        A flat list with one float per column: the number of values in that
        column greater than ``mus[col]`` divided by the number of rows.
    """
    size = len(data_by_row)
    by_col = hw3.transpose_array(data_by_row)
    probability_above_mu = []
    for col, column in enumerate(by_col):
        mu_col = mus[col]
        total_over = sum(1 for value in column if value > mu_col)
        probability_above_mu.append(float(total_over) / size)
    return probability_above_mu
 def model_average_predict(self, data_row, theta=.5):
     """Predict 0/1 for each row with an above/below-average Naive Bayes.

     For each row the score of a class is
         P(Y) * product over columns of P(X_c | Y)
     where P(X_c | Y) is the stored probability that column c is above its
     average given the class (or one minus it when the value is not above).
     The class-1 score is normalised by the sum of both scores and compared
     with ``theta``.

     Args:
         data_row: data as a list of rows; column averages are taken from
             this same data via ``hw3.get_mus``.
         theta: decision threshold on the normalised class-1 score.

     Returns:
         List with one prediction (1 or 0) per row.
     """
     mus = hw3.get_mus(data_row)
     # NOTE(review): index 0 is read as the "given y = 1" table and index 1
     # as the "given y = 0" table; confirm against the code that builds
     # self.model.
     prob_over_given_1 = self.model[0]
     prob_over_given_0 = self.model[1]
     prob_y1 = self.model[3]
     predict = []
     for row in data_row:
         prob_1 = 1
         prob_0 = 1
         for c, value in enumerate(row):
             if value > mus[c]:
                 prob_x1 = prob_over_given_1[c]
                 prob_x0 = prob_over_given_0[c]
             else:
                 prob_x1 = 1 - prob_over_given_1[c]
                 prob_x0 = 1 - prob_over_given_0[c]
             prob_1 = prob_1 * prob_x1
             prob_0 = prob_0 * prob_x0
         # Apply the class priors once per row.
         prob_1 = prob_1 * prob_y1
         prob_0 = prob_0 * (1 - prob_y1)
         total = prob_0 + prob_1
         if total == 0:
             # Both class scores vanished; there is nothing to normalise,
             # so the row cannot exceed the threshold.
             predict.append(0)
             continue
         prob_norm = float(prob_1) / total
         if prob_norm > theta:
             predict.append(1)
         else:
             predict.append(0)
     return predict
Пример #5
0
    def initialize(self, data, k=2):
        """Create ``k`` EM component models and seed them from ``data``.

        Component means come from ``mu_cheat``; rows are then labelled with
        ``self.assign_labels`` and ``self.prevent_empty`` is run before the
        covariance, weight and likelihood of every component are set.

        Args:
            data: data as a list of rows.
            k: number of mixture components.

        Side effects:
            Sets ``self.k``, ``self.labels`` and ``self.models``.
        """
        self.k = k
        # Placeholder labels, replaced below by the real assignment.
        self.labels = [ki for ki in range(self.k)]
        models = [EMModel() for _ in range(self.k)]

        mucheat = mu_cheat(hw3.transpose_array(data), k)
        for ki in range(self.k):
            models[ki].mu = mucheat[ki]

        self.labels = self.assign_labels(data, models)

        self.prevent_empty(data)

        for model in models:
            # Covariance is taken over the full data set; it is computed
            # once per component so the models do not share one object.
            model.sigma = hw3.get_covar(data)
            # Equal mixing weights that sum to 1 for any k (0.5 when k == 2).
            model.weight = 1.0 / self.k
            model.likelihood = self.expectation(data, model)  # multivariate normal
        self.models = models
def test_covar_matrix():
    """Manual check: print ``hw3.get_covar`` next to ``np.cov`` output.

    Uses the data from ``get_test_data`` and a fixed label vector; nothing
    is asserted or returned, the results are compared by eye.
    """
    arr = get_test_data()
    # Parenthesised single-argument prints behave the same under the
    # Python 2 print statement and the Python 3 print function.
    print(arr)
    y = [1, 0, 0, 0]
    print(hw3.get_covar(arr, y))
    print(np.cov(hw3.transpose_array(arr), y))
Пример #7
0
 def set_mus(self, data):
     """Store the mean of every column of ``data`` in ``self.mu``.

     ``data`` is given by row; it is transposed with
     ``hw3.transpose_array`` so each column can be averaged with ``np.mean``.
     """
     columns = hw3.transpose_array(data)
     self.mu = [np.mean(column) for column in columns]