Example #1
def knn(x_train, target_train, x_test, k):
    '''
    KNN classifier.
    Inputs:
        x_train (n x m):      set of n samples with m attributes each,
    used to train the model.
        target_train (n x 1): classes of x_train.
        x_test (n2 x m):      set of n2 samples with m attributes each,
    to be classified using the trained model.
        k:                    number of nearest neighbors to use.
    Returns:
        yhat (n2 x 1):        classification of x_test.
    '''

    import numpy as np
    import utils

    cls = []

    n = x_test.shape[0]
    m = x_test.shape[1]

    for i in range(0, n):
        distance = np.sqrt(np.sum(np.power(x_train - x_test[i, :], 2), 1))
        ind = np.argsort(distance)
        cls.append(utils.mode(target_train[ind[0:k]]))

    return np.reshape(np.array(cls), (-1, 1))
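
Most examples on this page rely on a utils.mode helper whose source is not shown. Given how it is called here (scalar mode of an array of labels), a minimal sketch might look like the following; the body is an assumption, not the actual utils module:

import numpy as np
from collections import Counter

def mode(a):
    '''Hypothetical stand-in for utils.mode: the most frequent value in a.

    For non-negative integer labels this matches np.argmax(np.bincount(a)),
    which several of the decision-stump examples below inline directly.
    '''
    a = np.asarray(a).ravel()
    if a.size == 0:
        return -1  # assumption: sentinel for empty input
    if np.issubdtype(a.dtype, np.integer) and a.min() >= 0:
        return int(np.argmax(np.bincount(a)))
    # Generic fallback for floats or strings; ties break by count order
    return Counter(a.tolist()).most_common(1)[0][0]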
Example #2
def PluralityLearner(dataset):
    """A very dumb algorithm: always pick the result that was most popular
    in the training data.  Makes a baseline for comparison."""
    most_popular = mode([e[dataset.target] for e in dataset.examples])

    def predict(example):
        """Always return same result: the most popular from the training set."""
        return most_popular
    return predict
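
A minimal usage sketch, assuming an aima-python style dataset object with an examples list and a target column index (the two attributes the code above touches), and assuming a mode helper like the one sketched under Example #1 is in scope:

class _FakeDataSet:
    # Hypothetical stand-in for the real DataSet class
    def __init__(self, examples, target):
        self.examples = examples
        self.target = target

ds = _FakeDataSet(examples=[[1, 'yes'], [2, 'no'], [3, 'yes']], target=1)
predict = PluralityLearner(ds)
print(predict([4, '?']))  # 'yes', whatever the input example is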
Example #3
def extract_tfdf_sites(genome, tf):
    # Select the binding sites for this genome/TF pair; combining the masks
    # with & avoids reindexing a boolean Series built on the full frame
    sites = tfdf[(tfdf['genome_accession'] == genome) & (tfdf['TF'] == tf)]['site_sequence']
    # convert to list form, removing nans
    bio_motif_all_lens = [site for site in sites if type(site) is str]
    modal_length = mode(map(len, bio_motif_all_lens))
    bio_motif = filter(lambda x: len(x) == modal_length, bio_motif_all_lens)
    if len(bio_motif) != len(bio_motif_all_lens):
        print "removed", len(bio_motif_all_lens) - len(bio_motif), "of", len(bio_motif_all_lens), "sites"
    return bio_motif
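
A hypothetical call, assuming tfdf is a module-level pandas DataFrame with the genome_accession, TF and site_sequence columns the function indexes (the accession and TF name below are made up for illustration):

import pandas as pd

tfdf = pd.DataFrame({
    'genome_accession': ['NC_000913.2'] * 3,
    'TF': ['LexA'] * 3,
    'site_sequence': ['ACGTACGT', 'ACGTTCGT', None],  # None mimics a missing site
})
sites = extract_tfdf_sites('NC_000913.2', 'LexA')  # -> ['ACGTACGT', 'ACGTTCGT']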
Example #4
    def calculate_accuracy(self, y):

        acc = 0
        for i in range(0, self.nc):
            c = utils.mode(y[self.z[i]])
            yhat = mat.repmat(c, len(self.z[i]), 1)
            acc = acc + len(self.z[i]) * utils.accuracy(y[self.z[i]], yhat)
        acc = acc / self.n

        return acc
Example #5
    def calculate_accuracy(self, y):

        acc = 0
        for i in range(0, self.k):
            ind = np.where(self.c == i)[0]
            c = utils.mode(y[ind])
            yhat = mat.repmat(c, ind.size, 1)
            acc = acc + ind.size * utils.accuracy(y[ind], yhat)
        acc = acc / self.n

        return acc
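
Examples #4 and #5 compute the same quantity two ways: a size-weighted average of per-cluster accuracy, where every point in a cluster is predicted as the mode of the cluster's true labels. The utils.accuracy helper is not shown; assuming it is simply the fraction of matching labels, a sketch would be:

import numpy as np

def accuracy(y, yhat):
    # Assumed helper: fraction of positions where prediction equals truth
    return np.mean(np.asarray(y).ravel() == np.asarray(yhat).ravel())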
Example #6
def i3conf_windowing():
    return lines("# Windowing",
                 bindsym("Mod1+o", "split h"),
                 bindsym("Mod1+v", "split v"),
                 "floating_modifier Mod1",
                 bindsym("Mod1+t", "floating toggle"),
                 bindsym("Mod1+f", "fullscreen toggle"),
                 bindsym("Mod1+s", "layout stacking"),
                 bindsym("Mod1+w", "layout tabbed"),
                 bindsym("Mod1+e", "layout toggle split"),
                 bindsym("Mod1+p", "focus parent"),
                 bindsym("Mod1+c", "focus child"),

                 mode("resize",
                      bindsym("h", "resize shrink width  10 px or 10 ppt"),
                      bindsym("j", "resize grow   height 10 px or 10 ppt"),
                      bindsym("k", "resize shrink height 10 px or 10 ppt"),
                      bindsym("l", "resize grow   width  10 px or 10 ppt"),

                      bindsym_mode("Return", "default"),
                      bindsym_mode("Escape", "default"),
                      ""),
                 bindsym_mode("Mod1+r", "resize"),
                 "")
Example #7
                            "4.1", "4.2", "5"
                        ])

    io_args = parser.parse_args()
    question = io_args.question

    if question == "1.1":
        input_file = "../data/fluTrends.csv"
        df = pd.read_csv(input_file, header=0)

        print('Minimum: %.3f' % df.values.min())
        print('Maximum: %.3f' % df.values.max())
        print('Mean: %.3f' % df.values.mean())
        print('Median: %.3f' % np.median(df.values))
        import utils
        print('Mode: %.3f' % utils.mode(df.values))

        print('5th percentile: %.3f' % np.percentile(df.values, 5))
        print('25th percentile: %.3f' % np.percentile(df.values, 25))
        print('50th percentile: %.3f' % np.percentile(df.values, 50))
        print('75th percentile: %.3f' % np.percentile(df.values, 75))
        print('95th percentile: %.3f' % np.percentile(df.values, 95))

        means = df.mean()
        print('Highest mean is in: %s' % means.idxmax())
        print('Lowest mean is in: %s' % means.idxmin())
        variances = df.var()
        print('Highest variance is in: %s' % variances.idxmax())
        print('Lowest variance is in: %s' % variances.idxmin())
Example #8
 if question == "1.1":
     # Load the fluTrends dataset
     df = pd.read_csv(os.path.join('..', 'data', 'fluTrends.csv'))
     X = df.values
     names = df.columns.values
     #print(X)
     #print(names)
     ''' YOUR CODE HERE FOR Q1.1 '''
     #1
     x = np.array(X)
     print("The mean of all dataset is: %.4f" % np.mean(x))
     print("The maximum of all dataset is :%.4f" % np.max(x))
     print("The minimum of all dataset is :%.4f" % np.min(x))
     print("The median of all dataset is :%.4f" % np.median(x))
     print("The mode of all dataset is :%.4f" % utils.mode(x))
     #2
     print("The 5%% quantile across the dataset: %.4f" %
           np.percentile(x, 5))
     print("The 25%% quantile across the dataset: %.4f" %
           np.percentile(x, 25))
     print("The 50%% quantile across the dataset: %.4f" %
           np.percentile(x, 50))
     print("The 75%% quantile across the dataset: %.4f" %
           np.percentile(x, 75))
     print("The 95%% quantile across the dataset: %.4f" %
           np.percentile(x, 95))
     #3
     columnMean = np.mean(x, axis=0)
     columnVar = np.var(x, axis=0)
     print("The regions with the highest mean:%s" %
Example #9
    def fit(self, X, y):
        N, D = X.shape
        max_info_gain = 0

        # Get an array with the number of 0's, number of 1's, etc.
        count = np.bincount(y, minlength=2)

        # Get the index of the largest value in count.
        # Thus, y_mode is the mode (most popular value) of y
        y_mode = np.argmax(count)

        self.splitSat = y_mode
        self.splitNot = None
        self.splitVariable = None
        self.splitValue = None

        # If all the labels are the same, no need to split further
        if np.unique(y).size <= 1:
            return

        # Loop over features looking for the best split
        for d in range(D):
            for n in range(N):
                # Choose value to equate to
                value = X[n, d]

                # Split the labels on the candidate threshold
                y_y = y[X[:, d] < value]
                y_n = y[X[:, d] >= value]
                N_y = np.size(y_y)
                N_n = np.size(y_n)

                # Skip degenerate splits where one side is empty
                # (otherwise the per-side label frequencies divide by zero)
                if N_y == 0 or N_n == 0:
                    continue

                # Find most likely class for each split
                y_sat = utils.mode(y_y)
                y_not = utils.mode(y_n)

                # Label frequencies before the split
                zero, one = count
                p = np.zeros(2)
                p[0] = zero / N
                p[1] = one / N

                zero_y, one_y = np.bincount(y_y, minlength=2)
                zero_n, one_n = np.bincount(y_n, minlength=2)

                p_y = np.zeros(2)
                p_y[0] = zero_y / N_y
                p_y[1] = one_y / N_y

                p_n = np.zeros(2)
                p_n[0] = zero_n / N_n
                p_n[1] = one_n / N_n

                # Compute info gain
                info_gain = entropy(
                    p) - N_y / N * entropy(p_y) - N_n / N * entropy(p_n)

                # Compare to best info gain so far
                if info_gain > max_info_gain:
                    max_info_gain = info_gain
                    self.splitVariable = d
                    self.splitValue = value
                    self.splitSat = y_sat
                    self.splitNot = y_not
Example #10
    def fit(self, X, y):
        """ YOUR CODE HERE """
        #Compute INFO GAIN
        N, D = X.shape

        # Get an array with the number of 0's, number of 1's, etc.
        count = np.bincount(y)

        # Get the index of the largest value in count.  
        # Thus, y_mode is the mode (most popular value) of y
        y_mode = np.argmax(count)

        self.splitSat = y_mode
        self.splitNot = None
        self.splitVariable = None
        self.splitValue = None
        # If all the labels are the same, no need to split further
        if np.unique(y).size <= 1:
            return

        # Loop over features looking for the best split
        X = np.round(X)
        
        max_info_gain = 0
        for d in range(D):
            for n in range(N):
                # Choose value to equate to
                value = X[n, d]

                # Find most likely class for each split
                y_sat = utils.mode(y[X[:,d] > value])
                y_not = utils.mode(y[X[:,d] <= value])

                # Make predictions
                y_pred = y_sat * np.ones(N)
                y_pred[X[:, d] <= value] = y_not

                #process n_yes
                try:
                    n_yes_div_n = np.bincount(y[X[:,d] > value])[1] / N
                except IndexError:
                    return
                #process n_no
                n_no_div_n = 1 - n_yes_div_n

                # Compute info gain
                y2 = count / y.size

                try:
                    y_yes = np.bincount(y[X[:,d] > value]) / y.size
                    y_no = 1 - y_yes
                except IndexError:
                    y_no = np.bincount(y[X[:,d] <= value]) / y.size
                    y_yes = 1 - y_no

                info_gain = entropy(y2) - (n_yes_div_n * entropy(y_yes)) - (n_no_div_n * entropy(y_no))
                
                # Compare to max_info_gain so far
                if info_gain > max_info_gain:
                    max_info_gain = info_gain
                    self.splitVariable = d
                    self.splitValue = value
                    self.splitSat = y_sat
                    self.splitNot = y_not
Example #11
    def fit(self, X, y):
        N, D = X.shape

        # Get an array with the number of 0's, number of 1's, etc.
        count = np.bincount(y)

        # Get the index of the largest value in count.
        # Thus, y_mode is the mode (most popular value) of y
        y_mode = np.argmax(count)

        self.splitSat = y_mode
        self.splitNot = None
        self.splitVariable = None
        self.splitValue = None

        # If all the labels are the same, no need to split further
        if np.unique(y).size <= 1:
            return

        # initialize the minimum entropy value
        Origin_Entropy = entropy(count / N)
        maxGain = 0

        # Loop over features looking for the best split
        # X = np.round(X)

        for d in range(D):
            for n in range(N):
                # Choose value to equate to
                value = X[n, d]

                # Find most likely class for each split
                y_sat = utils.mode(y[X[:, d] <= value])
                y_not = utils.mode(y[X[:, d] > value])

                # Make predictions
                y_pred = y_sat * np.ones(N)
                y_pred[X[:, d] > value] = y_not

                # Compute entropy
                label_a = np.bincount(y[X[:, d] <= value], minlength=2)
                label_b = np.bincount(y[X[:, d] > value], minlength=2)
                a = np.sum(label_a)
                b = np.sum(label_b)

                if a == 0:
                    entro_value_a = label_a
                else:
                    entro_value_a = label_a / a
                if b == 0:
                    entro_value_b = label_b
                else:
                    entro_value_b = label_b / b

                gain = Origin_Entropy - (a / N) * entropy(entro_value_a) - (
                    b / N) * entropy(entro_value_b)

                # Compare to best gain so far
                if gain > maxGain:
                    # This is the highest gain, store this value
                    maxGain = gain
                    self.splitVariable = d
                    self.splitValue = value
                    self.splitSat = y_sat
                    self.splitNot = y_not
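
The decision-stump examples above all score a candidate split by information gain: the label entropy before the split minus the size-weighted entropies of the two sides. The shared entropy helper is not shown; a standard sketch over a probability vector, using natural log and the 0*log(0) = 0 convention (the log base only rescales the gain), would be:

import numpy as np

def entropy(p):
    # Assumed helper: Shannon entropy of a probability vector p,
    # skipping zero entries so that 0 * log(0) contributes 0
    p = np.asarray(p, dtype=float)
    nz = p[p > 0]
    return -np.sum(nz * np.log(nz))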
Example #12
 def joinBools(self, bools, sprKing):
     return utils.convertNoneToIdentity(utils.mode(bools), bools[sprKing])
Example #13
File: main.py Project: lishaowen0426/ml
        percentlist = [0.05, 0.25, 0.5, 0.75, 0.95]
        quantileList = [
            np.percentile(X, 5),
            np.percentile(X, 25),
            np.percentile(X, 50),
            np.percentile(X, 75),
            np.percentile(X, 95)
        ]
        for x, y in zip(percentlist, quantileList):
            print('{} quantile :{:.4}'.format(int(x * 100), y))

        print('max: {:.4}'.format(np.max(X)))
        print('min: {:.4}'.format(np.min(X)))
        print('mean: {:.4}'.format(np.mean(X)))
        print('median: {:.4}'.format(np.median(X)))
        print('mode: {:.4}'.format(utils.mode(X)))

        meanList = []
        varianceList = []
        for index, col in df.items():  # iteritems() was removed in pandas 2.0
            meanList.append(col.values.mean())
            varianceList.append(col.values.var())

        print('highest mean region: %s' %
              df.columns[meanList.index(max(meanList))])
        print('lowest mean region: %s' %
              df.columns[meanList.index(min(meanList))])
        print('highest variance region: %s' %
              df.columns[varianceList.index(max(varianceList))])
        print('lowest variance region: %s' %
              df.columns[varianceList.index(min(varianceList))])
Example #14
 def predict(example):
     """Find the k closest items, and have them vote for the best."""
     best = heapq.nsmallest(k, ((dataset.distance(e, example), e)
                                for e in dataset.examples))
     return mode(e[dataset.target] for (d, e) in best)
Example #15
    def train(self, x, y, IGMIN=0.05, NMIN=0):
        self.x = np.array(x)
        self.y = np.reshape(y, (-1, 1))
        self.n = self.y.size
        self.m = self.x.shape[1]

        # MAXIMIZE VIA INFORMATION GAIN!!!!!!
        # att = reduc_dim.sequential_forward_selection(
        # self.x, self.y, self.x, self.y, utils.accuracy, 1,
        # classif_regres.knn, 3)[0]

        if np.union1d(y, y).size == 1 or self.n < NMIN:
            self.root = utils.mode(self.y)
        else:
            ig = np.zeros(self.m)
            for att in range(0, self.m):
                # NOMINAL ATTRIBUTES ONLY
                ig[att] = self.ig(self.x, self.y, att, 0)
            att = np.argmax(ig)
            values = np.union1d(self.x[:, att], self.x[:, att])

    #         if isinstance(x[0, att], basestring):
    #             thr = utils.mode(self.x[:, att])
    #         else:
    #             # TODO: implement a function that finds the optimal
    #             # separation threshold and place it inside this function.
    #             '''
    #             The attribute values are sorted first;
    #
    #             the midpoint between two consecutive values is a
    #             candidate cut point and is evaluated by the merit
    #             function;
    #
    #             the candidate cut point that maximizes the merit
    #             function is chosen.
    #             '''
    #             thr = np.mean(self.x[:, att])
    #         ig = self.ig(self.x, self.y, att, thr)

            ig = ig[att]
            if ig < IGMIN:
                self.root = utils.mode(y)  # most common value
            else:
                self.root = None
                self.children = {}
                for v in values:
                    self.children[v] = DecisionTree(self.nclasses)

                # true if attribute is discrete (string)
                if isinstance(self.x[0, att], basestring):
                    self.att = att
                    x = self.x
                    y = self.y
                    del self.x
                    del self.y
                    for v in values:
                        ind = np.core.defchararray.equal(x[:, att], v)
                        self.children[v].train(
                            x[ind, :], y[ind, 0], IGMIN, NMIN)
                else:
                    # TODO: continuous variables
                    pass
Example #16
        min_val = float('inf')
        max_val = float('-inf')
        sum_val = 0
        median_val = 0
        list_vals = X.ravel()
        list_vals.sort()

        if len(list_vals) % 2 == 0:
            median_val = (list_vals[(len(list_vals)/2)] + list_vals[(len(list_vals)/2)-1])/2
        else:
            median_val = list_vals[(len(list_vals) - 1) / 2]

        # The mode is taken over the whole matrix, so compute it once
        mode_val = utils.mode(X)

        for feature in range(D):
            for obj in range(N):
                if X[obj][feature] > max_val:
                    max_val = X[obj][feature]
                if X[obj][feature] < min_val:
                    min_val = X[obj][feature]
                sum_val = sum_val + X[obj][feature]

        print "Minimum value is ", min_val
        print "Maximum value is ", max_val
        print "Mean value is ", (sum_val / (N*D))
        print "Median value is ", median_val
        print "Mode value is ", mode_val

        # part 2: quantiles
Example #17
        check_grad(grads.bar, grads.bar_grad)

    elif question == "5.1":
        # Load the fluTrends dataset
        df = pd.read_csv(os.path.join("..", "data", "fluTrends.csv"))
        X = df.values
        names = df.columns.values

        # YOUR CODE HERE
        from scipy import stats as sc

        print(np.min(X))
        print(np.max(X))
        print(np.mean(X))
        print(np.median(X))
        print(utils.mode(X))
        print()
        print(np.percentile(X, 5))
        print(np.percentile(X, 25))
        print(np.percentile(X, 50))
        print(np.percentile(X, 75))
        print(np.percentile(X, 95))
        print()
        print(df.mean().sort_values())
        print()
        print(df.var().sort_values())
        print(df.min())
        print()
        print(df.max())
        print()
        print(df.mean())
Example #18
File: main.py Project: jaysc96/CS340
    io_args = parser.parse_args()
    question = io_args.question

    if question == "1.1":
        # Q1.1 - This should print the answers to Q 1.1

        # Load the fluTrends dataset
        X, names = utils.load_dataset("fluTrends")

        # part 1: min, max, mean, median and mode
        print "Min = %.3f" % np.amin(X)
        print "Max = %.3f" % np.amax(X)
        print "Mean = %.3f" % np.mean(X)
        print "Median = %.3f" % np.median(X)
        print "Mode = %.3f" % utils.mode(X)

        # part 2: quantiles
        print "10th quantile = %.3f" % np.percentile(X, 10)
        print "25th quantile = %.3f" % np.percentile(X, 25)
        print "50th quantile = %.3f" % np.percentile(X, 50)
        print "75th quantile = %.3f" % np.percentile(X, 75)
        print "90th quantile = %.3f" % np.percentile(X, 90)

        # part 3: maxMean, minMean, maxVar, minVar
        means = np.mean(X, axis=0)
        variances = np.var(X, axis=0)  # avoid shadowing the vars() builtin
        print "Highest Mean at %s" % names[np.argmax(means)]
        print "Lowest Mean at %s" % names[np.argmin(means)]
        print "Highest Variance at %s" % names[np.argmax(variances)]
        print "Lowest Variance at %s" % names[np.argmin(variances)]
Example #19
                        ])

    io_args = parser.parse_args()
    question = io_args.question

    if question == "1.1":
        # Load the fluTrends dataset
        df = pd.read_csv(os.path.join('..', 'data', 'fluTrends.csv'))
        X = df.values
        names = df.columns.values

        print("         Min: %s " % np.min(X))
        print("         Max: %s " % np.max(X))
        print("        Mean: %s " % np.mean(X))
        print("      Median: %s " % np.median(X))
        print("        Mode: %s " % utils.mode(X))
        print(" 5%% quantile: %s " % np.percentile(X, 5))
        print("25%% quantile: %s " % np.percentile(X, 25))
        print("50%% quantile: %s " % np.percentile(X, 50))
        print("75%% quantile: %s " % np.percentile(X, 75))
        print("95%% quantile: %s " % np.percentile(X, 95))
        print("Region with max mean: %s " %
              list(df.columns.values)[np.argmax(X.mean(0))])
        print("Region with min mean: %s " %
              list(df.columns.values)[np.argmin(X.mean(0))])
        print("Region with max variance: %s " %
              list(df.columns.values)[np.argmax(np.var(X, 0))])
        print("Region with min variance: %s " %
              list(df.columns.values)[np.argmin(np.var(X, 0))])

    elif question == "2":
Example #20
    def fit(self, X, y):
        """ YOUR CODE HERE """
        N, D = X.shape

        # Get an array with the number of 0's, number of 1's, etc.
        count = np.bincount(y, None, minlength=2)
        # Get the index of the largest value in count.
        # Thus, y_mode is the mode (most popular value) of y
        y_mode = np.argmax(count)

        self.splitSat = y_mode
        self.splitNot = None
        self.splitVariable = None
        self.splitValue = None

        # If all the labels are the same, no need to split further
        if np.unique(y).size <= 1:
            return

        # Entropy of the labels before any split
        iniProb = np.bincount(y) / np.size(y)
        iniEntropy = entropy(iniProb)

        maxInfoGain = 0
        minError = np.sum(y != y_mode)
        # Loop over features looking for the best split
        for d in range(D):
            for n in range(N):
                # Choose value to equate to
                value = X[n, d]
                # Find most likely class for each split
                y_spl0 = y[X[:, d] >= value]
                y_spl1 = y[X[:, d] < value]
                #print(y_spl1)
                if np.size(y_spl0) != 0:
                    spl0Prob = np.bincount(y_spl0, None, 2) / np.size(y_spl0)
                elif np.size(y_spl0) == 0:
                    spl0Prob = np.zeros(2)
                spl0Entropy = entropy(spl0Prob)
                wSpl0Entropy = (np.size(y_spl0) / np.size(y)) * spl0Entropy
                if np.size(y_spl1) != 0:
                    spl1Prob = np.bincount(y_spl1, None, 2) / np.size(y_spl1)
                elif np.size(y_spl1) == 0:
                    spl1Prob = np.zeros(2)
                spl1Entropy = entropy(spl1Prob)
                wSpl1Entropy = (np.size(y_spl1) / np.size(y)) * spl1Entropy
                infoGain = iniEntropy - wSpl0Entropy - wSpl1Entropy

                # Find most likely class for each side, matching the
                # y_spl0 / y_spl1 partition above
                y_sat = utils.mode(y_spl0)
                y_not = utils.mode(y_spl1)

                # Make predictions
                y_pred = y_sat * np.ones(N)
                y_pred[X[:, d] < value] = y_not
                # Compute error
                errors = np.sum(y_pred != y)
                # Compare to minimum error so far
                if infoGain > maxInfoGain:
                    # This is the highest gain so far, store this value
                    minError = errors
                    maxInfoGain = infoGain
                    self.splitVariable = d
                    self.splitValue = value
                    self.splitSat = y_sat
                    self.splitNot = y_not
Example #21
 def predict_reducer(self, minimizer_ix):
     '''returns the modal y among the closest X's'''
     return mode(self.y[minimizer_ix])[0]
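
Unlike utils.mode in the other examples, the mode used here is indexed with [0], which is consistent with scipy.stats.mode returning a (mode, count) pair of arrays rather than a scalar. A small illustration of that convention (assuming scipy >= 1.9 for the keepdims argument):

import numpy as np
from scipy import stats

y = np.array([0, 1, 1, 2, 1])
m = stats.mode(y, keepdims=True)
print(m[0])  # [1]  -> the modal value(s)
print(m[1])  # [3]  -> how often it occurs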
Example #22
        check_grad(grads.example, grads.example_grad)
        # check_grad(grads.foo, grads.foo_grad)
        # check_grad(grads.bar, grads.bar_grad)

    elif question == "5.1":
        # Load the fluTrends dataset
        df = pd.read_csv(os.path.join('..', 'data', 'fluTrends.csv'))
        X = df.values

        names = df.columns.values

        minValues = X.min()
        maxValues = X.max()
        meanValues = X.mean()
        medianValues = np.median(X)
        modeValues = utils.mode(X)

        print(minValues)
        print(maxValues)
        print(meanValues)
        print(medianValues)
        print(modeValues)

        five = np.quantile(X, 0.05)
        twentyFive = np.quantile(X, 0.25)
        fifty = np.quantile(X, 0.5)
        seventyFive = np.quantile(X, 0.75)
        nintyFive = np.quantile(X, 0.95)

        print(five)
        print(twentyFive)
Example #23
 def predict(example):
     return mode(predictor(example) for predictor in predictors)
Example #24
 def predict(example):
     return mode(predictor(example) for predictor in predictors)
Example #25
    def fit(self, X, y):
        N, D = X.shape  # N is the num of examples , D is num of features

        # here X is a 400 * 2 matrix
        # here Y is a 400 * 1 matrix with value of 0 and 1

        # print(N) here N = 400
        # print(D) here D = 2

        # Get an array with the number of 0's, number of 1's, etc.
        count = np.bincount(y)
        # here count = [234 166], num of 0 = 234, num of 1 = 166
        # print(count)

        # Get the index of the largest label value in count.
        # which means y_mode = index(0 or 1) of which label value(0 or 1) has more
        # Thus, y_mode is the mode (most popular value) of y
        y_mode = np.argmax(count)
        # y_mode is most popular label value, here is 0

        # print(y_mode)

        self.splitSat = y_mode  # default prediction: the mode of y
        self.splitNot = None  # prediction when the equality rule is not satisfied
        self.splitVariable = None  # split feature for the equality rule
        self.splitValue = None  # value the split feature is compared against

        # If all the labels are the same, no need to split further
        if np.unique(y).size <= 1:
            return

        minError = np.sum(y != y_mode)
        # here minError = 166
        # print(y)
        # print(y != y_mode)
        # print((minError))

        # Loop over features looking for the best split
        X = np.round(X)
        # round each Xnd of X to int

        # value = X[1, 1]

        # print(value) # 32
        # print(X[:, 1])
        # y_sat = utils.mode(y[X[:, 1] == value])
        # print(y_sat) # 1
        # print(y[X[:, 1] == value]) # [1 1 1 1 1 1 0 1 1 1 1 1]
        # print(y[X[:, 1] != value])
        # print(utils.mode(y[X[:, 1] != value]))
        # Find most likely class for each split
        # First, let's look at the X[:,d] == value. This is a condition,
        # it means that the d column of X which has the same value with "value" should be true.
        # then y[condition] gives you labels for the rows which are true
        # (labels of the points which satisfy the equality rule in decision stump).

        for d in range(D):  # outer loop each feature
            for n in range(N):  # inner loop each example
                # Choose value to equate to
                value = X[n, d]  # Xnd, the current equality value

                # Find most likely class for each split
                # "y[X[:,d] == value" gives the values in y where the corresponding Xnd = the current equality value
                # it returns a part of y
                # y_sat is the most appeared label value in y[X[:,d] == value]
                # y_not is the most appeared label value in y[X[:,d] != value]
                y_sat = utils.mode(y[X[:, d] == value])
                y_not = utils.mode(y[X[:, d] != value])

                # Make predictions
                y_pred = y_sat * np.ones(N)  # = [1 1 1 ...] or [0 0 0 ...]
                y_pred[X[:, d] != value] = y_not  # rows failing the equality get y_not

                # Compute error
                errors = np.sum(y_pred != y)
                # y_pred != y gives an array that is True wherever the prediction is wrong

                # Compare to minimum error so far
                if errors < minError:
                    # This is the lowest error, store this value
                    minError = errors
                    self.splitVariable = d  # change the split feature to the new feature
                    self.splitValue = value  # change the equality value to the new value
                    self.splitSat = y_sat  # predicted y if Xnd == splitValue
                    self.splitNot = y_not  # predicted y if Xnd != splitValue
Example #26
    def fit(self, X, y):

        N, D = X.shape

        # Get an array with the number of 0's, number of 1's, etc.
        count = np.bincount(y)

        # Get the index of the largest value in count.
        # Thus, y_mode is the mode (most popular value) of y
        y_mode = np.argmax(count)

        self.splitSat = y_mode
        self.splitNot = None
        self.splitVariable = None
        self.splitValue = None

        # Set information gain to 0 for baseline
        maxInfo = 0

        # If all the labels are the same, no need to split further
        if np.unique(y).size <= 1:
            return

        # Loop over features looking for the best split by infoGain
        for d in range(D):
            for n in range(N):
                # Choose threshold for constraint X[:, d] > value
                value = X[n, d]

                # Compute the labels satisfying and not satisfying the constraint
                y_yes = y[X[:, d] > value]
                y_no = y[X[:, d] <= value]

                # Find most likely class for each split
                y_sat = utils.mode(y_yes)
                y_not = utils.mode(y_no)

                # Compute information gain
                n_yes = y_yes.size
                n_no = y_no.size

                classes = np.bincount(y).size
                dist = np.bincount(y, minlength=classes)
                dist_yes = np.bincount(y_yes, minlength=classes)
                dist_no = np.bincount(y_no, minlength=classes)

                # Note that entropy is 0 if there is no data
                a = b = c = 0
                if np.sum(dist) != 0:
                    a = entropy(dist / np.sum(dist))
                if np.sum(dist_yes) != 0:
                    b = entropy(dist_yes / np.sum(dist_yes))
                if np.sum(dist_no) != 0:
                    c = entropy(dist_no / np.sum(dist_no))
                infoGain = a - n_yes * b / N - n_no * c / N

                # Compare to best gain so far
                if infoGain > maxInfo:
                    # This is the highest gain, store this value
                    maxInfo = infoGain
                    self.splitVariable = d
                    self.splitValue = value
                    self.splitSat = y_sat
                    self.splitNot = y_not
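
The fit methods above only store the stump parameters (splitVariable, splitValue, splitSat, splitNot); the matching predict is never shown. A sketch consistent with the threshold rule of Example #26, where X[:, d] > value predicts splitSat, might look like this (an assumption, not the original project's code):

import numpy as np

def predict(self, X):
    # Assumed companion to the fit above: apply the stored threshold rule
    M = X.shape[0]
    if self.splitVariable is None:
        # No useful split was found; predict the overall mode everywhere
        return self.splitSat * np.ones(M)
    yhat = np.zeros(M)
    sat = X[:, self.splitVariable] > self.splitValue
    yhat[sat] = self.splitSat
    yhat[~sat] = self.splitNot
    return yhat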
Example #27
                            "1.1", "2", "2.2", "2.3", "2.4", "3", "3.1", "3.2",
                            "4.1", "4.2", "5"
                        ])

    io_args = parser.parse_args()
    question = io_args.question

    if question == "1.1":

        # retrieve max, min, median, mode
        ds = pd.read_csv('../data/fluTrends.csv')

        maximum = ds.values.max()
        minimum = ds.values.min()
        median = ds.stack().median()
        mode = utils.mode(ds.values)

        results = [maximum, minimum, median, mode]

        # retrieve quantiles

        print("quantiles: %s" %
              ds.stack().quantile([0.05, 0.25, 0.5, 0.75, 0.95]))

        # retrieve maximum mean, minimum mean, highest variance, lowest variance

        means = ds.mean()
        variances = ds.var()

        maxMean = means.idxmax(axis=0)
        minMean = means.idxmin(axis=0)
Example #28
 def predict(example):
     print([predictor(example) for predictor in predictors])
     return mode(predictor(example) for predictor in predictors)
Example #29
    elif question == "5.1":
        # Load the fluTrends dataset
        df = pd.read_csv(os.path.join('..', 'data', 'fluTrends.csv'))
        X = df.values
        names = df.columns.values

        # -----------------
        # 5.1.1
        # -----------------

        minimum = np.round(np.amin(X), decimals=4)
        maximum = np.round(np.amax(X), decimals=4)
        mean = np.round(np.mean(X), decimals=4)
        median = np.round(np.median(X), decimals=4)
        mode = np.round(utils.mode(X), decimals=4)

        print("\n-----------------------")
        print("Question 5.1.1\n")
        print("min    = " + str(minimum))
        print("max    = " + str(maximum))
        print("mean   = " + str(mean))
        print("median = " + str(median))
        print("mode   = " + str(mode))

        # -----------------
        # 5.1.2
        # -----------------

        quantiles = np.round(np.quantile(X, [0.05, 0.25, 0.5, 0.75, 0.95]),
                             decimals=6)  # np.quantile requires numpy >= 1.15
Example #30
        check_grad(grads.bar, grads.bar_grad)

    elif question == "5.1":
        # Load the fluTrends dataset
        df = pd.read_csv(os.path.join('..', 'data', 'fluTrends.csv'))
        X = df.values
        names = df.columns.values

        # YOUR CODE HERE
        # 1. The minimum, maximum, mean, median,
        # and mode of all values across the dataset.
        print("The minimum is: %s" % np.ndarray.min(X))
        print("The maximum is: %s" % np.ndarray.max(X))
        print("The mean is: %s" % np.ndarray.mean(X))
        print("The median is: %s" % np.median(X))
        print("The mode is %s" % utils.mode(X))

        # 2. The 5%, 25%, 50%, 75%, and 95% quantiles
        # of all values across the dataset.
        print("The 5%% quantile is %s" % np.percentile(X, 5))
        print("The 25%% quantile is %s" % np.percentile(X, 25))
        print("The 50%% quantile is %s" % np.percentile(X, 50))
        print("The 75%% quantile is %s" % np.percentile(X, 75))
        print("The 95%% quantile is %s" % np.percentile(X, 95))

        # 3. The names of the regions with the highest and lowest means,
        # and the highest and lowest variances.
        regionMean = np.mean(X, axis=0)
        regionVar = np.var(X, axis=0)
        print("The region {} has the highest mean of {} ".format(
            names[np.argmax(regionMean)], np.max(regionMean)))
Example #31
 def predict(example):
     print([predictor(example) for predictor in predictors])
     return mode(predictor(example) for predictor in predictors)
Example #32
File: main.py Project: kthnd/ml-basics
                        choices=[
                            "1.1", "2", "2.2", "2.3", "2.4", "3", "3.1", "3.2",
                            "4.1", "4.2", "5"
                        ])

    io_args = parser.parse_args()
    question = io_args.question

    if question == "1.1":

        dataset = utils.load_dataset("fluTrends")
        print("Minimum of dataset", ":", np.min(dataset[0]))
        print("Maximum of dataset", ":", np.max(dataset[0]))
        print("Mean of dataset", "   :", np.mean(dataset[0]))
        print("Median of dataset", " :", np.median(dataset[0]))
        print("Mode of dataset", "   :", utils.mode(dataset[0]))

    elif question == "2":

        # 1. Load citiesSmall dataset
        dataset = utils.load_dataset("citiesSmall")
        X = dataset["X"]
        y = dataset["y"]

        # 2. Evaluate majority predictor model
        y_pred = np.zeros(y.size) + utils.mode(y)

        error = np.mean(y_pred != y)
        print("Mode predictor error: %.3f" % error)

        # 3. Evaluate decision stump
Example #33
    def findOddScoutForListOfDicts(self, tempTIMDs, key1):
        #Similar to findOddScoutForDict, but for lists of dicts instead of individual dicts
        #The nth dict on each list should be the same
        weight = self.gradingListsOfDicts[key1][0]
        allScouts = filter(lambda v: v,
                           map(lambda k: k.get('scoutName'), tempTIMDs))
        # Unsorted meaning they can have different lengths
        unsortedLists = filter(
            lambda k: k,
            map(lambda t: t.get(key1)
                if t.get('scoutName') else None, tempTIMDs))
        #Finds the mode for length of dicts and ignores if not that length
        #i.e. if there is disagreement over how many shots a robot took
        if unsortedLists:
            modeListLength = utils.mode([len(lis) for lis in unsortedLists])  # finds mode, not max
            modeAmount = [len(lis)
                          for lis in unsortedLists].count(modeListLength)
            #If someone missed an attempt or had an extra attempt, there is no way to compare their data
            #This filters out anything with a different length of dicts
            # 2018 - each dict is an attempt
            lists = []
            aScouts = []
            for aScoutIndex in range(len(unsortedLists)):
                if len(unsortedLists[aScoutIndex]) == modeListLength:
                    lists.append(unsortedLists[aScoutIndex])
                    aScouts.append(allScouts[aScoutIndex])
                elif modeAmount > 1:  # Updates SPR if incorrect list amount and at least 2 scouts agree
                    self.sprs.update({
                        allScouts[aScoutIndex]:
                        (self.sprs.get(allScouts[aScoutIndex]) or 0) + weight
                    })
                    self.disagreementBreakdown[allScouts[aScoutIndex]].update({
                        key1: {
                            'amount':
                            (self.disagreementBreakdown[allScouts[aScoutIndex]]
                             .get(key1, {}).get('amount', 0) + 1)
                        }
                    })
            # Need at least 2 scouts to compare, or SPR is not affected
            if modeAmount > 1:
                # check here with if statement before runing code below
                for num in range(modeListLength):
                    #Comparing dicts that should be the same (e.g. each shot time dict for the same shot) within the tempTIMDs
                    #This means the nth shot by a given robot in a given match, as recorded by multiple scouts
                    #The comparison itself is the same as the other findOddScout functions
                    dicts = [lis[num] for lis in lists]
                    scouts = list(aScouts)  # copy so the pops below don't mutate aScouts

                    values = []
                    for aDict in dicts:
                        values += [aDict['didSucceed']]
                    modeSuccess = utils.mode(values)
                    if modeSuccess != None:
                        popList = []
                        weight = self.gradingListsOfDicts[key1][1][
                            'didSucceed']
                        for aDictIndex in range(len(dicts)):
                            if dicts[aDictIndex]['didSucceed'] != modeSuccess:
                                popList.append(aDictIndex)
                        for item in popList[::-1]:
                            #self.SPRBreakdown.update({key2: (self.SPRBreakdown.get(key2) or []) + [(differenceFromMode[c])]})
                            self.sprs.update({
                                scouts[item]:
                                (self.sprs.get(scouts[item]) or 0) + weight
                            })
                            self.disagreementBreakdown[scouts[item]].update({
                                key1: {
                                    'didSucceed':
                                    (self.disagreementBreakdown[scouts[item]].
                                     get(key1, {}).get('didSucceed', 0) + 1)
                                }
                            })
                            dicts.pop(item)
                            scouts.pop(item)
                        for key2 in dicts[0].keys():
                            #Strings can be averaged (we're just looking at mode, not subtracting them)
                            #Without averaging, one person could be declared correct for no reason
                            values = [aDict[key2] for aDict in dicts]
                            weight = self.gradingListsOfDicts[key1][1][key2]
                            mode = utils.mode(values)
                            if mode != None:
                                differenceFromMode = [
                                    weight if v != mode else 0 for v in values
                                ]
                                #Gets inaccuracy by category
                                for c in range(len(differenceFromMode)):
                                    self.SPRBreakdown.update({
                                        key2:
                                        (self.SPRBreakdown.get(key2) or []) +
                                        [(differenceFromMode[c])]
                                    })
                                    if weight != 0.0:
                                        self.sprs.update({
                                            scouts[c]:
                                            (self.sprs.get(scouts[c]) or 0) +
                                            differenceFromMode[c]
                                        })
                                        self.disagreementBreakdown[
                                            scouts[c]].update({
                                                key1: {
                                                    key2:
                                                    (self.
                                                     disagreementBreakdown[
                                                         scouts[c]].get(
                                                             key1, {}).get(
                                                                 key2, 0) + 1)
                                                }
                                            })
Example #34
        print(df.max())
        print(df.min())
        print(df.mean())
        print(df.median())
        print(df.mode())

    elif question == "6":
        # 1. Load citiesSmall dataset
        with open(os.path.join('..','data','citiesSmall.pkl'), 'rb') as f:
            dataset = pickle.load(f)

        X = dataset["X"]
        y = dataset["y"]

        # 2. Evaluate majority predictor model
        y_pred = np.zeros(y.size) + utils.mode(y)

        error = np.mean(y_pred != y)
        print("Mode predictor error: %.3f" % error)

        # 3. Evaluate decision stump
        model = DecisionStumpEquality()
        model.fit(X, y)
        y_pred = model.predict(X)

        error = np.mean(y_pred != y)
        print("Decision Stump with equality rule error: %.3f"
              % error)

        # Plot result
        utils.plotClassifier(model, X, y)
Example #35
    def findOddScoutForListOfDictsDicts(self, tempTIMDs, key1):
        # Similar to findOddScoutForListOfDicts, but for a (dict in dict) in a list
        #The nth dict on each list should be the same
        weight = self.gradingListsOfDictsDicts[key1][0]
        allScouts = filter(lambda v: v,
                           map(lambda k: k.get('scoutName'), tempTIMDs))
        # Unsorted meaning they can have different lengths
        unsortedLists = [
            tempTIMDs[tempTIMD].get(key1, [])
            for tempTIMD in range(len(tempTIMDs))
            if tempTIMDs[tempTIMD].get('scoutName')
        ]
        #Finds the mode for length of dicts and ignores if not that length
        #i.e. if there is disagreement over how many shots a robot took
        if unsortedLists:
            lenList = [len(lis) for lis in unsortedLists]
            modeListLength = utils.mode(lenList)
            modeAmount = lenList.count(modeListLength)
            #If someone missed an attempt or had an extra attempt, there is no way to compare their data
            #This filters out anything with a different length of dicts
            # 2018 - each dict is an attempt
            # This is year specific code for 2018!
            lists = []
            scouts = []
            for aScoutIndex in range(len(unsortedLists)):
                if len(unsortedLists[aScoutIndex]) == modeListLength:
                    lists.append(unsortedLists[aScoutIndex])
                    scouts.append(allScouts[aScoutIndex])
                elif modeAmount > 1:  # Updates SPR if incorrect list amount and at least 2 scouts agree
                    self.sprs.update({
                        allScouts[aScoutIndex]:
                        (self.sprs.get(allScouts[aScoutIndex]) or 0) + weight
                    })
                    self.disagreementBreakdown[allScouts[aScoutIndex]].update({
                        key1: {
                            'amount':
                            (self.disagreementBreakdown[allScouts[aScoutIndex]]
                             .get(key1, {}).get('amount', 0) + 1)
                        }
                    })
            # Need at least 2 scouts to compare, or SPR is not affected
            if modeAmount > 1:
                for num in range(modeListLength):
                    #Comparing dicts that should be the same (e.g. each shot time dict for the same shot) within the tempTIMDs
                    #This means the nth shot by a given robot in a given match, as recorded by multiple scouts
                    #The comparison itself is the same as the other findOddScout functions
                    dicts = [lis[num] for lis in lists]
                    keys = []
                    for x in dicts:
                        keys.append(x.keys()[0])
                    modeKey = max(set(keys), key=keys.count)
                    modeKeyAmount = keys.count(modeKey)

                    dicts2 = []
                    scouts2 = []
                    weight = self.gradingListsOfDictsDicts[key1][1]
                    for index in range(len(dicts)):
                        if dicts[index].keys()[0] == modeKey:
                            dicts2.append(dicts[index])
                            scouts2.append(scouts[index])
                        else:
                            self.sprs.update({
                                scouts[index]:
                                (self.sprs.get(scouts[index]) or 0) + weight
                            })
                            self.disagreementBreakdown[scouts[index]].update({
                                key1: {
                                    'climbType':
                                    (self.disagreementBreakdown[scouts[index]].
                                     get(key1, {}).get('climbType', 0) + 1)
                                }
                            })
                    # Must have 2 scouts to compare, or SPR is not affected
                    if modeKeyAmount > 1:
                        for key2 in dicts2[0].keys():
                            for key3 in dicts2[0][key2].keys():
                                #Strings can be averaged (we're just looking at the mode, not subtracting them)
                                #Without averaging, one person could be declared correct for no reason
                                values = []
                                for aDict in dicts2:
                                    values += [aDict[key2][key3]]

                                weight = self.gradingListsOfDictsDicts[key1][
                                    2][key2][key3]
                                if weight != 0.0:
                                    mode = utils.mode(values)
                                    if mode:
                                        differenceFromMode = map(
                                            lambda v: weight
                                            if v != mode else 0, values)
                                        #Gets inaccuracy by category
                                        for c in range(
                                                len(differenceFromMode)):
                                            self.SPRBreakdown.update({
                                                key2:
                                                (self.SPRBreakdown.get(key2)
                                                 or []) +
                                                [(differenceFromMode[c])]
                                            })
                                            self.sprs.update({
                                                scouts2[c]:
                                                (self.sprs.get(scouts2[c])
                                                 or 0) + differenceFromMode[c]
                                            })
                                            self.disagreementBreakdown[
                                                scouts2[c]].update({
                                                    key1: {
                                                        key2: {
                                                            key3:
                                                            (self.
                                                             disagreementBreakdown[
                                                                 scouts2[c]].
                                                             get(key1, {}).get(
                                                                 key2, {}).get(
                                                                     key3, 0) +
                                                             1)
                                                        }
                                                    }
                                                })
Example #36
 def predict(example):
     """Find the k closest items, and have them vote for the best."""
     best = heapq.nsmallest(k, ((dataset.distance(e, example), e)
                                for e in dataset.examples))
     return mode(e[dataset.target] for (d, e) in best)
Example #37
    parser = argparse.ArgumentParser()
    parser.add_argument('-q','--question', required=True,
        choices=["1.1", "2", "2.2", "2.3", "2.4", "3", "3.1", "3.2", "4.1", "4.2", "5"])

    io_args = parser.parse_args()
    question = io_args.question

    if question == "1.1":
        # Q1.1 - This should print the answers to Q 1.1

        # Load the fluTrends dataset
        X, names = utils.load_dataset("fluTrends")

        # part 1: min, max, mean, median and mode
        results = ("Min: %.3f, Max: %.3f, Mean: %.3f, Median: %.3f, Mode: %.3f" %
                   (np.min(X), np.max(X), np.mean(X), np.median(X), utils.mode(X)))
        print(results)

        # part 2: quantiles
        print("quantiles: %s" % np.percentile(X, [10, 25, 50, 75, 90]))

        # part 3: maxMean, minMean, maxVar, minVar
        means = np.mean(X, axis=0)
        variances = np.var(X, axis=0)

        results = ("maxMean: %s, minMean: %s, maxVar: %s, minVar: %s" %
                   (names[np.argmax(means)], names[np.argmin(means)],
                    names[np.argmax(variances)], names[np.argmin(variances)]))

        # part 4: correlation between columns
        corr = np.corrcoef(X.T)