Example #1
def main():
    # preprocs = ["wav","normalized","bandpass","highpass"]
    # coefficients = ["mfccs","chroma","mel","contrast","all"]
    # subsegmentLengths = ["0.2", "0.05", "0.01"]
    # chunkLengths = ["1","2","3"]

    preprocs = ["normalized"]
    coefficients = ["mel"]
    subsegmentLengths = ["0.2"]
    chunkLengths = ["2"]

    numNodes = 20

    for preproc in preprocs:
        for coefficientType in coefficients:
            for subsegmentLength in subsegmentLengths:
                for chunkLength in chunkLengths:
                    argv = []
                    argv.append("")
                    argv.append(preproc)
                    argv.append(coefficientType)
                    argv.append(subsegmentLength)
                    argv.append(chunkLength)

                    X, Y = fetchDataMulti.getData(argv)

                    start = time.time()

                    knn.knn(X, Y)
                    svm.svm(X, Y)

                    print("Elapsed: %.2f seconds" % (time.time() - start))
                    return  # only the first parameter combination is evaluated
Example #2
def hack(img_name):
    '''
    HACK Recognize a CAPTCHA image
      Inputs:
          img_name: filename of image
      Outputs:
          digits: 1x5 matrix, 5 digits in the input CAPTCHA image.
    '''
    data = np.load('hack_data.npz')
    x = extract_image.extract_image(img_name)

    # YOUR CODE HERE (you can delete the following code as you wish)
    x_train = data['x_train']
    y_train = data['y_train']
    number = len(x_train)

    # begin answer
    x_train, x_valid = x_train[ : number // 2], x_train[number // 2:]
    y_train, y_valid = y_train[ : number // 2], y_train[number // 2:]
    
    best_acc, best_k = 0.0, 1
    for k in range(1, 101):
        y = knn.knn(x_valid, x_train, y_train, k)
        acc = np.sum(y == y_valid) / len(y)
        print ("K =", k, " ACC =", acc)
        if acc > best_acc:
            best_acc = acc
            best_k = k
    print ("Choose", best_k, "as K.")
    digits = knn.knn(x, x_train, y_train, best_k)
    # end answer
    return digits
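
Many of these snippets call a project-local knn.knn(x, x_train, y_train, k) that returns one predicted label per query row. A minimal NumPy sketch of a compatible implementation, assuming Euclidean distance and majority voting (the actual project code may differ):

import numpy as np

def knn(x, x_train, y_train, k):
    # Predict one label per row of x by majority vote among the
    # k training rows closest in Euclidean distance.
    x = np.atleast_2d(x)
    preds = np.empty(len(x), dtype=y_train.dtype)
    for i, q in enumerate(x):
        dists = np.linalg.norm(x_train - q, axis=1)
        nearest = y_train[np.argsort(dists)[:k]]
        values, counts = np.unique(nearest, return_counts=True)
        preds[i] = values[np.argmax(counts)]
    return preds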
Example #3
File: lvq2.py Project: mbs8/lvq
def lvq2(dataset, prototypesPerClass, learningRate, k, w):
	classes = []
	prototypes = []
	minArg = []
	maxArg = []
	trainSet = []

	prototypes, classes, minArg, maxArg, trainSet = lvq1(dataset, prototypesPerClass, learningRate, k)

	actualIndex = 0
	totalIndex  = int((len(trainSet) * (1 - learningRate)) / 2)
	while actualIndex < totalIndex:
		i = 0
		while i + 1 < len(trainSet):  # samples are consumed in pairs (i, i+1)
			knnClassification, neighbors = knn(classes, prototypes, minArg, maxArg, trainSet[i], k)
			knnClassification1, neighbors1 = knn(classes, prototypes, minArg, maxArg, trainSet[i+1], k)
			j = 0
			while j < len(neighbors):
				if window(neighbors[j][1], trainSet[i], trainSet[i+1], w, minArg, maxArg):
					if knnClassification != knnClassification1:
						if knnClassification == neighbors[j][1].classification:
							neighbors[j][1].adjustParam(trainSet[i], False, actualIndex, totalIndex)
							neighbors1[j][1].adjustParam(trainSet[i], True, actualIndex, totalIndex)
						else:
							neighbors[j][1].adjustParam(trainSet[i], True, actualIndex, totalIndex)
							neighbors1[j][1].adjustParam(trainSet[i], False, actualIndex, totalIndex)
				j += 1
			i += 2
		actualIndex += 1

	return prototypes, classes, minArg, maxArg
Example #4
def performance(df):
    execution_times = pd.DataFrame(
        index=[10, 20, 30, 40, 50, 60, 70, 80, 90, 100],
        columns=[
            'method1_content', 'method1_content+books', 'method2',
            'method3_content', 'method3_content+books', 'method4_content_k1',
            'method4_content+books_k1', 'method4_content_k2',
            'method4_content+books_k2', 'method4_content_k3',
            'method4_content+books_k3', 'method4_content_k4',
            'method4_content+books_k4', 'method4_content_k5',
            'method4_content+books_k5'
        ])
    for i in xrange(10, 101, 10):
        #Method 1
        sample_df = df.sample(frac=i / 100.0)
        start = time.time()
        multi_label_classification(sample_df,
                                   False,
                                   test_size=1.0 / len(sample_df))
        end = time.time()
        execution_times["method1_content"][i] = end - start
        execution_times.to_csv("performance_test.csv")
        start = time.time()
        multi_label_classification(sample_df,
                                   True,
                                   test_size=1.0 / len(sample_df))
        end = time.time()
        execution_times["method1_content+books"][i] = end - start
        execution_times.to_csv("performance_test.csv")
        #Method 2
        start = time.time()
        word_frequencies(sample_df, one_run=True)
        end = time.time()
        execution_times["method2"][i] = end - start
        execution_times.to_csv("performance_test.csv")
        #Method 3
        start = time.time()
        nearest_centroid(sample_df, False, test_size=1.0 / len(sample_df))
        end = time.time()
        execution_times["method3_content"][i] = end - start
        execution_times.to_csv("performance_test.csv")
        start = time.time()
        nearest_centroid(sample_df, True, test_size=1.0 / len(sample_df))
        end = time.time()
        execution_times["method3_content+books"][i] = end - start
        execution_times.to_csv("performance_test.csv")
        #Method 4
        for k in range(5):
            start = time.time()
            knn(sample_df, k + 1, False, one_run=True)
            end = time.time()
            execution_times["method4_content_k" + str(k + 1)][i] = end - start
            execution_times.to_csv("performance_test.csv")
            start = time.time()
            knn(sample_df, k + 1, True, one_run=True)
            end = time.time()
            execution_times["method4_content+books_k" +
                            str(k + 1)][i] = end - start
            execution_times.to_csv("performance_test.csv")
Example #5
def max_rule(data_set, view1, view2, dists, classes, labels, ks):
    L = 3
    num_classes = len(classes)
    number_rows = data_set.shape[0] # same for all views
    num_variables1 = data_set.shape[1]
    num_variables2 = view1.shape[1]
    num_variables3 = view2.shape[1]
    rates = numpy.zeros(30)

    for i in range(0, 30):
        kf = StratifiedKFold(n_splits=10, shuffle=True)
        folds = kf.split(data_set, labels)
        rate = 0.0
        for train, test in folds:

            class_probs1, means1, inv_cov_matrices1 = train_bayesian_classifier(
                data_set, classes, train)

            class_probs2, means2, inv_cov_matrices2 = train_bayesian_classifier(
                view1, classes, train)

            class_probs3, means3, inv_cov_matrices3 = train_bayesian_classifier(
                view2, classes, train)

            for x in test:
                x1 = data_set.iloc[x]
                x2 = view1.iloc[x]
                x3 = view2.iloc[x]

                probs1 = bayes_probability(num_variables1, x1, class_probs1,
                                           means1, inv_cov_matrices1)

                probs2 = bayes_probability(num_variables2, x2, class_probs2,
                                           means2, inv_cov_matrices2)

                probs3 = bayes_probability(num_variables3, x3, class_probs3,
                                           means3, inv_cov_matrices3)

                # one k-NN classifier per view
                pred_class1, knn_probs_1 = knn.knn(data_set, dists[0], train,
                                                   classes, x, ks[0])
                pred_class2, knn_probs_2 = knn.knn(view1, dists[1], train,
                                                   classes, x, ks[1])
                pred_class3, knn_probs_3 = knn.knn(view2, dists[2], train,
                                                   classes, x, ks[2])

                class_votes = numpy.zeros(num_classes)
                for j in range(0, num_classes):
                    class_votes[j] = (1 - L) * class_probs1[j] + L * max(
                        probs1[j], probs2[j], probs3[j], knn_probs_1[j],
                        knn_probs_2[j], knn_probs_3[j])
                predicted_class = numpy.argmax(class_votes)
                if classes[predicted_class] == x1.name:
                    rate += 1.0
        rate /= number_rows
        rates[i] = rate

    mean_confidence_interval(rates)
    proportion_confidence_interval(rates)
    return rates
Example #6
def runCode():
    pd.set_option('display.max_columns',50)
    pd.set_option('display.expand_frame_repr', False)
    dataset=pd.read_csv('ted_main.csv') 
    
    #formatting date
    dataset['film_date'] = dataset['film_date'].apply(lambda x: datetime.datetime.fromtimestamp( int(x)).strftime('%d-%m-%Y'))
    dataset['published_date'] = dataset['published_date'].apply(lambda x: datetime.datetime.fromtimestamp( int(x)).strftime('%d-%m-%Y'))
    dataset["published_year"] = dataset["published_date"].apply(lambda x: x.split("-")[2])
    
    dataset = dataset.sort_values('views', ascending=False)
    
    pyp.pubYearPlot(dataset)
    
    #printing presenter's occupation and their counts.
    print(dataset["speaker_occupation"].value_counts().head(10),"\n")
    
    
    #printing the average number of views for the top 5 occupations.
    print("Occupation:  Views")
    print("Writer: ",int(dataset[dataset["speaker_occupation"]=="Writer"]["views"].sum() / len(dataset[dataset["speaker_occupation"]=="Writer"])))
    print("Designer: ", int(dataset[dataset["speaker_occupation"]=="Designer"]["views"].sum() / len(dataset[dataset["speaker_occupation"]=="Designer"])))
    print("Artist: ",int(dataset[dataset["speaker_occupation"]=="Artist"]["views"].sum() / len(dataset[dataset["speaker_occupation"]=="Artist"])))
    print("Journalist: ",int(dataset[dataset["speaker_occupation"]=="Journalist"]["views"].sum() / len(dataset[dataset["speaker_occupation"]=="Journalist"])))
    print("Entrepreneur: ",int(dataset[dataset["speaker_occupation"]=="Entrepreneur"]["views"].sum() / len(dataset[dataset["speaker_occupation"]=="Entrepreneur"])))
    
        
    #plotting views for each tag.
    ttp.tagsTalksPlot(dataset)
    
    #tags count yearly (CAN USE GUI HERE)
    print("\nMost popular Tags for year 2015")
    print("===="*7)
    year="2015"
    ttp.tagsCountYearly(dataset,year,listTags)
    print("\nMost popular Tags for year 2016")
    print("===="*7)
    year="2016"
    ttp.tagsCountYearly(dataset,year,listTags)
    print("\nMost popular Tags for year 2017")
    print("===="*7)
    year="2017"
    ttp.tagsCountYearly(dataset,year,listTags)
    
#    #Ratings vs count plot
#    counter = {'Funny':0, 'Beautiful':0, 'Ingenious':0, 'Courageous':0, 'Longwinded':0, 'Confusing':0, 'Informative':0, 'Fascinating':0, 'Unconvincing':0, 'Persuasive':0, 'Jaw-dropping':0, 'OK':0, 'Obnoxious':0, 'Inspiring':0}
#    neg_descriptors = {"Confusing", "Unconvincing", "Longwinded", "Obnoxious", "OK"}
#    rp.ratingsPlot(dataset,counter,neg_descriptors)
    
    k.knn(dataset,listTags,durationEntry,languageEntry)
    twty.searchTweets()
Example #7
def onclick(event):
    # Creating a new point and finding the k nearest neighbours
    new = sample.Sample('', [event.xdata, event.ydata], '')
    knn.knn(new, data, K)

    data.append(new)
    pylab.scatter([new.getFeatures()[0]],
                  [new.getFeatures()[1]],
                  label=new.getLabel(),
                  marker=MARKERS[LABELS.index(new.getLabel())],
                  color=COLORS[LABELS.index(new.getLabel())])
    pylab.draw()
Example #8
 def knnmodel(self):
     _translate = QtCore.QCoreApplication.translate
     knn.knn(self.path)
     self.modelclass = 2
     self.label.setText(
         _translate(
             "MainWindow",
             "<html><head/><body><p><span style=\" font-size:10pt; font-weight:600;\">"
             + "  KNN模型训练完毕!" + "</span></p></body></html>"))  # i.e. "KNN model training finished!"
     self.model.setText(
         _translate(
             "MainWindow",
             "<html><head/><body><p><span style=\" font-size:10pt; font-weight:600;\">"
             + "KNN模型 " + "</span></p></body></html>"))  # i.e. "KNN model"
Example #9
def runCode():
    pd.set_option('display.max_columns',50)
    pd.set_option('display.expand_frame_repr', False)
    dataset=pd.read_csv('ted_main.csv') 
    
    #formatting date
    dataset['film_date'] = dataset['film_date'].apply(lambda x: datetime.datetime.fromtimestamp( int(x)).strftime('%d-%m-%Y'))
    dataset['published_date'] = dataset['published_date'].apply(lambda x: datetime.datetime.fromtimestamp( int(x)).strftime('%d-%m-%Y'))
    dataset["published_year"] = dataset["published_date"].apply(lambda x: x.split("-")[2])
    
    dataset = dataset.sort_values('views', ascending=False)
    
    #Call to create graph Number of Talks Vs Published_Year 
    pyp.pubYearPlot(dataset)
    
    #printing presenter's occupation and their counts.
    print(dataset["speaker_occupation"].value_counts().head(10),"\n")
    
    
    #printing the average number of views for the top 5 occupations.
    print("Occupation:  Views")
    print("Writer: ",int(dataset[dataset["speaker_occupation"]=="Writer"]["views"].sum() / len(dataset[dataset["speaker_occupation"]=="Writer"])))
    print("Designer: ", int(dataset[dataset["speaker_occupation"]=="Designer"]["views"].sum() / len(dataset[dataset["speaker_occupation"]=="Designer"])))
    print("Artist: ",int(dataset[dataset["speaker_occupation"]=="Artist"]["views"].sum() / len(dataset[dataset["speaker_occupation"]=="Artist"])))
    print("Journalist: ",int(dataset[dataset["speaker_occupation"]=="Journalist"]["views"].sum() / len(dataset[dataset["speaker_occupation"]=="Journalist"])))
    print("Entrepreneur: ",int(dataset[dataset["speaker_occupation"]=="Entrepreneur"]["views"].sum() / len(dataset[dataset["speaker_occupation"]=="Entrepreneur"])))
    
        
    #plotting views for each tag.
    ttp.tagsTalksPlot(dataset)
    
    #tags count yearly (CAN USE GUI HERE)
    print("\nMost popular Tags for year 2015")
    print("===="*7)
    year="2015"
    ttp.tagsCountYearly(dataset,year,listTags)
    print("\nMost popular Tags for year 2016")
    print("===="*7)
    year="2016"
    ttp.tagsCountYearly(dataset,year,listTags)
    print("\nMost popular Tags for year 2017")
    print("===="*7)
    year="2017"
    ttp.tagsCountYearly(dataset,year,listTags)
    
    #Call to KNN Algorithm    
    k.knn(dataset,listTags,durationEntry,languageEntry)
    twty.searchTweets()
Example #10
File: Spam.py Project: zoharbou/IML
def train_knn(x_train, y_train, x_test, k):
    trainer = knn(k)
    trainer.fit(x_train, y_train)
    y_predict = np.zeros(len(x_test))
    for j, x in enumerate(x_test):
        y_predict[j] = trainer.predict(x)
    return y_predict
Example #11
def hack(img_name):
    '''
    hack Recognize a CAPTCHA image
      Inputs:
          img_name: filename of image
      Outputs:
          digits: 4 digits in the input CAPTCHA image, shape(4, ).
    '''
    # hack_data.npz contains 100 images with labels,
    # i.e., 400 digits with labels
    data = np.load('hack_data.npz')

    # YOUR CODE HERE (you can delete the following code as you wish)
    x_train = data['x_train']
    y_train = data['y_train']

    # begin answer
    N = x_train.shape[0]
    # k chosen near sqrt(N): sqrt(400) = 20 in this case
    k = 20
    # test matrix, 4-by-144
    x_test = extract_image(img_name)

    digits = knn.knn(x_test, x_train, y_train, k)
    # end answer

    return digits
Example #12
def requirement1() :

    global min_range
    global max_range

    ds = [100, 500, 1000, 10000]
    b = 100
    h = 0.1
    k = 10

    xs = np.linspace(min_range, max_range, 200)

    # Histogram as example
    legends = []
    data = get_data(200)
    plot_true_distribution(1000)
    legends.append('True distribution')
    for d in ds :
        data = get_data(d)
        plt.hist(data, density=True, bins=b, alpha=0.4)
        legends.append('#bin = ' + str(b) + ', #data = ' + str(d))
    plt.legend(legends)
    plt.title('Requirement 1-1')
    plt.savefig('req1-1', dpi=300)
    plt.show()

    # KDE as example
    plt.figure()
    legends = []
    data = get_data(200)
    plot_true_distribution(1000)
    legends.append('True distribution')
    density = kde(data)
    for d in ds :
        data = get_data(d)
        density = kde(data)
        density.set_bandwidth(h)
        plt.plot(xs, density(xs))
        legends.append('h = ' + str(h) + ', #data = ' + str(d))
    plt.legend(legends)
    plt.title('Requirement 1-2')
    plt.savefig('req1-2', dpi=300)
    plt.show()

    # KNN as example
    plt.figure()
    legends = []
    data = get_data(200)
    plot_true_distribution(1000)
    legends.append('True distribution')
    for d in ds :
        data = get_data(d)
        density = knn(data, k)
        plt.plot(xs, density(xs))
        legends.append('k = ' + str(k) + ', #data = ' + str(d))
    plt.legend(legends)
    plt.ylim([0, 0.4])
    plt.title('Requirement 1-3')
    plt.savefig('req1-3', dpi=300)
    plt.show()
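
Here knn(data, k) is used like kde(data) above: it must return a callable density estimate. A 1-D sketch under that assumption, using the k-NN density formula p(x) = k / (N * 2 * d_k(x)), where d_k(x) is the distance from x to its k-th nearest sample (names hypothetical):

import numpy as np

def knn(data, k):
    # k-NN density estimate in one dimension.
    data = np.asarray(data).ravel()
    n = len(data)
    def density(xs):
        xs = np.atleast_1d(xs)
        d = np.abs(xs[:, None] - data[None, :])  # query-to-sample distances
        dk = np.sort(d, axis=1)[:, k - 1]        # k-th nearest distance
        return k / (n * 2.0 * dk)                # p(x) = k / (N * V(x))
    return density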
Example #13
def run_ml(k=1):
    total = len(data)
    num_test = int(total * 0.3)

    train_data = []
    train_labels = []
    test_data = []
    test_labels = []

    train_data = data[:]
    train_labels = labels[:]

    n = total
    for i in range(num_test):
        index = int(random.random() * n)
        n -= 1
        test_data.append(train_data.pop(index))
        test_labels.append(train_labels.pop(index))
    # train
    import nn
    classifier_nn = nn.nn(train_data, train_labels)
    import knn
    classifier_knn = knn.knn(train_data, train_labels)
    # test: append (nn prediction, knn prediction, true label) to the module-level res list
    for i in range(len(test_data)):
        d = test_data[i]
        res.append(
            [classifier_nn.test(d),
             classifier_knn.test(d, k), test_labels[i]])
Example #14
def test():
    features = np.load('train_features.npy')

    # create test matrix
    test_matrix = create_matrix(100, 100)

    # train knn model
    idx = knn(features, RW(test_matrix))

    fig = plt.figure()
    ax = fig.add_subplot(1, 1, 1)
    ax.set_title("fruchterman_reingold")

    # draw
    draw(np.load('matrices/synthetic' + str(idx) + '.npy'), 'estimated_layout',
         False, ax)
    draw(test_matrix, 'real_layout', True, ax)

    fig = plt.figure()
    ax = fig.add_subplot(1, 1, 1)
    ax.set_title("Kamada-Kawai")

    # draw
    draw2(np.load('matrices/synthetic' + str(idx) + '.npy'),
          'estimated_layout2', False, ax)
    draw2(test_matrix, 'real_layout2', True, ax)

    fig = plt.figure()
    ax = fig.add_subplot(1, 1, 1)
    ax.set_title("Spectral method")

    # draw
    draw3(np.load('matrices/synthetic' + str(idx) + '.npy'),
          'estimated_layout3', False, ax)
    draw3(test_matrix, 'real_layout3', True, ax)
Example #15
def train(k, xx, yy, data):
    ptxm = []
    ptym = []
    for i in data:
        ptx = [i[0], i[1]]
        ptxm.append(ptx)
        label = i[2]
        ptym.append(label)
    nn = knn(k)
    ptxm = np.array(ptxm)
    ptym = np.array(ptym)
    #print(ptxm)
    #print(ptym)
    nn.train(ptxm, ptym)
    zz = []
    #print(nn.predict(ptxm))
    x_len, y_len = np.shape(xx)
    tptx = []
    for i in range(x_len):
        for j in range(y_len):
            px = xx[i][j]
            py = yy[i][j]
            ptx = [px, py]
            tptx.append(ptx)
    #print(tptx)
    z = nn.predict(np.array(tptx))
    z = np.reshape(z, (x_len, y_len))
    return z
Example #16
    def predict(self, test_data):
        self.rtl = np.zeros((test_data.shape[0], self.train_lables_num))

        test_data_num = test_data.shape[0]
        self.predict_labels = np.zeros((test_data_num, self.train_lables_num))
        for i in range(test_data_num):
            # get k nearest neighbors' index in train data
            knn_index, knn_distances = knn.knn(test_data[i], self.train_data,
                                               self.k)

            for j in range(self.train_lables_num):
                temp = 0

                for index in knn_index:
                    if self.train_labels[index][j] == 1:
                        temp = temp + 1
                y1 = self.PH1[j] * self.PEH1[j][temp]
                y0 = self.PH0[j] * self.PEH0[j][temp]

                self.rtl[i][j] = self.PH1[j] * self.PEH1[j][temp] / (
                    self.PH1[j] * self.PEH1[j][temp] +
                    self.PH0[j] * self.PEH0[j][temp])
                if y1 > y0:
                    self.predict_labels[i][j] = 1
                else:
                    self.predict_labels[i][j] = 0
        # print(self.predict_labels)
        return self.predict_labels
Example #17
    def fit(self):

        # calculate the label priors PH1 and PH0 (with Laplace smoothing s)
        for i in range(self.train_lables_num):
            y = 0
            for j in range(self.train_data_num):
                if self.train_labels[j][i] == 1:
                    y += 1
            self.PH1[i] = (self.s + y) / (self.s * 2 + self.train_data_num)
        self.PH0 = 1 - self.PH1

        # calculate the likelihoods PEH1 and PEH0
        for i in range(self.train_lables_num):
            c1 = np.zeros((self.k + 1, ))
            c0 = np.zeros((self.k + 1, ))
            for j in range(self.train_data_num):
                temp = 0
                knn_index, knn_distances = knn.knn(self.train_data[j],
                                                   self.train_data, self.k + 1)
                knn_index = knn_index[1:]
                # knn_distances = knn_distances[1:]
                for index in knn_index:
                    if self.train_labels[index][i] == 1:
                        temp += 1

                if self.train_labels[j][i] == 1:
                    c1[temp] = c1[temp] + 1
                else:
                    c0[temp] = c0[temp] + 1

            for l in range(self.k + 1):
                self.PEH1[i][l] = (self.s + c1[l]) / (self.s *
                                                      (self.k + 1) + c1.sum())
                self.PEH0[i][l] = (self.s + c0[l]) / (self.s *
                                                      (self.k + 1) + c0.sum())
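
Examples 16 and 17 implement ML-kNN on top of a helper knn.knn(query, train_data, k) that returns the indices and distances of the k nearest training rows. A minimal sketch of that assumed interface:

import numpy as np

def knn(query, train_data, k):
    # Return (indices, distances) of the k training rows nearest to query,
    # ordered from closest to farthest.
    dists = np.linalg.norm(train_data - query, axis=1)
    idx = np.argsort(dists)[:k]
    return idx, dists[idx]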
Example #18
def test_knn():
	dataset = pickle.load(open("dataset.obj", "rb"))
	n_classes = len(dataset.get_classes())
	start = time.time()
	predictions = knn.knn(dataset)
	end = time.time()
	elapsed_time = utils.humanize_time(end - start)
	print("Elapsed time using knn {0}...".format(elapsed_time))
	print("predictions = \n{0}".format(predictions))
	utils.write_list(predictions, "results/knn-predictions.txt")
	# predictions = [
	# 	[1, 1, 0, 2, 4, 3, 2, 0, 2, 4, 0, 3, 2, 1, 1],
	# 	[1, 2, 4, 2, 1, 0, 4, 1, 3, 2, 2, 2, 1, 2, 1],
	# 	[2, 3, 4, 2, 2, 0, 2, 0, 3, 3, 1, 2, 2, 2, 3],
	# 	[0, 1, 3, 3, 3, 3, 1, 3, 3, 3, 2, 2, 3, 0, 1],
	# 	[3, 0, 2, 1, 4, 2, 1, 0, 2, 4, 1, 1, 4, 2, 3]
	# ]
	hist = np.zeros((n_classes, n_classes), dtype=np.uint16)
	for i in range(len(predictions)):
		for j in range(len(predictions[i])):
			c = predictions[i][j]
			hist[i][c] += 1
	print("hist = \n{0}".format(hist))
	np.savetxt("results/knn-hist.csv", hist, fmt="%i", delimiter=",")
	confusion_matrix = hist / 25.0
	print("conf mat = \n{0}".format(confusion_matrix))
	values = [confusion_matrix[i][i] for i in range(n_classes)]
	precision = np.average(values)
	print("precision = {0}".format(precision))

	plt.matshow(confusion_matrix)
	plt.title('Confusion matrix')
	plt.colorbar()
	plt.show()
Example #19
 def knn(self, predictData=None, trainData=None):
     h = hp()
     k = knn()
     accuracy = []
     precision = []
     recall = []
     f_score = []
     mean, stdDev = h.normalizeData(trainData)
     nn = int(input("Enter the number of closest neighbors to consider: "))
     h.normalizeEvaluationSet(predictData, mean, stdDev)
     for i in range(len(trainData)):
         tmp = None
         predictData = trainData[i]
         tmp = [lt for j, lt in enumerate(trainData) if j != i]
         td = h.convertToList(tmp)
         k.classify(td, predictData, nn)
         truePositives, trueNegatives, falsePositives, falseNegatives = h.findParams(
             predictData)
         accuracy.append(
             h.findAccuracy(truePositives, trueNegatives, falsePositives,
                            falseNegatives))
         tmpPrecision = h.findPrecision(truePositives, trueNegatives,
                                        falsePositives, falseNegatives)
         tmpRecall = h.findRecall(truePositives, trueNegatives,
                                  falsePositives, falseNegatives)
         precision.append(tmpPrecision)
         recall.append(tmpRecall)
         f_score.append(h.findFMeasure(tmpPrecision, tmpRecall))
     return accuracy, precision, recall, f_score
Example #20
def run_all():
    import pandas as pd
    from ds import decision_tree
    from knn import knn
    from logreg import logreg
    from sv import sv
    from ensemble_methods import ensemble_methods
    from xg import xgb

    print('Running All')
    acc_ds = decision_tree()
    acc_knn = knn()
    acc_log = logreg()
    acc_xg = xgb()
    acc_svc = sv()
    acc_rf, acc_ab, acc_gb = ensemble_methods()

    # Model Performance

    models = pd.DataFrame({
        'Model': [
            'XGBoost', 'Logistic Regression', 'KNN', 'Support Vector Machines',
            'Gradient Boosting', 'Random Forest', 'Decision Tree', 'ADABoost'
        ],
        'Score':
        [acc_xg, acc_log, acc_knn, acc_svc, acc_gb, acc_rf, acc_ds, acc_ab]
    })
    models = models.sort_values(by='Score', ascending=False)
    print(models)
Example #21
def compare_errors(k_vals, input_data_file):
    ## read in the input data
    initial_data = create_data(input_data_file)

    ## create plots of the data (this should save the images within the current
    ## directory)
    plot_data(initial_data)

    ## integerize the data labels
    integerized_data, label_dict = integerize_labels(initial_data)

    ## split the data into train and test
    train, test = split(integerized_data)

    ## compute the errors
    errors = {}
    for k in k_vals:
        predicted_labels = knn(train, test, k)
        error_rate = calculate_error_rate(predicted_labels, test)
        errors[k] = error_rate

    ## BONUS: weighting
    for k in k_vals:
        weighted_predicted_labels = weighted_knn(train, test, k)
        weighted_error_rate = calculate_error_rate(weighted_predicted_labels,
                                                   test)
        print("Weighted error value for k = %d was %f" %
              (k, weighted_error_rate))

    return errors
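
weighted_knn is not shown in this example; a plausible variant weights each neighbor's vote by inverse distance instead of counting votes equally. A sketch under the assumption that train and test are arrays whose last column holds the integer label (that layout is an assumption, not taken from the source):

import numpy as np

def weighted_knn(train, test, k, eps=1e-9):
    # Each of the k nearest training points votes for its label with
    # weight 1 / (distance + eps), so closer neighbors count for more.
    train = np.asarray(train, dtype=float)
    test = np.asarray(test, dtype=float)
    preds = []
    for row in test:
        d = np.linalg.norm(train[:, :-1] - row[:-1], axis=1)
        votes = {}
        for i in np.argsort(d)[:k]:
            label = int(train[i, -1])
            votes[label] = votes.get(label, 0.0) + 1.0 / (d[i] + eps)
        preds.append(max(votes, key=votes.get))
    return preds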
Example #22
def digit_recognizer(train_data_file, test_data_file, test_label_file, test_result_file, knn_k):
    log("Start get train data & label.")
    train_data, train_label = load_train_data(train_data_file)

    log("Start get test data.")
    test_data = load_test_data(test_data_file)

    log("Start get test label.")
    test_label = load_test_result(test_label_file)
    log("test label: {}".format(test_label))
    m, n = shape(test_data)
    error_count = 0
    result = []
    for idx in range(m):
        log("main iter: {}".format(idx))
        classifier_result = knn(test_data[idx], train_data, train_label, knn_k)
        result.append(classifier_result)
        log("the class result: {}, the true answer: {}".format(classifier_result, test_label[0, idx]))
        if classifier_result != test_label[0, idx]:
            error_count += 1

    log("error count: {}".format(error_count))
    log("error rate: {}".format(error_count / float(m)))

    save_result(result, test_result_file)
Example #23
    def move(self):

        a = []
        Pregnancies = self.lineEdit.text()
        Glucose = self.lineEdit_2.text()
        BloodPressure = self.lineEdit_3.text()
        SkinThickness = self.lineEdit_4.text()
        Insulin = self.lineEdit_5.text()
        Bmi = self.lineEdit_6.text()
        DiabetesPedigreeFunction = self.lineEdit_7.text()
        Age = self.lineEdit_8.text()
        print(isinstance(12, numbers.Real))
        if (len(Pregnancies) != 0 and len(Glucose) != 0
                and len(BloodPressure) != 0 and len(SkinThickness) != 0
                and len(Insulin) != 0 and len(Bmi) != 0
                and len(DiabetesPedigreeFunction) != 0 and len(Age) != 0):
            a.append(float(Pregnancies))
            a.append(float(Glucose))
            a.append(float(BloodPressure))
            a.append(float(SkinThickness))
            a.append(float(Insulin))
            a.append(float(Bmi))
            a.append(float(DiabetesPedigreeFunction))
            a.append(float(Age))
            model = knn('data_diabetes.csv')
            output = model.predict([a])
            print(output)
            self.openWindow(output[0])
        else:
            self.warning("Cảnh báo", "Bạn nhập sai")  # i.e. "Warning", "You entered invalid input"
Example #24
def generate_output(patient_index):
    distances, indices = knn()
    neighbours_indices = neighbours_of_index(patient_index, distances, indices)
    export_anomaly_and_neighbours('Reports/anomalies/', patient_index,
                                  neighbours_indices)
    data = []
    with open('Reports/anomalies/' + str(patient_index) + '.csv') as csvfile:
        spamreader = csv.reader(csvfile, delimiter=',')
        for row in spamreader:
            data.append(row)
    patient_input = [float(i) for i in data[1][1:]]
    all_inputs = [[float(j) for j in i[1:]] for i in data[1:]]
    for input_idx in range(30):
        inputs = get_inputs(input_idx, all_inputs)
        std = np.std(inputs)
        mean = np.mean(inputs)
        z = abs((patient_input[input_idx] - mean) / std)
        z = float("{0:.3f}".format(z))
        above_mean = patient_input[input_idx] >= mean
        t1 = "for " + color.DARKCYAN + Y_LABEL[input_idx].title(
        ) + color.END + " the patient is "
        t2 = color.PURPLE + str(z) + color.END
        t3 = " standard deviations below the mean"
        if z > 1:
            t2 = color.RED + str(z) + color.END
        if above_mean:
            t3 = " standard deviations above the mean"
        print(t1 + t2 + t3)
Example #25
def assess_knn(name,
               point_info,
               categories,
               labels,
               actions=['save', 'load', 'evaluate']):
    '''
    name: name of graph being assessed
    point_info: coords (list of coordinates), nids (list of node ids), 
        category_map (map of node ids to categories), nodes (dataframe of all this info)
    categories: categories to assess
    labels: correct and incorrect high-density nodes of a given category
    actions: [evaluate/graph/save] -> what to do
    '''
    print >> sys.stderr, 'assessing knn...'
    coords, nids, category_map, nodes = point_info
    pos = {}
    for nid, coord in zip(nids, coords):
        pos[nid] = (coord[0], coord[1])

    ks = list(range(4, 20))
    results = {cat: {} for cat in categories}
    if 'load' in actions:
        for k in ks:
            for c in categories:
                graphfile = 'src/clustering/graphs/{}-{}-knn-{:02}.csv'.format(
                    name, c, k)
                if os.path.isfile(graphfile):
                    edges = pd.read_csv(graphfile, ',', header=0)
                    edges['r1'] = edges['r1'].apply(str)
                    edges['r2'] = edges['r2'].apply(str)
                    cat_graph = nx.from_pandas_edgelist(edges,
                                                        source='r1',
                                                        target='r2')
                    if 'evaluate' in actions:
                        res = evaluate('knn,k={}'.format(k), c,
                                       set(cat_graph.nodes()), labels)
                        results[c][k] = res

    for i, k in enumerate(ks):
        if all([k in results[c] for c in categories]):
            continue
        graph = knn.knn(nodes, k)
        for c in categories:
            cat_graph = knn.split(graph, c, category_map)
            cat_graph = cutoff.filter_connected_components(cat_graph)
            if 'evaluate' in actions:
                res = evaluate('knn,k={}'.format(k), c, set(cat_graph.nodes()),
                               labels)
                results[c][k] = res
            if 'graph' in actions:
                draw_graph(
                    cat_graph, pos, 'knn with filter and k={}'.format(k),
                    'src/clustering/figures/{}-{}-knn-{:02}.png'.format(
                        name, c, k))
            if 'save' in actions:
                save_graph(
                    cat_graph,
                    'src/clustering/graphs/{}-{}-knn-{:02}.csv'.format(
                        name, c, k))
    return results
Example #26
def cross_validation(data, target, k):
    num_samples = data.shape[0]
    perm = np.arange(num_samples)
    np.random.shuffle(perm)
    data = data[perm]
    target = target[perm]

    batch_sz = num_samples//10
    
    errs = 0
    for batch in range(0, num_samples, batch_sz):

        batch_train_X = np.concatenate( [data[:batch],   data[(batch+batch_sz):]] )
        batch_train_Y = np.concatenate( [target[:batch], target[(batch + batch_sz):]] )

        batch_test_X = data[batch:batch+batch_sz]
        batch_test_Y = target[batch:batch+batch_sz]

        pred = knn(batch_train_X, batch_test_X, batch_train_Y, k)

        #print(pred)
        #print(batch_test_Y)
        errs += np.sum(pred != batch_test_Y)

    return errs/num_samples
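
A typical way to drive this routine is to sweep k and keep the value with the lowest cross-validated error rate; a short usage sketch (names hypothetical):

errors = {k: cross_validation(data, target, k) for k in range(1, 16)}
best_k = min(errors, key=errors.get)
print("best k =", best_k, "error rate =", errors[best_k])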
Example #27
def run_ml(k = 1):
	total = len(data)
	num_test = int(total * 0.3)

	train_data = []
	train_labels = []
	test_data  = []
	test_labels = []

	train_data   = data[:]
	train_labels = labels[:]

	n = total
	for i in range(num_test):
		index = int(random.random() * n)
		n -= 1
		test_data.append(train_data.pop(index))
		test_labels.append(train_labels.pop(index))
	# train
	import nn
	classifier_nn = nn.nn(train_data,train_labels)
	import knn
	classifier_knn = knn.knn(train_data,train_labels)
	# test: append (nn prediction, knn prediction, true label) to the module-level res list
	for i in range(len(test_data)):
		d = test_data[i]
		res.append([classifier_nn.test(d), classifier_knn.test(d,k), test_labels[i]])
Example #28
    def match(self, tree, query, k=None, radius=None):
        dist_list, idx_list = knn.knn(tree, query, k)
        dist_list = dist_list.T

        # index basis is 1
        idx_list = idx_list.T - 1
        return dist_list, idx_list
Example #29
def test_knn():
    dataset = pickle.load(open("dataset.obj", "rb"))
    n_classes = len(dataset.get_classes())
    start = time.time()
    predictions = knn.knn(dataset)
    end = time.time()
    elapsed_time = utils.humanize_time(end - start)
    print("Elapsed time using knn {0}...".format(elapsed_time))
    print("predictions = \n{0}".format(predictions))
    utils.write_list(predictions, "results/knn-predictions.txt")
    # predictions = [
    # 	[1, 1, 0, 2, 4, 3, 2, 0, 2, 4, 0, 3, 2, 1, 1],
    # 	[1, 2, 4, 2, 1, 0, 4, 1, 3, 2, 2, 2, 1, 2, 1],
    # 	[2, 3, 4, 2, 2, 0, 2, 0, 3, 3, 1, 2, 2, 2, 3],
    # 	[0, 1, 3, 3, 3, 3, 1, 3, 3, 3, 2, 2, 3, 0, 1],
    # 	[3, 0, 2, 1, 4, 2, 1, 0, 2, 4, 1, 1, 4, 2, 3]
    # ]
    hist = np.zeros((n_classes, n_classes), dtype=np.uint16)
    for i in range(len(predictions)):
        for j in range(len(predictions[i])):
            c = predictions[i][j]
            hist[i][c] += 1
    print("hist = \n{0}".format(hist))
    np.savetxt("results/knn-hist.csv", hist, fmt="%i", delimiter=",")
    confusion_matrix = hist / 25.0
    print("conf mat = \n{0}".format(confusion_matrix))
    values = [confusion_matrix[i][i] for i in range(n_classes)]
    precision = np.average(values)
    print("precision = {0}".format(precision))

    plt.matshow(confusion_matrix)
    plt.title('Confusion matrix')
    plt.colorbar()
    plt.show()
Example #30
def get_nearest_neighbour(pt_cloud,
                          bev_width,
                          bev_length,
                          image_downsampling_factor,
                          P2,
                          parts=4):
    if pt_cloud.shape[0] != 3:
        pt_cloud = pt_cloud.T
    world_pts = []
    # visit every cell of the bird's-eye-view grid
    for i in range(bev_length):
        for j in range(bev_width):
            world_pts.append(bev2world(j, i, bev_width, bev_length, 80, 70))
    all_inds = []
    for i in range(parts):
        cur_part = np.array(
            world_pts[i * len(world_pts) // parts:i * len(world_pts) // parts +
                      len(world_pts) // parts]).T
        _, inds = knn.knn(cur_part.astype(np.float32),
                          pt_cloud.astype(np.float32), 1)
        inds = np.squeeze(inds) - 1
        all_inds = all_inds + inds.tolist()
    world_pts = np.array(world_pts).T
    nearest = pt_cloud[:, all_inds]
    return world_pts, nearest
Example #31
def procYear(player, kNeighbors=4, year=''):
    """ takes as input dataframe from year specific FanGraphs Leaderboard, 
    year (blank = all), player, and the number of neighbors in cluster
    returns cluster of players
    """
    ## load the knn clustering class object
    knnObj = knn.knn()
    knnObj.k = kNeighbors

    knnObj.procList = ['CPU']  ## Choose CPU or GPU distance calculations

    ## read fangraphs csv into dataframe
    ## downloaded from https://www.fangraphs.com/
    df = pd.read_csv('FanGraphsLeaderboard' + str(year) + '.csv')

    ## strip the UTF-8 BOM artifact from the "Name" column header
    df = df.rename(columns={'\xef\xbb\xbf"Name"': 'name'})

    ## handle % in floating point values
    for col in df.columns:
        if "%" in col:
            ## chop off ' %'
            df[col] = df.apply(lambda x: float(x[col][:-2]), axis=1)

    ## use player name as id
    dataCl = np.array(df.name)

    ## columns to remove from clustering
    drop_cols = ['Team', 'playerid', 'name']

    data = np.column_stack(
        [df[col] for col in df.columns if col not in drop_cols])

    ## handle wide data edge case
    if len(data) <= knnObj.k:
        return [0]

    ## feature scaling
    scaler = StandardScaler()
    data = scaler.fit_transform(data)

    nPts = len(data)

    if player not in dataCl:
        return [0]

    point = list(dataCl).index(player)
    dataPoint = data[point]

    ## subtract off test point
    knnObj.data = np.append(data[:][0:point], data[:][point + 1:nPts], axis=0)
    knnObj.dataCl = np.append(dataCl[:][0:point],
                              dataCl[:][point + 1:nPts],
                              axis=0)

    knnObj.testPt = dataPoint

    result = knnObj.getCluster()

    return result
Example #32
def lvq_3(prots):
    prots_3 = prots[:]
    for r in range(repetitions):
        for x in dataset:
            closest_prototypes = nn(x, 2, prots_3)
            m = closest_prototypes[0]['elem']
            n = closest_prototypes[1]['elem']

            m_class = closest_prototypes[0]['class']
            n_class = closest_prototypes[1]['class']
            x_class = x[len(x) - 1]

            same_class = m
            if (m_class == x_class):
                same_class = m
                other_class = n
            elif (n_class == x_class):
                same_class = n
                other_class = m
            else:
                same_class = False

            if (window_rule(x, m, n) and same_class):
                if (m_class != n_class):
                    movement(same_class, x, True)
                    movement(other_class, x, False)
                else:
                    movement(same_class, x, True, e=e)
                    movement(other_class, x, True, e=e)

    print "LVQ 3 RESULTS:"
    return knn(k, prots_3, evaluation)
Example #33
def run(source, method, k):
	tags = [1,1,1,1,1,1]
	if method=='knn' and k is None:
		raise ValueError('Argument k is mandatory for KnnClassifier')
	else:
		if source=='chemistry':
			if tags[5]:
				glossary = get_chemistry()
			else:
				glossary = None
			filepath = 'files/chemistry.xml'
		else:
			if tags[5]:
				glossary = get_graphicdesign()
			else:
				glossary = None
			filepath = 'files/graphic-design.xml'
			
		matrix, tag_names = preprocessing(filepath, tags, glossary)

		print(tag_names)

		if method=='knn':
			k = int(math.fabs(int(k) or 5))
			r = knn(matrix, k)
		else:
			r = logistic_regression(matrix)

		print(r)
Example #34
def runtest_knn(train_data, test_data, k=1):
    train_t, train_zips = cookdata_knn(train_data)
    test_t, test_zips = cookdata_knn(test_data)
    correct = 0
    for i in range(len(test_zips)):
        res = knn.knn(train_zips, train_t, k, test_zips[i])
        if res == test_t[i]:
            correct += 1
    return 1.-(float(correct)/len(test_zips))
Example #35
def loocv(training_data, k, current_features): 
    num_examples = len(training_data)
    num_correct = 0
    for leave_out in range (0, num_examples):
        test = [training_data.pop(leave_out),]
        num_correct += knn.knn(training_data, test, k, current_features)
        training_data.insert(leave_out, test[0])
    acc = 100.0 * num_correct / num_examples
    return [num_correct, num_examples, acc]
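
This leave-one-out routine is the usual inner loop of greedy forward feature selection: on each pass, try adding every remaining feature and keep the one with the best LOOCV accuracy. A sketch of such a driver (num_features and the surrounding names are hypothetical):

best_features = []
remaining = set(range(num_features))
while remaining:
    # score every candidate feature set; loocv returns [correct, total, acc]
    scored = [(loocv(training_data, k, best_features + [f])[2], f)
              for f in remaining]
    acc, f = max(scored)
    best_features.append(f)
    remaining.remove(f)
    print("added feature", f, "-> LOOCV accuracy", acc)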
Example #36
def mnist_block(train_set, valid_set, test_set, knn_data, mis):

    dataset = train_set
    n=int(mis*28)
 
    ###mask
    train_mask=np.ones_like(train_set)
    valid_mask=np.ones_like(valid_set)
    test_mask=np.ones_like(test_set)

    block=[0]*28
    for row in range(train_mask.shape[0]):
        ran=np.random.randint(100,700,size=n)
        for r in ran:
            train_mask[row,r:r+28]=block



    data = (train_set*train_mask, valid_set *valid_mask ,test_set *test_mask)
    mask= train_mask, valid_mask, test_mask



    ###knn
    knn_mask = np.split(train_mask, 10)[0]
    t0=time.time()
    knn_result = knn(knn_data , knn_mask,k=50)
    tknn=time.time()-t0


    ###sda
    t0=time.time()    
    gather=Gather_sda(dataset,data ,problem = 'class', available_mask = mask,
                          method = 'nes_mom',
                          pretraining_epochs = 10,
                          pretrain_lr = 0.0005,
                          training_epochs = 100,
                          finetune_lr = 0.0005,
                          batch_size = 200,  ###300
                          hidden_size = [1000,1000,100],
                          dA_initiall = True ,
                          error_known = True )

    gather.finetuning()
    tsda=time.time()-t0
    print('time_knn',tknn,'time_sda',tsda)

    sda_er = np.mean(np.sum((1-train_mask)*((train_set-gather.gather_out())**2), axis=1))
    kn_er = np.mean(np.sum((1-knn_mask)*((knn_data-knn_result)**2), axis=1))
    
    return(sda_er, kn_er)
Example #37
def test():
	print("Starting int test.")
	k = knn.knn((
		knn.TrainingEntry((0, 0), 'red'),
		knn.TrainingEntry((5, 5), 'blue'),
	))
	assert k.predict((1, 1)) == 'red'
	assert k.predict((4, 4)) == 'blue'
	for vec in [(0, 0), (2, 4), (5, 5), (7, 9), (-2, -4)]:
		print("Regress[{0}] = {1}".format(vec, k.regress(vec)))
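
Examples 37 and 41 use knn.knn as a small class that stores TrainingEntry objects and exposes predict (majority label among the k nearest entries). A compact sketch of that assumed interface, with Euclidean distance and k=3 as illustrative defaults (the real module also provides regress and knn.util.dist_string, omitted here):

import collections
import math

class TrainingEntry(object):
    def __init__(self, vec, label):
        self.vec, self.label = vec, label

def dist_euclidean(l, r):
    return math.sqrt(sum((a - b) ** 2 for a, b in zip(l, r)))

class knn(object):
    def __init__(self, trainer=(), dist=dist_euclidean, k=3):
        self.trainer = list(trainer)  # TrainingEntry objects
        self.dist = dist
        self.k = k

    def predict(self, vec):
        # Majority label among the k training entries nearest to vec.
        nearest = sorted(self.trainer, key=lambda e: self.dist(vec, e.vec))
        labels = (e.label for e in nearest[:self.k])
        return collections.Counter(labels).most_common(1)[0][0]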
Example #38
def setupClassifier(path):
	data  = []
	label = []
	l = p.getTrialList(path)
	for x in l:
		features = f.Features(x,path[-2])
		data.append(features.feature)
		if x.head.target == 'good':
			label.append(1)
		else:
			label.append(0)
	return knn.knn(data, label)
Example #39
def leave_one_out(examples,k):
    conf_matr = ConfusionMatrix()
    for ex in examples:
        # disable only this example
        ex.active = False
        # run the k-Nearest-Neighbor algorithm
        rank_list = knn.knn(k,examples,ex)
        # check the voting for correctness
        outcome = knn.voting(rank_list)
        conf_matr.inc_according_to(outcome,ex.outcome)
        ex.active = True
    # return the computed confusion matrix
    return conf_matr
Example #40
def datingClassTest():
    hoRatio = 0.80
    datingDataMat, datingLabels = file2matrix(r'C:\Users\Daniel.Lee\Desktop\machinelearninginaction-master\Ch02\datingTestSet.txt')
    normMat, ranges, minVals = autoNorm(datingDataMat)
    m = normMat.shape[0]
    numTestVecs = int(m * hoRatio)
    errorCount = 0.0
    for i in range(numTestVecs):
        classifierResult = knn(normMat[i, :], normMat[numTestVecs:m, :], datingLabels[numTestVecs:m], 3)
        print "the classifier came back with : %d , the real answer is : %d" % (classifierResult, datingLabels[i])
        if classifierResult != datingLabels[i]: errorCount += 1.0
    print "the total error rate is : %f" % ( float(errorCount) / float(numTestVecs))

Example #41
def str_test(nr_train):
	print("Starting str test ({0} sample[s]).".format(nr_train))
	def label(s):
		return "short" if len(s) < 5 else "long"

	k = knn.knn(dist=knn.util.dist_string)
	# k = knn.knn(dist=lambda l, r: abs(len(l) - len(r)))
	for word, count in entries[:nr_train]:
		k.trainer.append(knn.TrainingEntry(word, label(word)))

	errors = 0
	for word, count in entries:
		errors += int(k.predict(word) != label(word))
	print("Error rate: {0}".format(float(errors) / len(entries)))
Example #42
def leave_one_out(examples,k):
    right_classified = 0
    for ex in examples:
        # disable only this example
        ex.active = False
        # run the k-Nearest-Neighbor algorithm
        rank_list = knn.knn(k,examples,ex)
        # check the voting for correctness
        outcome = knn.voting(rank_list)
        if outcome == ex.outcome:
            right_classified += 1
        ex.active = True
    # return which share was correctly classified
    return right_classified/float(len(examples.examples))
Example #43
def test_knn(filepath, glossary, k=5):
	bits = functions.gen_bitlist(6)[1:]

	total = len(bits)
	best = 0  # avoid shadowing the built-in max
	best_tags = []
	counter = 1
	for tags in bits:
		matrix, tag_names = preprocessing(filepath, tags, glossary)
		r = knn(matrix, k)
		print(str(counter) + "/" + str(total), end='\r')
		if r > best:
			best = r
			best_tags = tag_names
		counter += 1
	print("---Best---")
	print("Score: " + str(best))
	print("Tag names: " + str(best_tags))
Example #44
 def setUp(self):
     self.vector = vectorization.vector()
     self.training = self.vector.vectorize(iter(training_movies))
     self.knn = knn.knn()
Example #45
from Preprocess import call
from knn import knn
from naivebayes import main_def
from dtree import decisionTree

if __name__=="__main__":
    raw = raw_input("Data reduction technique? \n1. PCA 2. Correlation filter 3. Variance Filter 4. Without Reduction\n")
    call(int(raw))
    alg = raw_input("Algorithm for classification? \n1. KNN 2. Naive-Bayes 3. Decision Tree\n")
    if int(alg) == 1:
        knn(int(raw))
    elif int(alg) == 2:
        main_def(int(raw))
    else:
        decisionTree(int(raw))
Example #46
def train(d, l):
	import nn
	classifier_nn = nn.nn(d,l)
	import knn
	classifier_knn = knn.knn(d,l)
	return classifier_nn, classifier_knn
Example #47
                          finetune_lr = 0.001,
                          batch_size = 50,  #10
                          hidden_size = [170,168,130], #[3000,1000,450,168],
                          corruption_da = [0.2, 0.2, .2, 0.1,0.1],
                          drop = [0.2, 0.3, 0.3,0.1,0.2,0.],
                          dA_initiall = True ,
                          error_known = True ,
                          activ_fun = T.tanh,
                           regu_l1 = 0,
                           regu_l2 = 0)  #T.nnet.sigmoid)

        gather.finetuning()
        ###########define nof K ###############
        k_neib = 20
        print('... Knn calculation with {} neighbor'.format(k_neib))
        knn_result = knn(test_set,test_mask,k=k_neib)

        #########run the result for test

        sda_error.append(MSE(test_set, gather.gather_out(), test_mask))
        mean_error.append(MSE(dataset,dataset.mean(axis=0),available_mask))
        knn_error.append(MSE(test_set,knn_result,test_mask))

        print('sda_error= ',sda_error[-1])
        print('knn_error= ',knn_error[-1])
        print('mean_error= ',mean_error[-1])  

    

print('sda_error= ',sda_error)
print('knn_error= ',knn_error)
Example #48
import reader
import tablestr
import zeror
import xval 
import nb
import knn
import uxval

if __name__ == "__main__":      
    filename = 'data/weather2.csv'   
    table = tablestr.Table()             #create raw data structure
    reader.readcsv(filename,table )      #read the .csv data set
    f = '%4.2f'                          #set the formatting for the output 
    tables = reader.klasses(table)
    b = x = 2
    kn = 5
    k = 1
    m = 2
    uxvaltables = uxval.uxvals(tables, x, b)
    knn_acc = []
    nb_acc = []    
    for s in range(b*x):
        s += 1
        acc = knn.knn(uxvaltables[s]['test'], uxvaltables[s]['train'], tables, kn)
        #acc2 = nb.nb(xvaltables[s]['test'], xvaltables[s]['train'], tables['names'], k, m)        
        knn_acc += [f%acc]
        #nb_acc += [f%acc2]
    print 'knn_acc =', knn_acc
    #print 'nb_acc =', nb_acc

Example #49
label = []
for x in l:
	features = f.Features(x)
	data.append(features.feature)
	if x.head.target == 'good':
		label.append(1)
	else:
		label.append(0)

print len(data)
print len(label)

n = len(data)
import knn

correct = 0
for i in range(n):
	test_data  = data.pop(i)
	test_label = label.pop(i)

	# knn
	cla = knn.knn(data, label)
	if cla.test(test_data,10) == test_label:
		correct += 1
	# the nn classifier could be evaluated here as well

	data.insert(i, test_data)
	label.insert(i, test_label)
print correct * 1.0 / n
Example #50
				if neighbor_point not in clustered:
					new_cluster.append(neighbor_point)
					clustered.append(neighbor_point)
	
	print '%d clusters formed.' % len(clusters)
	print '%d(%.2f%%) noise points found.' % (len(noise), (float(len(noise))/len(points)) * 100)
	#print 'Noise: %s' % ', '.join([str(point).replace('[', '(').replace(']', ')') for point in noise])
	return [point for point in points if point not in noise]

	
# Read file
test = []
train = []

for cl in 'rgb':
	train_file = open('./TrainingData-original/W%s_train.txt' % cl, 'r')
	train_data = [[float(x.strip()) for x in line.split()] for line in train_file.read().strip().split('\n')]
	print '- Class: %s' % cl
	filtered_data = dbscan(train_data, 4, 2)
	train += [(point, cl) for point in filtered_data]
	
	new_train_file = open('./TrainingData/W%s_train.txt' % cl, 'w')
	new_train_file.write('\n'.join(['\t'.join([str(item) for item in row]) for row in filtered_data]))
	new_train_file.close()
	
	test_file = open('./TestData/W%s_test.txt' % cl, 'r')
	test += [([float(x.strip()) for x in line.split()], cl) for line in test_file.read().strip().split('\n')]
	
for i in range(1,30):
	knn.knn(i, test, train)
Example #51
import numpy
import knn

data = []
f = open(r'DRD.txt','r')
for s in f :
	d = s.split(',')
	d = [float(d2) for d2 in d]
	data.append(d)
f.close()
data = numpy.array(data)
row,colu = numpy.shape(data)

t_n = int(row*0.9)
tra = data[0:t_n,0:-1]
sam = data[t_n:,0:-1]
lab = data[0:t_n,-1]
true = numpy.array(data[t_n:,-1])
k = 21
result,label = knn.knn(tra,lab,sam,k)

label = numpy.array(label)
result_list = []
for r in result:
	if r[0] >=0.5: 
		result_list.append(label[0])
	else:
		result_list.append(label[1])
result_list = numpy.array(result_list)
print "correct rate:"
print numpy.mean(result_list == true)
Example #52
                      problem = 'regression',
                      available_mask = mask,
                      method = 'adam',
                      pretraining_epochs = 10,
                      pretrain_lr = 0.0001,
                      training_epochs = 100,
                      finetune_lr = 0.0001,
                      batch_size = 200,
                      hidden_size = [1000,100,2],
                      corruption_da = [0.1,0.1,0.1],
                      dA_initiall = True ,
                      error_known = True )
    
    gather.finetuning()
    ###########define nof K ###############  
    knn_result = knn(dataset,available_mask,k=1000)

    #########run the result for test
    #dd_mask=test_mask
    #dd = test_set

    def MAE(x,xr,mas):
        return np.mean(np.sum((1-mas) * np.abs(x-xr),axis=1))

    
    sda_error.append(MAE(test_set, gather.gather_out(), test_mask))
    mean_error.append(MAE(dataset,dataset.mean(axis=0),available_mask))
    knn_error.append(MAE(dataset,knn_result,available_mask))
        
    #sda_error.append(sum((1-dd_mask)*(np.abs(dd-gather.gather_out())), axis=1).mean())
    #mean_error.append(sum((1-available_mask)*(np.abs(dataset-dataset.mean(axis=0))), axis=1).mean())
            ## logger will go here
            
    #return articles[0]
    
        
Example #53
polishArticles = getFromDir("pl")
englishArticles = getFromDir("en")
deutschArticles = getFromDir("de")
frenchArticles = getFromDir("fr")
spanishArticles = getFromDir("es")

articles = [ polishArticles, englishArticles, deutschArticles, frenchArticles, spanishArticles ]

sortToTrainAndTest(articles)

print "train set: " + str(len(trainSet))
print "test set: " + str(len(testSet))

network = net.NeuralNetwork(trainSet, testSet)
network.initializeNetwork()
network.testNetwork()

knn = k.knn(trainSet, testSet)
knn.initializeAlgorithm()
knn.testkNN()
Example #54
        Xtest = Xtest[1 : len(Xtest)]
        Ytest = Ytest[1 : len(Ytest)]

        """
		dataset.X = Xtrain
		dataset.Y = Ytrain
		dataset.Xte = Xtest
		dataset.Yte = Ytest
		"""

        # print len(Xtrain), len(Ytrain), len(Xtest), len(Ytest)

        print("================================================================")
        print("KNN (Feature %d, TestSet %d)" % (testFeatureSize, testSetIndex))
        print("================================================================")
        knn.knn(3, testFeatureSize, Xtrain, Ytrain, Xtest, Ytest)
        # end for testSetIndex
# end for testFeatureSize


###################################
# supervised
###################################
"""
f_tr = "tool.train"
f_tm = "tool.test"
f_mm = "tool.model"
f_pm = "tool.predict"

Xtr, Ytr, Xte, Yte = util.splitTrainTest(X0, Y0, 5)
Example #55
 def setUp(self):
     self.knn = knn.knn()
Example #56
 def __init__(self, db):
     self.k = knn.knn()
     self.training = db['training']
     self.classified = db['classified']
     self.gold = db['gold']
     self.golden = {}
Example #57
def train(d, l, w, th):
	classifier_nn = nn.nn(d,l,w,th)
	classifier_knn = knn.knn(d,l)
	return classifier_nn, classifier_knn
Example #58
                          method = 'adam',
                          pretraining_epochs = 200,
                          pretrain_lr = 0.0001,
                          training_epochs = 300,
                          finetune_lr = 0.0001,
                          batch_size = 5,
                          hidden_size = [300,100,3],  #(1388, 8)  PCA--> 3
                          corruption_da = [ 0.2,.2,0.1,.1,0.1,.1],
                          dA_initiall = True ,
                          error_known = True ,
                          activ_fun =T.tanh)    
        gather.finetuning()
        ###########define nof K ###############
        k_neib = 25
        print('... Knn calculation with {} neighbor'.format(k_neib))
        knn_result = knn(dataset,available_mask,k=k_neib)

        #########run the result for test


        def MAE(x,xr,mas):
            return np.mean(np.sum((1-mas) * np.abs(x-xr),axis=1))


        sda_error.append(MAE(test_set, gather.gather_out(), test_mask))
        mean_error.append(MAE(dataset,dataset.mean(axis=0),available_mask))
        knn_error.append(MAE(dataset,knn_result,available_mask))

        print('sda_error= ',sda_error[-1])
        print('knn_error= ',knn_error[-1])
        print('mean_error= ',mean_error[-1])  
Example #59
                      problem = 'regression',
                      available_mask = mask,
                      method = 'adam',
                      pretraining_epochs = 100,
                      pretrain_lr = 0.0001,
                      training_epochs = 200,
                      finetune_lr = 0.0001,
                      batch_size = 100,
                      hidden_size = [100,20,2],
                      corruption_da = [0.1,  0.1, 0.1],
                      dA_initiall = True ,
                      error_known = True )
    
    gather.finetuning()
      
    knn_result = knn(dataset,available_mask)
    #########run the result for test
    dd_mask=test_mask
    dd = test_set
    
    b_error.append(sum((1-dd_mask)*((dd-gather.gather_out())**2), axis=1).mean())
    mean_error.append(sum((1-available_mask)*((dataset-dataset.mean(axis=0))**2), axis=1).mean())
    knn_error.append(sum((1-available_mask)*((dataset-knn_result)**2), axis=1).mean())
    plot(mis,b_error[-1],'ro')
    plot(mis,mean_error[-1],'bo')
    plot(mis,knn_error[-1],'g*')

    #### SDA with corruption in training
    train_mask =  rest_mask[:percent_valid]
        
    data= (train_set*train_mask, valid_set *valid_mask ,test_set *test_mask)
Exemplo n.º 60
0
                attrtable_test = attrselect.attrtable(test_table, attrlst)
                
                
                #test_table = projections.projections(uxvaltables[s]['test'])
                pcaT_all,tmp1 = tiles.tiles(table, numdim, outfile0)  #pca projection and clustering
                pcaT_info, tmp2 = tiles.tiles(attrtable_train, numdim, outfile0)
                #fastT_all = tiles.tilesv2(table, numdim, outfile0)
                fastT_info, tmp3 = tiles.tilesv2(attrtable_train, numdim, outfile0)     
                
                pcaT_all_cen += [tmp1]; pcaT_info_cen += [tmp2]; fastT_info_cen += [tmp3]

                #tablestr.tableprint(centroid_attr)
                tables = reader.klasses(table)
                kt = 1; mt = 2
                acc1, f1, prec1, pd1 = nb.nb(xvaltables[s]['test'], xvaltables[s]['train'], tables['names'], kt, mt)
                acc2, f2, prec2, pd2 = knn.knn(uxvaltables[s]['test'], uxvaltables[s]['train'], kn)
                # PCA Infer Methods
                acc3, f3, prec3, pd3 = newknn.knn(test_table, pcaT_all[0], kn, threshold)
                acc4, f4, prec4, pd4 = newknn.knn(attrtable_test, pcaT_info[0], kn, threshold)

                # Fastmap Infer Methods
                #acc5, f5, prec5, pd5 = newknn.knn(test_table, fastT_all[0], kn, threshold)
                acc6, f6, prec6, pd6 = newknn.knn(attrtable_test, fastT_info[0], kn, threshold)
                acc_pcaT_all += [acc3]
                acc_pcaT_infogain += [acc4]
                acc_nb += [acc1]
                acc_knn += [acc2]
                #acc_fastT_all += [acc5]
                acc_fastT_infogain += [acc6]
                break  # evaluate a single fold only