def main():
    """Run the MLKR projection on the '_LDA' data split and evaluate KNN on it.

    The data comes from ``loadDataDivided``, is passed through ``runMLKR`` and
    the projected train/test sets are handed to ``KNN.runKNN`` together with
    the neighbour counts 2..16 and the Euclidean metric.
    """
    X_train, X_test, y_train, y_test = loadDataDivided(
        ifSubDir=False, ifScale=True, suffix='_LDA')
    X_train_proj, X_test_proj = runMLKR(X_train, X_test, y_train, y_test)

    # Neighbour counts to sweep: every integer from 2 up to and including 16.
    neighbour_counts = list(range(2, 17))
    KNN.runKNN(X_train_proj, X_test_proj, y_train, y_test, neighbour_counts,
               metric='euclidean', metric_params=None,
               label='_MLKR_euclidean')
示例#2
0
def TestShapeAccuracy(train_images, train_labels, test_images, test_labels,
                      neigh, percentageTrain):
    """Score a KNN built from the leading part of the training data.

    Only the first ``percentageTrain`` percent of the training samples (by
    position) are given to ``KNN``; the predictions for ``test_images`` using
    ``neigh`` neighbours are then scored with ``Get_shape_accuracy``.

    Returns:
        Whatever ``Get_shape_accuracy(predictions, test_labels)`` returns.
    """
    cutoff = int(train_labels.shape[0] * percentageTrain / 100)
    classifier = KNN(train_images[:cutoff], train_labels[:cutoff])
    predictions = classifier.predict(test_images, neigh)
    return Get_shape_accuracy(predictions, test_labels)
def mix_up():
    """creating a hybrid model mixing ML algorithms and neural net , it accumulates the errors from individual algorithms 
	and it increases the error in neural net so much that the model is not flexible enough to decide the trend in market
	or find patterns in data , model2 removes that redundant error ."""
    ind = 0
    for i in bar(xrange(len(x))):
        b_pred, b_y = bayes.naive_bayes_model(x[i], net=True)
        s_pred, s_y = SVM.svm_model(x[i], net=True)
        k_pred, k_y = KNN.knn_algo_model(x[i], net=True)
        print b_pred, b_y
        mix.new_net(s_pred, b_pred, k_pred, s_y, x[i])
    ind = 0
    report = pd.DataFrame(index=range(0),
                          columns=[
                              'Stock Name', 'accuracy', 'profit count',
                              'loss count', 'total no of rise',
                              'total number of loss'
                          ])
    for i in bar(xrange(len(x))):
        b_pred, b_y = bayes.naive_bayes_model(x[i], net=True, actual=True)
        s_pred, s_y = SVM.svm_model(x[i], net=True, actual=True)
        k_pred, k_y = KNN.knn_algo_model(x[i], net=True, actual=True)
        p_count, total_count_p, l_count, total_count_l, accuracy = mix.new_net(
            s_pred, b_pred, k_pred, s_y, x[i], create=False)
        report.loc[ind] = [
            x[i], accuracy, p_count, l_count, total_count_p, total_count_l
        ]
        ind = ind + 1
    print "Mean accuracy----------", report['accuracy'].mean()
    report.to_csv("./report/mix_result.csv")
示例#4
0
    def __init__(self, symb, predlen, cat='RL', kwargs=None):
        """Store the configuration and build the learner selected by ``cat``.

        Args:
            symb: stored unchanged on ``self.symb``.
            predlen: stored unchanged on ``self.predlen``.
            cat: learner category. 'RF', 'KNN', 'SVM' and 'NN' are handled
                here; any other value (including the default 'RL') leaves
                ``self.learner`` unset by this method.
            kwargs: optional dict of keyword arguments forwarded to the
                learner constructor; ``None`` means "use its defaults".
        """
        self.symb = symb
        self.predlen = predlen
        self.kwargs = kwargs
        self.cat = cat

        # Pick the constructor lazily so only the selected module is touched.
        if cat == 'RF':
            factory = RF.RandomForest
        elif cat == 'KNN':
            factory = KNN.KNN
        elif cat == 'SVM':
            factory = SVM.SVM
        elif cat == 'NN':
            factory = NN.NN
        else:
            factory = None

        if factory is not None:
            self.learner = factory() if kwargs is None else factory(**kwargs)
示例#5
0
def crossvalidation(userManager, artistManager, folders):
	"""split data into folders and validate the performance"""
	userIDs = userManager.keys()
	userFolders = {}
	for i in range(folders):
		userFolders[i] = []
	for userID in userIDs:
		i = random.randrange(folders)
		userFolders[i].append(userID)
	for f in range(folders):
		testUserSet, testUserIDList, testUserMostFavourite = splitTrainSet(userManager, 1.0/folders, userFolders[f])
		knn = KNN(6)
		knn.training(userManager, artistManager)
		rightNum = 0
		totalNum = len(testUserIDList)
		for i in range(len(testUserIDList)):
			print i, totalNum,
			favOfOne = knn.testing(testUserSet[testUserIDList[i]], userManager, artistManager)
			print testUserIDList[i], testUserMostFavourite[testUserIDList[i]].keys()[0], favOfOne
			if favOfOne == testUserMostFavourite[testUserIDList[i]].keys()[0]:
				rightNum += 1
		print "Folder", f, ":"
		print "Total:", totalNum
		print float(rightNum)/len(testUserIDList)
		for i in range(len(testUserIDList)):
			userManager[testUserIDList[i]] = testUserSet[testUserIDList[i]]
示例#6
0
def handwritingClassTest():
    # 训练
    hwLabels = []
    trainingFileList = listdir('trainingDigits/')
    m = len(trainingFileList)
    trainingMat = zeros((m, 1024))
    for i in range(m):
        fileNameStr = trainingFileList[i]
        fileStr = fileNameStr.split('.')[0]
        classNumStr = int(fileStr.split('_')[0])
        hwLabels.append(classNumStr)
        trainingMat[i,:] = KNN.img2vector('trainingDigits/%s' % fileNameStr)
    
    print "Training done."

    # 测试
    testFileList = listdir('testDigits')
    errorCount = 0.0
    mTest = len(testFileList)
    for i in range(mTest):
        fileNameStr = testFileList[i]
        fileStr = fileNameStr.split('.')[0]
        classNumStr = int(fileStr.split('_')[0])
        vectorUnderTest = KNN.img2vector('testDigits/%s' % fileNameStr)
        classifierResult = KNN.classify0(vectorUnderTest, trainingMat, hwLabels, 3)

        if (classifierResult != classNumStr):
            print "[%s] the classifier came back with: %d, the real answer is: %d" % (fileStr, classifierResult, classNumStr)
            errorCount += 1.0

    print "\nthe total number of errors is: %d" % errorCount
    print "\nthe total error rate is: %f" % (errorCount/float(mTest))
示例#7
0
def main():
    """Sweep the LMNN ``k`` setting and evaluate KNN on every projection.

    For each LMNN value the '_LDA' data split is projected with ``runLMNN``
    and ``KNN.runKNN`` is run with neighbour counts 2..16 and a label that
    records the LMNN value.
    """
    X_train, X_test, y_train, y_test = loadDataDivided(
        ifSubDir=False, ifScale=True, suffix='_LDA')

    knn_neighbour_counts = list(range(2, 17))
    # NOTE(review): 7 is absent from the LMNN sweep in the original list;
    # kept as-is -- confirm whether that omission is intentional.
    lmnn_settings = [value for value in range(2, 17) if value != 7]

    for lmnn_k in lmnn_settings:
        X_train_proj, X_test_proj = runLMNN(X_train, X_test, y_train, y_test, lmnn_k)
        KNN.runKNN(X_train_proj, X_test_proj, y_train, y_test,
                   knn_neighbour_counts, metric='euclidean',
                   metric_params=None,
                   label='_LMNN_euclidean_k=' + str(lmnn_k))
示例#8
0
 def test_fit(self):
     """Check ``KNN.predict`` against the stored expected classes of every case."""
     cases = self.test_cases
     for ix, (train_imgs, train_labels) in enumerate(cases['input']):
         knn = KNN(train_imgs, train_labels)
         query = cases['test_input'][ix][0]
         preds = knn.predict(query, cases['rnd_K'][ix])
         expected = cases['get_class'][ix]
         np.testing.assert_array_equal(preds, expected)
示例#9
0
 def test_get_k_neighbours(self):
     """Check that ``get_k_neighbours`` leaves the expected ``knn.neighbors`` for every case."""
     cases = self.test_cases
     for ix, (train_imgs, train_labels) in enumerate(cases['input']):
         knn = KNN(train_imgs, train_labels)
         query = cases['test_input'][ix][0]
         knn.get_k_neighbours(query, cases['rnd_K'][ix])
         expected = cases['get_k_neig'][ix]
         np.testing.assert_array_equal(knn.neighbors, expected)
示例#10
0
def _parse_number(text, cast):
    """Return ``cast(text)``, or ``None`` when ``text`` is not a valid number."""
    try:
        return cast(text)
    except ValueError:
        return None


def run():
    """Main loop: read and process user input until the user types 'bye'.

    Each round asks for a classifier, a dataset code and a training split,
    then runs the chosen classifier. Every answer is validated before any
    data is loaded: an unknown classifier, an unknown dataset code or a
    non-numeric number no longer crashes the loop or loads a dataset from an
    empty file name -- the round is reported as invalid and the loop starts
    over.
    """
    # The backslash in the rabbit's ears is escaped; a bare "\_" is an
    # invalid escape sequence. The printed text is unchanged.
    print(
        '''Hi there! My name is Mr. Rabbits!                        (\\_/)           
Welcome to Mr. Rabbits' Machine Learning Adventure!      (^.^)
Today we will be exploring the difference between       c(> <)
Naive Bayes classification and k-nearest neighbors.
There are two datasets to choose from: Fisher's Iris flower data set or ________.'''
    )
    classifiers = ('knn', 'naive bayes', 'bagging')
    dataset_files = {'FI': 'iris.csv', 'BC': 'wdbc_clean.csv'}

    while True:
        info = input(
            '''Please let me know which classifier you would like to explore:
(type 'knn' or 'naive bayes' or 'bagging' or 'bye' to exit).\n''')
        if info == 'bye':
            print('Goodbye! Bring me a carrot next time! :3"')
            return
        print(
            "Which dataset will you be exploring today? Fisher's iris flower dataset or Wisconsin breast cancer diagnostics?"
        )
        dataset = input("Type 'FI' or 'BC'\n")

        split = _parse_number(
            input(
                "What % of the dataset should be split into the training set? (type a value from 0 to 100)\n"
            ), float)

        filename = dataset_files.get(dataset)
        invalid = info not in classifiers or filename is None or split is None

        if not invalid:
            trainSet = []
            testSet = []
            createDataset(filename, trainSet, testSet, split / 100)

            if info == 'knn':
                k = _parse_number(
                    input("What value should k be? (# of nearest neighbors)\n"),
                    int)
                if k is None:
                    invalid = True
                else:
                    KNN.run(trainSet, testSet, k)
            elif info == 'naive bayes':
                NaiveBayes.run(trainingSet=trainSet, testSet=testSet)
            else:  # 'bagging'
                k = _parse_number(
                    input("What value should k be? (# of nearest neighbors)\n"),
                    int)
                bagSize = _parse_number(
                    input("How big should the bags be?\n"), int)
                bagNum = _parse_number(
                    input("How many bags should I use?\n"), int)
                if k is None or bagSize is None or bagNum is None:
                    invalid = True
                else:
                    bagging(k, trainSet, testSet, bagSize, bagNum)

        if invalid:
            print(
                "Oops! There was some invalid input somewhere along the way.")
            print("Let's start from the top again.\n")
        else:
            print("Wow! That was fun. Let's do it again.\n")
示例#11
0
def main():
    """Classify every test sample with KNN (k=100) and print accuracy, precision and recall.

    Training data comes from ``KNN.Load_Train_Data``; the test samples are
    read from ``./low_Dim_Data_test.csv`` and their expected labels from
    ``./_names_test.csv``. Labels are compared against 1.0 (positive) and
    0.0 (negative). A ratio whose denominator is zero is printed as ``nan``
    instead of raising ``ZeroDivisionError``.
    """
    def ratio(numerator, denominator):
        # Empty input or "no positives" must not crash the report.
        return numerator / denominator if denominator else float('nan')

    dataSet, labels = KNN.Load_Train_Data()
    print("Newgroup;", dataSet)
    print(dataSet.shape)
    print("Newlabels;", labels)
    print(labels.shape)

    # Read through context managers so both file handles are closed.
    with open("./low_Dim_Data_test.csv") as data_file:
        dataIn = np.loadtxt(data_file, delimiter=",", skiprows=0)
    with open("./_names_test.csv") as label_file:
        val = np.loadtxt(label_file, delimiter=",", skiprows=0)
    k = 100

    total = len(dataIn)
    wrong = 0
    actual_positives = 0
    false_negatives = 0  # expected 1, predicted 0
    false_positives = 0  # expected 0, predicted 1
    for i in range(total):
        dataOut = KNN.classify(dataIn[i], dataSet, labels, k)
        if dataOut != val[i]:
            wrong += 1
        if val[i] == 1.0:
            actual_positives += 1
            if dataOut == 0.0:
                false_negatives += 1
        elif val[i] == 0.0 and dataOut == 1.0:
            false_positives += 1

    true_positives = actual_positives - false_negatives
    print(np.sum(val == 0.0))
    print(np.sum(val == 1.0))
    print("准确率:", ratio(total - wrong, total))  # accuracy
    print("精确率:", ratio(true_positives, true_positives + false_positives))  # precision
    print("召回率:", ratio(true_positives, actual_positives))  # recall
示例#12
0
def main():
    """Build the samples from the file named in ``sys.argv[1]`` and run KNN with k=3.

    Reads the module-level ``trainSet``, ``testSet`` and ``bootstrap``
    (presumably filled in by ``generateTrainTestSample`` and
    ``bootstrapping`` -- confirm against their definitions) and passes
    ``bootstrap[1]`` and ``testSet`` to ``KNN.main``.
    """
    generateTrainTestSample(sys.argv[1])
    bootstrapping(trainSet)
    KNN.main(bootstrap[1], testSet, 3)
def test_simple():
    data_set, labels = KNN.create_data_set()

    test1 = array([1.2, 1.0])
    test2 = array([0.1, 0.3])
    k = 3
    output_label1 = KNN.knn_classify(test1, data_set, labels, k)
    output_label2 = KNN.knn_classify(test2, data_set, labels, k)
    print test1, output_label1
    print test2, output_label2
def test_simple():
    data_set, labels = KNN.create_data_set()

    test1 = array([1.2, 1.0])
    test2 = array([0.1, 0.3])
    k = 3
    output_label1 = KNN.knn_classify(test1, data_set, labels, k)
    output_label2 = KNN.knn_classify(test2, data_set, labels, k)
    print test1, output_label1
    print test2, output_label2
示例#15
0
def plotwithlable():
    """Show three stacked scatter plots of the normalised dating data, coloured by label.

    The data is read from 'datingTestSet2.txt' and normalised with
    ``KNN.autoNorm``. Samples are grouped as label 1, label 2 and
    "everything else"; the three panels plot column 0 vs 1, column 0 vs 2
    and column 1 vs 2.
    """
    legend_names = ["Did Not Like", "Liked in Small Doses", "Liked in Large Doses"]
    marker_styles = [(20, 'red'), (30, 'green'), (50, 'blue')]

    def draw(axes, horizontal, vertical):
        # One scatter call per label group, then the shared legend.
        handles = []
        for xs, ys, (size, colour) in zip(horizontal, vertical, marker_styles):
            handles.append(axes.scatter(xs, ys, s=size, c=colour))
        axes.legend(handles, list(legend_names), loc=2)

    datingDataMat, datingLables = KNN.file2matrix('datingTestSet2.txt')
    normDataMat, _ranges, _min_vals = KNN.autoNorm(datingDataMat)

    fig = plt.figure()
    ax = fig.add_subplot(311)  # first of three rows in a single column

    # columns[c][g] collects feature column c for label group g
    # (g = 0: label 1, g = 1: label 2, g = 2: any other label).
    columns = [[[] for _ in range(3)] for _ in range(3)]
    for index, value in enumerate(datingLables):
        if value == 1:
            group = 0
        elif value == 2:
            group = 1
        else:
            group = 2
        for feature in range(3):
            columns[feature][group].append(normDataMat[index, feature])
    xcords, ycords, zcords = columns

    draw(ax, xcords, ycords)

    ax2 = fig.add_subplot(312)
    draw(ax2, xcords, zcords)
    # plt.xlabel / plt.ylabel act on the current axes, i.e. the one just added.
    plt.xlabel("Frequent Flyier Miles Earned Per Year")
    plt.ylabel("Liters of Ice Cream Consumed Per Week")

    ax3 = fig.add_subplot(313)
    draw(ax3, ycords, zcords)
    plt.xlabel("Percentage of Body Covered By Tatoos")
    plt.ylabel("Liters of Ice Cream Consumed Per Week")
    plt.show()
示例#16
0
 def match_query(self):
     """Match the selected query image against the training set and display the result.

     Reads ``self.filename`` as a grayscale image, finds the best match via
     ``KNN``, shows the prediction on ``self.model_prediction_value``, draws
     the first 20 keypoint matches, saves that drawing to '../output.jpg'
     and shows a 250x250 copy of it on ``self.match_canvas``.

     Raises:
         IOError: if the match image cannot be written to disk.
     """
     query = cv.imread(self.filename, 0)
     query_instance = KNN.creates_query_instance(query, '')
     match = KNN.finds_best_match(query_instance, self.training_set)
     self.model_prediction_value.configure(text=match.prediction)
     match_image = cv.drawMatches(query, match.keypoints, match.best_match.image,
                                  match.best_match.keypoints, match.k_matches[:20], None, flags=2)
     output_path = '../output.jpg'
     # imwrite signals failure through its return value; without this check
     # a stale or missing file would be displayed instead.
     if not cv.imwrite(output_path, match_image):
         raise IOError('could not write match image to ' + output_path)
     # Image.ANTIALIAS was removed in Pillow 10; LANCZOS is the same filter.
     with Image.open(output_path) as saved_image:
         match_image_display = saved_image.resize((250, 250), Image.LANCZOS)
     # Keep a reference on self so Tk does not garbage-collect the photo.
     self.image_match = ImageTk.PhotoImage(match_image_display)
     self.match_canvas.create_image(20, 20, anchor=NW, image=self.image_match)
示例#17
0
def classifyPerson():
    print "输入相关信息"
    resultList = ['一点不喜欢','有点希望','可能性很大']
    percentTats = float(raw_input("玩游戏时间数目?"))
    ffMiles = float(raw_input("旅游公路数?"))
    ice = float(raw_input("冰淇淋消耗量?"))
    datingDataMat,datingLabels = KNN.file2matrix('datingTestSet2.txt')
    normMat,ranges,minVals = KNN.autoNorm(datingDataMat)
    inArr = np.array([ffMiles,percentTats,ice])
    classfierRt = KNN.classify0((inArr-minVals)/ranges,normMat,datingLabels,3)
    print resultList[classfierRt - 1]
    PrintFigure(normMat, datingLabels)
示例#18
0
def datingClassTest():
    hoRatio = 0.10
    datingdataMat, datingLabels = KNN.file2matrix('datingTestSet2.txt')
    normMat, ranges, minVals = KNN.autoNorm(datingdataMat)
    m = normMat.shape[0]
    numTestVecs = int(m*hoRatio)
    errorCount = 0.0
    for i in range(numTestVecs):
        classifierResult = KNN.classify0(normMat[i,:], normMat[numTestVecs:m,:], datingLabels[numTestVecs:m], 3)
        print "the classifier came back with: %d, the real answer is: %d" % (classifierResult, datingLabels[i])
        if (classifierResult != datingLabels[i]):
            errorCount += 1.0
    print "tht total error rate is: %f" % (errorCount/float(numTestVecs))
示例#19
0
def testUser(testUserID):
    """Predict the favourite artist of one user and return an HTML summary.

    Args:
        testUserID: ID looked up in ``UserManager``.

    Returns:
        An error string when the user is unknown; otherwise a string naming
        the user's real most-listened artist and the predicted one, with
        newlines replaced by "</br>".

    The user's entry in ``TrainUserManager`` is restored in a ``finally``
    block, so a failure during training or testing no longer leaves the
    shared training data modified.
    """
    if not UserManager.has_key(testUserID):
        return "don't has user with userID = "+str(testUserID)
    testUserSet, testUserIDList = splitTrainSetWithoutRemoving(TrainUserManager, 0, [testUserID])
    try:
        knn = KNN(40)
        knn.training(TrainUserManager, ArtistManager)
        favOfOne, allArtist, allTag = knn.testing(testUserSet[testUserID], UserManager, ArtistManager, True)
        realfavOfOne = UserManager[testUserID].getMostFav().keys()[0]
        ret = "The most listen artist:\n"+str(ArtistManager[realfavOfOne])+"\n"
        ret += "The artist we predict:\n"+str(ArtistManager[favOfOne])
        return ret.replace("\n","</br>")
    finally:
        # Restore the (possibly modified) TrainUserManager entry.
        TrainUserManager[testUserID]=testUserSet[testUserID]
示例#20
0
 def train_model(self):
     """Re-draw training/test splits until precision reaches 0.83, then publish the metrics.

     Each round generates a fresh split from the ABT, evaluates it with
     ``KNN.test_model`` / ``KNN.compute_metrics`` and prints the result. The
     accepted split is stored on ``self`` and the precision and recall are
     shown as whole percentages on their widgets.

     NOTE(review): there is no cap on the number of rounds; the loop only
     ends once a split reaches the required precision.
     """
     ABT = DataPrep.populates_ABT()
     print(ABT, len(ABT.ABT))
     required_precision = 0.83
     while True:
         training_set, test_set = DataPrep.generates_training_test_sets(ABT.ABT)
         prediction_set = KNN.test_model(test_set, training_set)
         precision, recall = KNN.compute_metrics(prediction_set)
         print('run precision: ', precision, 'run recall: ', recall)
         if not precision < required_precision:
             break
     self.training_set = training_set
     self.test_set = test_set
     self.precision = "{}%".format(int(precision*100))
     self.recall = "{}%".format(int(recall*100))
     self.precision_value.configure(text=self.precision)
     self.recall_value.configure(text=self.recall)
示例#21
0
def topNRecs():
    """Run the KNN lookup for the values typed into the GUI and dump the results.

    Reads the CSV file name from ``entry`` and the selection from ``entry2``.
    The selection holds the neighbour count and the selected point; it may be
    two whitespace- or comma-separated integers (e.g. "10 3"). The original
    fixed-position form (digits at index 0 and 2) is still accepted.

    Writes the neighbours to 'output.txt' and the CSV rows (as dicts) to
    'dict.txt', then adds a status label and a hint label to the canvas.

    Raises:
        ValueError, IndexError: if the selection cannot be parsed.
    """
    #Get Entries
    csv_file_name = entry.get()
    selection = entry2.get()

    #User selections: prefer separated integers so multi-digit values work,
    #fall back to the original single-character positions otherwise.
    try:
        numNeighbors, selectedPoint = (
            int(field) for field in selection.replace(',', ' ').split()[:2])
    except ValueError:
        numNeighbors = int(selection[0])
        selectedPoint = int(selection[2])

    #Call KNN class and get the neighbors
    knnClient = KNN(csv_file_name, selectedPoint, numNeighbors)
    list_of_Neighbors = knnClient.driver()

    #Catch bad input from a bad csv
    if list_of_Neighbors is None:
        out = "Warning: No Neighbors. Edit the CSV or try again."
        # Nothing to write; previously iterating None crashed right here.
        list_of_Neighbors = []
    else:
        out = "Results are in the txt file"

    #Dump the results to a file
    with open('output.txt', 'w') as f:
        for neighbor in list_of_Neighbors:
            f.write("%s\n" % str(neighbor))

    #Transform the csv into a list of dicts (file is closed afterwards)
    with open(csv_file_name) as csv_source:
        dict_list = list(csv.DictReader(csv_source))

    #Store the csv in a txt file with appropriate headers for better analysis
    with open('dict.txt', 'w') as d:
        for dictonary in dict_list:
            d.write("%s\n" % str(dictonary))

    #Output windows
    title = tk.Label(root, text=out, font=('helvetica', 10))
    canvas.create_window(220, 80, window=title)

    #More output windows suggesting the user try another csv file
    ans = tk.Label(root,
                   text="Try another txt file below!",
                   font=('helvetica', 15))
    canvas.create_window(250, 300, window=ans)
def calculateTestError():
    """Print the KNN test error and the matching accuracy (100 minus the error).

    The error comes from ``KNN.getTestError`` applied to the module-level
    ``resultKNN`` and ``KNN.actualLabel``.
    """
    error_value = KNN.getTestError(resultKNN, KNN.actualLabel)
    print("****Test Error*****")
    print(error_value)
    print("*******Accuracy********")
    print(100 - error_value)
示例#23
0
    def test_input_knn(self) -> None:
        """Run the diabetes KNN check on the form values and show the report.

        Fields ``l1``..``l5`` are converted to floats and passed to
        ``KNN.check_input`` under the keys "B".."F". The window is resized and
        the report widgets are filled with the model details, the values
        entered and the diagnosis (a result of 0 means "no diabetes").
        """
        feature_fields = (("B", self.l1), ("C", self.l2), ("D", self.l3),
                          ("E", self.l4), ("F", self.l5))
        my_dict = {key: float(field.text()) for key, field in feature_fields}
        knn_output = KNN.check_input(my_dict)

        self.setFixedSize(850, 342)
        self.report_subhead.setText("Reports")
        self.model_details.setText(
            "K-Nearest Neighbours classifier used.\n"
            "Accuracy of model: 81.16%\n"
            "We have used PIMA Indians diabetes dataset."
        )

        details_template = ("Patient's name: {}\n"
                            "Plasma glucose concentration: {} \n"
                            "Diastolic blood pressure: {}\n"
                            "Triceps skin fold thickness: {}\n"
                            "Serum insulin: {}\n"
                            "Body mass index: {}")
        shown_fields = (self.l0, self.l1, self.l2, self.l3, self.l4, self.l5)
        self.details.setText(
            details_template.format(*[field.text() for field in shown_fields]))

        if knn_output == 0:
            verdict = "Diagnosis suggests that patient does not suffers from diabetes."
        else:
            verdict = "Our diagnosis suggests patient does suffer from diabetes.\nPlease get checked soon."
        self.results.setText(verdict)
        self.results.setFont(QFont("Arial", 14, weight=QFont.Bold))
示例#24
0
def handwritingClassTest():
    """Train on 'trainingDigits', classify 'testDigits' with KNN (k=3) and print the errors.

    The label of every file is the integer before the first underscore of
    its name. Each prediction is printed next to the real label, followed by
    the total error count and the error rate.
    """
    # File names of the training directory, one sample per file.
    trainingFileList = listdir('trainingDigits')
    trainingMat = np.zeros((len(trainingFileList), 1024))
    hwLabels = []
    for row, fileNameStr in enumerate(trainingFileList):
        hwLabels.append(int(fileNameStr.split('.')[0].split('_')[0]))
        trainingMat[row, :] = img2vector('trainingDigits/%s' % fileNameStr)

    testFileList = listdir('testDigits')
    errorCount = 0.0
    for fileNameStr in testFileList:
        expected = int(fileNameStr.split('.')[0].split('_')[0])
        vectorUnderTest = img2vector('testDigits/%s' % fileNameStr)
        classifierResult = KNN.classify0(vectorUnderTest, trainingMat,
                                         hwLabels, 3)
        print("the classifier came back with: %d, the real answer is: %d" %
              (classifierResult, expected))
        if classifierResult != expected:
            errorCount += 1.0
    print("the total number of errors is: %d" % errorCount)
    print("the total error rate is: %f" % (errorCount / float(len(testFileList))))
示例#25
0
def handwriteClassfiy(testfile, trainfile, k):
    """Classify the digit image 'x.txt' in ``testfile`` against the samples in ``trainfile``.

    Every file in ``trainfile`` becomes one 1024-wide feature row; its label
    is the text before the first underscore of the file name (kept as a
    string).

    Keyword argument:
    testfile -- directory holding the test image 'x.txt'
    trainfile -- directory holding the training images
    k -- number of neighbours passed to ``KNN.classify``

    Returns the value produced by ``KNN.classify``.
    """
    trainFileList = os.listdir(trainfile)
    trainDataSet = np.zeros((len(trainFileList), 1024))
    labels = []
    for row, filenameStr in enumerate(trainFileList):
        labels.append(filenameStr.split('.')[0].split('_')[0])
        trainDataSet[row, :] = img2vector(trainfile + '/' + filenameStr)

    testdigit = img2vector(testfile + '/' + 'x.txt')
    return KNN.classify(testdigit, trainDataSet, labels, k)
def cross_validation_nn(k,folds_array):
    """Return the cross-validated accuracy of ``KNN.knearest``.

    Each fold in turn is held out as test data while all other folds are
    concatenated into the training data. The number of folds is taken from
    ``len(folds_array)`` rather than being fixed at 10, so any fold count
    works; with exactly 10 folds the result is unchanged.

    Args:
        k: neighbour count passed to ``KNN.knearest``.
        folds_array: sequence of folds; each fold is a list of samples whose
            last element is the expected label.

    Returns:
        Fraction of correctly predicted samples as a float.

    Raises:
        ZeroDivisionError: if the folds contain no samples at all.
    """
    corrects = 0
    incorrects = 0

    fold_count = len(folds_array)
    for held_out in range(fold_count):
        #Separate train and test data
        test_data = folds_array[held_out]
        training_data = []
        for j in range(fold_count):
            if j != held_out:
                training_data.extend(folds_array[j])

        #Predict values and compare with the label stored last in the sample
        for sample in test_data:
            prediction = KNN.knearest(k,training_data,sample,True)
            if prediction == sample[len(sample)-1]:
                corrects = corrects + 1
            else:
                incorrects = incorrects + 1

    return float(corrects)/float(corrects+incorrects)
示例#27
0
文件: LLE.py 项目: philipz1/ML
def lle(data, k = 10, target_dim = 2):
	"""Embed ``data`` into ``target_dim`` dimensions from k-nearest-neighbour weights.

	For every key of the neighbour graph returned by ``KNN.knn`` a weight
	vector is computed from the inverse of a local Gram matrix and normalised
	to sum to one. The weights are assembled by ``reconstruct`` and the
	embedding is read from the eigenvectors of ``(I - W)^T (I - W)``.

	Args:
		data: 2-D array; ``data.shape[1]`` is used as the feature count.
		k: neighbour count passed to ``KNN.knn`` and the helper functions.
		target_dim: number of eigenvector columns to return.

	Returns:
		Columns ``1 .. target_dim`` of the eigenvector matrix ordered by
		ascending eigenvalue (column 0 is skipped).
	"""
	p = data.shape[1]
	# NOTE(review): graph is used as a mapping whose keys each hold p values;
	# presumably point -> neighbours -- confirm against KNN.knn.
	graph = KNN.knn(data, k)
 	
	n = len(graph.keys())
	weights_vec = np.zeros((n,k))
	weights_dict = {}
	# NOTE(review): locals_ is allocated but not used again in this function.
	locals_ = np.zeros((n,k))
	for i, key in enumerate(list(graph.keys())):
		# presumably the k neighbours of ``key`` as rows -- confirm in construct_knn_vector
		local = construct_knn_vector(key, graph, k)
		# Subtract the key (repeated k times) from every row of ``local``.
		local_centered = local - np.repeat(np.array(key).reshape([1, p]), k, axis = 0)
		gram = do_gram(local_centered, k)

		# Multiply the inverse Gram matrix by a vector of ones, then
		# normalise so that the weights sum to 1.
		w_num = np.dot(np.linalg.inv(gram), np.ones(gram.shape[0]).T)
		w = w_num / w_num.sum()
		weights_vec[i] = w

		# Map every row of ``local`` (as a tuple) to its weight.
		temp_dict = {}
		for q in range(len(local)):
			temp_dict[tuple(local[q])] = w[q]
		weights_dict[tuple(key)] = temp_dict

	# presumably builds an n x n weight matrix (it is subtracted from identity(n) below) -- confirm in reconstruct
	weights = reconstruct(data, weights_dict)

	M = np.dot((np.identity(n) - weights).T, (np.identity(n) - weights))
	eigvals, eigvecs = np.linalg.eigh(M)

	# The [::1] slice is a no-op, so the order stays ascending by eigenvalue.
	index = np.argsort(eigvals)[::1]
	eigvals = eigvals[index]
	eigvecs = eigvecs[:,index]

	# Skip column 0 and keep the next target_dim columns.
	return eigvecs[:,1:target_dim + 1]
示例#28
0
def handwritingClassTest():
    hwLabel = []
    trainingFileList = listdir('trainingDigits')  # 获取目录内容 ,type(list)
    m = len(trainingFileList)

    trainingMat = zeros((m, 1024))
    for i in range(m):
        filenameStr = trainingFileList[i]
        filename = filenameStr.split('.')[0]
        classNum = int(filename.split('_')[0])
        hwLabel.append(classNum)  # 从文件名中解析分类数字
        trainingMat[i, :] = img2vector('trainingDigits/%s' % filenameStr)

    testFileList = listdir('testDigits')
    m_test = len(testFileList)
    error_count = 0.0
    for j in range(m_test):
        test_filenameStr = testFileList[j]
        test_filename = test_filenameStr.split('.')[0]
        test_ClassNum = int(test_filename.split('_')[0])  # 通过文件名获取实际的数字编号
        classfierResult = KNN.classify(
            img2vector('testDigits/%s' % test_filenameStr), trainingMat,
            hwLabel, 4)  # 通过KNN算法得到的编号

        print 'the classfier came back with: %d, the realnum came back with %d' % (
            classfierResult, test_ClassNum)
        if classfierResult != test_ClassNum:
            error_count += 1
    print 'the total error num: %d' % error_count
    print 'the total error rate is :%f' % (error_count / m_test)
def hand_writing_class_test():
    """Build the training samples, classify the test digits (k=3) and print the accuracy.

    Labels are the text before the first underscore of each file name
    (e.g. '5_135.txt' -> '5') and are compared as strings.

    :return: None
    """
    # Training samples: one 1024-wide row per file plus its label.
    train_dir = './data/trainingDigits/'
    train_names = os.listdir(train_dir)
    training_mat = np.zeros((len(train_names), 1024))
    hw_labels = []
    for row, file_name in enumerate(train_names):
        hw_labels.append(file_name.split('_')[0])
        training_mat[row, :] = img_2_vector(train_dir + file_name)

    # Read the test samples and classify each of them.
    test_dir = './data/testDigits/'
    test_names = os.listdir(test_dir)
    hits = 0
    for file_name in test_names:
        label = file_name.split('_')[0]
        sample = img_2_vector(test_dir + file_name)
        pre_result = KNN.classify(sample, training_mat, hw_labels, k=3)
        print("the classifier came back with: %s, the real answer is: %s" % (pre_result, label))
        if pre_result == label:
            hits += 1.0
    # Accuracy noted by the original author: 0.988372
    print('正确率:%f' % (hits / len(test_names)))
示例#30
0
 def recognizer(self, imgPath):
     """Classify every vector extracted from ``imgPath`` and return the results joined into one string.

     Each vector from ``img2Vector`` is classified with ``KNN.classify0``
     using ``self.trainingMat``, ``self.labels`` and ``self.k``.
     """
     names = [
         KNN.classify0(vector, self.trainingMat, self.labels, self.k)
         for vector in img2Vector(imgPath)
     ]
     return ''.join(names)
示例#31
0
def get_training_error(train, k):
    """Count the training rows that the KNN built on ``train`` mispredicts.

    A row counts as an error when ``predict(row, k)`` differs from the row's
    first element.
    """
    model = knn.KNN(train)
    return sum(1 for idx in range(len(train))
               if model.predict(train[idx], k) != train[idx][0])
示例#32
0
def get_test_error(train, test, k):
    """Count the rows of ``test`` that the KNN built on ``train`` mispredicts.

    A row counts as an error when ``predict(row, k)`` differs from the row's
    first element.
    """
    model = knn.KNN(train)
    return sum(1 for idx in range(len(test))
               if model.predict(test[idx], k) != test[idx][0])
示例#33
0
def datingClassTest():
    '''
    Test the KNN classifier on the dating-site data.

    The first 10% of the normalised rows are used as test samples and the
    remaining rows as training data (k=3). After every test sample the
    prediction, the error rate so far (relative to the full test-set size)
    and the error count are printed.

    :return: number of misclassified test samples (a float count)
    '''
    # Fraction of the data used for testing (training share = 1 - hoRatio).
    hoRatio = 0.1
    # Load the data from file.
    datingDataMat, datingLabels = fileParse.file2matrix('./datingTestSet.txt')
    # Normalise the data.
    normMat, ranges, miuVals = fileParse.autoNorm(datingDataMat)
    # m is the number of rows, i.e. the first dimension of the matrix.
    m = normMat.shape[0]
    # Number of test samples; rows numTestVecs..m are the training samples.
    numTestVecs = int(m * hoRatio)

    print('numTestVecs =', numTestVecs)

    errorCount = 0.0

    for i in range(numTestVecs):
        # Classify one test row against the training rows.
        classifierResult = KNN.classify0(normMat[i, :],
                                         normMat[numTestVecs:m, :],
                                         datingLabels[numTestVecs:m], 3)
        print('||' * 40)
        print('The clssifier came back with:%d, the real answer is:%d' %
              (classifierResult, datingLabels[i]))

        if (classifierResult != datingLabels[i]):
            errorCount += 1.0

        # Running tally, printed once per test sample.
        print('the total error rate is:%f' % (errorCount / float(numTestVecs)))
        print('errorCount is:', errorCount)
        print()

    # The documented contract promises the error count; return it.
    return errorCount
示例#34
0
def handwritingClassTest():
    hwLabels = []  #s手写数字的标签
    trainingFileList = os.listdir('trainingDigits')  #文件夹中的文件名 获取目录的内容
    m = len(trainingFileList)  #统计一共有多少个训练
    trainingMat = zeros((m, 1024))
    #从文件名解析分类数字 开始
    for i in range(m):
        fileNameStr = trainingFileList[i]
        fileStr = fileNameStr.split('.')[0]  #用'.'号分隔然后取第一个元素
        classNumStr = int(fileStr.split('_')[0])  #用'_'分隔然后取第一个元素
        hwLabels.append(classNumStr)  #将数字标签存入数组
        trainingMat[i, :] = img2vector('trainingDigits/%s' % fileNameStr)
    testFileList = os.listdir('testDigits')  #获得测试数据
    errorCount = 0.0
    mTest = len(testFileList)  #一共有多少测试数据
    for i in range(mTest):
        fileNameStr = testFileList[i]
        fileStr = fileNameStr.split('.')[0]
        classNumStr = int(fileStr.split('_')[0])
        vectorUnderTest = img2vector('testDigits/%s' % fileNameStr)
        classifierResualt = KNN.classify0(vectorUnderTest, trainingMat,
                                          hwLabels, 30)
        print "the classsifier came back with :%d , the real answer is %d" % (
            classifierResualt, classNumStr)
        if (classifierResualt != classNumStr): errorCount += 1.0
    print "\nthe totle number of error is : %d" % errorCount
    print "\nthe totle error rate is : %d" % (errorCount / float(mTest))
示例#35
0
def camplist():
    """Show the camps that ``KNN.Knn`` selects for the submitted user.

    GET renders the 'camplist.html' form. POST looks up the user's latitude
    and longitude, asks ``KNN.Knn`` for camp ids and renders 'list.html'
    with the matching Camp rows as ``data1``.

    Both queries use bound parameters instead of string concatenation, the
    connection is always closed, and an unknown or non-numeric user id (or
    an empty id list) yields an empty ``data1`` instead of an exception.
    """
    if request.method != 'POST':
        return render_template('camplist.html')

    try:
        # The form value is untrusted text; ids are numeric in the query.
        user_id = int(request.form['userid'])
    except ValueError:
        return render_template('list.html', data1=[])

    conn = sqlite3.connect(db_path)
    try:
        cur = conn.cursor()
        cur.execute("select Latitude,Longitude from User where Id=?",
                    (user_id,))
        rows = cur.fetchall()
        if not rows:
            return render_template('list.html', data1=[])

        # As before, the last returned row wins if there are several.
        lat = float(rows[-1][0])
        lng = float(rows[-1][1])
        print(lat)
        print(lng)

        # Unwrap NumPy scalars (if any) so sqlite3 can bind the ids.
        campid = [
            value.item() if hasattr(value, 'item') else value
            for value in KNN.Knn(lat, lng)
        ]
        print("In app.py")
        print(campid)

        if campid:
            placeholders = ",".join("?" * len(campid))
            query = "select * from Camp where Id in (" + placeholders + ")"
            print(query)
            cur.execute(query, campid)
            data1 = cur.fetchall()
        else:
            data1 = []
        print(data1)
    finally:
        conn.close()
    return render_template('list.html', data1=data1)
示例#36
0
def handwritingClassTest():
    """Classify every file in ``testDigits`` against ``trainingDigits``.

    The expected label of a file is the integer before the first ``_`` of
    its name (extension stripped).  Each prediction is printed, followed by
    the number of mismatches and the mismatch rate.
    """
    trainNames = listdir("trainingDigits")
    trainCount = len(trainNames)
    samples = np.zeros((trainCount,1024))
    digits = []
    for row, name in enumerate(trainNames):
        digits.append(int(name.split('.')[0].split('_')[0]))
        samples[row,:] = img2vector("trainingDigits/"+name)

    testNames = listdir("testDigits")
    misses = 0.0
    testCount = len(testNames)
    for name in testNames:
        expected = int(name.split('.')[0].split('_')[0])
        candidate = img2vector("testDigits/" + name)
        vote = KNN.classify0(candidate, samples, digits, 3)
        # The predicted label is read from the first element of the first entry.
        print("the classifier came back with: "+str(vote[0][0])+",the real answer is: "+str(expected))
        if vote[0][0] != expected:
            misses += 1.0
    print("\nthe total number od errors is: "+ str(misses))
    print("\nthe total error rate is: ",(misses / float(testCount)))
示例#37
0
def datingClassTest():
    """Hold-out check of ``KNN.classify0`` on the dating data set.

    The first 10% of the normalized rows are classified against the
    remaining rows with k=3; every prediction and the final error rate
    (as a percentage) are printed.  Nothing is returned.
    """
    # File to open; change this to your own path
    filename = "C:/Users/Administrator/Desktop/blog/github/AILearners/data/2.KNN/datingTestSet2.txt"
    # Store the returned feature matrix and label vector in datingDataMat and datingLabels
    datingDataMat, datingLabels = file2matrix(filename)
    # Use ten percent of all the data
    hoRatio = 0.10
    # Normalize the data; the results are bound as normalized matrix, ranges and minimum values
    normMat, ranges, minVals = autoNorm(datingDataMat)
    # Number of rows in normMat
    m = normMat.shape[0]
    # Number of test rows (ten percent)
    numTestVecs = int(m * hoRatio)
    # Count of misclassified rows
    errorCount = 0.0

    for i in range(numTestVecs):
        # The first numTestVecs rows are the test set, the remaining m-numTestVecs rows the training set
        classifierResult = KNN.classify0(normMat[i, :],
                                         normMat[numTestVecs:m, :],
                                         datingLabels[numTestVecs:m], 3)
        # NOTE(review): this parses as a Python 2 print statement whose operand
        # is re-encoded to gb2312; under Python 3 print() returns None and the
        # chained .decode() would raise -- confirm the target interpreter.
        print("分类结果:%d\t真实类别:%d" %
              (classifierResult,
               datingLabels[i])).decode('utf-8').encode('gb2312')
        if classifierResult != datingLabels[i]:
            errorCount += 1.0
    # NOTE(review): divides by numTestVecs, which is 0 when the data set has
    # fewer than ten rows.
    print("错误率:%f%%" % (errorCount / float(numTestVecs) *
                        100)).decode('utf-8').encode('gb2312')
示例#38
0
def datingClassTest():
    """
    Desc:
        Hold-out evaluation of the dating-site classifier: the first 10%
        of the normalized rows are classified (k=3) against the rest.
    parameters:
        none
    return:
        None -- predictions, the error rate and the error count are printed
    """
    # Fraction of the rows used as test samples
    holdOut = 0.1
    features, labelFrame = get_data(filename)
    scaled, ranges, minVals = autoNorm(features)
    scaled = np.array(scaled)
    print('$'*100, scaled, len(scaled))
    # Labels are taken from the first column of the label frame
    labels = labelFrame.iloc[:,0].tolist()
    total = scaled.shape[0]
    testCount = int(total * holdOut)
    print(testCount)
    print('NumTestVecs:', testCount)
    print(scaled[testCount:])

    misses = 0
    for idx in range(testCount):
        truth = labels[idx]
        guess = KNN.classify(scaled[idx], scaled[testCount:], labels[testCount:total], 3)
        print("the classifier came back with: %d, the real answer is: %d" % (guess, truth))
        if guess == truth:
            continue
        misses += 1.0
    print("the total error rate is: %f" % (misses / float(testCount)))
    print(misses)
def main():
    """Tune each entry of ``Review.weights`` by exhaustive search.

    For every weight index, each candidate value in ``weight_range`` is
    installed, 200 ``KNN.guess_review`` calls are scored, and the value
    chosen by ``get_max_weight`` from the (exact-hit count, candidate)
    pairs is kept.  The final weights are printed.
    """
    path = r'C:\Users\mdhal\Desktop\Fall 2018\Machine Learning\Project\Compressed\reviews_Books_5.json.gz'
    weight_range = (0, 150)
    low, high = weight_range
    queries = TestBase.get_query_list(path, 5 * (high - low))
    # The result is not used here; the call is kept as in the original.
    TestBase.find_count(queries)

    for i in range(len(Review.weights)):
        num_correct = []
        for candidate in range(low, high):
            histogram = [0] * 5
            total_off = 0
            Review.weights[i] = candidate
            star_idx = 0
            for _ in range(200):
                estimate = KNN.guess_review(queries[star_idx][candidate])
                # NOTE(review): the star index is advanced before the error is
                # computed, so the "actual" value below belongs to the next
                # bucket rather than the one just queried -- confirm intent.
                star_idx = (star_idx + 1) % 5
                miss = abs(star_idx + 1 - estimate)  # actual - estimate
                histogram[miss] += 1
                total_off += miss
            print("i:{} j:{}".format(i, candidate))
            num_correct.append((histogram[0], candidate))
        Review.weights[i] = get_max_weight(num_correct)
        print(Review.weights[i])

    for position, value in enumerate(Review.weights):
        print("Weight {} = {}".format(position, value))
def main():
    """Fetch eBay items and print plain and weighted KNN results for sample queries.

    Output uses single-argument ``print(...)`` calls, which behave the
    same on Python 2 and Python 3.
    """
    appid = '[Your eBay Product AppID]'  # Change this to your eBay product AppID
    search_keyword = 'wine'
    categoryId = '38182'  # red wine

    items = get_items(appid, search_keyword, categoryId)

    # using un-weighted KNN
    print('using un-weighted KNN:')
    for query in ((1, 1000), (2, 2000)):
        print(KNN.get_KNN(items, query, k=3))
    print('*********************')

    # using weighted KNN
    print('weighted KNN using Gaussian function:')
    for query in ((1, 1000), (2, 1000), (2, 2000)):
        print(KNN.get_weightedKNN(items, query, k=3))
def test_non_norm():
    """Print up to the first 30 samples and scatter-plot the first two columns.

    Marker size and colour are both ``15.0 * label``.
    """
    dating_mat, dating_label = KNN.file_to_matrix('datingTestSet2.txt')
    # Slicing instead of range(30) avoids an IndexError on shorter data sets.
    for row, label in zip(dating_mat[:30], dating_label[:30]):
        # Same text as the Python 2 statement `print row, label`.
        print("%s %s" % (row, label))
    fig = plt.figure()
    ax = fig.add_subplot(111)
    scaled_labels = 15.0 * array(dating_label)
    ax.scatter(dating_mat[:, 0], dating_mat[:, 1],
               scaled_labels, scaled_labels)
    plt.show()
示例#42
0
def buildMockUser():
    """Build a throw-away user from the posted artists and return KNN results as JSON.

    The ``artists`` form field is parsed as a JSON list of one-entry
    mappings ``{artistID: weight}``.  Known artists are inserted into a
    mock ``User(-100)``; unknown ids are reported under
    ``warning.missingArtist``.  The response holds the single favourite
    (``artistID``), up to ten entries from the end of the artist list
    (match normalised by the last entry's weight) and up to ten entries
    from the end of the tag list, each in reverse order.
    """
    artistlist = json.loads(request.form['artists'])
    testUser = User(-100)
    missingArtist = []
    for artistRecord in artistlist:
        # First (key, value) pair; unlike .keys()[0] / .values()[0] this
        # works on both Python 2 and Python 3 dictionaries.
        rawID, artistWeight = next(iter(artistRecord.items()))
        artistID = int(rawID)
        if artistWeight == 0:
            # Zero weights are replaced by a tiny positive value.
            artistWeight = 0.0000001
        # NOTE(review): assumes ArtistManager is dict-like, so `in` is
        # equivalent to the former has_key() call -- confirm.
        if artistID in ArtistManager:
            testUser.insertArt(artistID, artistWeight)
        else:
            missingArtist.append(artistID)

    knn = KNN(40)
    knn.training(UserManager, ArtistManager)
    favOfOne, allArtist, allTag = knn.testing(testUser, UserManager, ArtistManager, True)
    ret = {'artistID': favOfOne}
    if missingArtist:
        ret['warning'] = {'missingArtist': missingArtist}

    ret['artists'] = []
    # Guard: with no artists there is no last entry to normalise against.
    if allArtist:
        maxArtistMatchWeight = allArtist[-1][1]
        # Last ten entries, final entry first.
        for entry in reversed(allArtist[-10:]):
            artistID = entry[0]
            matchWeight = entry[1] / maxArtistMatchWeight
            artist = ArtistManager[artistID]
            topTag = artist.getTopTag()
            # -1 is treated as "no top tag".
            topTagName = "" if topTag == -1 else TagManager[topTag]
            ret['artists'].append({'id': artistID, 'name': artist.Name,
                                   'match': matchWeight, 'tag': topTag,
                                   'tagName': topTagName})

    ret['tags'] = []
    for entry in reversed(allTag[-10:]):
        tagID = entry[0]
        ret['tags'].append({'id': tagID, 'name': TagManager[tagID],
                            'match': entry[1]})
    return json.dumps(ret)
def date_class_test():
    """Hold-out test of ``KNN.knn_classify`` on the dating data set.

    The first 4% of the normalized rows are classified (k=3) against the
    remaining rows; each prediction and the overall error rate are
    printed.  Output uses single-argument ``print(...)`` so the function
    parses on both Python 2 and Python 3.
    """
    ratio = 0.04    # ratio of the test examples
    data_set, data_labels = KNN.file_to_matrix('datingTestSet2.txt')

    # Normalize the features only; the labels are used as loaded.
    norm_set, ranges, min_val = KNN.normalize(data_set)

    all_rows = norm_set.shape[0]   # number of all rows
    test_rows = int(ratio * all_rows)  # number of test rows
    error_num = 0
    for i in range(test_rows):
        label_res = KNN.knn_classify(norm_set[i, :],
                                     norm_set[test_rows: all_rows, :],
                                     data_labels[test_rows: all_rows, :], 3)
        print('Classifier predict: %d, real result is: %d' % (label_res, data_labels[i]))
        if label_res != data_labels[i]:
            error_num += 1
    # NOTE(review): test_rows is 0 for fewer than 25 rows, which makes this
    # division fail.
    print('total error rate is: %f ' % (error_num * 1.0 / float(test_rows)))
def gameRecommendations(u_name):
    """Return the top five game recommendations for the Steam user ``u_name``.

    The user's library is encoded as a bit string over the ids from
    ``./data/id_header.csv`` ("1" where the user owns the game), the 100
    closest rows of the stored data set are found with
    ``KNN.findClosest``, and their votes are reduced to five games.

    Returns ``None`` when the API key does not have 32 characters or when
    ``getUserGames`` yields nothing.
    """
    # Get API key (stored as two halves in separate files)
    all_api_keys1 = get_keys("./num1.txt")
    all_api_keys2 = get_keys("./num2.txt")
    api_key = str(all_api_keys1[0]) + str(all_api_keys2[0])

    if len(api_key) != 32:
        print("Uh-oh, don't forget to enter your API key!")
        return None

    games_response_json = getUserGames(u_name, api_key)
    # Bail out before loading ids and hitting the network: nothing below is
    # useful without the user's games.
    if not games_response_json:
        return None

    all_games = loadGameIDs("./data/id_header.csv")
    # Position of the first occurrence of every id, so that each owned game
    # is located in O(1) instead of a list.index() scan.
    index_of = {}
    for position, app_id in enumerate(all_games):
        index_of.setdefault(app_id, position)

    user_game_array = ["0"] * len(all_games)
    for game in games_response_json:
        position = index_of.get(game['appid'])
        if position is not None:
            user_game_array[position] = "1"

    # Set up a requests session to allow retries when a request fails
    session = reqGet.Session()
    try:
        session.mount("http://", reqGet.adapters.HTTPAdapter(max_retries=10))
        # Get all of the game names and IDs from steam
        response = session.get(url="http://api.steampowered.com/ISteamApps/GetAppList/v2")
        game_list = json.loads(response.text)['applist']['apps']
    finally:
        # NOTE(review): assumes reqGet is the `requests` package, whose
        # Session exposes close() -- confirm.
        session.close()
    game_dict = {game['appid']: game for game in game_list}

    # NOTE(review): raises KeyError for an id that Steam no longer lists.
    all_games = [game_dict[x]['name'] for x in all_games]

    game_bit_string = int(''.join(user_game_array), 2)
    dataset = KNN.loadDataset("./data/games_by_username_all.csv")
    closest = KNN.findClosest(dataset, game_bit_string, 100)
    return KNN.getTopGames(KNN.getVotes(all_games, closest, game_bit_string), 5)
示例#45
0
def handwriteDigitTest():
    """Classify every test row with ``KNN.classify0`` (k=5) and save the results.

    Each prediction is printed next to the expected label read from
    ``loadTestResult()``, and the list of predictions is handed to
    ``saveResult``.  Uses ``range`` and ``print(...)`` so the function
    runs on both Python 2 and Python 3.
    """
    trainData, trainLabel = loadTrainData()
    testData = loadTestData()
    m, _ = shape(testData)
    testLabel = loadTestResult()
    resultList = []
    k = 5
    # predict every testData row's label
    for i in range(m):
        classifyClassResult = KNN.classify0(testData[i], trainData, trainLabel.transpose(), k)
        resultList.append(classifyClassResult)
        print("the classifier calcute is: %d, the real answer is : %d" % (classifyClassResult, testLabel[0,i]))
    saveResult(resultList)
def probabilitygraph(data, vec, upperbound, k=5, weightf = KNN.gaussian_weight, sigma = 5.0):
    """Plot a smoothed curve of ``prob_guess`` over 0.1-wide bins up to ``upperbound``.

    ``prob_guess`` is evaluated for every bin ``[v, v+0.1)`` with the given
    ``k`` and ``weightf``; each plotted point is then the sum of all bin
    values weighted by ``KNN.gaussian_weight`` of their distance (in the
    same units) with width ``sigma``.
    """
    xs = arange(0.0, upperbound, 0.1)

    # raw value for each bin
    raw = [prob_guess(data, vec, lo, lo+0.1, k, weightf) for lo in xs]

    # smoothing pass: every output point mixes in all bins
    smoothed = []
    for center in range(len(raw)):
        acc = 0.0
        for other, value in enumerate(raw):
            acc += KNN.gaussian_weight(abs(center-other)*0.1, sigma)*value
        smoothed.append(acc)

    plot(xs,array(smoothed))
    show()
示例#47
0
文件: Isomap.py 项目: philipz1/ML
def isomap(data, k = 10, target_dim = 2, load = False, save = False):
	"""Return the ``target_dim`` leading eigenvectors of the transformed distance matrix.

	Distances come from ``np.load(load)`` when ``load`` is given;
	otherwise they are computed from the k-nearest-neighbour graph of
	``data`` and, when ``save`` is given, written with ``np.save(save, ...)``.
	The eigenvectors of ``do_gram_tilda(dists)`` are ordered by
	descending eigenvalue before the first ``target_dim`` columns are taken.
	"""
	if load != False:
		dists = np.load(load)
	else:
		neighbours = KNN.knn(data, k)
		dists = shortest_dist_weight(construct_A(data, neighbours))
		if save != False:
			np.save(save, dists)

	eigvals, eigvecs = np.linalg.eigh(do_gram_tilda(dists))

	# largest eigenvalue first
	order = np.argsort(eigvals)[::-1]
	ranked = eigvecs[:,order]

	return ranked[:,0:target_dim]
def test(data, k):
    """Shuffle ``data`` in place, train on the first 800 rows and return test accuracy.

    Column 0 of ``data`` supplies the points and column 1 the labels.  The
    classifier is built by ``KNN.makeKNNClassifier`` with Euclidean distance;
    the returned value is the fraction of the remaining rows it labels
    correctly.
    """
    random.shuffle(data)
    pts = column(data, 0)
    labels = column(data, 1)

    split = 800
    classify = KNN.makeKNNClassifier(pts[:split], labels[:split], k, KNN.euclideanDistance)

    heldPoints = pts[split:]
    heldLabels = labels[split:]
    total = len(heldLabels)
    hits = sum(1 for point, label in zip(heldPoints, heldLabels)
               if classify(point) == label)

    return float(hits) / total
示例#49
0
def le(data, k = 10, target_dim = 2):
	"""Embed ``data`` with scikit-learn's spectral embedding of its kNN mesh.

	The affinity matrix is built by ``construct_mesh`` from the
	k-nearest-neighbour graph.  ``target_dim`` is now passed through as
	``n_components`` (it was previously ignored in favour of a literal 2),
	and it is passed by keyword because newer scikit-learn releases make
	that parameter keyword-only.  The unreachable manual eigen-solve that
	followed the ``return`` has been removed.
	"""
	from sklearn import manifold

	graph = KNN.knn(data, k)
	A = construct_mesh(data, graph)
	return manifold.spectral_embedding(A, n_components=target_dim)

# print(le(npdata))
def prob_guess(data, new_item, low, high, k=5, weightf = KNN.gaussian_weight):
    """Return the weighted share of the nearest neighbours priced strictly between ``low`` and ``high``.

    Up to ``k`` entries are taken from the front of
    ``KNN.get_sorted_distances(data, new_item)``; each entry is read as
    ``(distance, index into data)``.  Every neighbour contributes
    ``weightf(distance)``.  Returns 0 when the total weight is zero
    (including when there are no neighbours at all).
    """
    # get sorted distance list
    dlist = KNN.get_sorted_distances(data, new_item)

    rweight = 0.0
    total_weight = 0.0

    # Iterate the slice itself: indexing range(k) raised IndexError whenever
    # fewer than k neighbours were available.
    for entry in dlist[0:k]:
        weight = weightf(entry[0])
        price = data[entry[1]]['price']

        if low < price < high:
            rweight += weight
        total_weight += weight
    if total_weight == 0:
        return 0
    return rweight/total_weight
示例#51
0
def le(data, k = 10, target_dim = 2):
	"""Embed ``data`` using eigenvectors of a degree-scaled graph Laplacian.

	``A`` is the mesh built from the k-nearest-neighbour graph and ``D``
	the diagonal matrix of its row sums.  ``L = x (D - A) x`` with
	``x = D ** -0.5`` (infinite entries zeroed), and the generalized
	problem ``eig(L, D)`` is solved.  Columns 1 .. ``target_dim`` of the
	eigenvectors, ordered by ascending eigenvalue, are returned.
	"""
	graph = KNN.knn(data, k)
	A = construct_mesh(data, graph)

	D = np.diag(A.sum(1))
	# Written as -0.5 rather than -1/2: under Python 2 integer division the
	# latter evaluates to -1.
	x = D ** -0.5
	# Zero entries of D (all off-diagonals) become inf above; reset them.
	x[np.isinf(x)] = 0
	L = np.dot(np.dot(x, D - A), x)

	# NOTE(review): L is already scaled by D**-0.5 on both sides and is then
	# solved against D again -- confirm this double use of D is intended.
	eigvals, eigvecs = eig(L, D)

	index = np.argsort(eigvals)
	eigvecs = eigvecs[:,index]

	return eigvecs[:, 1: 1 + target_dim]
示例#52
0
文件: main.py 项目: Wummer/ML1
plt.title('With $\\theta$ = %1.1f'%theta)
plt.axis('equal')
#plt.show()

# ------------------------------------ I.4.x ---------------------------------------
# See the KNN module for the explanation of each function.

# NOTE(review): these handles stay open for the rest of the script; they are
# left as module-level names because later code may still read from them.
train = open('IrisTrain2014.dt', 'r')
test = open('IrisTest2014.dt', 'r')

#Calling read and split
train_set = KNN.read_data(train)
test_set = KNN.read_data(test)
transformed_test = KNN.transformtest(train_set, test_set)

# Single-argument print(...) calls behave the same on Python 2 and Python 3.
print("*" * 45)
print("Mean and variance")
print("*" * 45)

print(" Train set:")
zeromean_train = KNN.meanfree(train_set)
print("-" * 45)

print(" Normalized rain set:")
getting_mean_for_normalized_train = KNN.meanfree(zeromean_train)
print("-" * 45)
示例#53
0
'''
# Generate Image
trainDatas, trainLabels = Pretreatment.loadTrainData('/home/hadoop/workdatas/kaggle/DigitRecognizer/train_sort.csv');
trainLabels = trainLabels[0];
Pretreatment.generateImage('/home/hadoop/workdatas/kaggle/DigitRecognizer/imgs/', trainDatas, trainLabels);
'''


'''
# KNN Test
import numpy
group = numpy.array([[1.0, 1.1], [1.0, 1.0], [0, 0], [0, 0.1]])
labels = ['A', 'A', 'B', 'B']

tar = [1.0, 1.2];

result = KNN.classify(tar, group, labels, 3);

print result;
'''

# KNN classification: load the Kaggle digit data, classify the test rows and
# write the result file.
trainDatas, trainLabels = Pretreatment.loadTrainData('/home/hadoop/workdatas/kaggle/DigitRecognizer/train.csv')
# Only the first element of the loaded labels is used.
trainLabels = trainLabels[0]
testDatas = Pretreatment.loadTestData('/home/hadoop/workdatas/kaggle/DigitRecognizer/test.csv')
result = KNN.process(testDatas, trainDatas, trainLabels)
Pretreatment.generateResultFile('/home/hadoop/workdatas/kaggle/DigitRecognizer/result_knn_10.csv', result)

# print(...) with one argument works on both Python 2 and Python 3.
print('success')
示例#54
0
        elif datingLabels[i] == 3:
             ax.scatter(datingDataMat[i][0],datingDataMat[i][1],datingDataMat[i][2], c='g',marker='*')
        elif datingLabels[i] == 4:
             ax.scatter(datingDataMat[i][0],datingDataMat[i][1],datingDataMat[i][2], marker='1')
       
    #ax.scatter(datingDataMat[:,0],datingDataMat[:,1],datingDataMat[:,2],
    #          5.0 * np.array(datingLabels), 5.0 * np.array(datingLabels))
    
    plt.show()


def classifyPerson():
    """Ask for three feature values, classify them with k=3 and print the verdict.

    The answers are normalized with the ranges and minima returned by
    ``KNN.autoNorm`` for ``datingTestSet2.txt`` before being passed to
    ``KNN.classify0``.  The predicted class (1-based) selects the message
    from ``resultList``; the normalized data is then plotted.
    """
    # raw_input only exists on Python 2; input is its Python 3 equivalent.
    try:
        read_line = raw_input
    except NameError:
        read_line = input

    print("输入相关信息")
    resultList = ['一点不喜欢','有点希望','可能性很大']
    percentTats = float(read_line("玩游戏时间数目?"))
    ffMiles = float(read_line("旅游公路数?"))
    ice = float(read_line("冰淇淋消耗量?"))
    datingDataMat,datingLabels = KNN.file2matrix('datingTestSet2.txt')
    normMat,ranges,minVals = KNN.autoNorm(datingDataMat)
    # Feature order passed to the classifier: miles, game time, ice cream.
    inArr = np.array([ffMiles,percentTats,ice])
    classfierRt = KNN.classify0((inArr-minVals)/ranges,normMat,datingLabels,3)
    print(resultList[classfierRt - 1])
    PrintFigure(normMat, datingLabels)

#classifyPerson()
# Load the dating data, show its labels and plot the raw (un-normalized) features.
datingDataMat,datingLabels = KNN.file2matrix('datingTestSet2.txt')
# Single-argument print(...) works on both Python 2 and Python 3.
print(datingLabels)
PrintFigure(datingDataMat,datingLabels)

    
示例#55
0
def main(argv):
    """Parse the command line, then report entropy and the KNN verdict for a PE file.

    Options:
        -h              print usage and exit
        -f, --ifile F   file to analyse (required)
        -d, --dump      additionally print ``dump_info(F)``

    For every PE section the reduced entropy of the raw bytes and of their
    encrypted hex text is computed; the two averages and their difference
    are passed to ``KNN.ibk`` and the prediction is printed.
    """
    if len(argv)==0:      ## check of the arguments
        print("\n Improper command format")
        usage()
        sys.exit()

    filename = ''
    flow = False
    dump = False
    try:
        opts, args = getopt.getopt(argv, "hf:d", ["ifile=", "dump"])
    except getopt.GetoptError:
        print("\n Improper command format")
        usage()
        sys.exit()

    ## to read the arguments
    for opt, arg in opts:
        if opt == "-h":
            usage()
            sys.exit()
        elif opt in ("-f", "--ifile"):
            filename = arg
            flow = True
        # getopt reports short options with their dash; the former "d"
        # never matched, so -d fell through to the error branch.
        elif opt in ("-d", "--dump"):
            dump = True
        else:
            print("\n Improper command format")
            usage()
            sys.exit()

    if not flow:
        print("\n Improper command format")
        usage()
        return

    if not os.path.isfile(filename):
        print("File doesn't exit or path is improper")
        return

    # Read the whole file once and release the handle immediately.
    with open(filename, 'rb') as openfile:
        readfile = openfile.read()
    print("Entropy of file is ", H(readfile))

    pe = pefile.PE(filename)
    unknownEntropy = []
    unknownPackedEntropy = []
    for section in pe.sections:
        # NOTE(review): the section's virtual address range is used to slice
        # the raw file contents -- confirm this is the intended offset.
        init = section.VirtualAddress
        last = init + section.Misc_VirtualSize
        sectionData = readfile[init:last]
        unknownEntropy.append(HsetReduction(sectionData))

        # Entropy of the same bytes after hex-encoding and encryption.
        cleartext = binascii.hexlify(sectionData).decode("utf-8")
        unknownPackedEntropy.append(HsetReduction(encrypFile(cleartext)))

    if unknownEntropy:
        firstEntropy = sum(unknownEntropy) / len(unknownEntropy)
        secondEntropy = sum(unknownPackedEntropy) / len(unknownPackedEntropy)
        TestingList = [firstEntropy, secondEntropy, (secondEntropy - firstEntropy)]
        predictedResult = KNN.ibk(TestingList)
        print(filename, " is ", predictedResult)
    else:
        # Without sections the averages are undefined (formerly a
        # ZeroDivisionError).
        print(filename, " has no sections to analyse")

    if dump:
        print(dump_info(filename))
0
train = NaiveBayes.train_nb(data)

#Read example data and test every example; the file is closed when the
#loop finishes or fails.
with open(examples, 'r') as f:
    for line in f:
        # One comma-separated row of floats per line
        row = [float(value) for value in line.split(',')]

        #Apply the algorithm: any value other than 'NB' is taken as k for KNN
        if algorithm != 'NB':
            print(KNN.knearest(int(algorithm),data,row))
        else:
            print(NaiveBayes.naive_bayes(row,train))


##############################
#                            #
# Cross validation called    #
# when need it               #
#                            #
##############################
#folds = CrossValidation.fold_divide(data)
#print CrossValidation.cross_validation_nn(1,folds)
#print CrossValidation.cross_validation_nb(folds)

	# 	# data[4]: user_taggedartists.dat 
	# 	# tag = [userID	artistID	tagID	day	month	year]
	# 	if UserManager.has_key(int(tag[0])):
	# 		UserManager[int(tag[0])].insertTag(int(tag[1]),int(tag[2]))



	#train with UserManager, test with TestUserManager
	# counter = 0
	# for userID,user in TestUserManager.iteritems():
	# 	if len(user.ArtistList) == 0:

	# 		counter += 1
	# 	# print userID, len(user.ArtistList)
	# print counter, len(TestUserManager)
	knn = KNN(30)
	knn.training(UserManager, ArtistManager)

	theSameNum = 0
	for userID in TestUserManager:
		favOfOne, neighbors = knn.testingTimeBased(TestUserManager[userID],UserManager, ArtistManager)
		favTruth = TestUserManager[userID].getMostFav().keys()[0]
		if favOfOne == favTruth:
			theSameNum += 1
		print userID, theSameNum, favOfOne

	print 1.0*theSameNum/len(TestUserManager)

	# print favOfOne

示例#58
0
# -*- coding: utf-8 -*-
'''
Created on 2014年9月30日

@author: Rayleigh
'''
import KNN as kNN
from numpy import * 

# Classify two sample points against the demo data set.
dataSet, labels = kNN.createDataSet()

# Neighbour count; it was previously assigned but a literal 3 was passed.
k = 3

testX = array([0.2, 0.9])
outputLabel = kNN.kNNClassify(testX, dataSet, labels, k)
# Joined with single spaces to match the multi-argument Python 2 print
# statement while staying valid on Python 3.
print(" ".join(["Your input is:", str(testX), "and classified to class: ", str(outputLabel)]))

testX = array([0.1, 0.3])
outputLabel = kNN.kNNClassify(testX, dataSet, labels, k)
print(" ".join(["Your input is:", str(testX), "and classified to class: ", str(outputLabel)]))