Example No. 1
def makeGraph(relPath, columns, resultColumn, k, tp):
    dataSet = r.readDataSet(relPath, columns)
    trainingSets = []
    avaliationSets = []
    kfold = kc(dataSet, k, resultColumn, True)
    kfold.run(trainingSets, avaliationSets, stratified=True)
    dataSet = dataSet.apply(pd.to_numeric)

    ks = [1,2,3,5,7,9,11,13,15]
    means = []
    for j in ks:
        print("Using k = " + str(j))
        correctPercentage = 0
        for i in range(len(trainingSets)):
            tset = []
            aset = []
            # rebuild the raw training/evaluation rows for this fold
            for index, row in dataSet.iterrows():
                tupla = (row[resultColumn], index)
                if tupla in trainingSets[i]:
                    tset.append(row.tolist())
                if tupla in avaliationSets[i]:
                    aset.append(row.tolist())
            knn = Knn(tset, j, tp=tp)  # renamed from k to avoid shadowing the fold-count parameter
            correctPercentage += knn.test(aset)
        generalMean = correctPercentage / len(trainingSets)
        means.append(generalMean)
    matplotlib.pyplot.plot(ks, means)
    matplotlib.pyplot.show()
Example No. 2
def _makeGraph(relPath, columns, resultColumn):
    dataSet = r.readDataSet(relPath, columns)
    trainingSets = []
    avaliationSets = []
    kfold = kc(dataSet, 10, resultColumn, True)
    kfold.run(trainingSets, avaliationSets, stratified=True)
    dataSet = dataSet.apply(pd.to_numeric)

    ks = [1, 3]
    nPrototypes = [3, 5, 10, 20]

    for k in ks:
        meansGeral = []
        meansFalse = []
        meansTrue = []
        for j in nPrototypes:
            correctnessPercentage = 0
            correctTrue = 0
            correctFalse = 0
            for i in range(len(trainingSets)):
                print("\n")
                print(" --------- FOLD " + str(i + 1) + " ----------------")
                tset = []
                aset = []
                for index, row in dataSet.iterrows():
                    tupla = (row[resultColumn], index)
                    if tupla in trainingSets[i]:
                        tset.append(row.tolist())
                    if tupla in avaliationSets[i]:
                        aset.append(row.tolist())
                lvq = LVQ3(tset, resultColumn)
                newtset = lvq.run(nPrototypes=j)
                kn = Knn(newtset, k)
                result = kn.test(aset)
                correctnessPercentage += result[0]
                classErrors = result[1]
                classNumbers = result[2]
                correctFalse += (
                    classErrors[False] /
                    classNumbers[False]) if False in classErrors else 0
                correctTrue += (
                    classErrors[True] /
                    classNumbers[True]) if True in classErrors else 0
            meansGeral.append(correctnessPercentage / len(trainingSets))
            meansFalse.append(correctFalse / len(trainingSets))
            meansTrue.append(correctTrue / len(trainingSets))
        plt.ylim(0, 1)
        plt.plot(nPrototypes, meansGeral, 'r', label='general')
        plt.plot(nPrototypes, meansFalse, 'g', label='false')
        plt.plot(nPrototypes, meansTrue, 'b', label='true')
        plt.legend(loc='upper left')
        plt.show()
Example No. 3
def _LVQ3(relPath, columns, resultColumn):
    dataSet = r.readDataSet(relPath, columns)
    trainingSets = []
    avaliationSets = []
    kfold = kc(dataSet, 10, resultColumn, True)
    kfold.run(trainingSets, avaliationSets, stratified=True)
    dataSet = dataSet.apply(pd.to_numeric)
    for i in range(len(trainingSets)):
        print("\n")
        print(" --------- FOLD " + str(i + 1) + " ----------------")
        tset = []
        aset = []
        for index, row in dataSet.iterrows():
            tupla = (row[resultColumn], index)
            if tupla in trainingSets[i]:
                tset.append(row.tolist())
            if tupla in avaliationSets[i]:
                aset.append(row.tolist())
        print("------------- SIMPLE KNN ----------------")
        k = Knn(tset, 3)
        k.test(aset)
        lvq = LVQ3(tset, resultColumn)
        newtset = lvq.run()
        print("-------------- LVQ3 ----------------------")
        k = Knn(newtset, 3)
        k.test(aset)
Example No. 4
def compare(filename):  # filename will be Tp1_data.csv
    showPlots = False
    Xs, Ys = get_data(filename)
    X_r, X_t, Y_r, Y_t = train_test_split(Xs, Ys, test_size=0.33, stratify=Ys)
    folds = 5
    Kf = StratifiedKFold(Y_r, n_folds=folds)  # legacy sklearn (<0.18) cross_validation API

    KnnErr, bestN, KnnPred = Knn(Kf, X_r, Y_r, X_t, Y_t,
                                 showPlots)  #KnnPred AA-07
    print("KnnErr, best_N:", KnnErr, bestN)

    LogScore, bestC, LogPred = Logistic(Kf, X_r, Y_r, X_t, Y_t, showPlots)
    print("LogisticScore, best_C:", LogScore, bestC)

    NBScore, bestBandwidth, NBPred = NaiveBayes(Kf, X_r, Y_r, X_t, Y_t,
                                                showPlots)
    print("NBScore, best_Bandwidth:", NBScore, bestBandwidth)

    MCNemarKnn_Log = MCNemar(KnnPred, LogPred, Y_t)  # (|e01 - e10| - 1)^2 / (e01 + e10)
    MCNemarNB_Log = MCNemar(NBPred, LogPred, Y_t)
    MCNemarNB_Knn = MCNemar(KnnPred, NBPred, Y_t)

    print()
    print("McNemar:")
    print("MCNemarKnn_Log", MCNemarKnn_Log)
    print("MCNemarNB_Log", MCNemarNB_Log)
    print("MCNemarNB_Knn", MCNemarNB_Knn)
Example No. 5
def knnTest(feature_len, all_lines, all_features, all_labels):
    counts = {}
    for i in range(10):
        print("Test %d:" % (i + 1))
        train_features = all_features[0:int(0.8 * len(all_features))]
        train_labels = all_labels[0:int(0.8 * len(all_features))]
        test_features = all_features[int(0.8 * len(all_features)):]
        test_labels = all_labels[int(0.8 * len(all_features)):]
        length = len(test_labels)
        for k in range(1, 5):
            rate = 0
            print("k = %d: " % k, end=" ")
            for j in range(0, length):
                res = Knn(train_features, train_labels, test_features[j], k)
                if res == test_labels[j]:
                    rate += 1
            print(rate / length)
            if k not in counts:
                counts[k] = rate / length
            else:
                counts[k] += rate / length
        all_features, all_labels = now_provider.getFeatureAndLabel(
            all_lines, feature_len)
    # counts holds accuracy summed over the 10 runs for each k; divide by 10 for the average
    for x in counts:
        print(x, counts[x])
Example No. 6
def simpleKnn(relPath, columns, resultColumn, k, tp):
    dataSet = r.readDataSet(relPath, columns)
    trainingSets = []
    avaliationSets = []
    kfold = kc(dataSet, k, resultColumn, True)
    kfold.run(trainingSets, avaliationSets, stratified=True)
    dataSet = dataSet.apply(pd.to_numeric)

    for i in range(len(trainingSets)):
        tset = []
        aset = []
        for index, row in dataSet.iterrows():
            tupla = (row[resultColumn], index)
            if tupla in trainingSets[i]:
                tset.append(row.tolist())
            if tupla in avaliationSets[i]:
                aset.append(row.tolist())
        knn = Knn(tset, 1, tp=tp)  # renamed from k, which already names the fold count
        knn.test(aset)
Example No. 7
def Prediction(trainUser, testUser, trainBook, numberof_k, smilarityname, predictdict):
    MeanAbsolute=0
    predictsum=0
    totalpredict=len(testUser)
    weightedMeanAbsolute=0
    if smilarityname == "cosine":
        TrainSquares = squareforcos(trainUser)
    for testkey in testUser:
        cossim = []  # stays empty (and is skipped below) if smilarityname matches no branch
        if smilarityname == "cosine":
            cossim = cosineSim(testUser[testkey], trainUser, trainBook, TrainSquares)
        elif smilarityname == "correlation":
            cossim = correlation(testUser[testkey], trainUser, trainBook)
        elif smilarityname == "adjcosine":
            cossim = adjCosineSim(testUser[testkey], trainUser, trainBook)
        if len(cossim) != 0:
            #print("\nTEST ID ->",testkey," With K=",numberof_k,"neighbours",len(cossim))
            mean, weightedmean, prediction = Knn(cossim, numberof_k, trainBook, predictdict[testkey])
            MeanAbsolute += mean
            weightedMeanAbsolute += weightedmean
            predictsum += prediction

        else:  # if the test user has no items in common with the training data, fall back to the user's own mean rating
            lenisbn = len(testUser[testkey])
            predictsum += lenisbn
            testusersum = sum(testUser[testkey].values()) / lenisbn
            testMean = 0
            testWmean = 0
            for key, value in testUser[testkey].items():
                predictdict[testkey][key][1], predictdict[testkey][key][2] = round(testusersum), round(testusersum)
                testMean += abs(value - testusersum)
                testWmean += abs(value - testusersum)
            testMean = testMean / lenisbn
            testWmean = testWmean / lenisbn

            MeanAbsolute += testMean
            weightedMeanAbsolute += testWmean

    MeanAbsolute = MeanAbsolute / totalpredict
    weightedMeanAbsolute = weightedMeanAbsolute / totalpredict
    print("\nSmilarity function:",smilarityname,
          "\nNeighbours number ==",numberof_k,
          "\nMEAN ABSOLUTE ERROR:",MeanAbsolute,
          "\nWeighted MEAN ABSOLUTE ERROR:",weightedMeanAbsolute,
          "\nTotal Prediction:",totalpredict)
    
    return MeanAbsolute,weightedMeanAbsolute
Example No. 8
def main():
    # ASSERTS
    knn_numeric = Knn.Knn_numeric(0)
    filename = 'data/dataset1-1.csv'
    comp_number = 1  # Number of principal components

    # PCA
    #pca_inst = pca.PCA()
    #(data, target, all_data) = pca_inst.get_data(filename, comp_number)

    # LDA
    lda_inst = lda.LDA()
    (data, target, all_data) = lda_inst.get_data(filename, comp_number)

    # Accuracy
    (k, values) = knn_numeric.get_acuraccy_by_neighbor(data, target, all_data)

    print(list(zip(k, values)))
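
The pca, lda, and Knn modules used above are project-local and not shown. For reference, here is a sketch of the same flow with scikit-learn, using the iris dataset as a stand-in for data/dataset1-1.csv; the dataset choice and the k range are illustrative assumptions.

from sklearn.datasets import load_iris
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.model_selection import cross_val_score
from sklearn.neighbors import KNeighborsClassifier

X, y = load_iris(return_X_y=True)
# project onto comp_number = 1 discriminant component, as in main() above
X_lda = LinearDiscriminantAnalysis(n_components=1).fit_transform(X, y)

# accuracy as a function of the number of neighbors, mirroring get_acuraccy_by_neighbor
for k in range(1, 11):
    scores = cross_val_score(KNeighborsClassifier(n_neighbors=k), X_lda, y, cv=5)
    print(k, scores.mean())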
Example No. 9
def compareTest(feature_len, all_lines, all_features, all_labels):
    count = {}
    for i in range(10):
        print("\nTest %d" % (i + 1))
        train_features = all_features[0:int(0.8 * len(all_features))]
        train_labels = all_labels[0:int(0.8 * len(all_features))]
        test_features = all_features[int(0.8 * len(all_features)):]
        test_labels = all_labels[int(0.8 * len(all_features)):]
        length = len(test_labels)

        rate = 0
        print("NaiveBayes : ", end="")
        new_bayes = NaiveBayes(train_features, train_labels, feature_len)
        new_bayes.train()
        for j in range(0, length):
            res = new_bayes.predict(test_features[j])
            if res == test_labels[j]:
                rate += 1
        print(rate / length)
        if "NaiveBayes" not in count:
            count["NaiveBayes"] = rate / length
        else:
            count["NaiveBayes"] += rate / length

        rate = 0
        print("KNN : ", end="")
        for j in range(0, length):
            res = Knn(train_features, train_labels, test_features[j], 3)
            if res == test_labels[j]:
                rate += 1
        print(rate / length)
        if "KNN" not in count:
            count["KNN"] = rate / length
        else:
            count["KNN"] += rate / length

        rate = 0
        print("Logistic : ", end="")
        new_logistic = Logistic(train_features,
                                train_labels,
                                feature_len,
                                alpha=5,
                                tol=0.000001)
        new_logistic.train()
        for j in range(0, length):
            res = new_logistic.predict(test_features[j])
            if res == test_labels[j]:
                rate += 1
        print(rate / length)
        if "Logistic" not in count:
            count["Logistic"] = rate / length
        else:
            count["Logistic"] += rate / length

        rate = 0
        print("NeuralNetwork : ", end="")
        new_NN = NeuralNetwork(train_features,
                               train_labels,
                               feature_len,
                               hidden_num=32,
                               learn_rate=100)
        new_NN.train()
        for j in range(0, length):
            res = new_NN.predict(test_features[j])
            if res == test_labels[j]:
                rate += 1
        print(rate / length)
        if "NeuralNetwork" not in count:
            count["NeuralNetwork"] = rate / length
        else:
            count["NeuralNetwork"] += rate / length

        rate = 0
        print("Tree : ", end="")
        new_tree = Tree(train_features, train_labels, len(train_features[0]),
                        3, 8)
        new_tree.train()
        for j in range(0, length):
            res = new_tree.predictTree(test_features[j])
            if res == test_labels[j]:
                rate += 1
        print(rate / length)
        if "Tree" not in count:
            count["Tree"] = rate / length
        else:
            count["Tree"] += rate / length

        rate = 0
        print("AdaBoost : ", end="")
        new_boost = AdaBoost(train_features,
                             train_labels,
                             len(train_features[0]),
                             28,
                             mode=2)
        new_boost.train()
        for j in range(0, length):
            res = new_boost.predict(test_features[j])
            if res == test_labels[j]:
                rate += 1
        print(rate / length)
        if "AdaBoost" not in count:
            count["AdaBoost"] = rate / length
        else:
            count["AdaBoost"] += rate / length

        rate = 0
        print("RandomForest : ", end="")
        new_forest = RandomForest(30)
        new_forest.buildTrees(train_features, train_labels,
                              len(train_features[0]), 3, 6)
        for j in range(0, length):
            res = new_forest.predictForest(test_features[j])
            if res == test_labels[j]:
                rate += 1
        print(rate / length)
        if "RandomForest" not in count:
            count["RandomForest"] = rate / length
        else:
            count["RandomForest"] += rate / length

        rate = 0
        print("SVM : ", end="")
        new_svm = SVM(train_features,
                      train_labels,
                      C=43,
                      function='RBF',
                      d=0.53)
        new_svm.train()
        for j in range(0, length):
            res = new_svm.predict(test_features[j])
            if res == test_labels[j]:
                rate += 1
        print(rate / length)
        if "SVM" not in count:
            count["SVM"] = rate / length
        else:
            count["SVM"] += rate / length

        all_features, all_labels = now_provider.getFeatureAndLabel(
            all_lines, feature_len)

    print("\nAverage:")
    for x in count:
        print(x, end=": ")
        print(count[x] / 10)
Example No. 10
### RANDOM FOREST ###
rf = RandomForest(X_train, y_train)

rf.predict(X_test)
rf.set_prediction_data()
#rf.plot_num_deaths_per_age()
#rf.plot_num_deaths_per_gender()

#rf.ageScore(age)
#rf.genderScore(gender)
#rf.deathScore(death)
###############################

### KNN ###
knn = Knn(X_train, y_train)

knn.predict(X_test)
knn.set_prediction_data()
#knn.plot_num_patient_neg_summary_based_on_gender()
#knn.plot_num_patient_neg_summary_baseg_on_age()
#knn.plot_num_patient_neg_summary_based_on_is_from_wuhan()

#knn.ageScore(age)
#knn.genderScore(gender)
#knn.deathScore(death)
###################################

T_1 = df.drop(columns=[
    'reporting date', 'summary', 'location', 'country', 'symptom', 'death'
])
Example No. 11
print(
    "Testing data (X_test) has {} elements\nX_test.shape = {}\nFirst sample: {}"
    .format(X_test.size, X_test.shape, X_test[0]))
print(
    "Testing target (y_test) has {} elements\ny_test.shape = {} --> A single array of 30 elements\n"
    .format(y_test.size, y_test.shape))

print("target_labels = {}".format(y_test))

### view data
# plt.figure()
# plt.scatter(X[:, 0], X[:, 1], c=y, cmap=cmap, edgecolor='k', s=20)
# plt.show()

# a = [1, 1, 1, 2, 2, 3, 4, 5, 6]
# from collections import Counter
# most_common = Counter(a).most_common(1)
# print(most_common[0][0])

from KNN import Knn
clf = Knn(k=5)  # instantiate a Knn classifier (clf), passing in the number of neighbors (default is 3)
clf.fit(X_train, y_train)  # pass the training data to your Knn classifier
predictions = clf.predict(X_test)

# predict() algorithm (a sketch implementation follows below)...
# 1) calculate the distance between X_test and every training data entry
# 2) find the k nearest training entries, i.e. the k smallest Euclidean distances
# 3) map those k entries to their flower class labels and select the most common one
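
The KNN module imported above is project-local and not shown; the sketch below is one plausible implementation of the three steps just listed (plain NumPy, majority vote via Counter). The class name KnnSketch is deliberately different, since this is an assumption about the file's contents, not the file itself.

import numpy as np
from collections import Counter

class KnnSketch:
    def __init__(self, k=3):
        self.k = k

    def fit(self, X, y):
        # lazy learner: just memorize the training data
        self.X_train, self.y_train = np.asarray(X), np.asarray(y)

    def predict(self, X):
        return np.array([self._predict_one(x) for x in np.asarray(X)])

    def _predict_one(self, x):
        # 1) Euclidean distance from x to every training entry
        dists = np.sqrt(((self.X_train - x) ** 2).sum(axis=1))
        # 2) indices of the k smallest distances
        nearest = np.argsort(dists)[:self.k]
        # 3) majority vote among the corresponding labels
        return Counter(self.y_train[nearest]).most_common(1)[0][0]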

acc = np.sum(predictions == y_test) / len(y_test)
print("accuracy = {}".format(acc))