def fit(self, X, y):
        """ Partitions the data into k-folds."""
        # Split data into n_folds
        self.kFold = KFold(n_folds=self.n_folds,
                           random_state=self.random_state)
        self.kFold.generate_data(X, y)

        # Determine k_max, the maximum value of k given N and the number of folds.
        N = X.shape[0]
        fold_size = math.floor(N / self.n_folds)
        self.k_max = (fold_size * (self.n_folds - 1)) - 1

        # Iterate over each value of k, storing the mean validation score across folds
        k_scores = []
        for i in range(1, self.k_max + 1):
            model = knn(i)
            fold_scores = []
            for j in range(self.n_folds):
                data = self.kFold.get_fold_data(j)
                model.fit(data['X_train'], data['y_train'])
                fold_scores.append(model.score(data['X_val'], data['y_val']))
            k_scores.append(np.mean(fold_scores))

        # Obtain best k
        self.best_k = self._get_best_k(k_scores)
        self.model = knn(self.best_k)
        self.model.fit(X, y)
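The `_get_best_k` helper called above is not shown in this excerpt; a minimal sketch, assuming it simply returns the 1-based k with the highest mean validation score:

import numpy as np

def _get_best_k(self, k_scores):
    # k values start at 1, so the best k is the argmax index plus one
    return int(np.argmax(k_scores)) + 1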
Example #2
File: klwp.py Project: bnithish/ml_asgn2
def doit(X, k):
    x, y = loadData("train", 225)
    x = x.toarray()

    train_x = x[0:10000]
    train_y = y[0:10000]

    test_x = x[9000:10000]
    test_y = y[9000:10000]

    model = lwp()
    model.fit(train_x, train_y)
    prediction = model.predict(test_x)
    cent = model.centroids_
    clas = model.classes_
    #  print(cent.shape)
    #  print(clas)
    neigh = knn(n_neighbors=k)
    neigh.fit(cent, clas)
    kn = neigh.kneighbors(X.toarray())[1]  # indices of the k nearest centroids
    #  correct = 0
    #  wrong = 0
    #  for i in range(1000):
    #  print(test_y[i],clas[kn[i]])
    #  if test_y[i] in clas[kn[i]]:
    #  correct = correct+1
    #  else:
    #  wrong = wrong+1
    #  print(correct,wrong)
    return clas[kn]
Example #3
def full_knn(iris, num_features=4):
    """Perform knn classification on iris dataset using given number of
    feature dimensions (default = 4), shows results."""

    # perform projection
    iris.data = iris.data[:, :num_features]

    # screw up scaling! (knn can be sensitive to feature scaling)
    # iris.data[:, :1] *= 100000000

    # perform train/test split
    tts = cv.train_test_split(iris.data, iris.target, train_size=TRAIN_PCT)
    train_features, test_features, train_labels, test_labels = tts

    # initialize model, perform fit
    clf = knn(n_neighbors=NUM_NBRS)
    clf.fit(train_features, train_labels)

    # get accuracy (predictions made internally)
    acc = clf.score(test_features, test_labels)

    # get conf matrix (requires predicted labels)
    predicted_labels = clf.predict(test_features)
    cm = confusion_matrix(test_labels, predicted_labels)

    print 'k = {0}'.format(NUM_NBRS)
    print 'num_features = {0}'.format(num_features)
    print 'accuracy = {0} %\n'.format(round(100 * acc, 2))
    print 'confusion matrix:\n', cm, '\n'
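A possible call site for this snippet, assuming the module-level constants it references (TRAIN_PCT, NUM_NBRS) and the usual scikit-learn iris loader:

from sklearn import datasets

TRAIN_PCT = 0.7   # assumed train/test split fraction
NUM_NBRS = 5      # assumed number of neighbors

full_knn(datasets.load_iris(), num_features=2)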
Example #4
	def Train_Data(self):
		feature = []
		Temp = pd.read_csv("/home/cse/Work/Dataset/idf.csv")
		for i in Temp:
			if self.check == 0:
				self.attribute.append(i)
		self.check = 1
		data = pd.read_csv("/home/cse/Work/Dataset/data.csv")
		for i in data:
			feature.append(i)
		feature = feature[:-1]
		X = data[feature]
		Y = data['label']

		self.train_data, self.test_data, self.label_data, self.label_test = train_test_split(X,Y,test_size=0.3,random_state=1)
		print("Training model......")
		self.model = svm.LinearSVC(random_state=0)
		self.model.fit(self.train_data,self.label_data)

		self.modelknn = knn(n_neighbors=7)
		self.modelknn.fit(self.train_data,self.label_data)

		joblib.dump(self.model,"/home/cse/Work/Dataset/MODEL.pkl")
		joblib.dump(self.modelknn,"/home/cse/Work/Dataset/MODELKNN.pkl")
		print("Finish training.")
		print()
		return
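For later prediction the dumped models can be loaded back; a small sketch (assuming the standalone joblib package, or sklearn.externals.joblib on older scikit-learn versions):

import joblib

svm_model = joblib.load("/home/cse/Work/Dataset/MODEL.pkl")
knn_model = joblib.load("/home/cse/Work/Dataset/MODELKNN.pkl")
# both expose the usual scikit-learn API, e.g. knn_model.predict(test_features)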
Example #5
def knnOptimization(cmMetric='accuracy'):
    kvals = [i for i in range(1, 15)]
    kmodels = {}
    kpreds = {}
    kpreds_prob = {}
    cutoffgrid = np.linspace(0, 1, 100)
    numk = []
    for k in kvals:
        tknn = knn(n_neighbors=k).fit(XS, y)
        kmodels[k] = tknn

        tknn_preds = tknn.predict(XS)
        kpreds[k] = tknn_preds

        tknn_preds_prob = tknn.predict_proba(XS)
        kpreds_prob[k] = tknn_preds_prob[:, 0]  # class-0 probabilities as an array so they can be thresholded elementwise
    for k in kvals:
        tcm = [
            confusionMatrixInfo(kpreds_prob[k] < i, y, labels=[1, 0])[cmMetric]
            for i in cutoffgrid
        ]
        numk.append(max(tcm))
    # search after the loop: return the (1-based) k whose best cutoff metric is highest
    count = 0
    for x in numk:
        count += 1
        if x == max(numk):
            return count, numk
Example #6
def sk_knn():
    train_labels = []
    train_flist = os.listdir("./digit/trainingDigits")
    train_len = len(train_flist)
    train_mat = np.zeros((train_len, 1024))

    for i, fname in enumerate(train_flist):
        # print(fname)
        flabel = int(fname.split("_")[0])
        train_mat[i, :] = mat2vector("./digit/trainingDigits/{}".format(fname))
        train_labels.append(flabel)
        # break

    print(train_labels)
    print(train_mat)
    knn_instance = knn(n_neighbors=3)  # TODO: n_neighbors <= 5 works best
    knn_instance.fit(train_mat, train_labels)

    test_flist = os.listdir("./digit/testDigits")  # test file list
    err_count = 0
    for fname in test_flist:
        test_label = int(fname.split("_")[0])
        test_mat = mat2vector("./digit/testDigits/{}".format(fname))
        res = knn_instance.predict(test_mat)
        if res != test_label:
            err_count += 1
            print("error,the predict res is {}, the real label is {}".format(
                res, test_label))
            print("./digit/testDigits/{}".format(fname))
    print("the error rate is {}%".format(err_count / len(test_flist) * 100))
Example #7
def knn_pca_std(n_neighbors, n_pca_components, X_train_df, X_test_df,
                y_train_df, y_test_df):
    """
    Function performs  KNN with PCA using standardized data set.
    Inputs:
        -  n_neighbors - number of KNN nearest neighbors
        -  n_pca_components - number of PCA components to use
        -  X_train_df - dataframe containing X data for training
        -  X_test_df - dataframe containing X data for testing
        -  y_train_df - dataframe containing y data for training
        -  y_test_df - dataframe containing y data for testing
    Returns:
        - KNN accuracy for specified K and number of pca components
          values
    """

    # Standardize data
    scaler = preprocessing.StandardScaler()
    scaler.fit(X_train_df)
    X_train_std_df = scaler.transform(X_train_df)
    X_test_std_df = scaler.transform(X_test_df)


    # Conduct KNN - pca_standardized
    KNN = knn(n_neighbors=n_neighbors)
    pca = PCA(n_components=n_pca_components)
    pca.fit(X_train_std_df)
    X_train_std_pca_df = pca.transform(X_train_std_df)
    X_test_std_pca_df = pca.transform(X_test_std_df)
    KNN.fit(X_train_std_pca_df, y_train_df)
    y_pred = KNN.predict(X_test_std_pca_df)
    accuracy = accuracy_score(y_test_df, y_pred)

    return accuracy
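A possible driver for this helper, assuming feature/label dataframes X and y and scikit-learn's train_test_split (the names here are illustrative):

from sklearn.model_selection import train_test_split

X_tr, X_te, y_tr, y_te = train_test_split(X, y, test_size=0.3, random_state=0)
for k in (3, 5, 7):
    for n_comp in (2, 5, 10):
        print(k, n_comp, knn_pca_std(k, n_comp, X_tr, X_te, y_tr, y_te))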
Example #8
def knn_no_pca (n_neighbors, X_train_df, X_test_df, y_train_df, y_test_df):
    """
    Function performs KNN (no PCA) on a standardized data set.
    Inputs:
        -  n_neighbors - number of KNN nearest neighbors
        -  X_train_df - dataframe containing X data for training
        -  X_test_df - dataframe containing X data for testing
        -  y_train_df - dataframe containing y data for training
        -  y_test_df - dataframe containing y data for testing
    Returns:
        - knn accuracy
    """

    # Standardize data
    scaler = preprocessing.StandardScaler()
    scaler.fit(X_train_df)
    X_train_std_df = scaler.transform(X_train_df)
    X_test_std_df = scaler.transform(X_test_df)

    # Perform KNN
    KNN = knn(n_neighbors=n_neighbors)
    KNN.fit(X_train_std_df,y_train_df)
    y_pred = KNN.predict(X_test_std_df)
    accuracy = accuracy_score(y_test_df, y_pred)

    return accuracy
def predict_knn(X_train, X_test, y_train, y_test):
    clf=knn(n_neighbors=3)
    print("knn started")
    clf.fit(X_train,y_train)
    y_pred=clf.predict(X_test)
    calc_accuracy("K nearest neighbours",y_test,y_pred)
    np.savetxt('submission_surf_knn.csv', np.c_[range(1,len(y_test)+1),y_pred,y_test], delimiter=',', header = 'ImageId,Label,TrueLabel', comments = '', fmt='%d')
Example #10
def label_cluster(X, class_labels, center):
    clf = knn(n_neighbors=nbr, algorithm='kd_tree')
    #clf=tree.DecisionTreeRegressor()
    #clf =SVC(kernel='rbf', class_weight='balanced')
    clf.fit(X, class_labels)
    Y = clf.predict(center)
    return Y
def SelectModel(modelname, param):

    if modelname == "SVM":
        from sklearn.svm import LinearSVC
        model = LinearSVC(C=param)
    elif modelname == "GBDT":
        from sklearn.ensemble import GradientBoostingClassifier
        model = GradientBoostingClassifier()
    elif modelname == "RF":
        from sklearn.ensemble import RandomForestClassifier
        model = RandomForestClassifier()
    elif modelname == "KNN":
        from sklearn.neighbors import KNeighborsClassifier as knn
        model = knn()
    elif modelname == "LR":
        from sklearn.linear_model import LogisticRegression
        model = LogisticRegression(C=param)
    elif modelname == 'NB':
        from sklearn.naive_bayes import MultinomialNB
        model = MultinomialNB(alpha=1)
    elif modelname == "Softmax":
        from sklearn.linear_model import LogisticRegression
        model = LogisticRegression(multi_class='multinomial',
                                   solver='lbfgs',
                                   C=param)
    return model
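A brief usage sketch (the data names are illustrative); note that param only affects the SVM, LR and Softmax branches:

model = SelectModel("KNN", param=None)
model.fit(X_train, y_train)
print(model.score(X_test, y_test))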
Example #12
def SelectModel(modelname):

    if modelname == "SVM":

        model = SVC(kernel='rbf', C=16, gamma=0.0313, probability=True)

    elif modelname == "GBDT":

        model = GradientBoostingClassifier()

    elif modelname == "RF":
        from sklearn.ensemble import RandomForestClassifier
        model = RandomForestClassifier(n_estimators=500)

    elif modelname == "XGBOOST":
        from xgboost.sklearn import XGBClassifier
        #import xgboost as xgb
        #model = xgb()
        print('+++++++++++++++++++++++++')
        model = XGBClassifier()

    elif modelname == "KNN":
        from sklearn.neighbors import KNeighborsClassifier as knn
        model = knn()
    elif modelname == "lgb":
        model = lgb.LGBMClassifier(n_estimators=500,
                                   max_depth=15,
                                   learning_rate=0.2)
    else:
        pass
    return model
Example #13
def knnIrisDataSet(X, y, n):

    # create the classifier and fit it to the data
    res = 0.05
    k1 = knn(n_neighbors=n, p=2, metric='minkowski')
    # train on the data
    k1.fit(X, y)

    # set up the mesh grid
    x1_min, x1_max = X[:, 0].min() - 1, X[:, 0].max() + 1
    x2_min, x2_max = X[:, 1].min() - 1, X[:, 1].max() + 1
    xx1, xx2 = np.meshgrid(np.arange(x1_min, x1_max, res),
                           np.arange(x2_min, x2_max, res))

    # make the prediction over the grid
    Z = k1.predict(np.array([xx1.ravel(), xx2.ravel()]).T)
    Z = Z.reshape(xx1.shape)

    # color maps
    cmap_light = ListedColormap(['#FFAAAA', '#AAFFAA', '#AAAAFF'])
    cmap_bold = ListedColormap(['#FF0000', '#00FF00', '#0000FF'])

    # plot the decision surface
    plt.contourf(xx1, xx2, Z, alpha=0.4, cmap=cmap_light)
    plt.xlim(xx1.min(), xx1.max())
    plt.ylim(xx2.min(), xx2.max())

    # plot the sample points
    for idx, cl in enumerate(np.unique(y)):
        plt.scatter(X[:, 0], X[:, 1], c=y, cmap=cmap_bold)

    plt.show()
def knnDemo(X, y, n):

    # creates the classifier and fits it to the data
    res = 0.05
    k1 = knn(n_neighbors=n, p=2, metric='minkowski')
    k1.fit(X, y)

    #sets up the grid
    x1_min, x1_max = X[:, 0].min() - 1, X[:, 0].max() + 1
    x2_min, x2_max = X[:, 1].min() - 1, X[:, 1].max() + 1
    xx1, xx2 = np.meshgrid(np.arange(x1_min, x1_max, res),
                           np.arange(x2_min, x2_max, res))

    #makes the prediction
    Z = k1.predict(np.array([xx1.ravel(), xx2.ravel()]).T)
    Z = Z.reshape(xx1.shape)

    #creates the color map
    cmap_light = ListedColormap(['#FFAAAA', '#AAFFAA', '#AAAAFF'])
    cmap_bold = ListedColormap(['#FF0000', '#00FF00', '#0000FF'])

    #Plots the decision surface
    plt.contourf(xx1, xx2, Z, alpha=0.4, cmap=cmap_light)
    plt.xlim(xx1.min(), xx1.max())
    plt.ylim(xx2.min(), xx2.max())

    #plots the samples
    for idx, cl in enumerate(np.unique(y)):
        plt.scatter(X[:, 0], X[:, 1], c=y, cmap=cmap_bold)

    plt.show()
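A possible call for these plotting helpers, which expect a two-column X (a sketch, not part of the original file):

from sklearn import datasets

iris = datasets.load_iris()
knnDemo(iris.data[:, :2], iris.target, 15)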
def sklearn_handwritingTest():
    # collect the training features
    train_mat = []
    train_labels = []
    for sub in os.listdir(train_digits):
        label = sub.split('_')[0]
        train_labels.append(label)

        vector = img2vector(os.path.join(train_digits, sub))
        train_mat.append(vector)
    train_mat = np.array(train_mat)

    neigh = knn(n_neighbors=3)
    neigh.fit(train_mat, train_labels)

    # run the test
    error = 0.0
    total = 0.0
    for sub in os.listdir(test_digits):
        total += 1
        label = sub.split('_')[0]
        vector = img2vector(os.path.join(test_digits, sub))

        vector = np.array(vector).reshape(1, -1)  # 1d->2d
        pred_label = neigh.predict(vector)
        if pred_label != label:
            error += 1

    print('error rate: {}/{}'.format(error, total))
def handwritingClassTest():
    hwlabels = []
    trainingFileList = listdir("../../data/mnist/trainingDigits")
    # number of training files
    m = len(trainingFileList)
    # initialize the training matrix
    trainMat = np.zeros((m, 1024))
    # parse the training labels from the file names
    for i in range(m):
        fileNameStr = trainingFileList[i]
        classNumber = int(fileNameStr.split('_')[0])
        hwlabels.append(classNumber)
        trainMat[i, :] = img2vector("../../data/mnist/trainingDigits/" +
                                    fileNameStr)
    # build the knn classifier
    neign = knn(n_neighbors=3, algorithm='auto')
    # fit the model
    neign.fit(trainMat, hwlabels)
    # list the files in the testDigits directory
    testFileList = listdir("../../data/mnist/testDigits")
    errorCount = 0.0
    mTest = len(testFileList)
    # parse the test labels from the file names and classify each test file
    for i in range(mTest):
        fileNameStr = testFileList[i]
        classNumber = int(fileNameStr.split('_')[0])
        vectorUnderTest = img2vector("../../data/mnist/testDigits/" +
                                     fileNameStr)
        # get the prediction
        classifierResult = neign.predict(vectorUnderTest)
        print("predicted: %d\tactual: %d" % (classifierResult, classNumber))
        if (classifierResult != classNumber):
            errorCount += 1
    print("总共错了%d个数据\n错误率为%f%%" % (errorCount, errorCount / mTest * 100))
Example #17
File: knn.py Project: Adusei/science
def full_knn(iris, num_features=4):
    """Perform knn classification on iris dataset using given number of
    feature dimensions (default = 4), shows results."""

    # perform projection
    iris.data = iris.data[:, :num_features]

    # screw up scaling! (knn can be sensitive to feature scaling)
    # iris.data[:, :1] *= 100000000

    # perform train/test split
    tts = cv.train_test_split(iris.data, iris.target, train_size=TRAIN_PCT)
    train_features, test_features, train_labels, test_labels = tts

    # initialize model, perform fit
    clf = knn(n_neighbors=NUM_NBRS)
    clf.fit(train_features, train_labels)

    # get accuracy (predictions made internally)
    acc = clf.score(test_features, test_labels)

    # get conf matrix (requires predicted labels)
    predicted_labels = clf.predict(test_features)
    cm = confusion_matrix(test_labels, predicted_labels)

    print 'k = {0}'.format(NUM_NBRS)
    print 'num_features = {0}'.format(num_features)
    print 'accuracy = {0} %\n'.format(round(100 * acc, 2))
    print 'confusion matrix:\n', cm, '\n'
Example #18
def getKNN(trainX, trainY):
    from sklearn.neighbors import KNeighborsClassifier as knn
    trainX = np.array(trainX)
    trainY = np.array(trainY)
    model = knn(n_neighbors=30, weights='distance')
    model.fit(trainX, trainY.ravel())
    return model
Example #19
def knnclassify(traindata, trainlabel, target):
    """
    use knn(3) by default
    trainingdata and target has be to numpy array
    """
    model = knn().fit(traindata, trainlabel)
    prediction = model.predict(target)

    return prediction
Example #20
    def remove_outliers(self, X, k=20, q=0.1):
        # fit an unsupervised nearest-neighbours model; k + 1 because each point
        # is returned as its own closest neighbour
        nneigh = knn(k + 1)
        nneigh.fit(X)
        # distances to the k nearest neighbours, dropping the self-distance
        dist = nneigh.kneighbors(X, return_distance=True)[0][:, 1:]
        # inverse mean neighbour distance as a simple local density estimate
        dens = 1 / np.mean(dist, axis=1)
        # keep points whose density is at or above the q-th quantile
        keepers = (dens >= np.quantile(dens, q))
        return keepers
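Here knn is evidently an unsupervised neighbours model (it is fitted without labels), presumably sklearn.neighbors.NearestNeighbors. The returned boolean mask can then filter the data, e.g. (obj stands for an instance of the surrounding class):

keep = obj.remove_outliers(X, k=20, q=0.1)
X_clean, y_clean = X[keep], y[keep]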
Example #21
File: win_knn.py Project: Adusei/science
def run_knn(train_features, train_labels,test_features, test_labels, iteration=None,k=None):
    clf = knn(n_neighbors = k)
    clf.fit(train_features, train_labels)

    acc = clf.score(test_features, test_labels)

    predicted_labels = clf.predict(test_features)
    cm = confusion_matrix(test_labels, predicted_labels)

    CROSS_VAL_ARRAY.append(acc)
Example #22
def run_knn():
	clf=knn(n_neighbors=3)
	print("knn started")
	clf.fit(x,y)
	#print(clf.classes_)
	#print clf.n_layers_
	pred=clf.predict(x_)
	#print(pred)
	np.savetxt('submission_knn.csv', np.c_[range(1,len(test)+1),pred,label_test], delimiter=',', header = 'ImageId,Label,TrueLabel', comments = '', fmt='%d')
	calc_accuracy("K nearest neighbours",label_test,pred)
Example #23
def test_data(X_test, center, y1):
    clf = knn(n_neighbors=1, algorithm='auto')
    #neigh.fit(center,y1)
    #Y=neigh.predict(X_test)
    #clf=RandomForestClassifier()
    #clf =SVC(kernel='rbf', class_weight='balanced')
    #clf=tree.DecisionTreeRegressor()
    clf.fit(center, y1)
    Y = clf.predict(X_test)
    return Y
Example #24
def find_knn_k( max_k = max_knn):
#this function fits the knn model and finds the highest accuracy value and index 

	#initialize results set
	all_fpr, all_tpr, all_auc, all_acc, all_cm = (np.zeros(max_k), np.zeros(max_k), np.zeros(max_k),np.zeros(max_k), np.zeros(max_k))
	

	#Perform CV to find the best value of k
	for i in xrange(max_k):
		
		#randomize data
		#perm = np.random.permutation(len(labels))
		#features = features.iloc[perm]
		#lables = labels.iloc[perm]

		# perform train/test split
		tts = cv.train_test_split(features,labels, train_size=train_pct)
		train_features, test_features, train_labels, test_labels = tts
		#print test_features, '\n'

    	#initialize model, perform fit
		kclf = knn(n_neighbors=i+1)
		kclf.fit(train_features,train_labels)

		# get conf matrix (requires predicted labels)
		predicted_labels = kclf.predict(test_features)
		cm = confusion_matrix(test_labels, predicted_labels)

		
		#calc ROC, AUC, and accuracy
		fpr, tpr, thresholds = roc_curve(test_labels,predicted_labels, pos_label=1)
		roc_auc = auc(fpr,tpr)
		
		#get model accuracy 
		acc = kclf.score(test_features, test_labels)

    	#Put all stats in arrays
		all_fpr[i] = fpr[1]
		all_tpr[i] = tpr[1]
		all_auc[i] = roc_auc
		all_acc[i] = acc 
		#all_cm[i] = cm
		#all_k[i] = all_acc.argmax(axis=0)
		
		#print i
		#print 'confusion matrix:\n', cm, '\n'
		

	print 'Accuracy Matrix = \n', all_acc
	#print np.mean(all_acc)
	print '\nMax accuracy = {0}'.format(max(all_acc))
	print '\nK = {0}'.format(all_acc.argmax(axis=0) + 1)
	#print len(all_acc)
	#print all_k
	return all_acc, max_k, predicted_labels, test_labels
Example #25
    def __init__(self):
        """ Constructor.

            Arg:
                name the name of the classifier
        """
        super().__init__("KNN")
        grid_parameters = {'n_neighbors': range(2, 15)}
        self.knn = GridSearchCV(
            knn(), grid_parameters, cv=3, iid=False
        )  # least populated class in y has only 3 members, so cv is set to 3
Example #26
def elbow_curve(k):
    empty_lst = []   #empty list

    for i in k:   #instance for knn
        clf = knn(n_neighbors=i)
        clf.fit(train_x,train_y)
        tmp = clf.predict(test_x)
        tmp = m.accuracy_score(tmp,test_y)
        error = 1-tmp
        empty_lst.append(error)
   
    return empty_lst
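The error list is usually plotted against k to locate the elbow; a short sketch (train_x, train_y, test_x, test_y are the module-level splits the function already relies on):

import matplotlib.pyplot as plt

ks = list(range(1, 26))
errors = elbow_curve(ks)
plt.plot(ks, errors, marker='o')
plt.xlabel('k (n_neighbors)')
plt.ylabel('misclassification error')
plt.show()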
Example #27
def ecur(k):

    error_test = []

    for i in k:
        c = knn(n_neighbors=i)
        c.fit(train_x, train_y)
        tmp = c.predict(test_x)
        tmp = metrics.accuracy_score(tmp, test_y)
        error = 1 - tmp
        error_test.append(error)
    return error_test
Example #28
def eval_acc(X_train):
    global_train = []

    for jj in range(1, 101):
        clf = knn(n_neighbors=jj)
        clf.fit(X_train, Y_train)
        global_train.append(round(clf.score(X_train, Y_train), 2))

    for z in range(0, len(X_train_k)):
        X_train_k.iloc[z, :] = X_train_k.iloc[z, :] + global_train

    return (X_train_k)
def knnhelper(x, y):
    xtrain, xtest, ytrain, ytest = train_test_split(x,
                                                    y,
                                                    test_size=0.30,
                                                    random_state=42,
                                                    stratify=y)

    neig = knn(n_neighbors=2, algorithm='kd_tree')
    neig = neig.fit(xtrain, ytrain)
    ans = neig.predict(xtest)

    TP, FP, TN, FN = perf_measure(ytest, ans)
Example #30
def select_classify():
    return [
        naive(),
        tree(criterion="entropy"),
        knn(n_neighbors=8, weights='uniform', metric="manhattan"),
        mlp(hidden_layer_sizes=(128, ),
            alpha=0.01,
            activation='tanh',
            solver='sgd',
            max_iter=300,
            learning_rate='constant',
            learning_rate_init=0.001)
    ]
Example #31
def getTrainedCLassifier(classifierType, train):
    if classifierType == "naiveBayes":
        from nltk.classify import NaiveBayesClassifier
        trainedClassifier = NaiveBayesClassifier.train(train)
    elif classifierType == "randomForest":
        from sklearn.ensemble import RandomForestClassifier as rfc
        trainedClassifier = SklearnClassifier(rfc(n_estimators=25, n_jobs = 2))
        trainedClassifier.train(train)
    elif classifierType == "knn5":
        from sklearn.neighbors import KNeighborsClassifier as knn
        trainedClassifier = SklearnClassifier(knn(5))
        trainedClassifier.train(train)
    return trainedClassifier
Example #32
def cal_cost_knn(x, trn, trg):
    x = list(map(int, np.round(x)))
    if sum(x) == 0 : 
        return np.inf, np.inf, 1
    x_index = [i for i in range(len(x)) if x[i]==1]
    trn = trn.reshape(trn.shape[1], -1)
    trn = trn[x_index, :]
    trn = np.transpose(trn)
    clf = knn(n_neighbors=nn)
    clf.fit(trn, trg)
    pre = clf.predict(trn)
    score = acc(pre, trg)
    error = 1 - score
    return (1-alpha)*error + alpha * (sum(x)*1.0/len(x)), error, sum(x)*1.0/len(x)
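A hedged example of scoring one candidate feature subset with this cost function; nn and alpha are module-level settings the function assumes, and trn/trg stand for the training features and labels:

nn, alpha = 5, 0.01                 # assumed globals read inside cal_cost_knn
mask = [1, 0, 1, 1, 0, 1, 1, 0]     # one candidate subset over 8 features
cost, error, frac = cal_cost_knn(mask, trn, trg)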
Example #33
def predict_knn(X, y, X_train, X_test, y_train, y_test):
    clf = knn(n_neighbors=3)
    print("======= KNN =======")
    clf.fit(X_train, y_train)
    pickle.dump(clf, open('knn_trained_new.sav', 'wb'))
    y_pred = clf.predict(X_test)
    calc_accuracy("K nearest neighbours", y_test, y_pred)
    np.savetxt('submission_surf_knn.csv',
               np.c_[range(1,
                           len(y_test) + 1), y_pred, y_test],
               delimiter=',',
               header='ImageId,Label,TrueLabel',
               comments='',
               fmt='%d')
Example #34
def chooseClassification(name):
    print "Choosen classfier:",name
    return {
        'NB': GaussianNB(),
        'ADA': adaBoost(n_estimators=50),
        'RF': rf(n_estimators = 100),
        'KNN': knn(n_neighbors=15, p=1),
        'SVM': svm.SVC(kernel='rbf', probability=True),
        'BAG':BaggingClassifier(n_estimators = 30)#base_estimator=knn(),
                             #bootstrap=True,
                             #bootstrap_features=True,
                             #oob_score=True,
                             #max_features = 10,
                             #max_samples = 100),
        }.get(name, GaussianNB())    # default Gaussian Naive Bayes
Example #35
def find_features(step_num=1,num_cv=5):
#This function finds the best features for each model
#NOTE: This only works for the Logistic Regression model 	

	#initialize model
	model = LR()
	kclf = knn(n_neighbors=15)

	selector_LG = RFECV(model, step=step_num, cv=num_cv)
	selector_LG.fit(features,labels)

	#selector_KNN = RFECV(kclf, step=step_num, cv=num_cv)
	#selector_KNN.fit(features,labels)

	print 'LG features'
	print selector_LG.support_
	print selector_LG.ranking_, '\n'
Example #36
def KNN(trainXY, testXY):
    # clf = knn(n_neighbors=3)
    params = {
        'n_neighbors': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11],
        'weights': ['uniform', 'distance'],
    }
    clf = knn()
    clf = grid_search.GridSearchCV(clf, params)
    print('clf = {:s}'.format(clf))
    clf.fit(trainXY[0], trainXY[1][:, 0])
    print('clf.best_estimator_ = {:s}'.format(clf.best_estimator_))
    clf = clf.best_estimator_
    prd = clf.predict(testXY[0])
    print('prd = {:s}'.format(prd))
    print('ans = {:s}'.format(testXY[1][:, 0].transpose()))
    print('accuracy = {:f}'.format(
        accuracy(prd, testXY[1][:, 0].transpose())))
def knnTrain(datafile,featureNum,fold = 10):
	import sys
	train,test = loaddata(datafile)
	row,col = train['counts'].shape
	if col < featureNum:
		featureNum = col 
	X_train = train['counts'][:,0:featureNum]
	y_train = train['labels'][0,:]
	X_test = test['counts'][:,0:featureNum]
	y_test = test['labels'][0,:]
	tuned_parameters = [{'n_neighbors':[2,3,4,6,10,15,18,20,30,40,50]}]
	model = knn(n_neighbors = 1)	 
	categories = train['category']
	feature_names =np.array([k.strip() for k in train['feature_names']])
	data = [X_train,y_train,X_test,y_test,categories,feature_names,featureNum,model,tuned_parameters,fold]
	clf,accuracy = cross_validation(*data)
	return accuracy
Example #38
def plot_dby(iris):
    """Performs knn classification on projected iris dataset, plots results as
    well as decision boundaries."""

    # project features into 2-dim space (for viz purposes)
    # NOTE "projection" just means that we're dropping the other features...this is
    #      not the same thing as "feature selection" (which requires more care)
    #      or "dimensionality reduction" (which requires more math)
    X = iris.data[:, :2] #lop off two last columns
    y = iris.target
    
    # initialize & fit knn model
    clf = knn(n_neighbors=NUM_NBRS)
    clf.fit(X, y)

    # create x, y mesh to plot decision boundaries
    x_min = -1 + X[:, 0].min()
    y_min = -1 + X[:, 1].min()

    x_max = 1 + X[:, 0].max()
    y_max = 1 + X[:, 1].max()

    xx, yy = np.meshgrid(np.arange(x_min, x_max, MESH_SIZE),
        np.arange(y_min, y_max, MESH_SIZE))

    # create predictions & reshape to fit mesh
    preds = clf.predict(np.c_[xx.ravel(), yy.ravel()]) #
    preds = preds.reshape(xx.shape)
        # no train vs test - > because the test set is
        # every point in the 2d plane

    # plot prediction results
    pl.figure()
    pl.pcolormesh(xx, yy, preds, cmap=COLORS_1)

    # plot training examples
    pl.scatter(X[:, 0], X[:, 1], c=y, cmap=COLORS_2)

    # set additional plot parameters
    pl.xlim(xx.min(), xx.max())
    pl.ylim(yy.min(), yy.max())
    pl.title('knn classification of iris dataset (k = {0})'.format(NUM_NBRS))
    
    pl.show(1)
Example #39
def sentPred(trainfile, testfile, result, report):
    traindata = np.loadtxt(trainfile)
    testdata = np.loadtxt(testfile)

    x_train = traindata[:,1:]
    y_train = traindata[:,0]

    y_pred_stan = traindata[:,-1]
    score_train_stan = ascore(y_train, y_pred_stan)
    rep_train_stan = prf(y_train, y_pred_stan, average=None)

    clf_lda = lda()
    clf_lda.fit(x_train, y_train)
    y_pred_lda = clf_lda.predict(x_train)
    score_train_lda = ascore(y_train, y_pred_lda)
    rep_train_lda = prf(y_train, y_pred_lda, average=None)
    test_pred_lda = clf_lda.predict(testdata)

    clf_log = log()
    clf_log.fit(x_train, y_train)
    y_pred_log = clf_log.predict(x_train)
    score_train_log = ascore(y_train, y_pred_log)
    rep_train_log = prf(y_train, y_pred_log, average=None)
    test_pred_log = clf_log.predict(testdata)

    clf_knn = knn(n_neighbors = 1)
    clf_knn.fit(x_train, y_train)
    y_pred_knn = clf_knn.predict(x_train)
    score_train_knn = ascore(y_train, y_pred_knn)
    rep_train_knn = prf(y_train, y_pred_knn, average=None)
    test_pred_knn = clf_knn.predict(testdata)

    separator = np.array((9,))
    test_pred = np.concatenate((test_pred_lda,separator,test_pred_log,separator,test_pred_knn))
    np.savetxt(result, test_pred, fmt='%i')

    np.savetxt(report, rep_train_stan + rep_train_lda + rep_train_log + rep_train_knn, fmt = '%10.5f')

    f = open(report, 'ab')
    f.write('stan: ' + str(score_train_stan) + '\n')
    f.write('lda: '  + str(score_train_lda)  + '\n')
    f.write('log: '  + str(score_train_log)  + '\n')
    f.write('knn: '  + str(score_train_knn)  + '\n')
    f.close()
Example #40
def train_and_classify(nn=5, X = records, y = labels, n_folds=3):
    kf = cv.KFold(n=len(X), n_folds=n_folds, shuffle=True)
    
    accs = []
    for k, (train_idxs, test_idxs) in enumerate(kf):
        # Get all train/test samples for this fold
        print "*"*10 + "kNN" + "*"*10
        print str(train_idxs)
        print str(test_idxs)
        train_X = X.loc[train_idxs]
        train_y = y.loc[train_idxs]
        
        test_X = X.loc[test_idxs]
        test_y = y.loc[test_idxs]

        # Train the model
        model = knn(n_neighbors=nn)
        model.fit(train_X, train_y)

        # Test the model
        acc = model.score(test_X, test_y)
        print str(acc)
        accs.append(acc)

        pred_y = model.predict(test_X)
        cm = confusion_matrix(test_y, pred_y)
        print str(cm)

        # Train the model with LR
        print "*"*10 + "LR" + "*"*10
        modelLR = LR()
        modelLR.fit(train_X, train_y)

        # Test the model with LR
        accLR = modelLR.score(test_X, test_y)
        print str(accLR)

        pred_y = modelLR.predict(test_X)
        cmLR = confusion_matrix(test_y, pred_y)
        print str(cmLR)
Example #41
def knnDemo(X, y, n):  # creates the classifier and fits it to the data
    res=0.05
    k1 = knn(n_neighbors=n,p=2,metric='minkowski')
    k1.fit(X,y)
    #sets up the grid
    x1_min, x1_max = X[:, 0].min() - 1, X[:, 0].max() + 1
    x2_min, x2_max = X[:, 1].min() - 1, X[:, 1].max() + 1
    xx1, xx2 = np.meshgrid(np.arange(x1_min, x1_max, res),np.arange(x2_min, x2_max, res))
    # makes the prediction
    Z = k1.predict(np.array([xx1.ravel(), xx2.ravel()]).T)
    Z = Z.reshape(xx1.shape)
    # creates the color map
    cmap_light = ListedColormap(['#FFAAAA', '#AAFFAA', '#AAAAFF'])
    cmap_bold = ListedColormap(['#FF0000', '#00FF00', '#0000FF'])
    # Plots the decision surface
    plt.contourf(xx1, xx2, Z, alpha=0.4, cmap=cmap_light)
    plt.xlim(xx1.min(), xx1.max())
    plt.ylim(xx2.min(), xx2.max())
    # plots the samples
    for idx, cl in enumerate(np.unique(y)):
        plt.scatter(X[:, 0], X[:, 1], c=y, cmap=cmap_bold)
    plt.show()
#################### SEPARATING EVALUATION DATA #########################
#X_cv, X_eval, y_cv, y_eval = cross_validation.train_test_split(all_feature_matrix, y, test_size=0.2, random_state=0)


###############################################################################
# Classification 

# Run classifier with cross-validation and plot ROC curves
#
folds=10
cv = StratifiedKFold(y_all, n_folds=folds,shuffle=True)
#cv_shufflesplit=cross_validation.ShuffleSplit(len(y_all),1,test_size=0.2,train_size=None, random_state=0)
#classifier = svm.SVC(kernel='linear', probability=True)
#classifier = RandomForestClassifierWithCoef(RandomForestClassifier)
classifier=knn(n_neighbors=3)

all_indexes=[]
index_list=[]

y_test_report=[];
y_predicted_report=[]
y_proba_report=[]

for i, (train, test) in enumerate(cv):
    ## prepare and normalize test train matrices    
    normalized_matrix_train=cl.normalise_mean_var(all_feature_matrix[train])
    normalised_matrix_test=cl.normalise_mean_var(all_feature_matrix[test])
    
    y_predicted2=[]
    
Example #43
def model_rank_loo():
#This function uses a LOO cv iterator and fits a knn and logistic regression model to rank performance for model selection

	#get cv iterator 
	kfloo = cv.LeaveOneOut(num_recs)

	#result set for Leave One Out CV
	LG_fpr, LG_tpr, LG_auc, LG_acc = (np.zeros(num_recs), np.zeros(num_recs), np.zeros(num_recs),np.zeros(num_recs))
	KNN_fpr, KNN_tpr, KNN_auc, KNN_acc = (np.zeros(num_recs), np.zeros(num_recs), np.zeros(num_recs),np.zeros(num_recs))

	for i,(traini, testi) in enumerate(kfloo):
		
		#initialize model
		model = LR()
		kclf = knn(n_neighbors=15)

		#make sure the records don't have null values
		train_features = features.iloc[traini].dropna()

		#train labels for LOO CV
		train_labels = labels.iloc[traini]

		test_features = features.iloc[testi].dropna()

		#test labels for LOO CV
		test_labels = labels.iloc[testi]


		#initialize model, perform fit
		kclf.fit(train_features,train_labels)
		results_LG = model.fit(train_features,train_labels)

		#predict the labels
		predict_LG = results_LG.predict(test_features)
		predict_KNN = kclf.predict(test_features)

		print 'Index =', i, '\n'
		print 'Logistic Regression Classifier Stats \n'
		print 'True class'
		print test_labels, '\n'
		print 'Predicted Class'
		print predict_LG[0]
		print '\n'*2

		print '+' * 80

		print 'KNN Classifier Stats \n'
		print 'True class'
		print test_labels, '\n'
		print 'Predicted Class'
		print predict_KNN[0]
		print '\n'*2

		#Here we update the results arrays with 1 if our classifier was correct, 
		#later we take the average of the arrays - this give us the approximate accuracy of each model 
		if test_labels == predict_LG[0]:
			LG_acc[i] = 1
		else:
			LG_acc[i] = 0 


		if test_labels == predict_KNN[0]:
			KNN_acc[i] = 1
		else:
			KNN_acc[i] = 0 

		print '*' * 80, '\n', '*' * 80
	

	print '\n', '@_' * 40 
	print 'Logistic Regression Model accuracy on trials =\n', LG_acc, '\n'
	print 'Mean LG accuracy = {0}'.format(np.mean(LG_acc)), '\n'
	print 'KNN Model accuracy on trials =\n' ,KNN_acc, '\n'
	print 'Mean KNN accuracy = {0}'.format(np.mean(KNN_acc))
	print '\n' * 2
imgsize = 28

td = np.array(td_df)
tc = np.array(tc_df)
tsd = np.array(tsd_df)
test_data = np.array(test_data_df)
print test_data.shape
test_data = np.reshape(test_data,(test_data.shape[0],imgsize,imgsize))
test_class = np.array(test_class_df)

dtsize = test_data.shape[0]

for nb in range(80,79,-1):
    mdl = SVC(C=c,kernel='rbf',degree=1,tol=0.0001)
    mdl = rfc(n_estimators=100,criterion='entropy',min_samples_leaf=5,min_samples_split=10,max_features=8)
    mdl = knn(n_neighbors=nb)
    mdl.fit(td,tc)
    for i in range(dtsize):
    
        td_index = []
        for k in range(repl_fact):
            td_index.append( dtsize*k + i)
            
        tsd_1 = np.array(tsd[td_index,:])
     
        
        tst_class_act=test_class[i]
        tst_class_pred_df = pd.DataFrame(mdl.predict(tsd_1))
        #print tst_class_pred
        try:
            tst_class_pred_l = list(tst_class_pred_df.mode().iloc[0])
feature_train = npzfile['x']
label_train = npzfile['y']

npzfile = np.load('feature_test_ECG_MTS.npz')
feature_test = npzfile['x']
label_test = npzfile['y']
train_num = feature_train.shape[0]
windowWidth = 0.05 #0.01
numSymbols = 3  #12
alphabetSize = 12 #7
feature = np.vstack((feature_train, feature_test))
bops = bop_vec(feature, windowWidth = int(windowWidth * feature.shape[1]), numSymbols = numSymbols, alphabetSize= alphabetSize )
bop_train = bops[:train_num]
bop_test = bops[train_num:]
#%%
clf = knn(n_neighbors=1)
clf.fit(bop_train, label_train)
print clf.score(bop_train, label_train)
print clf.score(bop_test, label_test)
#%%
#%%
clf = svm.LinearSVC()
clf.fit(bop_train, label_train)
svctrain = cross_validation.cross_val_score(clf, bop_train, label_train, cv=len(label_train)*2/3)
svctrain = np.mean(svctrain)
svctest = clf.score(bop_test, label_test)
print clf.score(bop_train, label_train)
print clf.score(bop_test, label_test)
print svctrain, svctest
#%%
clf = knn(n_neighbors=3)
Example #46
File: wine.py Project: Gfif/ml.yandex
import pandas as pn
import numpy as np

from sklearn.cross_validation import KFold, cross_val_score
from sklearn.neighbors import KNeighborsClassifier as knn
from sklearn.preprocessing import scale

data = pn.read_csv('wine.data')
fields = [str(i) for i in xrange(2, 15)]

X = data[fields]
Y = data['1']

kf = KFold(len(data), n_folds=5, shuffle=True, random_state=42)

results = []
for k in xrange(1, 50):
  results.append((k, np.mean(cross_val_score(estimator=knn(n_neighbors=k), cv=kf, X=X, y=Y)))) 

print max(results, key=lambda x: x[1])

X = scale(X)
results = []
for k in xrange(1, 50):
  results.append((k, np.mean(cross_val_score(estimator=knn(n_neighbors=k), cv=kf, X=X, y=Y)))) 

print max(results, key=lambda x: x[1])
Example #47
test_labels = test['test_label']
test_edges = test['test_edges']
# Preprocessing normalize data
scaler = StandardScaler()
scaler.fit(train_data)
train_data = scaler.transform(train_data)
#Preprocessing RandomizePCA
#pca = RandomizedPCA(n_components=15)
#pca.fit(train_data)

scaler.fit(valid_data)
valid_data = scaler.transform(valid_data)
scaler.fit(test_data)
test_data = scaler.transform(test_data)
#valid_data = pca.transform(valid_data)
clf = knn(n_neighbors=21, p=1)
clf = clf.fit(train_data,train_labels.ravel())
print clf.score(valid_data,valid_labels.ravel())
print clf.score(test_data,test_labels.ravel())
"""
for file_num in range(210,213):#test_files_count):
    # see test results
    sp_file_names = data['sp_file_names'][file_num].strip()
    im_file_names = data['im_file_names'][file_num].strip()

    # Extract features from image files
    fe = Feature()
    fe.loadImage(im_file_names)
    fe.loadSuperpixelImage()
    test_data = fe.getFeaturesVectors()
   # edges, feat = fe.getEdges()
Example #48
from sklearn.svm import SVC
iris_data = datasets.load_iris()
print iris_data
dt = iris_data.data
lbls = iris_data.target


#train a KNN and see how it performs. Keep 50000 for training, 10000 for validation and 10000 for the final test.


num_fold = 10
gen_k_sets = StratifiedKFold(lbls,num_fold)
ab = []

for nb2 in range(1,31,1):
    mdl2 = knn(n_neighbors=nb2)
    for nb in range(11,12,1):
        
        dst_mdl = nn(n_neighbors=nb)
        overall_mis = 0
        mdl = SVC(C=1.0)
        #mdl = rfc(n_estimators=100)
        #mdl = knn(n_neighbours=1)
        
        for train_index, test_index in gen_k_sets:   
            train_data, test_data = dt[train_index], dt[test_index]
            train_class, test_class = lbls[train_index], lbls[test_index]
            tr_dts =[]
            tr_clses=[]
            print
            for k in range(3):
Example #49
File: lab2.py Project: radubl/CS342
	scaleInput = (scaleInput - mean) / std

	return scaleInput

# takes the vectors of results and returns the fraction that match
def accuracy(predicted, actual):
	trues = 0
	for x in range(len(predicted)):
		if predicted[x] == actual[x]:
			trues += 1

	# float division so the ratio is not truncated under Python 2
	return float(trues) / len(predicted)
			

diabetesData = pd.read_csv("diabetes.csv",header=0);

classValues = diabetesData["class"]

# print accuracy(classValues,classValues)

del diabetesData["class"]

scaledData = doRelativeScaling(diabetesData)

# print scaledData

neigh = knn(n_neighbors=1)

neigh.fit(scaledData,classValues) 

print(neigh.predict([[1.3, 1.6, 1.9,0.7,5,2,1,5]])) 
Example #50
def model_rank(num_fold=10):
#this function fits a knn and logistic regression model to rank performance for model selection

	#get cv iterators 
	kf = cv.KFold(n=num_recs, n_folds=num_fold, shuffle=True)

	#initialize result set
	LG_fpr, LG_tpr, LG_auc, LG_acc = (np.zeros(num_fold), np.zeros(num_fold), np.zeros(num_fold),np.zeros(num_fold))
	KNN_fpr, KNN_tpr, KNN_auc, KNN_acc = (np.zeros(num_fold), np.zeros(num_fold), np.zeros(num_fold),np.zeros(num_fold))


	for i,(traini, testi) in enumerate(kf):
		
		#initialize model
		model = LR()
		kclf = knn(n_neighbors=15)

		#make sure the records don't have null values
		train_features = features.iloc[traini].dropna()
		train_labels = labels.iloc[traini].dropna()


		test_features = features.iloc[testi].dropna()
		test_labels = labels.iloc[testi].dropna()


		#initialize model, perform fit
		kclf.fit(train_features,train_labels)
		results_LG = model.fit(train_features,train_labels)

		#predict the labels
		predict_LG = results_LG.predict(test_features)
		predict_KNN = kclf.predict(test_features)

		#calc ROC, AUC, and accuracy for LG model
		print 'Logistic Regression Classifier Stats \n'
		print 'True class'
		print test_labels, '\n'
		print 'Predicted Class'
		print '\n'*2

		#NOTE: ROC ANALYSIS ONLY WORKS FOR BINARY CLASSIFICATION PROBLEMS 
		#fpr_LG, tpr_LG, thresholds_LG = roc_curve(test_labels,predict_LG, pos_label=1)
		#roc_auc_LG = auc(fpr_LG,tpr_LG)
		acc_LG = model.score(test_features, test_labels)

		#print 'FPR = {0}'.format(fpr_LG), '\n'
		#print 'TPR = {0}'.format(tpr_LG), '\n'
		#print '\n'

		print 'acc =', acc_LG
		print confusion_matrix(test_labels,predict_LG), '\n'
		print classification_report(test_labels,predict_LG,[1,2,3] ,target_names=targets )
		print 'LG kappa =', kappa(test_labels,predict_LG)
		print '+_' * 40

		print 'KNN Classifier Stats \n'
		print 'True class'
		print test_labels, '\n'
		print 'Predicted Class'
		print predict_KNN
		print '\n'*2

		#LG_fpr[i] = fpr_LG[1]
		#LG_tpr[i] = tpr_LG[1]
		#LG_auc[i] = roc_auc_LG
		LG_acc[i] = acc_LG

		#calc ROC, AUC, and accuracy for KNN model
		#fpr_KNN, tpr_KNN, thresholds_KNN = roc_curve(test_labels,predict_KNN, pos_label=1)
		#roc_auc_KNN = auc(fpr_KNN,tpr_KNN)
		acc_KNN = kclf.score(test_features, test_labels)

		print 'acc =', acc_KNN
		print confusion_matrix(test_labels,predict_KNN), '\n'
		print classification_report(test_labels,predict_KNN,[1,2,3] ,target_names=targets )
		print 'KNN kappa =', kappa(test_labels,predict_KNN)
		print '*' * 80
		print '*' * 80
	
		#KNN_fpr[i] = fpr_KNN[1]
		#KNN_tpr[i] = tpr_KNN[1]
		#KNN_auc[i] = roc_auc_KNN
		KNN_acc[i] = acc_KNN
	print '\n', '@_' * 40 
	print 'Logistic Regression Model accuracy on trials =\n', LG_acc, '\n'
	print 'Mean LG accuracy = {0}'.format(np.mean(LG_acc)), '\n'
	print 'KNN Model accuracy on trials =\n' ,KNN_acc, '\n'
	print 'Mean KNN accuracy = {0}'.format(np.mean(KNN_acc))
Example #51
File: model.py Project: absulier/titanic
#ridge is helpful if we are afraid of making our model too biased by our training data
#lasso is helpful if we are unsure of which features we should try to eliminate

#C is related to the amount of penalty we apply. C is the inverse of alpha. Alpha
#is the multiplier we use to apply penalty. As alpha increases, more penalty is
#added, and the model becomes more sensitive to larger coefficients

#If we change our threshold to .90, we would be much more confident that our
#predicted survivals were true survivals. However, we would become less confident
#that our predicted deaths were true deaths. We would reduce our false positive rate, but
#increase our false negative rate.
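#A hedged sketch of the 0.90-threshold idea (assuming a fitted LogisticRegression
#named logreg, which is not part of this excerpt):
#    probs = logreg.predict_proba(x_test)[:, 1]
#    survived_90 = (probs >= 0.90).astype(int)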


#KNN
#does not perform as well as LogReg, even with GridSeach
knn =knn()
knn.fit(x_train,y_train)
knn.score(x_test,y_test)

gridknn=skgs(knn,{'n_neighbors':range(1,55)},cv=12,scoring='accuracy')
gridknn.fit(x_train,y_train)
print gridknn.best_estimator_
print gridknn.score(x_test,y_test)
#As we use more neighbors, our model becomes more biased because the decision
#boundary gets smoother (less complex)
#Logistic regression is usually a better choice than KNN because it is a more sophisticated
#model and it requires less storage to run. KNN is a good model if you are looking
#for something simple, the data set is not too large, and you want as transparent
#a model as possible.
knnpred=gridknn.predict(x_test)
print skcm(y_test,knnpred)
#with knn, we got more true negatives and fewer false positives
print dt.shape

    
num_fold = 10
gen_k_sets = StratifiedKFold(lbls,num_fold,shuffle=True)
ab = []



overall_mis = 0
err=[]
c= 1.0
mdl = SVC(C=c,kernel='rbf',degree=1,tol=0.0001)
mdl = rfc(n_estimators=100,criterion='entropy',min_samples_leaf=5,min_samples_split=10,max_features=8)
mdl = knn(n_neighbors=1)
imgsize = 8
patchsize = 6
ab= []
for train_index, test_index in gen_k_sets:   

    train_data, test_data = dt[train_index], dt[test_index]
    train_class, test_class = lbls[train_index], lbls[test_index]
    dtsize= train_data.shape[0]
    train_data = train_data.reshape(dtsize,imgsize,imgsize)
    
    c1 = train_data[:,0:patchsize,0:patchsize] 
    '''
    a= c1[0,:,:]
    print a.shape
    print a
Example #53
dgts_data = pd.read_csv("abcd.csv",index_col=0)
print dgts_data.head()
print dgts_data.shape
dgts_data = np.array(dgts_data)
print dgts_data.shape
#print dgts_data

dgts_lbl = pd.read_csv("abcd_l.csv",index_col=0)
#print dgts_lbl.head()
print dgts_lbl.shape
dgts_lbl = np.array(dgts_lbl)
print dgts_lbl.shape
#print dgts_lbl

mdl = knn()
gen_k_sets = StratifiedShuffleSplit(dgts_lbl, n_iter=1, test_size=0.3)


for train_index, test_index in gen_k_sets:   
    train_data, test_data = dgts_data[train_index], dgts_data[test_index]
    train_class, test_class = dgts_lbl[train_index], dgts_lbl[test_index]
    mdl.fit(train_data,train_class)
    print mdl.score(test_data,test_class)

clust_data = test_data
print clust_data.shape

pca = PCA(n_components=100)
pca.fit(clust_data)
tr_dt_p = pca.transform(clust_data)