def nearest_centroid(input_file,Output):
    lvltrace.lvltrace("LVLEntree dans nearest_centroid")
    ncol=tools.file_col_coma(input_file)
    data = np.loadtxt(input_file, delimiter=',', usecols=range(ncol-1))
    X = data[:,1:]
    y = data[:,0]
    n_samples, n_features = X.shape
    clf = NearestCentroid()
    clf.fit(X,y)
    y_pred = clf.predict(X)
    print "#########################################################################################################\n"
    print "Nearest Centroid Classifier "
    print "classification accuracy:", metrics.accuracy_score(y, y_pred)
    print "precision:", metrics.precision_score(y, y_pred)
    print "recall:", metrics.recall_score(y, y_pred)
    print "f1 score:", metrics.f1_score(y, y_pred)
    print "\n"
    print "#########################################################################################################\n"
    results = Output+"Nearest_Centroid_metrics.txt"
    file = open(results, "w")
    file.write("Nearest Centroid Classifier estimator accuracy\n")
    file.write("Classification Accuracy Score: %f\n"%metrics.accuracy_score(y, y_pred))
    file.write("Precision Score: %f\n"%metrics.precision_score(y, y_pred))
    file.write("Recall Score: %f\n"%metrics.recall_score(y, y_pred))
    file.write("F1 Score: %f\n"%metrics.f1_score(y, y_pred))
    file.write("\n")
    file.write("True Value, Predicted Value, Iteration\n")
    for n in range(len(y)):
        file.write("%f,%f,%i\n"%(y[n],y_pred[n],(n+1)))
    file.close()
    title = "Nearest Centroid Classifier"
    save = Output + "Nearest_Centroid_Classifier_confusion_matrix.png"
    plot_confusion_matrix(y, y_pred,title,save)
    lvltrace.lvltrace("LVLSortie dans nearest_centroid")
def nearest_centroid(input_file,Output,test_size):
    ncol=tools.file_col_coma(input_file)
    data = np.loadtxt(input_file, delimiter=',', usecols=range(ncol-1))
    X = data[:,1:]
    y = data[:,0]
    n_samples, n_features = X.shape
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size)
    print(X_train.shape, X_test.shape)
    clf = NearestCentroid()
    clf.fit(X_train,y_train)
    y_pred = clf.predict(X_test)
    print "Nearest Centroid Classifier "
    print "classification accuracy:", metrics.accuracy_score(y_test, y_pred)
    print "precision:", metrics.precision_score(y_test, y_pred)
    print "recall:", metrics.recall_score(y_test, y_pred)
    print "f1 score:", metrics.f1_score(y_test, y_pred)
    print "\n"
    results = Output+"Nearest_Centroid_metrics_test.txt"
    file = open(results, "w")
    file.write("Nearest Centroid Classifier estimator accuracy\n")
    file.write("Classification Accuracy Score: %f\n"%metrics.accuracy_score(y_test, y_pred))
    file.write("Precision Score: %f\n"%metrics.precision_score(y_test, y_pred))
    file.write("Recall Score: %f\n"%metrics.recall_score(y_test, y_pred))
    file.write("F1 Score: %f\n"%metrics.f1_score(y_test, y_pred))
    file.write("\n")
    file.write("True Value, Predicted Value, Iteration\n")
    for n in range(len(y_test)):
        file.write("%f,%f,%i\n"%(y_test[n],y_pred[n],(n+1)))
    file.close()
    title = "Nearest Centroid %f"%test_size
    save = Output + "Nearest_Centroid_confusion_matrix"+"_%s.png"%test_size
    plot_confusion_matrix(y_test, y_pred,title,save)
    lvltrace.lvltrace("LVLSortie dans nearest_centroid split_test")
Example #3
	def _clustering(self, targetgame, games):
		'''
			Find similar games with clustering
			TODO
		'''
		preparegames = list(map(lambda x: [i[1] for i in x.data], games))
		preparegame = list(map(lambda x: x[1], targetgame.data))
		labels = list(range(len(games)))
		clf = NearestCentroid()
		clf.fit(preparegames, labels)
		print(clf.predict([preparegame]))
Example #4
def NC_select_cv(X, Y, num_features):
    scores = []
    skf = cross_validation.StratifiedKFold(Y, n_folds=10)
    for train, test in skf:
        X_train, X_test, y_train, y_test = X[train], X[test], Y[train], Y[test]
        XRF_train, imp, ind, std = fitRF(X_train, y_train, est=2000)  # RFsel
        XRF_test = X_test[:, ind]  # reorder test set after RFsel
        clf = NearestCentroid()
        clf.fit(XRF_train[:, 0:num_features], y_train)
        scores.append(clf.score(XRF_test[:, 0:num_features], y_test))
    score = np.mean(scores)
    return score
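
A hypothetical call, for illustration (fitRF and the X, Y arrays come from elsewhere in the original project):

# mean 10-fold CV accuracy using the top 50 RF-ranked features
score = NC_select_cv(X, Y, num_features=50)
print(score)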
def itemB():
    train_dataset = load_nebulosa_train()
    # remove missing values
    # print(train_dataset)
    train_dataset = train_dataset[~np.isnan(train_dataset).any(axis=1)]
    train_dataset = train_dataset[:, 2:]

    train_target = train_dataset[:, -1]
    train_dataset = train_dataset[:, :-2]

    # train_dataset = normalize(train_dataset, axis=0)

    test_dataset = load_nebulosa_test()
    # remove missing values
    test_dataset = test_dataset[~np.isnan(test_dataset).any(axis=1)]
    test_dataset = test_dataset[:, 2:]

    test_target = test_dataset[:, -1]
    test_dataset = test_dataset[:, :-2]
    # print(test_dataset)
    # test_dataset = normalize(test_dataset, axis=1)
    # print(test_dataset)

    kbest = SelectKBest(f_classif, k=3).fit(train_dataset, train_target)
    train_dataset = kbest.transform(train_dataset)
    test_dataset = kbest.transform(test_dataset)

    # print(train_dataset)

    n_train_samples = train_dataset.shape[0]
    n_train_features = train_dataset.shape[1]
    # print("Nebulosa Train dataset: %d amostras(%d características)" % (n_train_samples, n_train_features))

    n_test_samples = test_dataset.shape[0]
    n_test_features = test_dataset.shape[1]
    # print("Nebulosa Test dataset: %d amostras(%d características)" % (n_test_samples, n_test_features))

    nn = KNeighborsClassifier(n_neighbors=1)
    nn.fit(train_dataset, train_target)
    nn_target_pred_test = nn.predict(test_dataset)

    nn_accuracy_test = accuracy_score(test_target, nn_target_pred_test)
    print("NN: Acurácia (Teste): %.2f" % (nn_accuracy_test))

    nc = NearestCentroid(metric="euclidean")
    nc.fit(train_dataset, train_target)
    nc_target_pred_test = nc.predict(test_dataset)

    nc_accuracy_test = accuracy_score(test_target, nc_target_pred_test)
    print("Rocchio: Acurácia (Teste): %.2f" % (nc_accuracy_test))
Example #6
    def fit_dev(self,net_data,nnet_cluster='auto',nSubtypes=3):
        self.nnet_cluster = nnet_cluster
        self.nSubtypes = nSubtypes

        if nnet_cluster == 'auto':
            #self.nnet_cluster = self.getClusters(net_data)
            self.valid_cluster, self.valid_net_idx = self.get_match_network(net_data,nnet_cluster,algo='meanshift')
        else:
            self.valid_cluster, self.valid_net_idx = self.get_match_network(net_data,nnet_cluster,algo='kmeans')

        #self.valid_cluster = self.clust_list
        #self.valid_net_idx = range(len(self.valid_cluster))
        for i in range(net_data.shape[0]):
            if i == 0 :
                self.assign_net = self.assigneDist(net_data[i,:,:],self.valid_cluster, self.valid_net_idx)
            else:
                self.assign_net = np.vstack(((self.assign_net,self.assigneDist(net_data[i,:,:],self.valid_cluster, self.valid_net_idx))))
        print('Size of the new data map: ', self.assign_net.shape)
        # group subjects with the most networks classifying them together
        # compute the consensus clustering
        self.consensus = cls.hclustering(self.assign_net,self.nSubtypes)
        # save the centroids in a method
        self.clf_subtypes = NearestCentroid()
        self.clf_subtypes.fit(self.assign_net,self.consensus)
        self.consensus = self.clf_subtypes.predict(self.assign_net)
        #print "score: ", self.clf_subtypes.score(self.assign_net,self.consensus)

        return self.consensus
def itemA():
    train_dataset = load_nebulosa_train()

    train_target = train_dataset[:, -1]
    train_dataset = train_dataset[:, :-1]

    nam_target = np.where(np.isnan(train_target))
    train_target = np.delete(train_target, nam_target)
    train_dataset = np.delete(train_dataset, nam_target, 0)
    train_dataset = np.nan_to_num(train_dataset)

    test_dataset = load_nebulosa_test()

    test_target = test_dataset[:, -1]
    test_dataset = test_dataset[:, :-1]

    nam_target = np.where(np.isnan(test_target))
    test_target = np.delete(test_target, nam_target)
    test_dataset = np.delete(test_dataset, nam_target, 0)
    test_dataset = np.nan_to_num(test_dataset)

    n_train_samples = train_dataset.shape[0]
    n_train_features = train_dataset.shape[1]
    print("Nebulosa Train dataset: %d amostras(%d características)" % (n_train_samples, n_train_features))

    n_test_samples = test_dataset.shape[0]
    n_test_features = test_dataset.shape[1]
    print("Nebulosa Test dataset: %d amostras(%d características)" % (n_test_samples, n_test_features))

    nn = KNeighborsClassifier(n_neighbors=1)
    nn.fit(train_dataset, train_target)
    nn_target_pred_test = nn.predict(test_dataset)

    nn_accuracy_test = accuracy_score(test_target, nn_target_pred_test)
    print("NN: Acurácia (Teste): %.2f" % (nn_accuracy_test))

    # train_target[18] = 1
    nc = NearestCentroid(metric="euclidean")
    nc.fit(train_dataset, train_target)
    nc_target_pred_test = nc.predict(test_dataset)
    # print(nc_target_pred_test)

    nc_accuracy_test = accuracy_score(test_target, nc_target_pred_test)
    print("Rocchio: Acurácia (Teste): %.2f" % (nc_accuracy_test))
Example #8
	def __init__(self):
		x = []
		y = []
		small = False
		#clf = svm.LinearSVC()
		self.clf = NearestCentroid()
		folder = "gyro_side\\"
		files = ['still.csv', 'yes.csv', 'no.csv']
		for i in range(3):
			f = open(folder+files[i], 'r')

			for line in f.readlines():
				#print line
				
				line = [int(a) for a in line.split(',')]
				lines = [self.removeMag(line[9*j:9*j+9]) for j in range(9)]
				# smallLine=[]
				# for j in range(5):
					
					# smallLine = smallLine + line[6*j:6*j+3]
				# if small:
					# line=smallLine
				# if len(x)==0:
					# x= np.array(np.array([line]))
				# else:
					# x=np.append(x,np.array([line]), axis=0)
					# #print np.shape(x)
				# concatenate the per-window lists (sum with [] flattens them)
				x += [sum(lines[:5], [])]
				y += [i]
				x += [sum(lines[4:], [])]
				y += [i]
				
				try:
					z=1
				except Exception as e:
					#print e
					print(i, line)
					#z=1/0
				
			f.close()	
		x= np.array([np.array(z) for z in x])
		y = np.array(y)
		print(y)
		print(np.shape(y))
		print(np.shape(x))
		print(type(x[0]), np.array(x[0]))
		self.clf.fit(x,y)
		self.data = []

		#self.ser = serial.Serial('COM3', 9600)
		print "Classifier trained"
Example #9
    def loadData(self, ressourcepath1, ressourcepath2):
        print(ressourcepath1)
        dirList = os.listdir(ressourcepath1)
        fullpath1 = []
        for fname in dirList:
            fullpath1.append(os.path.join(ressourcepath1, fname))

        dirList = os.listdir(ressourcepath2)
        fullpath2 = []
        for fname in dirList:
            fullpath2.append(os.path.join(ressourcepath2, fname))
        counter = 0
        for path in fullpath1:
            if counter == 0:
                fs, w1 = wavfile.read(path)
                w1 = self.scaler(w1)
                w1 = w1[self.get_startingpoint(w1):self.get_endingpoint(w1),:]
                X = np.array([self.euclidean_distance(self.word1,w1),self.euclidean_distance(self.word2,w1)])
                y = np.array([1])
                counter = 1
            else:
                fs, w1 = wavfile.read(path)
                w1 = self.scaler(w1)
                w1 = w1[self.get_startingpoint(w1):self.get_endingpoint(w1),:]
                X = np.vstack((X,np.array([self.euclidean_distance(self.word1,w1),self.euclidean_distance(self.word2,w1)])))
                y = np.hstack((y,np.array([1])))

        for path in fullpath2:
                fs, w2 = wavfile.read(path)
                w2 = self.scaler(w2)
                w2 = w2[self.get_startingpoint(w2):self.get_endingpoint(w2),:]
                X = np.vstack((X,np.array([self.euclidean_distance(self.word1,w2),self.euclidean_distance(self.word2,w2)])))
                y = np.hstack((y,np.array([2])))
        from sklearn.neighbors import NearestCentroid
        self.clf = NearestCentroid()
        self.clf.fit(X,y)
col_input=['genre', 'year', 'col1', 'col2', 'col3', 'col4', 'col5', 'col6', 'col7', 'col8', 'col9', 'col10', 'col11', 'col12', 'col13', 'col14', 'col15', 'col16', 'col17', 'col18', 'col19', 'col20', 'col21', 'col22', 'col23', 'col24', 'col25', 'col26', 'col27', 'col28', 'col29', 'col30', 'col31', 'col32', 'col33', 'col34', 'col35', 'col36', 'col37', 'col38', 'col39', 'col40', 'col41', 'col42', 'col43', 'col44', 'col45', 'col46', 'col47', 'col48', 'col49', 'col50', 'col51', 'col52', 'col53', 'col54', 'col55', 'col56', 'col57', 'col58', 'col59', 'col60', 'col61', 'col62', 'col63', 'col64', 'col65', 'col66', 'col67', 'col68', 'col69', 'col70', 'col71', 'col72']
df_input = pandas.read_csv('pandas_output_missing_data_fixed.csv', header=None, delimiter = ",", names=col_input)

# range(2,74) means it goes from col 2 to col 73
df_input_data = df_input[list(range(2,74))].to_numpy() # test with few good features as determined through PCA?
df_input_target = df_input[list(range(0,1))].to_numpy()

colors = numpy.random.rand(len(df_input_target))

# splitting the data into training and testing sets
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(df_input_data, df_input_target.tolist())

# nearest centroid
from sklearn.neighbors import NearestCentroid
knc = NearestCentroid()
knc.fit(X_train[:],numpy.ravel(y_train[:]))
predicted = knc.predict(X_test)

print(y_test[60:90], len(y_test[60:90]))
print(predicted[60:90], len(predicted[60:90]))

print(knc.classes_)

# Prediction Performance Measurement
matches = (predicted == [item for sublist in y_test for item in sublist])
print(matches.sum())
print(len(matches))

print(matches[10:50], len(matches[10:50]))
Example #11
File: classifier2.py Project: zzj/jeweler
 def train(self):
     clf = NearestCentroid()
     half = len(self.X) // 2
     self.fit = clf.fit(self.X[0:half], self.Y[0:half])
Example #12
plt.scatter(bumpy_fast, grade_fast, color="b", label="fast")
plt.scatter(bumpy_slow, grade_slow, color="r", label="slow")
plt.legend()
plt.xlabel("bumpiness")
plt.ylabel("grade")
plt.show()
################################################################################

### your code here!  name your classifier object clf if you want the
### visualization code (prettyPicture) to show you the decision boundary

# Choose a smaller dataset
features_train = features_train[:len(features_train) // 100]
labels_train = labels_train[:len(labels_train) // 100]

clf = NearestCentroid()
t0 = time()
clf = clf.fit(features_train, labels_train)
print "Training time:", round(time() - t0, 2), "s"

accuracy = clf.score(features_test, labels_test)
t1 = time()
pred = clf.predict(features_test)
print "Predicting time:", round(time() - t1, 2), "s"

acc = accuracy_score(pred, labels_test)
print "Accuracy:", acc

try:
    prettyPicture(clf, features_test, labels_test)
except NameError:
    pass
svm_model = SVC(kernel="rbf", probability=True, max_iter=10000)
svm_model.fit(X_cropped, y_cropped)
y_train_predicted = svm_model.predict(X_train)
print "SVM Error rate on training data (t1): ", ml_aux.get_error_rate(y_train, y_train_predicted)
# ml_aux.plot_confusion_matrix(y_train, y_train_predicted, "CM SVM Training (t1)")
# plt.show()

y_validation_predicted = svm_model.predict(X_validation)
print "SVM Error rate on validation (t1): ", ml_aux.get_error_rate(y_validation, y_validation_predicted)


# Start nearest centroid classification
print("Performing kNC Classification:")
from sklearn.neighbors import NearestCentroid

knnc_model = NearestCentroid()
knnc_model.fit(X_cropped, y_cropped)
y_validation_predicted = knnc_model.predict(X_validation)
print "Error Rate on kNNC (t1) Validation:  ", ml_aux.get_error_rate(y_validation, y_validation_predicted)

# Start Bagging Classification
print "Performing Bagging Classification:"
# Bagging
from sklearn.ensemble import BaggingClassifier
from sklearn.neighbors import KNeighborsClassifier

# Bagging
bagging1 = BaggingClassifier(KNeighborsClassifier(n_neighbors=2), max_samples=1.0, max_features=0.1)
bagging1.fit(X_cropped, y_cropped)
y_validation_predicted = bagging1.predict(X_validation)
print "Error Rate kNN with Baggging Validation: ", ml_aux.get_error_rate(y_validation, y_validation_predicted)
Example #14

# SGD classifier - gives about 73% accuracy
cl4 = SGDClassifier(loss='hinge', penalty='l2', alpha=0.0001, l1_ratio=0.15,
                     fit_intercept=True, max_iter=5, shuffle=True, verbose=0,
                     epsilon=0.1, n_jobs=1, random_state=None, learning_rate='optimal',
                     eta0=0.0, power_t=0.5, class_weight=None, warm_start=False,
                      average=False)
cl4.fit(X_train, target)
pr4 = cl4.predict(X_test)
allpred += pr4
print"SGD: " + "%.2f" % (evaluate(pr4, test_jokes)) + "%"


# Nearest centroid classifier - gives about 59% accuracy
cl5 = NearestCentroid()
cl5.fit(X_train, target)
pr5 = cl5.predict(X_test)
print("Nearest centroid: " + "%.2f" % (evaluate(pr5, test_jokes)) + "%")


# Decision tree classifier - gives about 75% accuracy
cl6 = tree.DecisionTreeClassifier()
cl6.fit(X_train, target)
pr6 = cl6.predict(X_test)
allpred += pr6
print"Decision tree: " + "%.2f" % (evaluate(pr6, test_jokes)) + "%"


maxpred = max(allpred)
pr7 = [1 if x > maxpred / 2 else 0 for x in allpred]
Example #15
def feature_redux_and_classify(df, target, selection, reduction, classifier, n_features, n_reduction=None):
    # 1 - Feature selection
    if n_reduction is None:
        n_reduction = n_features
    sequence = []
    if selection == "Kruskal-Wallis":
        # 1.1 - Kruskal
        kruskal_stats = []
        for column in df:
            stats, _ = ss.kruskal(df[column], target)
            kruskal_stats.append((column, stats))
        kruskal_stats.sort(key=lambda x: x[1], reverse=True)
        selected_columns = [kruskal_stats[i][0] for i in range(n_features)]
        df = df[selected_columns]
    elif selection == "ROC":
        # 1.2 - Roc
        roc_values = []
        for column in df:
            est = LogisticRegression(solver='liblinear', class_weight='balanced')
            est.fit(df[column].to_frame(), target)
            roc_values.append((column, roc_auc_score(target, est.predict(df[column].to_frame()))))
        roc_values.sort(key=lambda x: x[1], reverse=True)
        selected_columns = [roc_values[i][0] for i in range(n_features)]
        df = df[selected_columns]
    elif selection == "K-Best":
        # sequence.append(('select_best', SelectKBest(k=n_features, score_func=mutual_info_classif)))
        skb = SelectKBest(k=n_features, score_func=mutual_info_classif)
        df = skb.fit_transform(df, target)
    elif selection == "RFE":
        # RFE
        estimator = LogisticRegression(solver='liblinear', class_weight='balanced')
        rfe = RFE(estimator, n_features)
        df = rfe.fit_transform(df, target)

    # 2 - Dimension reduction
    if reduction == "PCA":
        # 2.1 - PCA
        sequence.append(('PCA', PCA(n_components=n_reduction)))
    elif reduction == "LDA":
        # 2.2 - LDA
        sequence.append(('LDARed', LinearDiscriminantAnalysis()))

    # 3 - Classifiers
    if classifier == "Euclidean":
        # 3.1 - Euclidean
        sequence.append(('Euclidean', NearestCentroid(metric='euclidean', shrink_threshold=None)))
    elif classifier == "Mahalanobis":
        # 3.2 - Mahalanobis
        sequence.append(('Mahalanobis', NearestCentroid(metric='mahalanobis', shrink_threshold=None)))
    elif classifier == "Bayes":
        # Naive Gaussian Bayes
        sequence.append(('Bayes', GaussianNB()))
    elif classifier == "K-Nearest":
        # K-Nearest Neighbors
        sequence.append(('K-Nearest', KNeighborsClassifier(n_neighbors=5)))
    elif classifier == "SVC":
        # SVC
        sequence.append(('SVC', SVC(gamma='auto')))
    elif classifier == "Parzen Window":
        # Parzen (Kernel Density Estimation)
        sequence.append(('Parzen', KDEClassifier(kernel='gaussian', bandwidth=1)))
    else:
        # 3.3 - Fisher LDA
        sequence.append(('LDAClass', LinearDiscriminantAnalysis()))

    pipe = Pipeline(sequence)
    kfold = StratifiedKFold(n_splits=20, shuffle=True, random_state=10)
    scoring = {'accuracy': make_scorer(accuracy_score),
               'precision': make_scorer(precision_score),
               'recall': make_scorer(recall_score),
               'f1_score': make_scorer(f1_score)}

    cv_results = cross_validate(pipe, df, target, cv=kfold, scoring=scoring)
    return cv_results
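
A hypothetical call, for illustration (assuming df is a pandas DataFrame of features and target holds the class labels):

# 20-fold stratified CV with K-Best selection, PCA reduction, and a Euclidean nearest centroid
cv_results = feature_redux_and_classify(df, target, selection="K-Best",
                                        reduction="PCA", classifier="Euclidean",
                                        n_features=10)
print(cv_results['test_accuracy'].mean())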
#performance: Euclidean, Cosine, or Manhattan. In `scikit-learn` you can see the
#documentation for NearestCentroid here:
#- http://scikit-learn.org/stable/modules/generated/sklearn.neighbors.NearestCentroid.html#sklearn.neighbors.NearestCentroid
#
#and for supported distance metrics here:
#- http://scikit-learn.org/stable/modules/generated/sklearn.metrics.pairwise.distance_metrics.html#sklearn.metrics.pairwise.distance_metrics

#%%
from sklearn.neighbors import NearestCentroid

# the parameters for the nearest centroid metric to test are:
#    l1, l2, and cosine (all are optimized)
# fill in the training and testing data and save as separate variables

for d in ['l1', 'l2', 'cosine', 'euclidean', 'manhattan']:
    clf = NearestCentroid(metric=d)
    clf.fit(X_train, y_train)
    yhat = clf.predict(X_test)
    acc = accuracy_score(y_test, yhat)
    print(d, acc)

p = 'cosine'
print('The best distance metric is: ', p)

#%% [markdown]
# ___
# <a id="naive"></a> <a href="#top">Back to Top</a>
# ## Naive Bayes Classification
# Now let's look at the use of the Naive Bayes classifier. The 20 newsgroups
# dataset has 20 classes and about 130,000 features per instance. Recall that
# the Naive Bayes classifier calculates a posterior distribution for each class.
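#
# As a rough sketch of that pipeline (illustrative, not from the original
# notebook; it assumes the standard scikit-learn 20 newsgroups loader and
# TF-IDF features):

#%%
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score

news_train = fetch_20newsgroups(subset='train')
news_test = fetch_20newsgroups(subset='test')

vec = TfidfVectorizer()                       # ~130k sparse features per instance
X_news_train = vec.fit_transform(news_train.data)
X_news_test = vec.transform(news_test.data)

nb = MultinomialNB()                          # one posterior distribution per class
nb.fit(X_news_train, news_train.target)
print(accuracy_score(news_test.target, nb.predict(X_news_test)))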
Example #17
X_train, X_test, y_train, y_test = train_test_split(mdata,
                                                    mlabels,
                                                    test_size=0.25,
                                                    random_state=55)

print('X_train dimensions: ', X_train.shape)
print('y_train dimensions: ', y_train.shape)
print('X_test dimensions: ', X_test.shape)
print('y_test dimensions: ', y_test.shape)

#Mentioned below are the three models, use one and comment the other

neigh = KNeighborsClassifier(n_neighbors=3)
#model = neigh.fit(X_train,y_train.ravel())
#model = GaussianNB().fit(X_train, y_train.ravel())
model = NearestCentroid().fit(X_train, y_train.ravel())

y_train_pred = model.predict(X_train)  #Predicting on the training data

#printing the training Ground truth and training predicted results
print("Training Data prediction: \n", y_train_pred)
print("Training Data ground truth: \n", y_train.ravel())

#creating confusion_matrix for training dataset
matrix = metrics.confusion_matrix(y_train, y_train_pred)
#print(matrix)
accuracy = (accuracy_score(y_train, y_train_pred)) * 100
print("Accuracy for training dataset: ", accuracy, "%")

#plotting confusion matrix
plt.matshow(matrix)
Example #18
def rocchio_clas(X_train, y_train, X_test):
    from sklearn.neighbors import NearestCentroid
    model = NearestCentroid()
    model.fit(X_train, y_train)
    preds = model.predict(X_test)
    return preds
class RocchioClassifier(NLTKClassifier):
    nltk_class = nltk.classify.SklearnClassifier(NearestCentroid())
Example #20
    Y.append(row[0])
    X.append(row[1:])

# close CSV file
genderFile.close()

# convert string values to numbers
X_len = len(X)
for row in range(X_len):
    X[row][0] = float(X[row][0])
    X[row][1] = float(X[row][1])
    X[row][2] = float(X[row][2])

# initialize classifiers
clf_LinearSVC = svm.LinearSVC()
clf_NearestCentroid = NearestCentroid()
clf_SVC = svm.SVC()

# train classifiers using data set
clf_LinearSVC = clf_LinearSVC.fit(X, Y)
clf_NearestCentroid = clf_NearestCentroid.fit(X, Y)
clf_SVC = clf_SVC.fit(X, Y)

# test classifiers using data set
acc_LinearSVC = accuracy_score(Y, clf_LinearSVC.predict(X)) * 100.0
acc_NearestCentroid = accuracy_score(Y, clf_NearestCentroid.predict(X)) * 100.0
acc_SVC = accuracy_score(Y, clf_SVC.predict(X)) * 100.0

# identify best classifier
index = np.argmax([acc_LinearSVC, acc_NearestCentroid, acc_SVC])
classifiers = {0: 'LinearSVC', 1: 'NearestCentroid', 2: 'SVC'}
from sklearn.model_selection import KFold
'''Reading the input file and converting it to matrix'''

file = pd.read_csv('ATNTFaceImages400.txt', header=None)
data = file.to_numpy()
print(data.shape)
'''Splitting the features and labels from the matrix 
    and transposing it to achieve the appropriate dimension'''
X = np.transpose(data[1:, :])
y = np.transpose(data[0, :])
print(X)
print(y)
'''Splitting the data for k-fold cross validation using the KFold() method'''
kf = KFold(n_splits=5, shuffle=True)
print(kf)
'''Looping through the folds to access each train/test split one at a time.'''
for train_index, test_index in kf.split(X):
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]
    '''Performing nearest centroid classification to map each feature vector to the corresponding label.'''
    # Centroid Classification
    centroid_classifier = NearestCentroid()
    # Train the model using the training sets
    centroid_classifier.fit(X_train, y_train)
    # Predict the labels
    predictions = centroid_classifier.predict(X_test)
    '''Calculating the accuracy between the actual label and predicted label in percentage[(accuracy*100)%]'''
    actual = y_test
    accuracy = metrics.accuracy_score(actual, predictions) * 100
    print("Accuracy is: ", accuracy)
Example #22
File: ftrl_cv_ovr.py Project: xwc940512/-
def stacking(clf, train_x, train_y, test_x, clf_name, class_num=1):
    train = np.zeros((train_x.shape[0], class_num))
    test = np.zeros((test_x.shape[0], class_num))
    test_pre = np.zeros((folds, test_x.shape[0], class_num))
    cv_scores = []
    for i, (train_index, test_index) in enumerate(kf):
        tr_x = train_x[train_index]
        tr_y = train_y[train_index]
        te_x = train_x[test_index]
        te_y = train_y[test_index]

        if clf_name == "lgb":
            train_matrix = clf.Dataset(tr_x, label=tr_y)
            test_matrix = clf.Dataset(te_x, label=te_y)

            params = {
                'boosting_type': 'gbdt',
                'objective': 'multiclass',
                'metric': 'multi_logloss',
                'min_child_weight': 1.5,
                'num_leaves': 2**5,
                'lambda_l2': 10,
                'subsample': 0.7,
                'colsample_bytree': 0.5,
                'colsample_bylevel': 0.5,
                'learning_rate': 0.1,
                'scale_pos_weight': 20,
                'seed': 2018,
                'nthread': 16,
                'num_class': class_num,
                'silent': True,
            }

            num_round = 2000
            early_stopping_rounds = 100

            model = clf.train(params,
                              train_matrix,
                              num_round,
                              valid_sets=test_matrix,
                              early_stopping_rounds=early_stopping_rounds)

            pre = model.predict(te_x,
                                num_iteration=model.best_iteration).reshape(
                                    (te_x.shape[0], class_num))
            pred = model.predict(test_x,
                                 num_iteration=model.best_iteration).reshape(
                                     (test_x.shape[0], class_num))
        if clf_name == "lr":
            model = LogisticRegression(C=4, dual=False)
            model.fit(tr_x, tr_y)
            pre = model.predict_proba(te_x)
            pred = model.predict_proba(test_x)

        if clf_name == "svm":
            model = svm.LinearSVC()
            model.fit(tr_x, tr_y)
            pre = model.decision_function(te_x)
            pred = model.decision_function(test_x)

        if clf_name == "ridge":
            model = Ridge(alpha=20,
                          copy_X=True,
                          fit_intercept=True,
                          solver='auto',
                          max_iter=100,
                          normalize=False,
                          random_state=0,
                          tol=0.0025)
            model.fit(tr_x, tr_y)
            pre = model.predict(te_x)
            pred = model.predict(test_x)

        if clf_name == "roc":
            model = NearestCentroid()
            model.fit(tr_x, tr_y)
            pre = model.predict(te_x)
            pred = model.predict(test_x)

        if clf_name == "ftrl":
            model = FM_FTRL(
                alpha=0.02,
                beta=0.01,
                L1=0.00001,
                L2=30.0,
                D=tr_x.shape[1],
                alpha_fm=0.1,
                L2_fm=0.5,
                init_fm=0.01,
                weight_fm=50.0,
                D_fm=200,
                e_noise=0.0,
                iters=3,
                inv_link="identity",
                threads=15,
            )
            model.fit(tr_x, tr_y)
            pre = model.predict(te_x)
            pred = model.predict(test_x)

        train[test_index] = pre.reshape((-1, 1))

        test_pre[i, :] = pred.reshape((-1, 1))
        cv_scores.append(log_loss(te_y, pre))

        print("%s now score is:" % clf_name, cv_scores)
    test[:] = test_pre.mean(axis=0)
    with open("score_cv.txt", "a") as f:
        f.write("%s now score is:" % clf_name + str(cv_scores) + "\n")
        f.write("%s_score_mean:" % clf_name + str(np.mean(cv_scores)) + "\n")
    return train.reshape(-1, class_num), test.reshape(
        -1, class_num), np.mean(cv_scores)
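
A hypothetical invocation of the nearest centroid ("roc") branch, for illustration (kf and folds are assumed to be defined at module level, as in the original project; the clf argument is unused for this branch):

train_meta, test_meta, cv_mean = stacking(None, train_x, train_y, test_x, "roc")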
예제 #23
0
def get_model(classifierType):
    """
    Return a trained model based on collected raw data.

    Args:
    ---------
        classifierType: classifier type
    Return:
    ---------
        None: the trained classifier is saved to disk and evaluated on the test set
    """
    os.chdir('data/')

    if classifierType == 'SVM':
        # Create a classifier: a support vector classifier
        classifier = svm.SVC(
            gamma=0.001,
            kernel='linear'
        )
    elif classifierType == 'NearestCentroid':
        # Nearest Centroid Classifier
        classifier = NearestCentroid()
    elif classifierType == 'KNN':
        # KNeighborsClassifier
        classifier = KNeighborsClassifier(n_neighbors=23)
    elif classifierType == 'ANN':
        # Neural network
        mlp = MLPClassifier(
            hidden_layer_sizes=(100, 100),
            max_iter=400,
            alpha=1e-4,
            solver='sgd',
            verbose=10,
            tol=1e-4,
            random_state=1
        )
        classifier = MLPClassifier(
            hidden_layer_sizes=(50, 50),
            max_iter=100,
            alpha=1e-4,
            solver='sgd',
            verbose=10,
            tol=1e-4,
            random_state=1,
            learning_rate_init=.1
        )
    else:
        print("Possible options: SVM/ANN/KNN/NearestCentroid")
        exit()

    # Get set of training data collected from LeapMotion sensor.
    data = np.loadtxt("TrainingInput.txt")
    RTP_0, RTP_1, RTP_2, RTP_3, RTP_4, \
    RTT_01, RTT_02, RTT_03, RTT_04, RTT_12, \
    RTT_13, RTT_14, RTT_23, RTT_24, RTT_34, RTJ_0 = \
    data[:,0], data[:,1], data[:,2], data[:,3], data[:,4], \
    data[:,5], data[:,6], data[:,7], data[:,8], data[:,9], \
    data[:,10], data[:,11], data[:,12], data[:,13], data[:,14], data[:,15]

    InputSamples = np.vstack((
        RTP_0, RTP_1, RTP_2, RTP_3, RTP_4,
        RTT_01, RTT_02, RTT_03, RTT_04, RTT_12,
        RTT_13, RTT_14, RTT_23, RTT_24, RTT_34, RTJ_0
    ))
    InputSamples = InputSamples.T
    print(InputSamples)

    dataTarget = np.loadtxt("TargetTraining.txt")
    dataTarget = dataTarget.T
    # n_samples = len(dataTarget)

    classifier.fit(InputSamples, dataTarget)

    # Storage model
    if classifierType == 'SVM':
        joblib.dump(classifier, 'SVM.pkl')
    elif classifierType == 'NearestCentroid':
        joblib.dump(classifier, 'NearestCentroid.pkl')
    elif classifierType == 'KNN':
        joblib.dump(classifier, 'KNN.pkl')
    elif classifierType == 'ANN':
        joblib.dump(classifier, 'ANN.pkl')
    else:
        print("Possible options: SVM/ANN/KNN/NearestCentroid")
        exit()

    # Predict the value of the set of symbols/gestures on the second half:
    dataTest = np.loadtxt("TestInput.txt")
    RTP_0, RTP_1, RTP_2, RTP_3, RTP_4, \
    RTT_01, RTT_02, RTT_03, RTT_04, RTT_12, \
    RTT_13, RTT_14, RTT_23, RTT_24, RTT_34, RTJ_0 = \
    dataTest[:,0], dataTest[:,1], dataTest[:,2], dataTest[:,3], dataTest[:,4], \
    dataTest[:,5], dataTest[:,6], dataTest[:,7], dataTest[:,8], dataTest[:,9], \
    dataTest[:,10], dataTest[:,11], dataTest[:,12], dataTest[:,13], dataTest[:,14], dataTest[:,15]

    InputSamplesTest = np.vstack((
        RTP_0, RTP_1, RTP_2, RTP_3, RTP_4,
        RTT_01, RTT_02, RTT_03, RTT_04, RTT_12,
        RTT_13, RTT_14, RTT_23, RTT_24, RTT_34, RTJ_0
    ))
    InputSamplesTest = InputSamplesTest.T

    dataTargetTest = np.loadtxt("TestTarget.txt")
    dataTargetTest = dataTargetTest.T

    expected = dataTargetTest
    predicted = classifier.predict(InputSamplesTest)

    print(
        "Classification report for classifier %s:\n%s\n"
        % (classifier, metrics.classification_report(expected, predicted))
    )
    print(
        "Confusion matrix:\n%s" % metrics.confusion_matrix(expected, predicted)
    )
    conf = metrics.confusion_matrix(expected, predicted)
    plt.imshow(conf, cmap='binary', interpolation='None')
    plt.show()
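
A hypothetical call, for illustration (assuming the data/ directory with TrainingInput.txt, TargetTraining.txt, TestInput.txt, and TestTarget.txt exists, as in the original project):

get_model('NearestCentroid')  # trains, saves NearestCentroid.pkl, and reports test metrics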
Example #24
    kfold = StratifiedKFold(n_splits=number_splits, shuffle=True, random_state=seed)
    
    f = 0

    for train_index, test_index in kfold.split(data_opto_SOM,target): 
        

        ## Opto SOM ##
        x_train, x_test = data_opto_SOM[train_index,:],data_opto_SOM[test_index,:]
        y_train,y_test = target[train_index],target[test_index]      
        mul_lr = LogisticRegression(multi_class='multinomial', solver='newton-cg',max_iter=max_i)
        mul_lr.fit(x_train, y_train)
        score_opto_SOM_LR[n,f] = mul_lr.score(x_test, y_test)*100
        print(mul_lr.score(x_test,y_test))        

        clf = NearestCentroid(metric='euclidean',shrink_threshold=None)  
        clf.fit(x_train,y_train)
        score_opto_SOM_NN[n,f] = clf.score(x_test,y_test)*100
     
        lda = LinearDiscriminantAnalysis(solver='svd')
        lda.fit(x_train,y_train)
        score_opto_SOM_LDA[n,f]=lda.score(x_test,y_test)*100
        print(lda.score(x_test,y_test))

        svm_algo = svm.SVC(decision_function_shape='ovo',kernel='linear')
        svm_algo.fit(x_train,y_train)
        score_opto_SOM_SVM[n,f]=svm_algo.score(x_test,y_test)*100
 
        ## Opto PV ##
        x_train, x_test = data_opto_PV[train_index,:],data_opto_PV[test_index,:]
        y_train,y_test = target[train_index],target[test_index]      
# rnc1 = RadiusNeighborsClassifier()
# #default is r = 1.0
# rnc1.fit(xtrain,ytrain1)
# print (rnc1.score(xtest,ytest1))


# In[ ]:

get_ipython().magic(u'whos')


# In[17]:

# Nearest centroid
from sklearn.neighbors import NearestCentroid
ncc1 = NearestCentroid()
ncc1.fit(xtrain,ytrain1)
print (ncc1.score(xtest,ytest1))


# In[18]:

# Nearest shrunken Centroid
for shrinkage in [None,0.05,0.1,0.2,0.3,0.4,0.5]:
    ncc2 = NearestCentroid(shrink_threshold = shrinkage)
    ncc2.fit(xtrain,ytrain1)
    print(ncc2.score(xtest,ytest1))


# In[19]:
Example #26
def do_centroid():
    clf = NearestCentroid()
    clf.fit(X, Y)
    return do_testcase(clf)
    'mdl__metric': ['euclidean', 'manhattan'],
    'mdl__n_neighbors': [1, 10, 50, 100]
}

# NCentroid parameters.
NCentroidParameters = {'mdl__metric': ['euclidean', 'manhattan']}

testCases = [['Linear kernel SVM',
              LinearSVC(), LinearParameters],
             ['RBF kernel SVM', SVC(), RbfParameters],
             ['Polynomial kernel SVM',
              SVC(), PolynomialParameters],
             ['K-NearestNeighbors',
              KNeighborsClassifier(), KNNParameters],
             ['NearestCentroid',
              NearestCentroid(), NCentroidParameters]]

for method, model, parameters in testCases:

    # Construct a pipeline.
    pipeline = Pipeline([('scaler', MinMaxScaler()), ('pca', PCA(0.9)),
                         ('mdl', model)])

    print()
    print(
        '----------------------------------------------------------------------------'
    )
    print('Tuning parameters to find the best accuracy using the %s.' % method)

    # Execute gridsearch.
    clf = GridSearchCV(pipeline, parameters)
Example #28
test_classifier(clf, my_dataset, financial_features)

from sklearn import tree
clf1 = tree.DecisionTreeClassifier()
test_classifier(clf1, my_dataset, financial_features)

from sklearn.ensemble import AdaBoostClassifier
# clf2 = AdaBoostClassifier()
# test_classifier(clf2,my_dataset,financial_features)

# from sklearn.neighbors import KNeighborsClassifier
# clf3=KNeighborsClassifier(n_neighbors = 4)
# test_classifier(clf3,my_dataset,financial_features)

from sklearn.neighbors import NearestCentroid
clf4 = NearestCentroid()
test_classifier(clf4, my_dataset, financial_features)

### Task 5: Tune your classifier to achieve better than .3 precision and recall
### using our testing script. Check the tester.py script in the final project
### folder for details on the evaluation method, especially the test_classifier
### function. Because of the small size of the dataset, the script uses
### stratified shuffle split cross validation. For more info:
### http://scikit-learn.org/stable/modules/generated/sklearn.cross_validation.StratifiedShuffleSplit.html

# Example starting point. Try investigating other evaluation techniques!
from sklearn.model_selection import train_test_split
features_train, features_test, labels_train, labels_test = \
    train_test_split(features, labels, test_size=0.3, random_state=42)
'''
OUR FINAL ALGORITHM
'''
Example #29
from sklearn.neighbors import NearestCentroid
import numpy as np
X = np.array([[-1, -1], [-2, -1], [-3, -2], [1, 1], [2, 1], [3, 2]])
y = np.array([1, 1, 1, 2, 2, 2])
clf = NearestCentroid()
clf.fit(X, y)
print(clf.predict([[-0.8, -1]]))
Example #30
### Task 4: Try a variety of classifiers
### Please name your classifier clf for easy export below.
### Note that if you want to do PCA or other multi-stage operations,
### you'll need to use Pipelines. For more info:
### http://scikit-learn.org/stable/modules/pipeline.html

# Provided to give you a starting point. Try a variety of classifiers.
from sklearn.neighbors import NearestCentroid
from sklearn.ensemble import AdaBoostClassifier
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.preprocessing import MinMaxScaler

min_max_scaler = MinMaxScaler()

nc = NearestCentroid()
adc = AdaBoostClassifier()

nc_report = {'accuracy': list(), 'precision': list(), 'recall': list()}
adc_report = {'accuracy': list(), 'precision': list(), 'recall': list()}


# this function is derived from tester.py
def create_report(clf, features, labels):
    cv = StratifiedShuffleSplit(n_splits=100, random_state=42).split(features, labels)
    true_negatives = 0
    false_negatives = 0
    true_positives = 0
    false_positives = 0
    for train_idx, test_idx in cv:
        features_train = []
# the Y (expected output) converted into the 3 closest bass for each test input
convertedy = MatchBasses(Y, convertedx)
# the test output converted into just numbers to make it easier for knn
convertedinput = convert_to_numbers(input_midifile)

indexarray = [
]  # y array used for the predicted indexs of the convertedy array

# loops through the indices of the array and stores each index number in indexarray
for i in range(len(convertedy)):
    indexarray.append(i)

deletewaste(convertedx, convertedy, indexarray)

# fits the nearest centroid classifier
neighbor = NearestCentroid()
neighbor.fit(convertedx, indexarray)

predictionsindex = []

# stores the prediction indexs in an array
predictionsindex.append((neighbor.predict(convertedinput)))

print(predictionsindex[0])
predictions = []  # this is where the real predictions are stored

# loops through the prediction index array and stores the correct predictions (by using the indices inside the convertedy array)
for index in predictionsindex[0]:
    predictions.append(convertedy[index])

# this adds that track1 format stuff to the database that we are outputing
Example #32
from sklearn.neighbors import NearestCentroid
import numpy as np
from numpy import loadtxt
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split

data = loadtxt('PhishingData.txt', delimiter=",")

# split data into X and y
X = data[:, 0:9]
y = data[:, 9]

seed = 7
test_size = 0.3
X_train, X_test, y_train, y_test = train_test_split(X,
                                                    y,
                                                    test_size=test_size,
                                                    random_state=seed)

clf = NearestCentroid()
clf.fit(X_train, y_train)

print(clf)
y_pred = clf.predict(X_test)

predictions = [round(value) for value in y_pred]
#verify predictions
accuracy = accuracy_score(y_test, predictions)
print("Accuracy: %.2f%%" % (accuracy * 100.0))
Example #33
		return features[:MIN]

	if os.path.exists(os.path.join(here, 'X_train.csv')):
		print('loading X_train ....')
		X_train = pd.read_csv(os.path.join(here, 'X_train.csv'), index_col=0)
		print('shape of X_train', X_train.shape)
	else:
		print('making X_train from trainData ...')
		X_train = pd.DataFrame(index=trainData.index, data=trainData['time_series_file'].apply(featurize).tolist())
		X_train.to_csv(os.path.join(here, 'X_train.csv'))
		print('shape of X_train', X_train.shape)

	from sklearn.neighbors import KNeighborsClassifier
	from sklearn.neighbors import NearestCentroid

	clf = NearestCentroid(shrink_threshold=None)
	clf.fit(X_train, trainTargets.ravel())

	# print('=========================================================')
	X_test = pd.DataFrame(index=testData.index, data=testData['time_series_file'].apply(featurize).tolist())
	print('shape of X_test', X_test.shape)

	y_pred = clf.predict(X_test)

	y_truth = testTargets.ravel()

	from sklearn.metrics import accuracy_score, f1_score
	accuracy = accuracy_score(y_truth, y_pred)
	f1 = f1_score(y_truth, y_pred, average='macro')
	print('F1 (macro) score on test data', f1)
Example #34
X_test_main = np.loadtxt("X_test.dat")
y_test = np.loadtxt("y_test.dat")

#####################
X_test = np.array(X_test_main[:, column])
X_train = preprocessing.normalize(X_train, norm='l2')
X_test = preprocessing.normalize(X_test, norm='l2')

sc = StandardScaler()
X_train = sc.fit_transform(X_train)
X_test = sc.transform(X_test)  # use the scaler fitted on the training data
##########################
print("Train:", X_train.shape)
print("Test:", X_test.shape)

classifier = NearestCentroid()
classifier.fit(X_train, y_train)

y_pred = classifier.predict(X_test)
cm = confusion_matrix(y_test, y_pred)
print(cm)

TN = cm[0, 0]
FP = cm[0, 1]
FN = cm[1, 0]
TP = cm[1, 1]

TPR = (TP / (TP + FN))
print("TPR: {:0.2f}".format(TPR))

TNR = (TN / (TN + FP))
 specific analysis or data in MNIST_NearestNeighborsCentroid.anls

 this code uses the NearestCentroid method; the results show no specific
 improvement, with about 89% precision or so.
'''
from sklearn.neighbors import NearestCentroid
from sklearn import metrics
import numpy
import transform_data_to_format as tdtf

#train_x , train_y = tdtf.read_data_to_ndarray("../data/train.csv",42000)
#train_x , train_y = tdtf.read_data_to_ndarray("../data/train.csv",2100)
#valid_x , valid_y = tdtf.read_data_to_ndarray("../data/valid.csv",21000)
#test_x = tdtf.read_test_data_to_ndarray("../data/test.csv",28000);

clf = NearestCentroid()
clf.fit(train_x,train_y)

#NearestCentroid(metric='euclidean', shrink_threshold=None)
#pred_y = clf.predict(test_x)
#pred_train_y = clf.predict(train_x[0:21000])
pred_valid_y = clf.predict(valid_x)

#print pred_y

#tdtf.write_to_csv(pred_y,"../data/MNIST_NearestNeighborsCentroid.out")

#print("Classification report for classifier %s:\n%s\n"
#      % (clf , metrics.classification_report(train_y , pred_train_y )))
'''
print("Classification report for classifier %s:\n%s\n"
Example #36
clf_ada = AdaBoostClassifier(n_estimators=100)
clf_bdt_real = AdaBoostClassifier(DecisionTreeClassifier(max_depth=3),
                                  n_estimators=600,
                                  learning_rate=1.)
clf_bdt_discrete = AdaBoostClassifier(DecisionTreeClassifier(max_depth=3),
                                      n_estimators=600,
                                      learning_rate=1.5,
                                      algorithm="SAMME")

#create gradient boosting
clf_gbdt = GradientBoostingClassifier(n_estimators=100,
                                      learning_rate=0.1,
                                      max_depth=3)

#create nearest centroid
clf_nn = NearestCentroid()

#create a stochastic gradient descent classifier
clf_sgd = SGDClassifier(loss="modified_huber", penalty="l2")

#define a training sample and train
train_start = 0
train_stop = n_samples // 2
#clf_svm.fit(digits.data[train_start:train_stop], digits.target[train_start:train_stop])
#clf_rf.fit(digits.data[train_start:train_stop], digits.target[train_start:train_stop])
#clf_dt = DecisionTreeClassifier().fit(digits.data[train_start:train_stop], digits.target[train_start:train_stop])

#define a test sample and test
test_start = n_samples // 2
test_stop = n_samples
#expected_test_sample = digits.target[test_start:test_stop]
Example #37
        # averaging out the means for all channels
    mean_avg = []
    for i in range(0, 15):
        mean_avg.append((means_data_1[i] + means_data_2[i] + means_data_3[i] + means_data_4[i]) / 4)
        # print len(mean_avg)

    ratio_avg = []
    for i in range(0, 15):
        ratio_avg.append((ratio_data_1[i] + ratio_data_2[i] + ratio_data_3[i] + ratio_data_4[i]) / 4)
        # print (ratio_avg)

        # mean_center,mean_lab = trainSet(mean_avg)
        # ratio_center, ratio_lab = trainSet(ratio_avg)

    clf = NearestCentroid()
    X = []
    Y = []
    for i in range(0, 4):
        X.append([int(mean_avg[i]), ratio_avg[i]])
        Y.append(0)
    for i in range(5, 9):
        X.append([int(mean_avg[i]), ratio_avg[i]])
        Y.append(1)
    for i in range(10, 14):
        X.append([int(mean_avg[i]), ratio_avg[i]])
        Y.append(2)
        # print X
        # print Y
    clf.fit(X, Y)
    res = clf.predict([[mean_avg[14], ratio_avg[14]]])
Example #38
class clusteringST:
    '''
    Identification of sub-types for prediction
    ''' 

    def getClusters(self,net_data):
        self.avg_bin_mat = np.zeros((net_data.shape[0],net_data.shape[0]))
        self.avg_n_clusters = 0
        self.clust_list = []
        for i in range(net_data.shape[2]):
            ms = MeanShift()
            ms.fit(net_data[:,:,i])
            self.clust_list.append(ms)
            labels = ms.labels_
            cluster_centers = ms.cluster_centers_
            n_clusters_ = len(np.unique(labels))
            #print(labels,cluster_centers.shape,n_clusters_)
            #bin_mat = np.zeros(avg_bin_mat.shape)
             
            bin_mat = cls.ind2matrix(labels+1)>0
            self.avg_bin_mat += bin_mat
            self.avg_n_clusters += n_clusters_
    
        self.avg_bin_mat /= net_data.shape[2]
        self.avg_n_clusters /= net_data.shape[2]
        return self.avg_n_clusters
    
    def getMeanClustering(self):
        return self.avg_bin_mat

    def get_match_network(self,net_data,ncluster,algo='kmeans'):
        '''
        net_data: 3d volume (subjects x vecnetwork x vecnetwork)
        ncluster: number of groups to partition the subjects
        algo: (default: kmeans) kmeans, meanshift.
        '''
        valid_net_idx = []
        valid_cluster = []
        self.avg_bin_mat = np.zeros((net_data.shape[0],net_data.shape[0]))
        self.avg_n_clusters = 0
        
        for i in range(net_data.shape[2]):
            # Compute clustering with for each network
            if algo == 'kmeans':
                clust = KMeans(init='k-means++', n_clusters=ncluster, n_init=10)
            else:
                clust = MeanShift()
            
            #t0 = time.time()
            clust.fit(net_data[:,:,i])
            #t_batch = time.time() - t0
            # Compute the stability matrix among networks
            bin_mat = cls.ind2matrix(clust.labels_+1)>0
            self.avg_bin_mat += bin_mat
            self.avg_n_clusters += len(np.unique(clust.labels_))
            
            valid_cluster.append(clust)
            valid_net_idx.append(i)
        self.avg_bin_mat /= net_data.shape[2]
        self.avg_n_clusters /= net_data.shape[2]
        
        return valid_cluster, valid_net_idx

    def assigneSubtype(self,nets,valid_cluster, valid_net_idx):
        classes = []
        dist_centroid = np.array([])
        for i in range(len(valid_net_idx)):
            classes.append(valid_cluster[i].predict(nets[:,valid_net_idx[i]])[0])
            #points = np.vstack((nets[:,valid_net_idx[i]],valid_cluster[i].cluster_centers_))
            #dist_ = squareform(pdist(points, metric='euclidean'))[0,1:]
            #classes.append(np.argmin(dist_))
            points = np.vstack((nets[:,valid_net_idx[i]],valid_cluster[i].cluster_centers_))
            dist_ = squareform(pdist(points, metric='euclidean'))[0,1:]
            dist_centroid = np.hstack((dist_centroid,dist_))
        
        return classes, dist_centroid

    def assigneDist(self,nets,valid_cluster, valid_net_idx):
        classes = np.array([])
        for i in range(len(valid_net_idx)):
            #print  np.hstack((classes,(valid_cluster[i].transform(nets[:,valid_net_idx[i]])[0])))
            points = np.vstack((nets[:,valid_net_idx[i]],valid_cluster[i].cluster_centers_))
            dist_ = squareform(pdist(points, metric='euclidean'))[0,1:]
            #dist_ = squareform(pdist(points, metric='correlation'))[0,1:]
            classes = np.hstack((classes,dist_))
            #classes.append(np.argmin(dist_))
        return classes

    def fit_old(self,net_data,nnet_cluster='auto',nSubtypes=3):
        self.nnet_cluster = nnet_cluster
        self.nSubtypes = nSubtypes
        
        if nnet_cluster == 'auto':
            #self.nnet_cluster = self.getClusters(net_data)
            self.valid_cluster, self.valid_net_idx = self.get_match_network(net_data,nnet_cluster,algo='meanshift')
        else:
            self.valid_cluster, self.valid_net_idx = self.get_match_network(net_data,nnet_cluster,algo='kmeans')

        #self.valid_cluster = self.clust_list
        #self.valid_net_idx = range(len(self.valid_cluster))
        self.assign_net = np.array([])
        self.dist_net   = np.array([])
        for i in range(net_data.shape[0]):
            if i == 0 :
                classes_, dist_ = self.assigneSubtype(net_data[i,:,:],self.valid_cluster, self.valid_net_idx)
                self.dist_net = dist_
                self.assign_net = classes_
            else:
                classes_, dist_ = self.assigneSubtype(net_data[i,:,:],self.valid_cluster, self.valid_net_idx)
                self.dist_net = np.vstack((self.dist_net,dist_))
                self.assign_net = np.vstack((self.assign_net,classes_))

        # group subjects with the most networks classifying them together
        # compute the consensus clustering
        self.consensus = cls.hclustering(self.assign_net,self.nSubtypes)
        # save the centroids in a method
        self.clf_subtypes = NearestCentroid()
        self.clf_subtypes.fit(self.assign_net,self.consensus)
        self.consensus = self.clf_subtypes.predict(self.assign_net)
        #print "score: ", self.clf_subtypes.score(self.assign_net,self.consensus)

        return self.consensus

    def transform_low_scale_old(self,net_data):
        # net_data_low --> Dimensions: nSubjects, nNetwork_low, nNetwork
        nnet_cluster = np.max(self.ind_low_scale)
        net_data_low = []
        net_data_low = np.zeros((net_data.shape[0],nnet_cluster,net_data.shape[2]))

        for i in range(nnet_cluster):
            # average the appropriate parcels and scale them
            #net_data_low[:,i,:] = preprocessing.scale(net_data[:,self.ind_low_scale==i+1,:].mean(axis=1), axis=1)
            net_data_low[:,i,:] = net_data[:,self.ind_low_scale==i+1,:].mean(axis=1)
        return net_data_low

    def fit(self,net_data_low,nSubtypes=3,reshape_w=True):
        self.nnet_cluster = net_data_low.shape[1]
        self.nSubtypes = nSubtypes

        #ind_low_scale = cls.get_ind_high2low(low_res_template,orig_template)
        #self.ind_low_scale = ind_low_scale

        # net_data_low --> Dimensions: nSubjects, nNetwork_low, nNetwork
        #net_data_low = transform_low_scale(ts_data,self.ind_low_scale)
        self.net_data_low = net_data_low

        # st_templates --> Dimensions: nNetwork_low, nSubtypes, nNetwork
        st_templates = []
        for i in range(len(net_data_low[1])):
            # identity matrix of the correlation between subjects
            #tmp_subj_identity = np.corrcoef(net_data_low[:,i,:])
            #ind_st = cls.hclustering(tmp_subj_identity,nSubtypes)
            # subjects X network_nodes
            #ind_st = cls.hclustering(net_data_low[:,i,:]-np.mean(net_data_low[:,i,:],axis=0),nSubtypes)
            ind_st = cls.hclustering(net_data_low[:,i,:],nSubtypes)

            for j in range(nSubtypes):
                if j == 0:
                    st_templates_tmp = net_data_low[:,i,:][ind_st==j+1,:].mean(axis=0)[np.newaxis,...]
                else:
                    st_templates_tmp = np.vstack((st_templates_tmp,net_data_low[:,i,:][ind_st==j+1,:].mean(axis=0)[np.newaxis,...]))

            if i == 0:
                st_templates = st_templates_tmp[np.newaxis,...]
            else:
                st_templates = np.vstack((st_templates,st_templates_tmp[np.newaxis,...]))

        self.st_templates = st_templates

        # calculate the weights for each subjects
        self.W =  self.compute_weights(net_data_low)
        if reshape_w:
            return self.reshapeW(self.W)
        else:
            return self.W

    def compute_weights(self,net_data_low):
        # calculate the weights for each subjects
        W = np.zeros((net_data_low.shape[0],self.st_templates.shape[0],self.st_templates.shape[1]))
        for i in range(net_data_low.shape[0]):
            for j in range(self.st_templates.shape[0]):
                for k in range(self.st_templates.shape[1]):
                    # Demean
                    average_template = np.median(self.net_data_low[:,j,:],axis=0)
                    #average_template = self.st_templates[j,:,:].mean(axis=0)
                    dm_map = net_data_low[i,j,:] - average_template
                    dm_map = preprocessing.scale(dm_map)
                    st_dm_map = self.st_templates[j,k,:] - average_template
                    W[i,j,k] = np.corrcoef(st_dm_map,dm_map)[-1,0:-1]

        return W

    def transform(self,net_data_low,reshape_w=True):
        '''
            Calculate the weights for each sub-types previously computed
        '''
        # compute the low scale version of the data
        #net_data_low = transform_low_scale(ts_data,self.ind_low_scale)

        # calculate the weights for each subjects
        W = self.compute_weights(net_data_low)

        if reshape_w:
            return self.reshapeW(W)
        else:
            return W

    def reshapeW(self,W):
        # reshape the matrix from [subjects, Nsubtypes, weights] to [subjects, vector of weights]
        xw = W.reshape((W.shape[0], W.shape[1]*W.shape[2]))
        return xw

    def fit_dev(self,net_data,nnet_cluster='auto',nSubtypes=3):
        self.nnet_cluster = nnet_cluster
        self.nSubtypes = nSubtypes

        if nnet_cluster == 'auto':
            #self.nnet_cluster = self.getClusters(net_data)
            self.valid_cluster, self.valid_net_idx = self.get_match_network(net_data,nnet_cluster,algo='meanshift')
        else:
            self.valid_cluster, self.valid_net_idx = self.get_match_network(net_data,nnet_cluster,algo='kmeans')

        #self.valid_cluster = self.clust_list
        #self.valid_net_idx = range(len(self.valid_cluster))
        for i in range(net_data.shape[0]):
            if i == 0 :
                self.assign_net = self.assigneDist(net_data[i,:,:],self.valid_cluster, self.valid_net_idx)
            else:
                self.assign_net = np.vstack(((self.assign_net,self.assigneDist(net_data[i,:,:],self.valid_cluster, self.valid_net_idx))))
        print 'Size of the new data map: ', self.assign_net.shape
        # group together subjects whose networks are most often classified alike
        # compute the consensus clustering
        self.consensus = cls.hclustering(self.assign_net,self.nSubtypes)
        # save the centroids in a method
        self.clf_subtypes = NearestCentroid()
        self.clf_subtypes.fit(self.assign_net,self.consensus)
        self.consensus = self.clf_subtypes.predict(self.assign_net)
        #print "score: ", self.clf_subtypes.score(self.assign_net,self.consensus)

        return self.consensus
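    # Hypothetical usage sketch (the enclosing class name and input arrays are
    # assumed; this fragment does not show them):
    #
    #   model = SubtypeModel()                        # assumed class name
    #   consensus = model.fit_dev(net_data, nSubtypes=3)
    #   w = model.transform(net_data_low_new)         # weights for unseen subjects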
Example #39
print 'Reading features... Done!'

# STEP 2 - computing scores
print 'Training...'
tfidf = models.TfidfModel(dictionary=features) # Computing tfidf model to be queried.
tfidf.save('reuters/data/tfidf.model')

# STEP 3 - computing centroids
tfidf = models.TfidfModel.load('reuters/data/tfidf.model')
features = corpora.Dictionary.load_from_text('reuters/data/word.dict')
by_bow = Corpus2Dictionary(features)
train_corpus = ReutersCorpus('training')
tfidf_train = tfidf[by_bow[by_word[train_corpus]]]
X = matutils.corpus2csc(tfidf_train)  # convert the gensim corpus into a scipy sparse matrix
X = X.transpose() # from csc (document as column) to csr (document as row)
y = train_corpus.category_mask # label for doc
rocchio = NearestCentroid()
rocchio.fit(X, y)
print 'Training... Done!'

# STEP 4 - evaluate prediction
test_corpus = ReutersCorpus('test')
tfidf_test = tfidf[by_bow[by_word[test_corpus]]]
# num_terms required: otherwise X shrinks to the largest feature index found in the test set
X = matutils.corpus2csc(tfidf_test, num_terms=len(features))
X = X.transpose()
y_true = test_corpus.category_mask
y_pred = rocchio.predict(X)
# print precision_score(y_true, y_pred)
print rocchio.score(X, y_true)
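
# A rough modern equivalent of the tf-idf + Rocchio pipeline above, sketched
# with scikit-learn alone; train_docs/test_docs are placeholders, not the
# original Reuters loaders:
#
#   from sklearn.feature_extraction.text import TfidfVectorizer
#   from sklearn.neighbors import NearestCentroid
#   vec = TfidfVectorizer()
#   Xtr = vec.fit_transform(train_docs)           # train_docs: list of strings
#   rocchio = NearestCentroid().fit(Xtr, y)
#   print rocchio.score(vec.transform(test_docs), y_true)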
apr_dbz[:,iiap[0][1]:iiap[0][0]].T.shape


# In[217]:


ncc_sum = np.sum(apr_dbz[:,iiap[0][1]:iiap[0][0]],axis=0)
ncc_set = np.zeros_like(ncc_sum)
ncc_set[ncc_sum>0] = 1.0  # binary label: 1 wherever the column sum is positive


# In[218]:


nc = NearestCentroid()
ncc = nc.fit(apr_dbz[:,iiap[0][1]:iiap[0][0]].T,ncc_set)


# In[220]:


ncc.centroids_[1]


# In[237]:


plt.figure()
#plt.plot(ncc.centroids_[0],apr['altflt'][:,iap[0]],'.')
plt.plot(ncc.centroids_[1],apr['altflt'][:,iap[0]],'.')
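
# Sketch (added) of what NearestCentroid exposes after fit, with shapes assumed
# from the arrays above: centroids_ is (n_classes, n_features), so centroids_[1]
# is the mean profile of the bins labelled 1 in ncc_set:
#
#   ncc.centroids_.shape   # -> (2, n_range_gates)
#   ncc.centroids_[0]      # mean profile of the 0-labelled class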
print("--------------------Results-------------------")
print("Classification report for kNN classifier %s:\n%s\n"
     % (clf, metrics.classification_report(expected, predicted)))
print("Confusion matrix:\n%s" % metrics.confusion_matrix(expected, predicted))







#Nearest Centroid classification
start = int(round(time.time() * 1000))


classifier = NearestCentroid()
classifier.fit(X_lda, y_train)
# NearestCentroid(metric='euclidean', shrink_threshold=None)  <- echoed repr, not a live statement
print(classifier)



print("---------(5) Cross validation accuracy--------")
print(cross_validation.cross_val_score(classifier, X_lda,y_train, cv=5))


end = int(round(time.time() * 1000))
print("--Centroid fitting finished in ", (end-start), "ms--------------")


print("---------Test-set dimensions after PCA--------")
Example #42
#!/usr/bin/python
from sklearn.neighbors.nearest_centroid import NearestCentroid

import numpy

X = numpy.array([[-1,-1],[-2,-1],[-3,-2],[1,1],[2,1],[3,2]])
y = numpy.array([1,1,1,2,2,2])

clf = NearestCentroid()
clf.fit(X,y)

# NearestCentroid(metric='euclidean', shrink_threshold=None)  <- echoed repr from the interactive session
print clf.predict([[0, 1]])  # predict expects a 2-D array of samples
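
# Optional sketch (added): shrink_threshold shrinks each class centroid toward
# the overall data centroid ("nearest shrunken centroid"); the value 0.2 is
# illustrative, not from the original example.
clf_shrunk = NearestCentroid(shrink_threshold=0.2)
clf_shrunk.fit(X, y)
print clf_shrunk.predict([[0, 1]])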
def myclassify(numfiers=5,xtrain=xtrain,ytrain=ytrain,xtest=xtest,ytest=ytest):
    count = 0



    bagging2 = BaggingClassifier(ETC(),bootstrap=False,bootstrap_features=False)
    bagging2.fit(xtrain,ytrain)
    #print bagging2.score(xtest,ytest)
    count += 1
    classifiers = [bagging2.score(xtest,ytest)]

    if count < numfiers:

        tree2 = ETC()
        tree2.fit(xtrain,ytrain)
        #print tree2.fit(xtrain,ytrain)
        #print tree2.score(xtest,ytest)
        count+=1
        classifiers = np.append(classifiers,tree2.score(xtest,ytest))
        print "1"
        print tree2.score(xtest,ytest)

    if count < numfiers:
        bagging1 = BaggingClassifier(ETC())
        bagging1.fit(xtrain,ytrain)
        #print bagging1.score(xtest,ytest)
        count+=1
        classifiers = np.append(classifiers,bagging1.score(xtest,ytest))
        print "2"
        print bagging1.score(xtest,ytest)

#     if count < numfiers:
#         # votingClassifiers combine completely different machine learning classifiers and use a majority vote
#         clff1 = SVC()
#         clff2 = RFC(bootstrap=False)
#         clff3 = ETC()
#         clff4 = neighbors.KNeighborsClassifier()
#         clff5 = quadda()
#         print"3"


#         eclf = VotingClassifier(estimators = [('svc',clff1),('rfc',clff2),('etc',clff3),('knn',clff4),('qda',clff5)])
#         eclf = eclf.fit(xtrain,ytrain)
#         #print(eclf.score(xtest,ytest))
#         # for claf, label in zip([clff1,clff2,clff3,clff4,clff5,eclf],['SVC','RFC','ETC','KNN','QDA','Ensemble']):
#         #     cla
#         #     scores = crossvalidation.cross_val_score(claf,xtrain,ytrain,scoring='accuracy')
#         #     print ()
#         count+=1
#         classifiers = np.append(classifiers,eclf.score(xtest,ytest))


#     if count < numfiers:
#         svc1 = SVC()
#         svc1.fit(xtrain,ytrain)
#         dec = svc1.score(xtest,ytest)
#         count+=1
#         classifiers = np.append(classifiers,svc1.score(xtest,ytest))
#         print "3"

    if count < numfiers:
        # Quadradic discriminant analysis - classifier with quadratic decision boundary -
        qda = quadda()
        qda.fit(xtrain,ytrain)
        #print(qda.score(xtest,ytest))
        count+=1
        classifiers = np.append(classifiers,qda.score(xtest,ytest))
        print "4"


    if count < numfiers:

        tree1 = DTC()
        tree1.fit(xtrain,ytrain)
        #print tree1.fit(xtrain,ytrain)
        #print tree1.score(xtest,ytest)
        count+=1
        classifiers = np.append(classifiers,tree1.score(xtest,ytest))

    if count < numfiers:
        knn1 = neighbors.KNeighborsClassifier() # classifies by the k nearest neighbors, where k is defined by the user
        knn1.fit(xtrain,ytrain)
        #print(knn1.score(xtest,ytest))
        count+=1
        classifiers = np.append(classifiers,knn1.score(xtest,ytest))

    if count < numfiers:
        # linear discriminant analysis - classifier with linear decision boundary -
        lda = linda()
        lda.fit(xtrain,ytrain)
        #print(lda.score(xtest,ytest))
        count+=1
        classifiers = np.append(classifiers,lda.score(xtest,ytest))

    if count < numfiers:
        tree3 = RFC()
        tree3.fit(xtrain,ytrain)
        #print tree3.score(xtest,ytest)
        count+=1
        classifiers = np.append(classifiers,tree3.score(xtest,ytest))

    if count < numfiers:
        bagging3 = BaggingClassifier(RFC(),bootstrap=False,bootstrap_features=False)
        bagging3.fit(xtrain,ytrain)
        #print bagging3.score(xtest,ytest)
        count+=1
        classifiers = np.append(classifiers,bagging3.score(xtest,ytest))


    if count < numfiers:
        bagging4 = BaggingClassifier(SVC(),bootstrap=False,bootstrap_features=False)
        bagging4.fit(xtrain,ytrain)
        #print bagging4.score(xtest,ytest)
        count+=1
        classifiers = np.append(classifiers,bagging4.score(xtest,ytest))

    if count < numfiers:
        tree4 = RFC(bootstrap=False)
        tree4.fit(xtrain,ytrain)
        #print tree4.score(xtest,ytest)
        count+=1
        classifiers = np.append(classifiers,tree4.score(xtest,ytest))

    if count < numfiers:
        tree6 = GBC()
        tree6.fit(xtrain,ytrain)
        #print(tree6.score(xtest,ytest))
        count+=1
        classifiers = np.append(classifiers,tree6.score(xtest,ytest))

    if count < numfiers:
        knn2 = neighbors.KNeighborsClassifier(n_neighbors = 10)
        knn2.fit(xtrain,ytrain)
        #print(knn2.score(xtest,ytest))
        count+=1
        classifiers = np.append(classifiers,knn2.score(xtest,ytest))

    if count < numfiers:
        knn3 = neighbors.KNeighborsClassifier(n_neighbors = 3)
        knn3.fit(xtrain,ytrain)
        #print(knn3.score(xtest,ytest))
        count+=1
        classifiers = np.append(classifiers,knn3.score(xtest,ytest))

    if count < numfiers:
        knn4 = neighbors.KNeighborsClassifier(algorithm = 'ball_tree')
        knn4.fit(xtrain,ytrain)
        #print(knn4.score(xtest,ytest))
        count+=1
        classifiers = np.append(classifiers,knn4.score(xtest,ytest))

    if count < numfiers:
        knn5 = neighbors.KNeighborsClassifier(algorithm = 'kd_tree')
        knn5.fit(xtrain,ytrain)
        #print(knn5.score(xtest,ytest))
        count+=1
        classifiers = np.append(classifiers,knn5.score(xtest,ytest))

    if count < numfiers:
        ncc1 = NearestCentroid()
        ncc1.fit(xtrain,ytrain)
        #print (ncc1.score(xtest,ytest))
        count+=1
        classifiers = np.append(classifiers,ncc1.score(xtest,ytest))

    if count < numfiers:
        # Nearest shrunken centroid: sweep several shrink thresholds
        for shrinkage in [None,0.05,0.1,0.2,0.3,0.4,0.5]:
            ncc2 = NearestCentroid(shrink_threshold = shrinkage)
            ncc2.fit(xtrain,ytrain)
            #print(ncc2.score(xtest,ytest))

        count+=1
        # note: only the score for the last threshold (0.5) is recorded
        classifiers = np.append(classifiers,ncc2.score(xtest,ytest))

    if count < numfiers:
        tree5 = ABC()
        tree5.fit(xtrain,ytrain)
        #print(tree5.score(xtest,ytest))
        count+=1
        classifiers = np.append(classifiers,tree5.score(xtest,ytest))

    classifierlabel = ["BaggingETC (with bootstraps set to false)","ETC","BaggingETC","Voting Classifier","svm","QDA","DTC","KNN (default)","LDA","RFC",
                       "BaggingRFC (with bootstraps set to false)","BaggingSVC (with bootstraps set to false)","RFC (bootstrap false)","GBC",
                        "knn (n_neighbors = 10)","knn (n_neighbors = 3)","knn (ball tree algorithm)","knn (kd_tree algorithm)",
                       "Nearest Centroid","Shrunken Centroid?","ABC"]


    classifierlabel = classifierlabel[:len(classifiers)]
    #print len(classifiers)
    #print classifiers
    for i in range(len(classifiers)):


        print ("{} classifier has percent correct {}".format(classifierlabel[i],classifiers[i]))
Example #44
        'finish launching Random Forest Classifier, the test accuracy is {:.5%}'
        .format(rf.score(X_val, y_val)))
    rf_predict = rf.predict(X_test)

    print('=' * 100)
    print('start launching SVM Classifier......')
    svm_clf = svm.SVC()  # renamed so the imported svm module is not shadowed
    svm_clf.fit(X_train, y_train)
    print(
        'finish launching SVM Classifier, the test accuracy is {:.5%}'.format(
            svm_clf.score(X_val, y_val)))
    svm_predict = svm_clf.predict(X_test)

    print('=' * 100)
    print('start launching Nearest Centroid Classifier......')
    ncc = NearestCentroid()  # note: this is NearestCentroid, not a true KNN
    ncc.fit(X_train, y_train)
    print(
        'finish launching Nearest Centroid Classifier, the test accuracy is {:.5%}'
        .format(ncc.score(X_val, y_val)))
    ncc_predict = ncc.predict(X_test)  # keep the predictions instead of discarding them

    print('=' * 100)
    print('start launching Decision Tree Classifier......')
    dtree = tree.DecisionTreeClassifier()
    dtree.fit(X_train, y_train)
    print(
        'finish launching Decision Tree Classifier, the test accuracy is {:.5%}'
        .format(dtree.score(X_val, y_val)))
    dtree_predict = dtree.predict(X_test)
	all_instances.append(row1)
	if(row1[0] > maxlength):
	    maxlength = row1[0];

for row2 in negative:
	row2 = row2[:-1]
	row2 = row2.split(',')
	row2 = [int(i) for i in row2]
	all_instances.append(row2)
	if(row2[0] > maxlength):
	    maxlength = row2[0];

for instance in all_instances:
    instance[0] = instance[0] / maxlength  # normalize the length feature

random.shuffle(all_instances)
# print all_instances[0:700]
print "all_instances size: ", len(all_instances)
train_set = np.array(all_instances[0:700])
test_set = np.array(all_instances[700:])  # start at 700 so instance 700 is not dropped

print train_set[:,:-1]

X = np.array(train_set[:,:-1])
Y = np.array(train_set[:,-1])

clf = NearestCentroid()
clf.fit(X, Y)
prediction = clf.predict(test_set[:,:-1])

evaluation(prediction, test_set[:,-1])
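
# Equivalent split via scikit-learn (sketch; reuses the arrays built above and
# shuffles internally, so the manual random.shuffle becomes optional):
#
#   from sklearn.model_selection import train_test_split
#   X_all = np.array([inst[:-1] for inst in all_instances])
#   y_all = np.array([inst[-1] for inst in all_instances])
#   X_tr, X_te, y_tr, y_te = train_test_split(X_all, y_all, train_size=700)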
Example #46
from sklearn import svm
import numpy as np
import serial
from sklearn.neighbors.nearest_centroid import NearestCentroid

def removeMag(line):
	return line[6:]

x = []
y = []
small = False
#clf = svm.LinearSVC()
clf = NearestCentroid()
folder = "gyro_side\\"
files = ['still.csv', 'yes.csv', 'no.csv']
for i in range(3):
	f =open(folder+files[i], 'r')

	for line in f.readlines():
		#print line
		
		line = [int(a) for a in line.split(',')]
		lines = [removeMag(line[9*j:9*j+9]) for j in range(9)]
		# smallLine=[]
		# for j in range(5):
			
			# smallLine = smallLine + line[6*j:6*j+3]
		# if small:
			# line=smallLine
		# if len(x)==0:
			# x= np.array(np.array([line]))
Example #47
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.neighbors.nearest_centroid import NearestCentroid
from sklearn.svm import SVC

from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.discriminant_analysis import (LinearDiscriminantAnalysis,
                                           QuadraticDiscriminantAnalysis)

from sklearn.neural_network import MLPClassifier

classifiers_by_name = {
    '10-nearest-neighbors': lambda: KNeighborsClassifier(n_neighbors=10),
    'nearest-centroid-mean': lambda: NearestCentroid(metric='euclidean'),
    'nearest-centroid-median': lambda: NearestCentroid(metric='manhattan'),
    'logistic-regression': LogisticRegression,
    'sgd': SGDClassifier,
    'linear-svm': lambda: SVC(kernel='linear'),
    'quadratic-svm': lambda: SVC(kernel='poly', degree=2),
    'cubic-svm': lambda: SVC(kernel='poly', degree=3),
    'rbf-svm': lambda: SVC(kernel='rbf'),
    'decision-tree': DecisionTreeClassifier,
    'random-forest': RandomForestClassifier,
    'adaboost': AdaBoostClassifier,
    'gaussian-naive-bayes': GaussianNB,
    'lda': LinearDiscriminantAnalysis,
    'qda': QuadraticDiscriminantAnalysis,
    'multilayer-perceptron': MLPClassifier,
}
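
# Usage sketch (assumed, not part of the original listing): every entry is a
# zero-argument factory, so a fresh estimator can be built by name:
#
#   clf = classifiers_by_name['nearest-centroid-mean']()
#   clf.fit(X_train, y_train)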
Example #48
from sklearn.neighbors.nearest_centroid import NearestCentroid

import time

conf_mat = numpy.zeros(
    (len(no_imgs), len(no_imgs)))  # Initializing the Confusion Matrix

n_neighbors = 1  # better to have this at the start of the code

# 10-fold Cross Validation

for i in range(kfold):
    train_indices = skfind[i][0]
    test_indices = skfind[i][1]
    clf = NearestCentroid()  # fresh classifier for each fold (the clf = [] reset was redundant)

    X_train = X[train_indices]
    y_train = y[train_indices]
    X_test = X[test_indices]
    y_test = y[test_indices]

    # Training
    tic = time.time()
    clf.fit(X_train, y_train)
    toc = time.time()
    print "training time= ", toc - tic  # roughly 2.5 secs

    # Testing
    y_predict = []
    tic = time.time()
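# The testing half of the fold loop is truncated above; a hedged completion
# would predict per fold and accumulate the confusion matrix, roughly:
#
#   y_predict = clf.predict(X_test)
#   conf_mat += confusion_matrix(y_test, y_predict)   # from sklearn.metrics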
Example #49
columns = list(df.columns.values)
df = df.values
words = df[:, :-1]  #selecting words
labels = df[:, -1]  #selecting Labels
X_train, X_test, Y_train, Y_test = train_test_split(words,
                                                    labels,
                                                    test_size=0.2,
                                                    random_state=40)
from sklearn.metrics import accuracy_score, confusion_matrix
from matplotlib import style
import matplotlib.pyplot as plt
get_ipython().run_line_magic('matplotlib', 'inline')
style.use('ggplot')

# Rocchio Algorithm
clf = NearestCentroid()
clf.fit(X_train, Y_train)
predict = clf.predict(X_test)
accuracy = accuracy_score(Y_test, predict)
print('\nAccuracy of Rocchio:\n')
print(accuracy)
conf_mat = confusion_matrix(Y_test, predict)
print('\nConfusion Matrix: \n', conf_mat)
plt.matshow(conf_mat)
plt.title('Confusion Matrix for test Data\t')
plt.colorbar()
plt.ylabel('True label')
plt.xlabel('Predicted label')
plt.show()
# Naive Bayes
clf_1 = GaussianNB()
        results.append(
            benchmark(SGDClassifier(alpha=.0001,
                                    n_iter=50,  # n_iter was renamed max_iter in newer scikit-learn releases
                                    penalty="elasticnet"),
                      X_train,
                      y_train,
                      X_test,
                      y_test,
                      target_names,
                      feature_names=feature_names))

        # Train NearestCentroid without threshold
        #print('=' * 80)
        #print("NearestCentroid (aka Rocchio classifier)")
        results.append(
            benchmark(NearestCentroid(),
                      X_train,
                      y_train,
                      X_test,
                      y_test,
                      target_names,
                      feature_names=feature_names))

        # Train sparse Naive Bayes classifiers
        #print('=' * 80)
        #print("Naive Bayes")
        results.append(
            benchmark(MultinomialNB(alpha=.01),
                      X_train,
                      y_train,
                      X_test,
Example #51
print 'knn_benchmark_targets: ' + str(len(knn_benchmark_targets))
print 'rf_benchmark_targets: ' + str(len(rf_benchmark_targets))

# Takes a list, creates a csv file
def submitFile(x, pre):
    f = open(pre + '_submission.csv', 'w')
    for val in x:
        f.write(str(val) + ',\r')
    f.close()

# ==============================================================================
# Nearest centroid classifier
# ==============================================================================
from sklearn.neighbors.nearest_centroid import NearestCentroid

ncc = NearestCentroid()
ncc.fit(features_to_train, targets_to_train)

predicted_targets = ncc.predict(features_to_test)

# Just print out the precision and f1 scores
print 'precision: %0.5f' % metrics.precision_score(rf_benchmark_targets, predicted_targets)
print 'f1 score: %0.5f' % metrics.f1_score(rf_benchmark_targets, predicted_targets)

# The following scores are used for classification models
# (metrics.zero_one_score / metrics.zero_one were removed from scikit-learn;
# accuracy_score and zero_one_loss are the current equivalents)
print 'accuracy: %0.5f' % metrics.accuracy_score(rf_benchmark_targets, predicted_targets)
print 'loss: %d' % metrics.zero_one_loss(rf_benchmark_targets, predicted_targets, normalize=False)

# ==============================================================================
# Multinomial naive bayes
# ==============================================================================
Example #52
>>> sigmoid_svc = svm.SVC(kernel='sigmoid')
>>> sigmoid_svc.fit(X_train, Y_train)
>>> accuracy_score(Y_test,sigmoid_svc.predict(X_test).round()) #0.5617977528089888
#.................................................................................................#
## Nearest Centroid Classifier
>>> from sklearn.neighbors.nearest_centroid import NearestCentroid
>>> import numpy as np
>>> file = open("/home/banafshbts/Desktop/hosh/76/all")
>>> file.readline()
>>> data = np.loadtxt(file,delimiter=',')
>>> X_train = data[0:810, 0:12]
>>> Y_train = data[0:810, 13]
>>> X_test = data[810:, 0:12]
>>> Y_test = data[810:, 13]
>>> clf = NearestCentroid()
>>> clf.fit(X_train, Y_train)
>>> accuracy_score(clf.predict(X_test),Y_test) #0.5842696629213483
#.................................................................................................#
##Gaussian Naive Bayes
>>> from sklearn.naive_bayes import GaussianNB
>>> import numpy as np
>>> file = open("/home/banafshbts/Desktop/hosh/76/all")
>>> file.readline()
>>> data = np.loadtxt(file,delimiter=',')
>>> X_train = data[0:810, 0:12]
>>> Y_train = data[0:810, 13]
>>> X_test = data[810:, 0:12]
>>> Y_test = data[810:, 13]
>>> gnb = GaussianNB()
Example #53
    def nn_centroid(self, X, y, test):
        clf = NearestCentroid()
        clf.fit(X, y)
        t = clf.predict(test)
        print("nn_centroid:", t)
        return t
Example #54
accuracy_score(
    Y_test,
    rbf_svc.predict(X_test).round(
    ))  #0.4157303370786517   0.0092165898617511521  0.33640552995391704
#pre.append(precision_score(Y_test, rbf_svc.predict(X_test), average='macro'))
sigmoid_svc = svm.SVC(kernel='sigmoid')
sigmoid_svc.fit(X_train, Y_train)
accuracy_score(
    Y_test,
    sigmoid_svc.predict(X_test).round(
    ))  #0.5617977528089888   0.027649769585253458  0.16589861751152074
#pre.append(precision_score(Y_test, sigmoid_svc.predict(X_test), average='macro'))
#.................................................................................................#
## Nearest Centroid Classifier
from sklearn.neighbors.nearest_centroid import NearestCentroid
ncc_clf = NearestCentroid()
ncc_clf.fit(X_train, Y_train)
accuracy_score(
    ncc_clf.predict(X_test),
    Y_test)  #0.5842696629213483    0.72811059907834097   0.3686635944700461
#.................................................................................................#
##Gaussian Naive Bayes
from sklearn.naive_bayes import GaussianNB
gnb = GaussianNB()
accuracy_score(
    gnb.fit(X_train, Y_train).predict(X_test),
    Y_test)  #0.5955056179775281   0.26728110599078342  0.25345622119815669
#.................................................................................................#
##DecisionTreeClassifier
from sklearn import tree
DT_clf = tree.DecisionTreeClassifier()
Example #55
class TwoWordRecognizer:

    def scaler(self,arr):
        return arr/np.max(np.abs(arr))*100

    def get_startingpoint(self,arr):
        arr = np.abs(arr)
        st_i = 0
        e_i = STEPS
        old_value = np.sum(arr[st_i:e_i,0])
        counter = 0
        while e_i < arr.shape[0]:
            arr_sum = np.sum(arr[st_i:e_i,0])
            if(arr_sum>old_value*FACTOR):
                return st_i
            else:
                if(old_value<arr_sum):
                    old_value = arr_sum
                st_i+=STEPS
                e_i+=STEPS
        return 10000  # sentinel: no clear onset was found

    def get_endingpoint(self,arr):
        arr = np.abs(arr)
        e_i = arr.shape[0]-1
        st_i = e_i - STEPS
        old_value = np.sum(arr[st_i:e_i,0])
        while st_i > 0:
            arr_sum = np.sum(arr[st_i:e_i,0])
            if(arr_sum>old_value*FACTOR):
                return e_i
            else:
                if(old_value<arr_sum):
                        old_value = arr_sum
                st_i -= STEPS
                e_i -= STEPS
        return 10000  # sentinel: no clear ending was found

    def euclidean_distance(self,arr1,arr2):
        a1 = arr1.copy()
        a2 = arr2.copy()
        if(a1.shape[0]<a2.shape[0]):
            zero_rows = a2[a1.shape[0]:a2.shape[0],[0,1]].copy()
            zero_rows[:,:] = 0
            a1 = np.concatenate((a1,zero_rows))
        elif(a1.shape[0]>a2.shape[0]):
            zero_rows = a1[a2.shape[0]:a1.shape[0],[0,1]].copy()
            zero_rows[:,:] = 0
            a2 = np.concatenate((a2,zero_rows))
        # note: despite the method name, this reduces to the sum of absolute
        # differences over channel 0 (the sqrt of a square is an absolute value)
        dist = np.sqrt((a2[:,0]-a1[:,0])**2)
        return np.sum(dist)

    def loadReferenceWords(self, word1_path, word2_path):
        fs, self.word1 = wavfile.read(word1_path)
        fs, self.word2 = wavfile.read(word2_path)
        self.word1 =  self.scaler(self.word1)
        self.word2 = self.scaler(self.word2)
        self.word1 = self.word1[self.get_startingpoint(self.word1):self.get_endingpoint(self.word1),:]
        self.word2 = self.word2[self.get_startingpoint(self.word2):self.get_endingpoint(self.word2),:]

    def loadData(self, ressourcepath1, ressourcepath2):
        print(ressourcepath1)
        dirList = os.listdir(ressourcepath1)
        fullpath1 = []
        for fname in dirList:
            fullpath1.append(ressourcepath1 + fname)  # the "" concatenation was a no-op; assumes the path ends with a separator

        dirList = os.listdir(ressourcepath2)
        fullpath2 = []
        for fname in dirList:
            fullpath2.append(ressourcepath2 + fname)  # same assumption as above
        counter = 0
        for path in fullpath1:
            if counter == 0:
                fs, w1 = wavfile.read(path)
                w1 = self.scaler(w1)
                w1 = w1[self.get_startingpoint(w1):self.get_endingpoint(w1),:]
                X = np.array([self.euclidean_distance(self.word1,w1),self.euclidean_distance(self.word2,w1)])
                y = np.array([1])
                counter = 1
            else:
                fs, w1 = wavfile.read(path)
                w1 = self.scaler(w1)
                w1 = w1[self.get_startingpoint(w1):self.get_endingpoint(w1),:]
                X = np.vstack((X,np.array([self.euclidean_distance(self.word1,w1),self.euclidean_distance(self.word2,w1)])))
                y = np.hstack((y,np.array([1])))

        for path in fullpath2:
                fs, w2 = wavfile.read(path)
                w2 = self.scaler(w2)
                w2 = w2[self.get_startingpoint(w2):self.get_endingpoint(w2),:]
                X = np.vstack((X,np.array([self.euclidean_distance(self.word1,w2),self.euclidean_distance(self.word2,w2)])))
                y = np.hstack((y,np.array([2])))
        from sklearn.neighbors.nearest_centroid import NearestCentroid
        self.clf = NearestCentroid()
        self.clf.fit(X,y)
        #import matplotlib.pyplot as plt
        #plt.scatter(X[:,0],X[:,1])
        #plt.show()

    def predict(self,input_path):
        fs, raw_arr = wavfile.read(input_path)
        raw_arr = self.scaler(raw_arr)
        word= raw_arr[self.get_startingpoint(raw_arr):self.get_endingpoint(raw_arr),:]
        x0 = np.array([self.euclidean_distance(self.word1,word),self.euclidean_distance(self.word2,word)])
        return self.clf.predict([x0])  # wrapped in a list: predict expects a 2-D array of samples
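
# Hypothetical end-to-end usage (file and directory names are placeholders):
#
#   rec = TwoWordRecognizer()
#   rec.loadReferenceWords("ref/yes.wav", "ref/no.wav")
#   rec.loadData("data/yes/", "data/no/")
#   print(rec.predict("recordings/unknown.wav"))   # -> array([1]) or array([2])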
Example #56
models.append(classifier.fit(X_train, y_train))

from sklearn.naive_bayes import BernoulliNB
classifier = BernoulliNB()
models.append(classifier.fit(X_train, y_train))

from sklearn.naive_bayes import MultinomialNB
classifier = MultinomialNB()
models.append(classifier.fit(X_train, y_train))

from sklearn.neighbors import KNeighborsClassifier # KNN
classifier = KNeighborsClassifier()
models.append(classifier.fit(X_train, y_train))

from sklearn.neighbors.nearest_centroid import NearestCentroid
classifier = NearestCentroid()
models.append(classifier.fit(X_train, y_train))

from sklearn.gaussian_process import GaussianProcessClassifier # gaussian process
classifier = GaussianProcessClassifier()
models.append(classifier.fit(X_train, y_train))

from sklearn.tree import DecisionTreeClassifier # decision trees. For interesting tree vizualisation, see graphviz module
classifier = DecisionTreeClassifier()
models.append(classifier.fit(X_train, y_train))

from sklearn.ensemble import BaggingClassifier # bagging meta classifier
classifier = BaggingClassifier()
models.append(classifier.fit(X_train, y_train))

from sklearn.ensemble import RandomForestClassifier # everyone's favorite homeboy random forest
    df_input3_target = filtered3[list(range(0,1))].to_numpy()  # as_matrix() was removed from pandas; to_numpy() is the replacement

    df_input4_data = filtered4[list(range(2,76))].to_numpy()
    df_input4_target = filtered4[list(range(0,1))].to_numpy()

    df_input5_data = filtered5[list(range(2,76))].to_numpy()
    df_input5_target = filtered5[list(range(0,1))].to_numpy()

    # df_input_data = filtered[list(range(2,76))].to_numpy()
    # df_input_target = filtered[list(range(0,1))].to_numpy()

    # Nearest Centroid
    from sklearn.neighbors.nearest_centroid import NearestCentroid

    knc1 = NearestCentroid()
    knc1.fit(df_input1_data,numpy.ravel(df_input1_target))
    pickle.dump(knc1, open('model_knc_t1.pkl', 'wb'))

    knc2 = NearestCentroid()
    knc2.fit(df_input2_data,numpy.ravel(df_input2_target))
    pickle.dump(knc2, open('model_knc_t2.pkl', 'wb'))

    knc3 = NearestCentroid()
    knc3.fit(df_input3_data,numpy.ravel(df_input3_target))
    pickle.dump(knc3, open('model_knc_t3.pkl', 'wb'))

    knc4 = NearestCentroid()
    knc4.fit(df_input4_data,numpy.ravel(df_input4_target))
    pickle.dump(knc4, open('model_knc_t4.pkl', 'wb'))
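
    # Mirror-image loading sketch (file names as written in the dumps above):
    #
    #   with open('model_knc_t1.pkl', 'rb') as fh:
    #       knc1 = pickle.load(fh)
    #   knc1.predict(df_input1_data[:5])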
Example #58
df = df.values
words = df[:, :-1]
labels = df[:, -1]
X_train, X_test, Y_train, Y_test = train_test_split(words,
                                                    labels,
                                                    test_size=0.2,
                                                    random_state=50)

from sklearn.metrics import accuracy_score, confusion_matrix
from matplotlib import style
import matplotlib.pyplot as plt
get_ipython().run_line_magic('matplotlib', 'inline')
style.use('ggplot')

# Rocchio Algorithm
clf = NearestCentroid()
clf.fit(X_train, Y_train)
predict = clf.predict(X_test)
accuracy = accuracy_score(Y_test, predict)
print('\nAccuracy of Rocchio:\n')
print(accuracy)
conf_mat = confusion_matrix(Y_test, predict)
print('\nConfusion Matrix: \n', conf_mat)
plt.matshow(conf_mat)
plt.title('Confusion Matrix for test Data\t')
plt.colorbar()
plt.ylabel('True label')
plt.xlabel('Predicted label')
plt.show()
# Naive Bayes
clf_1 = GaussianNB()
Example #59
def myclassify_practice_set(numfiers,xtrain,ytrain,xtltrain,xtltest,xtest,ytarget=None,testing=False,grids='ABCDEFGHI'):
    #NOTE we might not need xtltrain
    # xtrain and ytrain are your training set. xtltrain is the indices of corresponding recordings in xtrain and ytrain. these will always be present
    #xtest is your testing set. xtltest is the corresponding indices of the recording. for the practice set xtltest = xtrunclength
    # ytest is optional and depends on if you are using a testing set or the practice set

    # remove NaN, Inf, and -Inf values from the xtest feature matrix
    xtest,xtltest,ytarget = removeNanAndInf(xtest,xtltest,ytarget)
    # print 'finished removal of Nans'

    ytrain = np.ravel(ytrain)
    ytarget = np.ravel(ytarget)


    #if xtest is an NxM matrix, returns an N x numfiers matrix where each column is one classifier's prediction vector
    count = 0
    # print numfiers

    predictionMat = np.empty((xtest.shape[0],numfiers))
    predictionStringMat = []
    finalPredMat = []
    targetStringMat = []
    targets1 = []
    predictions1 = []

    # svc1 = SVC()
    # svc1.fit(xtrain,ytrain)
    # ytest = svc1.predict(xtest)
    # predictionMat[:,count] = ytest
    # count+=1
    if count < numfiers:
        # votingClassifiers combine completely different machine learning classifiers and use a majority vote
        clff1 = SVC()
        clff2 = RFC(bootstrap=False)
        clff3 = ETC()
        clff4 = neighbors.KNeighborsClassifier()
        clff5 = quadda()



        eclf = VotingClassifier(estimators = [('svc',clff1),('rfc',clff2),('etc',clff3),('knn',clff4),('qda',clff5)])
        eclf = eclf.fit(xtrain,ytrain)
        #print(eclf.score(xtest,ytest))
        # for claf, label in zip([clff1,clff2,clff3,clff4,clff5,eclf],['SVC','RFC','ETC','KNN','QDA','Ensemble']):
        #     cla
        #     scores = crossvalidation.cross_val_score(claf,xtrain,ytrain,scoring='accuracy')
        #     print ()
        ytest = eclf.predict(xtest)
        predictionMat[:,count] = ytest
        count+=1

    if count < numfiers:

        bagging2 = BaggingClassifier(ETC(),bootstrap=False,bootstrap_features=False)
        bagging2.fit(xtrain,ytrain)
        #print bagging2.score(xtest,ytest)
        ytest = bagging2.predict(xtest)
        predictionMat[:,count] = ytest
        count += 1


    if count < numfiers:

        tree2 = ETC()
        tree2.fit(xtrain,ytrain)
        ytest = tree2.predict(xtest)
        predictionMat[:,count] = ytest
        count+=1


    if count < numfiers:
        bagging1 = BaggingClassifier(ETC())
        bagging1.fit(xtrain,ytrain)
        #print bagging1.score(xtest,ytest)
        ytest = bagging1.predict(xtest)
        predictionMat[:,count] = ytest
        count+=1



    if count < numfiers:
        svc1 = SVC()
        svc1.fit(xtrain,ytrain)
        ytest = svc1.predict(xtest)
        predictionMat[:,count] = ytest
        count+=1

    if count < numfiers:
        # Quadradic discriminant analysis - classifier with quadratic decision boundary -
        qda = quadda()
        qda.fit(xtrain,ytrain)
        #print(qda.score(xtest,ytest))
        ytest = qda.predict(xtest)
        predictionMat[:,count] = ytest
        count+=1



    if count < numfiers:

        tree1 = DTC()
        tree1.fit(xtrain,ytrain)
        ytest = tree1.predict(xtest)
        predictionMat[:,count] = ytest
        count+=1

    if count < numfiers:
        knn1 = neighbors.KNeighborsClassifier() # classifies by the k nearest neighbors, where k is defined by the user
        knn1.fit(xtrain,ytrain)
        ytest = knn1.predict(xtest)
        predictionMat[:,count] = ytest
        count+=1


    if count < numfiers:
        # linear discriminant analysis - classifier with linear decision boundary -
        lda = linda()
        lda.fit(xtrain,ytrain)
        ytest = lda.predict(xtest)
        predictionMat[:,count] = ytest
        count+=1

    if count < numfiers:
        tree3 = RFC()
        tree3.fit(xtrain,ytrain)
        ytest = tree3.predict(xtest)
        predictionMat[:,count] = ytest
        count+=1

    if count < numfiers:
        bagging3 = BaggingClassifier(RFC(),bootstrap=False,bootstrap_features=False)
        bagging3.fit(xtrain,ytrain)
        ytest = bagging3.predict(xtest)
        predictionMat[:,count] = ytest
        count+=1



    if count < numfiers:
        bagging4 = BaggingClassifier(SVC(),bootstrap=False,bootstrap_features=False)
        bagging4.fit(xtrain,ytrain)
        ytest = bagging4.predict(xtest)
        predictionMat[:,count] = ytest
        count+=1


    if count < numfiers:
        tree4 = RFC(bootstrap=False)
        tree4.fit(xtrain,ytrain)
        ytest = tree4.predict(xtest)
        predictionMat[:,count] = ytest
        count+=1

    if count < numfiers:
        tree6 = GBC()
        tree6.fit(xtrain,ytrain)
        ytest = tree6.predict(xtest)
        predictionMat[:,count] = ytest
        count+=1

    if count < numfiers:
        knn2 = neighbors.KNeighborsClassifier(n_neighbors = 10)
        knn2.fit(xtrain,ytrain)
        ytest = knn2.predict(xtest)
        predictionMat[:,count] = ytest
        count+=1


    if count < numfiers:
        knn3 = neighbors.KNeighborsClassifier(n_neighbors = 3)
        knn3.fit(xtrain,ytrain)
        ytest = knn3.predict(xtest)
        predictionMat[:,count] = ytest
        count+=1


    if count < numfiers:
        knn4 = neighbors.KNeighborsClassifier(algorithm = 'ball_tree')
        knn4.fit(xtrain,ytrain)
        ytest = knn4.predict(xtest)
        predictionMat[:,count] = ytest
        count+=1


    if count < numfiers:
        knn5 = neighbors.KNeighborsClassifier(algorithm = 'kd_tree')
        knn5.fit(xtrain,ytrain)
        ytest = knn5.predict(xtest)
        predictionMat[:,count] = ytest
        count+=1



    if count < numfiers:
        ncc1 = NearestCentroid()
        ncc1.fit(xtrain,ytrain)
        ytest = ncc1.predict(xtest)
        predictionMat[:,count] = ytest
        count+=1

    if count < numfiers:
        tree5 = ABC()
        tree5.fit(xtrain,ytrain)
        ytest = tree5.predict(xtest)
        predictionMat[:,count] = ytest
        count+=1


    # print xtltest
    # print len(ytest)
    for colCount in range(predictionMat.shape[1]):
        tempCol = predictionMat[:,colCount]
        if testing:
            modeCol = temppredWindowVecModeFinder(tempCol,xtltest,4,grids,isPrint=0)
        else:
            modeCol = predWindowVecModeFinder(tempCol,xtltest,4,isPrint=0)

        ytarg = predWindowVecModeFinder(ytarget,xtltest,1,isPrint=0)
        if testing:
             modeStr = temppredVec2Str(modeCol,grids)
        else:
            modeStr = predVec2Str(modeCol)
        modeStrans = predVec2Str(ytarg)
        predictionStringMat.append(modeStr)
        predictions1.append(modeCol)
        finalPredMat += map(int,modeCol)
        targetStringMat.append(modeStrans)
        targets1.append(ytarg)
        if testing == False:
            if ytarget is not None:  # '!= None' on an ndarray compares elementwise and raises here
                #print targets1
                #print ""
                #print predictions1
                confusionme = confusion_matrix(targets1[0],predictions1[0])
                #print "Confusion Matrix is: "
                #print confusionme


    return predictionStringMat, targetStringMat, finalPredMat
Example #60
def run_knn(train_varnames, train_labels, test_varnames, test_labels):
    # despite its name, this runs NearestCentroid (Rocchio), not k-nearest neighbours
    clf = NearestCentroid()
    result, accuracy = fit_predict(clf, "Nearest Centroid Classifier", train_varnames, train_labels, test_varnames, test_labels)
    return result,accuracy