def nearest_neighbors(X, Y):
    '''
    Trains and evaluates two classifiers from the nearest-neighbors family:
    k-nearest neighbors (KNN) and nearest centroid. Both are scored with
    5-fold stratified cross-validation and mean metrics are reported.
    '''
    import numpy as np
    from sklearn import neighbors

    # Cross-validate KNN and collect per-fold scores
    accuracy_scores, precision_scores, recall_scores, f1_scores = model_compilation(
        X, Y, neighbors.KNeighborsClassifier(n_neighbors=15, weights='uniform'))

    # Report mean scores for KNN
    print(" ======= KNN (n_neighbors = 15, weights = 'uniform') ======= ")
    print("Accuracy: ", np.mean(accuracy_scores))
    print("Precision: ", np.mean(precision_scores))
    print("Recall: ", np.mean(recall_scores))
    print("F1: ", np.mean(f1_scores))

    # Cross-validate the nearest centroid classifier and collect per-fold scores
    accuracy_scores, precision_scores, recall_scores, f1_scores = model_compilation(
        X, Y, neighbors.NearestCentroid())

    # Report mean scores for nearest centroid
    print(" ======= Nearest Centroid ======= ")
    print("Accuracy: ", np.mean(accuracy_scores))
    print("Precision: ", np.mean(precision_scores))
    print("Recall: ", np.mean(recall_scores))
    print("F1: ", np.mean(f1_scores))
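# The snippet above calls an undisclosed helper, model_compilation(). A minimal
# sketch of what it could look like, assuming it runs the 5-fold stratified
# cross-validation named in the docstring and returns per-fold score lists
# (only the helper's name and return order come from the call site; the body
# is hypothetical):
from sklearn.model_selection import StratifiedKFold, cross_validate

def model_compilation(X, Y, model):
    # 5-fold stratified CV with macro-averaged precision/recall/F1
    cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=0)
    scores = cross_validate(
        model, X, Y, cv=cv,
        scoring=('accuracy', 'precision_macro', 'recall_macro', 'f1_macro'))
    return (scores['test_accuracy'], scores['test_precision_macro'],
            scores['test_recall_macro'], scores['test_f1_macro'])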
def _train(self):
    x = self._train_features
    y = self._train_outputs

    pipe = pipeline.Pipeline([
        # Dropped columns are redundant or derived:
        #   x14 == x10
        #   x8 == x3
        #   x9 == x6^2 - C
        ('drop', transformers.ColumnDropper(columns=(7, 8, 11, 12, 13, 14))),
        ('scale', preprocessing.StandardScaler(with_mean=True, with_std=True)),
        ('expand', preprocessing.PolynomialFeatures(degree=2,
                                                    interaction_only=True,
                                                    include_bias=False)),
        ('select', feature_selection.SelectKBest(
            k=26, score_func=feature_selection.mutual_info_classif)),
        ('estim', neighbors.NearestCentroid(metric='euclidean',
                                            shrink_threshold=None)),
    ])

    pipe.fit(x, y)
    self._model = pipe.predict
def get_classifier(classifier_str):
    '''
    Maps the classifier string classifier_str to the corresponding
    classifier object with the default parameters set.
    '''
    # SVC
    if classifier_str == 'linearsvc':
        cl = svm.LinearSVC(**svm_default_param)
    elif classifier_str == 'svc_linear':
        libsvm_default_param['kernel'] = 'linear'
        cl = svm.SVC(**libsvm_default_param)
    elif classifier_str == 'svc_rbf':
        libsvm_default_param['kernel'] = 'rbf'
        cl = svm.SVC(**libsvm_default_param)
    # polynomial and sigmoid kernels: not mapped here
    # nuSVC: not mapped here
    # Nearest neighbors (Euclidean distance used by default)
    elif classifier_str == 'kn_uniform':
        kn_default_param['weights'] = 'uniform'
        cl = neighbors.KNeighborsClassifier(**kn_default_param)
    elif classifier_str == 'kn_distance':
        kn_default_param['weights'] = 'distance'
        cl = neighbors.KNeighborsClassifier(**kn_default_param)
    elif classifier_str == 'rn_uniform':
        rn_default_param['weights'] = 'uniform'
        cl = neighbors.RadiusNeighborsClassifier(**rn_default_param)
    elif classifier_str == 'rn_distance':
        rn_default_param['weights'] = 'distance'
        cl = neighbors.RadiusNeighborsClassifier(**rn_default_param)
    elif classifier_str == 'nc':
        cl = neighbors.NearestCentroid()
    # LDA and QDA; priors default to the class proportions observed in the
    # training data
    elif classifier_str == 'lda':
        cl = lda.LDA()
    elif classifier_str == 'qda':
        cl = qda.QDA()
    # Gaussian naive Bayes
    # (from the code it is unclear how priors are set)
    elif classifier_str == 'gnb':
        cl = naive_bayes.GaussianNB()
    elif classifier_str == 'mnb':
        cl = naive_bayes.MultinomialNB()
    elif classifier_str == 'bnb':
        cl = naive_bayes.BernoulliNB()
    # Decision tree and random forest
    elif classifier_str == 'dtree':
        cl = tree.DecisionTreeClassifier()
    elif classifier_str == 'rforest':
        cl = ensemble.RandomForestClassifier()
    else:
        # Raise an error if the classifier is not implemented
        raise ValueError('Classifier not implemented: %s' % classifier_str)

    return cl
def select_model(classifier_method):
    """
    Initializes desired classifier
    :param classifier_method: desired classifier, expects 'KNN' or 'Rocchio'
    :return: classifier sklearn object
    """
    if classifier_method == 'KNN':
        return neighbors.KNeighborsClassifier(n_neighbors=NEIGHBORS)
    elif classifier_method == 'Rocchio':
        return neighbors.NearestCentroid()
    else:
        print("Error. Expects 'KNN' or 'Rocchio' only.")
def select_model(classifier_method, number_of_neighbors=10):
    """
    Initializes desired classifier
    :param classifier_method: desired classifier, expects 'KNN' or 'Rocchio'
    :param number_of_neighbors: the number of neighbors for the KNN model
        (default 10, matching the docstring)
    :return: classifier sklearn object
    """
    if classifier_method == 'KNN':
        return neighbors.KNeighborsClassifier(n_neighbors=number_of_neighbors,
                                              metric='manhattan')
    elif classifier_method == 'Rocchio':
        return neighbors.NearestCentroid()
    else:
        print("Error. Expects 'KNN' or 'Rocchio' only.")
def train_test(x_tr, y_tr, x_te, y_te, name):
    algorithms = {
        'ada_boost': ensemble.AdaBoostClassifier(),
        'bagging': ensemble.BaggingClassifier(),
        'extra_trees': ensemble.ExtraTreesClassifier(),
        'random_forest': ensemble.RandomForestClassifier(),
        'logistic_regression': linear_model.LogisticRegression(),
        'passive_aggressive': linear_model.PassiveAggressiveClassifier(),
        'ridge': linear_model.RidgeClassifier(),
        'sgd': linear_model.SGDClassifier(),
        'bernoulli': naive_bayes.BernoulliNB(),
        'gaussian': naive_bayes.GaussianNB(),
        'k_neighbors': neighbors.KNeighborsClassifier(),
        'nearest_centroid': neighbors.NearestCentroid(),
        'mlp': neural_network.MLPClassifier(),
        'linear_svc': svm.LinearSVC(),
        'decision_tree': tree.DecisionTreeClassifier(),
        'extra_tree': tree.ExtraTreeClassifier(),
        'gradient_boosting': ensemble.GradientBoostingClassifier(),
        'hist_gradient_boosting': HistGradientBoostingClassifier()
    }

    res = {}
    try:
        clf = GridSearchCV(algorithms.get(name), getattr(CVParameters, name),
                           cv=2, n_jobs=-1)
        # time.clock() was removed in Python 3.8; perf_counter() replaces it
        start = time.perf_counter()
        clf.fit(x_tr, y_tr)
        tr_time = time.perf_counter() - start
        print(tr_time)
        print(clf.best_params_)
        print(clf.best_score_)

        tr_score = clf.score(x_tr, y_tr)
        score = clf.score(x_te, y_te)
        tr_fscore = f1_score(y_tr, clf.predict(x_tr), average='weighted')
        fscore = f1_score(y_te, clf.predict(x_te), average='weighted')
        print(tr_score, score, tr_fscore, fscore)

        res = {
            name: {
                'test': score,
                'train': tr_score,
                'f1_test': fscore,
                'f1_train': tr_fscore,
                'tr_time': tr_time
            }
        }
        res[name].update(clf.best_params_)
    except Exception as e:
        print(e)

    return res
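# GridSearchCV above pulls each model's search grid from a CVParameters object
# that is not shown. A minimal sketch of what its 'nearest_centroid' entry
# could look like; the attribute-per-model layout is implied by
# getattr(CVParameters, name), but the grid values below are assumptions based
# on NearestCentroid's scikit-learn parameters:
class CVParameters:
    nearest_centroid = {
        'metric': ['euclidean', 'manhattan'],
        'shrink_threshold': [None, 0.1, 0.5],
    }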
def feature(model, args, writer, epoch):
    model.eval()
    transform = transforms.Compose([
        # transforms.ToPILImage(),
        transforms.Resize(32),
        transforms.ToTensor(),
        transforms.Normalize(mean=np.array([0.485, 0.456, 0.406]),
                             std=np.array([0.229, 0.224, 0.225])),
    ])

    NearestCentroid, NearestCentroid_label, features, labels = [], [], [], []
    preserved_features, preserved_labels = [], []
    fea, l = torch.zeros(0), torch.zeros(0)

    # Pass 1: preserved samples; the first embedding of each batch seeds the
    # nearest-centroid classifier
    train_set = torchvision.datasets.ImageFolder(root=args.preserved_sample,
                                                 transform=transform)
    train_loader = torch.utils.data.DataLoader(train_set, batch_size=3,
                                               shuffle=False)
    for i, (data, target) in enumerate(train_loader):
        data, target = data.cuda(), target.cuda()
        data, target = Variable(data), Variable(target)
        output = model.module.forward(data)
        preserved_features.extend(output.data)
        preserved_labels.extend(target.data.cpu().numpy())
        NearestCentroid.append(output[0].data.cpu().numpy())
        NearestCentroid_label.append(target[0].data.cpu().numpy())
        fea = torch.cat((fea, output.data.cpu()))
        l = torch.cat((l, target.data.cpu().float()))

    # Pass 2: embed the full training set
    train_set = torchvision.datasets.ImageFolder(root=args.train_set,
                                                 transform=transform)
    train_loader = torch.utils.data.DataLoader(train_set,
                                               batch_size=args.test_batch_size,
                                               shuffle=False)
    for i, (data, target) in enumerate(train_loader):
        data, target = data.cuda(), target.cuda()
        data, target = Variable(data), Variable(target)
        output = model.forward(data)
        features.extend(output.data)
        labels.extend(target.data.cpu().numpy())
        fea = torch.cat((fea, output.data.cpu()))
        l = torch.cat((l, target.data.cpu().float()))

    # Fit a nearest-centroid classifier on the preserved exemplars
    clf = neighbors.NearestCentroid()
    clf.fit(NearestCentroid, NearestCentroid_label)
    writer.add_embedding(mat=fea, metadata=l, global_step=epoch)

    return features, labels, clf, preserved_features, preserved_labels
def create_ts_and_targets(data_cols, cents):
    print("**extracting time series from raw accidents")

    # Initialize nearest-centroid classifier with precomputed centroids
    dummy_classes = [i for i in range(K)]
    nn_clf = nbr.NearestCentroid()
    nn_clf.fit(cents, dummy_classes)

    # This is the index of the first accident in the window
    time_stamps = data_cols[0]
    acc_start_idx = extHelp.index_for_t(WIND_EXT_START, time_stamps)
    wind_idx = 0

    # Each loop iteration extracts one row of the final predictor time series
    feat_ts = []
    target_ts = []
    log_count = 0
    while time_stamps[acc_start_idx] < WIND_EXT_END:
        curr_t = WIND_EXT_START + (wind_idx * STEP_SZ)
        log_count += 1
        if log_count % 25 == 0:
            print("**current t\n\t" + str(curr_t) + " of\n\t" + str(WIND_EXT_END))

        # Windows of accidents, both historical and on the forecasting horizon
        prev_day, prev_week, prev_month = extract_hist_winds(
            curr_t, data_cols, acc_start_idx)
        fwd_horizon = extract_target_wind(curr_t, data_cols, acc_start_idx)

        prev_day_probs = rel_freqs(nn_clf, prev_day)
        prev_week_probs = rel_freqs(nn_clf, prev_week)
        prev_month_probs = rel_freqs(nn_clf, prev_month)
        target_probs = rel_freqs(nn_clf, fwd_horizon)
        time_embed = extHelp.embed_time(curr_t, WIND_EXT_START)

        # Concatenate all probabilities and the time embedding into one step
        concat_feats = time_embed + prev_day_probs + prev_week_probs + prev_month_probs
        feat_ts.append(concat_feats)
        target_ts.append(target_probs)

        # Bring the index to the first accident >= curr_t + STEP_SZ
        acc_start_idx += extHelp.index_for_t(curr_t + STEP_SZ,
                                             time_stamps[acc_start_idx:])
        wind_idx += 1

    return feat_ts, target_ts
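# rel_freqs() above is an undisclosed helper; from its call sites it turns a
# window of accidents into a length-K probability vector. A minimal sketch
# under that assumption (the name and signature come from the snippet; the
# body is hypothetical):
import numpy as np

def rel_freqs(clf, window):
    if len(window) == 0:
        return [0.0] * K  # assumes module-level K, the number of centroids
    assigned = clf.predict(window)  # nearest-centroid class per accident
    counts = np.bincount(assigned, minlength=K)
    return (counts / counts.sum()).tolist()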
def __init__(self, data, algorithm, k=10):
    """
    Runs the specified algorithm on processed data and calculates accuracy.
    :param data: Data set.
    :param algorithm: String representing the algorithm to use: 'KNN' or 'Rocchio'
    :param k: Optional - number of neighbors for the 'KNN' algorithm (default = 10).
    """
    self._name = algorithm
    self._data = data
    if algorithm == "KNN":
        self.algorithm = neighbors.KNeighborsClassifier(n_neighbors=k, p=1)
    elif algorithm == "Rocchio":
        self.algorithm = neighbors.NearestCentroid()
    else:
        print("Please enter one of: KNN or Rocchio")
    self._accuracy = 0
def nm_alg(teachingSet, testSet, features, distanceMetrics, normalization, metric):
    trainDataFeatures, trainDataLabelFeatures = prepareDataSet(
        teachingSet, features, normalization)
    testDataFeatures, testDataLabelFeatures = prepareDataSet(
        testSet, features, normalization)

    classifier = neighbors.NearestCentroid(metric=distanceMetrics,
                                           shrink_threshold=None)
    classifier.fit(trainDataFeatures, trainDataLabelFeatures)
    predictions = classifier.predict(testDataFeatures)

    # Look the scorer up by name (e.g. 'accuracy' -> accuracy_score);
    # getattr on sklearn.metrics is safer than eval()
    from sklearn import metrics as skl_metrics
    score = getattr(skl_metrics, metric + "_score")(testDataLabelFeatures,
                                                    predictions)
    accuracy_confusion_matrix = confusion_matrix(testDataLabelFeatures,
                                                 predictions)

    return score, accuracy_confusion_matrix
def _train(self):
    x = self._train_features
    y = self._train_outputs

    pipe = pipeline.Pipeline([
        ('drop', transformers.ColumnDropper(
            columns=(0, 3, 5, 14, 26, 35, 40, 65, 72, 95, 99, 104, 124)
        )),
        ('scale', preprocessing.StandardScaler(
            with_mean=True,
            with_std=True
        )),
        ('select', feature_selection.SelectKBest(
            k=101,
            score_func=feature_selection.f_classif
        )),
        ('estim', neighbors.NearestCentroid(
            metric='euclidean',
            shrink_threshold=None
        )),
    ])

    pipe.fit(x, y)
    self._model = pipe.predict
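# Both _train variants above store the fitted pipeline's bound predict method
# rather than the pipeline itself, so inference is a plain function call. A
# hypothetical companion method (the name _predict is an assumption, not part
# of the original) would simply be:
def _predict(self, features):
    # self._model is Pipeline.predict, so this runs the full
    # drop -> scale -> select -> nearest-centroid chain on new data
    return self._model(features)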
def fit(self, X, y):
    # Fit a cosine-metric NearestCentroid and keep only its class centroids
    self.centroids_ = neighbors.NearestCentroid(metric="cosine") \
        .fit(X, y).centroids_
    return self
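# A minimal sketch of how the extracted cosine centroids could be used at
# prediction time; this companion predict() is an assumption, not part of the
# original, and it assumes classes are encoded as 0..n_classes-1:
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity

def predict(self, X):
    # Assign each row of X to the class whose centroid it is most
    # cosine-similar to
    return np.argmax(cosine_similarity(X, self.centroids_), axis=1)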
test_file = 'BRENNT_' + client + '_Test.csv'
aux_path = client + '/'
cat_list = [2, 5, 6, 23, 24, 25, 26, 27]
stats_file = client + '.stats'
name_list = client + '.names'

DPlib.getLabels(data_path, data_file, cat_list, aux_path, stats_file)
DATA, LABEL = DPlib.getAllModData(data_path, data_file, aux_path, name_list,
                                  stats_file)
tDATA, tLABEL = DPlib.getAllModData(data_path, test_file, aux_path, name_list,
                                    stats_file)

# Three nearest-neighbors variants (weights passed as a keyword; p=5 is the
# Minkowski power parameter)
clfkNNu = neighbors.KNeighborsClassifier(n_neighbors=3, weights='uniform', p=5)
clfkNNd = neighbors.KNeighborsClassifier(n_neighbors=3, weights='distance', p=5)
clfkNNc = neighbors.NearestCentroid()

clfkNNu.fit(DATA, LABEL)
clfkNNd.fit(DATA, LABEL)
clfkNNc.fit(DATA, LABEL)

pLABELkNNu = clfkNNu.predict(tDATA)
pLABELkNNd = clfkNNd.predict(tDATA)
pLABELkNNc = clfkNNc.predict(tDATA)

# Majority vote over the three predictions
V = [pLABELkNNu, pLABELkNNd, pLABELkNNc]
pLABELmajority = []
for ii in range(len(V[0])):
    summ = 0
    for jj in range(3):
vectorizer.fit(x_train_raw)
my_representations.append({"name": name,
                           "x_train": vectorizer.transform(x_train_raw),
                           "x_test": vectorizer.transform(x_test_raw)})
if name == 'tf':
    print(len(vectorizer.vocabulary_))

###########################
# learning
from sklearn import naive_bayes, linear_model, svm, ensemble, neighbors, metrics
from sklearn.ensemble import RandomForestClassifier

# configure
learners = [
    {"name": "LR", "model": linear_model.LogisticRegression(C=1, class_weight='balanced')},
    {"name": "SVM", "model": svm.LinearSVC(C=1, class_weight='balanced')},
    {"name": "5-NN", "model": neighbors.KNeighborsClassifier(n_neighbors=5)},
    {"name": "Rocchio", "model": neighbors.NearestCentroid()},
    {"name": "N.B.", "model": naive_bayes.MultinomialNB(alpha=1)},
    {"name": "R.F.", "model": RandomForestClassifier(n_estimators=100)},
]

# fit and test
for representation in my_representations:
    print("\tRepresentation:", representation["name"])
    for learner in learners:
        learner['model'].fit(representation["x_train"], y_train)
        preds = learner['model'].predict(representation["x_test"])
        print("%s:\tAccuracy: %0.3f\tF1 macro: %0.3f" % (
            learner['name'],
            metrics.accuracy_score(y_test, preds),
            metrics.f1_score(y_test, preds, average='macro')))
    print("----------------")
## refer to 1.1.11. Stochastic Gradient Descent - SGD

## 1.6. Nearest Neighbors
models.append({"name": "1.6.3. KNeighborsRegressor uniform",
               "model": neighbors.KNeighborsRegressor(weights="uniform")})
models.append({"name": "1.6.3. KNeighborsRegressor distance",
               "model": neighbors.KNeighborsRegressor(weights="distance")})
# ValueError: Input contains NaN
# models.append({"name": "1.6.3. RadiusNeighborsRegressor uniform",
#                "model": neighbors.RadiusNeighborsRegressor(weights="uniform")})
# ZeroDivisionError: Weights sum to zero, can't be normalized
# models.append({"name": "1.6.3. RadiusNeighborsRegressor distance",
#                "model": neighbors.RadiusNeighborsRegressor(weights="distance")})
# note: NearestCentroid is a classifier, unlike the regressors above
models.append({"name": "1.6.3. NearestCentroid",
               "model": neighbors.NearestCentroid()})

## 1.7. Gaussian Processes
## too slow?
# models.append({"name": "1.7. Gaussian Processes",
#                "model": gaussian_process.GaussianProcess()})

## 1.8. Cross decomposition
models.append({"name": "1.8. Cross decomposition PLSRegression",
               "model": cross_decomposition.PLSRegression()})
models.append({"name": "1.8. Cross decomposition PLSCanonical",
               "model": cross_decomposition.PLSCanonical()})
# slow
# models.append({"name": "1.8. Cross decomposition CCA",
#                "model": cross_decomposition.CCA()})
def select_three_sample(model, args, epoch, writer):
    model.eval()
    num_each_class = [500, 500, 500, 500, 500, 500, 500, 50, 50, 50]
    # num_each_class = [500, 500, 500, 500, 500, 500, 500, 500, 500, 500,
    #                   500, 500, 500, 500, 500]
    transform = transforms.Compose([
        transforms.Resize(32),
        transforms.ToTensor(),
        transforms.Normalize(mean=np.array([0.485, 0.456, 0.406]),
                             std=np.array([0.229, 0.224, 0.225])),
    ])

    train_set = torchvision.datasets.ImageFolder(root=args.train_set,
                                                 transform=transform)
    train_loader = torch.utils.data.DataLoader(train_set,
                                               batch_size=args.test_batch_size,
                                               shuffle=False)

    NearestCentroid, KNeighbors, features, label, labels = [], [], [], [], []
    for i, (data, target) in enumerate(train_loader):
        data, target = data.cuda(), target.cuda()
        data, target = Variable(data), Variable(target)
        output = model(data)
        features.extend(output.data)
        KNeighbors.extend(output.data.cpu().numpy())
        labels.extend(target.data.cpu().numpy())

    count = 0
    l2_dist = PairwiseDistance(2)
    destination = os.path.join(args.check_path, 'epoch' + str(epoch))
    if not os.path.exists(destination):
        os.mkdir(destination)

    for i in range(len(num_each_class)):
        num_sample = features[count:count + num_each_class[i]]

        # Mean embedding of the class
        m = torch.tensor(np.zeros(args.embedding_size)).float().cuda()
        for x in num_sample:
            m += x
        m /= num_each_class[i]

        # sample1: closest to the mean; sample2/3: successively farthest points
        sample1 = min(num_sample, key=lambda x: l2_dist.forward_val(x, m))
        sample2 = max(num_sample, key=lambda x: l2_dist.forward_val(x, sample1))
        sample3 = max(num_sample, key=lambda x: l2_dist.forward_val(x, sample2))
        NearestCentroid.append(sample1.cpu().numpy())
        label.append(i)

        sample1_loc, sample2_loc, sample3_loc = -1, -1, -1
        for j in range(len(num_sample)):
            if (num_sample[j] == sample1).all():
                sample1_loc = j
            if (num_sample[j] == sample2).all():
                sample2_loc = j
            if (num_sample[j] == sample3).all():
                sample3_loc = j

        frame = pd.read_csv(args.train_set_csv)
        destination_class = os.path.join(
            destination, str(frame['name'][count + sample1_loc]))
        if not os.path.exists(destination_class):
            os.mkdir(destination_class)

        sample1_source = os.path.join(
            args.train_set, str(frame['name'][count + sample1_loc]),
            str(frame['id'][count + sample1_loc]) + '.png')
        sample2_source = os.path.join(
            args.train_set, str(frame['name'][count + sample2_loc]),
            str(frame['id'][count + sample2_loc]) + '.png')
        sample3_source = os.path.join(
            args.train_set, str(frame['name'][count + sample3_loc]),
            str(frame['id'][count + sample3_loc]) + '.png')
        shutil.copy(sample1_source, destination_class + '/sample1.png')
        shutil.copy(sample2_source, destination_class + '/sample2.png')
        shutil.copy(sample3_source, destination_class + '/sample3.png')

        count += num_each_class[i]

    # Fit a nearest-centroid classifier on the per-class exemplars
    clf = neighbors.NearestCentroid()
    clf.fit(NearestCentroid, label)
    return features, labels, clf, destination
def errorCorrectionTrain(input_images, output,
                         parameters=None, debug=False,
                         partition=None, part=None, multilabel=1):
    try:
        use_coord = parameters.get('use_coord', True)
        use_joint = parameters.get('use_joint', True)
        patch_size = parameters.get('patch_size', 1)

        border = patch_size * 2
        if patch_size == 0:
            border = 2

        normalize_input = parameters.get('normalize_input', True)

        method = parameters.get('method', 'lSVC')
        method2 = parameters.get('method2', method)
        method_n = parameters.get('method_n', 15)
        method2_n = parameters.get('method2_n', method_n)
        method_random = parameters.get('method_random', None)
        method_max_features = parameters.get('method_max_features', 'auto')
        method_n_jobs = parameters.get('method_n_jobs', 1)
        primary_features = parameters.get('primary_features', 1)

        training_images = []
        training_diff = []
        training_images_direct = []
        training_direct = []

        if debug:
            print("errorCorrectionTrain use_coord={} use_joint={} patch_size={} "
                  "normalize_input={} method={} output={} partition={} part={}".
                  format(repr(use_coord), repr(use_joint), repr(patch_size),
                         repr(normalize_input), method, output, partition, part))

        coords = None
        total_mask_size = 0
        total_diff_mask_size = 0

        for (i, inp) in enumerate(input_images):
            mask = None
            diff = None
            mask_diff = None

            if inp[-2] is not None:
                mask = extract_part(minc2_file(inp[-2]).data, partition, part,
                                    border)

            ground_data = minc2_file(inp[-1]).data
            auto_data = minc2_file(inp[-3]).data

            ground_shape = ground_data.shape
            ground = extract_part(ground_data, partition, part, border)
            auto = extract_part(auto_data, partition, part, border)
            shape = ground_shape

            if coords is None and use_coord:
                c = np.mgrid[0:shape[0], 0:shape[1], 0:shape[2]]
                coords = [
                    extract_part((c[j] - shape[j] / 2.0) / (shape[j] / 2.0),
                                 partition, part, border) for j in range(3)
                ]

            features = [
                extract_part(minc2_file(k).data, partition, part, border)
                for k in inp[0:-3]
            ]

            mask_size = shape[0] * shape[1] * shape[2]

            if debug:
                print("Training data size:{}".format(len(features)))
            if mask is not None:
                mask_size = np.sum(mask)
                print("Mask size:{}".format(mask_size))
            else:
                print("Mask absent")
            total_mask_size += mask_size

            if multilabel > 1:
                diff = (ground != auto)
                total_diff_mask_size += np.sum(mask)

                if mask is not None:
                    mask_diff = diff & (mask > 0)
                    print("Sample {} mask_diff={} diff={}".format(
                        i, np.sum(mask_diff), np.sum(diff)))
                    #print(mask_diff)
                    training_diff.append(diff[mask > 0])
                    training_direct.append(ground[mask_diff])
                else:
                    mask_diff = diff
                    training_diff.append(diff)
                    training_direct.append(ground[diff])

                training_images.append(
                    prepare_features(features, coords,
                                     mask=mask,
                                     use_coord=use_coord,
                                     use_joint=use_joint,
                                     patch_size=patch_size,
                                     primary_features=primary_features))
                training_images_direct.append(
                    prepare_features(features, coords,
                                     mask=mask_diff,
                                     use_coord=use_coord,
                                     use_joint=use_joint,
                                     patch_size=patch_size,
                                     primary_features=primary_features))
            else:
                mask_diff = mask
                if mask is not None:
                    training_diff.append(ground[mask > 0])
                else:
                    training_diff.append(ground)

                training_images.append(
                    prepare_features(features, coords,
                                     mask=mask,
                                     use_coord=use_coord,
                                     use_joint=use_joint,
                                     patch_size=patch_size,
                                     primary_features=primary_features))

            if debug:
                print("feature size:{}".format(len(training_images[-1])))

            if i == 0 and parameters.get('dump', False):
                print("Dumping feature images...")
                for (j, k) in enumerate(training_images[-1]):
                    test = np.zeros_like(images[0])
                    test[mask > 0] = k
                    out = minc2_file()
                    out.imitate(inp[0], path="dump_{}.mnc".format(j))
                    out.data = test

        # calculate normalization coefficients
        if debug:
            print("Done")

        clf = None
        clf2 = None

        if total_mask_size > 0:
            training_X = convert_image_list(training_images)
            training_Y = np.ravel(np.concatenate(tuple(j for j in training_diff)))

            if debug:
                print("Fitting 1st...")

            if method == "xgb":
                clf = None
            elif method == "SVM":
                clf = svm.SVC()
            elif method == "nuSVM":
                clf = svm.NuSVC()
            elif method == 'NC':
                clf = neighbors.NearestCentroid()
            elif method == 'NN':
                clf = neighbors.KNeighborsClassifier(method_n)
            elif method == 'RanForest':
                clf = ensemble.RandomForestClassifier(
                    n_estimators=method_n,
                    n_jobs=method_n_jobs,
                    max_features=method_max_features,
                    random_state=method_random)
            elif method == 'AdaBoost':
                clf = ensemble.AdaBoostClassifier(n_estimators=method_n,
                                                  random_state=method_random)
            elif method == 'AdaBoostPP':
                clf = Pipeline(steps=[('normalizer', Normalizer()),
                                      ('AdaBoost', ensemble.AdaBoostClassifier(
                                          n_estimators=method_n,
                                          random_state=method_random))])
            elif method == 'tree':
                clf = tree.DecisionTreeClassifier(random_state=method_random)
            elif method == 'ExtraTrees':
                clf = ensemble.ExtraTreesClassifier(
                    n_estimators=method_n,
                    max_features=method_max_features,
                    n_jobs=method_n_jobs,
                    random_state=method_random)
            elif method == 'Bagging':
                clf = ensemble.BaggingClassifier(
                    n_estimators=method_n,
                    max_features=method_max_features,
                    n_jobs=method_n_jobs,
                    random_state=method_random)
            elif method == 'dumb':
                clf = dummy.DummyClassifier(strategy="constant", constant=0)
            else:
                clf = svm.LinearSVC()

            #scores = cross_validation.cross_val_score(clf, training_X, training_Y)
            #print(scores)

            if method == "xgb":
                xg_train = xgb.DMatrix(training_X, label=training_Y)
                param = {}
                num_round = 100
                # use softmax multi-class classification
                param['objective'] = 'multi:softmax'
                # scale weight of positive examples
                param['eta'] = 0.1
                param['max_depth'] = 8
                param['silent'] = 1
                param['nthread'] = 4
                param['num_class'] = 2
                clf = xgb.train(param, xg_train, num_round)
            elif method != 'dumb':
                clf.fit(training_X, training_Y)

            if multilabel > 1 and method != 'dumb':
                if debug:
                    print("Fitting direct...")

                training_X = convert_image_list(training_images_direct)
                training_Y = np.ravel(
                    np.concatenate(tuple(j for j in training_direct)))

                if method2 == "xgb":
                    clf2 = None
                elif method2 == "SVM":
                    clf2 = svm.SVC()
                elif method2 == "nuSVM":
                    clf2 = svm.NuSVC()
                elif method2 == 'NC':
                    clf2 = neighbors.NearestCentroid()
                elif method2 == 'NN':
                    clf2 = neighbors.KNeighborsClassifier(method_n)
                elif method2 == 'RanForest':
                    clf2 = ensemble.RandomForestClassifier(
                        n_estimators=method_n,
                        n_jobs=method_n_jobs,
                        max_features=method_max_features,
                        random_state=method_random)
                elif method2 == 'AdaBoost':
                    clf2 = ensemble.AdaBoostClassifier(
                        n_estimators=method_n, random_state=method_random)
                elif method2 == 'AdaBoostPP':
                    clf2 = Pipeline(steps=[('normalizer', Normalizer()),
                                           ('AdaBoost', ensemble.AdaBoostClassifier(
                                               n_estimators=method_n,
                                               random_state=method_random))])
                elif method2 == 'tree':
                    clf2 = tree.DecisionTreeClassifier(random_state=method_random)
                elif method2 == 'ExtraTrees':
                    clf2 = ensemble.ExtraTreesClassifier(
                        n_estimators=method_n,
                        max_features=method_max_features,
                        n_jobs=method_n_jobs,
                        random_state=method_random)
                elif method2 == 'Bagging':
                    clf2 = ensemble.BaggingClassifier(
                        n_estimators=method_n,
                        max_features=method_max_features,
                        n_jobs=method_n_jobs,
                        random_state=method_random)
                elif method2 == 'dumb':
                    clf2 = dummy.DummyClassifier(strategy="constant", constant=0)
                else:
                    clf2 = svm.LinearSVC()

                if method2 == "xgb":
                    xg_train = xgb.DMatrix(training_X, label=training_Y)
                    param = {}
                    num_round = 100
                    # use softmax multi-class classification
                    param['objective'] = 'multi:softmax'
                    # scale weight of positive examples
                    param['eta'] = 0.1
                    param['max_depth'] = 8
                    param['silent'] = 1
                    param['nthread'] = 4
                    param['num_class'] = multilabel
                    clf2 = xgb.train(param, xg_train, num_round)
                elif method2 != 'dumb':
                    clf2.fit(training_X, training_Y)

                #print(clf.score(training_X, training_Y))

            if debug:
                print(clf)
                print(clf2)
        else:
            print("Warning: zero total mask size, using null classifier")
            clf = dummy.DummyClassifier(strategy="constant", constant=0)

        if method == 'xgb' and method2 == 'xgb':
            # save the boosters in xgboost's native format
            clf.save_model(output)
            clf2.save_model(output + '_2')
        else:
            with open(output, 'wb') as f:
                pickle.dump([clf, clf2], f, -1)
    except mincError as e:
        print("Exception in errorCorrectionTrain:{}".format(str(e)))
        traceback.print_exc(file=sys.stdout)
        raise
    except:
        print("Exception in errorCorrectionTrain:{}".format(sys.exc_info()[0]))
        traceback.print_exc(file=sys.stdout)
        raise
from sklearn import tree, svm, neighbors, linear_model

clftree = tree.DecisionTreeClassifier()  # Tree
clfsvm = svm.SVC()  # Support Vector Machine (SVM)
clfnc = neighbors.NearestCentroid()  # Nearest Centroid (NC)
clfsgd = linear_model.SGDClassifier(
    loss='hinge', penalty='l2', max_iter=1000)  # Stochastic Gradient Descent (SGD)

# [height, weight, shoe_size]
X = [[181, 80, 44], [177, 70, 43], [160, 60, 38], [154, 54, 37], [166, 65, 40],
     [190, 90, 47], [175, 64, 39], [177, 70, 40], [159, 55, 37], [171, 75, 42],
     [181, 85, 43]]
Y = ['male', 'male', 'female', 'female', 'male', 'male', 'female', 'female',
     'female', 'male', 'male']

clftree = clftree.fit(X, Y)  # Tree
clfsvm = clfsvm.fit(X, Y)  # SVM
clfnc = clfnc.fit(X, Y)  # NC
clfsgd = clfsgd.fit(X, Y)  # SGD

prediction_tree = clftree.predict([[190, 70, 43]])  # Tree
prediction_svm = clfsvm.predict([[190, 70, 43]])  # SVM
prediction_nc = clfnc.predict([[190, 70, 43]])  # NC
prediction_sgd = clfsgd.predict([[190, 70, 43]])  # SGD

print('Tree', prediction_tree)  # Tree
print('SVM', prediction_svm)  # SVM
print('NC', prediction_nc)  # NC
print('SGD', prediction_sgd)  # SGD
def __init__(self, **kwargs):
    self.classifier = neighbors.NearestCentroid(**kwargs)
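# The wrapper above only shows its constructor. A minimal sketch of the
# delegating methods such a wrapper would typically add (these method bodies
# are assumptions, not part of the original):
def fit(self, X, y):
    self.classifier.fit(X, y)  # delegate training to NearestCentroid
    return self

def predict(self, X):
    return self.classifier.predict(X)  # delegate prediction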
import numpy as np
import matplotlib.pyplot as plt
from matplotlib.colors import ListedColormap
from sklearn import datasets, neighbors

neighbors_cnt = 15

if __name__ == "__main__":
    print("Loading data...")
    data = datasets.load_iris()
    X, y = data.data[:, :2], data.target
    step = 0.01

    cmap_light = ListedColormap(['#FFAAAA', '#AAFFAA', '#AAAAFF'])
    cmap_bold = ListedColormap(['#FF0000', '#00FF00', '#0000FF'])

    for shrinkage in [None, 0.2]:
        model = neighbors.NearestCentroid(shrink_threshold=shrinkage)
        model.fit(X, y)
        y_prediction = model.predict(X)
        print(shrinkage, np.mean(y_prediction == y))

        # Plot the decision regions on a dense grid
        x_min, x_max = X[:, 0].min() - 1, X[:, 0].max() + 1
        y_min, y_max = X[:, 1].min() - 1, X[:, 1].max() + 1
        xx, yy = np.meshgrid(np.arange(x_min, x_max, step),
                             np.arange(y_min, y_max, step))
        Z = model.predict(np.c_[xx.ravel(), yy.ravel()])
        Z = Z.reshape(xx.shape)

        plt.figure()
        plt.pcolormesh(xx, yy, Z, cmap=cmap_light)
        plt.scatter(X[:, 0], X[:, 1], c=y, cmap=cmap_bold, edgecolor='k', s=20)
        plt.title("3-Class classification (shrink_threshold=%r)" % shrinkage)

    plt.show()
    }]
}

# Train and test
for clf, name, parameters in (
        (RidgeClassifier(), "Ridge Classifier of linear_model",
         parameters_condition['RidgeClassifier']),
        (Perceptron(), "Perceptron of linear_model",
         parameters_condition['Perceptron']),
        (PassiveAggressiveClassifier(), "Passive-Aggressive of linear_model",
         parameters_condition['PassiveAggressiveClassifier']),
        (SGDClassifier(), "SGD model of linear_model",
         parameters_condition['SGDClassifier']),
        (neighbors.KNeighborsClassifier(), "kNN",
         parameters_condition['KNeighborsClassifier']),
        (neighbors.NearestCentroid(), "NearestCentroid",
         parameters_condition['NearestCentroid']),
        (svm.SVC(), "SVC", parameters_condition['SVC']),
        (svm.LinearSVC(), "LinearSVC", parameters_condition['LinearSVC']),
        (svm.NuSVC(), "NuSVC", parameters_condition['NuSVC']),
        (MultinomialNB(), "MultinomialNB", parameters_condition['MultinomialNB']),
        (BernoulliNB(), "BernoulliNB", parameters_condition['BernoulliNB']),
        (RandomForestClassifier(), "Random forest",
         parameters_condition['RandomForestClassifier'])):
    print('=' * 80)
    print(name)
    results.append(benchmark(clf=clf, parameters=parameters))

# make some plots
import matplotlib.pyplot as plt
def get_skl_estimator(self, **default_parameters):
    return neighbors.NearestCentroid(**default_parameters)
def run(self):
    start_time = datetime.datetime.now()
    best_acc, best_epoch, fts_means = 0., 0, None

    for epoch in range(1, self.args.epoch + 1):
        if self.scheduler is not None:
            self.scheduler.step()

        if self.increment_phase > 0:
            new_loss, ebd_loss = self.train_increment(
                epoch=epoch,
                model=self.model,
                criterion=self.criterion,
                embedding_loss=self.embedding_loss,
                optimizer=self.optimizer,
                new_loader=self.sampler_train_loader,
                train_loader=self.train_loader_old)

            if epoch % 2 == 0:
                validate_start = datetime.datetime.now()
                with torch.no_grad():
                    new_embeddings, new_targets = self.extractEmbeddings(
                        self.model, self.train_loader)
                    old_embeddings, old_targets = self.extractEmbeddings(
                        self.model, self.train_loader_old)

                    ########################################
                    embeddings = torch.cat((new_embeddings, old_embeddings))
                    targets = np.append(new_targets, old_targets)
                    fts_means, labels = self.extract_feature_mean(
                        embeddings, targets)

                    # KNN on the raw embeddings, NCM on the per-class means
                    clf_knn = neighbors.KNeighborsClassifier(
                        n_neighbors=self.args.vote).fit(
                            embeddings.cpu().data.numpy(), targets)
                    clf_ncm = neighbors.NearestCentroid().fit(
                        fts_means.cpu().data.numpy(), labels)
                    # clf_ncm = neighbors.NearestCentroid().fit(
                    #     self.means.cpu().data.numpy(), labels)
                    #############################################

                    # New-task train accuracy
                    new_train_accy, new_train_fts, new_train_lbls = self.validate(
                        args=self.args, model=self.model, clf_knn=clf_knn,
                        loader=self.train_loader, clf_ncm=clf_ncm)
                    # Old-task train accuracy
                    old_train_accy, old_train_fts, old_train_lbls = self.validate(
                        args=self.args, model=self.model, clf_knn=clf_knn,
                        loader=self.train_loader_old, clf_ncm=clf_ncm)
                    # Test accuracy
                    valid_accy, pred_fts, pred_lbls = self.validate(
                        args=self.args, model=self.model,
                        loader=self.test_loader, clf_knn=clf_knn,
                        clf_ncm=clf_ncm)

                self.log(epoch, new_loss, new_train_accy, valid_accy,
                         validate_start, fts_means, pred_lbls, best_acc,
                         ebd_loss, old_train_accy)

                if (valid_accy[1] > best_acc) or epoch == self.args.epoch:
                    best_acc = max(best_acc, valid_accy[1])
                    best_epoch = epoch
                    self.save_model(epoch, fts_means, preserved_embedding=None)

        elif self.increment_phase == 0:
            train_loss = self.train_epoch(
                epoch=epoch,
                model=self.model,
                criterion=self.criterion,
                optimizer=self.optimizer,
                new_loader=self.sampler_train_loader,
                pairwise=self.args.pairwise)

            if epoch % 4 == 0:
                validate_start = datetime.datetime.now()
                # Validate
                with torch.no_grad():
                    embeddings, targets = self.extractEmbeddings(
                        model=self.model, train_loader=self.train_loader)
                    fts_means, labels = self.extract_feature_mean(
                        embeddings, targets)  # [n, feature_dimension], [n]

                    clf_knn = neighbors.KNeighborsClassifier(
                        n_neighbors=self.args.vote).fit(
                            embeddings.cpu().data.numpy(), targets)
                    clf_ncm = neighbors.NearestCentroid().fit(
                        fts_means.cpu().data.numpy(), labels)

                    # Train accuracy
                    train_accy, train_fts, train_lbls = self.validate(
                        args=self.args, model=self.model,
                        loader=self.train_loader, clf_knn=clf_knn,
                        clf_ncm=clf_ncm)
                    # Test accuracy
                    valid_accy, pred_fts, pred_lbls = self.validate(
                        args=self.args, model=self.model,
                        loader=self.test_loader, clf_knn=clf_knn,
                        clf_ncm=clf_ncm)

                self.log(epoch, train_loss, train_accy, valid_accy,
                         validate_start, fts_means, pred_lbls,
                         best_acc=best_acc)

                if (train_accy[1] >= 0.96 and valid_accy[1] > best_acc) \
                        or epoch == self.args.epoch:
                    best_acc = max(best_acc, valid_accy[1])
                    best_epoch = epoch
                    preserved_embedding = self.preserve_image(
                        epoch, embeddings, targets, fts_means, self.classes)
                    self.save_model(epoch, fts_means, preserved_embedding)

        elif self.increment_phase == -1:
            losses = self.train_cross_entropy(
                train_loader=self.train_loader,
                model=self.model,
                criterion=self.criterion,
                optimizer=self.optimizer,
                epoch=epoch)
            # TODO

    end_time = datetime.datetime.now()
    self.f.write('Best accy: {:.4f}, Best_epoch: {}, Time consumed: {}mins'.
                 format(best_acc, best_epoch,
                        int((end_time - start_time).seconds / 60)))
    print('Best accy: {:.4f}, Best_epoch: {}, Time consumed: {}mins'.
          format(best_acc, best_epoch,
                 int((end_time - start_time).seconds / 60)))
# Level 2 Score:
clf = linear_model.ElasticNetCV(cv=5, verbose=0)
model_sum = blend_proba(clf=clf, X_train=train, y=target, X_test=test,
                        nfolds=5, seed=rnd, category="regressor",
                        filename="ElasticNet", setused=setused)

# Level 2 Score:
clf = linear_model.BayesianRidge()
model_sum = blend_proba(clf=clf, X_train=train, y=target, X_test=test,
                        nfolds=5, seed=rnd, category="regressor",
                        filename="BayesianRidge", setused=setused)

# Level 2 Score:
# (note: NearestCentroid is a classifier, although it is blended with
# category="regressor" here)
clf = neighbors.NearestCentroid()
model_sum = blend_proba(clf=clf, X_train=train, y=target, X_test=test,
                        nfolds=5, seed=rnd, category="regressor",
                        filename="NearCentroid", setused=setused)

# Level 2 Score:
clf = naive_bayes.GaussianNB()
model_sum = blend_proba(clf=clf, X_train=train, y=target, X_test=test,
                        nfolds=5, seed=rnd, category="classifier",
                        filename="GaussianNB", setused=setused)

# Level 2 Score, k= 2:
# Level 2 Score, k= 4:
# Level 2 Score, k= 8:
# Level 2 Score, k= 16:
# Level 2 Score, k= 32:
# Level 2 Score, k= 64:
# for c in c_logreg_values:
#     logreg_digit = linear_model.LogisticRegression(C=c, solver='liblinear')
#     logreg_digit.fit(digit_data_train, digit_targets_train)
#     digit_ypred_test = logreg_digit.predict(digit_data_test)
#     logreg_liblinear_acc.append(
#         metrics.accuracy_score(digit_targets_test, digit_ypred_test))
#
# x_axis = c_logreg_values
# plt.plot(x_axis, logreg_acc_lbfgs_multi, x_axis, logreg_liblinear_acc)
# plt.title('Logistic Regression Accuracy Scores for different parameters and solvers')
# plt.legend(('lbgfs with multinomial multiclass', 'liblinear'))
# plt.xlabel('C')
# plt.ylabel('Accuracy Score')
# plt.axis('tight')
# plt.show()

nc_digit = neighbors.NearestCentroid()
nc_digit.fit(digit_data_train, digit_targets_train)
digit_targets_pred_nc = nc_digit.predict(digit_data_test)
print("Nearest Centroid Accuracy (Digit Dataset):",
      metrics.accuracy_score(digit_targets_test, digit_targets_pred_nc))

qda_digit = discriminant_analysis.QuadraticDiscriminantAnalysis()
qda_digit.fit(digit_data_train, digit_targets_train)
digit_targets_pred_qda = qda_digit.predict(digit_data_test)
print("QDA Accuracy (Digit Dataset):",
      metrics.accuracy_score(digit_targets_test, digit_targets_pred_qda))

# iris_data_split = np.reshape(iris_data, [10, len(iris_data) / 10])
# iris_targets_split = np.reshape(iris_targets, [10, len(iris_data) / 10])