# Shared third-party imports for the snippets below; project-specific helpers
# (NN, BestNN, compute_accuracy_tuple, flattenComponents, Leaf_test_Dataset,
# BinaryCrossEntropy, Adam, TQDM_TERMINAL, and the path/constant globals)
# are assumed to be defined or imported elsewhere.
import os

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import torch
from efficientnet_pytorch import EfficientNet
from sklearn import svm, tree
from sklearn.metrics import accuracy_score
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import (KNeighborsClassifier, NearestCentroid,
                               NearestNeighbors)
from sklearn.neural_network import MLPClassifier
from sklearn.preprocessing import OneHotEncoder
from torch.utils.data import DataLoader, TensorDataset
from torchvision import transforms


def EpsDBSCAN(D, k):
    """Estimate an eps value for DBSCAN from the k-nearest-neighbour
    distances of the dataset D."""
    # Query k+1 neighbours because the nearest hit of each point is itself.
    nn = NearestNeighbors(n_neighbors=k + 1)
    nn.fit(D)
    distances, indices = nn.kneighbors(D)
    distances = np.delete(distances, 0, 1)  # drop the self-distance column
    Dist = distances.max(axis=1)            # distance to the k-th neighbour
    Array = sorted(Dist)
    AvgDist = distances.sum(axis=1) / k     # mean distance to the k neighbours
    Avg_Array = sorted(AvgDist)
    plt.plot(Avg_Array, 'b')

    # Normalise the sorted average distances to [0, 1] and bin them.
    num = len(Avg_Array)
    minArray = min(Avg_Array)
    maxArray = max(Avg_Array)
    n_Array = [(a - minArray) / (maxArray - minArray) for a in Avg_Array]
    bins = np.linspace(0, 1, 10)
    bin_indice = np.digitize(n_Array, bins)

    # Candidate eps values: the mean of every bin holding at least k points.
    Eps = []
    Avg_Array = np.array(Avg_Array)
    for i in range(10):
        count = len(np.where(bin_indice == i)[0])
        if count >= k:
            e = np.sum(Avg_Array[bin_indice == i], axis=0) / count
            plt.hlines(e, xmin=0, xmax=len(Array), colors='r')
            Eps.append(e)

    # Index of the first point whose average distance exceeds each candidate.
    N = len(Eps)
    Eps_index = []
    for i in range(N):
        for j in range(num):
            if Avg_Array[j] > Eps[i]:
                Eps_index.append(j)
                break

    # Walk the candidates and stop just before the first steep jump
    # (slope more than twice the average slope of the curve).
    ave_slope = (maxArray - minArray) / num
    out = Eps[0]  # fall back to the first candidate if there is only one
    for i in range(N - 1):
        slope = (Eps[i + 1] - Eps[i]) / (Eps_index[i + 1] - Eps_index[i])
        if slope > ave_slope * 2:
            out = Eps[i]
            break
        else:
            out = Eps[i + 1]
    return out

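# A minimal usage sketch for EpsDBSCAN, assuming a synthetic 2-D dataset from
# sklearn.datasets.make_blobs; the estimated eps is fed into sklearn's DBSCAN
# with the same k as min_samples. The demo function and its constants are
# illustrative, not part of the original code.
def demo_eps_dbscan():
    from sklearn.cluster import DBSCAN
    from sklearn.datasets import make_blobs
    X, _ = make_blobs(n_samples=500, centers=3, random_state=0)
    k = 4  # MinPts for DBSCAN and k for the k-distance curve
    eps = EpsDBSCAN(X, k)
    labels = DBSCAN(eps=eps, min_samples=k).fit_predict(X)
    print("estimated eps:", eps, "clusters found:", len(set(labels) - {-1}))
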
def EpsValue(D, k):
    """Return the smallest average k-NN distance and a step size equal to
    1% of the spread of the k-distance curve, for sweeping eps values."""
    nn = NearestNeighbors(n_neighbors=k + 1)
    nn.fit(D)
    distances, indices = nn.kneighbors(D)
    distances = np.delete(distances, 0, 1)  # drop the self-distance column
    Dist = distances.max(axis=1)            # distance to the k-th neighbour
    AvgDist = distances.sum(axis=1) / k     # mean distance to the k neighbours
    out = (max(Dist) - min(AvgDist)) / 100  # step size across the eps range
    return min(AvgDist), out

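# A sketch of how EpsValue's (start, step) pair could drive an eps sweep:
# run DBSCAN at each of the 100 steps and keep the eps yielding the most
# clusters. The selection rule here is an assumption for illustration, not
# taken from the original code.
def demo_eps_sweep(X, k):
    from sklearn.cluster import DBSCAN
    eps, step = EpsValue(X, k)
    best_eps, best_n = eps, 0
    for _ in range(100):  # the step is 1% of the eps range
        labels = DBSCAN(eps=max(eps, 1e-12), min_samples=k).fit_predict(X)
        n_clusters = len(set(labels) - {-1})
        if n_clusters > best_n:
            best_eps, best_n = eps, n_clusters
        eps += step
    return best_eps
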
def nn_latentspace(self, verbose=False):
    """Score a k-NN classifier trained on the labelled latent space
    against the unlabelled latent space."""
    data_train, _, labels_train = self.labelled_set.get_latent()
    data_test, _, labels_test = self.unlabelled_set.get_latent()
    nn = KNeighborsClassifier()
    nn.fit(data_train, labels_train)
    score = nn.score(data_test, labels_test)
    if verbose:
        print("NN classifier score:", score)
        print("NN classifier tuple:",
              compute_accuracy_tuple(labels_test, nn.predict(data_test)))
    return score

def uniformly_random_subsample(pairs_file, n_samples, out_file):
    """Subsample the pairs whose similarity features are nearest to
    uniformly random points in the feature space."""
    pairs = pd.read_csv(pairs_file, sep='\t')
    # One random probe per requested sample, ignoring the two id columns.
    samples = np.random.uniform(size=(n_samples, pairs.shape[1] - 2))
    nn = NearestNeighbors(n_neighbors=1, n_jobs=-1)
    nn.fit(pairs[['vec_sim', 'jac_sim', 'len_sim', 'top_sim']])
    index = pd.DataFrame(nn.kneighbors(samples, return_distance=False),
                         columns=['index'])
    df = pairs.reset_index().merge(index).drop_duplicates()
    df.to_csv(out_file, sep='\t', index=None)

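# Hypothetical invocation, assuming a tab-separated file whose first two
# columns identify each pair and whose remaining columns are the four
# similarity features; the file names are placeholders.
# uniformly_random_subsample('pairs.tsv', 1000, 'pairs_subsample.tsv')
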
def plot(X_tr, y_tr, a_tr, X_val, y_val, a_val):
    """Sweep the trade-off parameter alpha and plot the validation
    classification error and mean absolute error."""
    alphas = [0.0, 0.01, 0.02, 0.03, 0.04, 0.05, 0.06, 0.07, 0.08, 0.09, 0.10]
    class_err = np.zeros((11, 1))
    mae = np.zeros((11, 1))
    for count, alpha in enumerate(alphas):
        nn = NN(alpha, 20)
        nn.fit(X_tr, y_tr, a_tr)
        nn.predict(X_tr)
        [class_err_1, class_err_2, mae1, mae2] = nn.calc_error(
            X_tr, y_tr, a_tr, X_val, y_val, a_val)
        class_err[count] = class_err_2  # validation classification error
        mae[count] = mae2               # validation mean absolute error

    # Classification error vs. alpha
    fig1 = plt.gcf()
    plt.plot(alphas, class_err, marker='o', linestyle='dashed',
             label=r'Observed Classification Error Rate (Y-axis) at Trade-off parameter $\alpha$ (X-axis)')
    plt.legend(prop={'size': 22})
    plt.ylabel('Classification Error on Validation Data', fontsize=22)
    plt.yticks(np.arange(0.0, 1.1, 0.1).tolist())
    plt.xlabel(r'Trade-off parameter $\alpha$', fontsize=22)
    plt.xticks(alphas)
    plt.title('Classification Error vs Trade-off Parameter', fontsize=22)
    mng = plt.get_current_fig_manager()
    mng.window.showMaximized()
    plt.show()
    fig1.savefig('C.png')

    # Mean absolute error vs. alpha
    fig2 = plt.gcf()
    plt.plot(alphas, mae, marker='o', linestyle='dashed',
             label=r'Observed Mean Absolute Error (Y-axis) at Trade-off parameter $\alpha$ (X-axis)')
    plt.legend(prop={'size': 22})
    plt.ylabel('Mean Absolute Error on Validation Data', fontsize=22)
    plt.xlabel(r'Trade-off parameter $\alpha$', fontsize=22)
    plt.xticks(alphas)
    plt.title('Mean Absolute Error vs Trade-off Parameter', fontsize=22)
    mng = plt.get_current_fig_manager()
    mng.window.showMaximized()
    plt.show()
    fig2.savefig('M.png')

def main():
    DATA_DIR = '../data'
    data = np.load(os.path.join(DATA_DIR, "mnist_rot_train.npz"))
    X_tr, y_tr, a_tr = data["X"], data["labels"], data["angles"]
    data = np.load(os.path.join(DATA_DIR, "mnist_rot_validation.npz"))
    X_val, y_val, a_val = data["X"], data["labels"], data["angles"]
    # Note: test class labels and angles are not provided in the data set.
    data = np.load(os.path.join(DATA_DIR, "mnist_rot_test.npz"))
    X_te, y_te, a_te = data["X"], data["labels"], data["angles"]
    nn = BestNN(0.2, 20)
    nn.fit(X_tr, y_tr, a_tr)

def experiment_scikitlearn_baselines(train_X, train_Y, test_X, test_Y):
    """Compare simple scikit-learn baselines on the flattened sequences."""
    train_X = train_X.numpy().reshape(316, 28 * 50)  # 316 training samples
    train_Y = train_Y.numpy()
    sv = svm.SVC()
    sv.fit(train_X, train_Y)
    nn = NearestCentroid()
    nn.fit(train_X, train_Y)
    ga = GaussianNB()
    ga.fit(train_X, train_Y)
    dt = tree.DecisionTreeClassifier()
    dt.fit(train_X, train_Y)
    test_X = test_X.numpy().reshape(100, 28 * 50)    # 100 test samples
    test_Y = test_Y.numpy()
    print("SVM " + str(accuracy_score(test_Y, sv.predict(test_X))))
    print("NN " + str(accuracy_score(test_Y, nn.predict(test_X))))
    print("Gaussian " + str(accuracy_score(test_Y, ga.predict(test_X))))
    print("DT " + str(accuracy_score(test_Y, dt.predict(test_X))))
    print("Warning: the following takes approximately 1.5 hours on an average laptop.")

def main():
    DATA_DIR = 'data'
    data = np.load(os.path.join(DATA_DIR, "mnist_rot_train.npz"))
    X_tr, y_tr, a_tr = data["X"], data["labels"], data["angles"]
    data = np.load(os.path.join(DATA_DIR, "mnist_rot_validation.npz"))
    X_val, y_val, a_val = data["X"], data["labels"], data["angles"]
    # Note: test class labels and angles are not provided in the data set.
    data = np.load(os.path.join(DATA_DIR, "mnist_rot_test.npz"))
    X_te, y_te, a_te = data["X"], data["labels"], data["angles"]
    nn = BestNN(1.0, 30)
    nn.fit(X_tr, y_tr, a_tr)
    nn.predict(X_tr)
    nn.calc_error(X_tr, y_tr, a_tr, X_val, y_val, a_val)
    nn.savetestpred(X_te)

def update_unlabelled_knn(self):
    """Pseudo-label confidently classified unlabelled cells with a k-NN
    classifier fitted on the labelled latent space."""
    with torch.set_grad_enabled(False):
        self.model.eval()
        latent_train, _, labels_train = self.labelled_set.get_latent()
        latent_test, _, labels_test = self.unlabelled_set.sequential().get_latent()
        # The original counts are needed as model input.
        counts_test, _ = self.unlabelled_set.sequential().raw_data()
        nn = KNeighborsClassifier()
        nn.fit(latent_train, labels_train)
        print("SCORE nn :", nn.score(latent_test, labels_test))
        proba_test = nn.predict_proba(latent_test)
        classification_ratio = np.zeros((len(latent_test),
                                         self.gene_dataset.n_labels))
        classification_ratio[:, nn.classes_] = proba_test
        to_keep = proba_test.max(axis=1) >= 0.8  # confidence threshold
        # Alternative weightings that were tried:
        #   1 - Maxime's initial formulation:
        #       classification_ratio = np.log(classification_ratio + 1e-8) \
        #           * self.knn_classification_ratio
        #   2 - here classification_ratio > 0 implies the opposite sign elsewhere:
        #       classification_ratio = -np.log(1 - classification_ratio)
        counts_test, labels_test, classification_ratio = \
            counts_test[to_keep], labels_test[to_keep], classification_ratio[to_keep]
        labelled_test_set = TensorDataset(
            torch.from_numpy(counts_test.astype(np.float32)),
            torch.from_numpy(labels_test.astype(np.int64)),
            torch.from_numpy(classification_ratio.astype(np.float32)))
        self.unlabelled_knn_set = self.create_posterior(
            gene_dataset=labelled_test_set, shuffle=True)
        self.model.train()

def neuralNetworkSK(xTrain, yTrain):
    """Train a scikit-learn MLP classifier on flattened components."""
    xTrainFlat = flattenComponents(xTrain)
    # 'sag' is not a valid MLPClassifier solver ('lbfgs', 'sgd', 'adam');
    # 'sgd' is the closest option.
    nn = MLPClassifier(activation='logistic', solver='sgd')
    nn.fit(xTrainFlat, yTrain)
    return nn

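# A minimal usage sketch for neuralNetworkSK; the evaluation step is an
# assumption, and flattenComponents is the same project helper used above.
def demo_neural_network_sk(xTrain, yTrain, xTest, yTest):
    model = neuralNetworkSK(xTrain, yTrain)
    predictions = model.predict(flattenComponents(xTest))
    print("MLP accuracy:", accuracy_score(yTest, predictions))
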
# Flatten the images, one-hot encode the targets, then train and evaluate.
X_tr = X_tr.reshape(X_tr.shape[0], -1)
X_t = X_t.reshape(X_t.shape[0], -1)
one_hot = OneHotEncoder()
y_tr = train_dataset.targets.numpy().reshape(-1, 1)
y_t = test_dataset.targets.numpy().reshape(-1, 1)
y_tr1 = one_hot.fit_transform(y_tr).toarray()
y_t1 = one_hot.transform(y_t).toarray()  # reuse the encoder fitted on y_tr
nn.fit(X_tr, y_tr1,
       epochs=1200,
       batch_size=64,
       loss=BinaryCrossEntropy(),
       optimizer=Adam(lr=0.001),
       show_progress=TQDM_TERMINAL)
preds = np.round(nn.predict(X_t))
total = len(preds)
correct = 0
for pred, y in zip(preds, y_t1):
    if pred.argmax() == y.argmax():
        correct += 1
print(f"Accuracy: {float(correct) * 100 / total}%")

def main():
    # Use the GPU if available.
    device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
    print("Use " + str(device))

    # Collect the test file names under test_path.
    file_list = None
    for path, dirs, files in os.walk(test_path, topdown=False):
        file_list = list(files)

    # Preprocessing steps.
    transform = transforms.Compose([
        transforms.Resize((512, 512)),
        transforms.ToTensor(),
        transforms.Normalize(mean=[0.485, 0.456, 0.406],
                             std=[0.229, 0.224, 0.225]),
    ])
    test_dataset = Leaf_test_Dataset(file_list, test_path, transform)
    test_loader = DataLoader(dataset=test_dataset, batch_size=batchSize)
    print("Start testing:")

    # Load the EfficientNet ensemble.
    eff_models = []
    for model_path in eff_model_paths:
        eff_net = EfficientNet.from_name('efficientnet-b4')
        # torch.nn is referenced explicitly because the name `nn` is
        # reassigned to the MLP classifier below.
        eff_net._fc = torch.nn.Linear(eff_net._fc.in_features, 5)
        eff_net.load_state_dict(torch.load(model_path))
        eff_net = eff_net.to(device)
        eff_net.eval()
        eff_models.append(eff_net)

    # Collect each model's prediction for every test image.
    preds = []
    with torch.no_grad():
        for index, image in enumerate(test_loader):
            image = image.to(device)
            eff_result = []
            for eff_net in eff_models:
                output = eff_net(image).to('cpu')
                pred = output.argmax(dim=1)
                eff_result.append(pred)
            if len(preds) == 0:
                preds = np.dstack(eff_result)[0]
            else:
                preds = np.vstack([preds, np.dstack(eff_result)[0]])

    # Train the combining model on the per-model predictions.
    df = pd.read_csv(pred_train_csv)

    # Fraction of the five models that predicted this row's label correctly.
    def get_acc(pred_csv, index):
        label = pred_csv.loc[index, 'label']
        acc = 0
        for col in ['pred_0', 'pred_1', 'pred_2', 'pred_3', 'pred_4']:
            if pred_csv.loc[index, col] == label:
                acc += 0.2
        return round(acc, 1)

    # Remove rows where every model's prediction is wrong (noise data).
    delete_index = []
    for index in range(len(df)):
        if get_acc(df, index) <= 0:
            delete_index.append(index)
    df = df.drop(delete_index)
    df = df.reset_index(drop=True)

    X = np.array(df[["pred_0", "pred_1", "pred_2", "pred_3", "pred_4"]])
    y = np.array(df[["label"]]).flatten()

    # MLP meta-classifier over the per-model predictions.
    nn = MLPClassifier(max_iter=2000)
    nn.fit(X, y)
    result = nn.predict(preds)

    pred_result = pd.concat([
        pd.DataFrame(file_list, columns=['image_id']),
        pd.DataFrame(result, columns=['label'])
    ], axis=1)
    pred_result.to_csv(output_path + "submission.csv", index=False, sep=',')
    print("Done.")

def EpsDBSCAN(D, k):
    """Variant of the eps estimator above: averages the measured slopes
    between candidate eps values instead of using the range-based slope."""
    nn = NearestNeighbors(n_neighbors=k + 1)
    nn.fit(D)
    distances, indices = nn.kneighbors(D)
    distances = np.delete(distances, 0, 1)  # drop the self-distance column
    AvgDist = distances.sum(axis=1) / k     # mean distance to the k neighbours
    Avg_Array = sorted(AvgDist)

    # Normalise the sorted average distances to [0, 1] and bin them.
    num = len(Avg_Array)
    minArray = min(Avg_Array)
    maxArray = max(Avg_Array)
    n_Array = [(a - minArray) / (maxArray - minArray) for a in Avg_Array]
    bins = np.linspace(0, 1, 10)
    bin_indice = np.digitize(n_Array, bins)

    # Candidate eps values: the mean of every bin holding at least k points.
    Eps = []
    Avg_Array = np.array(Avg_Array)
    for i in range(10):
        count = len(np.where(bin_indice == i)[0])
        if count >= k:
            e = np.sum(Avg_Array[bin_indice == i], axis=0) / count
            Eps.append(e)

    # Index of the first point whose average distance exceeds each candidate.
    N = len(Eps)
    Eps_index = []
    for i in range(N):
        for j in range(num):
            if Avg_Array[j] > Eps[i]:
                Eps_index.append(j)
                break

    # Slopes between consecutive candidates; pick the candidate just before
    # the first above-average slope.
    Slopes = []
    for i in range(N - 1):
        Slopes.append((Eps[i + 1] - Eps[i]) / (Eps_index[i + 1] - Eps_index[i]))
    ave_slope = sum(Slopes) / len(Slopes)
    for i in range(N - 1):
        if i > 0 and Slopes[i] > ave_slope:
            out = Eps[i]
            break
        else:
            out = Eps[i + 1]

    # Several selection rules were tried here (the slope-based pick above,
    # the median or mean of the candidates, fixed percentiles of the
    # k-distance curve); the final choice is the second candidate.
    out = Eps[1]
    return out