def test_predict_proba(self): clf = SklearnClassifier(estimator=GaussianProcessClassifier(), missing_label='nan') self.assertRaises(NotFittedError, clf.predict_proba, X=self.X) clf.fit(X=self.X, y=self.y1) P = clf.predict_proba(X=self.X) est = GaussianProcessClassifier().fit(X=np.zeros((3, 1)), y=['tokyo', 'paris', 'tokyo']) P_exp = est.predict_proba(X=self.X) np.testing.assert_array_equal(P_exp, P) np.testing.assert_array_equal(clf.classes_, est.classes_) clf.fit(X=self.X, y=self.y2) with warnings.catch_warnings(record=True) as w: warnings.simplefilter("always") P = clf.predict_proba(X=self.X) self.assertEqual(len(w), 1) P_exp = np.ones((len(self.X), 1)) np.testing.assert_array_equal(P_exp, P) clf = SklearnClassifier(estimator=GaussianProcessClassifier(), classes=['ny', 'paris', 'tokyo'], missing_label='nan') clf.fit(X=self.X, y=self.y_nan) P = clf.predict_proba(X=self.X) P_exp = np.ones((len(self.X), 3)) / 3 np.testing.assert_array_equal(P_exp, P) clf.fit(X=self.X, y=self.y1) P = clf.predict_proba(X=self.X) P_exp = np.zeros((len(self.X), 3)) P_exp[:, 1:] = est.predict_proba(X=self.X) np.testing.assert_array_equal(P_exp, P)
def gpc_sklearn(ax, x, y, kernel, optimizer="fmin_l_bfgs_b"):
    """
    Implemented with GaussianProcessClassifier in sklearn.gaussian_process.
    The implementation is based on Algorithms 3.1, 3.2, and 5.1 of GPML.
    The Laplace approximation is used for approximating the non-Gaussian
    posterior by a Gaussian. The implementation is restricted to using the
    logistic link function.

    INPUT:
        ax: an Axes object
        x: (N, ) np.array
        y: (N, ) np.array
        kernel: sklearn.gaussian_process.kernels object.
            Used to initialize GaussianProcessClassifier
        optimizer: string or callable.
            Can either be one of the internally supported optimizers for
            optimizing the kernel's parameters, specified by a string, or an
            externally defined optimizer passed as a callable with the
            signature expected by GaussianProcessClassifier. If None is
            passed, the kernel's parameters are kept fixed.

    OUTPUT:
        ax: an Axes object
    """
    # Fit GaussianProcessClassification and LinearRegression models
    gpc = GaussianProcessClassifier(kernel=kernel, optimizer=optimizer)
    gpc.fit(x[:, np.newaxis], y)
    print("\nLearned kernel: %s" % gpc.kernel_)
    y_ = gpc.predict_proba(x[:, np.newaxis])[:, 1]
    xs = np.linspace(np.min(x), np.max(x), 1000)
    ys = gpc.predict_proba(xs[:, np.newaxis])[:, 1]
    # lr = LinearRegression()
    # lr.fit(x[:, np.newaxis], y)  # x needs to be 2d for LinearRegression

    # Plot
    # ax.plot(x, y, 'r.', markersize=12, alpha=0.2)
    ax.plot(xs, ys, markersize=12, alpha=0.2)
    # ax.plot(x, lr.predict(x[:, np.newaxis]), 'b-')
    # ax.set_xlim(-0.1, 1.1)
    # ax.set_ylim(-0.1, 1.1)

    # compute ECE and ACC after calibration
    ece = EceEval(np.array([1 - y_, y_]).T, y, num_bins=100)
    y_predict = y_ > 0.5
    acc = (y_predict == y).mean()
    ax.text(0.05, 0.8, 'ECE=%.4f\nACC=%.4f' % (ece, acc),
            size=14, ha='left', va='center',
            bbox={'facecolor': 'green', 'alpha': 0.5, 'pad': 4})
    return ax
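# The function above relies on an external EceEval helper that is not shown here.
# The sketch below is a minimal, assumed implementation of expected calibration error
# (equal-width bins over the predicted confidence); the real EceEval may differ.
import numpy as np

def ece_eval_sketch(probs, y_true, num_bins=100):
    """Expected calibration error for an (N, n_classes) array of predicted probabilities."""
    confidences = probs.max(axis=1)        # confidence of the predicted class
    predictions = probs.argmax(axis=1)     # predicted class index
    accuracies = (predictions == y_true)
    bin_edges = np.linspace(0.0, 1.0, num_bins + 1)
    ece = 0.0
    for lo, hi in zip(bin_edges[:-1], bin_edges[1:]):
        in_bin = (confidences > lo) & (confidences <= hi)
        if in_bin.any():
            gap = abs(accuracies[in_bin].mean() - confidences[in_bin].mean())
            ece += in_bin.mean() * gap     # weight the gap by the fraction of samples in the bin
    return ece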
def GPAL(X, Y, train_ind, candidate_ind, test_ind, sample='En', kernel='rbf', Niter=500, eta=10): ourRes = [] train_index = train_ind.copy() test_index = test_ind.copy() candidate_index = candidate_ind.copy() varRes = [] enRes = [] for i in range(Niter): print(i) if (kernel == 'linear'): dotkernel = DotProduct(sigma_0=1) model = GPC(kernel=dotkernel) else: model = GPC() model.fit(X[train_index], Y[train_index]) ourRes.append(model.score(X[test_index, :], Y[test_index])) print(ourRes[-1]) if (sample == 'rand'): sampleIndex = np.random.randint(len(candidate_index)) elif (sample == 'En'): proba = model.predict_proba(X[candidate_index, :]) en = sp.stats.entropy(proba.T) sampleScore = en sampleIndex = np.argmax(sampleScore) elif (sample == 'var'): model.predict_proba(X[candidate_index, :]) meanVar = np.zeros(len(candidate_index)) for tem in model.base_estimator_.estimators_: meanVar = meanVar + tem.var sampleIndex = np.argmax(meanVar) elif (sample == 'varEN'): proba = model.predict_proba(X[candidate_index, :]) en = sp.stats.entropy(proba.T) meanVar = np.zeros(len(candidate_index)) enRes.append(np.mean(en)) for tem in model.base_estimator_.estimators_: meanVar = meanVar + tem.var sampleIndex = np.argmax(meanVar / len(np.unique(Y)) * eta + en) varRes.append(np.mean(meanVar)) print('max var %f----selected var %f-----selected en %f ' % (np.max(meanVar), meanVar[sampleIndex], en[sampleIndex])) sampleIndex = candidate_index[sampleIndex] train_index = train_index + [sampleIndex] candidate_index = [ x for x in candidate_index if x not in [sampleIndex] ] return [ourRes, varRes, enRes]
def test_multi_class_n_jobs(kernel): # Test that multi-class GPC produces identical results with n_jobs>1. gpc = GaussianProcessClassifier(kernel=kernel) gpc.fit(X, y_mc) gpc_2 = GaussianProcessClassifier(kernel=kernel, n_jobs=2) gpc_2.fit(X, y_mc) y_prob = gpc.predict_proba(X2) y_prob_2 = gpc_2.predict_proba(X2) assert_almost_equal(y_prob, y_prob_2)
def calculate_t(dataset_no): print("Starting to find orignal_labels ans for dataset no", dataset_no) X, y = load_datasets(dataset_no) rows, col = X.shape kernel = 1.0 * RBF(1.0) ROW = int(Training_percent * rows) ROW = 800 print("Starting $") gpc = GaussianProcessClassifier(kernel=kernel, random_state=0).fit(X[:ROW, :], y[:ROW]) print("Successfully trained ", dataset_no) print("Starting predicting data for full length") orignal_probability = gpc.predict_proba(X[:ROW, :]) print("Orignal_probability array calculated for dataset_no", dataset_no) mrl = [None for _ in range(5)] current_pos = int(STARTING_FRACTION * col) while 1: gpc = GaussianProcessClassifier(kernel=kernel, random_state=0).fit( X[:ROW, :current_pos], y[:ROW]) new_probability = gpc.predict_proba(X[:ROW, :current_pos]) print("Probabilities calculated for current value of f = ", current_pos) for i in range(5): value_mrl = mrl[i] if not (value_mrl): if i == 4: temporary = 16 else: temporary = i + 1 if (check_probabilities_for_f(orignal_probability, new_probability, alpha, y, temporary)): mrl[i] = current_pos print("F for label", temporary, " is ", current_pos) # print("Saving model") # s = 'label_id' + str(i+1) + "component" + str(dataset_no) # filename = 'models/' + s + '.sav' # pickle.dump(gpc, open(filename, 'wb')) all_completed = 1 for value_mrl in mrl: if not (value_mrl): all_completed = 0 if all_completed: break current_pos = current_pos + 5 return mrl
class myGPBinary(myModel):

    def make(self, make_params):
        self.model = GaussianProcessClassifier(**make_params)
        return self

    def fit(self, xtrain, ytrain, xtest=None, ytest=None, fit_params={}):
        self.model.fit(xtrain, ytrain, **fit_params)

    def predict(self, xs, threshold=0.5):
        # Threshold the positive-class probability instead of returning the
        # unbound predict method.
        return self.predict_proba(xs) >= threshold

    def predict_proba(self, xs):
        # Return the positive-class probability; reshape a single sample to 2-D
        # so both branches return the same kind of output.
        if len(xs.shape) == 1:
            return self.model.predict_proba(xs.reshape(1, -1))[:, 1]
        else:
            return self.model.predict_proba(xs)[:, 1]
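# A short usage sketch for the wrapper above on synthetic data. myModel is not shown,
# so this assumes it needs no constructor arguments; treat the call pattern as illustrative.
import numpy as np
from sklearn.datasets import make_classification
from sklearn.gaussian_process.kernels import RBF

X_demo, y_demo = make_classification(n_samples=60, n_features=4, random_state=0)
model = myGPBinary().make({'kernel': 1.0 * RBF(1.0), 'random_state': 0})
model.fit(X_demo, y_demo)
print(model.predict_proba(X_demo[:5]))            # positive-class probabilities
print(model.predict(X_demo[:5], threshold=0.5))   # boolean predictions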
def GPC(train, target, test):
    kernel = 1.0 * RBF(1.0)
    gpc = GaussianProcessClassifier(kernel=kernel, random_state=0)
    gpc.fit(train, target)
    # print("Score:", gpc.score(train, target))
    prediction = gpc.predict_proba(test)[:, 1]
    return prediction
def get_new_labels_entropy(evaluated_set_X, evaluated_set_y,
                           unevaluated_X, number_of_new_labels, _KRIGING=0):
    """ Get a set of parameter combinations according to their predicted label entropy """
    if _KRIGING:
        clf = GaussianProcessClassifier()
        clf.fit(evaluated_set_X,
                calibration_condition(evaluated_set_y, calibration_threshold))
    else:
        clf = fit_entropy_classifier(evaluated_set_X, evaluated_set_y,
                                     surrogate_model, surrogate_parameter_space)

    y_hat_probability = clf.predict_proba(unevaluated_X)
    # Use a list comprehension so this also works on Python 3, where map()
    # returns an iterator that np.array() would not expand into per-row entropies.
    y_hat_entropy = np.array([entropy(p) for p in y_hat_probability])
    y_hat_entropy /= y_hat_entropy.sum()
    unevaluated_X_size = unevaluated_X.shape[0]

    selections = np.random.choice(a=unevaluated_X_size,
                                  size=number_of_new_labels,
                                  replace=False,
                                  p=y_hat_entropy)
    return selections
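# calibration_condition, fit_entropy_classifier and the surrogate_* globals used above are
# project-specific and not shown. The sketch below is a self-contained version of the same
# idea: score pool points by predictive entropy and draw new labels proportionally to it.
import numpy as np
from scipy.stats import entropy
from sklearn.gaussian_process import GaussianProcessClassifier

def entropy_sampling_sketch(X_labelled, y_labelled, X_pool, n_new, seed=None):
    rng = np.random.default_rng(seed)
    clf = GaussianProcessClassifier().fit(X_labelled, y_labelled)
    proba = clf.predict_proba(X_pool)
    scores = entropy(proba.T)              # scipy computes along axis 0, so transpose to get per-row entropy
    weights = scores / scores.sum()
    return rng.choice(X_pool.shape[0], size=n_new, replace=False, p=weights)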
def test_predict_consistent():
    """ Check binary predict decision has also predicted probability above 0.5. """
    for kernel in kernels:
        gpc = GaussianProcessClassifier(kernel=kernel).fit(X, y)
        assert_array_equal(gpc.predict(X), gpc.predict_proba(X)[:, 1] >= 0.5)
def test_predict_consistent_structured(): # Check binary predict decision has also predicted probability above 0.5. X = ["A", "AB", "B"] y = np.array([True, False, True]) kernel = MiniSeqKernel(baseline_similarity_bounds="fixed") gpc = GaussianProcessClassifier(kernel=kernel).fit(X, y) assert_array_equal(gpc.predict(X), gpc.predict_proba(X)[:, 1] >= 0.5)
def activity_3_3():
    iris = datasets.load_iris()
    X = iris.data[:, :2]
    y = np.array(iris.target, dtype=int)

    h = .02  # create a mesh for plotting
    x_min, x_max = X[:, 0].min() - 1, X[:, 0].max() + 1
    y_min, y_max = X[:, 1].min() - 1, X[:, 1].max() + 1
    xx, yy = np.meshgrid(np.arange(x_min, x_max, h), np.arange(y_min, y_max, h))

    kernel = 1.0 * kernels.RBF([1.0])
    gpc_rbf_isotropic = GaussianProcessClassifier(kernel=kernel).fit(X, y)
    Z = gpc_rbf_isotropic.predict_proba(np.c_[xx.ravel(), yy.ravel()])

    # map the class probabilities to colors
    Z = Z.reshape((xx.shape[0], xx.shape[1], 3))
    plt.imshow(Z, extent=(x_min, x_max, y_min, y_max), origin="lower")

    # plot the training points
    plt.scatter(X[:, 0], X[:, 1], c=np.array(["r", "g", "b"])[y], edgecolors=(0, 0, 0))
    plt.xlabel('Sepal length')
    plt.ylabel('Sepal width')
    plt.xlim(xx.min(), xx.max())
    plt.ylim(yy.min(), yy.max())
    plt.show()
def train_on_pool(choice_function, X, y, pool_idcs, train_idcs, test_idcs, name): Xtest, ytest = X[test_idcs], y[test_idcs] accuracies, balances, n_points, train_idcs, pool_idcs = list(), list( ), list(), copy(train_idcs), copy(pool_idcs) gp = GaussianProcessClassifier(n_restarts_optimizer=25, kernel=Matern(), n_jobs=-1, random_state=42) #Add initial points while pool_idcs: Xtrain, ytrain = X[train_idcs], y[train_idcs] gp.fit(Xtrain, ytrain) preds = gp.predict(Xtest) accuracies.append(accuracy_score(ytest, preds)) n_points.append(len(train_idcs)) train_classes = np.unique(y[train_idcs], return_counts=True)[1] balances.append(max(train_classes) / sum(train_classes)) print( f"{len(train_idcs)}: {name}: {accuracies[-1]:.3}, class balance: {balances[-1]:.3}" ) y_pool_p = gp.predict_proba(X[pool_idcs]) chosen_idx = choice_function(y_pool_p) train_idcs.append(pool_idcs.pop(chosen_idx)) return n_points, accuracies, balances
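# train_on_pool above expects a choice_function that maps the pool's predicted
# probabilities (shape (n_pool, n_classes)) to the index of the next point to label.
# A few common choices, sketched here for use with that loop:
import numpy as np
from scipy.stats import entropy

def choose_least_confident(y_pool_p):
    # pick the point whose most probable class is least certain
    return int(np.argmin(y_pool_p.max(axis=1)))

def choose_max_entropy(y_pool_p):
    # pick the point with the highest predictive entropy
    return int(np.argmax(entropy(y_pool_p.T)))

def choose_random(y_pool_p):
    # random baseline for comparison
    return int(np.random.randint(len(y_pool_p)))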
class GaussianProcess_(ProbabilisticModel): """GaussianProcess Classifier """ def __init__(self, *args, **kwargs): self.model = GaussianProcessClassifier(*args, **kwargs) self.name = "gpc" def train(self, dataset, *args, **kwargs): return self.model.fit(*(dataset.format_sklearn() + args), **kwargs) def predict(self, feature, *args, **kwargs): return self.model.predict(feature, *args, **kwargs) def score(self, testing_dataset, *args, **kwargs): return self.model.score(*(testing_dataset.format_sklearn() + args), **kwargs) def predict_proba(self, feature, *args, **kwargs): return self.model.predict_proba(feature, *args, **kwargs) def feature_importances_(self): LOGGER.warn("GPC model does not support feature_importance") return None def get_params(self): return self.model.get_params()
def build_classifier_gp(data, labels, **kwargs): linear_kernel = Sum(k1=Product(k1=DotProduct(sigma_0=0, sigma_0_bounds='fixed'), k2=ConstantKernel()), k2=ConstantKernel()) gp_clf = GaussianProcessClassifier(kernel=linear_kernel) gp_clf.fit(data, labels) id_pos_class = gp_clf.classes_ == labels.max() return gp_clf, gp_clf.predict_proba(data)[:, id_pos_class]
def test_multi_class(kernel): # Test GPC for multi-class classification problems. gpc = GaussianProcessClassifier(kernel=kernel) gpc.fit(X, y_mc) y_prob = gpc.predict_proba(X2) assert_almost_equal(y_prob.sum(1), 1) y_pred = gpc.predict(X2) assert_array_equal(np.argmax(y_prob, 1), y_pred)
def evaluate_gp(y,x,y_test,x_test): from sklearn.gaussian_process.kernels import RBF, ConstantKernel,WhiteKernel from sklearn.gaussian_process import GaussianProcessClassifier np.random.seed(200) kernel =ConstantKernel()*RBF() + WhiteKernel() start = time.time() gp = GaussianProcessClassifier(kernel=kernel, n_restarts_optimizer=10).fit(x, y) logp = np.log(gp.predict_proba(x_test)) end = time.time() print('Gp took {}s'.format(end - start)) test_loglik = np.mean(y_test.reshape(-1)*logp[:,1] + (1-y_test.reshape(-1))*logp[:,0]) return(test_loglik)
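# The Bernoulli test log-likelihood computed above is, up to sign and sklearn's internal
# probability clipping, what sklearn.metrics.log_loss reports. An equivalent form,
# assuming binary labels in {0, 1} and a fitted classifier as in evaluate_gp:
from sklearn.metrics import log_loss

def mean_test_loglik(gp, x_test, y_test):
    proba = gp.predict_proba(x_test)
    # log_loss is the negative mean log-likelihood, so negate it
    return -log_loss(y_test.reshape(-1), proba, labels=[0, 1])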
def GPRTraining(XEstimate, XValidate, Parameters, class_labels): kernel = RBF(length_scale=1.0, length_scale_bounds=(1e-05, 100000.0)) #clf = GaussianProcessClassifier(kernel=kernel, n_restarts_optimizer=1) clf = GaussianProcessClassifier(kernel=RBF(length_scale=1.0), optimizer=None, multi_class='one_vs_one', n_jobs=1) print(clf.fit(XEstimate, class_labels)) Yvalidate = clf.predict(XValidate) EstParameters = clf.get_params() print(clf.predict_proba(XValidate)) return {"Yvalidate": Yvalidate, "EstParameters": EstParameters, "clf": clf}
def calculate_mrl(X, Y, alpha, initial_f): ''' Calculates the mrl for a component ''' kernel = 1.0 * RBF(1.0) gpc = GaussianProcessClassifier(kernel=kernel, random_state=0).fit(X, Y) original_prob = gpc.predict_proba(X) print("Probabilities for the complete MTS calculated") mrl = [None for _ in range(LABELS)] current_f = initial_f while True: gpc = GaussianProcessClassifier(kernel=kernel, random_state=0).fit( X[:, :current_f], Y) new_prob = gpc.predict_proba(X[:, :current_f]) print("Probabilities calculated when value of F is %s" % (str(current_f))) for i, mrl_value in enumerate(mrl): if not (mrl_value): if check_probabilities(original_prob, new_prob, alpha, Y, i + 1): mrl[i] = current_f print("F for label %s is %s" % (str(i + 1), str(current_f))) all_values_are_found = True for mrl_value in mrl: if not (mrl_value): all_values_are_found = False if (all_values_are_found): break current_f += 1 return mrl
def run_gaussian_clf(df, config): df = df[0:100] start = time.time() X = df.drop(columns={"flux_list", "wavelength", "objid", "ra", "dec", "class", "spectral_lines"}) y = df["class"] X_train, X_test, y_train, y_test = train_test_split(X, y,test_size=0.33, random_state=42) kernel = 1.0 * RBF(1.0)#config['kernel_val']) model = GaussianProcessClassifier(kernel=kernel, random_state=0).fit(X_train, y_train) y_pred_test = model.predict(X_test) accuracy_test = acc(y_test, y_pred_test) end = time.time() tt = end - start print("Accuracy of trained model on test set: %.2f%%" % (accuracy_test * 100.0)) # print(y_pred_test) print("time :", tt) model.predict_proba(X_test) df_result_GC = pd.DataFrame(model.predict_proba(X_test)) df_result_GC_rename = df_result_GC.rename(columns={0: "GALAXY", 1: "QSO", 2: "STAR"}) df_result_GC_rename["predict"] = y_pred_test df_result_GC_rename["actual"] = y_test
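# Hard-coding the column names when wrapping the predict_proba output (as in the rename
# above) only works if the classes happen to be in that order. predict_proba columns
# follow model.classes_, so building the frame from that attribute is safer:
import pandas as pd

def proba_frame(model, X_test):
    # column order is guaranteed to match model.classes_
    return pd.DataFrame(model.predict_proba(X_test), columns=model.classes_)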
def getBestParametersRBF(X, Y): nMag = 10 nScale = 10 nValidation = 2 magnitudes = np.logspace(-2, 4, nMag) #10^-2 to 10^4 scales = np.logspace(-1, 2, nScale) #10^-1 to 10^2 nlpd = np.zeros((nMag, nScale)) for i in range(nMag): for j in range(nScale): sumProb = 0.0 numProb = 0 for n_k in range(nValidation): gp = GaussianProcessClassifier(kernel=magnitudes[i] * RBF(scales[j])) xTrain, yTrain, xTest, yTest = splitTrainingTestingData( X, Y, nValidation, n_k) gp.fit(xTrain, yTrain) #get prediction probabilities probs = gp.predict_proba(xTest) sumProb += sum(-np.log(probs[i][(1 + yTest[i]) // 2]) for i in range(probs.shape[0])) numProb += probs.shape[0] #calculate negtive log predictive density nlpd[i, j] = sumProb / numProb id_x, id_y = np.unravel_index(nlpd.argmin(), nlpd.shape) #index of minimum value of NLPD #3D plot of NLPD X, Y = np.meshgrid(magnitudes, scales) nlpd = np.transpose(nlpd) fig = plt.figure() ax = fig.gca(projection='3d') surf = ax.plot_surface(np.log10(X), np.log10(Y), nlpd, cmap=cm.coolwarm, linewidth=0, antialiased=False) fig.colorbar(surf, shrink=0.5, aspect=10) ax.set_xlabel("Magnitude") ax.set_ylabel("Length-scale") ax.set_zlabel("Negative Log Predictive Density") plt.show() return scales[id_y], magnitudes[id_x]
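# The inner NLPD accumulation above assumes labels in {-1, +1}, which the
# (1 + yTest[i]) // 2 expression maps onto the probability columns {0, 1}.
# A vectorized equivalent under the same label convention:
import numpy as np

def nlpd(probs, y_test):
    # probs: (N, 2) output of predict_proba, y_test: array of -1/+1 labels
    cols = ((1 + np.asarray(y_test)) // 2).astype(int)
    p_true = probs[np.arange(len(cols)), cols]
    return np.mean(-np.log(p_true))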
def common_test_gpc(self, dtype=np.float32, n_classes=2): gp = GaussianProcessClassifier() gp, X = self.fit_classification_model(gp, n_classes=n_classes) # return_cov=False, return_std=False if dtype == np.float32: cls = FloatTensorType else: cls = DoubleTensorType model_onnx = to_onnx(gp, initial_types=[('X', cls([None, None]))], target_opset=TARGET_OPSET, options={ GaussianProcessClassifier: { 'zipmap': False, 'optim': 'cdist' } }) self.assertTrue(model_onnx is not None) try: sess = InferenceSession(model_onnx.SerializeToString()) except OrtFail: if not hasattr(self, 'path'): return suffix = 'Double' if dtype == np.float64 else 'Float' # Operator Solve is missing model_onnx = change_onnx_domain( model_onnx, {'Solve': ('Solve%s' % suffix, 'ai.onnx.contrib')}) so = SessionOptions() so.register_custom_ops_library(self.path) sess = InferenceSession(model_onnx.SerializeToString(), so) res = sess.run(None, {'X': X.astype(dtype)}) assert_almost_equal(res[0].ravel(), gp.predict(X).ravel()) assert_almost_equal(res[1], gp.predict_proba(X), decimal=3) return dt = 32 if dtype == np.float32 else 64 dump_data_and_model(X.astype(dtype), gp, model_onnx, verbose=False, basename="SklearnGaussianProcessRBFT%d%d" % (n_classes, dt))
class GaussianProcessClassifierImpl: def __init__(self, **hyperparams): self._hyperparams = hyperparams self._wrapped_model = Op(**self._hyperparams) def fit(self, X, y=None): if y is not None: self._wrapped_model.fit(X, y) else: self._wrapped_model.fit(X) return self def predict(self, X): return self._wrapped_model.predict(X) def predict_proba(self, X): return self._wrapped_model.predict_proba(X)
def model_GPC(self):
    kf, X, y = self.data_KF()
    avg = 0
    for train_index, test_index in kf.split(X):
        X_train, X_test = X[train_index], X[test_index]
        y_train, y_test = y[train_index], y[test_index]
        kernel = 1.0 * RBF(1.0)
        # Fit on the training fold only; fitting on all of X would leak the
        # test fold into training.
        gpc = GaussianProcessClassifier(kernel=kernel, random_state=0).fit(X_train, y_train)
        proba = gpc.predict_proba(X_test)
        # print(proba)
        # Compare predicted labels (not probability rows) to the test labels.
        predictions = gpc.predict(X_test)
        count0 = 0
        for index in range(len(predictions)):
            if predictions[index] == y_test[index]:
                count0 += 1
        avg += count0 / len(predictions)
    return avg / self.nsp
def OnlineGPC(self, X, y, shift=1):
    """
    :param X: global input
    :param y: ground truth
    :return: alert=1 OK, alert=0 sends an alert
    """
    T = X.shape[0]
    alert = np.zeros(T - shift)
    clf = GaussianProcessClassifier()
    for t in np.arange(shift, T):
        x_t = X[:t, :]
        y_t = y[:t]
        clf.fit(x_t, y_t)
        # predict_proba expects a 2-D array, so keep the row dimension
        score = clf.predict_proba(X[t:t + 1, :])
        alert[t - shift] = (score[0][int(y[t] - 1)] >= 0.5)
    return alert
def apply_gaussian_classifier(feature, col_required, array_to_predict): Main_X, Main_Y = load_datasets(feature) print("Starting Gausian") gpc = GaussianProcessClassifier(kernel=kernel, random_state=0).fit( Main_X[:ROWS, :col_required], Main_Y[:ROWS]) print("Successfully Trained :)") orignal_probability = gpc.predict_proba(array_to_predict) maxa = max(orignal_probability[0]) for j in range(5): if (orignal_probability[0][j] == maxa): index = j if (index == 0): return 1 elif (index == 1): return 2 elif (index == 2): return 3 elif (index == 3): return 4 else: return 16
class GaussianProcess(BaseEstimator): """Implement a Gaussian Process Classifier. GP is by definition a Bayesian model, so uncertainty on the prediction is easy acquired. Attributes: model (TYPE): Description """ def __init__(self, name='GP'): super().__init__(name) self.model = None def fit(self, x, y, **kwargs): """Train a Gaussian Process Classifier. Args: x (np.array): design matrix (inputa data) y (np.array): labels **kwargs: additional parameters """ # Specify Gaussian Processes with fixed and optimized hyperparameters self.model = GaussianProcessClassifier(kernel=1.0 * RBF(length_scale=1.0), multi_class='one_vs_rest') self.model.fit(x, y) def predict(self, x): """Perform prediction using GP. Args: x (np.array): input data Returns: np.array: multiple predictions for each sample x_i """ return self.model.predict_proba(x)
class GaussianProcess(AbstractModel): def __init__(self, optimised=True): self.create_model(optimised) def create_model(self, optimised): self.model = GaussianProcessClassifier() def fit_model(self, x_train, y_train): self.model.fit(x_train, y_train) def predict(self, x_test): y_pred = self.model.predict(x_test) return y_pred def get_model(self): return self.model def predict_proba(self, x_test): y_pred = self.model.predict_proba(x_test) return y_pred def print(self): pass
def trainPredict(subjectid, makeplot=False): print("testing participant " + subjectid) # Load training data from the file matlab generates traindata = np.genfromtxt('csvdata/' + subjectid + '_sim.csv', delimiter=',', missing_values=['NaN', 'nan'], filling_values=None) # Clean + downsample this data trainx, trainy = cleandata(traindata, downsamplefactor=20) # Train a Gaussian Process anisokern = kernels.RBF() # default kernel gp = GaussianProcessClassifier(kernel=anisokern) # Initialize the GPC gp.fit(trainx, trainy) # train this class on the data trainx = trainy = None # Discard all training data to preserve memory # load test data testdata = np.genfromtxt('csvdata/' + subjectid + '_rival.csv', delimiter=',', missing_values=['NaN', 'nan'], filling_values=None) testx, testy = cleandata(testdata, downsamplefactor=4) # clean data testdata = None # clear from memory # work out percentage in percept for each data point: percentages, nextpercept = assign_percentage(testy) # get a prediction for all points in the test data: predicty = gp.predict(testx) proby = gp.predict_proba(testx) if makeplot: summaryplot(participant, testx, testy, predicty, proby, gp) # Summarise prediction by reported percept meanprediction = {'mean' + percept: proby[testy == value, 1].mean() for percept, value in perceptindices.iteritems()} predictiondev = {'stdev' + percept: proby[testy == value, 1].std() for percept, value in perceptindices.iteritems()} predictionaccuracy = {'acc' + percept: (predicty[testy == value] == testy[testy == value]).mean() for percept, value in perceptindices.iteritems()} # Summarise prediction by percentage in percept predictioncourse = {'timecourse' + percept + str(cutoff): proby[(testy == value) & (percentages < cutoff) & (percentages > cutoff - 0.1), 1].mean() for percept, value in perceptindices.iteritems() for cutoff in np.linspace(0.1, 1, 10)} # Summarise mixed percept time courses by the next percept nextcourse = {'nextcourse' + percept + str(cutoff): proby[(testy == 0) & (percentages < cutoff) & (percentages > cutoff - 0.1) & (nextpercept == perceptindices[percept]), 1].mean() for percept in ['highfreq', 'lowfreq'] for cutoff in np.linspace(0.1, 1, 10)} afterdominant = {'after' + percept + "_" + after + "_" + str(cutoff): proby[(testy == perceptindices[percept]) & (percentages < cutoff) & (percentages > cutoff - 0.1) & (nextpercept == perceptindices[after]), 1].mean() for percept, after in [('highfreq', 'mixed'), ('highfreq', 'lowfreq'), ('lowfreq', 'mixed'), ('lowfreq', 'highfreq')] for cutoff in np.linspace(0.1, 1, 10)} # Only return the summarised data return meanprediction, predictiondev, predictionaccuracy, \ predictioncourse, nextcourse, afterdominant
def test_predict_consistent(kernel):
    # Check binary predict decision has also predicted probability above 0.5.
    gpc = GaussianProcessClassifier(kernel=kernel).fit(X, y)
    assert_array_equal(gpc.predict(X), gpc.predict_proba(X)[:, 1] >= 0.5)
xx, yy = np.meshgrid(np.linspace(-3, 3, 50), np.linspace(-3, 3, 50)) rng = np.random.RandomState(0) X = rng.randn(200, 2) Y = np.logical_xor(X[:, 0] > 0, X[:, 1] > 0) # fit the model plt.figure(figsize=(10, 5)) kernels = [1.0 * RBF(length_scale=1.0), 1.0 * DotProduct(sigma_0=1.0)**2] for i, kernel in enumerate(kernels): clf = GaussianProcessClassifier(kernel=kernel, warm_start=True).fit(X, Y) # plot the decision function for each datapoint on the grid Z = clf.predict_proba(np.vstack((xx.ravel(), yy.ravel())).T)[:, 1] Z = Z.reshape(xx.shape) plt.subplot(1, 2, i + 1) image = plt.imshow(Z, interpolation='nearest', extent=(xx.min(), xx.max(), yy.min(), yy.max()), aspect='auto', origin='lower', cmap=plt.cm.PuOr_r) contours = plt.contour(xx, yy, Z, levels=[0], linewidths=2, linetypes='--') plt.scatter(X[:, 0], X[:, 1], s=30, c=Y, cmap=plt.cm.Paired) plt.xticks(()) plt.yticks(()) plt.axis([-3, 3, -3, 3]) plt.colorbar(image) plt.title("%s\n Log-Marginal-Likelihood:%.3f" % (clf.kernel_, clf.log_marginal_likelihood(clf.kernel_.theta)),
logpmf1 = predict_copula_classification(copula_classification_obj, x_plot_grid) pmf1 = jnp.exp(logpmf1) jnp.save('plot_files/ccopula_moon_pmf', pmf1) #Predictive Resample B = 1000 T = 5000 logpmf_ytest_samp, logpmf_yn_samp, y_samp, x_samp, pdiff = predictive_resample_classification( copula_classification_obj, y, x, x_plot_grid, B, T) jnp.save('plot_files/ccopula_moon_logpmf_ytest_pr', logpmf_ytest_samp) jnp.save('plot_files/ccopula_moon_logpmf_yn_pr', logpmf_yn_samp) #Convergence T = 10000 #T = 10000, seed = 50 for i = 30 seed = 200 _, _, _, _, pdiff = predictive_resample_classification( copula_classification_obj, y, x, x_test[0:1], 1, T, seed=seed) jnp.save('plot_files/ccopula_moon_pdiff', pdiff) #Gaussian Process from sklearn.gaussian_process.kernels import RBF, ConstantKernel, WhiteKernel from sklearn.gaussian_process import GaussianProcessClassifier kernel = ConstantKernel() * RBF() + WhiteKernel() gp = GaussianProcessClassifier(kernel=kernel, n_restarts_optimizer=10).fit( x, y.reshape(-1, )) p_pred = gp.predict_proba( np.array([x_meshgrid[0].ravel(), x_meshgrid[1].ravel()]).transpose()) jnp.save('plot_files/gp_moon_pred', p_pred)
train_set, test_set = train_test_split(parts_labeled, random_state=42)

# get X and y values
X_train, X_test = [s[['corr_scaled', 'mass_scaled']].values for s in (train_set, test_set)]
y_train, y_test = [s['manual_label'].values for s in (train_set, test_set)]

#clf_scaler_path = '../output/pipeline/GPClassification/GPCclfRBF.p'
#with open(clf_scaler_path, 'rb') as f:
#    clf = pickle.load(f)
#    scaler = pickle.load(f)

# train a gaussian process classifier with RBF kernel (default)
clf = GaussianProcessClassifier(1.0 * RBF(1.0), random_state=42, n_jobs=-1)
clf.fit(X_train, y_train)
plot2dDecisionFunc(clf, X_train, y_train, save=save_dir + 'prob_surfaceGPC.pdf')

clf.score(X_test, y_test)
labels_pred = clf.predict_proba(X_test)[:, 1]

# compute F1 score: harmonic mean between precision and recall
# see https://en.wikipedia.org/wiki/F1_score
prob_f1 = pd.DataFrame()
prob_f1['prob_thresh'] = np.linspace(0.1, 1, 90, endpoint=False)
# iterate over the thresholds stored in the DataFrame (a bare prob_thresh is undefined here)
f1score = np.array([metrics.precision_recall_fscore_support(y_test, labels_pred > thresh)[2]
                    for thresh in prob_f1['prob_thresh']])
prob_f1['f1score_False'] = f1score[:, 0]
prob_f1['f1score_True'] = f1score[:, 1]
prob_f1.to_csv(save_dir + 'prob_f1score.csv', index=False)

fig, ax = plt.subplots()
ax.plot(prob_f1.prob_thresh, prob_f1.f1score_False, color='r')
ax.plot(prob_f1.prob_thresh, prob_f1.f1score_True, color='b')
ax.set(ylabel='F1 score', xlabel='Prob. threshold')
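# The manual threshold sweep above can also be done with sklearn's precision_recall_curve,
# which evaluates every distinct predicted probability as a threshold. Sketched here for
# the positive class only, assuming y_test and labels_pred as defined above:
import numpy as np
from sklearn import metrics

precision, recall, thresholds = metrics.precision_recall_curve(y_test, labels_pred)
with np.errstate(divide='ignore', invalid='ignore'):
    f1_curve = 2 * precision * recall / (precision + recall)
best = np.nanargmax(f1_curve[:-1])   # the last precision/recall point has no threshold
print("best threshold: %.3f, F1: %.3f" % (thresholds[best], f1_curve[best]))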
# training loop over the number of epochs batch_size = 5 batches = int(len(X_) / batch_size) for epoch in range(training_epochs): losses = 0 dkl_losses = 0 accs = 0 for j in range(batches): idx = np.random.randint(X_.shape[0], size=batch_size) X_b = X_[idx] Y_b = Y_[idx] # get the GPC predictions... and slice only the positive class probabilities Y_g = gpc.predict_proba(X_b)[:, 1].reshape((-1, 1)) # train the network, note the dictionary of inputs and labels sess.run(train_step, feed_dict={x: X_b, y: Y_b, y_g: Y_g}) # feedforwad the same data and labels, but grab the accuracy and loss as outputs acc, l, soft_max_a, l_2 = sess.run([accuracy, loss, a, loss_2], feed_dict={ x: X_b, y: Y_b, y_g: Y_g }) losses = losses + np.sum(l) accs = accs + np.sum(acc) dkl_losses = dkl_losses + np.sum(l_2) print("Epoch %.8d " % epoch, "avg train loss over", batches,
class Population(object): """Popultation object containing every function and data structures to run for the pipeline""" types = { 'eas': 'east asian', 'nfe': 'non finish european', 'sas': 'south asian', 'afr': 'african', 'amr': 'mixed american', 'nan': 'unknown', 'fin': 'finish' } dataset = { 'labels_miniproj.txt': 'https://www.dropbox.com/s/dmgchsjklm1jvgk/\ acb_ii_mini_project_labels.txt?dl=1', 'data_miniproj.vcf.bgz': 'https://www.dropbox.com/s/iq8c81awi31067c/\ acb_ii_mini_project.vcf.bgz?dl=1' } maxallelefreq = None callrate = None outfile = None labeled = None nbcomp = None valofinterest = None clf = None train_pha = None test_pha = None train_gt = None test_gt = None train_red = None pred_red = None rec = None classifier = None tofind = None found = None def __init__(self): super(Population, self).__init__() os.system('mkdir data') def __getitem__(self, key): """ you can get an item as with a dictionary """ print self.types[str( list(self.all.loc[self.all['sample_id'] == key]['ancestry'])[0])] def __len__(self): return (self.labeled.shape[0], self.tofind.shape[0]) if self.labeled is not None else 0 def __iter__(self, labeled=True): """ you can get iterate as with a dictionary """ return iter(list(self.labeled['sample_id'])) def keys(self): """ you can get the keys as with a dictionary """ return list(self.all['sample_id']).__iter__() def values(self): """ you can get the values as with a dictionary """ return list(self.all['ancestry']).__iter__() # Utilities def _get_training_data(self, inp, percentage=0.3): X_train, X_test, y_train, y_test = train_test_split( inp, self.labeled['ancestry'], test_size=percentage, random_state=0) return X_train, X_test, y_train, y_test # Functions def load_dataset(self, filename=None, url=None): """ load the data from dropbox in case the user don't have it already you can load your own dataset from anywhere Params: ------ filename: str, the file name url: str, the url """ if filename is None: for key, val in self.dataset.iteritems(): if not os.path.exists('data/' + key): print "downloading " + key + " with urllib" f = urlopen(val) data = f.read() with open('data/' + key, "wb") as code: code.write(data) else: print "file is already there" else: if not os.path.exists(filename): print "downloading " + filename + " with urllib" f = urlopen(url) data = f.read() with open(filename, "wb") as code: code.write(data) else: print "file is already there" def filter_variants(self, name="data/data_miniproj.vcf.bgz", out="out", onlypruning=False, minmissing=30000, maxr2=0.01, callrate=0.8, maxallelefreq=0.01): """ Successful, clean PCA on human genetic data will require filtering data to high-quality variants that are linkage disequilibrium (LD)-pruned. In general, we like to run PCA on high-callrate, bi-allelic, common (allele frequency > 0.01) variants that are pruned to r^2<0.1; but you are welcome to run PCA on whichever set of variants you find work best for you. 
min missing r2 """ if not onlypruning: print "assuming you have mawk, vcftools, cat, cut installed" self.maxallelefreq = maxallelefreq self.callrate = callrate self.outfile = out filt = "vcftools --gzvcf '" + name + "' --recode --out data/lowDPbefore" filt += " --maf " + str(maxallelefreq) filt += ' --min-alleles 2 --max-alleles 2' filt += ' --max-missing ' + str(callrate) print "applying first filter" os.system(filt) print "applying second filter" os.system( 'vcftools --vcf "data/lowDPbefore.recode.vcf" --missing-indv --out data/out' ) print "finding if too much missing individuals and recoding the file" os.system( "mawk '$4 > 30000' data/out.imiss | cut -f1 > data/lowDP.indv") os.system( "vcftools --vcf 'data/lowDPbefore.recode.vcf' --recode --remove data/lowDP.indv\ --out data/filtered2") print "removing garbage.." os.system('rm data/lowDP*') vcf_reader = vcf.Reader(open('data/filtered2.recode.vcf', 'r')) os.system('mkdir data/chunks') print "dividing the input file.." for i in vcf_reader.contigs.keys(): i = str(i) if len(i) < 3: os.system( "vcftools --vcf data/filtered2.recode.vcf --chr " + i + " --recode --recode-INFO-all --out data/chunks/VCF_ch" + i) print "running the ld prunning in parallel (might still take time (avg is 60mn)" for i in vcf_reader.contigs.keys(): i = str(i) if len(i) < 3: os.system( "vcftools --vcf data/chunks/VCF_ch" + i + ".recode.vcf --min-r2 0.1 --geno-r2 --out data/chunks/filtVCF_ch" + i + " &") start = time.time() while (True): nbjob = 0 for p in psutil.process_iter(): try: if str(p.name()) == 'vcftools': nbjob += 1 except (psutil.AccessDenied, psutil.ZombieProcess): pass except psutil.NoSuchProcess: continue if nbjob == 0: break else: print "there is still " + str(nbjob) + " jobs \r", end = time.time() print "it took " + str(end - start) + " seconds" print "concatenating every file" os.system('rm data/*.log') os.system('cat data/chunks/filtVCF_ch* > data/all_VCF.geno.lg') print "now prunning..." os.system( 'vcftools --vcf data/filtered2.recode.vcf --exclude-positions \ data/all_VCF.geno.lg --recode --out data/' + out) def extract_unlabeled(self, filename=None): filename = filename if filename is not None else "data/labels_miniproj.txt" labels = pd.read_csv(filename, sep='\t') indices = labels['ancestry'].isna() self.tofind = labels[indices] self.labeled = labels[indices == False] self.all = pd.concat([self.labeled, self.tofind]) def load_from_vcf(self, filename=None, printinfo=True, maxval=1000, keep_prev=False): """ parloadval read from the filtered vcf file, the names given in df Params: ------ df : dataframe - a dataframe with a sample_id index containing the names of the different samples to extract from the file filename : str - the name of the file printinfo : flag - show the information about the vcf file being read Returns: ------- gt: np.array [nbofindividuals,nbofrecords] - 0-1-2 values stating if the genotype has the ALTval in 0-1-2 of its chromosomes. 
pha: np.array [nbofindividuals,nbofrecords] - bool stating if this variant is phased or not rec: dict[chromvalue:list[POS,REFval,ALTval]] - a dicionnary of meta information about the records being read """ filename = filename if filename is not None else 'data/' + self.outfile + '.recode.vcf' vcf_reader = vcf.Reader(open(filename, 'r')) if printinfo: print "having " + str(len(vcf_reader.contigs)) + " chromosomes" size = 0 for key, val in vcf_reader.contigs.iteritems(): size += val.length print "meta :" print vcf_reader.metadata print "genomesize : " print size label_names = list(self.labeled['sample_id']) test_names = list(self.tofind['sample_id']) if not keep_prev: self.train_gt = np.empty((0, len(label_names)), int) self.train_pha = np.empty((0, len(label_names)), bool) self.test_gt = np.empty((0, len(test_names)), int) self.test_pha = np.empty((0, len(test_names)), bool) self.rec = {} else: self.test_pha = self.test_pha.T self.test_gt = self.test_gt.T self.train_pha = self.train_pha.T self.train_gt = self.train_gt.T chrom = -1 j = 0 numa = 0 count = 0 for record in vcf_reader: if keep_prev: for key, val in self.rec.iteritems(): for key, val2 in val.iteritems(): vcf_reader.next() numa += 1 keep_prev = False count = numa + j print str(count) + " doing chrom : " + str( record.CHROM) + ', at pos : ' + str(record.POS) + "\r", if record.CHROM != chrom: chrom = record.CHROM if record.CHROM not in self.rec: self.rec.update({chrom: {}}) self.rec[chrom].update( {record.ID: [count, record.POS, record.REF, record.ALT]}) train_gt = np.zeros(len(label_names)) train_pha = np.zeros(len(label_names)) for i, name in enumerate(label_names): train_gt[i] = record.genotype(name).gt_type if record.genotype( name).gt_type is not None else 0 train_pha[i] = record.genotype(name).phased if record.genotype( name).phased is not None else 0 test_gt = np.zeros(len(test_names)) test_pha = np.zeros(len(test_names)) for i, name in enumerate(test_names): test_gt[i] = record.genotype(name).gt_type if record.genotype( name).gt_type is not None else 0 test_pha[i] = record.genotype(name).phased if record.genotype( name).phased is not None else 0 self.train_gt = np.vstack((self.train_gt, train_gt)) self.train_pha = np.vstack((self.train_pha, train_pha)) self.test_gt = np.vstack((self.test_gt, test_gt)) self.test_pha = np.vstack((self.test_pha, test_pha)) j += 1 if j > maxval - 1: break # """ # we are using numpy, more efficient # we order by individuals x records self.test_pha = self.test_pha.T self.test_gt = self.test_gt.T self.train_pha = self.train_pha.T self.train_gt = self.train_gt.T print ' ' # to jump a line print "PHASE nonzero " + str(np.count_nonzero(self.train_pha)) print "SNPs nonzero " + str(np.count_nonzero(self.train_gt)) for key, val in self.types.iteritems(): print "you have " + str(self.labeled.loc[self.labeled['ancestry'] == key].shape[0])\ + " " + str(val) + " in your labeled set" def par_load_from_vcf(self, filename, printinfo=True): """ the parallel version of loadfromvcf,should be way faster same inputs but reduced choice for now """ filename = filename if filename is not None else 'data/' + self.outfile + '.recode.vcf' vcf_reader = vcf.Reader(open(filename, 'r')) print "dividing the input file.." 
files = [] for i in vcf_reader.contigs.keys(): i = str(i) if len(i) < 3: files.append(i) os.system( "vcftools --vcf " + filename + " --chr " + i + " --recode --recode-INFO-all --out data/chunks/inpar_ch" + i) label_names = list(self.labeled['sample_id']) test_names = list(self.tofind['sample_id']) self.rec = {} self.train_gt = np.empty((0, len(label_names)), int) self.train_pha = np.empty((0, len(label_names)), bool) self.test_gt = np.empty((0, len(test_names)), int) self.test_pha = np.empty((0, len(test_names)), bool) if printinfo: print "having " + str(len(vcf_reader.contigs)) + " chromosomes" size = 0 for key, val in vcf_reader.contigs.iteritems(): size += val.length print vcf_reader.metadata print size vals = Parallel(n_jobs=-1)( delayed(_inpar)(file, label_names, test_names) for file in files) for i, val in enumerate(vals): if len(val[1]) != 0: # wether or not it is equal to zero we consider it is the same for all others self.train_gt = np.vstack((self.train_gt, convertlist(val[1]))) self.train_pha = np.vstack( (self.train_pha, convertlist(val[2], type=np.bool))) self.test_gt = np.vstack((self.test_gt, convertlist(val[3]))) self.test_pha = np.vstack( (self.test_pha, convertlist(val[4], type=np.bool))) self.rec.update({files[i]: val[0]}) self.test_pha = self.test_pha.T self.test_gt = self.test_gt.T self.train_pha = self.train_pha.T self.train_gt = self.train_gt.T print "PHASE nonzero " + str(np.count_nonzero(self.train_pha)) print "SNPs nonzero " + str(np.count_nonzero(self.train_gt)) for key, val in self.types.iteritems(): print "you have " + str(self.labeled.loc[self.labeled['ancestry'] == key].shape[0])\ + " " + str(val) + " in your labeled set" os.system("rm *.log") os.system("rm data/*.log") os.system("rm data/chunks/*.log") def reduce_features(self, inp=None, topred=None, reducer='pca', n_components=500, val='gt', retrain=True): """ will use a dimensionality reduction algorithm to reduce the number of features of the dataset you can pass it you own inputs or use the ones that are stored in the file Params: ------ inp: np.array[values,features],the input array you have and want to reduce and will train on topred: np.array[values,features], the input array you have and want to reduce and predict reducer: str, the reducer algorithm to use (pca,) n_components : int, the final number of features in your reduced dataset val : str (gt|pha), to see if there is any predictibility using phasing.. 
retrain: flag, set to false if you already have trained the PCA and don't want to restart (espacially important if you consider to compare two different datasets) Outs: ---- nbcomp: saves the number of components valofinterest: the value of interest (val) train_red, pred_red: and the reduced train and pred arrays """ self.nbcomp = n_components self.valofinterest = val if topred is None and inp is None: topred = self.test_gt if val is 'gt' else self.test_pha if inp is None: inp = self.train_gt if val is 'gt' else self.train_pha toreduce = np.vstack((inp, topred)) if topred is not None else inp if reducer is 'pca': redu = PCA(n_components=n_components) if reducer is 'kpca': redu = KernelPCA(n_components=n_components, kernel='linear') if reducer is 'spca': redu = SparsePCA(n_components=n_components, alpha=1, ridge_alpha=0.01, max_iter=1000, method='lars', n_jobs=-1) if reducer is 'lda': redu = TruncatedSVD(n_components=n_components, algorithm='randomized', n_iter=5) red = redu.fit_transform(toreduce) if retrain else redu.fit(toreduce) self.train_red = red[:inp.shape[0]] self.pred_red = red[inp.shape[0]:] def train_classifier(self, inp=None, labels=None, classifier='knn', train=True, test='CV', scoring='accuracy', percentage=0.3, proba=True, iter=100): """ will use a classification algorithm and train it on the training set using the labels and predict its accuracy you can pass it your own inputs and labels (be carefull to reduce their features before hand or use the ones that are stored in the file Params: ------ inp: np.array[values,features], the input array you will train on labels: list of values, the input array you have and want to reduce and predict classifier: str, the classification algorithm to use (adaboost **, knn ***, svm ***, gaussian ***** ) test: str, the test algorithm to use (reg,CV) scoring: string, the scoring to use (not all of them work for this type of classification) percentage: float, the percentage of your data that should be used for testing for the regular testing algorithm proba: flag, to say if you want the algorithm to compute the probability of each class (uniquely for the svm) iter: int, number of iterations for the gradient descent of the gaussian mixture classifier Returns: ------ score, float, the final score the classifier had Outs ---- clf: will save the classifier classifier: and its name """ if inp is None: inp = self.train_red if labels is None: labels = self.labeled['ancestry'] self.classifier = classifier if classifier is 'adaboost': self.clf = AdaBoostClassifier(n_estimators=int(self.nbcomp * 0.7)) elif classifier is 'knn': self.clf = NearestCentroid() elif classifier is 'svm': self.clf = SVC(C=1.0, kernel='rbf', degree=3, gamma='auto', coef0=0.0, shrinking=True, probability=proba, tol=0.001, cache_size=400, class_weight=None, verbose=False, max_iter=-1) elif classifier is 'gaussian': self.clf = GCP(max_iter_predict=iter) else: print "unkown classifier" if test is 'CV': scores = cross_val_score(self.clf, inp, labels, scoring=scoring, cv=3, n_jobs=1) print "cv scores : " + str(scores) score = np.mean(scores) elif test is 'reg': X_train, X_test, y_train, y_test = self._get_training_data( inp, percentage=percentage) self.clf.fit(X_train, y_train) y_pred = self.clf.predict(X_test) score = accuracy_score(y_test, y_pred) self.clf.fit(inp, labels) if train else None print "the total score is of " + str(score) return score def predict_labels(self, inp=None, minval=0.75): """ give it an input (that you have been passed already to the PCA algorithm precising 
that it has already been trained) and gives you the labels Params: ------ inp: np.array[values,features], the input array you will train on ( optional) Returns: ------- found : list, of found values (saved in the class) """ if self.clf is not None: if self.classifier is 'svm': founde = [] print "not checking that you are using svm" self.found = self.clf.predict_proba( inp) if inp is not None else self.clf.predict_proba( self.pred_red) for x, i in enumerate(self.tofind['sample_id']): print '----------------------' print str(i) + 'has:' y = 0 a = '' keya = 'U' for key in self.types.keys(): if key is not 'nan': if self.found[x][y] > minval: print "####", a = self.found[x][y] keya = key print str(self.found[x][y] ) + "% chance to be " + self.types[key] y += 1 founde.append([keya, a]) self.found = founde else: self.found = self.clf.predict( inp) if inp is not None else self.clf.predict( self.pred_red) return self.found def compute_features_nb(self, classifier='knn', reducer='pca', vmin=50, vmax=1000, step=10, k=5): """ computes the number of features that is the best with a simple gready search does not count as training Params: ------ classifier: string : name of the classifier for which you want to the best number of features vmin : int minimal value vmax : step : Returns: ------- a plt plot scores : np.array the ordered list of best scores vals: list the corresponding ordered values """ vals = range(vmin, vmax, step) scores = np.zeros(len(vals)) for i, val in enumerate(vals): self.reduce_features(n_components=val, reducer=reducer) self.selectfeatures(auto=True, k=k) score = self.train_classifier(classifier=classifier, train=False) scores[i] = score plt.plot(vals, scores) ind = np.argsort(scores) scores[:] = scores[ind] vals = [vals[i] for i in ind] return scores, vals def savedata(self, name): """ saves the PC values in a gzip file and the labels in a json file Params: ------ name: str, name of the files n which to save """ filename1 = "data/save/" + name + ".json" filename2 = "data/save/" + name + ".gz" print "writing in " + name d = {} for i, val in enumerate(list(self.tofind['sample_id'])): d.update({val: self.found[i]}) data = json.dumps(d, indent=4, separators=(',', ': ')) dirname = os.path.dirname(filename1) if not os.path.exists(dirname): os.makedirs(dirname) with open(filename1, 'w') as f: f.write(data) np.savetxt(filename2, np.vstack((self.train_red, self.pred_red))) print "it worked !" # not working with the SNP IDs of Han et al. 
def selectSNPs(self, names=[[11, 'rs232045'], [11, 'rs12786973'], [11, 'rs7946015'], [11, 'rs4756778'], [11, 'rs7931276'], [11, 'rs4823557'], [11, 'rs10832001'], [5, 'rs35397'], [11, 'rs11604470'], [11, 'rs10831841'], [1, 'rs2296224'], [11, 'rs12286898'], [11, 'rs1869084'], [11, 'rs4491181'], [11, 'rs1604797'], [11, 'rs7931276'], [11, 'rs11826168'], [11, 'rs477036'], [11, 'rs7940199'], [11, 'rs4429025'], [11, 'rs6483747'], [15, 'rs199138']]): """ Will select a subset of snps from the list given (before dim reducing) Params: ------ names: list[int chromnb,str ID], list containing the chromosome number and the id of the snps """ newtraingt = np.zeros((self.train_gt.shape[0], len(names))) newtestgt = np.zeros((self.test_gt.shape[0], len(names))) newtestpha = np.zeros((self.test_pha.shape[0], len(names))) newtrainpha = np.zeros((self.train_pha.shape[0], len(names))) for i, name in enumerate(names): val = self.rec[str(name[0])][name[1]][0] # selecting the positional value of the SNP in the matrix from # the chrom and ID of the snps newtraingt.T[:][i] = self.train_gt.T[:][val] newtestgt.T[:][i] = self.test_gt.T[:][val] newtestpha.T[:][i] = self.test_pha.T[:][val] newtrainpha.T[:][i] = self.train_pha.T[:][val] self.train_gt = newtraingt self.test_gt = newtestgt self.test_pha = newtestpha self.train_pha = newtrainpha def selectfeatures(self, inp=None, out=None, auto=False, features=None, k=7): """ will select a subset of features from the list or automatically according to an ANOVA F-value """ if not auto and type(featuresnumber) is not list: raise NameError("need feature numbers as a list") inp = inp if inp is not None else self.train_red out = out if out is not None else self.labeled['ancestry'] if auto: sel = selector(k=k) self.train_red = sel.fit_transform(inp, out) self.pred_red = self.pred_red.T[:][sel.get_support(True)].T else: self.train_red = self.train_red.T[:][featuresnumber].T self.pred_red = self.pred_red.T[:][featuresnumber].T def plotPC(self, interactive=False, pc=[0, 1], foundplot=True, tsne=False): """ will plot the features that have been extracted by the reducer algorithm it has nice features such as a color for each label and an interactive plot to zoom and analyze each different individual Params: ----- Interactive: flag, if using bokeh or not to plot foundplot: flag, if add the predicted labels or not pc: list of size 2, the two PCs to analyse tsne: flag, use tsne or plot only two values Returns: ------- p: object, the plot object """ colormap = { 'eas': "#3498db", 'nfe': "#2ecc71", 'sas': "#9b59b6", 'afr': '#34495e', 'amr': '#f1c40f', 'nan': '#000000', 'fin': "#f39c12", 'U': '#7f8c8d' } if self.found is not None: found = self.found if self.classifier is not 'svm' else [ i[0] for i in self.found ] else: found = list(self.tofind['ancestry']) # just nans colorslab = [colormap[x] for x in list(self.labeled['ancestry'])] colorsnot = [colormap[str(x)] for x in found] if foundplot else None labels = list(self.labeled['ancestry']) labels.extend(found) if foundplot else None if tsne: reduced = TSNE( n_components=2, perplexity=30.0, verbose=1, learning_rate=200.0, n_iter=1000).fit_transform( np.vstack(( self.train_red, self.pred_red)) if foundplot else self.train_red) else: tot = np.vstack((self.train_red, self.pred_red)) if foundplot else self.train_red reduced = np.empty((tot.shape[0], 2)) reduced.T[:][0] = tot.T[:][pc[0]] reduced.T[:][1] = tot.T[:][pc[1]] colorslab.extend(colorsnot) if foundplot else None if interactive: names = list(self.labeled['sample_id']) 
names.extend(list(self.tofind['sample_id'])) if foundplot else None print " if you are on a notebook you should write 'from bokeh.io import output_notebook'" source = ColumnDataSource( data=dict(x=reduced[:, 0], y=reduced[:, 1], label=[ names[i] + "origin :" + self.types[x] for i, x in enumerate(labels) ], color=colorslab)) output_notebook() hover = HoverTool(tooltips=[ ("label", "@label"), ]) p = figure(title="T-sne plot of the PC values", tools=[ hover, BoxZoomTool(), WheelZoomTool(), SaveTool(), ResetTool() ]) p.circle(x='x', y='y', source=source, color='color') show(p) output_file(self.classifier + "plot.html") save(p) return p else: fig = plt.figure() ax = fig.add_subplot(111) ax.scatter(reduced[:, 0], reduced[:, 1], c=colorslab) plt.show()
y = np.array(g(X) > 0, dtype=int) # Instanciate and fit Gaussian Process Model kernel = C(0.1, (1e-5, np.inf)) * DotProduct(sigma_0=0.1) ** 2 gp = GaussianProcessClassifier(kernel=kernel) gp.fit(X, y) print("Learned kernel: %s " % gp.kernel_) # Evaluate real function and the predicted probability res = 50 x1, x2 = np.meshgrid(np.linspace(- lim, lim, res), np.linspace(- lim, lim, res)) xx = np.vstack([x1.reshape(x1.size), x2.reshape(x2.size)]).T y_true = g(xx) y_prob = gp.predict_proba(xx)[:, 1] y_true = y_true.reshape((res, res)) y_prob = y_prob.reshape((res, res)) # Plot the probabilistic classification iso-values fig = plt.figure(1) ax = fig.gca() ax.axes.set_aspect('equal') plt.xticks([]) plt.yticks([]) ax.set_xticklabels([]) ax.set_yticklabels([]) plt.xlabel('$x_1$') plt.ylabel('$x_2$') cax = plt.imshow(y_prob, cmap=cm.gray_r, alpha=0.8,
gp_opt.fit(X[:train_size], y[:train_size]) print("Log Marginal Likelihood (initial): %.3f" % gp_fix.log_marginal_likelihood(gp_fix.kernel_.theta)) print("Log Marginal Likelihood (optimized): %.3f" % gp_opt.log_marginal_likelihood(gp_opt.kernel_.theta)) print( "Accuracy: %.3f (initial) %.3f (optimized)" % ( accuracy_score(y[:train_size], gp_fix.predict(X[:train_size])), accuracy_score(y[:train_size], gp_opt.predict(X[:train_size])), ) ) print( "Log-loss: %.3f (initial) %.3f (optimized)" % ( log_loss(y[:train_size], gp_fix.predict_proba(X[:train_size])[:, 1]), log_loss(y[:train_size], gp_opt.predict_proba(X[:train_size])[:, 1]), ) ) # Plot posteriors plt.figure(0) plt.scatter(X[:train_size, 0], y[:train_size], c="k", label="Train data") plt.scatter(X[train_size:, 0], y[train_size:], c="g", label="Test data") X_ = np.linspace(0, 5, 100) plt.plot(X_, gp_fix.predict_proba(X_[:, np.newaxis])[:, 1], "r", label="Initial kernel: %s" % gp_fix.kernel_) plt.plot(X_, gp_opt.predict_proba(X_[:, np.newaxis])[:, 1], "b", label="Optimized kernel: %s" % gp_opt.kernel_) plt.xlabel("Feature") plt.ylabel("Class 1 probability") plt.xlim(0, 5)
optimizer=None) gp_fix.fit(X[:train_size], y[:train_size]) gp_opt = GaussianProcessClassifier(kernel=1.0 * RBF(length_scale=1.0)) gp_opt.fit(X[:train_size], y[:train_size]) print("Log Marginal Likelihood (initial): %.3f" % gp_fix.log_marginal_likelihood(gp_fix.kernel_.theta)) print("Log Marginal Likelihood (optimized): %.3f" % gp_opt.log_marginal_likelihood(gp_opt.kernel_.theta)) print("Accuracy: %.3f (initial) %.3f (optimized)" % (accuracy_score(y[:train_size], gp_fix.predict(X[:train_size])), accuracy_score(y[:train_size], gp_opt.predict(X[:train_size])))) print("Log-loss: %.3f (initial) %.3f (optimized)" % (log_loss(y[:train_size], gp_fix.predict_proba(X[:train_size])[:, 1]), log_loss(y[:train_size], gp_opt.predict_proba(X[:train_size])[:, 1]))) # Plot posteriors plt.figure(0) plt.scatter(X[:train_size, 0], y[:train_size], c='k', label="Train data", edgecolors=(0, 0, 0)) plt.scatter(X[train_size:, 0], y[train_size:], c='g', label="Test data", edgecolors=(0, 0, 0)) X_ = np.linspace(0, 5, 100) plt.plot(X_, gp_fix.predict_proba(X_[:, np.newaxis])[:, 1], 'r', label="Initial kernel: %s" % gp_fix.kernel_) plt.plot(X_, gp_opt.predict_proba(X_[:, np.newaxis])[:, 1], 'b', label="Optimized kernel: %s" % gp_opt.kernel_) plt.xlabel("Feature")
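# The two fixed-vs-optimized comparisons above start from data and estimators that are
# not shown: X, y, train_size and the fixed-kernel classifier gp_fix. The setup sketched
# below follows the scikit-learn GPC example this code appears to be based on; the exact
# constants (sample count, threshold, train_size) are assumptions.
import numpy as np
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.gaussian_process.kernels import RBF

rng = np.random.RandomState(0)
X = rng.uniform(0, 5, 100)[:, np.newaxis]   # single 1-D feature
y = np.array(X[:, 0] > 2.5, dtype=int)      # binary labels
train_size = 50

# Fixed hyperparameters: keep the kernel as given (optimizer=None)
gp_fix = GaussianProcessClassifier(kernel=1.0 * RBF(length_scale=1.0),
                                   optimizer=None)
gp_fix.fit(X[:train_size], y[:train_size])

# Optimized hyperparameters: maximize the log marginal likelihood
gp_opt = GaussianProcessClassifier(kernel=1.0 * RBF(length_scale=1.0))
gp_opt.fit(X[:train_size], y[:train_size])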
def plot(df, options): UNIQ_GROUPS = df.group.unique() UNIQ_GROUPS.sort() sns.set_style("white") grppal = sns.color_palette("Set2", len(UNIQ_GROUPS)) print '# UNIQ GROUPS', UNIQ_GROUPS cent_stats = df.groupby( ['position', 'group', 'side']).apply(stats_per_group) cent_stats.reset_index(inplace=True) import time from sklearn import preprocessing from sklearn.gaussian_process import GaussianProcessRegressor, GaussianProcessClassifier from sklearn.gaussian_process.kernels import Matern, WhiteKernel, ExpSineSquared, ConstantKernel, RBF ctlDF = cent_stats[ cent_stats['group'] == 0 ] TNRightDF = cent_stats[ cent_stats['group'] != 0] TNRightDF = TNRightDF[TNRightDF['side'] == 'right'] dataDf = pd.concat([ctlDF, TNRightDF], ignore_index=True) print dataDf yDf = dataDf['group'] == 0 yDf = yDf.astype(int) y = yDf.values print y print y.shape XDf = dataDf[['position', 'values']] X = XDf.values X = preprocessing.scale(X) print X print X.shape # kernel = ConstantKernel() + Matern(length_scale=mean, nu=3 / 2) + \ # WhiteKernel(noise_level=1e-10) kernel = 1**2 * Matern(length_scale=1, nu=1.5) + \ WhiteKernel(noise_level=0.1) figure = plt.figure(figsize=(10, 6)) stime = time.time() gp = GaussianProcessClassifier(kernel) gp.fit(X, y) print gp.kernel_ print gp.log_marginal_likelihood() print("Time for GPR fitting: %.3f" % (time.time() - stime)) # create a mesh to plot in h = 0.1 x_min, x_max = X[:, 0].min() - 1, X[:, 0].max() + 1 y_min, y_max = X[:, 1].min() - 1, X[:, 1].max() + 1 xx, yy = np.meshgrid(np.arange(x_min, x_max, h), np.arange(y_min, y_max, h)) plt.figure(figsize=(10, 5)) # Plot the predicted probabilities. For that, we will assign a color to # each point in the mesh [x_min, m_max]x[y_min, y_max]. Z = gp.predict_proba(np.c_[xx.ravel(), yy.ravel()]) Z = Z[:,1] print Z print Z.shape # Put the result into a color plot Z = Z.reshape((xx.shape[0], xx.shape[1])) print Z.shape plt.imshow(Z, extent=(x_min, x_max, y_min, y_max), origin="lower") # Plot also the training points plt.scatter(X[:, 0], X[:, 1], c=np.array(["r", "g"])[y]) plt.xlabel('position') plt.ylabel('normalized val') plt.xlim(xx.min(), xx.max()) plt.ylim(yy.min(), yy.max()) plt.xticks(()) plt.yticks(()) plt.title("%s, LML: %.3f" % ("TN vs. Control", gp.log_marginal_likelihood(gp.kernel_.theta))) plt.tight_layout() if options.title: plt.suptitle(options.title) if options.output: plt.savefig(options.output, dpi=150) if options.is_show: plt.show()