def SVM_fit(X_in, y_in, X_out, gamma, C):
    M = len(X_in[0])  # Number of features
    seed(time())
    # To prevent data snooping, breaks the input set into train, cross-validation
    # and test sets, with sizes proportional to 8-1-1.
    # First puts aside 10% of the data for the tests.
    test_indices, train_indices = split_indices(len(X_in), int(round(0.1 * len(X_in))))
    shuffle(X_in, y_in)
    X_test = [X_in[i] for i in test_indices]
    y_test = [y_in[i] for i in test_indices]
    X_in = [X_in[i] for i in train_indices]
    y_in = [y_in[i] for i in train_indices]
    # Scale data first
    scaler = Scaler(copy=False)  # copy=False => in-place modification
    # Normalizes the data and stores the mean and standard deviation as inner
    # parameters. To avoid data snooping, normalization is fit on the held-out
    # split only, then applied to the remaining data.
    scaler.fit(X_test)
    X_in = scaler.transform(X_in)
    X_test = scaler.transform(X_test)
    X_out = scaler.transform(X_out)  # Uses the same transformation (same mean_ and std_) fit above
    std_test = X_test.std(axis=0)
    f_indices = [j for j in range(M) if std_test[j] > 1e-7]  # Removes features with (near-)zero variance
    X_in = [[X_in[i][j] for j in f_indices] for i in range(len(X_in))]
    X_test = [[X_test[i][j] for j in f_indices] for i in range(len(X_test))]
    X_out = [[X_out[i][j] for j in f_indices] for i in range(len(X_out))]
    M = len(f_indices)
    # Fits an RBF SVM with the given hyper-parameters on the remaining data.
    svc = svm.SVC(kernel='rbf', C=C, gamma=gamma, verbose=False,
                  cache_size=4092, tol=1e-5)
    svc.fit(X_in, y_in)
    y_out = svc.predict(X_out)
    return y_out
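# NOTE: several snippets in this collection call helpers that are not shown
# (split_indices and a two-list shuffle). What follows is a hypothetical
# reconstruction of those two helpers, inferred only from how they are
# called; the original implementations may differ.
from random import randrange, sample

def split_indices(n, size):
    """Split range(n) into a held-out set of `size` indices and the rest."""
    held_out = set(sample(range(n), size))
    rest = [i for i in range(n) if i not in held_out]
    return sorted(held_out), rest

def shuffle(X, y):
    """Shuffle two parallel lists in place with the same permutation."""
    for i in range(len(X) - 1, 0, -1):
        j = randrange(i + 1)
        X[i], X[j] = X[j], X[i]
        y[i], y[j] = y[j], y[i]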
def data_to_kernels(tr_data, te_data):
    # Standardize in place, then apply power- and L2-normalization.
    scaler = Scaler(copy=False)
    scaler.fit_transform(tr_data)
    #tr_data, mu, sigma = standardize(tr_data)
    tr_data = power_normalize(tr_data, 0.5)
    tr_data = L2_normalize(tr_data)

    # Apply the transformation fitted on the training data to the test data.
    #te_data, _, _ = standardize(te_data, mu, sigma)
    scaler.transform(te_data)
    te_data = power_normalize(te_data, 0.5)
    te_data = L2_normalize(te_data)

    # Linear (dot-product) kernels: train x train and test x train.
    tr_kernel = np.dot(tr_data, tr_data.T)
    te_kernel = np.dot(te_data, tr_data.T)
    return tr_kernel, te_kernel
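# Minimal usage sketch for the kernels built above, assuming tr and te are
# float arrays and y_tr the training labels (names are illustrative, not from
# the original code). An SVC with kernel='precomputed' accepts exactly these
# Gram matrices: (n_train, n_train) at fit time, (n_test, n_train) at predict.
from sklearn import svm

tr_kernel, te_kernel = data_to_kernels(tr, te)
clf = svm.SVC(kernel='precomputed')
clf.fit(tr_kernel, y_tr)        # train kernel: dot(tr_data, tr_data.T)
y_pred = clf.predict(te_kernel)  # test kernel: dot(te_data, tr_data.T)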
def process_data(self):
    test = pandas.read_csv("test.csv")
    testMat = test.as_matrix()
    train = pandas.read_csv("train.csv")
    trainMat = train.as_matrix()
    trainResult = trainMat[:, 0]
    trainMat = trainMat[:, 1:]
    # trainInd = np.where(trainResult == 0)[0]
    # how_many = (trainResult == 1).sum() - len(trainInd)
    # np.random.shuffle(trainInd)
    # addedResult = trainResult[trainInd[:how_many],:]
    # addedData = trainMat[trainInd[:how_many],:]
    # trainResult = np.append(trainResult,addedResult)
    # trainMat = np.vstack((trainMat,addedData))
    cv = StratifiedKFold(trainResult, 2)
    # cv = KFold(n=trainResult.shape[0], k=2)
    reduceFeatures = ExtraTreesClassifier(compute_importances=True,
                                          random_state=1234,
                                          n_jobs=self.cpus,
                                          n_estimators=1000,
                                          criterion="gini")
    reduceFeatures.fit(trainMat, trainResult)
    trainScaler = Scaler()
    self.cv_data = []
    self.cv_data_nonreduced = []
    for train, test in cv:
        # trainResult is 1-D, so it is indexed with the fold indices only.
        X_train, X_test = trainMat[train, :], trainMat[test, :]
        Y_train, Y_test = trainResult[train], trainResult[test]
        X_train = trainScaler.fit_transform(X_train)
        X_test = trainScaler.transform(X_test)
        self.cv_data_nonreduced.append((X_train, X_test, Y_train, Y_test))
        X_train = reduceFeatures.transform(X_train)
        X_test = reduceFeatures.transform(X_test)
        self.cv_data.append((X_train, X_test, Y_train, Y_test))
    testMat = trainScaler.transform(testMat)
    self.testMat_nonreduced = testMat
    self.testMat = reduceFeatures.transform(testMat)
    allData = self.testMat, self.cv_data, self.testMat_nonreduced, self.cv_data_nonreduced
    data_handle = open("allData.pkl", "wb")  # binary mode for pickle
    pickle.dump(allData, data_handle)
    data_handle.close()
def test_center_kernel():
    """Test that KernelCenterer is equivalent to Scaler in feature space"""
    X_fit = np.random.random((5, 4))
    scaler = Scaler(with_std=False)
    scaler.fit(X_fit)
    X_fit_centered = scaler.transform(X_fit)
    K_fit = np.dot(X_fit, X_fit.T)

    # center fit time matrix
    centerer = KernelCenterer()
    K_fit_centered = np.dot(X_fit_centered, X_fit_centered.T)
    K_fit_centered2 = centerer.fit_transform(K_fit)
    assert_array_almost_equal(K_fit_centered, K_fit_centered2)

    # center predict time matrix
    X_pred = np.random.random((2, 4))
    K_pred = np.dot(X_pred, X_fit.T)
    X_pred_centered = scaler.transform(X_pred)
    K_pred_centered = np.dot(X_pred_centered, X_fit_centered.T)
    K_pred_centered2 = centerer.transform(K_pred)
    assert_array_almost_equal(K_pred_centered, K_pred_centered2)
def test_scaler_without_centering():
    rng = np.random.RandomState(42)
    X = rng.randn(4, 5)
    X[:, 0] = 0.0  # first feature is always zero
    X_csr = sp.csr_matrix(X)

    scaler = Scaler(with_mean=False).fit(X)
    X_scaled = scaler.transform(X, copy=True)
    assert_false(np.any(np.isnan(X_scaled)))

    scaler_csr = Scaler(with_mean=False).fit(X_csr)
    X_csr_scaled = scaler_csr.transform(X_csr, copy=True)
    assert_false(np.any(np.isnan(X_csr_scaled.data)))

    assert_equal(scaler.mean_, scaler_csr.mean_)
    assert_array_almost_equal(scaler.std_, scaler_csr.std_)

    assert_array_almost_equal(X_scaled.mean(axis=0),
                              [0., -0.01, 2.24, -0.35, -0.78], 2)
    assert_array_almost_equal(X_scaled.std(axis=0), [0., 1., 1., 1., 1.])

    X_csr_scaled_mean, X_csr_scaled_std = mean_variance_axis0(X_csr_scaled)
    assert_array_almost_equal(X_csr_scaled_mean, X_scaled.mean(axis=0))
    assert_array_almost_equal(X_csr_scaled_std, X_scaled.std(axis=0))

    # Check that X has not been modified (copy)
    assert_true(X_scaled is not X)
    assert_true(X_csr_scaled is not X_csr)

    X_scaled_back = scaler.inverse_transform(X_scaled)
    assert_true(X_scaled_back is not X)
    assert_true(X_scaled_back is not X_scaled)
    assert_array_almost_equal(X_scaled_back, X)

    X_csr_scaled_back = scaler_csr.inverse_transform(X_csr_scaled)
    assert_true(X_csr_scaled_back is not X_csr)
    assert_true(X_csr_scaled_back is not X_csr_scaled)
    assert_array_almost_equal(X_csr_scaled_back.toarray(), X)
def test_scale_sparse_with_mean_raise_exception():
    rng = np.random.RandomState(42)
    X = rng.randn(4, 5)
    X_csr = sp.csr_matrix(X)

    # check scaling and fit with direct calls on sparse data
    assert_raises(ValueError, scale, X_csr, with_mean=True)
    assert_raises(ValueError, Scaler(with_mean=True).fit, X_csr)

    # check transform and inverse_transform after a fit on a dense array
    scaler = Scaler(with_mean=True).fit(X)
    assert_raises(ValueError, scaler.transform, X_csr)

    X_transformed_csr = sp.csr_matrix(scaler.transform(X))
    assert_raises(ValueError, scaler.inverse_transform, X_transformed_csr)
def run_real_data_experiments(nr_samples, delta, verbose=0, do_scatter_plot=False):
    dataset = Dataset('hollywood2', suffix='.per_slice.delta_%d' % delta,
                      nr_clusters=256)
    samples, _ = dataset.get_data('test')

    # Clamp the number of samples to [1, len(samples)].
    nr_samples = np.minimum(len(samples), nr_samples)
    nr_samples = np.maximum(1, nr_samples)

    if verbose > 2:
        print "Loading train data."
    tr_data, _, _ = load_sample_data(dataset, 'train', pi_derivatives=True)
    scaler = Scaler()
    scaler.fit(tr_data)

    true_values, approx_values = [], []
    for ii in xrange(nr_samples):
        if verbose > 2:
            sys.stdout.write("%s\r" % samples[ii].movie)
        data, _, _ = load_sample_data(dataset, str(samples[ii]), pi_derivatives=True)
        data = scaler.transform(data)
        L2_norm_true, L2_norm_approx = L2_approx(data)
        true_values.append(L2_norm_true)
        approx_values.append(L2_norm_approx)

    if verbose:
        print
        print_info(true_values, approx_values, verbose)
        print

    if do_scatter_plot:
        scatter_plot(true_values, approx_values)
labels = data[:, 0]
n_train = 35000
#n_val = n - n_train
n_val = 7000
trainset = records[:n_train, :]
trainlabels = labels[:n_train]
#valset = records[n_train:, :]
#vallabels = labels[n_train:, :]
valset = records[n_train:n_train + n_val, :]
vallabels = labels[n_train:n_train + n_val]
n, dim = trainset.shape

# mean centering, stdev normalization and whitening
scaler = Scaler()
scaler.fit(trainset)
trainset = scaler.transform(trainset)
valset = scaler.transform(valset)
pca = PCA(n_components=dim, whiten=True)
pca.fit(trainset)
trainset = pca.transform(trainset)
valset = pca.transform(valset)

config = Train_config()
config.iterations = 10
config.nonlinearity = 'tanh'
config.batchsize = 50
config.learning_rate = 0.2
config.momentum = 0.7
log = open('log.txt', 'w')
nn = Net([dim, 300, 10], log_file=log)
nn.fit(trainset, trainlabels, config, val_set=valset, val_labels=vallabels)
year_1 = [i.year for i in actual_dates_1]
year_2 = [i.year for i in actual_dates_2]
df_i['month_1'] = month_1
df_i['month_2'] = month_2
df_i['year_1'] = year_1
df_i['year_2'] = year_2
# Fill NAs with zero
train_sets.append(df_i.fillna(0))

# Log the response variables
for i in range(len(outcomes)):
    outcomes[i] = np.log(outcomes[i])

df_test_quants = df_test[quants]
scaled_test = scaler.transform(df_test_quants.fillna(0))
clusters_test = dpgmm.predict(scaled_test)
df_test['clusters'] = clusters_test
df_test = df_test.fillna(0)
time_deltas_1_test = [timedelta(int(i)) for i in df_test['Date_1'].values]
time_deltas_2_test = [timedelta(int(i)) for i in df_test['Date_2'].values]
actual_dates_1_test = [jan1 + i for i in time_deltas_1_test]
actual_dates_2_test = [jan1 + i for i in time_deltas_2_test]
month_1_test = [i.month for i in actual_dates_1_test]
month_2_test = [i.month for i in actual_dates_2_test]
year_1_test = [i.year for i in actual_dates_1_test]
year_2_test = [i.year for i in actual_dates_2_test]
df_test['month_1'] = month_1_test
df_test['month_2'] = month_2_test
df_test['year_1'] = year_1_test
def main():
    X = []
    Y = []
    featuresDB = Base(os.getcwd() + "\\Databases\\features.db")
    featuresDB.open()
    print "features open"

    for rec in featuresDB:
        # Note: f2, f8 and f9 are deliberately skipped.
        vec = [rec.f1, rec.f3, rec.f4, rec.f5, rec.f6, rec.f7,
               rec.f10, rec.f11, rec.f12, rec.f13, rec.f14, rec.f15,
               rec.f16, rec.f17, rec.f18, rec.f19, rec.f20, rec.f21,
               rec.f22, rec.f23]
        X.append(vec)
        Y.append(rec.score)

    print "building classifier"
    # Binarize the scores: below the mean -> class 1, otherwise class 2.
    Y = np.array(Y)
    ybar = Y.mean()
    for i in range(len(Y)):
        if Y[i] < ybar:
            Y[i] = 1
        else:
            Y[i] = 2

    scaler = Scaler().fit(X)
    X = scaler.transform(X)
    X = np.array(X)
    Y = np.array(Y)

    skf = cross_validation.StratifiedKFold(Y, k=2)
    for train, test in skf:
        X_train, X_test = X[train], X[test]
        y_train, y_test = Y[train], Y[test]
        clf = ExtraTreesClassifier(n_estimators=8, max_depth=None, min_split=1,
                                   random_state=0, compute_importances=True)
        scores = cross_validation.cross_val_score(clf, X_train, y_train, cv=5)
        clf.fit_transform(X_train, y_train)
        print "Accuracy: %0.4f (+/- %0.2f)" % (scores.mean(), scores.std() / 2)
        print clf.feature_importances_
        y_pred = clf.predict(X_test)
        print classification_report(y_test, y_pred)

    model = (scaler, clf)
    joblib.dump(model, 'AestheticModel\\aestheticModel.pkl')
    print "Done"
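# Hedged usage sketch for the model persisted above: reload the (scaler, clf)
# pair and score new feature vectors. X_new_raw is a hypothetical array of
# rows with the same 20 features used in main(), and the joblib import path
# is an assumption (old sklearn bundled joblib as sklearn.externals.joblib).
from sklearn.externals import joblib

scaler, clf = joblib.load('AestheticModel\\aestheticModel.pkl')
X_new = scaler.transform(X_new_raw)
y_new = clf.predict(X_new)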
class KMPBase(BaseEstimator):

    def __init__(self,
                 n_nonzero_coefs=0.3,
                 loss=None,
                 # components (basis functions)
                 init_components=None, n_components=None, check_duplicates=False,
                 scale=False, scale_y=False,
                 # back-fitting
                 n_refit=5, estimator=None,
                 # metric
                 metric="linear", gamma=0.1, coef0=1, degree=4,
                 # validation
                 X_val=None, y_val=None, n_validate=1, epsilon=0, score_func=None,
                 # misc
                 random_state=None, verbose=0, n_jobs=1):
        if n_nonzero_coefs < 0:
            raise AttributeError("n_nonzero_coefs should not be negative.")
        self.n_nonzero_coefs = n_nonzero_coefs
        self.loss = loss
        self.init_components = init_components
        self.n_components = n_components
        self.check_duplicates = check_duplicates
        self.scale = scale
        self.scale_y = scale_y
        self.n_refit = n_refit
        self.estimator = estimator
        self.metric = metric
        self.gamma = gamma
        self.coef0 = coef0
        self.degree = degree
        self.X_val = X_val
        self.y_val = y_val
        self.n_validate = n_validate
        self.epsilon = epsilon
        self.score_func = score_func
        self.random_state = random_state
        self.verbose = verbose
        self.n_jobs = n_jobs

    def _kernel_params(self):
        return {"gamma": self.gamma,
                "degree": self.degree,
                "coef0": self.coef0}

    def _get_estimator(self):
        if self.estimator is None:
            estimator = LinearRegression()
        else:
            estimator = clone(self.estimator)
        estimator.fit_intercept = False
        return estimator

    def _get_loss(self):
        if self.loss == "squared":
            return SquaredLoss()
        else:
            return None

    def _pre_fit(self, X, y):
        random_state = check_random_state(self.random_state)

        if self.scale_y:
            self.y_scaler_ = Scaler(copy=True).fit(y)
            y = self.y_scaler_.transform(y)

        if self.metric == "precomputed":
            self.components_ = None
            n_components = X.shape[1]
        else:
            if self.init_components is None:
                if self.verbose:
                    print "Selecting components..."
                self.components_ = select_components(X, y,
                                                     self.n_components,
                                                     random_state=random_state)
            else:
                self.components_ = self.init_components
            n_components = self.components_.shape[0]

        # A fractional n_nonzero_coefs in (0, 1] is interpreted as a
        # proportion of the number of components.
        n_nonzero_coefs = self.n_nonzero_coefs
        if 0 < n_nonzero_coefs <= 1:
            n_nonzero_coefs = int(n_nonzero_coefs * n_components)
        n_nonzero_coefs = int(n_nonzero_coefs)

        if n_nonzero_coefs > n_components:
            raise AttributeError("n_nonzero_coefs cannot be bigger than "
                                 "n_components.")

        if self.verbose:
            print "Computing dictionary..."
        start = time.time()
        K = pairwise_kernels(X, self.components_, metric=self.metric,
                             filter_params=True, n_jobs=self.n_jobs,
                             **self._kernel_params())
        if self.verbose:
            print "Done in", time.time() - start, "seconds"

        if self.scale:
            if self.verbose:
                print "Scaling dictionary"
            start = time.time()
            copy = True if self.metric == "precomputed" else False
            self.scaler_ = Scaler(copy=copy)
            K = self.scaler_.fit_transform(K)
            if self.verbose:
                print "Done in", time.time() - start, "seconds"

        # FIXME: this allocates a lot of intermediary memory
        norms = np.sqrt(np.sum(K ** 2, axis=0))

        return n_nonzero_coefs, K, y, norms

    def _fit_multi(self, K, y, Y, n_nonzero_coefs, norms):
        if self.verbose:
            print "Starting training..."
        start = time.time()
        coef = Parallel(n_jobs=self.n_jobs, verbose=self.verbose)(
            delayed(_run_iterator)(self._get_estimator(), self._get_loss(),
                                   K, Y[:, i], n_nonzero_coefs, norms,
                                   self.n_refit, self.check_duplicates)
            for i in xrange(Y.shape[1]))
        self.coef_ = np.array(coef)
        if self.verbose:
            print "Done in", time.time() - start, "seconds"

    def _score(self, y_true, y_pred):
        if self.score_func == "auc":
            return auc(y_true, y_pred)

        if hasattr(self, "lb_"):
            y_pred = self.lb_.inverse_transform(y_pred, threshold=0.5)
            if self.score_func is None:
                return np.mean(y_true == y_pred)
            else:
                return self.score_func(y_true, y_pred)
        else:
            # FIXME: no need to ravel y_pred if y_true is 2d!
            return -np.mean((y_true - y_pred.ravel()) ** 2)

    def _fit_multi_with_validation(self, K, y, Y, n_nonzero_coefs, norms):
        iterators = [FitIterator(self._get_estimator(), self._get_loss(),
                                 K, Y[:, i], n_nonzero_coefs, norms,
                                 self.n_refit, self.check_duplicates,
                                 self.verbose)
                     for i in xrange(Y.shape[1])]

        if self.verbose:
            print "Computing validation dictionary..."
        start = time.time()
        K_val = pairwise_kernels(self.X_val, self.components_,
                                 metric=self.metric, filter_params=True,
                                 n_jobs=self.n_jobs, **self._kernel_params())
        if self.verbose:
            print "Done in", time.time() - start, "seconds"
        if self.scale:
            K_val = self.scaler_.transform(K_val)

        y_val = self.y_val
        if self.scale_y:
            y_val = self.y_scaler_.transform(y_val)

        if self.verbose:
            print "Starting training..."
        start = time.time()

        best_score = -np.inf
        validation_scores = []
        training_scores = []
        iterations = []

        for i in xrange(1, n_nonzero_coefs + 1):
            iterators = [it.next() for it in iterators]
            #iterators = Parallel(n_jobs=self.n_jobs, verbose=self.verbose)(
            #        delayed(_run_iterator)(it) for it in iterators)
            coef = np.array([it.coef_ for it in iterators])
            y_train_pred = np.array([it.y_train_ for it in iterators]).T

            if i % self.n_validate == 0:
                if self.verbose >= 2:
                    print "Validating %d/%d..." % (i, n_nonzero_coefs)

                y_val_pred = np.dot(K_val, coef.T)
                validation_score = self._score(y_val, y_val_pred)
                training_score = self._score(y, y_train_pred)

                if validation_score > best_score:
                    self.coef_ = coef.copy()
                    best_score = np.abs(validation_score)

                validation_scores.append(np.abs(validation_score))
                training_scores.append(np.abs(training_score))
                iterations.append(i)

                if len(iterations) > 2 and self.epsilon > 0:
                    diff = (validation_scores[-1] - validation_scores[-2])
                    diff /= validation_scores[0]
                    if abs(diff) < self.epsilon:
                        if self.verbose:
                            print "Converged at iteration", i
                        break

        self.validation_scores_ = np.array(validation_scores)
        self.training_scores_ = np.array(training_scores)
        self.iterations_ = np.array(iterations)
        self.best_score_ = best_score

        if self.verbose:
            print "Done in", time.time() - start, "seconds"

    def _fit(self, K, y, Y, n_nonzero_coefs, norms):
        if self.X_val is not None and self.y_val is not None:
            meth = self._fit_multi_with_validation
        else:
            meth = self._fit_multi
        meth(K, y, Y, n_nonzero_coefs, norms)

    def _post_fit(self):
        if self.metric != "precomputed":
            used_basis = np.sum(self.coef_ != 0, axis=0, dtype=bool)
            self.coef_ = self.coef_[:, used_basis]
            self.components_ = self.components_[used_basis]

    def decision_function(self, X):
        K = pairwise_kernels(X, self.components_, metric=self.metric,
                             filter_params=True, n_jobs=self.n_jobs,
                             **self._kernel_params())
        if self.scale:
            K = self.scaler_.transform(K)

        pred = np.dot(K, self.coef_.T)

        if self.scale_y:
            pred = self.y_scaler_.inverse_transform(pred)

        return pred
def tree_train(X_in, y_in, X_out, min_meaningful_features_ratio=1., file_log=None):
    if file_log:
        file_log.writelines('# of Samples: {}, # of Features: {}\n'.format(
            len(X_in), len(X_in[0])))
    M = len(X_in[0])  # Number of features
    seed(time())
    # To prevent data snooping, breaks the input set into train, cross-validation
    # and test sets, with sizes proportional to 8-1-1.
    # First puts aside 10% of the data for the tests.
    test_indices, train_indices = split_indices(len(X_in), int(round(0.1 * len(X_in))))
    X_scaler = [X_in[i] for i in test_indices]
    y_scaler = [y_in[i] for i in test_indices]
    X_in = [X_in[i] for i in train_indices]
    y_in = [y_in[i] for i in train_indices]
    # Scale data first
    scaler = Scaler(copy=False)  # copy=False => in-place modification
    # Normalizes the data and stores the mean and standard deviation as inner
    # parameters. To avoid data snooping, normalization is fit on the held-out
    # split only, then applied to the remaining data.
    scaler.fit(X_scaler)
    X_scaler = scaler.transform(X_scaler)
    X_in = scaler.transform(X_in)
    X_out = scaler.transform(X_out)  # Uses the same transformation (same mean_ and std_) fit above
    std_test = X_scaler.std(axis=0)
    f_indices = [j for j in range(M) if std_test[j] > 1e-7]  # Removes features with (near-)zero variance
    X_in = [[X_in[i][j] for j in f_indices] for i in range(len(X_in))]
    X_scaler = [[X_scaler[i][j] for j in f_indices] for i in range(len(X_scaler))]
    X_out = [[X_out[i][j] for j in f_indices] for i in range(len(X_out))]
    M = len(f_indices)

    # Then, on the remaining data, performs a ten-fold cross-validation over
    # the number of features considered.
    best_cv_accuracy = 0.
    best_features_number = M
    for features_number in range(int(floor(M * min_meaningful_features_ratio)), M + 1):
        # kfold = cross_validation.KFold(len(y_in), k=10, shuffle=True)
        kfold = cross_validation.StratifiedKFold(y_in, k=10)
        svc = ExtraTreesClassifier(criterion='entropy', max_features=features_number)
        in_accuracy = 0.
        cv_accuracy = 0.
        for t_indices, cv_indices in kfold:
            X_train = array([[X_in[i][j] for j in range(M)] for i in t_indices])
            y_train = [y_in[i] for i in t_indices]
            X_cv = array([[X_in[i][j] for j in range(M)] for i in cv_indices])
            y_cv = [y_in[i] for i in cv_indices]
            svc.fit(X_train, y_train)
            in_accuracy += svc.score(X_train, y_train)
            cv_accuracy += svc.score(X_cv, y_cv)
        in_accuracy /= kfold.k
        cv_accuracy /= kfold.k
        if file_log:
            file_log.writelines('# of features: {}\n'.format(len(X_train[0])))
            file_log.writelines('\tEin= {}\n'.format(1. - in_accuracy))
            file_log.writelines('\tEcv= {}\n'.format(1. - cv_accuracy))
        if cv_accuracy > best_cv_accuracy:
            best_features_number = features_number
            best_cv_accuracy = cv_accuracy

    # Now tests the out-of-sample error, retraining with the best
    # max_features value found by cross-validation.
    if file_log:
        file_log.writelines('\nBEST result: E_cv={}, t={}\n'.format(
            1. - best_cv_accuracy, best_features_number))
    svc = ExtraTreesClassifier(criterion='entropy', max_features=best_features_number)
    svc.fit(X_in, y_in)
    if file_log:
        file_log.writelines('Ein= {}\n'.format(1. - svc.score(X_in, y_in)))
        file_log.writelines('Etest= {}\n'.format(1. - svc.score(X_scaler, y_scaler)))
    y_out = svc.predict(X_out)
    return y_out
def Logistic_train(X_in, y_in, X_out, cs, file_log=None):
    if file_log:
        file_log.writelines('# of Samples: {}, # of Features: {}\n'.format(
            len(X_in), len(X_in[0])))
    M = len(X_in[0])  # Number of features
    seed(time())
    # To prevent data snooping, breaks the input set into train, cross-validation
    # and test sets, with sizes proportional to 8-1-1.
    # First puts aside 10% of the data for the tests.
    test_indices, train_indices = split_indices(len(X_in), int(round(0.1 * len(X_in))))
    X_scaler = [X_in[i] for i in test_indices]
    y_scaler = [y_in[i] for i in test_indices]
    X_in = [X_in[i] for i in train_indices]
    y_in = [y_in[i] for i in train_indices]
    # Scale data first
    scaler = Scaler(copy=False)  # copy=False => in-place modification
    # Normalizes the data and stores the mean and standard deviation as inner
    # parameters. To avoid data snooping, normalization is fit on the held-out
    # split only, then applied to the remaining data.
    scaler.fit(X_scaler)
    X_scaler = scaler.transform(X_scaler)
    X_in = scaler.transform(X_in)
    X_out = scaler.transform(X_out)  # Uses the same transformation (same mean_ and std_) fit above
    std_test = X_scaler.std(axis=0)
    f_indices = [j for j in range(M) if std_test[j] > 1e-7]  # Removes features with (near-)zero variance
    X_in = [[X_in[i][j] for j in f_indices] for i in range(len(X_in))]
    X_scaler = [[X_scaler[i][j] for j in f_indices] for i in range(len(X_scaler))]
    X_out = [[X_out[i][j] for j in f_indices] for i in range(len(X_out))]
    M = len(X_in[0])

    # Then, on the remaining data, performs a ten-fold cross-validation over
    # the regularization parameter C.
    best_cv_accuracy = 0.
    best_c = 0.
    for c in cs:
        kfold = cross_validation.StratifiedKFold(y_in, k=10)
        lrc = LogisticRegression(C=c, tol=1e-5)
        in_accuracy = 0.
        cv_accuracy = 0.
        for t_indices, cv_indices in kfold:
            X_train = array([X_in[i][:] for i in t_indices])
            y_train = [y_in[i] for i in t_indices]
            X_cv = array([X_in[i][:] for i in cv_indices])
            y_cv = [y_in[i] for i in cv_indices]
            lrc.fit(X_train, y_train)
            in_accuracy += lrc.score(X_train, y_train)
            cv_accuracy += lrc.score(X_cv, y_cv)
        in_accuracy /= kfold.k
        cv_accuracy /= kfold.k
        if file_log:
            file_log.writelines('C: {}\n'.format(c))
            file_log.writelines('\tEin= {}\n'.format(1. - in_accuracy))
            file_log.writelines('\tEcv= {}\n'.format(1. - cv_accuracy))
        if cv_accuracy > best_cv_accuracy:
            best_c = c
            best_cv_accuracy = cv_accuracy

    # Now tests the out-of-sample error.
    if file_log:
        file_log.writelines('\nBEST result: E_cv={}, C={}\n'.format(
            1. - best_cv_accuracy, best_c))
    lrc = LogisticRegression(C=best_c, tol=1e-5)
    lrc.fit(X_in, y_in)
    if file_log:
        file_log.writelines('Ein= {}\n'.format(1. - lrc.score(X_in, y_in)))
        file_log.writelines('Etest= {}\n'.format(1. - lrc.score(X_scaler, y_scaler)))
    y_out = lrc.predict(X_out)
    return y_out
all_folds[split, fold, test] = 0
for d in range(0, dims.shape[0]):
    Xtrain = Xm_shfl[train, :, dims[d]]
    ytrain = y_shfl[train]
    sw_train = sw_shfl[train]
    # (deal with NaN in training: drop trials whose features are all-NaN)
    train_nan = np.isnan(np.nansum(Xtrain, axis=1))
    ytrain = ytrain[~train_nan]
    sw_train = sw_train[~train_nan]
    Xtrain = Xtrain[~train_nan, :]
    if np.unique(ytrain).shape[0] > 1:
        # feature selection (find the 50% most discriminative channels)
        fs.fit(Xtrain, ytrain)  # fit the univariate selector
        Xtrain = fs.transform(Xtrain)  # remove unnecessary channels
        # normalization
        scaler.fit(Xtrain)  # estimate mean and std
        Xtrain = scaler.transform(Xtrain)  # apply z-score
        # SVM fit
        clf.fit(Xtrain, ytrain, sample_weight=sw_train)
        # retrieve hyperplane feature identification
        coef[split, fold, dims[d], :, :] = 0  # initialize
        # --- univariate
        uni_features = fs.pvalues_ <= stats.scoreatpercentile(fs.pvalues_, fs.percentile)
        # --- multivariate
        coef[split, fold, dims[d], :, uni_features] = clf.coef_.T
        # predict cross val (deal with NaN in testing)
        Xtest = Xm_shfl[test, :, dims[d]]
        test_nan = np.isnan(np.nansum(Xtest, axis=1))
        Xtest = fs.transform(Xtest)
        Xtest = scaler.transform(Xtest)
        if (Xtest.shape[0] - np.sum(test_nan)) > 0:
            if compute_predict:
def SVM_train(X_in, y_in, X_out, gammas, cs, file_log=None):
    if file_log:
        file_log.writelines('# of Samples: {}, # of Features: {}\n'.format(
            len(X_in), len(X_in[0])))
    M = len(X_in[0])  # Number of features
    seed(time())
    # To prevent data snooping, breaks the input set into train, cross-validation
    # and scale sets, with sizes proportional to 8-1-1.
    # First puts aside 10% of the data for the tests.
    scale_set_indices, train_indices = split_indices(len(X_in), int(round(0.1 * len(X_in))))
    # shuffle(X_in, y_in)
    X_scale = [X_in[i] for i in scale_set_indices]
    y_scale = [y_in[i] for i in scale_set_indices]
    X_in = [X_in[i] for i in train_indices]
    y_in = [y_in[i] for i in train_indices]
    # Scale data first
    scaler = Scaler(copy=False)  # WARNING: copy=False => in-place modification
    # Normalizes the data and stores the mean and standard deviation as inner
    # parameters. To avoid data snooping, normalization is computed on a
    # separate subset only, and then applied to the rest of the data.
    scaler.fit(X_scale)
    X_scale = scaler.transform(X_scale)
    X_in = scaler.transform(X_in)
    X_out = scaler.transform(X_out)  # Uses the same transformation (same mean_ and std_) fit above
    std_test = X_scale.std(axis=0)
    f_indices = [j for j in range(M) if std_test[j] > 1e-7]  # Removes features with (near-)zero variance
    X_in = [[X_in[i][j] for j in f_indices] for i in range(len(X_in))]
    X_scale = [[X_scale[i][j] for j in f_indices] for i in range(len(X_scale))]
    X_out = [[X_out[i][j] for j in f_indices] for i in range(len(X_out))]
    if file_log:
        file_log.writelines('Initial features: {}, Features used: {}\n'.format(
            M, len(X_in[0])))
    M = len(f_indices)

    # Then, on the remaining data, performs a ten-fold cross-validation over
    # the (C, gamma) grid.
    best_cv_accuracy = 0.
    best_gamma = 0.
    best_c = 0.
    for c in cs:
        for g in gammas:
            # Balanced cross-validation (keeps the ratio of the two classes as
            # constant as possible across the k folds).
            kfold = cross_validation.StratifiedKFold(y_in, k=10)
            svc = svm.SVC(kernel='rbf', C=c, gamma=g, verbose=False,
                          cache_size=4092, tol=1e-5)
            in_accuracy = 0.
            cv_accuracy = 0.
            for t_indices, cv_indices in kfold:
                X_train = array([X_in[i][:] for i in t_indices])
                y_train = [y_in[i] for i in t_indices]
                X_cv = array([X_in[i][:] for i in cv_indices])
                y_cv = [y_in[i] for i in cv_indices]
                svc.fit(X_train, y_train)
                in_accuracy += svc.score(X_train, y_train)
                cv_accuracy += svc.score(X_cv, y_cv)
            in_accuracy /= kfold.k
            cv_accuracy /= kfold.k
            if file_log:
                file_log.writelines('C:{}, gamma:{}\n'.format(c, g))
                file_log.writelines('\tEin= {}\n'.format(1. - in_accuracy))
                file_log.writelines('\tEcv= {}\n'.format(1. - cv_accuracy))
            if cv_accuracy > best_cv_accuracy:
                best_gamma = g
                best_c = c
                best_cv_accuracy = cv_accuracy

    if file_log:
        file_log.writelines('\nBEST result: E_cv={}, C={}, gamma={}\n'.format(
            1. - best_cv_accuracy, best_c, best_gamma))
    svc = svm.SVC(kernel='rbf', C=best_c, gamma=best_gamma, verbose=False,
                  cache_size=4092, tol=1e-5)
    svc.fit(X_in, y_in)
    if file_log:
        file_log.writelines('Ein= {}\n'.format(1. - svc.score(X_in, y_in)))
        file_log.writelines('Etest= {}\n'.format(1. - svc.score(X_scale, y_scale)))
    y_out = svc.predict(X_out)
    #DEBUG: output = ['{} {:+}\n'.format(id_out[i], int(y_scale[i])) for i in range(len(X_out))]
    #DEBUG: file_log.writelines('------------------------')
    return y_out
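# Hypothetical invocation of SVM_train: the original calling code is not
# shown, so the log-spaced grids below are only a common default choice.
# X, y are the labelled data; X_unlabeled holds the points to classify.
gammas = [2 ** k for k in range(-15, 4, 2)]
cs = [2 ** k for k in range(-5, 16, 2)]
with open('svm_train.log', 'w') as log:
    y_pred = SVM_train(X, y, X_unlabeled, gammas, cs, file_log=log)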