def objective(params):
    # hyperopt samples floats; LightGBM expects these parameters as integers
    params['num_leaves'] = int(params['num_leaves'])
    params['bagging_freq'] = int(params['bagging_freq'])
    params['max_depth'] = int(params['max_depth'])
    skf = cross_validation.StratifiedKFold(
        y_train,          # Samples to split in K folds
        n_folds=5,        # Number of folds. Must be at least 2.
        shuffle=True,     # Whether to shuffle each stratification of the data before splitting into batches.
        random_state=423  # Pseudo-random number generator state used for shuffling.
    )
    boost_rounds = []
    score = []
    for train, test in skf:
        _train_x, _test_x, _train_y, _test_y = \
            x_train.iloc[train], x_train.iloc[test], y_train[train], y_train[test]
        train_lgb = lgb.Dataset(np.array(_train_x), np.array(_train_y))
        test_lgb = lgb.Dataset(np.array(_test_x), np.array(_test_y),
                               reference=train_lgb)
        model = lgb.train(params, train_lgb, num_boost_round=10000,
                          valid_sets=test_lgb, early_stopping_rounds=300)
        boost_rounds.append(model.best_iteration)
        score.append(model.best_score)
    # model.best_score is a nested dict {'valid_0': {metric: value}};
    # average the metric value across folds
    mean_score = np.mean(
        [list(score[k]['valid_0'].values())[0] for k in range(len(score))])
    return {'loss': mean_score, 'status': STATUS_OK}
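# Hypothetical driver for the hyperopt objective above (not in the original
# source): fmin minimizes the returned 'loss'. The search space and toy data
# are illustrative assumptions; the objective also expects lgb, np,
# cross_validation and STATUS_OK to be imported at module level.
import numpy as np
import pandas as pd
from hyperopt import Trials, fmin, hp, tpe

x_train = pd.DataFrame(np.random.rand(500, 10))  # toy feature frame
y_train = np.random.randint(0, 2, size=500)      # toy binary labels

space = {
    'objective': 'binary',
    'learning_rate': hp.loguniform('learning_rate', -5, 0),
    'num_leaves': hp.quniform('num_leaves', 8, 128, 1),
    'bagging_freq': hp.quniform('bagging_freq', 1, 10, 1),
    'max_depth': hp.quniform('max_depth', 3, 12, 1),
}
best = fmin(objective, space, algo=tpe.suggest, max_evals=50, trials=Trials())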
def test_kfold_can_detect_dependent_samples_on_digits():  # see #2372
    # The digits samples are dependent: they are apparently grouped by authors
    # although we don't have any information on the group segment locations
    # for this data. We can highlight this fact by computing k-fold cross-
    # validation with and without shuffling: we observe that the shuffling
    # case wrongly makes the IID assumption and is therefore too optimistic:
    # it estimates a much higher accuracy (around 0.96) than the non-shuffling
    # variant (around 0.86).
    digits = load_digits()
    X, y = digits.data[:800], digits.target[:800]
    model = SVC(C=10, gamma=0.005)
    n = len(y)

    cv = cval.KFold(n, 5, shuffle=False)
    mean_score = cval.cross_val_score(model, X, y, cv=cv).mean()
    assert_greater(0.88, mean_score)
    assert_greater(mean_score, 0.85)

    # Shuffling the data artificially breaks the dependency and hides the
    # overfitting of the model with regards to the writing style of the
    # authors by yielding a seriously overestimated score:
    cv = cval.KFold(n, 5, shuffle=True, random_state=0)
    mean_score = cval.cross_val_score(model, X, y, cv=cv).mean()
    assert_greater(mean_score, 0.95)

    cv = cval.KFold(n, 5, shuffle=True, random_state=1)
    mean_score = cval.cross_val_score(model, X, y, cv=cv).mean()
    assert_greater(mean_score, 0.95)

    # Similarly, StratifiedKFold should try to shuffle the data as little
    # as possible (while respecting the balanced class constraints)
    # and thus be able to detect the dependency by not overestimating
    # the CV score either. As the digits dataset is approximately balanced,
    # the estimated mean score is close to the score measured with
    # non-shuffled KFold.
    cv = cval.StratifiedKFold(y, 5)
    mean_score = cval.cross_val_score(model, X, y, cv=cv).mean()
    assert_greater(0.88, mean_score)
    assert_greater(mean_score, 0.85)
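# Note: the test above targets the pre-0.18 sklearn.cross_validation API
# (imported as cval). A rough modern equivalent, sketched here as an
# assumption, uses sklearn.model_selection, where splitters take n_splits
# and receive (X, y) at split time rather than in the constructor.
from sklearn.datasets import load_digits
from sklearn.model_selection import KFold, StratifiedKFold, cross_val_score
from sklearn.svm import SVC

digits = load_digits()
X, y = digits.data[:800], digits.target[:800]
model = SVC(C=10, gamma=0.005)

# Non-shuffled KFold detects the author dependency (score around 0.86)
score_kfold = cross_val_score(model, X, y, cv=KFold(n_splits=5)).mean()
# StratifiedKFold shuffles as little as possible, so it detects it as well
score_skfold = cross_val_score(model, X, y,
                               cv=StratifiedKFold(n_splits=5)).mean()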
def sample_fold_indices(table, folds=10, stratified=False, random_state=None):
    """
    :param Orange.data.Table table:
    :param int folds: Number of folds
    :param bool stratified: Return stratified indices (if applicable).
    :param Random random_state:
    :rtype: tuple of arrays
    :return: A tuple of array indices, one for each fold.
    """
    n = len(table)
    if stratified and is_discrete(table.domain.class_var):
        # XXX: StratifiedKFold does not support random_state
        ind = cross_validation.StratifiedKFold(
            table.Y.ravel(), folds,
            # random_state=random_state
        )
    else:
        ind = cross_validation.KFold(
            n, folds, shuffle=True, random_state=random_state
        )
    return tuple(ind)
def tune_parameters(data, labels):
    """ Tune the parameters using exhaustive grid search """
    # set cv here, why not
    cv = cross_validation.StratifiedKFold(labels, n_folds=5, shuffle=True)
    pipeline = Pipeline([('normaliser', preprocessing.Normalizer()),
                         ('svm', SVC(kernel='poly', gamma=1,
                                     cache_size=1000))])
    # can test multiple kernels as well if desired
    #param_grid = [{'kernel': 'poly', 'coef0': [1, 5, 10, 20], 'degree': [2, 3, 4, 5, 10]}]
    param_grid = [{'svm__coef0': [1, 2, 3, 4, 5],
                   'svm__degree': [2, 3, 4, 5]}]
    clf = GridSearchCV(pipeline, param_grid, n_jobs=-1, cv=cv)
    clf.fit(data, labels)
    print 'best parameters found:'
    print clf.best_estimator_
    return clf.best_estimator_
def main():
    X, Y = utils.read_data("../files/train_10.csv")
    n_target = len(set(Y))
    Y = map(int, Y)
    folds = 5
    stf = cross_validation.StratifiedKFold(Y, folds)
    loss = []
    accs = []
    classMap = sorted(list(set(Y)))
    X, Y = np.array(X), np.array(Y)
    print "Testing..."
    for i, (train, test) in enumerate(stf):
        X_train, X_test, y_train, y_test = X[train], X[test], Y[train], Y[test]
        # Baseline: a small constant probability for every class
        probs = [[0.001 for x in range(n_target)] for y in range(len(y_test))]
        loss.append(utils.logloss(probs, y_test, classMap))
        accs.append(utils.accuracy([1] * len(y_test), y_test))
        print "Accuracy(Fold {0}): ".format(i) + str(accs[-1])
        print "Loss(Fold {0}): ".format(i) + str(loss[-1])
    print "Mean Accuracy: " + str(np.mean(accs))
    print "Mean Loss: " + str(np.mean(loss))
def __init__(self, dataframe, base_cv=None, **cv_kwargs):
    # We create a copy of the dataframe with a new last-level index,
    # which is an enumeration of the rows (like proper indices)
    self.all_segments = pd.DataFrame({
        'Preictal': dataframe['Preictal'],
        'i': np.arange(len(dataframe))
    })
    self.all_segments.set_index('i', append=True, inplace=True)

    # Now create a series with only the segments as rows. This is what we
    # will pass into the wrapped cross-validation generator.
    self.segments = self.all_segments['Preictal'].groupby(
        level='segment').first()
    self.segments.sort(inplace=True)

    if base_cv is None:
        self.cv = cross_validation.StratifiedKFold(self.segments, **cv_kwargs)
    else:
        self.cv = base_cv(self.segments, **cv_kwargs)
def train_classifier(predictors, response,
                     feature_names=relevant_feature_names,
                     tuned_clf=Clf.LINEAR_SVC, param_grid=None, test_size=0.5,
                     scoring=weighted_f1, random_state=0):
    param_grid = param_grid or default_param_grid(tuned_clf)
    kf_cv = cross_validation.StratifiedKFold(response, n_folds=10,
                                             shuffle=True,
                                             random_state=random_state)
    cv_clf = GridSearchCV(estimator=tuned_clf, param_grid=param_grid,
                          cv=kf_cv, scoring=scoring)
    cv_clf.fit(predictors, response)
    return cv_clf
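# Self-contained sketch of exercising a classifier the same way as
# train_classifier above, assuming scikit-learn < 0.20 (where
# sklearn.cross_validation and sklearn.grid_search still exist).
# Clf.LINEAR_SVC and weighted_f1 are project-specific, so plain substitutes
# are used here.
import numpy as np
from sklearn import cross_validation
from sklearn.grid_search import GridSearchCV
from sklearn.svm import LinearSVC

rng = np.random.RandomState(0)
predictors = rng.rand(100, 5)
response = rng.randint(0, 2, size=100)

kf_cv = cross_validation.StratifiedKFold(response, n_folds=10, shuffle=True,
                                         random_state=0)
cv_clf = GridSearchCV(estimator=LinearSVC(), param_grid={'C': [0.1, 1, 10]},
                      cv=kf_cv, scoring='f1')
cv_clf.fit(predictors, response)
print(cv_clf.best_params_)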
def tune_parameters(data, labels):
    """ Tune the parameters using exhaustive grid search """
    # set cv here, why not
    cv = cross_validation.StratifiedKFold(labels, n_folds=5, shuffle=True)
    pipeline = Pipeline([('normaliser', preprocessing.Normalizer()),
                         ('svm', SVC(kernel='poly', cache_size=1000))])
    param_grid = [{'svm__coef0': [1, 2, 3, 4, 5],
                   'svm__degree': [2, 3, 4, 5],
                   'svm__C': [1, 2],
                   'svm__gamma': [0, 1]}]
    print 'tuning params'
    clf = GridSearchCV(pipeline, param_grid, n_jobs=-1, cv=cv)
    clf.fit(data, labels)
    print 'best parameters found:'
    print clf.best_estimator_
    return clf.best_estimator_
def cv_run(rd, X, y):
    print "X:", X.shape, "y:", y.shape
    n_cv = 16
    #cv1 = cross_validation.KFold(len(y), n_folds=n_cv, random_state=random_state)
    cv1 = cross_validation.StratifiedKFold(y, n_folds=n_cv)
    scores = cross_validation.cross_val_score(rd, X, y, cv=cv1,
                                              scoring='roc_auc',
                                              n_jobs=-1, verbose=1)
    print "scores:", scores
    print "%d Fold CV Score: %.6f +- %.4f" % (
        n_cv, np.mean(scores), 2 * np.std(scores))
def get_comb_models(traindata, targets, crossval=True):
    # traindata: list of NumExamples * NumOutputs(=10) arrays, one per
    # preprocessor; reshape to NumExamples * [NumPreprocessors * NumOutputs]
    traindata = np.array(traindata).transpose((1, 0, 2))
    traindata = np.reshape(traindata, [traindata.shape[0], -1])
    # targets need to be class indices, not one-hot
    targets = targets.argmax(axis=1)

    models = [
        linear_model.LogisticRegression(penalty='l1', dual=False, C=5.,
                                        fit_intercept=False),
        linear_model.LogisticRegression(penalty='l2', dual=False, C=10.,
                                        fit_intercept=False),
        linear_model.LogisticRegression(penalty='l2', dual=False, C=20.,
                                        fit_intercept=True)
    ]

    if crossval:
        # use StratifiedKFold, because survived 0/1 is not evenly distributed
        cv = cross_validation.StratifiedKFold(targets, n_folds=5)

    scores = [0] * len(models)
    for ii in range(len(models)):
        if crossval:
            # get scores
            scores[ii] = cross_validation.cross_val_score(
                models[ii], traindata, targets,
                cv=cv, n_jobs=1, scoring='accuracy')
            print "Cross-validation accuracy on the training set for model %d:" % ii
            print "%0.3f (+/-%0.03f)" % (scores[ii].mean(),
                                         scores[ii].std() / 2)
        else:
            models[ii].fit(traindata, targets)
    return models
def repeated_cross_fold_validation(models, n=10, k=5):
    """
    Run cross validation on a set of models n times.

    All models are tested using the same cross validation splits at each
    iteration.

    Args:
        models: List of dictionaries containing the model and training or
            testing data.
        n: number of iterations to repeat cross validation (default 10)
        k: number of folds to use at each iteration (default 5)

    Returns:
        A dict mapping model name to a scorer object of type
        ROCAnalysisScorer, one for each model passed.
    """
    scorers = {}
    for i in range(n):
        # create a new cross validation set for each iteration & test.
        skf = cross_validation.StratifiedKFold(models[0]['train_data'][1],
                                               n_folds=k)
        for model in models:
            model_name = model['name']
            if model_name not in scorers:
                scorers[model_name] = ROCAnalysisScorer()
            results = score_pipeline(model, cv=skf)
            # For each model, collect the results into a single scorer.
            # Note: no averaging is done at this stage. The results of each
            # of the k folds are collected into a single k * n list for
            # the model.
            scorers[model_name].f1scores_ += results[0].f1scores_
            scorers[model_name].f2scores_ += results[0].f2scores_
            scorers[model_name].fhalf_scores_ += results[0].fhalf_scores_
            scorers[model_name].rates_ += results[0].rates_
            scorers[model_name].aucs_ += results[0].aucs_
    return scorers
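# Expected shape of `models`, inferred from the code above; score_pipeline
# and ROCAnalysisScorer are project-specific, so this call site is only an
# illustrative sketch:
#
#   models = [
#       {'name': 'svm', 'model': svm_pipeline,
#        'train_data': (X_train, y_train)},
#       {'name': 'forest', 'model': forest_pipeline,
#        'train_data': (X_train, y_train)},
#   ]
#   scorers = repeated_cross_fold_validation(models, n=10, k=5)
#   mean_auc = np.mean(scorers['svm'].aucs_)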
def do_xgb_MOE(num_points_to_sample, X_train, y_train, verbose=True, **kwargs):
    # Find the best XGBoost parameters using MOE
    xgb_parameters = {}
    # Ranges of the XGBoost parameters that are optimized:
    # learning_rate in [0.1, 1]; n_estimators in [2, 1000] is normalized to
    # [0.002, 1]; max_depth in [1, 100] is normalized to [0.01, 1]
    exp_xgb = Experiment([[0.1, 1], [0.002, 1], [0.01, 1]])
    n_folds = 10
    cv_folds = cross_validation.StratifiedKFold(y_train, n_folds=n_folds)
    best_point = []
    best_point_value = 0.
    for _ in range(num_points_to_sample):
        # Use MOE to determine the point with the highest Expected Improvement
        # to sample next
        next_point_to_sample = gp_next_points(
            exp_xgb, rest_host='localhost', rest_port=6543,
            **kwargs)[0]  # By default we only ask for one point
        # Evaluate the objective function at that point
        xgb_parameters['learning_rate'] = next_point_to_sample[0]
        xgb_parameters['n_estimators'] = int(
            round(next_point_to_sample[1] * 1000))
        xgb_parameters['max_depth'] = int(round(next_point_to_sample[2] * 100))
        acc_cv, prec_cv, rec_cv, cm_cv, cm_full_cv = xgboost_cross_validation(
            X_train, y_train, xgb_parameters, cv_folds)
        value_of_next_point = acc_cv
        if value_of_next_point > best_point_value:
            best_point_value = value_of_next_point
            best_point = next_point_to_sample
        if verbose:
            print "Sampled f({0:s}) = {1:.18E}".format(
                str(next_point_to_sample), value_of_next_point)
        # Add the information about the point to the experiment's historical
        # data to inform the GP (MOE minimizes, so the value is negated;
        # 0.0001 adds some noise)
        exp_xgb.historical_data.append_sample_points(
            [SamplePoint(next_point_to_sample, -value_of_next_point, 0.0001)])
    # Map the normalized best point back to actual parameter values
    best_point[1] = int(round(best_point[1] * 1000))
    best_point[2] = int(round(best_point[2] * 100))
    return best_point, best_point_value
def compute_auc(gram_matrix, data, k=10, C=1.0):
    # NOTE: `data` holds the class labels corresponding to the rows of
    # gram_matrix (the original referenced an undefined `labels` here)
    kv = cross_validation.StratifiedKFold(data, n_folds=k)
    s = 0.0
    for train_index, test_index in kv:
        gm_train = gram_matrix[train_index, :]
        gm_train = gm_train[:, train_index]
        data_train = data[train_index]
        # libSVM wants the distances from test instances to all train
        # instances as input, see
        # http://stackoverflow.com/questions/10978261/libsvm-precomputed-kernels
        gm_test = gram_matrix[test_index, :]
        gm_test = gm_test[:, train_index]
        data_test = data[test_index]
        # Have to use libsvm directly here, because of a bug in sklearn with
        # precomputed gram matrices. libSVM's precomputed-kernel format needs
        # a leading serial number in each row.
        x = []
        for i in range(len(gm_train)):
            l = gm_train[i].tolist()
            l.insert(0, i + 1)
            x.append(l)
        prob = svmutil.svm_problem(data_train.tolist(), x, isKernel=True)
        param = svmutil.svm_parameter("-t 4 -c %f -q" % C)
        m = svmutil.svm_train(prob, param)
        xx = []
        for i in range(len(gm_test)):
            t = gm_test[i].tolist()
            t.insert(0, i + 1)
            xx.append(t)
        p_label, p_acc, p_val = svmutil.svm_predict(data_test.tolist(), xx, m)
        fpr, tpr, thresholds = roc_curve(data_test, p_val, pos_label=1.0)
        AUC = roc_auc_score(data_test, p_val)
        s += AUC
    return s / k
def generate_model(data, classes, args):
    # Define the parameters
    tuned_parameters = {'C': C_RANGE, 'class_weight': CLASS_WEIGHTS}

    # Define the classifier
    clf = linear_model.LogisticRegression(max_iter=SCORE_MAX_ITER,
                                          n_jobs=args.cores)
    print_verbose("Classifier: %s" % str(clf), 5)
    print_verbose("Parameters: %s" % str(tuned_parameters), 5)

    # Generate the K-fold development split
    skf = cross_validation.StratifiedKFold(classes, n_folds=SCORE_K_FOLD,
                                           shuffle=True)
    print_verbose("KFold: %s" % str(skf), 5)

    gscv = grid_search.GridSearchCV(clf, tuned_parameters, cv=skf,
                                    scoring='mean_squared_error', n_jobs=1,
                                    verbose=get_verbose_level())

    # Search
    print_verbose("GridSearch: %s" % str(gscv), 5)
    gscv.fit(data, classes)

    # Print scores
    print_verbose("GridSearch scores:", 5)
    for params, mean_score, scores in gscv.grid_scores_:
        print_verbose(
            "%0.6f (+/-%0.06f) for %r"
            % (mean_score, scores.std() / 2, params), 5)

    # Print best score
    print_verbose("GridSearch best score:", 0)
    print_verbose("%0.6f for %r" % (gscv.best_score_, gscv.best_params_), 0)

    return gscv
def getBestThreshold(features, labels_pooled, labels_current):
    print("length of pooled and current", len(labels_pooled),
          len(labels_current))
    maxent = LogisticRegression(penalty='l1')
    scores = {"F1": [], "Recall": [], "Accuracy": [], "Precision": []}
    thresholds = []

    print('Finding best thresholds...')
    fold = 1
    for TrainIndices, TestIndices in cross_validation.StratifiedKFold(
            labels_pooled, n_folds=10, shuffle=False, random_state=None):
        print('\r' + str(fold), end="")
        fold += 1
        TrainX_i = features[TrainIndices]
        Trainy_i = labels_pooled[TrainIndices]
        TestX_i = features[TestIndices]
        Testy_i = labels_current[TestIndices]

        maxent.fit(TrainX_i, Trainy_i)
        # get prediction
        thresh_i, ypred_i, score = optimize_threshold(maxent, TestX_i, Testy_i)
        thresholds.append(thresh_i)

        scores["F1"].append(score[0])
        scores["Recall"].append(score[1])
        scores["Accuracy"].append(score[2])
        scores["Precision"].append(score[3])

    print("\n--")
    for key in sorted(scores.keys()):
        currentmetric = np.array(scores[key])
        print("%s : %0.2f (+/- %0.2f)" % (key, currentmetric.mean(),
                                          currentmetric.std()))
    print("--")
    return maxent, np.array(thresholds)
def getBestThreshold(X, y_current_tr, y_current_te, regularization='l2'):
    assert len(X) == len(y_current_tr) == len(y_current_te), \
        'Number of features ({}), annotator1 labels ({}) and annotator2 ' \
        'labels ({}) is not equal!'.format(
            len(X), len(y_current_tr), len(y_current_te))
    maxent = LogisticRegression(penalty=regularization)
    scores = {"F1": [], "Recall": [], "Accuracy": [], "Precision": []}
    thresholds = []

    print('Finding best thresholds...')
    fold = 1
    for TrainIndices, TestIndices in cross_validation.StratifiedKFold(
            y_current_tr, n_folds=10, shuffle=False, random_state=None):
        print('\r' + str(fold), end="")
        fold += 1
        TrainX_i = X[TrainIndices]
        Trainy_i = y_current_tr[TrainIndices]
        TestX_i = X[TestIndices]
        Testy_i = y_current_te[TestIndices]

        maxent.fit(TrainX_i, Trainy_i)
        # get prediction
        thresh_i, ypred_i, score = optimize_threshold(maxent, TestX_i, Testy_i)
        thresholds.append(thresh_i)

        scores["F1"].append(score[0])
        scores["Recall"].append(score[1])
        scores["Accuracy"].append(score[2])
        scores["Precision"].append(score[3])

    print("\n--")
    for key in sorted(scores.keys()):
        currentmetric = np.array(scores[key])
        print("%s : %0.2f (+/- %0.2f)" % (key, currentmetric.mean(),
                                          currentmetric.std()))
    print("--")
    return maxent, np.array(thresholds)
def classify_ads(Xy):
    classifier = BernoulliNB()
    cv = cross_validation.StratifiedKFold(Xy[1], 2)
    precision = []
    recall = []
    for train, test in cv:
        X_train = Xy[0][train]
        X_test = Xy[0][test]
        y_train = Xy[1][train]
        y_test = Xy[1][test]
        classifier.fit(X_train, y_train)
        y_hat = classifier.predict(X_test)
        p, r, _, _ = metrics.precision_recall_fscore_support(y_test, y_hat)
        precision.append(p[1])
        recall.append(r[1])
    print classifier
    print 'precision:', np.average(precision), '+/-', np.std(precision)
    print 'recall:', np.average(recall), '+/-', np.std(recall)
    return classifier
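# Hypothetical call site for classify_ads: Xy is a (features, labels) pair,
# e.g. a binarized document-term matrix with 0/1 "is an ad" labels. Toy data
# for illustration only; assumes scikit-learn < 0.20 so that
# sklearn.cross_validation is still available.
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer

texts = ['buy cheap watches now', 'meeting notes attached',
         'limited offer click here', 'quarterly report draft']
X = CountVectorizer(binary=True).fit_transform(texts).toarray()
y = np.array([1, 0, 1, 0])  # 1 = ad, 0 = not an ad
classifier = classify_ads((X, y))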
def _calculate(self, X, y, categorical, metafeatures, helpers):
    import sklearn.naive_bayes

    if len(y.shape) == 1 or y.shape[1] == 1:
        kf = cross_validation.StratifiedKFold(y, n_folds=10)
    else:
        kf = cross_validation.KFold(y.shape[0], n_folds=10)

    accuracy = 0.
    for train, test in kf:
        nb = sklearn.naive_bayes.GaussianNB()
        if len(y.shape) == 1 or y.shape[1] == 1:
            nb.fit(X[train], y[train])
        else:
            nb = OneVsRestClassifier(nb)
            nb.fit(X[train], y[train])
        predictions = nb.predict(X[test])
        accuracy += sklearn.metrics.accuracy_score(predictions, y[test])
    return accuracy / 10
def fit_layer(self, layer_idx, X, y):
    if layer_idx >= len(self.layers):
        return
    elif layer_idx == len(self.layers) - 1:
        self.layers[layer_idx].fit(X, y)
    else:
        n_classes = len(set(y)) - 1
        n_classifiers = len(self.layers[layer_idx])
        output = np.zeros((X.shape[0], n_classes * n_classifiers))
        skf = cross_validation.StratifiedKFold(y, self.cv)
        # Build out-of-fold outputs of this layer to train the next one
        for tra, tst in skf:
            self.layers[layer_idx].fit(X[tra], y[tra])
            out = self.layers[layer_idx].output(X[tst], mode=self.mode)
            if self.mode in ['probs', 'votes']:
                output[tst, :] = out[:, 1:, :].reshape(
                    out.shape[0], (out.shape[1] - 1) * out.shape[2])
            elif self.mode in ['labels']:
                output[tst, :] = out
        # Refit this layer on the full data before recursing
        self.layers[layer_idx].fit(X, y)
        self.fit_layer(layer_idx + 1, output, y)
def grid_search(estimator, data, featTypes=('BoW', ), nFolds=10,
                random_seed=44, param_grid=()):
    labels = [x.severity for x in data]
    generatePrimaryFeats(data, featTypes)

    featurized = []
    for d in data:
        instance = {}
        for featname, values in d.feats.items():
            # Give each feature a unique name to avoid overwriting features.
            # If e.g. a concept feature has the same name as a bow word, the
            # old code would overwrite one of the features.
            instance.update({"{0}-{1}".format(featname, k): v
                             for k, v in values.items()})
        featurized.append(instance)

    d = DictVectorizer()
    x_train = d.fit_transform(featurized)

    folds = cross_validation.StratifiedKFold(labels, n_folds=nFolds,
                                             shuffle=True,
                                             random_state=random_seed)
    grid = GridSearchCV(estimator, param_grid=param_grid, scoring="f1",
                        n_jobs=-1, cv=folds)
    fit_grid = grid.fit(x_train, labels)

    print(fit_grid.best_params_)
    return fit_grid.best_params_
def svm_dummy_comparison(inputfile):
    x, y, labels = load_csv_svm(inputfile)
    x_scaled = preprocessing.scale(x)

    if USE_PCA:
        pca = PCA(n_components=PCA_COMPONENTS)
        x = pca.fit_transform(x_scaled)
        print(pca.explained_variance_ratio_)
    else:
        x = x_scaled

    visual_svm_clf = svm.SVC(gamma=GAMMA, C=C, class_weight=WEIGHT,
                             kernel=KERNEL, cache_size=400)
    dummy_svm_clf = DummyClassifier(
        strategy='most_frequent',  # most_frequent, uniform, stratified
        random_state=0)

    cv = cross_validation.StratifiedKFold(y, 30)
    #cv = cross_validation.LeaveOneOut(len(y))
    metric = 'f1'  # accuracy, precision, recall, f1

    visual_scores = cross_validation.cross_val_score(visual_svm_clf, x, y,
                                                     cv=cv, scoring=metric)
    dummy_scores = cross_validation.cross_val_score(dummy_svm_clf, x, y,
                                                    cv=cv, scoring=metric)
    print(metric)
    print('avg_real: {0}'.format(np.mean(visual_scores)))
    print('avg_dumb: {0}'.format(np.mean(dummy_scores)))
def NoveltyDetectionFolds(folder, n_folds=2, trgt=None, dev=False,
                          verbose=False):
    if n_folds < 2:
        print 'Invalid number of folds'
        return -1

    if not dev:
        file_name = '%s/%i_folds_cross_validation.jbl' % (folder, n_folds)
    else:
        file_name = '%s/%i_folds_cross_validation_dev.jbl' % (folder, n_folds)

    if not os.path.exists(file_name):
        if verbose:
            print "Creating %s" % (file_name)
        if trgt is None:
            print 'Invalid trgt'
            return -1

        CVO = {}
        # One CV object per novelty class: that class is held out entirely
        # and stratified folds are built over the remaining classes.
        for inovelty, novelty_class in enumerate(np.unique(trgt)):
            process_trgt = trgt[trgt != novelty_class]
            CVO[inovelty] = cross_validation.StratifiedKFold(process_trgt,
                                                             n_folds)
            CVO[inovelty] = list(CVO[inovelty])
        if verbose:
            print 'Saving in %s' % (file_name)
        joblib.dump([CVO], file_name, compress=9)
    else:
        if verbose:
            print "Reading from %s" % (file_name)
        [CVO] = joblib.load(file_name)
    return CVO
def trainAndEvaluateANN(features, labels, connRate, hidNodes, error):
    """
    Train and evaluate a neural network on the given features with the
    given attributes. 3-fold cross-validation is used on each run; the
    average accuracy over the three folds and a classification report
    (precision, recall, f-measure) are returned.
    """
    # Create 3-fold cross validation indices (StratifiedKFold defaults
    # to 3 folds)
    skf = cross_validation.StratifiedKFold(labels)
    binary = LabelBinarizer()
    accuracySum = 0
    totalResults = []
    totalTargets = []

    # For each k-fold split
    for trainIndex, testIndex in skf:
        # Get data split
        featuresTrain, featuresTest = features[trainIndex], features[testIndex]
        labelsTrain, labelsTest = labels[trainIndex], labels[testIndex]

        # Train the neural network
        ann = trainANN(featuresTrain, labelsTrain, connRate, hidNodes,
                       error, binary)

        # Evaluate ANN on test data
        accuracy, outputLabels = evaluateANN(featuresTest, labelsTest,
                                             ann, binary)
        accuracySum += accuracy

        # Store the results / targets for larger analysis
        totalResults.extend(outputLabels.tolist())
        totalTargets.extend(labelsTest.tolist())

    # Generate performance report
    report = classification_report(totalTargets, totalResults)
    return (accuracySum / 3.0, report)
def compute_cross_correlation_score(df, clfs, preprocess_scaling=True,
                                    nFold=10):
    """
    Cross-validate each classifier on the dataframe and collect the results.

    :param df: dataframe with a dict-like 'features' column and an
        'expected_class' column
    :param clfs: classifiers to evaluate
    :param preprocess_scaling: whether to scale features per split
    :param nFold: number of stratified folds
    :return: (scores, classification_results), one entry per classifier
        and fold
    """
    to_sklearn_features = DataFrameMapper([
        ('features', sklearn.feature_extraction.DictVectorizer())
    ])
    data_X = to_sklearn_features.fit_transform(df)
    data_Y = df.expected_class

    skf = cross_validation.StratifiedKFold(data_Y, n_folds=nFold)
    classification_results = []
    scores = []
    for num, (train_index, test_index) in enumerate(skf):
        X_train, X_test = data_X[train_index], data_X[test_index]
        Y_train, Y_test = data_Y[train_index], data_Y[test_index]
        print("Len train {}, Len test {}".format(Y_train.size, Y_test.size))
        cross_valid_data = Cross_validation_split(X_train, X_test,
                                                  Y_train, Y_test)
        cross_valid_data = preprocess(cross_valid_data,
                                      preprocess_scaling=preprocess_scaling,
                                      preprocess_correlation=False)
        for clf in clfs:
            score, classification = generate_score(clf, cross_valid_data,
                                                   fold=num)
            scores.append(score)
            classification_results.append(classification)
    return scores, classification_results
def eval_dag(dag, filename, dag_id=None):
    dag = normalize_dag(dag)

    if filename not in input_cache:
        input_cache[filename] = pd.read_csv('data/' + filename, sep=';')

    data = input_cache[filename]
    feats = data[data.columns[:-1]]
    targets = data[data.columns[-1]]

    le = preprocessing.LabelEncoder()
    ix = targets.index
    targets = pd.Series(le.fit_transform(targets), index=ix)

    errors = []
    start_time = time.time()
    for train_idx, test_idx in cross_validation.StratifiedKFold(targets,
                                                                n_folds=5):
        train_data = (feats.iloc[train_idx], targets.iloc[train_idx])
        test_data = (feats.iloc[test_idx], targets.iloc[test_idx])

        ms = train_dag(dag, train_data)
        preds = test_dag(dag, ms, test_data)

        acc = mm.quadratic_weighted_kappa(test_data[1], preds)
        if filename == 'ml-prove.csv':
            acc = metrics.accuracy_score(test_data[1], preds)
        errors.append(acc)

    m_errors = float(np.mean(errors))
    s_errors = float(np.std(errors))

    return m_errors, s_errors, time.time() - start_time
def error_analysis_for_labeling(instances, X, y, folds, data_folder,
                                clf=svm.LinearSVC(C=0.01)):
    cv = cross_validation.StratifiedKFold(y, n_folds=folds, random_state=0)
    for i, (train, test) in enumerate(cv):
        model = clf.fit(X[train], y[train])
        y_pred = model.predict(X[test])
        scores = model.decision_function(X[test])
        #scores = model.predict_proba(X[test])[:,1]
        print("\nROC score on Test Data")
        print roc_auc_score(y[test], scores)
        do_error_analysis(y[test], y_pred, scores, test, instances)
        #relabel(y[test], y_pred, scores, test, instances, data_folder)
        print "\n" * 5
def Classifier(filename):
    print 'Loading data...'
    id, data, target = readTrainData(filename)
    print 'Total Examples', data.shape[0], 'Dummy percentage', 1 - target.mean()
    accuracy = []
    kf = cross_validation.StratifiedKFold(target, 5)
    print 'Training and Testing...'
    for train, test in kf:
        dataTrain, dataTest, targetTrain, targetTest = \
            data[train], data[test], target[train], target[test]
        idTest = id[test]
        clf = BlendedClassifiers()
        clf.fit(dataTrain, targetTrain)
        probs = clf.predict_proba(dataTest)
        metric = PAtK(probs, targetTest, idTest)
        accuracy.append(metric)
        print 'P@K:', metric
    mean = np.mean(accuracy)
    ci = 1.96 * (np.std(accuracy) / np.sqrt(5))
    print 'Mean P@K', mean, 'CI 95%', mean - ci, '-', mean + ci
    return accuracy
def lr_crossv_getC(trainx, trainy, Carr=[0.1, 1.0, 10.0, 100.0], seed=0):
    '''
    Get an appropriate C value for the LR.
    Carr is the array of C values to test.
    '''
    # Get stratified k folds
    skf = cross_validation.StratifiedKFold(trainy, n_folds=10)

    # Cross-validate for the best C
    best_c = 0
    best_score = 0
    for this_c in Carr:
        lr_est = LogisticRegression(penalty='l1', class_weight='auto',
                                    C=this_c, random_state=seed)
        scores = cross_validation.cross_val_score(lr_est, trainx, y=trainy,
                                                  scoring='f1', cv=skf)
        # If this_c scored, on average, better than the best C value so far,
        # update best_c
        this_score = scores.mean()
        print 'This score and C: ', this_score, this_c
        if this_score > best_score:
            best_score = this_score
            best_c = this_c
    return best_c
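# Hypothetical usage of lr_crossv_getC on synthetic data (class_weight='auto'
# and sklearn.cross_validation both require an older scikit-learn):
import numpy as np

trainx = np.random.rand(200, 8)
trainy = np.random.randint(0, 2, size=200)
best_c = lr_crossv_getC(trainx, trainy, Carr=[0.1, 1.0, 10.0], seed=0)
print 'Best C:', best_c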
def test(texts, classes, models, nn_params, folds=4):
    '''
    Check the performance of an SVM implementation, given a list of texts
    and their classes (negative/neutral/positive).
    Uses k-fold cross-validation (keeping in mind to divide the data
    appropriately, depending on the class).
    '''
    classes = np.array(classes)
    texts = np.array(texts)
    wrongs = []
    auc_sum = 0
    for train, test in cross_validation.StratifiedKFold(classes, folds):
        texts_train = texts[train]
        classes_train = classes[train]
        texts_test = texts[test]
        classes_test = classes[test]

        n = Ensemble(texts_train, classes_train, nn_params, models)
        predictions = n.classify(texts_test)
        predictions[predictions < 0] = 0

        auc = calculate_auc(classes_test, predictions)
        print auc
        auc_sum += auc

        # Keep track of badly misclassified examples for error analysis
        for i in range(len(texts_test)):
            if abs(classes_test[i] - predictions[i]) > 0.5:
                wrongs.append((classes_test[i], predictions[i],
                               texts_test[i]))

    return auc_sum / folds
def kfold(tracks, feature_names, folds=5, shuffle=True, **kwargs):
    labels = [track['label'] for track in tracks]
    kf = cross_validation.StratifiedKFold(labels, n_folds=folds,
                                          shuffle=shuffle)
    for train, test in kf:
        train_tracks = [tracks[i] for i in train]
        test_tracks = [tracks[i] for i in test]
        clf = machine_learning.Classifier(**kwargs)
        clf = machine_learning.train_tracks(clf, train_tracks, feature_names)
        predicted_all = []
        Y_test_all = []
        for track in test_tracks:
            X_test, Y_test = machine_learning.shape_features([track],
                                                             feature_names)
            predicted = machine_learning.predict(X_test, clf)
            track['sample_predictions'] = predicted
            track['prediction'], track['predictions'] = util.most_common(
                predicted)
            predicted_all.extend(predicted)
            Y_test_all.extend(Y_test)
        yield test_tracks
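# The kfold generator above yields each test split with per-track predictions
# attached. A hypothetical consumer tallies per-fold accuracy; `tracks` is
# assumed to be a list of dicts carrying 'label' plus whatever features the
# project's machine_learning module reads.
for test_tracks in kfold(tracks, feature_names, folds=5):
    correct = sum(1 for t in test_tracks if t['prediction'] == t['label'])
    print('fold accuracy: {0:.3f}'.format(float(correct) / len(test_tracks)))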