def load_data(self):
    """Fit the model on a stratified part of the training data and score the rest.

    The training set returned by ``TrainingDataTask`` is split once with
    ``StratifiedShuffleSplit`` (hold-out fraction ``task_core.cv_ratio``,
    seed ``task_core.n_seed``).  The hold-out part is then passed through
    ``divide_by_has_sessions`` and only the first feature/label pair it
    returns is used for evaluation.

    Returns:
        dict: ``{'Score': s}`` where ``s`` is the value returned by
        ``score`` for the predicted probabilities and hold-out labels.
    """
    n_classes = len(le_.classes_)

    # The separate test set is intentionally not loaded here
    # (the original kept ``TestDataTask(self.task_core).run()`` disabled).
    features, labels = TrainingDataTask(self.task_core).run()

    # Single stratified shuffle split -> (train indices, hold-out indices).
    splitter = StratifiedShuffleSplit(
        labels,
        1,
        test_size=self.task_core.cv_ratio,
        random_state=self.task_core.n_seed,
    )
    fit_idxs, holdout_idxs = list(splitter)[0]

    holdout_x = features.filter_rows_by_idxs(holdout_idxs)
    holdout_y = labels[holdout_idxs]
    fit_x = features.filter_rows_by_idxs(fit_idxs)
    fit_y = labels[fit_idxs]

    # Original note: "2014 only for test" -- keep only the first pair
    # returned by divide_by_has_sessions (presumably rows with sessions;
    # TODO confirm against that helper).
    holdout_x, holdout_y, _, _ = divide_by_has_sessions(holdout_x, holdout_y)

    print('running prediction model')
    predicted = run_model(
        fit_x,
        fit_y,
        holdout_x,
        n_classes,
        self.classifier,
        self.task_core.n_threads,
        self.task_core.n_seed,
        self.task_core.cache_dir,
    )
    print_probabilities(predicted)

    return {'Score': score(predicted, holdout_y)}
def do_cv(x_cv, y_cv, classifier, n_fold):
    """Shuffle the samples, cross-validate ``classifier`` and print the scores.

    Args:
        x_cv: 2-D indexable feature matrix (rows are permuted via
            ``x_cv[perm, :]``).
        y_cv: Labels, indexable by the same permutation.
        classifier: Estimator handed to ``cross_val_score``.
        n_fold: Value passed as ``cv`` to ``cross_val_score``.

    Returns:
        None. The per-fold scores and their mean are printed.
    """

    def _proba_score(true_values, predictions):
        # make_scorer calls (y_true, y_pred); ``score`` expects the
        # predictions first, so the arguments are swapped here.
        return score(predictions, true_values)

    # Uses the global NumPy RNG, exactly like the original.
    order = np.random.permutation(x_cv.shape[0])
    shuffled_x = x_cv[order, :]
    shuffled_y = y_cv[order]

    fold_scores = cross_val_score(
        classifier,
        shuffled_x,
        shuffled_y,
        scoring=make_scorer(_proba_score, needs_proba=True),
        cv=n_fold,
        verbose=10,
    )
    print('cv_scores: ', fold_scores, '; mean: ', np.mean(fold_scores))
def do_grid_search(x_search, y_search, classifier, param_grid):
    """Run a 4-fold grid search on shuffled data and return the best estimator.

    Args:
        x_search: 2-D indexable feature matrix.
        y_search: Labels aligned with ``x_search``.
        classifier: Estimator prototype; a clone of it is searched, so the
            object passed in is not fitted.
        param_grid: Parameter grid forwarded to ``GridSearchCV``.

    Returns:
        ``best_estimator_`` of the fitted ``GridSearchCV``.  The grid
        scores, best score and best parameters are printed.
    """

    def _proba_score(true_values, predictions):
        # make_scorer calls (y_true, y_pred); ``score`` takes predictions first.
        return score(predictions, true_values)

    searcher = GridSearchCV(
        clone(classifier),
        param_grid,
        cv=4,
        verbose=10,
        n_jobs=1,
        scoring=make_scorer(_proba_score, needs_proba=True),
    )

    # Shuffle rows with the global NumPy RNG before fitting.
    order = np.random.permutation(x_search.shape[0])
    searcher.fit(x_search[order, :], y_search[order])

    for label, attr in (
        ('grid_scores_: ', 'grid_scores_'),
        ('best_score_: ', 'best_score_'),
        ('best_params_: ', 'best_params_'),
    ):
        print(label, getattr(searcher, attr))

    return searcher.best_estimator_
def train_blend_feature(classifier, scale, x, y, classes_count, random_state, n_folds):
    """Build out-of-fold class probabilities with one cloned model per fold.

    For every stratified fold a fresh clone of ``classifier`` is fitted on
    the fold's training rows (optionally standardised with a per-fold
    ``StandardScaler``) and its ``predict_proba`` output for the fold's
    test rows is stored in the matching rows of the blend matrix.

    Args:
        classifier: Estimator prototype; cloned ``n_folds`` times.
        scale: When truthy, fit a ``StandardScaler`` per fold and apply it
            to both parts of the fold.
        x: Feature matrix, indexable by arrays of row indices.
        y: Labels, indexable by arrays of row indices.
        classes_count: Number of columns of the blend matrix.
        random_state: Seed forwarded to ``StratifiedKFold``.
        n_folds: Number of folds (and of cloned models).

    Returns:
        ``(classifiers, scalers, blend_train)`` when ``scale`` is truthy,
        otherwise ``(classifiers, blend_train)``.
    """
    fold_models = [clone(classifier) for _ in range(n_folds)]
    fold_scalers = [StandardScaler() for _ in range(n_folds)] if scale else None

    print("train_blend_feature: scale - ", scale, ", x - ", x.shape, "; y - ", y.shape)

    fold_scores = []
    splits = list(StratifiedKFold(y, n_folds, shuffle=True, random_state=random_state))
    blend_train = np.zeros((x.shape[0], classes_count))

    for fold_no, (fit_idx, holdout_idx) in enumerate(splits):
        print("fold: ", fold_no)
        fit_x = x[fit_idx]
        fit_y = y[fit_idx]
        holdout_x = x[holdout_idx]
        model = fold_models[fold_no]

        if scale:
            # The scaler is fitted on the training part only and then
            # applied to both parts of the fold.
            fitted_scaler = fold_scalers[fold_no].fit(fit_x)
            fit_x = fitted_scaler.transform(fit_x)
            holdout_x = fitted_scaler.transform(holdout_x)

        # XGBoost models are fitted with the custom evaluation metric
        # (the original noted 'ndcg@5' as the alternative).
        fit_kwargs = {'eval_metric': ndcg5_eval} if isinstance(model, XGBClassifier) else {}
        model.fit(fit_x, fit_y, **fit_kwargs)

        holdout_proba = model.predict_proba(holdout_x)
        blend_train[holdout_idx, :classes_count] = holdout_proba

        # Per-fold score on the held-out rows.
        fold_scores.append(
            score(holdout_proba, y[holdout_idx], max_classes=min(classes_count, 5))
        )

    print("feature score: ", np.average(fold_scores))

    if scale:
        return fold_models, fold_scalers, blend_train
    return fold_models, blend_train
def _ask_training_mode():
    """Prompt until the user answers 'y' or 'n'; return True for 'y'."""
    while True:
        response = input("Training a newtork? y/n ")
        if response in ('y', 'n'):
            return response == 'y'
        print("Did not enter 'y' or 'n', try again")


def _write_row(out, values, fmt):
    """Write ``values`` formatted with ``fmt`` as one space-separated line."""
    # Joining avoids the trailing space, so no seek-back is needed
    # (seeking to a computed offset is undefined for text-mode files).
    out.write(' '.join(fmt.format(v) for v in values) + '\n')


def _run_training():
    """Train a network on the user-selected data and save it to a file."""
    initFile, trainFile, outFile, learningRate, nEpochs = userInput.getTrain()
    Node.learningRate = learningRate
    inputFeeder = nnLoad.Input(initFile)
    dataFeeder = data.DataFeeder(trainFile)
    model = Network(inputFeeder)

    for _epoch in range(nEpochs):
        nCorrect = 0
        for _ in range(dataFeeder.listMax):
            features, target = dataFeeder.getNextExample()
            model.forward(features)
            model.backward(target)
            # An example counts as correct only when every rounded
            # activation equals its target.
            if all(np.round(model.activations) == target):
                nCorrect += 1
        print('Pct Correct', nCorrect / dataFeeder.listMax)

    # The context manager guarantees the file is flushed and closed even
    # if formatting one of the rows raises.
    with open(outFile, 'w') as f:
        # First line: the first row of the init data, written as integers.
        _write_row(f, (int(e) for e in inputFeeder.l[0]), '{0:d}')
        # Then one line of weights per node that has a ``w`` attribute.
        for layer in model.layers:
            for node in layer.nodes:
                if hasattr(node, 'w'):
                    print(node.w)
                    _write_row(f, node.w, '{0:.3f}')


def _run_testing():
    """Run a trained network over the test data and hand results to ``score``."""
    trainedFile, testFile, outFile = userInput.getTest()
    inputFeeder = nnLoad.Input(trainedFile)
    dataFeeder = data.DataFeeder(testFile)
    model = Network(inputFeeder)

    targets = []
    predictions = []
    for _ in range(dataFeeder.listMax):
        features, target = dataFeeder.getNextExample()
        model.forward(features)
        predictions.append(model.activations)
        targets.append(target)

    # Builtin ``int`` replaces the ``np.int`` alias, which newer NumPy
    # releases no longer provide.
    targets = np.array(targets, int, ndmin=2)
    predictions = np.array(np.round(predictions), int, ndmin=2)

    # Make sure shapes of targets and preds are nClasses by nExamples.
    if np.shape(targets) != np.shape(predictions):
        targets = np.swapaxes(targets, 0, 1)
    if np.shape(targets)[0] == dataFeeder.listMax:
        targets = np.transpose(targets)
        predictions = np.transpose(predictions)

    score(targets, predictions, outFile)


def main():
    """Ask whether to train or test a network and run the chosen mode.

    Training: reads the init and training files chosen by the user, trains
    for the requested number of epochs (printing the fraction of correctly
    classified examples after each epoch) and writes the first init row
    plus every node's weights to the output file.

    Testing: loads a trained network, runs it over the test file and
    passes the integer targets and rounded predictions to ``score``
    together with the output file name.
    """
    if _ask_training_mode():
        _run_training()
    else:
        _run_testing()