示例#1
0
文件: tasks.py 项目: kaluzhny/airbnb
    def load_data(self):
        """Fit the model on a stratified split of the training data and score it.

        The training set produced by ``TrainingDataTask`` is divided into a
        fitting part and a hold-out part with a single stratified shuffle
        split. The hold-out part is passed through ``divide_by_has_sessions``
        before the model is run and its probabilities are scored.

        Returns:
            dict: ``{'Score': s}`` where ``s`` is the value returned by
            ``score`` for the hold-out predictions.
        """
        n_classes = len(le_.classes_)
        core = self.task_core

        # Full labelled data set; the evaluation rows are carved out below.
        features, labels = TrainingDataTask(core).run()

        # One stratified shuffle split -> (fit indices, hold-out indices).
        splitter = StratifiedShuffleSplit(
            labels, 1, test_size=core.cv_ratio, random_state=core.n_seed)
        fit_idxs, holdout_idxs = list(splitter)[0]

        x_holdout = features.filter_rows_by_idxs(holdout_idxs)
        y_holdout = labels[holdout_idxs]
        x_fit = features.filter_rows_by_idxs(fit_idxs)
        y_fit = labels[fit_idxs]

        # Only the first pair returned by divide_by_has_sessions is evaluated
        # (original author's note: "2014 only for test").
        x_holdout, y_holdout, _, _ = divide_by_has_sessions(x_holdout, y_holdout)

        print('running prediction model')
        probabilities = run_model(
            x_fit, y_fit, x_holdout, n_classes, self.classifier,
            core.n_threads, core.n_seed, core.cache_dir)

        print_probabilities(probabilities)
        return {'Score': score(probabilities, y_holdout)}
示例#2
0
文件: tasks.py 项目: kaluzhny/airbnb
def do_cv(x_cv, y_cv, classifier, n_fold):
    """Shuffle the samples, cross-validate ``classifier`` and print the scores.

    Args:
        x_cv: Feature matrix; rows are indexed as ``x_cv[perm, :]``.
        y_cv: Labels, indexed with the same row permutation.
        classifier: Estimator handed to ``cross_val_score``.
        n_fold: Value passed as ``cv`` to ``cross_val_score``.

    Returns:
        None. The per-fold scores and their mean are printed.
    """
    def _proba_score(true_values, predictions):
        # make_scorer calls with (truth, predictions); score takes them
        # in the opposite order.
        return score(predictions, true_values)

    # Random row order, applied identically to features and labels.
    order = np.random.permutation(x_cv.shape[0])
    shuffled_x = x_cv[order, :]
    shuffled_y = y_cv[order]

    fold_scores = cross_val_score(
        classifier,
        shuffled_x,
        shuffled_y,
        scoring=make_scorer(_proba_score, needs_proba=True),
        cv=n_fold,
        verbose=10,
    )
    print('cv_scores: ', fold_scores, '; mean: ', np.mean(fold_scores))
示例#3
0
文件: tasks.py 项目: kaluzhny/airbnb
def do_grid_search(x_search, y_search, classifier, param_grid):
    """Grid-search ``param_grid`` on a clone of ``classifier`` and report results.

    Args:
        x_search: Feature matrix; rows are indexed as ``x_search[perm, :]``.
        y_search: Labels, indexed with the same row permutation.
        classifier: Estimator that is cloned before being searched.
        param_grid: Parameter grid passed to ``GridSearchCV``.

    Returns:
        The ``best_estimator_`` of the fitted search.
    """
    def _proba_score(true_values, predictions):
        # make_scorer calls with (truth, predictions); score takes them
        # in the opposite order.
        return score(predictions, true_values)

    searcher = GridSearchCV(
        clone(classifier),
        param_grid,
        cv=4,
        verbose=10,
        n_jobs=1,
        scoring=make_scorer(_proba_score, needs_proba=True),
    )

    # Random row order, applied identically to features and labels.
    order = np.random.permutation(x_search.shape[0])
    searcher.fit(x_search[order, :], y_search[order])

    for label, value in (
        ('grid_scores_: ', searcher.grid_scores_),
        ('best_score_: ', searcher.best_score_),
        ('best_params_: ', searcher.best_params_),
    ):
        print(label, value)
    return searcher.best_estimator_
示例#4
0
文件: blend.py 项目: kaluzhny/airbnb
def train_blend_feature(classifier, scale, x, y, classes_count, random_state, n_folds):
    """Build out-of-fold class probabilities for one blend feature.

    One clone of ``classifier`` (and, when ``scale`` is truthy, one
    ``StandardScaler``) is fitted per stratified fold. The probabilities
    predicted on each fold's test rows are written into the matching rows of
    the returned matrix, and the per-fold scores are averaged and printed.

    Args:
        classifier: Estimator prototype; cloned ``n_folds`` times.
        scale: When truthy, fit a scaler on each fold's training rows and
            apply it to both training and test rows of that fold.
        x: Feature matrix, indexed by row with the fold index arrays.
        y: Labels, indexed with the fold index arrays.
        classes_count: Number of columns of the out-of-fold matrix.
        random_state: Seed passed to ``StratifiedKFold``.
        n_folds: Number of folds, classifiers and scalers.

    Returns:
        ``(classifiers, scalers, blend_train)`` when ``scale`` is truthy,
        otherwise ``(classifiers, blend_train)``.
    """
    classifiers = [clone(classifier) for _ in range(n_folds)]
    scalers = [StandardScaler() for _ in range(n_folds)] if scale else None

    print("train_blend_feature: scale - ", scale, ", x - ", x.shape, "; y - ", y.shape)

    folds = list(StratifiedKFold(y, n_folds, shuffle=True, random_state=random_state))
    blend_train = np.zeros((x.shape[0], classes_count))
    fold_scores = []

    for fold_no, (train_idx, test_idx) in enumerate(folds):
        print("fold: ", fold_no)
        fold_x_train, fold_y_train = x[train_idx], y[train_idx]
        fold_x_test = x[test_idx]
        model = classifiers[fold_no]

        if scale:
            # Scaler statistics come from the training rows of this fold only.
            fitted_scaler = scalers[fold_no].fit(fold_x_train)
            fold_x_train = fitted_scaler.transform(fold_x_train)
            fold_x_test = fitted_scaler.transform(fold_x_test)

        # XGBoost models are fitted with the custom evaluation metric.
        fit_kwargs = {'eval_metric': ndcg5_eval} if isinstance(model, XGBClassifier) else {}
        model.fit(fold_x_train, fold_y_train, **fit_kwargs)

        fold_proba = model.predict_proba(fold_x_test)
        blend_train[test_idx, :classes_count] = fold_proba

        fold_scores.append(
            score(fold_proba, y[test_idx], max_classes=min(classes_count, 5)))

    print("feature score: ", np.average(fold_scores))

    if not scale:
        return classifiers, blend_train
    return classifiers, scalers, blend_train
示例#5
0
文件: nn.py 项目: ccurro/snn
def main():
    """Interactively train a network or test a previously trained one.

    The user is asked whether to train (``'y'``) or test (``'n'``); the
    question repeats until one of those two answers is given.

    Training: file names and hyper-parameters come from
    ``userInput.getTrain()``. The network is run forward and backward over
    every example for ``nEpochs`` epochs, printing the fraction of correctly
    rounded outputs after each epoch. The first row of ``inputFeeder.l`` and
    the weights of every node that has a ``w`` attribute are then written to
    ``outFile``, one space-separated row per line.

    Testing: file names come from ``userInput.getTest()``. Every example is
    run forward through the network, and the targets and rounded predictions
    are passed to ``score`` together with ``outFile``.
    """
    # Keep asking until the answer is exactly 'y' or 'n'.
    while True:
        response = input("Training a newtork? y/n ")
        if response in ('y', 'n'):
            break
        print("Did not enter 'y' or 'n', try again")
    training = response == 'y'

    if training:
        initFile, trainFile, outFile, learningRate, nEpochs = userInput.getTrain()

        Node.learningRate = learningRate
        inputFeeder = nnLoad.Input(initFile)
        dataFeeder = data.DataFeeder(trainFile)
        model = Network(inputFeeder)

        for _ in range(nEpochs):
            nCorrect = 0
            for _ in range(dataFeeder.listMax):
                features, target = dataFeeder.getNextExample()
                model.forward(features)
                model.backward(target)
                if all(np.round(model.activations) == target):
                    nCorrect += 1

            print('Pct Correct', nCorrect / dataFeeder.listMax)

        # The context manager guarantees the file is flushed and closed.
        # Rows are built with str.join so no trailing space is written; this
        # avoids seeking backwards on a text-mode file, which relies on
        # arithmetic with an opaque tell() value and breaks on empty rows.
        with open(outFile, 'w') as f:
            header = ' '.join('{0:d}'.format(int(e)) for e in inputFeeder.l[0])
            f.write(header + '\n')

            for layer in model.layers:
                for node in layer.nodes:
                    if hasattr(node, 'w'):
                        print(node.w)
                        row = ' '.join('{0:.3f}'.format(e) for e in node.w)
                        f.write(row + '\n')
    else:  # testing
        trainedFile, testFile, outFile = userInput.getTest()
        inputFeeder = nnLoad.Input(trainedFile)
        dataFeeder = data.DataFeeder(testFile)
        model = Network(inputFeeder)

        targets = []
        predictions = []

        for _ in range(dataFeeder.listMax):
            features, target = dataFeeder.getNextExample()
            model.forward(features)
            predictions.append(model.activations)
            targets.append(target)

        # Builtin int replaces np.int, which was only an alias for it and has
        # been removed from recent NumPy releases.
        targets = np.array(targets, int, ndmin=2)
        predictions = np.array(np.round(predictions), int, ndmin=2)

        # Make sure shapes of targets and preds are nClasses by nExamples
        if np.shape(targets) != np.shape(predictions):
            targets = np.swapaxes(targets, 0, 1)

        if np.shape(targets)[0] == dataFeeder.listMax:
            targets = np.transpose(targets)
            predictions = np.transpose(predictions)

        score(targets, predictions, outFile)