Example #1
def learn(examples,
          Classifier,
          classifierArgs,
          develFolds=10,
          verbose=3,
          n_jobs=1,
          predKey="ml_comb_pred",
          limitTerms=None):
    print "Parameter grid search"
    develExamples = getSubset(examples, ["devel"])
    clf = GridSearchCV(Classifier(),
                       classifierArgs,
                       cv=develFolds,
                       verbose=verbose,
                       n_jobs=n_jobs,
                       scoring="f1_micro")
    clf.fit(develExamples["features"], develExamples["classes"])
    print "Best params", (clf.best_params_, clf.best_score_)
    print "Predicting all examples"
    minMax = MinMaxScaler((0.03, 1.0))
    allPredictions = clf.predict(examples["features"])
    if hasattr(clf, "predict_proba"):
        allProbabilities = clf.predict_proba(examples["features"])
    else:
        allProbabilities = clf.decision_function(examples["features"])
        minMax.fit(allProbabilities)
        # equivalent to minmax_scale(allProbabilities, (0.03, 1.0))
        allProbabilities = minMax.transform(allProbabilities)
    print "Predicting the test set"
    testExamples = getSubset(examples, ["test"])
    testPredictions = clf.predict(testExamples["features"])
    if hasattr(clf, "predict_proba"):
        testProbabilities = clf.predict_proba(testExamples["features"])
    else:
        testProbabilities = clf.decision_function(testExamples["features"])
        testProbabilities = minMax.transform(testProbabilities)
    binaryToMultiLabel(testExamples, testPredictions, testProbabilities,
                       predKey)
    print "Evaluating test set ensemble predictions"
    testProteins = {x["id"]: x for x in testExamples["proteins"]}
    multiLabelTestExamples = evaluateFile.makeExamples(testProteins,
                                                       limitTerms=limitTerms,
                                                       limitToSets=["test"],
                                                       predKey=predKey)
    loading.vectorizeExamples(multiLabelTestExamples, None, sparseLabels=True)
    results = evaluation.evaluate(multiLabelTestExamples["labels"],
                                  multiLabelTestExamples["predictions"],
                                  multiLabelTestExamples,
                                  terms=None,
                                  averageOnly=True,
                                  noAUC=True)
    print "Average for test set:", evaluation.metricsToString(
        results["average"])
    binaryToMultiLabel(examples, allPredictions, allProbabilities, predKey)
Example #2
def classification_by_monkey(X, y, labelset, param_grid, stream,
                             n_folds_test=10, n_folds_gridsearch=5,
                             verbose=True):
    for monkey in X.keys():
        if verbose:
            print '-' * len(monkey)
            print monkey
            print '-' * len(monkey)

        print >>stream, '***', monkey

        y_true = None
        y_pred = None
        pvals = None
        print >>stream, '\n**** Cross-validation scores\n'

        for fold in range(n_folds_test):
            if verbose:
                print '  FOLD:', fold
            X_train, X_test, y_train, y_test = train_test_split(X[monkey],
                                                                y[monkey],
                                                                test_size=0.1)
            if verbose:
                print 'training classifier...'
            clf = GridSearchCV(SVC(),
                               param_grid,
                               cv=n_folds_gridsearch,
                               scoring='f1',
                               verbose=1 if verbose else 0, n_jobs=-1)
            clf.fit(X_train, y_train)

            print >>stream, 'FOLD:', fold, clf.best_score_
            print >>stream, pformat(clf.best_params_)

            if verbose:
                print 'predicting class labels...'
            if y_true is None:
                y_true = y_test
                y_pred = clf.predict(X_test)
                pvals = expit(clf.decision_function(X_test))
            else:
                y_true = np.hstack((y_true, y_test))
                y_pred = np.hstack((y_pred, clf.predict(X_test)))
                pvals = np.hstack((pvals, expit(clf.decision_function(X_test))))
        print >>stream, '\n**** Classification report\n'
        print >>stream, metrics.classification_report(y_true, y_pred,
                                            target_names=labelset[monkey])
        print >>stream, '\n**** Confusion matrix\n'
        print_cm(stream, metrics.confusion_matrix(y_true, y_pred),
                 labelset[monkey])
        print >>stream, ''
        stream.flush()
        with open('results/clf_by_monkey_{0}_blue_merged.pkl'.format(monkey), 'wb') as fid:
            pickle.dump((y_true, y_pred, pvals, labelset[monkey]), fid, -1)
Example #3
 def fit_validate_and_predict(self, train_idx, test_idx, sklearn_model,
                              sklearn_params):
     print(sklearn_params)
     clf = GridSearchCV(sklearn_model,
                        sklearn_params,
                        scoring='roc_auc',
                        n_jobs=8,
                        cv=5,
                        verbose=4)
     clf.fit(self.data[train_idx, :],
             self.labels[train_idx] - 1)  # handle '' added for test labels
     print('Best Estimator:')
     print(clf.best_estimator_)
     print()
     print('Grid Scores:')
     for params, mean_score, scores in clf.grid_scores_:
         print("%0.3f (+/-%0.03f) for %r" %
               (mean_score, scores.std() / 2, params))
     print()
     try:
         prob = clf.predict_proba(self.data[test_idx, :])
     except:
         scores = clf.decision_function(self.data[test_idx, :])
         prob = 1. / (1. + np.exp(-scores))
     return prob
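The manual logistic squashing above (1. / (1. + np.exp(-scores))) is the same transform that Example #2 obtains from scipy.special.expit. Below is a minimal, self-contained sketch of the predict_proba / decision_function fallback pattern, assuming a recent scikit-learn (sklearn.model_selection) and synthetic data rather than the original self.data arrays:

from scipy.special import expit
from sklearn.datasets import make_classification
from sklearn.model_selection import GridSearchCV
from sklearn.svm import LinearSVC

X, y = make_classification(n_samples=200, n_features=10, random_state=0)

clf = GridSearchCV(LinearSVC(), {"C": [0.1, 1.0, 10.0]}, scoring="roc_auc", cv=5)
clf.fit(X, y)

if hasattr(clf, "predict_proba"):
    # available only when the underlying estimator exposes predict_proba
    prob = clf.predict_proba(X)[:, 1]
else:
    # LinearSVC has no predict_proba, so squash the decision margins into (0, 1)
    prob = expit(clf.decision_function(X))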
Example #4
def gridsearch(model, params):
    gs = GridSearchCV(model, params, scoring='roc_auc', n_jobs=-1)
    gs.fit(X_train, y_train)
    print ('Best params: ', gs.best_params_)
    print ('Best auc on training set: ', gs.best_score_)
    print ('Best auc on test set: ', gs.score(X_test, y_test))
    return gs.predict(X_test), gs.decision_function(X_test)
Example #5
def test_grid_search():
    """Test that the best estimator contains the right value for foo_param"""
    clf = MockClassifier()
    grid_search = GridSearchCV(clf, {'foo_param': [1, 2, 3]}, verbose=3)
    # make sure it selects the smallest parameter in case of ties
    old_stdout = sys.stdout
    sys.stdout = StringIO()
    grid_search.fit(X, y)
    sys.stdout = old_stdout
    assert_equal(grid_search.best_estimator_.foo_param, 2)

    for i, foo_i in enumerate([1, 2, 3]):
        assert_true(grid_search.cv_scores_[i][0] == {'foo_param': foo_i})
    # Smoke test the score etc:
    grid_search.score(X, y)
    grid_search.predict_proba(X)
    grid_search.decision_function(X)
    grid_search.transform(X)
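This test reads per-candidate scores through cv_scores_, an attribute of a development-era scikit-learn (other examples below use the later grid_scores_); current releases expose the same information through cv_results_. A small sketch of the modern equivalent, assuming scikit-learn >= 0.20 and the iris toy dataset:

from sklearn.datasets import load_iris
from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVC

X, y = load_iris(return_X_y=True)
grid_search = GridSearchCV(SVC(), {"C": [0.1, 1.0, 10.0]}, cv=5)
grid_search.fit(X, y)

# cv_results_ is a dict of parallel arrays, one entry per parameter candidate
results = grid_search.cv_results_
for params, mean, std in zip(results["params"],
                             results["mean_test_score"],
                             results["std_test_score"]):
    print("%0.3f (+/-%0.03f) for %r" % (mean, std, params))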
Example #6
def trainLinearSVC(trainData, trainLabels, testData):
    print("\nTraining Linear SVC...")

    trainData = np.asarray(trainData)
    trainLabels = np.asarray(trainLabels)
    print(trainData.shape)
    print(trainLabels.shape)

    iter = 2000
    cross_val = 5

    Cs = np.power(2, np.linspace(-3, 9, num=7))
    parameters = {
        "estimator__C": Cs,
    }

    osvc = OneVsRestClassifier(LinearSVC(class_weight='balanced',
                                         verbose=False,
                                         multi_class='ovr',
                                         max_iter=iter),
                               n_jobs=-1)
    svc = GridSearchCV(osvc, cv=cross_val, param_grid=parameters, n_jobs=-1)

    t0 = time()
    svc.fit(trainData, trainLabels)
    print("\nTraining finished in %0.3fs \n" % (time() - t0))

    print("Best parameters: ")
    print(svc.best_params_)
    print("\nBest estimator: ")
    print(svc.best_estimator_)
    print("Best score: ")
    print(svc.best_score_)

    t0 = time()
    predictedLabels = svc.predict(testData)
    print("\nTesting finished in %0.3fs" % (time() - t0))

    t0 = time()
    confidence_scores = svc.decision_function(testData)
    print("\nTesting finished in %0.3fs" % (time() - t0))

    print("\nPredicted Labels")
    print("----------------------------------")
    print(predictedLabels)

    print("\nConfidence Scores")
    print("----------------------------------")
    print(confidence_scores)

    params = {
        'iter': iter,
        'cv': cross_val,
    }

    return confidence_scores, predictedLabels, params
Example #8
def trainAndTestLinearSVM_withfolds(train, test, GT_train, GT_test, folds,
                                    start, end, numparams):
    print 'Training and Testing a Linear SVM'
    init = time.time()
    stdSlr = StandardScaler().fit(train)
    train = stdSlr.transform(train)
    kernelMatrix = histogramIntersection(train, train)
    tuned_parameters = [{
        'kernel': ['linear'],
        'C': np.linspace(start, end, num=numparams)
    }]
    clf = GridSearchCV(svm.SVC(kernel='linear', decision_function_shape='ovr'),
                       tuned_parameters,
                       cv=folds,
                       scoring='accuracy',
                       n_jobs=8)
    clf.fit(kernelMatrix, GT_train)
    print(clf.best_params_)
    predictMatrix = histogramIntersection(stdSlr.transform(test), train)
    SVMpredictions = clf.predict(predictMatrix)
    correct = sum(1.0 * (SVMpredictions == GT_test))
    accuracy = correct / len(GT_test)
    cm = confusion_matrix(GT_test, SVMpredictions)
    end = time.time()
    fpr = dict()
    tpr = dict()
    roc_auc = dict()
    y_score = clf.decision_function(predictMatrix)

    for i in range(8):
        fpr[i], tpr[i], _ = roc_curve(np.asarray(GT_test),
                                      y_score[:, i],
                                      pos_label=i)
        roc_auc[i] = auc(fpr[i], tpr[i])

    # First aggregate all false positive rates
    all_fpr = np.unique(np.concatenate([fpr[i] for i in range(8)]))

    # Then interpolate all ROC curves at this points
    mean_tpr = np.zeros_like(all_fpr)
    for i in range(8):
        mean_tpr += interp(all_fpr, fpr[i], tpr[i])

    # Finally average it and compute AUC
    mean_tpr /= 8

    fpr["macro"] = all_fpr
    tpr["macro"] = mean_tpr
    roc_auc["macro"] = auc(fpr["macro"], tpr["macro"])

    print 'Done in ' + str(end - init) + ' secs.'
    return accuracy, cm, fpr, tpr, roc_auc
Example #9
def fit_predict_classification_pr(X,y,Xtest,ytest):
    #Linear SVC with hinge loss
    metric="roc_auc_score"
    param={'C':[100,10,1,0.1,0.01,0.001,0.0001]}
    model=GridSearchCV(svm.LinearSVC(loss='hinge'),param,scoring=metric)
    model.fit(X,y)
    
    print "Model:LinearSVC, metric:%s, best_param:" %(metric), model.best_params_
    print model.grid_scores_
    
    ypred=model.decision_function(Xtest)
    
    return {'ypred': ypred, 'ytest': ytest, 'auprc': metrics.roc_auc_score(ytest,ypred)}
Example #10
def test_grid_search():
    # Test that the best estimator contains the right value for foo_param
    clf = MockClassifier()
    grid_search = GridSearchCV(clf, {"foo_param": [1, 2, 3]}, verbose=3)
    # make sure it selects the smallest parameter in case of ties
    old_stdout = sys.stdout
    sys.stdout = StringIO()
    grid_search.fit(X, y)
    sys.stdout = old_stdout
    assert_equal(grid_search.best_estimator_.foo_param, 2)

    for i, foo_i in enumerate([1, 2, 3]):
        assert_true(grid_search.grid_scores_[i][0] == {"foo_param": foo_i})
    # Smoke test the score etc:
    grid_search.score(X, y)
    grid_search.predict_proba(X)
    grid_search.decision_function(X)
    grid_search.transform(X)

    # Test exception handling on scoring
    grid_search.scoring = "sklearn"
    assert_raises(ValueError, grid_search.fit, X, y)
Example #12
 def fit_validate_and_predict(self, train_idx, test_idx, sklearn_model, sklearn_params):
     print(sklearn_params)
     clf = GridSearchCV(sklearn_model, sklearn_params, scoring='roc_auc', n_jobs=8, cv=5, verbose=4)
     clf.fit(self.data[train_idx, :], self.labels[train_idx] - 1)  # handle '' added for test labels
     print('Best Estimator:')
     print(clf.best_estimator_)
     print()
     print('Grid Scores:')
     for params, mean_score, scores in clf.grid_scores_:
         print("%0.3f (+/-%0.03f) for %r" % (mean_score, scores.std() / 2, params))
     print()
     try:
         prob = clf.predict_proba(self.data[test_idx, :])
     except:
         scores = clf.decision_function(self.data[test_idx, :])
         prob = 1. / (1. + np.exp(-scores))
     return prob
Example #13
def fit_clf(args, clf_name, val, n_fold, project_name, save, scoring):
    '''
    Run fit method from val with X and y
    clf_name is a string with the classifier name
    '''
    train, test = args[0]['kf'][n_fold]
    X = args[0]['X'][train, :]
    y = args[0]['y'][train]
    file_name = 'poly_{}/models/{}_{}.p'.format(
        project_name, clf_name, n_fold + 1)
    start = time.time()
    if os.path.isfile(file_name):
        logger.info('Loading {} {}'.format(file_name, n_fold))
        clf = joblib.load(file_name)
    else:
        logger.info('Training {} {}'.format(clf_name, n_fold))
        clf = deepcopy(val['clf'])
        if val['parameters']:
            clf = GridSearchCV(clf, val['parameters'], n_jobs=1, cv=3,
                               scoring=_scorer)
        clf.fit(X, y)
        if save:
            joblib.dump(clf, file_name)

    train_score = _scorer(clf, X, y)

    X = args[0]['X'][test, :]
    y = args[0]['y'][test]
    # Scores
    test_score = _scorer(clf, X, y)
    ypred = clf.predict(X)
    if hasattr(clf, 'predict_proba'):
        yprob = clf.predict_proba(X)
    elif hasattr(clf, 'decision_function'):
        yprob = clf.decision_function(X)

    confusion = confusion_matrix(y, ypred)
    duration = time.time() - start
    logger.info('{0:25} {1:2}: Train {2:.2f}/Test {3:.2f}, {4:.2f} sec'.format(
        clf_name, n_fold, train_score, test_score, duration))
    return (train_score, test_score,
            ypred, yprob,  # predictions and probabilities
            confusion,  # confusion matrix
            clf)  # fitted clf
Example #14
class PageClassifier(object):

    def __init__(self, feature_extractors):
        self._fx_exts = feature_extractors
        self._clf = None

    def train(self, wikicode_list, labels):
        features = [ext.fit_extract(wikicode_list) for ext in self._fx_exts]
        X = np.concatenate(features, axis=1)
        kbest = SelectKBest(f_classif)
        # model = RandomForestClassifier(
        #             class_weight='balanced')
        # model = GradientBoostingClassifier()
        model = LinearSVC(class_weight='balanced',
                          dual=False,
                          penalty='l1')
        pipe = Pipeline([('kbest', kbest), ('model', model)])
        self._clf = GridSearchCV(pipe,
                                 {'kbest__k': list(range(1, X.shape[1], 10))},
                                 scoring='roc_auc',
                                 cv=10
                                 ).fit(X, labels)

    @_ensure_trained
    def predict(self, wikicode_list):
        X = self._extract_feature_vectors_from_wikicode_list(wikicode_list)
        return self._clf.predict(X)

    @_ensure_trained
    def predict_proba(self, wikicode_list):
        X = self._extract_feature_vectors_from_wikicode_list(wikicode_list)
        # return [cls1 for cls0, cls1 in self._clf.predict_proba(X)]
        return self._clf.decision_function(X)

    def _extract_feature_vectors_from_wikicode_list(self, wikicode_list):
        features = [ext.extract(wikicode_list) for ext in self._fx_exts]
        X = np.concatenate(features, axis=1)
        return X
Example #15
def logreg():
    # Read the table of features from features.csv.
    train_data = pd.read_csv('data/features_training.csv', index_col='match_id')
    X_train = train_data.drop(['duration', 'radiant_win', 'tower_status_radiant', 'tower_status_dire',
                           'barracks_status_radiant', 'barracks_status_dire'], axis=1)
    y_train = train_data['radiant_win']

    # Replace missing values with zeros using fillna().
    X_train.fillna(value=0, inplace=True)


    # 1. Evaluate the quality of logistic regression (sklearn.linear_model.LogisticRegression with L2 regularization)
    # using cross-validation with the same scheme that was used for gradient boosting, tuning the best regularization
    # parameter (C) along the way. What is the best quality you obtained? How does it compare to the quality of
    # gradient boosting? How can you explain the difference? Does logistic regression run faster than gradient
    # boosting?

    scaler = StandardScaler()
    scaler.fit(X_train)
    X_train_scaled = pd.DataFrame(scaler.transform(X_train))

    # Split the data into training and test sets
    X_small_train, X_small_test, y_small_train, y_small_test = train_test_split(X_train_scaled, y_train, test_size = 0.5,
                                                                                random_state = 1)

    # Fix the split generator for 5-fold cross-validation (KFold) and do not forget to shuffle the sample
    # (shuffle=True), since the rows of the table are sorted by time and, without shuffling, quality estimates
    # may suffer from undesirable effects.
    k_fold = KFold(len(X_small_train), n_folds=5, shuffle=True, random_state=1)

    # Run a grid search over parameters - the parameter being tuned is the regularization coefficient 'C':
    logreg_params = {'C': numpy.power(10.0, numpy.arange(-5, 6, 1))}

    clf_logreg = LogisticRegression(random_state=1, verbose=0)
    clf_logreg_grid = GridSearchCV(clf_logreg,
                                   logreg_params,
                                   cv=k_fold,
                                   scoring='roc_auc'
                                   )
    clf_logreg_grid.fit(X_small_train, y_small_train)


    y_train_score_logreg = clf_logreg_grid.decision_function(X_small_train)
    y_test_score_logreg = clf_logreg_grid.decision_function(X_small_test)

    # What quality did logistic regression achieve over all of the original features?
    # How does it compare to the quality of gradient boosting? What explains the difference?
    # Does logistic regression run faster than gradient boosting?
    print(clf_logreg_grid.best_params_)
    print(roc_auc_score(y_small_train, y_train_score_logreg))
    print(roc_auc_score(y_small_test, y_test_score_logreg))

    ###################################################################################################################

    # 2. The sample contains categorical features that we have been using as numeric ones, which is hardly a good
    # idea. There are eleven categorical features in this task: lobby_type and r1_hero, r2_hero, ..., r5_hero,
    # d1_hero, d2_hero, ..., d5_hero. Remove them from the sample and run cross-validation for logistic regression
    # on the new sample, again tuning the best regularization parameter. Did the quality change? How can you explain it?

    X_train_cleaned = X_train.drop(['lobby_type', 'r1_hero', 'r2_hero', 'r3_hero', 'r4_hero', 'r5_hero',
                                    'd1_hero', 'd2_hero', 'd3_hero', 'd4_hero', 'd5_hero'], axis=1)

    # scale the features
    scaler = StandardScaler()
    scaler.fit(X_train_cleaned)
    X_train_cleaned_scaled = pd.DataFrame(scaler.transform(X_train_cleaned))

    # Split the data into training and test sets
    X_small_train, X_small_test, y_small_train, y_small_test = train_test_split(X_train_cleaned_scaled, y_train,
                                                                                test_size=0.5, random_state=1)

    k_fold = KFold(len(X_small_train), n_folds=5, shuffle=True, random_state=1)

    # Run a grid search over parameters - the parameter being tuned is the regularization coefficient 'C':
    logreg_params = {'C': numpy.power(10.0, numpy.arange(-5, 6, 1))}

    clf_logreg = LogisticRegression(random_state=1, verbose=0)
    clf_logreg_grid = GridSearchCV(clf_logreg,
                                   logreg_params,
                                   cv=k_fold,
                                   scoring='roc_auc'
                                   )

    clf_logreg_grid.fit(X_small_train, y_small_train)

    y_train_score_logreg = clf_logreg_grid.decision_function(X_small_train)
    y_test_score_logreg = clf_logreg_grid.decision_function(X_small_test)

    # What quality did logistic regression achieve over the non-categorical original features?
    print(clf_logreg_grid.best_params_)
    print(roc_auc_score(y_small_train, y_train_score_logreg))
    print(roc_auc_score(y_small_test, y_test_score_logreg))

    ###################################################################################################################

    # 3. In the previous step we removed the rM_hero and dM_hero features, which indicate exactly which heroes
    # played for each team. These are important features: heroes have different characteristics, and some of them
    # win more often than others. Find out from the data how many distinct hero identifiers exist in this game
    # (the unique or value_counts functions may come in handy).

    heroes = train_data[['r1_hero', 'r2_hero', 'r3_hero', 'r4_hero', 'r5_hero',
                                    'd1_hero', 'd2_hero', 'd3_hero', 'd4_hero', 'd5_hero']]

    heroes = heroes.stack().unique()
    #print(heroes.head())
    print(len(heroes))

    ###################################################################################################################

    # 4. Use a "bag of words" approach to encode the information about heroes. Suppose there are N distinct heroes
    # in the game. Build N features, where the i-th feature equals
    #   zero, if the i-th hero did not take part in the match;
    #   one, if the i-th hero played for the Radiant team;
    #   minus one, if the i-th hero played for the Dire team.

    # N is the number of distinct heroes in the sample (the maximum hero id, so ids can index columns directly)
    N = heroes.max()
    X_pick = numpy.zeros((X_train.shape[0], N))

    for i, match_id in enumerate(X_train.index):
        for p in range(5):
            X_pick[i, X_train.ix[match_id, 'r%d_hero' % (p+1)]-1] = 1
            X_pick[i, X_train.ix[match_id, 'd%d_hero' % (p+1)]-1] = -1

    #print(X_pick)

    X_pick_df = pd.DataFrame(X_pick)
    X_pick_df.columns = range(1, N + 1)
    cols = [col for col in X_pick_df.columns if col in heroes]
    X_pick_df = X_pick_df[cols]

    X_fin = pd.concat([X_train_cleaned_scaled, X_pick_df], axis = 1)

    #print(X_fin.head())

    ###################################################################################################################

    # 5. Run cross-validation for logistic regression on the new sample, tuning the best regularization parameter.
    # What quality did you get? Did it improve? How can you explain it?

    # Split the data into training and test sets
    X_small_train, X_small_test, y_small_train, y_small_test = train_test_split(X_fin, y_train,
                                                                                test_size=0.5, random_state=1)

    k_fold = KFold(len(X_small_train), n_folds=5, shuffle=True, random_state=1)

    # Run a grid search over parameters - the parameter being tuned is the regularization coefficient 'C':
    logreg_params = {'C': numpy.power(10.0, numpy.arange(-5, 6, 1))}

    clf_logreg = LogisticRegression(random_state=1, verbose=0)
    clf_logreg_grid = GridSearchCV(clf_logreg,
                                   logreg_params,
                                   cv=k_fold,
                                   scoring='roc_auc'
                                   )

    clf_logreg_grid.fit(X_small_train, y_small_train)

    y_train_score_logreg = clf_logreg_grid.decision_function(X_small_train)
    y_test_score_logreg = clf_logreg_grid.decision_function(X_small_test)

    # What quality did logistic regression achieve over the non-categorical original features?
    print(clf_logreg_grid.best_params_)
    print(roc_auc_score(y_small_train, y_train_score_logreg))
    print(roc_auc_score(y_small_test, y_test_score_logreg))

    ###################################################################################################################

    # 6. Build predictions of the probability that the Radiant team wins for the test set using the best of the
    # models studied (best in terms of AUC-ROC on cross-validation). Make sure the predicted probabilities are
    # sensible: they lie in the interval [0, 1] and are not all identical (i.e. the model did not end up constant).

    # Read the table of features from features.csv.
    test_data = pd.read_csv('data/features_test.csv', index_col='match_id')
    X_test = test_data.drop(['lobby_type', 'r1_hero', 'r2_hero', 'r3_hero', 'r4_hero', 'r5_hero',
                                    'd1_hero', 'd2_hero', 'd3_hero', 'd4_hero', 'd5_hero'], axis=1)

    # Replace missing values with zeros using fillna().
    X_test.fillna(value=0, inplace=True)

    scaler = StandardScaler()
    scaler.fit(X_test)
    X_test_scaled = pd.DataFrame(scaler.transform(X_test))

    # collect the list of heroes for the "bag of words" encoding
    heroes = test_data[['r1_hero', 'r2_hero', 'r3_hero', 'r4_hero', 'r5_hero',
                                    'd1_hero', 'd2_hero', 'd3_hero', 'd4_hero', 'd5_hero']].stack().unique()


    # Use a "bag of words" approach to encode the information about heroes. Suppose there are N distinct heroes
    # in the game. Build N features, where the i-th feature equals
    #   zero, if the i-th hero did not take part in the match;
    #   one, if the i-th hero played for the Radiant team;
    #   minus one, if the i-th hero played for the Dire team.

    # N is the number of distinct heroes in the sample (the maximum hero id, so ids can index columns directly)
    N = heroes.max()
    X_pick = numpy.zeros((X_test.shape[0], N))

    for i, match_id in enumerate(X_test.index):
        for p in range(5):
            X_pick[i, test_data.ix[match_id, 'r%d_hero' % (p+1)]-1] = 1
            X_pick[i, test_data.ix[match_id, 'd%d_hero' % (p+1)]-1] = -1

    #print(X_pick)

    X_pick_df = pd.DataFrame(X_pick)
    X_pick_df.columns = range(1, N + 1)
    cols = [col for col in X_pick_df.columns if col in heroes]
    X_pick_df = X_pick_df[cols]

    X_fin = pd.concat([X_test_scaled, X_pick_df], axis=1)

    #print(X_fin.head())

    logistic = clf_logreg_grid.best_estimator_.predict_proba(X_fin)
    #print(logistic[:, 1])
    print('Min probability: %s' % str(min(logistic[:, 1])))
    print('Max probability: %s' % str(max(logistic[:, 1])))
Example #16
def main():

    model_metrics = metrics.BinaryMetricsRecorder(domains=riskofbias.CORE_DOMAINS)
    stupid_metrics = metrics.BinaryMetricsRecorder(domains=riskofbias.CORE_DOMAINS)

    f = open('test_data.csv','wb')
    w = csv.DictWriter(f, ["pmid", "domain", "sent_text", "random", "human", "algorithm", "top3", "top1"], escapechar="\\")
    w.writeheader()

    # parse the risk of bias data from Cochrane     
    data = riskofbias.RoBData(test_mode=False)
    data.generate_data(doc_level_only=False)

    docs = riskofbias.MultiTaskSentFilter(data)

    uids = np.array(docs.get_ids())
    no_studies = len(uids)

    kf = KFold(no_studies, n_folds=5, shuffle=False)

    tuned_parameters = {"alpha": np.logspace(-4, -1, 5), "class_weight": [{1: i, -1: 1} for i in np.logspace(0, 2, 5)]}

    vec = modhashvec.ModularVectorizer(norm=None, non_negative=True, binary=True, ngram_range=(1, 2), n_features=2**26) # since multitask + bigrams = huge feature space

    for k_i, (train, test) in enumerate(kf):

        if k_i == 1:
            break

        y_train = docs.y(uids[train])

            
        vec.builder_clear()
        vec.builder_add_interaction_features(docs.X(uids[train]), low=7) # add base features
        vec.builder_add_interaction_features(docs.X_i(uids[train]), low=2) # then add interactions
        X_train = vec.builder_fit_transform()

        clf = GridSearchCV(SGDClassifier(loss="hinge", penalty="L2"), tuned_parameters, scoring='recall', n_jobs=16)

        # import pdb; pdb.set_trace()

        clf.fit(X_train, y_train)
        del X_train, y_train
        clf = clf.best_estimator_ # and we only need the best performing, discard the rest

        # Test on each domain in turn

        # filtered_data = riskofbias.SentFilter(data)



        for domain in riskofbias.CORE_DOMAINS:

            print "Testing on %s" % domain

            

            vec.builder_clear()
            vec.builder_add_interaction_features(docs.X(uids[test], domain=domain)) # add base features
            vec.builder_add_interaction_features(docs.X_i(uids[test], domain=domain)) # then add interactions
            X_test = vec.builder_transform()

            y_test = docs.y(uids[test], domain=domain)
            y_preds = clf.predict(X_test)




            y_df = clf.decision_function(X_test) # get distances from the decision boundary
            # positive distances = more likely to be relevant sentences

            r_len = len(y_preds)
            y_top3 = []
            y_top1 = []
            y_rand = []

            y_uids = np.array(docs.y_uids(uids[test], domain=domain))

            # import pdb; pdb.set_trace()

            for y_uid in np.unique(y_uids):

                mask = np.where(y_uids == y_uid)[0]
                doc_df = y_df[mask]

                doc_top3 = np.argpartition(doc_df, -3)[-3:]
                y_top3.extend(list(mask[doc_top3]))
                
                doc_top1 = np.argmax(doc_df)
                y_top1.append(mask[doc_top1])

                doc_rand = np.random.randint(0, len(doc_df))
                y_rand.append(mask[doc_rand])


            human_sent_indices = np.where(y_test==1)[0]
            algorithm_sent_indices = np.where(y_preds==1)[0]

            model_metrics.add_preds_test(y_preds, y_test, domain=domain)
            stupid_metrics.add_preds_test([-1] * len(y_test), y_test, domain=domain)

            # import pdb; pdb.set_trace()

            for doc_i, (doc, pmid) in enumerate(izip(docs.X(uids[test], domain=domain), docs.iter_pmid(uids[test], domain=domain))):

                row = {"domain": domain,
                       "sent_text": doc,
                       "random": doc_i in y_rand,
                       "human": doc_i in human_sent_indices,
                       "algorithm": doc_i in algorithm_sent_indices,
                       "top3": doc_i in y_top3,
                       "top1": doc_i in y_top1,
                       "pmid": pmid}

                if row["random"] or row["human"] or row["top3"] or row["top1"]:
                    # please note, the sentences will only be included in the analysis if
                    # in the top1 or top3
                    # we do have data on whether the raw classifier has predicted yes/no
                    # 
                    # this in effect means where the classifier picks <= 3 sentences
                    # we use all raw classifier data
                    # where >3 sentences are predicted by raw classifier, only the
                    # top 3 are used; the rest are discarded
                    w.writerow(row)

            del X_test, y_test, y_preds

        del clf



    model_metrics.save_csv(os.path.join('results', outputnames.filename(label="model")))
    stupid_metrics.save_csv(os.path.join('results', outputnames.filename(label="stupid-baseline")))
    f.close()
Example #17
class WangBaseSenser(BaseSenser):
    """Abstract class for disambiguating relation senses.

    Attributes:
      n_y (int): number of distinct classes

    """
    __metaclass__ = abc.ABCMeta

    # private members
    PARAM_GRID = {"clf__C": [float(i) / 100. for i in xrange(1, 3)]}
    N_JOBS = -1

    def __init__(self, a_clf=None, a_grid_search=False):
        """Class constructor.

        Initialize classifier.

        Args:
          a_clf (classifier or None):
            classifier to use or None for default
          a_grid_search (bool): use grid search for estimating hyper-parameters

        """
        classifier = a_clf or LinearSVC(C=DFLT_C, **DFLT_PARAMS)
        self._gs = a_grid_search
        self._model = Pipeline([("vect", DictVectorizer()),
                                ("clf", classifier)])

    def train(self,
              a_train_data,
              a_dev_data=None,
              a_n_y=-1,
              a_i=-1,
              a_train_out=None,
              a_dev_out=None):
        """Method for training the model.

        Args:
          a_train_data (tuple[list, dict]):
            list of training JSON data
          a_dev_data (tuple[list, dict] or None):
            list of development JSON data
          a_n_y (int):
            number of distinct classes
          a_i (int):
            row index for the output predictions
          a_train_out (np.array or None):
            predictions for the training set
          a_dev_out (np.array or None):
            predictions for the development set

        Returns:
          void:

        Note:
          updates ``a_train_out`` and ``a_dev_out`` in place

        """
        self.n_y = a_n_y
        x_train, y_train = self._generate_ts(a_train_data)
        x_dev, y_dev = self._generate_ts(a_dev_data)
        # determine cross-validation and grid-search strategy and fit the model
        if self._gs:
            if a_dev_data is None or not a_dev_data[0]:
                cv = StratifiedKFold(y_train, n_folds=NFOLDS, shuffle=True)
            else:
                cv = self._devset_cv(y_train, len(y_dev), NFOLDS)
                x_train = x_train + x_dev
                y_train = y_train + y_dev
            scorer = make_scorer(f1_score, average="macro")
            self._model = GridSearchCV(self._model,
                                       self.PARAM_GRID,
                                       scoring=scorer,
                                       cv=cv,
                                       n_jobs=self.N_JOBS,
                                       verbose=1)
        self._model.fit([el[-1] for el in x_train], y_train)
        # output best hyper-parameters
        if self._gs:
            print("Best params:",
                  repr(self._model.best_params_),
                  file=sys.stderr)
        if a_i >= 0:
            if a_train_out is not None:
                if self._gs and a_dev_data and a_dev_data[0]:
                    x_train = x_train[:-len(x_dev)]
                for i, x_i in x_train:
                    self._predict(x_i, a_train_out[i], a_i)
            if a_dev_out is not None:
                for i, x_i in x_dev:
                    self._predict(x_i, a_dev_out[i], a_i)

    def predict(self, a_rel, a_data, a_ret, a_i):
        """Method for predicting sense of single relation.

        Args:
          a_rel (dict):
            discourse relation whose sense should be predicted
          a_data (2-tuple(dict, dict)):
            list of input JSON data
          a_ret (np.array):
            output prediction vector
          a_i (int):
            row index in the output vector

        Returns:
          void:

        Note:
          updates ``a_ret[a_i]`` in place

        """
        feats = self._extract_features(a_rel, a_data[-1])
        self._predict(feats, a_ret, a_i)

    @abc.abstractmethod
    def _extract_features(self, a_rel, a_parses):
        """Extract classification features for a given relation.

        Args:
          a_rel (dict):
            discourse relation to extract features for
          a_parses (dict):
            parsed sentences

        Returns:
          void:

        """
        raise NotImplementedError

    def _predict(self, a_feats, a_ret, a_i):
        """Method for predicting sense of single relation.

        Args:
          a_feats (dict):
            features of the input instance
          a_ret (np.array):
            output prediction vector
          a_i (int):
            row index in the output vector

        Returns:
          void:
            updates ``a_ret[a_i]`` in place

        """
        # obtain model's estimates
        dec = self._model.decision_function(a_feats)
        if len(dec.shape) > 1:
            dec = np.mean(dec, axis=0)
        # normalize using softmax
        dec = np.exp(dec)
        exp_ret = np.sum(dec) or 1e10
        dec /= exp_ret
        # map model's classes to original indices
        for i, ival in enumerate(dec):
            a_ret[a_i][self._model.classes_[i]] += ival

    def _free(self):
        """Free resources used by the model.

        """
        self.n_y = -1

    def _generate_ts(self, a_data):
        """Generate training set.

        Args:
          a_data (2-tuple(list, dict)):
            input data (discourse relations and parses)

        Returns:
          tuple(list, list):
            lists of input features and expected classes

        """
        x, y = [], []
        if a_data is None:
            return (x, y)
        x_i = y_i = None
        # generate features
        for i, irel in a_data[0]:
            x_i = self._extract_features(irel, a_data[1])
            if not x_i:
                continue
            x.append((i, x_i))
            y_i = np.argmax(irel[SENSE])
            y.append(y_i)
        return (x, y)

    def _devset_cv(self, a_y_train, a_n_dev, a_n_folds):
        """Generate train-test split from training and development data.

        Args:
          a_y_train (list[int]):
            list of training instances' tags
          a_n_dev (int):
            number of devset instances
          a_n_folds (int):
            number of folds

        Returns:
          list[tuple]: list of training/testing folds

        """
        folds = []
        n_train = len(a_y_train)
        dev_ids = [n_train + i for i in xrange(a_n_dev)]
        # create stratified K-folds over the training data
        skf = StratifiedKFold(a_y_train, a_n_folds)
        for train_ids, test_ids in skf:
            folds.append((train_ids, np.concatenate((test_ids, dev_ids))))
        return folds
Example #18
#clf = GridSearchCV(svm.SVC(C=1, probability=True), param_grid, cv=5)

#try other alg for svm
#clf = GridSearchCV(svm.NuSVC(nu=.5, probability=True), param_grid, cv=5)
#clf = GridSearchCV(svm.SVR(degree=3), param_grid, cv=5)
clf = GridSearchCV(svm.LinearSVC(C=1, class_weight='balanced'),
                   param_grid,
                   cv=5)

#fit the classifier
#clf=svm.SVC(C=C, kernel=kernel, gamma=g) #use this line if only run once, w/out param grid
clf.fit(Xlearn, Ylearn)
confidence_scores = []
best_candidates = []
best_can = None
confidence_scores = clf.decision_function(Xtest)
#prev_page_image=[names_test[0][names_test[0].index('#'):names_test[0].index(',')]]

predictions_by_page = {}

for i in range(0, len(names_test)):
    name_parts = names_test[i].strip().split(',')
    page_img = name_parts[0]
    book = name_parts[2]
    pred = name_parts[1]
    score = confidence_scores[i]
    page_id = book + "_" + page_img
    print(page_id, pred, score)
    if page_id in predictions_by_page:
        predictions_by_page[page_id] += [(pred, score)]
    else:
        predictions_by_page[page_id] = [(pred, score)]
Example #19
def multi_SVM(needcv = False):
	NeedReFetch = False
	allGenreSongsTrain,allGenreSongsTest = fetchData_eTA(NUM_NEED_PER_GENRE,GENRES,NeedReFetch,USED_GENRES)
	# allGenreSongsTrain,allGenreSongsTest = featureSelection (allGenreSongsTrain,allGenreSongsTest,method = 'MIC',testmode = False,n_features_to_select = 85)


	# assert(len(allGenreSongsTrain[0][0]) == 106)

	TrainX = []
	TrainY = []
	TestX = []
	TestY = []
	for i in range(sum(USED_GENRES)):
		for j in allGenreSongsTrain[i]:
			TrainX.append(j)
			TrainY.append(i)
		for k in allGenreSongsTest[i]:
			TestX.append(k)
			TestY.append(i)
	confuseMat = [[0 for i in range(sum(USED_GENRES))] for j in range(sum(USED_GENRES))];
	if not needcv:
		print "Start SVM training ... "
		model = SVC(probability=True,decision_function_shape='ovr',kernel = 'rbf',gamma = 0.0078125, C = 8)
		model.fit(TrainX,TrainY)
		print "Start SVM predicting ... "
		PredY = model.predict(TestX)
		print model.decision_function(TestX)
		for i in range(len(TestY)):
			confuseMat[TestY[i]][PredY[i]] += 1
		print(clfr(TestY, PredY))
	else:
		tuned_parameters = [															## remained to be play with
							{'kernel': ['rbf'], 'gamma': [2**i for i in range(-15,-4)], 'C': [2**i for i in range(-5,8)]},
		 					# {'kernel': ['linear'], 'C': [2**i for i in range(-8,9,2)]},
		 					# {'kernel': ['poly'], 'gamma': [2**i for i in range(-8,9,2)], 'C': [2**i for i in range(-8,9,2)], 'degree':[2,3,4]},
		 					]
		print "Start SVM CV ... "
		clf = GSCV(SVC(decision_function_shape='ovr'), tuned_parameters, cv=7)
		clf.fit(TrainX, TrainY)
		print clf.decision_function(TestX)


		print("Best parameters set found on development set:")
		print(clf.best_params_)
		# print("Grid scores on development set:")
		# print()
		# for params, mean_score, scores in clf.grid_scores_:
		# 	print("%0.4f (+/-%0.03f) for %r" % (mean_score, scores.std(), params))
		# print()

		print "Start SVM predicting ... "

		PredY = clf.predict(TestX)


		print(clfr(TestY, PredY))

		for i in range(len(TestY)):
			confuseMat[TestY[i]][PredY[i]] += 1

	return confuseMat
Example #20
    ax.set_xlim(xx.min(), xx.max())
    ax.set_ylim(yy.min(), yy.max())
    ax.set_xticks(())
    ax.set_yticks(())
    i += 1

    # iterate over classifiers
    for name, clf in zip(names, classifiers):
        ax = plt.subplot(len(datasets), len(classifiers) + 1, i)
        clf.fit(X_train, y_train)
        score = clf.score(X_test, y_test)

        # Plot the decision boundary. For that, we will assign a color to each
        # point in the mesh [x_min, x_max]x[y_min, y_max].
        if hasattr(clf, "decision_function"):
            Z = clf.decision_function(np.c_[xx.ravel(), yy.ravel()])
        else:
            Z = clf.predict_proba(np.c_[xx.ravel(), yy.ravel()])[:, 1]

        # Put the result into a color plot
        Z = Z.reshape(xx.shape)
        ax.contourf(xx, yy, Z, cmap=cm, alpha=.8)

        # Plot also the training points
        ax.scatter(X_train[:, 0], X_train[:, 1], c=y_train, cmap=cm_bright)
        # and testing points
        ax.scatter(X_test[:, 0], X_test[:, 1], c=y_test, cmap=cm_bright,
                   alpha=0.6)

        ax.set_xlim(xx.min(), xx.max())
        ax.set_ylim(yy.min(), yy.max())
Example #21
clf = GridSearchCV(svm.LinearSVC(C=1, random_state=1), param_grid, cv=5)
#^^FOUND LINEARSVC works best for my data right now...



#fit the classifier
#clf=svm.SVC(C=C, kernel=kernel, gamma=g) #use this line if only run once, w/out param grid
clf.fit(Xlearn,Ylearn)

#print optimal parameter set
print ("Optimal Parameters:", clf.best_params_)

#make predictions using model
Yhat=clf.predict(Xtest) #expected outcomes, using the model
#Yhat=clf.predict(Xlearn) #see if it can predict the training ones right at all (if not, 3 features are currently garbage)
Yd=clf.decision_function(Xtest) #changed Xtest to Xlearn

# decision_function is similar to predict_proba, but for LinearSVC (a bigger number means the classifier is more confident about its prediction; closer to 0 = less confident)

#try adding in function to score data points, to see how far off things are from being marked as recipe or not
#score=clf.predict_proba(Xtest) 
#print(score)


#compute precision-recall and plot curve
precision=dict()
recall=dict()
average_precision=dict()
for i in range(2): #2 b/c have two target values (recipe or not)
	precision[i], recall[i],_=precision_recall_curve(Ytest,Yd)    #CHANGED TO YLEARN FROM YTEST
	average_precision[i]=average_precision_score(Ytest,Yd)
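The comment above promises a precision-recall plot, but the listing stops before drawing it. A self-contained sketch of that last step, reusing the Xlearn/Ytest/Yd naming from the example but with synthetic data and matplotlib as an assumed dependency:

import matplotlib.pyplot as plt
from sklearn.datasets import make_classification
from sklearn.metrics import average_precision_score, precision_recall_curve
from sklearn.model_selection import train_test_split
from sklearn.svm import LinearSVC

X, y = make_classification(n_samples=300, n_features=10, random_state=0)
Xlearn, Xtest, Ylearn, Ytest = train_test_split(X, y, test_size=0.3, random_state=0)

clf = LinearSVC(C=1).fit(Xlearn, Ylearn)
Yd = clf.decision_function(Xtest)  # signed distances from the separating hyperplane

precision, recall, _ = precision_recall_curve(Ytest, Yd)
ap = average_precision_score(Ytest, Yd)

plt.plot(recall, precision, lw=1, label="AP = %0.2f" % ap)
plt.xlabel("Recall")
plt.ylabel("Precision")
plt.legend()
plt.show()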
Example #22
        fig = visualize_cv(clf)
        plt.show()
    elif action == 3:
        # run svm on test set
        assert len(
            sys.argv) > 4 and sys.argv[3] == '--label', 'Label unspecified'
        label = int(sys.argv[4])
        with open(model_path.format(label), 'rb') as f:
            clf = pickle.load(f)
        with open(scaler_path, 'rb') as f:
            mas = pickle.load(f)
        X = np.load(feature_test_path)
        Y = np.load(label_test_path)
        X_scaled = mas.transform(X)
        # probs = clf.predict_proba(X)
        probs = clf.decision_function(X_scaled)
        metrics = eval_prediction(probs, Y[:, label])
        # plt.clf()
        # plt.xlabel('Recall')
        # plt.ylabel('Precision')
        # plt.ylim([0, 1.05])
        # plt.xlim([0, 1.])
        # plt.plot(metrics['pr'].recall, metrics['pr'].precision, lw=1)
        # plt.show()
        print(metrics)
        print('> AP = ' + str(metrics['ap']))
'''
    elif action == 2:
        # features visualization
        assert len(sys.argv) > 4 and sys.argv[3] == '--label', 'Label unspecified'
        label = int(sys.argv[4])
Example #23
print("AUC for SVC: {:.3f}".format(svc_auc))
# use an evaluation metric during model selection
roc_auc =  cross_val_score(SVC(), digits.data, digits.target == 9,
                           scoring="roc_auc")
print("AUC scoring: {}".format(roc_auc))

X_train, X_test, y_train, y_test = train_test_split(
    digits.data, digits.target == 9, random_state=0)
param_grid = {'gamma': [0.0001, 0.01, 0.1, 1, 10]}
grid = GridSearchCV(SVC(), param_grid=param_grid, scoring="roc_auc") 
grid.fit(X_train, y_train)
print("\nGrid-Search with AUC")
print("Best parameters:", grid.best_params_)
print("Best cross-validation score (AUC): {:.3f}".format(grid.best_score_))
print("Test set AUC: {:.3f}".format(
    roc_auc_score(y_test, grid.decision_function(X_test))))
print("Test set accuracy: {:.3f}".format(grid.score(X_test, y_test)))
### Algorithm chains and pipelines: combine the model, preprocessing and data splitting; the data is divided into training, validation and test parts
# a simple pipeline
from sklearn.preprocessing import MinMaxScaler
from sklearn.pipeline import Pipeline
pipe = Pipeline([("scaler", MinMaxScaler()), ("svm", SVC())])  # "scaler" is an instance of MinMaxScaler(), "svm" is an instance of SVC()
pipe.fit(X_train, y_train)
print("Test score: {:.2f}".format(pipe.score(X_test, y_test)))
# use the pipeline inside a grid search
from sklearn.preprocessing import MinMaxScaler
from sklearn.pipeline import Pipeline
pipe = Pipeline([("scaler", MinMaxScaler()), ("svm", SVC())])
param_grid = {'svm__C': [0.001, 0.01, 0.1, 1, 10, 100],  # "stepname__parameter" naming
              'svm__gamma': [0.001, 0.01, 0.1, 1, 10, 100]}
grid = GridSearchCV(pipe, param_grid=param_grid, cv=5)  # 5-fold cross-validation
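The listing ends right after the pipeline grid is constructed; a minimal sketch of the usual follow-up (fitting the grid and reading back the cross-validated results), assuming the same digits setup as the lines above:

from sklearn.datasets import load_digits
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import MinMaxScaler
from sklearn.svm import SVC

digits = load_digits()
X_train, X_test, y_train, y_test = train_test_split(
    digits.data, digits.target == 9, random_state=0)

pipe = Pipeline([("scaler", MinMaxScaler()), ("svm", SVC())])
param_grid = {'svm__C': [0.001, 0.01, 0.1, 1, 10, 100],
              'svm__gamma': [0.001, 0.01, 0.1, 1, 10, 100]}
grid = GridSearchCV(pipe, param_grid=param_grid, cv=5)
grid.fit(X_train, y_train)

print("Best cross-validation accuracy: {:.2f}".format(grid.best_score_))
print("Best parameters:", grid.best_params_)
print("Test set score: {:.2f}".format(grid.score(X_test, y_test)))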
Example #24
print "X_train shape =", X_train.shape, "  y_train shape=", y_train.shape
#print "X_test shape =", X==.shape, "  y_test shape=", y_test.shape
print

"""
The following lines train the SVM using our extracted training dataset and
is parameterized based on the grid-search results. Then the trained SVM is
used to carry out predictions on the test data set. The percentage
of accurate predictions is printed.
"""
clf = svm.SVC(kernel='rbf', C=10, gamma = 0.00001, degree = 3.0, coef0 = 0.0).fit(X_train, y_train)
print "clf.get_params(deep=True) =", clf.get_params(deep=True)
print "clf.score(X_test, y_test) = {0}%".format(int((clf.score(X_test, y_test) * 10000))/100.)
print "clf.predict(X_test) = ", clf.predict(X_test)
print "clf.decision_function(X_test) = ", clf.decision_function(X_test)
print "======================="
print "clf.score(X_train, y_train) = {0}%".format(int((clf.score(X_train, y_train) * 10000))/100.)
print "clf.predict(X_train) = ", clf.predict(X_train)
print "clf.decision_function(X_train) = ", clf.decision_function(X_train)
print "======================="
print
print
print "#####################################"
"""
http://scikit-learn.org/stable/modules/generated/sklearn.svm.SVC.html#sklearn.svm.SVC
"""
print "clf.support_ = ", clf.support_
print
print "len(clf.support_) = ", len(clf.support_)
print
Example #25
class TensorParser(object):
    def train(self, train_set):
        logger.info("Converting features to list")
        features = []
        values = []
        for source, examples in train_set:
            group_features = list(self.get_features(examples))
            values += [feature[1] for feature in group_features]
            group_features = [feature[0] for feature in group_features]
            features += group_features
        logger.info("Total number of instances: %d", len(features))
        assert len(features) == len(values)


        logger.info("Training - building vectors")
        self.vectorizer = DictVectorizer()
        vectors = self.vectorizer.fit_transform(features)


        logger.info("Training classifier")
        svm = LinearSVC()

        parameters = {'C': [0.1, 1.0, 10.0, 100.0]}
        self.classifier = GridSearchCV(svm, parameters, scoring='f1')

        self.classifier.fit(vectors, values)
        self.classifier = self.classifier.best_estimator_

        logger.info("SVM classes: %r", self.classifier.classes_)

        feature_scores = self.vectorizer.inverse_transform(self.classifier.coef_)
        best_features = sorted(feature_scores[0].iteritems(), key=itemgetter(1), reverse=True)        
        logger.debug("Top SVM parameters: %r", best_features[:100])
        logger.debug("Top negative SVM parameters: %r", best_features[::-1][:100])

        logger.info("Finished training")

    def test(self, test_set):
        logger.info("Evaluation on test set")
        count = 0
        results = []
        for source, group in test_set:
            data = self.get_features(group)
            features, values = zip(*list(data))

            vectors = self.vectorizer.transform(features)
            predictions = self.classifier.decision_function(vectors)
            best_index = np.argmax(predictions)
            results.append(group[best_index])
            count += 1
            if count % 100 == 0:
                logger.info("Processed %d items", count)
        return results

    def get_features(self, examples):
        for example in examples:
            features = self.get_example_features(example)
            yield features, example['score'] > 0.0

    def get_example_features(self, example):
        source_tokens = self.get_sentence_features(example['source'])
        target_tokens = self.get_sentence_features(example['target'])
        features = []
        for source in source_tokens:
            for target in target_tokens:
                features.append(source + ':' + target)
        return {f: 1.0 for f in features}

    def get_sentence_features(self, sentence):
        tokens = tokenize(sentence)
        return [token for token in tokens if token not in STOPWORDS]

    def __repr__(self):
        return type(self).__name__
Example #26
class TensorSystem(object):
    def __init__(self, oracle_class=OracleSystem):
        self.random = random.Random(1)
        self.connector = Connector()
        self.possible_connections = None
        self.oracle_class = oracle_class
        self.expression_features = {}

    def set_best_expression_set(self, train_set):
        expression_counts = Counter()
        for expressions in self.query_expressions.values():
            expression_counts.update(expressions)
        logger.info("Found %d unique expressions", len(expression_counts))
        self.frequent_expressions = set()
        covered = 0
        uncovered_set = train_set
        while len(uncovered_set) > 0:
            frequent = expression_counts.most_common(1)[0][0]
            self.frequent_expressions.add(frequent)
            logger.info("Most frequent expression: %r", frequent)

            covered = 0
            removed = Counter()
            new_uncovered_set = []
            new_expression_counts = Counter()
            for query, target in uncovered_set:
                _, oracle_expressions = self.oracle.get_best_results_and_expressions(query)
                oracle_expressions = set(oracle_expressions)
                if frequent not in oracle_expressions and len(oracle_expressions) > 0:
                    new_uncovered_set.append((query, target))
                    new_expression_counts.update(oracle_expressions)
            uncovered_set = new_uncovered_set
            expression_counts = new_expression_counts
            logger.info("Frequent expressions: %d, uncovered: %d, expressions_remaining: %d",
                        len(self.frequent_expressions),
                        len(uncovered_set),
                        len(expression_counts))
        

    def train(self, train_set):
        logger.info("Training tensor based classifier")
        
        self.oracle = self.oracle_class(train_set)
        self.query_expressions = {}
        for query, target in train_set:
            _, expressions = self.oracle.get_best_results_and_expressions(query)
            if len(expressions) == 0:
                continue
            self.query_expressions[query] = expressions
        logger.info("Obtained %d items from oracle", len(self.query_expressions))

        features = []
        values = []

        self.set_best_expression_set(train_set)

        all_features = []
        values = []
        for query, correct_expressions in self.query_expressions.iteritems():
            logger.debug("Building features for query %r, %d correct expressions",
                         query, len(correct_expressions))
            query_tokens = self.get_sentence_features(query)

            for expression in correct_expressions & self.frequent_expressions:
                features = self.get_query_expression_features(query_tokens, expression)
                all_features.append(features)
                values.append(1)

            for expression in self.frequent_expressions - correct_expressions:
                features = self.get_query_expression_features(query_tokens, expression)
                all_features.append(features)
                values.append(0)

        self.frequent_expressions = list(self.frequent_expressions)

        logger.info("Training - building vectors with %d features", len(all_features))
        self.vectorizer = DictVectorizer()
        vectors = self.vectorizer.fit_transform(all_features)


        logger.info("Training classifier")
        #svm = LinearSVC(random_state=1, tol=1e-5)
        svm = LinearSVC(tol=1e-6)

        parameters = {'C': [0.1, 1.0, 10.0, 100.0]}
        self.classifier = GridSearchCV(svm, parameters, scoring='f1')

        self.classifier.fit(vectors, values)
        logger.info("Best score in cross validation: %f", self.classifier.best_score_)

        self.classifier = self.classifier.best_estimator_

        logger.info("SVM classes: %r", self.classifier.classes_)

        self.feature_scores = self.vectorizer.inverse_transform(self.classifier.coef_)[0]
        best_features = sorted(self.feature_scores.iteritems(), key=itemgetter(1), reverse=True)        
        logger.debug("Top SVM parameters: %r", best_features[:100])
        logger.debug("Top negative SVM parameters: %r", best_features[::-1][:100])

        logger.info("Finished training")
        
    def make_features(self, all_query_tokens, all_connections):
        return [self.get_query_expression_features(query_tokens, connection)
                for query_tokens, connection in zip(all_query_tokens, all_connections)]

    def get_best_expressions(self, query):
        query_features = self.get_sentence_features(query)
        logger.debug("Query features: %r", query_features)
        all_features = [self.get_query_expression_features(query_features, expression)
                        for expression in self.frequent_expressions]
        vectors = self.vectorizer.transform(all_features)
        predictions = self.classifier.decision_function(vectors)
        best_indices = np.argsort(predictions)[::-1]
        best_expressions = [self.frequent_expressions[i] for i in best_indices]
        return best_expressions
        
        # random_expressions = list(self.all_expressions)
        # random.shuffle(random_expressions)
        # return random_expressions


    def execute(self, query):
        logger.debug("Executing query: %r", query)
        best_expressions = self.get_best_expressions(query)

        entities = self.connector.get_query_entities(query)
        for expression in best_expressions:
            try:
                result_ids = expression.apply(entities, self.connector.related)
            except Exception:
                logger.exception("Exception applying expression")
                result_ids = []
            result = set(self.connector.related.get_names(result) for result in result_ids)
            logger.debug("Searching for best expression, expression: %r, result: %r",
                         expression, result)
            if len(result) > 0:
                return result
        return set()

    def get_expression_features(self, expression):
        if expression in self.expression_features:
            return self.expression_features[expression]
        try:
            connections = [expression.connection]
        except AttributeError:
            connections = [expression.expression1.connection,
                           expression.expression2.connection]
        relations = reduce(list.__add__, [list(c) for c in connections])
        connection_names = [self.connector.related.get_names(relation) for relation in relations]
        logger.info("Connections: %r, Connection names: %s", connections, connection_names)
        pseudo_sentence = ' '.join(connection_names)
        features = self.get_sentence_features(pseudo_sentence)
        self.expression_features[expression] = features
        return features

    def get_query_expression_features(self, query, expression):
        expression_features = self.get_expression_features(expression)
        return self.get_tensor_features(query, expression_features)

    def get_tensor_features(self, source_tokens, target_tokens):
        features = []
        for source in source_tokens:
            for target in target_tokens:
                features.append(source + ':' + target)
        return {f: 1.0 for f in features}

    def get_sentence_features(self, sentence):
        tokens = tokenize(sentence)
        return [token for token in tokens if token not in STOPWORDS]

    def __repr__(self):
        return type(self).__name__
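# A minimal sketch (not part of the original snippet) of the "tensor" feature
# construction used by get_tensor_features above: every (query token,
# expression token) pair becomes one binary indicator feature, which is what
# DictVectorizer later turns into a sparse vector. Tokenization and the toy
# inputs below are illustrative assumptions.

def tensor_features_sketch(query_tokens, expression_tokens):
    # One feature per token pair, e.g. ('capital', 'located_in') -> 'capital:located_in'
    return {q + ':' + e: 1.0 for q in query_tokens for e in expression_tokens}

# tensor_features_sketch(['capital', 'france'], ['located_in', 'country'])
# -> {'capital:located_in': 1.0, 'capital:country': 1.0,
#     'france:located_in': 1.0, 'france:country': 1.0}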
Example #27
0
class TensorSystem(object):
    def __init__(self, oracle_class=OracleSystem):
        self.random = random.Random(1)
        self.connector = Connector()
        self.possible_connections = None
        self.oracle_class = oracle_class
        self.expression_features = {}

    def set_best_expression_set(self, train_set):
        expression_counts = Counter()
        for expressions in self.query_expressions.values():
            expression_counts.update(expressions)
        logger.info("Found %d unique expressions", len(expression_counts))
        self.frequent_expressions = set()
        covered = 0
        uncovered_set = train_set
        while len(uncovered_set) > 0:
            frequent = expression_counts.most_common(1)[0][0]
            self.frequent_expressions.add(frequent)
            logger.info("Most frequent expression: %r", frequent)

            covered = 0
            removed = Counter()
            new_uncovered_set = []
            new_expression_counts = Counter()
            for query, target in uncovered_set:
                _, oracle_expressions = self.oracle.get_best_results_and_expressions(
                    query)
                oracle_expressions = set(oracle_expressions)
                if frequent not in oracle_expressions and len(
                        oracle_expressions) > 0:
                    new_uncovered_set.append((query, target))
                    new_expression_counts.update(oracle_expressions)
            uncovered_set = new_uncovered_set
            expression_counts = new_expression_counts
            logger.info(
                "Frequent expressions: %d, uncovered: %d, expressions_remaining: %d",
                len(self.frequent_expressions), len(uncovered_set),
                len(expression_counts))

    def train(self, train_set):
        logger.info("Training tensor based classifier")

        self.oracle = self.oracle_class(train_set)
        self.query_expressions = {}
        for query, target in train_set:
            _, expressions = self.oracle.get_best_results_and_expressions(
                query)
            if len(expressions) == 0:
                continue
            self.query_expressions[query] = expressions
        logger.info("Obtained %d items from oracle",
                    len(self.query_expressions))

        features = []
        values = []

        self.set_best_expression_set(train_set)

        all_features = []
        values = []
        for query, correct_expressions in self.query_expressions.iteritems():
            logger.debug(
                "Building features for query %r, %d correct expressions",
                query, len(correct_expressions))
            query_tokens = self.get_sentence_features(query)

            for expression in correct_expressions & self.frequent_expressions:
                features = self.get_query_expression_features(
                    query_tokens, expression)
                all_features.append(features)
                values.append(1)

            for expression in self.frequent_expressions - correct_expressions:
                features = self.get_query_expression_features(
                    query_tokens, expression)
                all_features.append(features)
                values.append(0)

        self.frequent_expressions = list(self.frequent_expressions)

        logger.info("Training - building vectors with %d features",
                    len(all_features))
        self.vectorizer = DictVectorizer()
        vectors = self.vectorizer.fit_transform(all_features)

        logger.info("Training classifier")
        #svm = LinearSVC(random_state=1, tol=1e-5)
        svm = LinearSVC(tol=1e-6)

        parameters = {'C': [0.1, 1.0, 10.0, 100.0]}
        self.classifier = GridSearchCV(svm, parameters, scoring='f1')

        self.classifier.fit(vectors, values)
        logger.info("Best score in cross validation: %f",
                    self.classifier.best_score_)

        self.classifier = self.classifier.best_estimator_

        logger.info("SVM classes: %r", self.classifier.classes_)

        self.feature_scores = self.vectorizer.inverse_transform(
            self.classifier.coef_)[0]
        best_features = sorted(self.feature_scores.iteritems(),
                               key=itemgetter(1),
                               reverse=True)
        logger.debug("Top SVM parameters: %r", best_features[:100])
        logger.debug("Top negative SVM parameters: %r",
                     best_features[::-1][:100])

        logger.info("Finished training")

    def make_features(self, all_query_tokens, all_connections):
        return [
            self.get_query_expression_features(query_tokens, connection) for
            query_tokens, connection in zip(all_query_tokens, all_connections)
        ]

    def get_best_expressions(self, query):
        query_features = self.get_sentence_features(query)
        logger.debug("Query features: %r", query_features)
        all_features = [
            self.get_query_expression_features(query_features, expression)
            for expression in self.frequent_expressions
        ]
        vectors = self.vectorizer.transform(all_features)
        predictions = self.classifier.decision_function(vectors)
        best_indices = np.argsort(predictions)[::-1]
        best_expressions = [self.frequent_expressions[i] for i in best_indices]
        return best_expressions

        # random_expressions = list(self.all_expressions)
        # random.shuffle(random_expressions)
        # return random_expressions

    def execute(self, query):
        logger.debug("Executing query: %r", query)
        best_expressions = self.get_best_expressions(query)

        entities = self.connector.get_query_entities(query)
        for expression in best_expressions:
            try:
                result_ids = expression.apply(entities, self.connector.related)
            except Exception:
                logger.exception("Exception applying expression")
                result_ids = []
            result = set(
                self.connector.related.get_names(result)
                for result in result_ids)
            logger.debug(
                "Searching for best expression, expression: %r, result: %r",
                expression, result)
            if len(result) > 0:
                return result
        return set()

    def get_expression_features(self, expression):
        if expression in self.expression_features:
            return self.expression_features[expression]
        try:
            connections = [expression.connection]
        except AttributeError:
            connections = [
                expression.expression1.connection,
                expression.expression2.connection
            ]
        relations = reduce(list.__add__, [list(c) for c in connections])
        connection_names = [
            self.connector.related.get_names(relation)
            for relation in relations
        ]
        logger.info("Connections: %r, Connection names: %s", connections,
                    connection_names)
        pseudo_sentence = ' '.join(connection_names)
        features = self.get_sentence_features(pseudo_sentence)
        self.expression_features[expression] = features
        return features

    def get_query_expression_features(self, query, expression):
        expression_features = self.get_expression_features(expression)
        return self.get_tensor_features(query, expression_features)

    def get_tensor_features(self, source_tokens, target_tokens):
        features = []
        for source in source_tokens:
            for target in target_tokens:
                features.append(source + ':' + target)
        return {f: 1.0 for f in features}

    def get_sentence_features(self, sentence):
        tokens = tokenize(sentence)
        return [token for token in tokens if token not in STOPWORDS]

    def __repr__(self):
        return type(self).__name__
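# A hedged usage sketch for the TensorSystem class above. The (query, target)
# pair format follows the iteration inside train(); the concrete queries and
# targets are made up for illustration, and OracleSystem/Connector must be
# available as in the original module.

# system = TensorSystem(oracle_class=OracleSystem)
# train_set = [("who directed blade runner", {"Ridley Scott"}),
#              ("capital of france", {"Paris"})]
# system.train(train_set)                        # greedy expression cover + SVM over tensor features
# ranked = system.get_best_expressions("capital of germany")
# answer = system.execute("capital of germany")  # first ranked expression with a non-empty result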
Example #28
0
class WangBaseSenser(BaseSenser):
    """Abstract class for disambiguating relation senses.

    Attributes:
      n_y (int): number of distinct classes

    """
    __metaclass__ = abc.ABCMeta

    # private members
    PARAM_GRID = {"clf__C": [float(i)/100. for i in xrange(1, 3)]}
    N_JOBS = -1

    def __init__(self, a_clf=None, a_grid_search=False):
        """Class constructor.

        Initialize classifier.

        Args:
          a_clf (classifier or None):
            classifier to use or None for default
          a_grid_search (bool): use grid search for estimating hyper-parameters

        """
        classifier = a_clf or LinearSVC(C=DFLT_C,
                                        **DFLT_PARAMS)
        self._gs = a_grid_search
        self._model = Pipeline([("vect", DictVectorizer()),
                                ("clf", classifier)])

    def train(self, a_train_data, a_dev_data=None, a_n_y=-1,
              a_i=-1, a_train_out=None, a_dev_out=None):
        """Method for training the model.

        Args:
          a_train_data (tuple[list, dict]):
            list of training JSON data
          a_dev_data (tuple[list, dict] or None):
            list of development JSON data
          a_n_y (int):
            number of distinct classes
          a_i (int):
            row index for the output predictions
          a_train_out (np.array or None):
            predictions for the training set
          a_dev_out (np.array or None):
            predictions for the development set

        Returns:
          void:

        Note:
          updates ``a_train_out`` and ``a_dev_out`` in place

        """
        self.n_y = a_n_y
        x_train, y_train = self._generate_ts(a_train_data)
        x_dev, y_dev = self._generate_ts(a_dev_data)
        # determine cross-validation and grid-search strategy and fit the model
        if self._gs:
            if a_dev_data is None or not a_dev_data[0]:
                cv = StratifiedKFold(y_train, n_folds=NFOLDS, shuffle=True)
            else:
                cv = self._devset_cv(y_train, len(y_dev), NFOLDS)
                x_train = x_train + x_dev
                y_train = y_train + y_dev
            scorer = make_scorer(f1_score, average="macro")
            self._model = GridSearchCV(self._model, self.PARAM_GRID,
                                       scoring=scorer,
                                       cv=cv, n_jobs=self.N_JOBS, verbose=1)
        self._model.fit([el[-1] for el in x_train], y_train)
        # output best hyper-parameters
        if self._gs:
            print("Best params:", repr(self._model.best_params_),
                  file=sys.stderr)
        if a_i >= 0:
            if a_train_out is not None:
                if self._gs and a_dev_data and a_dev_data[0]:
                    x_train = x_train[:-len(x_dev)]
                for i, x_i in x_train:
                    self._predict(x_i, a_train_out[i], a_i)
            if a_dev_out is not None:
                for i, x_i in x_dev:
                    self._predict(x_i, a_dev_out[i], a_i)

    def predict(self, a_rel, a_data, a_ret, a_i):
        """Method for predicting sense of single relation.

        Args:
          a_rel (dict):
            discourse relation whose sense should be predicted
          a_data (2-tuple(dict, dict)):
            list of input JSON data
          a_ret (np.array):
            output prediction vector
          a_i (int):
            row index in the output vector

        Returns:
          void:

        Note:
          updates ``a_ret[a_i]`` in place

        """
        feats = self._extract_features(a_rel, a_data[-1])
        self._predict(feats, a_ret, a_i)

    @abc.abstractmethod
    def _extract_features(self, a_rel, a_parses):
        """Extract classification features for a given relation.

        Args:
          a_rel (dict):
            discourse relation to extract features for
          a_parses (dict):
            parsed sentences

        Returns:
          void:

        """
        raise NotImplementedError

    def _predict(self, a_feats, a_ret, a_i):
        """Method for predicting sense of single relation.

        Args:
          a_feats (dict):
            features of the input instance
          a_ret (np.array):
            output prediction vector
          a_i (int):
            row index in the output vector

        Returns:
          void:
            updates ``a_ret[a_i]`` in place

        """
        # obtain model's estimates
        dec = self._model.decision_function(a_feats)
        if len(dec.shape) > 1:
            dec = np.mean(dec, axis=0)
        # normalize using softmax
        dec = np.exp(dec)
        exp_ret = np.sum(dec) or 1e10
        dec /= exp_ret
        # map model's classes to original indices
        for i, ival in enumerate(dec):
            a_ret[a_i][self._model.classes_[i]] += ival

    def _free(self):
        """Free resources used by the model.

        """
        self.n_y = -1

    def _generate_ts(self, a_data):
        """Generate training set.

        Args:
          a_data (2-tuple(list, dict)):
            input data (discourse relations and parses)

        Returns:
          tuple(list, list):
            lists of input features and expected classes

        """
        x, y = [], []
        if a_data is None:
            return (x, y)
        x_i = y_i = None
        # generate features
        for i, irel in a_data[0]:
            x_i = self._extract_features(irel, a_data[1])
            if not x_i:
                continue
            x.append((i, x_i))
            y_i = np.argmax(irel[SENSE])
            y.append(y_i)
        return (x, y)

    def _devset_cv(self, a_y_train, a_n_dev, a_n_folds):
        """Generate train-test split from training and development data.

        Args:
          a_y_train (list[int]):
            list of training instances' tags
          a_n_dev (int):
            number of devset instances
          a_n_folds (int):
            number of folds

        Returns:
          list[tuple]: list of training/testing folds

        """
        folds = []
        n_train = len(a_y_train)
        dev_ids = [n_train + i for i in xrange(a_n_dev)]
        # create stratified K-folds over the training data
        skf = StratifiedKFold(a_y_train, a_n_folds)
        for train_ids, test_ids in skf:
            folds.append((train_ids,
                          np.concatenate((test_ids, dev_ids))))
        return folds
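# WangBaseSenser is abstract; a subclass only needs to supply
# _extract_features. A minimal hypothetical subclass, assuming each relation
# dict carries a 'Connective' entry with a 'RawText' field (the real Wang-style
# sensers use a much richer feature set):

class ConnectiveSenser(WangBaseSenser):
    """Toy senser using only the raw connective string as a feature."""

    def _extract_features(self, a_rel, a_parses):
        # Return a feature dict compatible with the DictVectorizer in the pipeline.
        feats = {}
        connective = a_rel.get("Connective", {}).get("RawText", "")
        if connective:
            feats["Connective=" + connective.lower()] = 1.0
        return feats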
Example #29
0
print "Ridge Classifier:"
# print dtc.best_params_
# print dtc.grid_scores_
print "\tTraining accuracy: " + str(rc.score(Train_feat, yTrain) * 100)
print "\tTesting accuracy: " + str(rc.score(Test_feat, yTest) * 100)
print confusion_matrix(rc.predict(Test_feat), yTest)
# print 'auc: first predict first then real first'
# print metrics.roc_auc_score(rc.predict(Test_feat), yTest)
# print metrics.roc_auc_score(yTest, rc.predict(Test_feat))
print "\n"

# print rc.decision_function(Test_feat).shape

# sys.exit()
add_to_curve_c(rc.decision_function(Test_feat),
               metrics.roc_auc_score(yTest, rc.decision_function(Test_feat),
                                     average='weighted'),
               yTrain, yTest, 'Ridge Regression ')



# print "sgd Classifier:"
# # print dtc.best_params_
# # print dtc.grid_scores_
# print "\tTraining accuracy: " + str(sgdclas.score(Train_feat, yTrain) * 100)
# print "\tTesting accuracy: " + str(sgdclas.score(Test_feat, yTest) * 100)
# print confusion_matrix(sgdclas.predict(Test_feat), yTest)
# print 'auc:'
# print metrics.roc_auc_score(sgdclas.predict(Test_feat), yTest)
# print "\n"
# add_to_curve(sgdclas.predict_proba(Test_feat), metrics.roc_auc_score(yTest, sgdclas.predict_proba(Test_feat)[:,1]),yTrain, yTest, 'SGD based Logistic Regression')

print "log regress Classifier:"
Example #30
0
def run():

    # Create log file and grab script text
    create_log()
    script_text = get_file_text('run.py')

    # Create output directory if it does not exist
    if not os.path.isdir(OUTPUTS_DIR):
        os.mkdir(OUTPUTS_DIR)

    # The code below follows a performance estimation procedure suggested by a
    # post on Cross Validated (Stack Exchange): https://stats.stackexchange.com/questions/102631/k-fold-cross-validation-of-ensemble-learning

    # Load ROIs
    roi_names = load_roi_names(FILE_HC_SZ)
    rois = {}
    for roi_name in roi_names:
        roi = load_roi(
            os.path.join(ROIS_DIR, 'hc_sz', roi_name + '_age_matched.txt'))
        for i in roi.index:
            diagnosis = roi.loc[i, 'diagnosis1']
            roi.set_value(i, 'diagnosis1', 0 if diagnosis == 'HC' else 1)
        roi['diagnosis1'] = roi['diagnosis1'].astype(int)
        rois[roi_name] = roi
        log('added ROI: {}'.format(roi_name))

    # Define parameter range for grid search later
    param_grid = [{'C': [2**x for x in range(-5, 15, 2)]}]

    # Get subject IDs and labels
    roi = rois[roi_names[0]]
    subject_ids = roi.index
    subject_labels = roi['diagnosis1']
    log('nr. subjects: {}'.format(len(subject_ids)))

    scores_pred = []
    scores_dist = []
    fold = 1

    # This outer CV loop is meant for averaging scores
    for train, test in StratifiedKFold(subject_labels,
                                       n_folds=10,
                                       shuffle=True):

        predictions_file = 'outputs/predictions_train{}.txt'.format(fold)
        distances_file = 'outputs/distances_train{}.txt'.format(fold)

        if not os.path.isfile(predictions_file):

            # Create empty tables for holding predictions and distances
            predictions = dict()
            predictions['diagnosis'] = subject_labels[train]
            distances = dict()
            distances['diagnosis'] = subject_labels[train]

            # Run through all ROIs
            for roi_name in roi_names:

                log('calculating out-of-sample predictions for {}'.format(
                    roi_name))

                # Initialize prediction table for this ROI's column
                predictions[roi_name] = []
                distances[roi_name] = []

                # Get training data from the data frame
                X, y = get_xy(rois[roi_name].loc[subject_ids[train]],
                              label_column='diagnosis1',
                              exclude_columns=['diagnosis1', 'diagnosis2'])

                # Use 4-fold CV to get out-of-sample predictions for all training points
                i = 1
                for train1, test1 in StratifiedKFold(subject_labels[train],
                                                     n_folds=4):

                    # Do grid search to find optimal C parameter
                    classifier = GridSearchCV(SVC(kernel='linear'),
                                              param_grid=param_grid,
                                              cv=5)
                    classifier.fit(X[train1], y[train1])

                    # Store predictions and distances for this ROI
                    y_pred = classifier.predict(X[test1])
                    predictions[roi_name].extend(y_pred)
                    y_dist = classifier.decision_function(X[test1])
                    distances[roi_name].extend(y_dist)
                    print('  step {} - {}'.format(i, 4))
                    i += 1

            # Save predictions to file
            log('saving file: {}'.format(predictions_file))
            predictions = pd.DataFrame(predictions, index=subject_ids[train])
            predictions.to_csv(predictions_file, index_label='id')

            # Save distances to file
            log('saving file: {}'.format(distances_file))
            distances = pd.DataFrame(distances, index=subject_ids[train])
            distances.to_csv(distances_file, index_label='id')

        # ---------------------

        param_grid_rbf = [{
            'C': [2**x for x in range(-5, 15, 2)],
            'gamma': [2**x for x in range(-15, 4, 2)]
        }]

        # Train classifier on predictions
        log('training level-2 prediction classifier')
        predictions = pd.read_csv(predictions_file, index_col='id')
        X, y = get_xy(predictions,
                      label_column='diagnosis',
                      exclude_columns=['diagnosis'])
        classifier_pred = GridSearchCV(SVC(kernel='rbf'),
                                       param_grid=param_grid_rbf,
                                       cv=5)
        classifier_pred.fit(X, y)
        log('saving level-2 prediction classifier')
        joblib.dump(classifier_pred,
                    'outputs/classifier_pred{}.pkl'.format(fold))

        # Train classifier on distances
        log('training level-2 distance classifier')
        distances = pd.read_csv(distances_file, index_col='id')
        X, y = get_xy(distances,
                      label_column='diagnosis',
                      exclude_columns=['diagnosis'])
        classifier_dist = GridSearchCV(SVC(kernel='rbf'),
                                       param_grid=param_grid_rbf,
                                       cv=5)
        classifier_dist.fit(X, y)
        log('saving level-2 distance classifier')
        joblib.dump(classifier_dist,
                    'outputs/classifier_dist{}.pkl'.format(fold))

        # ---------------------

        # Train each ROI classifier on all training points and save it to disk
        for roi_name in roi_names:

            log('training {} on all training points'.format(roi_name))

            # Skip this step if exported classifier already exists
            classifier_file = 'outputs/classifier_' + roi_name + '_train{}.pkl'.format(
                fold)
            if os.path.isfile(classifier_file):
                continue

            # Get training data for this fold
            X, y = get_xy(rois[roi_name].loc[subject_ids[train]],
                          label_column='diagnosis1',
                          exclude_columns=['diagnosis1', 'diagnosis2'])

            # Train classifier using grid search
            classifier = GridSearchCV(SVC(kernel='linear'),
                                      param_grid=param_grid,
                                      cv=5)
            classifier.fit(X, y)

            # Save best classifier to file
            log('saving {} classifier to disk'.format(roi_name))
            joblib.dump(classifier, classifier_file)

        # ---------------------

        # Load ROI classifiers from file
        classifiers = {}
        for roi_name in roi_names:
            classifier_file = 'outputs/classifier_' + roi_name + '_train{}.pkl'.format(
                fold)
            classifiers[roi_name] = joblib.load(classifier_file)

        # ---------------------

        predictions_test_file = 'outputs/predictions_test{}.txt'.format(fold)
        distances_test_file = 'outputs/distances_test{}.txt'.format(fold)

        if not os.path.isfile(predictions_test_file):

            predictions_test = dict()
            predictions_test['diagnosis'] = subject_labels[test]
            distances_test = dict()
            distances_test['diagnosis'] = subject_labels[test]

            for roi_name in roi_names:

                predictions_test[roi_name] = []
                distances_test[roi_name] = []

                # Get test data from the data frame
                X, y = get_xy(rois[roi_name].loc[subject_ids[test]],
                              label_column='diagnosis1',
                              exclude_columns=['diagnosis1', 'diagnosis2'])

                log('calculating predictions and distances for {}'.format(
                    roi_name))

                # Store predictions and distances
                y_pred = classifiers[roi_name].predict(X)
                predictions_test[roi_name].extend(y_pred)
                y_dist = classifiers[roi_name].decision_function(X)
                distances_test[roi_name].extend(y_dist)

            # Save predictions to file
            log('saving predictions to file')
            predictions_test = pd.DataFrame(predictions_test,
                                            index=subject_ids[test])
            predictions_test.to_csv(predictions_test_file, index_label='id')

            # Save distances to file
            log('saving distances to file')
            distances_test = pd.DataFrame(distances_test,
                                          index=subject_ids[test])
            distances_test.to_csv(distances_test_file, index_label='id')

        # ---------------------

        # Load prediction classifier and run it on test predictions
        predictions_test = pd.read_csv(predictions_test_file, index_col='id')
        X_test, y_test = get_xy(predictions_test,
                                label_column='diagnosis',
                                exclude_columns=['diagnosis'])
        classifier_pred = joblib.load(
            'outputs/classifier_pred{}.pkl'.format(fold))
        y_pred = classifier_pred.predict(X_test)
        scores_pred.append(accuracy_score(y_test, y_pred))
        log('score: {} (predictions)'.format(scores_pred[-1]))

        # Load distance classifier and run it on test distances
        distances_test = pd.read_csv(distances_test_file, index_col='id')
        X_test, y_test = get_xy(distances_test,
                                label_column='diagnosis',
                                exclude_columns=['diagnosis'])
        classifier_dist = joblib.load(
            'outputs/classifier_dist{}.pkl'.format(fold))
        y_pred = classifier_dist.predict(X_test)
        scores_dist.append(accuracy_score(y_test, y_pred))
        log('score: {} (distances)'.format(scores_dist[-1]))

        fold += 1

    log('overall score: {} (predictions)'.format(np.mean(scores_pred)))
    log('overall score: {} (distances)'.format(np.mean(scores_dist)))

    # Append script to log and close it
    add_text_to_log(script_text)
    finish_log()
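# run() above is a two-level stacking scheme: per-ROI linear SVMs produce
# out-of-fold predictions/distances on the training split, and a level-2 RBF
# SVM is trained on those stacked outputs (see the Cross Validated link in the
# comments). A compact sketch of the same idea using scikit-learn helpers; the
# feature blocks and sizes below are made up for illustration:

import numpy as np
from sklearn.metrics import accuracy_score
from sklearn.model_selection import cross_val_predict, train_test_split
from sklearn.svm import SVC

rng = np.random.RandomState(0)
y = rng.randint(0, 2, 120)
blocks = {name: rng.randn(120, 10) + y[:, None] for name in ['roi_a', 'roi_b', 'roi_c']}
idx_train, idx_test = train_test_split(np.arange(120), stratify=y, random_state=0)

# Level 1: out-of-fold decision-function values per block on the training split
level1_train, level1_test = [], []
for name, X in blocks.items():
    clf = SVC(kernel='linear')
    oof = cross_val_predict(clf, X[idx_train], y[idx_train], cv=4,
                            method='decision_function')
    level1_train.append(oof)
    clf.fit(X[idx_train], y[idx_train])  # refit on the full training split
    level1_test.append(clf.decision_function(X[idx_test]))

Z_train = np.column_stack(level1_train)
Z_test = np.column_stack(level1_test)

# Level 2: RBF SVM on the stacked distances
meta = SVC(kernel='rbf').fit(Z_train, y[idx_train])
print(accuracy_score(y[idx_test], meta.predict(Z_test)))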
Example #31
0
def run():

    # Create log file and grab script text
    create_log()
    script_text = get_file_text('run.py')

    # Create output directory if it does not exist
    if not os.path.isdir(OUTPUTS_DIR):
        os.mkdir(OUTPUTS_DIR)

    # The code below follows a performance estimation procedure suggested by a
    # post on Cross Validated (Stack Exchange): https://stats.stackexchange.com/questions/102631/k-fold-cross-validation-of-ensemble-learning

    # Load ROIs
    roi_names = load_roi_names(FILE_HC_SZ)
    rois = {}
    for roi_name in roi_names:
        roi = load_roi(os.path.join(ROIS_DIR, 'hc_sz', roi_name + '_age_matched.txt'))
        for i in roi.index:
            diagnosis = roi.loc[i, 'diagnosis1']
            roi.set_value(i, 'diagnosis1', 0 if diagnosis == 'HC' else 1)
        roi['diagnosis1'] = roi['diagnosis1'].astype(int)
        rois[roi_name] = roi
        log('added ROI: {}'.format(roi_name))

    # Define parameter range for grid search later
    param_grid = [{
        'C': [2**x for x in range(-5, 15, 2)]}]

    # Get subject IDs and labels
    roi = rois[roi_names[0]]
    subject_ids = roi.index
    subject_labels = roi['diagnosis1']
    log('nr. subjects: {}'.format(len(subject_ids)))

    scores_pred = []
    scores_dist = []
    fold = 1

    # This outer CV loop is meant for averaging scores
    for train, test in StratifiedKFold(subject_labels, n_folds=10, shuffle=True):

        predictions_file = 'outputs/predictions_train{}.txt'.format(fold)
        distances_file = 'outputs/distances_train{}.txt'.format(fold)

        if not os.path.isfile(predictions_file):

            # Create empty tables for holding predictions and distances
            predictions = dict()
            predictions['diagnosis'] = subject_labels[train]
            distances = dict()
            distances['diagnosis'] = subject_labels[train]

            # Run through all ROIs
            for roi_name in roi_names:

                log('calculating out-of-sample predictions for {}'.format(roi_name))

                # Initialize prediction table for this ROI's column
                predictions[roi_name] = []
                distances[roi_name] = []

                # Get training data from the data frame
                X, y = get_xy(
                    rois[roi_name].loc[subject_ids[train]],
                    label_column='diagnosis1', exclude_columns=['diagnosis1', 'diagnosis2'])

                # Use 4-fold CV to get out-of-sample predictions for all training points
                i = 1
                for train1, test1 in StratifiedKFold(subject_labels[train], n_folds=4):

                    # Do grid search to find optimal C parameter
                    classifier = GridSearchCV(SVC(kernel='linear'), param_grid=param_grid, cv=5)
                    classifier.fit(X[train1], y[train1])

                    # Store predictions and distances for this ROI
                    y_pred = classifier.predict(X[test1])
                    predictions[roi_name].extend(y_pred)
                    y_dist = classifier.decision_function(X[test1])
                    distances[roi_name].extend(y_dist)
                    print('  step {} - {}'.format(i, 4))
                    i += 1

            # Save predictions to file
            log('saving file: {}'.format(predictions_file))
            predictions = pd.DataFrame(predictions, index=subject_ids[train])
            predictions.to_csv(predictions_file, index_label='id')

            # Save distances to file
            log('saving file: {}'.format(distances_file))
            distances = pd.DataFrame(distances, index=subject_ids[train])
            distances.to_csv(distances_file, index_label='id')

        # ---------------------

        param_grid_rbf = [{
            'C': [2**x for x in range(-5, 15, 2)],
            'gamma': [2**x for x in range(-15, 4, 2)]}]

        # Train classifier on predictions
        log('training level-2 prediction classifier')
        predictions = pd.read_csv(predictions_file, index_col='id')
        X, y = get_xy(predictions,
            label_column='diagnosis', exclude_columns=['diagnosis'])
        classifier_pred = GridSearchCV(SVC(kernel='rbf'), param_grid=param_grid_rbf, cv=5)
        classifier_pred.fit(X, y)
        log('saving level-2 prediction classifier')
        joblib.dump(classifier_pred, 'outputs/classifier_pred{}.pkl'.format(fold))

        # Train classifier on distances
        log('training level-2 distance classifier')
        distances = pd.read_csv(distances_file, index_col='id')
        X, y = get_xy(distances,
            label_column='diagnosis', exclude_columns=['diagnosis'])
        classifier_dist = GridSearchCV(SVC(kernel='rbf'), param_grid=param_grid_rbf, cv=5)
        classifier_dist.fit(X, y)
        log('saving level-2 distance classifier')
        joblib.dump(classifier_dist, 'outputs/classifier_dist{}.pkl'.format(fold))

        # ---------------------

        # Train each ROI classifier on all training points and save it to disk
        for roi_name in roi_names:

            log('training {} on all training points'.format(roi_name))

            # Skip this step if exported classifier already exists
            classifier_file = 'outputs/classifier_' + roi_name + '_train{}.pkl'.format(fold)
            if os.path.isfile(classifier_file):
                continue

            # Get training data for this fold
            X, y = get_xy(
                rois[roi_name].loc[subject_ids[train]],
                label_column='diagnosis1', exclude_columns=['diagnosis1', 'diagnosis2'])

            # Train classifier using grid search
            classifier = GridSearchCV(SVC(kernel='linear'), param_grid=param_grid, cv=5)
            classifier.fit(X, y)

            # Save best classifier to file
            log('saving {} classifier to disk'.format(roi_name))
            joblib.dump(classifier, classifier_file)

        # ---------------------

        # Load ROI classifiers from file
        classifiers = {}
        for roi_name in roi_names:
            classifier_file = 'outputs/classifier_' + roi_name + '_train{}.pkl'.format(fold)
            classifiers[roi_name] = joblib.load(classifier_file)

        # ---------------------

        predictions_test_file = 'outputs/predictions_test{}.txt'.format(fold)
        distances_test_file = 'outputs/distances_test{}.txt'.format(fold)

        if not os.path.isfile(predictions_test_file):

            predictions_test = dict()
            predictions_test['diagnosis'] = subject_labels[test]
            distances_test = dict()
            distances_test['diagnosis'] = subject_labels[test]

            for roi_name in roi_names:

                predictions_test[roi_name] = []
                distances_test[roi_name] = []

                # Get test data from the data frame
                X, y = get_xy(
                    rois[roi_name].loc[subject_ids[test]],
                    label_column='diagnosis1', exclude_columns=['diagnosis1', 'diagnosis2'])

                log('calculating predictions and distances for {}'.format(roi_name))

                # Store predictions and distances
                y_pred = classifiers[roi_name].predict(X)
                predictions_test[roi_name].extend(y_pred)
                y_dist = classifiers[roi_name].decision_function(X)
                distances_test[roi_name].extend(y_dist)

            # Save predictions to file
            log('saving predictions to file')
            predictions_test = pd.DataFrame(predictions_test, index=subject_ids[test])
            predictions_test.to_csv(predictions_test_file, index_label='id')

            # Save distances to file
            log('saving distances to file')
            distances_test = pd.DataFrame(distances_test, index=subject_ids[test])
            distances_test.to_csv(distances_test_file, index_label='id')

        # ---------------------

        # Load prediction classifier and run it on test predictions
        predictions_test = pd.read_csv(predictions_test_file, index_col='id')
        X_test, y_test = get_xy(predictions_test,
            label_column='diagnosis', exclude_columns=['diagnosis'])
        classifier_pred = joblib.load('outputs/classifier_pred{}.pkl'.format(fold))
        y_pred = classifier_pred.predict(X_test)
        scores_pred.append(accuracy_score(y_test, y_pred))
        log('score: {} (predictions)'.format(scores_pred[-1]))

        # Load distance classifier and run it on test distances
        distances_test = pd.read_csv(distances_test_file, index_col='id')
        X_test, y_test = get_xy(distances_test,
            label_column='diagnosis', exclude_columns=['diagnosis'])
        classifier_dist = joblib.load('outputs/classifier_dist{}.pkl'.format(fold))
        y_pred = classifier_dist.predict(X_test)
        scores_dist.append(accuracy_score(y_test, y_pred))
        log('score: {} (distances)'.format(scores_dist[-1]))

        fold += 1

    log('overall score: {} (predictions)'.format(np.mean(scores_pred)))
    log('overall score: {} (distances)'.format(np.mean(scores_dist)))

    # Append script to log and close it
    add_text_to_log(script_text)
    finish_log()
Example #32
0
grid_predictions = clf.predict(X_test)

# print(confusion_matrix(y_test, grid_predictions))

# decision function on testing data
plt.scatter(X_test[:, 0], X_test[:, 1], c = y_test, s=30)

ax = plt.gca()
xlim = ax.get_xlim()
ylim = ax.get_ylim()

# create grid to evaluate model
xx = np.linspace(xlim[0], xlim[1], 30)
yy = np.linspace(ylim[0], ylim[1], 30)
YY, XX = np.meshgrid(yy, xx)
xy = np.vstack([XX.ravel(), YY.ravel()]).T
Z = grid.decision_function(xy).reshape(XX.shape)

# plot decision boundary and margins
ax.contour(XX, YY, Z, colors='k', levels=[-1, 0, 1], alpha=0.5,
           linestyles=['--', '-', '--'], )
# plot support vectors
ax.scatter(clf.support_vectors_[:, 0], clf.support_vectors_[:, 1], s=10,
           linewidth=1, c = 'b', facecolors='none', label = 'Support Vectors')

plt.title("Decision Function after Grid Search")
plt.legend()
plt.savefig('grid-search-decision-function-on-testing-data.png', dpi = 600)
plt.show()
Example #33
0
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)

print("assigned weights to features: ")
print(clf.coef_)
#target_names = ['','']
print(classification_report(y_test, y_pred)) #target_names=target_names
'''
'''
Save classifier
'''
#joblib.dump(clf, 'classifier_cdiff.pkl')
'''
Get distance from the hyperplane
'''
distance_from_boundry = clf.decision_function(X_test)
#print(distance_from_boundry)

if (wanna_see_graphs == 1):
    '''
    Stacked graph of results
    '''
    distances_1s = [
        i for (i, j) in zip(distance_from_boundry, y_test) if j > 0
    ]
    distances_0s = [
        i for (i, j) in zip(distance_from_boundry, y_test) if j <= 0
    ]

    graph_val_neg = []
    graph_val_pos = []
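# The fragment above splits decision_function distances by true label to build
# a "stacked graph". A hedged sketch of one common way to finish such a plot
# (a stacked histogram of the two groups); variable names follow the fragment,
# the plotting choices are illustrative assumptions:
#
# import matplotlib.pyplot as plt
# plt.hist([distances_1s, distances_0s], bins=30, stacked=True,
#          label=['positive class', 'negative class'])
# plt.axvline(0.0, color='k', linestyle='--')  # decision boundary at distance 0
# plt.xlabel('distance from hyperplane')
# plt.legend()
# plt.show()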