from time import time

import numpy as np
from scipy import sparse
from scipy.stats import randint as sp_randint
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import make_scorer
from sklearn.model_selection import PredefinedSplit, RandomizedSearchCV

# get_rmsle (RMSLE metric) and report (results printer) are project-specific helpers.


def evalModel(train_data, eval_data, train_labels, eval_labels, seed):
    joined_data = np.concatenate((train_data, eval_data), axis=0)
    joined_labels = np.concatenate((train_labels, eval_labels), axis=0)
    # -1 marks rows that stay in the training split; 0 marks the single
    # predefined validation fold.
    train_mask = np.zeros(train_data.shape[0]) - 1.0
    eval_mask = np.zeros(eval_data.shape[0])
    joined_mask = np.concatenate((train_mask, eval_mask), axis=0)
    ps = PredefinedSplit(test_fold=joined_mask)
    loss  = make_scorer(get_rmsle, greater_is_better=False)
    train_data = sparse.csr_matrix(train_data)
    eval_data = sparse.csr_matrix(eval_data)
    
    clf = RandomForestRegressor(random_state=seed, verbose=1)
    #clf.fit(train_data, train_labels)
    #preds = clf.predict(eval_data)
    #print(get_rmsle(eval_labels, preds))
    ## achieves 0.263
    
    # specify parameters and distributions to sample from
    param_dist = {"n_estimators": sp_randint(300, 800),
                  "max_depth": sp_randint(10, 50),
                  # 'auto' was removed for random forests in scikit-learn 1.3
                  "max_features": ['sqrt', 'log2', 1.0],
                  "min_samples_split": sp_randint(2, 11),  # must be >= 2
                  "min_samples_leaf": sp_randint(1, 11)}
    
    # run randomized search
    n_iter_search = 60
    random_search = RandomizedSearchCV(clf, param_distributions=param_dist, cv=ps, scoring=loss,
                                       n_iter=n_iter_search,n_jobs=-1,pre_dispatch='n_jobs',verbose=2)
    
    start = time()
    random_search.fit(joined_data, joined_labels)
    print("RandomizedSearchCV took %.2f seconds for %d candidates"
          " parameter settings." % ((time() - start), n_iter_search))
    report(random_search.cv_results_)  # grid_scores_ was removed in scikit-learn 0.20
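
For reference, the test_fold convention PredefinedSplit uses here is easy to check on toy data (a minimal sketch with made-up arrays):

import numpy as np
from sklearn.model_selection import PredefinedSplit

test_fold = np.array([-1, -1, -1, 0, 0])  # -1 = always train, 0 = validation fold
ps = PredefinedSplit(test_fold=test_fold)
for train_idx, val_idx in ps.split():
    print(train_idx, val_idx)  # -> [0 1 2] [3 4]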
Example #2
def svm(speaker, X_train, y_train, X_test, y_test):
    '''
    Grid-search C and gamma for an RBF SVM on a predefined train/test split.
    '''

    ### SVM hyperparameter search ###
    # C spans 1e-4 to 1e4 in decade steps; gamma is for the RBF kernel.
    num_C = 9
    Cs = 10.0 ** np.arange(num_C) * 1e-4
    gammas = [1e-4, 1e-3, 1e-2, 1e-1, 1]
    # 'estimator__' routes the parameters to the SVC inside OneVsRestClassifier.
    param_grid = {'estimator__C': Cs, 'estimator__gamma': gammas}

    train_val_features = np.concatenate((X_train, X_test), axis=0)
    train_val_labels = np.concatenate((y_train, y_test), axis=0)
    test_fold = np.zeros(train_val_features.shape[0])
    test_fold[:X_train.shape[0]] = -1  # train-set indices are -1
    ps = PredefinedSplit(test_fold=test_fold)

    model = OneVsRestClassifier(SVC(kernel='rbf'))
    clf = GridSearchCV(estimator=model, param_grid=param_grid, cv=ps)
    clf = clf.fit(train_val_features, train_val_labels)
    #    train_score = clf.score(X_train, y_train)
    #    test_score = clf.score(X_test, y_test)
    #    clf_y_train = clf.predict(X_train)
    #    clf_y_test = clf.predict(X_test)
    #    print('speaker {} in svm classification, train accuracy: {}, test accuracy: {}'.format(speaker, train_score,test_score))
    # means = clf.cv_results_['mean_test_score']
    # stds = clf.cv_results_['std_test_score']
    # for mean, std, params in zip(means, stds, clf.cv_results_['params']):
    #    print('%0.3f (+/-%0.3f) for %r' % (mean, std * 2, params))
    #print('best params are {}'.format(clf.best_params_))
    #print(classification_report(y_test, clf_y_test))
    return clf  #, train_score, test_score, clf_y_train, clf_y_test
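
A hypothetical call, assuming feature matrices and label arrays shaped like the ones above (the speaker argument is only a tag here):

best_clf = svm('speaker01', X_train, y_train, X_test, y_test)
print('best params are {}'.format(best_clf.best_params_))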
Example #3
def KLabelFold(labels, n_folds=3, shuffle=False, random_state=None):
    """Build a PredefinedSplit in which every row sharing a label lands in
    the same fold (a group K-fold). `labels` is a pandas Series."""
    unique_labels = labels.unique()
    kfold = KFold(n_splits=n_folds, shuffle=shuffle,
                  random_state=random_state if shuffle else None)
    test_fold = pd.concat(
        [labels.isin(unique_labels[test_idx]) * fold
         for fold, (_, test_idx) in enumerate(kfold.split(unique_labels))],
        axis=1).sum(axis=1)
    return PredefinedSplit(test_fold)
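
Modern scikit-learn can express the same row-grouping directly with GroupKFold, without building a PredefinedSplit by hand (note that GroupKFold assigns groups deterministically). A minimal sketch, assuming X and y are the feature matrix and target aligned with labels:

from sklearn.model_selection import GroupKFold

gkf = GroupKFold(n_splits=3)
for train_idx, test_idx in gkf.split(X, y, groups=labels):
    pass  # every row sharing a label lands on exactly one side of each split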
Example #4
def set_cv(data, best_features):
    # -1 = always train; folds 0-2 are three date-windowed validation sets.
    # 'Date' is assumed to be a datetime column, so the string bounds parse.
    data['CV'] = -1
    data.loc[(data['Date'] >= '01-Aug-2014') & (data['Date'] <= '17-Sep-2014'),
             'CV'] = 0
    data.loc[(data['Date'] >= '01-Aug-2013') & (data['Date'] <= '17-Sep-2013'),
             'CV'] = 1
    data.loc[(data['Date'] >= '01-Jun-2015') & (data['Date'] <= '17-Jul-2015'),
             'CV'] = 2
    subset = data[data['Set'] > 0]
    X = subset.loc[:, best_features].values
    y = subset.iloc[:, 6].values  # column 6 holds the target
    cv_set = subset['CV'].values  # the fold column created above
    ps = PredefinedSplit(test_fold=cv_set)
    return (X, y, ps)
Example #5
def perform_svm_grid_search(patient_data,
                            classifier,
                            svm_grid,
                            decision_rule_grid,
                            svm_score_func,
                            decision_score_func,
                            estimator_fit_params=None,
                            preictal_time=210):

    # Separate data into folds
    estimator_fit_params = estimator_fit_params if estimator_fit_params is not None else {}
    # decision_score_func_params = decision_score_func_params if decision_score_func_params is not None else {}

    sorted_container = prepare_train_test_viz_data(
        patient_data,
        preictal_time=preictal_time,
        train_only_interictal=False,
    )
    # consolidate folds data into array, give indices to separate folds for validation
    fold_data = sorted_container['fold_data']
    total_data = []
    data_labels = []
    fold_numbers = []
    running_fold_number = 0
    for data_type in fold_data:

        for fold_number in fold_data[data_type]:
            total_data.extend(fold_data[data_type][fold_number]['data'])
            data_labels.extend(
                [data_type] *
                np.size(fold_data[data_type][fold_number]['data'], 0))
            fold_numbers.extend(
                [running_fold_number] *
                np.size(fold_data[data_type][fold_number]['data'], 0))
            running_fold_number += 1

    total_data = np.array(total_data)
    # Use custom fold iterator (Predefined Split), along files.
    cv = PredefinedSplit(fold_numbers)

    # Perform cross-validation on the data using GridSearchCV.
    # (fit_params moved from the constructor to fit() in scikit-learn 0.24.)
    cross_validator = GridSearchCV(classifier,
                                   svm_grid,
                                   scoring=svm_score_func,
                                   cv=cv,
                                   verbose=1)
    # TODO: with best parameters, optimize other stuff
    best_estimator = cross_validator.fit(total_data, data_labels,
                                         **estimator_fit_params)

    return best_estimator
Example #6
def train_model(set, clf, params):
    """"
	Keyword arguments: 
	set -- dataset (dictionary)
	clf -- sklearn model 
	params -- fine-tuning parameter 

	Returns: 
	f1_score train, f1_score valid, f1_score test, best parameter 

	the function: 
	- uses GridSearchCV to find the best hyperparameter for the model
	- refits the model with the parameters 
	- predicts train, valid and test sets 
	- find respective f1_scores
	"""
    train = set['train']
    valid = set['valid']
    test = set['test']

    train_input = train[0]
    valid_input = valid[0]
    test_input = test[0]

    train_truth = train[1]
    valid_truth = valid[1]
    test_truth = test[1]

    if params is not None:
        '''Use the predetermined validation set in the cross-validation:
        1) combine the training and validation sets into one big training set
        2) set the test_fold vector - 0 for validation entries, -1 for
           training entries
        3) feed the split into GridSearchCV
        '''
        combine_input = sparse.vstack([train_input, valid_input])
        combine_truth = np.concatenate((train_truth, valid_truth))
        fold = [-1] * train_input.shape[0] + [0] * valid_input.shape[0]
        ps = PredefinedSplit(test_fold=fold)
        clf = GridSearchCV(clf, params, cv=ps, refit=True)
        clf.fit(combine_input, combine_truth)
    else:
        clf.fit(train_input, train_truth)

    best_param = None if params is None else clf.best_params_

    # `average` is assumed to be defined at module level (e.g. 'macro').
    f1_train = f1_score(train_truth, clf.predict(train_input), average=average)
    f1_valid = f1_score(valid_truth, clf.predict(valid_input), average=average)
    f1_test = f1_score(test_truth, clf.predict(test_input), average=average)

    return f1_train, f1_valid, f1_test, best_param
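
A hypothetical usage of train_model, assuming the dataset dictionary holds (features, labels) tuples under 'train', 'valid' and 'test', and that average is set at module level:

from sklearn.linear_model import LogisticRegression

f1_tr, f1_va, f1_te, best = train_model(
    dataset, LogisticRegression(max_iter=1000), {'C': [0.1, 1.0, 10.0]})
print(best)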
Example #7
def check_xgb_model(train, valid, predictors):

    classifier = lambda: XGBClassifier(objective='binary:logistic',
                                       verbosity=0,  # 'silent' was removed in XGBoost 1.0
                                       booster='gbtree',
                                       learning_rate=0.1,
                                       n_estimators=300,
                                       max_depth=5,
                                       min_child_weight=2,
                                       gamma=0,
                                       subsample=0.8,
                                       colsample_bytree=0.8,
                                       scale_pos_weight=1,
                                       n_jobs=20,
                                       reg_alpha=0,
                                       reg_lambda=1,
                                       seed=100)

    model = Pipeline(steps=[('en', classifier())])

    # With every entry commented out, the grid is empty and the search just
    # evaluates the default pipeline on the predefined split. Uncomment one
    # block at a time to tune that group of parameters.
    parameters = {
        #'en__n_estimators': [100, 300, 500, 700, 1000],
        #'en__max_depth': range(3, 10, 2),
        #'en__min_child_weight': np.arange(1, 2.5, 0.1),
        #'en__gamma': [i / 10.0 for i in range(0, 6)],
        #'en__subsample': [i / 100.0 for i in range(75, 90, 5)],
        #'en__colsample_bytree': [i / 100.0 for i in range(75, 90, 5)],
        #'en__reg_alpha': [1e-5, 1e-2, 0.1, 0, 1, 10, 100],
        #'en__reg_lambda': [1e-5, 1e-2, 0.1, 0, 1, 10, 100],
    }
    data = pd.concat([train, valid])
    print(data[predictors].head())
    print("train size:%s, val size:%s, data size:%s" %
          (train.shape[0], valid.shape[0], data.shape[0]))
    # The first train.shape[0] rows (the training portion) are marked -1;
    # the remaining validation rows form test fold 0.
    index = np.zeros(data.shape[0])
    index[:train.shape[0]] = -1
    ps = PredefinedSplit(test_fold=index)

    grid_search = GridSearchCV(model,
                               parameters,
                               cv=ps,
                               n_jobs=-1,
                               verbose=1,
                               scoring='roc_auc')
    grid_search = grid_search.fit(data[predictors], data['label'])

    return grid_search
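
A hypothetical call, assuming train_df and valid_df are DataFrames containing the predictor columns plus a binary 'label' column:

gs = check_xgb_model(train_df, valid_df, ['feat_a', 'feat_b'])
print(gs.best_score_)  # validation ROC-AUC of the evaluated candidate(s)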
Example #8
def create_cv_from_trials(tnums, test_perc=.1, n_iter=5):
    """Create a cross-validation object using trial numbers.

    This is similar to LeavePLabelOut, but it defines a stopping point and
    shuffles unique labels before doing the splits. This lets you keep
    datapoints together in train/test splits.

    Parameters
    ----------
    tnums : array, dtype int, shape (n_datapoints,)
        The trial label of each datapoint.
    test_perc : float
        Fraction of unique trials placed in each test fold.
    n_iter : int
        Number of folds to create.
    """
    tnums = tnums.squeeze()
    unique_labels = np.unique(tnums).squeeze()
    n_test = int(np.floor(unique_labels.shape[0] * test_perc))  # int for indexing
    test_ixs = np.random.permutation(unique_labels)[:n_test * n_iter]
    test_ixs = test_ixs.reshape([n_iter, n_test])
    # Start from -1 so trials not drawn into any test fold stay train-only.
    test_fold = np.full_like(tnums, -1)
    for i, ifold in enumerate(test_ixs):
        for fld in ifold:
            test_fold[tnums == fld] = i
    cv = PredefinedSplit(test_fold)
    return cv
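
A hypothetical usage: twelve datapoints spread over six trials, two per trial, split into two predefined folds:

import numpy as np

tnums = np.repeat(np.arange(6), 2)
cv = create_cv_from_trials(tnums, test_perc=0.34, n_iter=2)
for train_idx, test_idx in cv.split():
    print(test_idx)  # both datapoints of a chosen trial always appear together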
Example #9
def check_model(train, valid, predictors):

    classifier = lambda: SGDClassifier(loss='log_loss',  # 'log' before scikit-learn 1.1
                                       penalty='elasticnet',
                                       fit_intercept=True,
                                       max_iter=100,
                                       shuffle=True,
                                       n_jobs=1,
                                       class_weight=None)

    model = Pipeline(steps=[('ss', StandardScaler()), ('en', classifier())])

    parameters = {
        'en__alpha': [0.001, 0.01, 0.1],
        'en__l1_ratio': [0.001, 0.01, 0.1]
    }
    # the training data for the search is train + valid
    data = pd.concat([train, valid])
    #print(data[predictors].head())
    print("train size:%s, val size:%s, data size:%s" %
          (train.shape[0], valid.shape[0], data.shape[0]))
    # build the predefined split: train rows are -1, validation rows fold 0
    index = np.zeros(data.shape[0])
    index[:train.shape[0]] = -1
    ps = PredefinedSplit(test_fold=index)

    grid_search = GridSearchCV(model,
                               parameters,
                               cv=ps,
                               n_jobs=-1,
                               verbose=1,
                               scoring='roc_auc')
    grid_search = grid_search.fit(data[predictors], data['label'])

    return grid_search
Example #10
def stacking(train_ensum, val_ensum, test_ensum, y_train, y_val, y_test):
    C_params = [i / 10 for i in range(1, 10)] + list(range(1, 10)) + list(range(10, 100, 10))
    cw_params = list(range(1, 10)) + list(range(10, 100, 10))
    params = {
        'C': C_params,
        'class_weight': [{1:w} for w in cw_params],
    }
    train_val_features = np.concatenate((train_ensum, val_ensum), axis=0)
    train_val_labels = np.concatenate((y_train, y_val), axis=0)
    test_fold = np.zeros(train_val_features.shape[0])
    test_fold[:train_ensum.shape[0]] = -1  # train rows never enter the test fold
    ps = PredefinedSplit(test_fold=test_fold)

    # `my_scoring` is assumed to be a scorer defined at module level.
    lr_stack = GridSearchCV(estimator=LogisticRegression(), param_grid=params,
                            scoring=my_scoring, n_jobs=-1, cv=ps, verbose=0)
    lr_stack.fit(train_val_features, train_val_labels)

    lr_stack_train_pred_prob = lr_stack.predict_proba(train_ensum)[:, 1]
    lr_stack_val_pred_prob = lr_stack.predict_proba(val_ensum)[:, 1]
    lr_stack_test_pred_prob = lr_stack.predict_proba(test_ensum)[:, 1]
    # utils.model_key_performance is assumed to return a pair of metrics.
    a, b = utils.model_key_performance(lr_stack_train_pred_prob, y_train)
    c, d = utils.model_key_performance(lr_stack_val_pred_prob, y_val)
    e, f = utils.model_key_performance(lr_stack_test_pred_prob, y_test)
    return a, b, c, d, e, f
Example #11
def main():

    # pickled classifier filename
    saved_classifier_filename = "../classifiers/msu_mfsd.pkl"

    # load or recompute train features. If none, the train features are not loaded into memory
    load_train_features = True
    # retrain or load classifier
    load_classifier = True
    # load or recompute test features
    load_test_features = True
    # descriptor computer

    mlbp_feature_computer = feature_computer.FrameFeatureComputer(
        features.MultiScaleLocalBinaryPatterns((8, 1), (8, 2), (16, 2))
    )
    # mlbp_feature_computer = feature_computer.FrameFeatureComputer(features.LocalBinaryPatterns(8,1))

    (
        real_features,
        spoof_features_per_dir,
        labels_real,
        labels_spoof_per_dir,
    ) = get_features_and_labels(load_train_features, mlbp_feature_computer)

    # here I should do a cross validation on the features
    """
    param_grid = [
            {'C': [0.0001, 0.001, 0.01], 'kernel':['linear'], 'class_weight':['balanced', None]},
            {'C': [0.0001, 0.001, 0.01], 'kernel':['rbf'],'gamma':[0.0001, 0.001], 'class_weight':['balanced', None]}
        ]
    """
    test_fold = dbfeatures.compute_msu_ussa_subjects_folds_arr()
    ps = PredefinedSplit(test_fold=test_fold)

    clf = svm.SVC(
        verbose=True,
        probability=True,
        C=0.0001,
        kernel="linear",
        class_weight="balanced",
    )

    folds_eer = []
    threshes = []
    confusion_matrices = []
    for train_index, test_index in ps.split():  # iterate the predefined folds
        # split the features into current train and test folds
        train_features = real_features[train_index]
        test_features = real_features[test_index]
        train_labels = labels_real[train_index]
        test_labels = labels_real[test_index]
        for i in range(len(spoof_features_per_dir)):
            train_features = np.concatenate(
                (train_features, spoof_features_per_dir[i][train_index]), 0
            )
            test_features = np.concatenate(
                (test_features, spoof_features_per_dir[i][test_index]), 0
            )
            train_labels = np.concatenate(
                (train_labels, labels_spoof_per_dir[i][train_index]), 0
            )
            test_labels = np.concatenate(
                (test_labels, labels_spoof_per_dir[i][test_index]), 0
            )

        # train the classifier
        clf.fit(train_features, train_labels)

        # use the classifier to predict labels for the confusion matrix
        pred_labels = clf.predict(test_features)

        # build the ROC curve from continuous decision scores; hard labels
        # would give a degenerate three-point curve
        scores = clf.decision_function(test_features)
        fpr, tpr, threshold = roc_curve(test_labels, scores, pos_label=1)

        # compute the equal error rate
        eer = brentq(lambda x: 1.0 - x - interp1d(fpr, tpr)(x), 0.0, 1.0)
        thresh = interp1d(fpr, threshold)(eer)

        folds_eer.append(eer)
        threshes.append(thresh)

        conf_mat = confusion_matrix(test_labels, pred_labels)
        confusion_matrices.append(conf_mat)

    # print the mean and standard deviation of equal error rate across the folds
    print(np.mean(folds_eer), np.std(folds_eer))
    for conf_mat in confusion_matrices:
        print(conf_mat)
Пример #12
0
    # CLASSIFIER #
    #------------#

    if not os.path.exists(config['results_folder']):
        os.makedirs(config['results_folder'])
    f = open(config['results_folder'] + experiment_name + '.txt', 'w')

    if config['audios_list'] == False:
        print('train/val/test partitions are pre-defined!')
        if config['model_type'] == 'SVM':
            # hyperparameter search in val set
            x_dev = np.concatenate((x_train, x_val), axis=0)
            y_dev = np.concatenate((y_train, y_val), axis=0)
            val_mask = np.concatenate(
                (-np.ones(len(y_train)), np.zeros(len(y_val))), axis=0)
            ps = PredefinedSplit(test_fold=val_mask)
            # `svm_params` is assumed to be a parameter grid defined
            # elsewhere in this script.
            svc = SVC()
            hps = GridSearchCV(svc,
                               svm_params,
                               cv=ps,
                               n_jobs=3,
                               pre_dispatch=3 * 8,
                               verbose=config['SVM_verbose']).fit(
                                   x_dev, y_dev)
            print('Best hyperparameter: ' + str(hps.best_params_))
            # define final model
            model = SVC()
            model.set_params(**hps.best_params_)
        else:
            score_max = 0
            h_max = -1
Example #13
def _cost_fn(argd,
             X,
             y,
             EX_list,
             valid_size,
             n_folds,
             shuffle,
             random_state,
             use_partial_fit,
             info,
             timeout,
             _conn,
             loss_fn=None,
             best_loss=None):
    '''Calculate the loss function
    '''
    try:
        t_start = time.time()
        # Extract info from calling function.
        if 'classifier' in argd:
            classifier = argd['classifier']
            regressor = argd['regressor']
            preprocessings = argd['preprocessing']
            ex_pps_list = argd['ex_preprocs']
        else:
            classifier = argd['model']['classifier']
            regressor = argd['model']['regressor']
            preprocessings = argd['model']['preprocessing']
            ex_pps_list = argd['model']['ex_preprocs']
        learner = classifier if classifier is not None else regressor
        is_classif = classifier is not None
        untrained_learner = copy.deepcopy(learner)
        # -- N.B. modify argd['preprocessing'] in-place

        # Determine cross-validation iterator (scikit-learn >= 0.18
        # model_selection API).
        if n_folds is not None:
            if n_folds == -1:
                info('Will use leave-one-out CV')
                cv_iter = LeaveOneOut().split(X)
            elif is_classif:
                info('Will use stratified K-fold CV with K:', n_folds,
                     'and Shuffle:', shuffle)
                cv_iter = StratifiedKFold(
                    n_splits=n_folds, shuffle=shuffle,
                    random_state=random_state if shuffle else None).split(X, y)
            else:
                info('Will use K-fold CV with K:', n_folds, 'and Shuffle:',
                     shuffle)
                cv_iter = KFold(
                    n_splits=n_folds, shuffle=shuffle,
                    random_state=random_state if shuffle else None).split(X)
        else:
            if not shuffle:  # always choose the last samples.
                info('Will use the last', valid_size,
                     'portion of samples for validation')
                n_train = int(len(y) * (1 - valid_size))
                valid_fold = np.ones(len(y), dtype=int)  # np.int was removed in NumPy 1.24
                valid_fold[:n_train] = -1  # "-1" indicates train fold.
                cv_iter = PredefinedSplit(valid_fold).split()
            elif is_classif:
                info('Will use stratified shuffle-and-split with validation '
                     'portion:', valid_size)
                cv_iter = StratifiedShuffleSplit(
                    n_splits=1, test_size=valid_size,
                    random_state=random_state).split(X, y)
            else:
                info('Will use shuffle-and-split with validation portion:',
                     valid_size)
                cv_iter = ShuffleSplit(
                    n_splits=1, test_size=valid_size,
                    random_state=random_state).split(X)

        # Use the above iterator for cross-validation prediction.
        cv_y_pool = np.array([])
        cv_pred_pool = np.array([])
        cv_n_iters = np.array([])
        for train_index, valid_index in cv_iter:
            Xfit, Xval = X[train_index], X[valid_index]
            yfit, yval = y[train_index], y[valid_index]
            if EX_list is not None:
                _EX_list = [(EX[train_index], EX[valid_index])
                            for EX in EX_list]
                EXfit_list, EXval_list = zip(*_EX_list)
            else:
                EXfit_list = None
                EXval_list = None
            XEXfit, XEXval = transform_combine_XEX(Xfit, info, preprocessings,
                                                   Xval, EXfit_list,
                                                   ex_pps_list, EXval_list)
            learner = copy.deepcopy(untrained_learner)
            info('Training learner', learner, 'on X/EX of dimension',
                 XEXfit.shape)
            if hasattr(learner, "partial_fit") and use_partial_fit:
                learner, n_iters = pfit_until_convergence(learner,
                                                          is_classif,
                                                          XEXfit,
                                                          yfit,
                                                          info,
                                                          best_loss=best_loss,
                                                          XEXval=XEXval,
                                                          yval=yval,
                                                          timeout=timeout,
                                                          t_start=t_start)
            else:
                learner.fit(XEXfit, yfit)
                n_iters = None
            if learner is None:
                break
            cv_y_pool = np.append(cv_y_pool, yval)
            info('Scoring on X/EX validation of shape', XEXval.shape)
            cv_pred_pool = np.append(cv_pred_pool, learner.predict(XEXval))
            cv_n_iters = np.append(cv_n_iters, n_iters)
        else:  # all CV folds are exhausted.
            if loss_fn is None:
                if is_classif:
                    loss = 1 - accuracy_score(cv_y_pool, cv_pred_pool)
                    # -- squared standard error of mean
                    lossvar = (loss * (1 - loss)) / max(1, len(cv_y_pool) - 1)
                    info('OK trial with accuracy %.1f +- %.1f' %
                         (100 * (1 - loss), 100 * np.sqrt(lossvar)))
                else:
                    loss = 1 - r2_score(cv_y_pool, cv_pred_pool)
                    lossvar = None  # variance of R2 is undefined.
                    info('OK trial with R2 score %.2e' % (1 - loss))
            else:
                # Use a user specified loss function
                loss = loss_fn(cv_y_pool, cv_pred_pool)
                lossvar = None
                info('OK trial with loss %.1f' % loss)
            t_done = time.time()
            rval = {
                'loss': loss,
                'loss_variance': lossvar,
                'learner': untrained_learner,
                'preprocs': preprocessings,
                'ex_preprocs': ex_pps_list,
                'status': hyperopt.STATUS_OK,
                'duration': t_done - t_start,
                # n_iters is None for every fold when partial_fit was not used
                'iterations': (cv_n_iters.max()
                               if cv_n_iters.size and cv_n_iters[0] is not None
                               else None),
            }
            rtype = 'return'
        # The for loop exited via break: one fold did not finish running.
        if learner is None:
            t_done = time.time()
            rval = {
                'status': hyperopt.STATUS_FAIL,
                'failure': 'Not enough time to finish training on \
                            all CV folds',
                'duration': t_done - t_start,
            }
            rtype = 'return'

    ##==== Cost function exception handling ====##
    except (NonFiniteFeature, ) as exc:
        print('Failing trial due to NaN in', str(exc))
        t_done = time.time()
        rval = {
            'status': hyperopt.STATUS_FAIL,
            'failure': str(exc),
            'duration': t_done - t_start,
        }
        rtype = 'return'

    except (ValueError, ) as exc:
        if ('k must be less than or equal'
                ' to the number of training points') in str(exc):
            t_done = time.time()
            rval = {
                'status': hyperopt.STATUS_FAIL,
                'failure': str(exc),
                'duration': t_done - t_start,
            }
            rtype = 'return'
        else:
            rval = exc
            rtype = 'raise'

    except (AttributeError, ) as exc:
        if "'NoneType' object has no attribute 'copy'" in str(exc):
            print('Failing due to k_means_ weirdness')
            # -- sklearn/cluster/k_means_.py line 270 raises this sometimes
            t_done = time.time()
            rval = {
                'status': hyperopt.STATUS_FAIL,
                'failure': str(exc),
                'duration': t_done - t_start,
            }
            rtype = 'return'
        else:
            rval = exc
            rtype = 'raise'

    except Exception as exc:
        rval = exc
        rtype = 'raise'

    # -- return the result to calling process
    _conn.send((rtype, rval))
Example #14
    def gridSearchSingleSet(self, parameters, datasetIdx):
        """
        Perform grid search using a single set
        Goals is to find the optimal classifier parameters
        :return:
        """

        datasetId = self.datasetIds[datasetIdx]

        dataset = {
            'id': datasetId,
            'videoFile': os.path.join(self.projectDirectory, 'videos', datasetId+".MOV"),
            'cutouts': {
                'posDir': os.path.join(self.projectDirectory, 'cutouts', datasetId, 'pos'),
                'negDir': os.path.join(self.projectDirectory, 'cutouts', datasetId, 'neg')
            },
            'labelsFile': os.path.join(self.projectDirectory, 'labels', datasetId+"_output.txt"),
            'framesDir': os.path.join(self.projectDirectory, 'frames', datasetId),
        }

        foldsCount = 5
        objectIndices = self.getObjectsIndices(dataset['cutouts']['posDir'])
        foldsIndices = self.getFoldsIndices(objectIndices, foldsCount)

        negCutoutFiles = glob.glob(dataset['cutouts']['negDir']+"/*")
        negSamples = []
        for fileName in negCutoutFiles:
            negSamples.append({
                'fileName': fileName,
                'label': False,
                'features': self.getImageFileFeatures(fileName)
            })

        negSamplesPerFold = len(negSamples) // foldsCount  # integer division for slicing

        samples = []
        foldIdx = 0
        for objectIndices in foldsIndices:
            posCutoutFiles = []
            for objectIndex in objectIndices:
                posCutoutFiles += (glob.glob(dataset['cutouts']['posDir'] + "/cutout_" + str(objectIndex) + "_*.png"))

            foldPosSamples = []
            for fileName in posCutoutFiles:
                foldPosSamples.append({
                    'fileName': fileName,
                    'label': True,
                    'features': self.getImageFileFeatures(fileName),
                    'foldIdx': foldIdx
                })

            foldNegSamples = negSamples[negSamplesPerFold*foldIdx:negSamplesPerFold*foldIdx + negSamplesPerFold]
            for foldNegSample in foldNegSamples:
                foldNegSample['foldIdx'] = foldIdx

            samples.extend(foldPosSamples+foldNegSamples)
            foldIdx += 1

        X = [sample['features'] for sample in samples]
        y = [sample['label'] for sample in samples]
        test_fold = [sample['foldIdx'] for sample in samples]
        ps = PredefinedSplit(test_fold=test_fold)

        if self.classifierType == 'SVM-RBF':
            est = svm.SVC()
            parameters = {'kernel':['rbf'], 'C':parameters['C'], 'gamma': parameters['gamma']}
        elif self.classifierType == 'KNN':
            est = KNeighborsClassifier()
            parameters = {'n_neighbors': parameters['n_neighbors']}
        elif self.classifierType == 'SVM-LIN':
            est = svm.SVC()
            parameters = {'kernel': ['linear'], 'C': parameters['C']}
        else:
            raise Exception("Uknown classifier type %s" % self.classifierType)

        clf = GridSearchCV(
            estimator=est,
            param_grid=parameters,
            n_jobs=1,
            pre_dispatch='2*n_jobs',
            refit=True,
            cv=ps
        )  # the old iid= argument was removed in scikit-learn 0.24
        clf.fit(X, y)

        print ("=== Scores:")
        pprint(clf.grid_scores_)
        print ("=== Best score:")
        pprint(clf.best_score_)
        print ("=== Best params:")
        pprint(clf.best_params_)

        return clf.best_score_
Example #15
    def gridSearch(self, parameters):
        """
        Perform a grid search using all the available datasets.
        The goal is to find the optimal classifier parameters.

        :return:
        """

        datasets = []
        for setIdx, datasetId in enumerate(self.datasetIds):
            dataset = {
                'id': datasetId,
                'videoFile': os.path.join(self.projectDirectory, 'videos', datasetId+".MOV"),
                'cutouts': {
                    'posDir': os.path.join(self.projectDirectory, 'cutouts', datasetId, 'pos'),
                    'negDir': os.path.join(self.projectDirectory, 'cutouts', datasetId, 'neg')
                },
                'labelsFile': os.path.join(self.projectDirectory, 'labels', datasetId+"_output.txt"),
                'framesDir': os.path.join(self.projectDirectory, 'frames', datasetId),
            }

            posCutoutFiles = (glob.glob(dataset['cutouts']['posDir'] + "/*.png"))
            posLabels = [True] * len(posCutoutFiles)
            negCutoutFiles = (glob.glob(dataset['cutouts']['negDir'] + "/*.png"))
            negLabels = [False] * len(negCutoutFiles)
            samples = []
            for cutoutFile, label in zip(posCutoutFiles + negCutoutFiles, posLabels + negLabels):
                samples.append({
                    'fileName': cutoutFile,
                    'label': label,
                    'features': self.getImageFeatures(cv2.imread(cutoutFile)),
                    'foldIdx': setIdx
                })

            dataset['samples'] = samples

            datasets.append(dataset)

        X = []
        y = []
        test_fold = []
        for dataset in datasets:
            X.extend([sample['features'] for sample in dataset['samples']])
            y.extend([sample['label'] for sample in dataset['samples']])
            test_fold.extend([sample['foldIdx'] for sample in dataset['samples']])

        ps = PredefinedSplit(test_fold=test_fold)

        if self.classifierType == 'SVM-RBF':
            est = svm.SVC()
            parameters = {'kernel':['rbf'], 'C': parameters['C'], 'gamma': parameters['gamma']}
        elif self.classifierType == 'KNN':
            est = KNeighborsClassifier()
            parameters = {'n_neighbors':parameters['n_neighbors']}
        elif self.classifierType == 'SVM-LIN':
            est = svm.SVC()
            parameters = {'kernel': ['linear'], 'C': parameters['C']}
        else:
            raise Exception("Uknown classifier type %s" % self.classifierType)


        clf = GridSearchCV(
            estimator=est,
            param_grid=parameters,
            n_jobs=1,
            pre_dispatch='2*n_jobs',
            refit=True,
            cv=ps
        )  # the old iid= argument was removed in scikit-learn 0.24
        clf.fit(X, y)
        print ("=== Scores:")
        pprint(clf.grid_scores_)
        print ("=== Best score:")
        pprint(clf.best_score_)
        print ("=== Best params:")
        pprint(clf.best_params_)

        return clf.best_score_
Example #16
    # Uncommenting more parameters gives better exploring power but will
    # increase processing time in a combinatorial way.
    parameters = {
        'tfidf__max_df': [0.75, 1.],  # filters out a few dozen words
        'tfidf__min_df': (5, 10, 20, 50),
        'tfidf__max_features': (200000, 400000, 600000),
        'tfidf__ngram_range': [(1, 3)],  # unigrams through trigrams
        'tfidf__use_idf': [1],
        'tfidf__norm': ('l1', 'l2'),
        'clf__alpha': (0.00001, 0.000005, 0.000001),
        'clf__penalty': ('l2', 'elasticnet'),
        #'clf__n_iter': (10, 50, 80),
    }

    # train documents are marked -1 (always train); validation documents
    # form test fold 0.
    test_fold = np.zeros((train_docs_num + val_docs_num), dtype='int')
    test_fold[:train_docs_num] = -1
    ps = PredefinedSplit(test_fold=test_fold)

    t0 = time()

    # f1_func is a project-specific F1 helper.
    my_score = make_scorer(f1_func, greater_is_better=True)
    grid_search = GridSearchCV(pipeline,
                               parameters,
                               cv=ps,
                               n_jobs=-1,
                               verbose=1,
                               scoring=my_score)

    print("Performing grid search...")
    print("pipeline:", [name for name, _ in pipeline.steps])
    print("parameters:")
    pprint(parameters)
Example #17
    def transform(self, X):  # from the ColumnFilter transformer used below
        X = X.set_index('SalesID')[self.columns].sort_index()
        return X


if __name__ == '__main__':
    df = pd.read_csv('Train.csv')
    df = df.set_index('SalesID').sort_index()
    y = df.SalePrice

    # This is for predefined split... we want -1 for our training split,
    # 0 for the test split.
    cv_cutoff_date = pd.to_datetime('2011-01-01')
    cv = -1 * (pd.to_datetime(df.saledate) < cv_cutoff_date).astype(int)

    cross_val = PredefinedSplit(cv)

    p = Pipeline([('filter', FilterColumns()), ('type_change', DataType()),
                  ('replace_outliers', ReplaceOutliers()),
                  ('compute_age', ComputeAge()),
                  ('nearest_average', ComputeNearestMean()),
                  ('columns', ColumnFilter()), ('lm', LinearRegression())])
    df = df.reset_index()

    def rmsle(y_hat, y):
        target = y
        predictions = y_hat
        log_diff = np.log(predictions + 1) - np.log(target + 1)
        return np.sqrt(np.mean(log_diff**2))

    # GridSearch
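    # The snippet breaks off here; a minimal sketch of a plausible
    # continuation, wiring the rmsle helper and the predefined split above
    # into a grid search (the parameter grid below is hypothetical):
    from sklearn.metrics import make_scorer
    from sklearn.model_selection import GridSearchCV

    rmsle_scorer = make_scorer(rmsle, greater_is_better=False)
    param_grid = {'lm__fit_intercept': [True, False]}  # hypothetical grid
    gs = GridSearchCV(p, param_grid, scoring=rmsle_scorer, cv=cross_val)
    gs.fit(df, y)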
Example #18
train_data = pd.read_csv('semeval2016-task6-trainingdata-utf-8.txt',
                         sep='\t',
                         encoding='utf8',
                         index_col=0)
targets = list(train_data.Target.unique())

for target in targets:
    print(80 * "=")
    print(target)
    print(80 * "=")

    target_idx = train_data.Target == target
    target_train_data = train_data[target_idx]
    target_true_stances = target_train_data.Stance

    print('training instances:', len(train_data))
    print('target training instances:', len(target_train_data))

    # Stratified 5-fold split over this target's rows only; every other row
    # stays at -1 (train-only) in the predefined split.
    target_cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=13)
    predef_test_fold = -np.ones(len(train_data), dtype='int')
    target_positions = np.where(target_idx)[0]
    for fold, (_, test) in enumerate(
            target_cv.split(target_train_data, target_true_stances)):
        predef_test_fold[target_positions[test]] = fold

    train_cv = PredefinedSplit(predef_test_fold)

    for train, test in train_cv.split():
        print(len(train), len(test), len(train) + len(test))
        print(train_data.Target.iloc[test])
Example #19
def data_init(X_train, X_val, X_test, y_train, y_val, y_test, k):
    # min-max scaling
    min_max_scaler = MinMaxScaler()
    min_max_scaler.fit(X_train)
    X_train = min_max_scaler.transform(X_train)
    X_val = min_max_scaler.transform(X_val)
    X_test = min_max_scaler.transform(X_test)
    # split the majority class into k chunks for ensemble learning
    X_train_1 = X_train[y_train == 1]
    y_train_1 = y_train[y_train == 1]
    X_train_0 = X_train[y_train == 0]
    y_train_0 = y_train[y_train == 0]
    step_size = X_train_0.shape[0] // k

    X_train_need = []
    y_train_need = []
    for i in range(k):
        # plain slicing clamps at the array end; the original
        # min(shape[0] - 1, ...) bound silently dropped the last row
        tmp_x = X_train_0[i * step_size:(i + 1) * step_size]
        tmp_y = y_train_0[i * step_size:(i + 1) * step_size]
        X_train_need.append(np.concatenate((X_train_1, tmp_x), axis=0))
        y_train_need.append(np.concatenate((y_train_1, tmp_y), axis=0))

    C_params = [i / 10 for i in range(1, 10)] + list(range(1, 10)) + list(range(10, 100, 10))
    cw_params = list(range(1, 10)) + list(range(10, 100, 10))
    params = {
        'C': C_params,
        'class_weight': [{1:w} for w in cw_params],
    }

    train_pred_prob_record = []
    val_pred_prob_record = []
    test_pred_prob_record = []
    for i in range(k):
        print(i)
        train_val_features = np.concatenate((X_train_need[i], X_val), axis=0)
        train_val_labels = np.concatenate((y_train_need[i], y_val), axis=0)
        test_fold = np.zeros(train_val_features.shape[0])
        test_fold[:X_train_need[i].shape[0]] = -1  # train rows stay out of the test fold
        ps = PredefinedSplit(test_fold=test_fold)

        # `my_scoring` is assumed to be a scorer defined at module level.
        model = GridSearchCV(estimator=LogisticRegression(), param_grid=params,
                             scoring=my_scoring, n_jobs=-1, cv=ps, verbose=0)
        model.fit(train_val_features, train_val_labels)
        print(model.best_params_)
        print(model.best_score_)
        train_pr = model.predict_proba(X_train)[:, 1]
        val_pr = model.predict_proba(X_val)[:, 1]
        test_pr = model.predict_proba(X_test)[:, 1]
        
        utils.model_key_performance(train_pr, y_train)
        utils.model_key_performance(val_pr, y_val)
        utils.model_key_performance(test_pr, y_test)
        
        train_pred_prob_record.append(train_pr)
        val_pred_prob_record.append(val_pr)
        test_pred_prob_record.append(test_pr)

    # stack the k models' probabilities into (n_samples, k) ensemble features
    # (built once after the loop instead of on every iteration)
    train_ensum = np.array(train_pred_prob_record).T
    val_ensum = np.array(val_pred_prob_record).T
    test_ensum = np.array(test_pred_prob_record).T
    return train_ensum, val_ensum, test_ensum
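
A hypothetical end-to-end use of data_init together with the stacking function from Example #10, which consumes the ensembled probabilities:

train_ensum, val_ensum, test_ensum = data_init(
    X_train, X_val, X_test, y_train, y_val, y_test, k=5)
results = stacking(train_ensum, val_ensum, test_ensum, y_train, y_val, y_test)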