import collections.abc

import numpy as np
from sklearn.linear_model import LogisticRegression

# update_info_dict, get_metric and _binary_threshold are project-local helpers
# assumed importable from the surrounding module.


def train_logreg(trX,
                 trY,
                 vaX=None,
                 vaY=None,
                 teX=None,
                 teY=None,
                 penalty='l1',
                 max_iter=100,
                 C=2**np.arange(-8, 1).astype(np.float64),
                 seed=42,
                 model=None,
                 eval_test=True,
                 neurons=None,
                 drop_neurons=False,
                 report_metric='acc',
                 automatic_thresholding=False,
                 threshold_metric='acc',
                 micro=False):
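    """Fit logistic-regression probe(s) on trX/trY, cross-validating the
    inverse regularization strength C against the validation set (or the
    training set when no validation data is given). A 2-D trY trains one
    one-vs-rest model per column. Returns (model, scores, preds, c, nnotzero),
    where nnotzero is the number of nonzero regression weights."""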

    # if a single value is provided for C, wrap it in a list so we can loop over it
    if not isinstance(C, collections.abc.Iterable):
        C = [C]
    # extract features for given neuron indices
    if neurons is not None:
        if drop_neurons:
            all_neurons = set(list(range(trX.shape[-1])))
            neurons = set(list(neurons))
            neurons = list(all_neurons - neurons)
        trX = trX[:, neurons]
        if vaX is not None:
            vaX = vaX[:, neurons]
        if teX is not None:
            teX = teX[:, neurons]
    # Cross-validate over C
    n_classes = 1
    if len(trY.shape) > 1:
        n_classes = trY.shape[-1]
    # template for the confusion-count dicts consumed by update_info_dict;
    # defined once here so it also exists when a pretrained model is passed in
    blank_info_dict = {
        'fp': 0,
        'tp': 0,
        'fn': 0,
        'tn': 0,
        'std': 0.,
        'metric': threshold_metric,
        'micro': micro
    }
    scores = []
    if model is None:
        for i, c in enumerate(C):
            if n_classes <= 1:
                # liblinear supports the l1 penalty used by default
                model = LogisticRegression(C=c,
                                           penalty=penalty,
                                           solver='liblinear',
                                           max_iter=max_iter,
                                           random_state=seed)
                model.fit(trX, trY)
                if vaX is not None:
                    info_dict = update_info_dict(
                        blank_info_dict.copy(), vaY,
                        model.predict_proba(vaX)[:, -1])
                else:
                    info_dict = update_info_dict(
                        blank_info_dict.copy(), trY,
                        model.predict_proba(trX)[:, -1])
                scores.append(get_metric(info_dict))
                print(scores[-1])
                del model
            else:
                info_dicts = []
                model = []
                for cls in range(n_classes):
                    _model = LogisticRegression(C=c,
                                                penalty=penalty,
                                                solver='liblinear',
                                                max_iter=max_iter,
                                                random_state=seed)
                    _model.fit(trX, trY[:, cls])
                    if vaX is not None:
                        info_dict = update_info_dict(
                            blank_info_dict.copy(), vaY[:, cls],
                            _model.predict_proba(vaX)[:, -1])
                    else:
                        info_dict = update_info_dict(
                            blank_info_dict.copy(), trY[:, cls],
                            _model.predict_proba(trX)[:, -1])
                    info_dicts.append(info_dict)
                    model.append(_model)
                scores.append(get_metric(info_dicts))
                print(scores[-1])
                del model
        c = C[np.argmax(scores)]
        if n_classes <= 1:
            model = LogisticRegression(C=c,
                                       penalty=penalty,
                                       solver='liblinear',
                                       max_iter=max_iter,
                                       random_state=seed)
            model.fit(trX, trY)
        else:
            model = []
            for cls in range(n_classes):
                _model = LogisticRegression(C=c,
                                            penalty=penalty,
                                            solver='liblinear',
                                            max_iter=max_iter,
                                            random_state=seed)
                _model.fit(trX, trY[:, cls])
                model.append(_model)
    else:
        c = model.C
    # predict probabilities and get accuracy of regression model on train, val, test as appropriate
    # also get number of regression weights that are not zero. (number of features used for modeling)
    scores = []
    if n_classes == 1:
        nnotzero = np.sum(model.coef_ != 0)
        preds = model.predict_proba(trX)[:, -1]
        train_score = get_metric(
            update_info_dict(blank_info_dict.copy(), trY, preds),
            report_metric)
    else:
        nnotzero = 0
        preds = []
        info_dicts = []
        for cls in range(n_classes):
            nnotzero += np.sum(model[cls].coef_ != 0)
            _preds = model[cls].predict_proba(trX)[:, -1]
            info_dicts.append(
                update_info_dict(blank_info_dict.copy(), trY[:, cls], _preds))
            preds.append(_preds)
        nnotzero /= n_classes
        train_score = get_metric(info_dicts, report_metric)
        preds = np.concatenate([p.reshape((-1, 1)) for p in preds], axis=1)
    scores.append(train_score * 100)
    if vaX is None:
        eval_data = trX
        eval_labels = trY
        val_score = train_score
    else:
        eval_data = vaX
        eval_labels = vaY
        if n_classes == 1:
            preds = model.predict_proba(vaX)[:, -1]
            val_score = get_metric(
                update_info_dict(blank_info_dict.copy(), vaY, preds),
                report_metric)
        else:
            preds = []
            info_dicts = []
            for cls in range(n_classes):
                _preds = model[cls].predict_proba(vaX)[:, -1]
                info_dicts.append(
                    update_info_dict(blank_info_dict.copy(), vaY[:, cls],
                                     _preds))
                preds.append(_preds)
            val_score = get_metric(info_dicts, report_metric)
            preds = np.concatenate([p.reshape((-1, 1)) for p in preds], axis=1)
    val_preds = preds
    val_labels = eval_labels
    scores.append(val_score * 100)
    eval_score = val_score
    threshold = np.array([.5] * n_classes)
    if automatic_thresholding:
        # _binary_threshold returns one threshold per class; the single-class
        # branch below reduces it to a scalar
        _, threshold, _, _ = _binary_threshold(
            preds.reshape(-1, n_classes), eval_labels.reshape(-1, n_classes),
            threshold_metric, micro)
    if teX is not None and teY is not None and eval_test:
        eval_data = teX
        eval_labels = teY
        if n_classes == 1:
            preds = model.predict_proba(eval_data)[:, -1]
        else:
            preds = []
            for cls in range(n_classes):
                _preds = model[cls].predict_proba(eval_data)[:, -1]
                preds.append(_preds)
            preds = np.concatenate([p.reshape((-1, 1)) for p in preds], axis=1)
    if n_classes == 1:
        threshold = float(threshold.squeeze())
        eval_score = get_metric(
            update_info_dict(blank_info_dict.copy(),
                             eval_labels,
                             preds,
                             threshold=threshold), report_metric)
    else:
        info_dicts = []
        for cls in range(n_classes):
            info_dicts.append(
                update_info_dict(blank_info_dict.copy(),
                                 eval_labels[:, cls],
                                 preds[:, cls],
                                 threshold=threshold[cls]))
        eval_score = get_metric(info_dicts, report_metric)

    scores.append(eval_score * 100)
    return model, scores, preds, c, nnotzero
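
# A minimal usage sketch on synthetic data (hypothetical toy example; assumes
# the project-local helpers called inside train_logreg are defined):
if __name__ == '__main__':
    rng = np.random.RandomState(0)
    trX = rng.randn(200, 16)
    trY = (trX[:, 0] > 0).astype(int)
    vaX = rng.randn(50, 16)
    vaY = (vaX[:, 0] > 0).astype(int)
    model, scores, preds, best_c, nnotzero = train_logreg(
        trX, trY, vaX, vaY, eval_test=False)
    print('train/val/eval scores:', scores, 'best C:', best_c)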
Example #2
import random

import pandas as pd
from sklearn import metrics
from sklearn.feature_extraction.text import HashingVectorizer
from sklearn.linear_model import LogisticRegression

# dense_features = ngramTrain.toarray()
# dense_test = test_features.toarray()

TfidfAccuracy = []
TfidfModel = []

# randmClassifiers, TfidfTrain/TfidfTest and the train/test frames are assumed
# to be defined earlier in the notebook.
for classifier in randmClassifiers:
    for x in range(0, 10):
        # seeding the random module does not reseed scikit-learn estimators;
        # set each classifier's random_state for full reproducibility
        random.seed(x)
        fit = classifier.fit(TfidfTrain, train['Label'])
        pred = fit.predict(TfidfTest)
        # class-1 probabilities (not used below)
        prob = fit.predict_proba(TfidfTest)[:, 1]

        # accuracy_score expects (y_true, y_pred)
        accuracy = metrics.accuracy_score(test['Label'], pred)
        TfidfAccuracy.append(accuracy)
        TfidfModel.append(classifier.__class__.__name__)

        print('Accuracy of ' + classifier.__class__.__name__ + ' is ' +
              str(accuracy))

rndmModelPerf = pd.DataFrame({'Model': TfidfModel, 'Acc': TfidfAccuracy})
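
# A quick comparison across the ten seeded runs (a sketch; assumes the
# rndmModelPerf frame built above): mean accuracy per model, best first.
meanAccByModel = rndmModelPerf.groupby('Model')['Acc'].mean()
print(meanAccByModel.sort_values(ascending=False))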

# hashing vectorizer - hashes token occurrences into a fixed-size feature space
hashingVectorizer = HashingVectorizer(n_features=50)
hashingTrain = hashingVectorizer.fit_transform(trainHeadlines)

hashingModel = LogisticRegression(solver='lbfgs')
hashingModel = hashingModel.fit(hashingTrain, train["Label"])

hashingTest = hashingVectorizer.transform(testHeadlines)
hashingPreds = hashingModel.predict(hashingTest)
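
# Scoring the hashing model the same way as the TF-IDF models (a sketch;
# assumes the same test['Label'] series used above):
hashingAccuracy = metrics.accuracy_score(test['Label'], hashingPreds)
print('Accuracy of hashing LogisticRegression is ' + str(hashingAccuracy))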