Example #1
def test_1():
    """
    Test models in overfit mode (alpha=0, i.e. no regularization)
    """
    clf1 = mord.OrdinalRidge(alpha=0.)
    clf1.fit(X, y)

    clf2 = mord.LogisticAT(alpha=0.)
    clf2.fit(X, y)

    # the score is minus the absolute error; 0 is perfect
    # assert clf1.score(X, y) < clf2.score(X, y)

    clf3 = mord.LogisticSE(alpha=0.)
    clf3.fit(X, y)
    pred3 = clf3.predict(X)
    pred2 = clf2.predict(X)

    # check that the absolute-error model (LogisticAT) does at least as well
    # as the squared-error surrogate (LogisticSE) on mean absolute error
    assert np.abs(pred2 - y).mean() <= np.abs(pred3 - y).mean()
    # # the score is - absolute error, 0 is perfect
    # assert_almost_equal(clf.score(X, y), 0., places=2)
    #
    # clf = mord.LogisticIT(alpha=0.)
    # clf.fit(X, y)
    # # the score is classification error, 1 is perfect
    # assert_almost_equal(clf.score(X, y), 1., places=2)

    # test on sparse matrices
    X_sparse = sparse.csr_matrix(X)
    clf4 = mord.LogisticAT(alpha=0.)
    clf4.fit(X_sparse, y)
    pred4 = clf4.predict(X_sparse)
    assert metrics.mean_absolute_error(y, pred4) < 1.
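This test relies on module-level fixtures X and y (and imports) from test_fit.py that are not shown in the excerpt. A minimal stand-in, under the assumption that y holds a handful of ordered classes:

import numpy as np
from scipy import sparse
from sklearn import metrics
import mord

# hypothetical fixture: features with a latent linear score,
# digitized into 5 ordered classes (0..4)
np.random.seed(0)
X = np.random.randn(100, 10)
score = X.dot(np.random.randn(10))
y = np.digitize(score, np.percentile(score, [20, 40, 60, 80]))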
Example #2
def order_logit_regression():
    data = read_csv(CSV_PATH)
    bunch = Bunch(data=data.iloc[:, 1:-1], target=data.iloc[:, -1])
    d = bunch.data

    train_len = int(0.75 * d.shape[0])
    # pandas removed .ix; positional slicing with .iloc is the equivalent here
    trainX, trainY = d.iloc[:train_len, :], bunch.target[:train_len]
    testX, testY = d.iloc[train_len:, :], bunch.target[train_len:]

    clf1 = mord.LogisticAT(alpha=0.5)
    clf1.fit(trainX, trainY)
    pred = clf1.predict(testX)
    draw_acc_matrix(testY, pred, train_len)
    print('Accuracy of LogisticAT: %s' % metrics.accuracy_score(testY, pred))
    print('Mean absolute error of LogisticAT: %s' %
          metrics.mean_absolute_error(testY, pred))

    clf2 = mord.LogisticIT(alpha=0.5)
    clf2.fit(trainX, trainY)
    pred2 = clf2.predict(testX)
    draw_acc_matrix(testY, pred2, train_len)
    print('Accuracy of LogisticIT: %s' % metrics.accuracy_score(testY, pred2))
    print('Mean absolute error of LogisticIT: %s' %
          metrics.mean_absolute_error(testY, pred2))

    clf3 = mord.LogisticSE(alpha=0.5)
    clf3.fit(trainX, trainY)
    pred3 = clf3.predict(testX)
    draw_acc_matrix(testY, pred3, train_len)
    print('Accuracy of LogisticSE: %s' % metrics.accuracy_score(testY, pred3))
    print('Mean absolute error of LogisticSE: %s' %
          metrics.mean_absolute_error(testY, pred3))
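draw_acc_matrix and CSV_PATH are defined elsewhere in this project. A minimal sketch of what such a plotting helper might look like (its real signature and behaviour are assumptions):

import matplotlib.pyplot as plt
from sklearn import metrics

def draw_acc_matrix(y_true, y_pred, train_len):
    # hypothetical helper: heatmap of the test-set confusion matrix
    cm = metrics.confusion_matrix(y_true, y_pred)
    plt.matshow(cm)
    plt.title('Confusion matrix (train size: %d)' % train_len)
    plt.colorbar()
    plt.xlabel('predicted')
    plt.ylabel('true')
    plt.show()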
Example #3
File: test_fit.py Project: telgip/mord
def test_binary_class():
    Xc, yc = datasets.make_classification(n_classes=2, n_samples=1000)
    clf = linear_model.LogisticRegression(C=1e6)
    clf.fit(Xc[:500], yc[:500])
    pred_lr = clf.predict(Xc[500:])

    clf = mord.LogisticAT(alpha=1e-6)
    clf.fit(Xc[:500], yc[:500])
    pred_at = clf.predict(Xc[500:])
    # with only two classes, the ordinal model reduces to binary logistic
    # regression, so its predictions should match those of LogisticRegression
    assert_almost_equal(np.abs(pred_lr - pred_at).mean(), 0.)

    clf2 = mord.LogisticSE(alpha=1e-6)
    clf2.fit(Xc[:500], yc[:500])
    pred_se = clf2.predict(Xc[500:])
    assert_almost_equal(np.abs(pred_lr - pred_se).mean(), 0.)
Example #4
def test_predict_proba_nonnegative():
    """
    Test that predict_proba() function outputs a tuple of non-negative values
    """
    def check_for_negative_prob(proba):
        for p in np.ravel(proba):
            assert_greater_equal(np.round(p, 7), 0)

    clf = mord.LogisticAT(alpha=0.)
    clf.fit(X, y)
    check_for_negative_prob(clf.predict_proba(X))

    clf2 = mord.LogisticIT(alpha=0.)
    clf2.fit(X, y)
    check_for_negative_prob(clf2.predict_proba(X))

    clf3 = mord.LogisticSE(alpha=0.)
    clf3.fit(X, y)
    check_for_negative_prob(clf3.predict_proba(X))
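A natural companion check, not in the original test, is that each row of predict_proba sums to one. A minimal sketch, assuming the fitted clf from above and normalized class probabilities:

import numpy as np

proba = clf.predict_proba(X)
assert np.allclose(proba.sum(axis=1), 1.0)  # each row is a probability distribution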
Example #5
def train_ordinal_logistic(train_features, train_labels, skip_grid_search,
                           evaluation, num_jobs, loss, alpha, cost,
                           ordinal_algorithm):
    """
  returns the trained ordinal logistic model. loss, alpha and cost are ignored if grid
  search is requested.
  alpha: used only for se, it, at, and ridge and if grid search is not requested
  cost: used only for lad and if grid search is not requested
  loss: used only for lad and if grid search is not requested
  """
    # grid search requested: find the parameters that achieve the highest average score
    if not skip_grid_search:
        penalty_weights = 'dummy'
        clf = grid_search.grid_search(evaluation, train_features, train_labels,
                                      penalty_weights, ordinal_algorithm,
                                      num_jobs)
        params = clf.best_params_
        if 'loss' in params:
            loss = params['loss']
        if 'alpha' in params:
            alpha = params['alpha']
        if 'cost' in params:
            cost = params['cost']

    # Now perform the training on full train data.
    if ordinal_algorithm == 'logisticse':
        model = mord.LogisticSE(alpha=alpha, max_iter=20000)
    elif ordinal_algorithm == 'logisticit':
        model = mord.LogisticIT(alpha=alpha, max_iter=20000)
    elif ordinal_algorithm == 'logisticat':
        model = mord.LogisticAT(alpha=alpha, max_iter=20000)
    elif ordinal_algorithm == 'ordinalridge':
        model = mord.OrdinalRidge(alpha=alpha)
    elif ordinal_algorithm == 'lad':
        model = mord.LAD(C=cost, loss=loss, max_iter=10000)
    else:
        raise ValueError('unknown ordinal_algorithm: %s' % ordinal_algorithm)
    model = model.fit(train_features, train_labels)

    return model
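A hypothetical invocation, assuming grid search is skipped and the 'logisticat' variant; every argument value here is illustrative rather than taken from the project:

model = train_ordinal_logistic(train_features=X_train,
                               train_labels=y_train,
                               skip_grid_search=True,
                               evaluation=None,  # only consulted when grid search runs
                               num_jobs=1,
                               loss='epsilon_insensitive',  # LAD-only, ignored here
                               alpha=1.0,
                               cost=1.0,  # LAD-only, ignored here
                               ordinal_algorithm='logisticat')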
Example #6
# ordinal encoding of the 'Cont' column ('Low' is presumably mapped to 1
# just above this excerpt)
features.loc[features.Cont == 'Medium', 'Cont'] = 2
features.loc[features.Cont == 'High', 'Cont'] = 3

le = preprocessing.LabelEncoder()
le.fit(features.loc[:, 'Type'])
features.loc[:, 'type_encoded'] = le.transform(features.loc[:, 'Type'])

X, y = features.loc[:, ('Infl', 'Cont', 'type_encoded')], data.target

clf1 = linear_model.LogisticRegression(solver='lbfgs',
                                       multi_class='multinomial')
clf1.fit(X, y)

print('Mean Absolute Error of LogisticRegression: %s' %
      metrics.mean_absolute_error(clf1.predict(X), y))

clf2 = mord.LogisticAT(alpha=1.)
clf2.fit(X, y)
print('Mean Absolute Error of LogisticAT: %s' %
      metrics.mean_absolute_error(clf2.predict(X), y))

clf3 = mord.LogisticIT(alpha=1.)
clf3.fit(X, y)
print('Mean Absolute Error of LogisticIT: %s' %
      metrics.mean_absolute_error(clf3.predict(X), y))

clf4 = mord.LogisticSE(alpha=1.)
clf4.fit(X, y)
print('Mean Absolute Error of LogisticSE: %s' %
      metrics.mean_absolute_error(clf4.predict(X), y))
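One caveat about the snippet above: 'Type' is nominal, so a LabelEncoder imposes an arbitrary ordering that the linear models will treat as meaningful. A common alternative (a sketch, not part of the original) is one-hot encoding:

import pandas as pd

# hypothetical alternative: one-hot encode the nominal 'Type' column
X = pd.concat([features.loc[:, ('Infl', 'Cont')],
               pd.get_dummies(features['Type'], prefix='Type')], axis=1)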
Example #7
with open('yfile', 'rb') as yfile:
    Y = pickle.load(yfile)

print("Loading x...")
with open('xfile', 'rb') as xfile:
    X = pickle.load(xfile)

X_train, X_test, y_train, y_test = train_test_split(X,
                                                    Y,
                                                    test_size=0.4,
                                                    random_state=1)

#model = md.LogisticIT(alpha=alpha)
model = md.LogisticSE(alpha=alpha)  # 'md' is presumably the mord import alias; alpha is set earlier in the script

print("Alpha:", alpha)

print("Training model...")
model.fit(X_train, y_train)

print("Saving model")
with open('model_file', 'wb') as model_file:  # 'wb': append mode ('ab') would stack multiple pickles in one file
    pickle.dump(model, model_file)

print("Making predictions")
#predictions = model.predict(X_train)
predictions = model.predict(X_test)
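The snippet ends after predicting; a minimal follow-up for scoring those predictions (the metric choice is an assumption, in line with the other examples here):

from sklearn import metrics

print("MAE:", metrics.mean_absolute_error(y_test, predictions))
print("Accuracy:", metrics.accuracy_score(y_test, predictions))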
Example #8
def train_user_classification(data, id_table, label_name, model_type, run_id):
    print('Model:', model_type, ', Label:', label_name)
    image_filename = os.path.join(HOME_DIRECTORY, 'output', run_id,
                                  '%s_%s.png' % (model_type, label_name))
    csv_filename = os.path.join(HOME_DIRECTORY, 'output', run_id,
                                '%s_%s.csv' % (model_type, label_name))
    if os.path.exists(image_filename):
        return

    results = pd.DataFrame(columns=[
        'subject_id', 'split_id', 'n_total', 'n_train', 'n_test', 'auc', 'mse',
        'vse', 'null_mse', 'null_vse', 'mae', 'vae', 'null_mae', 'null_vae',
        'macro_mse', 'macro_vse', 'null_macro_mse', 'null_macro_vse',
        'macro_mae', 'macro_vae', 'null_macro_mae', 'null_macro_vae'
    ])
    sorted_subjects = sorted(id_table.subject_id.unique())
    if DEBUG:
        sorted_subjects = sorted_subjects[:5]

    for subject in sorted_subjects:
        print_debug('--------------')

        # Filter subject's data and generate folds, skipping if not enough data
        subj_id_table, folds = preprocess_data(id_table, subject, label_name)
        if subj_id_table is None:
            continue

        # Go through the folds
        for fold_idx, (id_table_train_idxs,
                       id_table_test_idxs) in enumerate(folds):
            print('Subject: %s Fold: %d' % (subject, fold_idx))

            # Separate train and test IDs
            subj_id_table_train = subj_id_table.iloc[id_table_train_idxs, :]
            subj_id_table_test = subj_id_table.iloc[id_table_test_idxs, :]
            id_train = subj_id_table_train['ID'].values
            id_test = subj_id_table_test['ID'].values

            # Grab corresponding data
            subj_data_train = data[data['ID'].isin(id_train)]
            subj_data_test = data[data['ID'].isin(id_test)]

            # Add labels to the data
            subj_data_train = pd.merge(subj_data_train,
                                       subj_id_table_train[['ID', label_name]],
                                       on='ID',
                                       how='left')
            subj_data_test = pd.merge(subj_data_test,
                                      subj_id_table_test[['ID', label_name]],
                                      on='ID',
                                      how='left')

            # Separate into (train, validation, test) (features, labels)
            # (np.int was removed from NumPy; the builtin int is equivalent)
            x_train = subj_data_train.drop(['ID', label_name], axis=1).values
            y_train = subj_data_train[label_name].values.astype(int)
            x_test = subj_data_test.drop(['ID', label_name], axis=1).values
            y_test = subj_data_test[label_name].values.astype(int)
            x_train, x_valid, y_train, y_valid = \
                train_test_split(x_train, y_train, test_size=FRAC_VALIDATION_DATA, stratify=y_train,
                                 random_state=RANDOM_SEED)
            train_classes, valid_classes, test_classes = np.unique(
                y_train), np.unique(y_valid), np.unique(y_test)
            num_features = x_train.shape[1]

            # Make sure that folds don't cut the data in a weird way
            if len(train_classes) <= 1:
                print_debug('Not enough classes in train')
                continue
            if len(test_classes) <= 1:
                print_debug('Not enough classes in test')
                continue
            if any([c not in train_classes for c in test_classes]):
                print_debug('There is a test class that is not in train')
                continue

            # Prepare data imputer for missing data
            imputer = IterativeImputer(estimator=KNeighborsRegressor(
                n_neighbors=int(num_features / 10)),
                                       random_state=RANDOM_SEED)

            # Construct the automatic feature selection method
            feature_selection = SelectPercentile(mutual_info_classif)
            param_grid = {'featsel__percentile': np.arange(25, 101, 25)}

            # Construct the base model
            # a "missing" class means the labels are not a contiguous 0..k-1
            # range (e.g. class 2 absent), which the ordinal models below
            # cannot handle without remapping
            missing_train_class = any(
                [k != train_classes[k] for k in range(len(train_classes))])
            missing_valid_class = any(
                [k != valid_classes[k] for k in range(len(valid_classes))])
            if model_type == CLASSIF_RANDOM_FOREST:
                base_model = RandomForestClassifier(random_state=RANDOM_SEED)
                param_grid = {
                    'model__n_estimators': np.arange(10, 51, 10),
                    **param_grid
                }
            elif model_type == CLASSIF_XGBOOST:
                base_model = xgb.XGBClassifier(objective="multi:softprob",
                                               random_state=RANDOM_SEED)
                base_model.set_params(**{'num_class': len(train_classes)})
                param_grid = {
                    'model__n_estimators': np.arange(25, 76, 10),
                    **param_grid
                }
            elif model_type == CLASSIF_ORDINAL_RANDOM_FOREST:
                base_model = OrdinalRandomForestClassifier(
                    random_state=RANDOM_SEED)
                param_grid = {
                    'model__n_estimators': np.arange(10, 51, 10),
                    **param_grid
                }
            elif model_type == CLASSIF_ORDINAL_LOGISTIC:
                base_model = mord.LogisticSE()
                param_grid = {
                    'model__alpha': np.logspace(-1, 1, 3),
                    **param_grid
                }
            elif model_type == CLASSIF_MLP:
                base_model = MLPClassifier(max_iter=1000,
                                           random_state=RANDOM_SEED)
                half_x, quart_x = int(num_features / 2), int(num_features / 4)
                param_grid = {
                    # note the trailing comma: (half_x,) is a one-layer tuple,
                    # whereas (half_x) would just be an int
                    'model__hidden_layer_sizes': [(half_x,), (half_x, quart_x)],
                    **param_grid
                }
            else:
                raise Exception('Not a valid model type')

            # Create a pipeline
            pipeline = Pipeline([('imputer',
                                  make_union(imputer, MissingIndicator())),
                                 ('featsel', feature_selection),
                                 ('model', base_model)])

            # Remap classes to fill in gap if one exists
            if model_type in (CLASSIF_ORDINAL_RANDOM_FOREST,
                              CLASSIF_ORDINAL_LOGISTIC):
                if missing_train_class:
                    print_debug('Forced to remap labels')
                    y_train = np.array(
                        list(
                            map(lambda x: np.where(train_classes == x),
                                y_train))).flatten()
                if missing_valid_class:
                    print_debug('Forced to remap labels')
                    y_valid = np.array(
                        list(
                            map(lambda x: np.where(valid_classes == x),
                                y_valid))).flatten()

            # Identify ideal parameters using stratified k-fold cross-validation on validation data
            # StratifiedKFold requires shuffle=True for random_state to apply
            cross_validator = StratifiedKFold(n_splits=PARAM_SEARCH_FOLDS,
                                              shuffle=True,
                                              random_state=RANDOM_SEED)
            grid_search = GridSearchCV(pipeline,
                                       param_grid=param_grid,
                                       cv=cross_validator)
            grid_search.fit(x_valid, y_valid)
            model = pipeline.set_params(**grid_search.best_params_)
            print('Best params:', grid_search.best_params_)

            # Fit the model on train data
            model.fit(x_train, y_train)

            # Predict results on test data
            preds = model.predict(x_test)
            probs = model.predict_proba(x_test)

            # Calculate scores and other subject information
            scores = calculate_scores(y_train, y_test, train_classes,
                                      test_classes, subj_data_test, preds,
                                      probs)
            result = {
                'subject_id': subject,
                'split_id': fold_idx,
                'n_total': len(id_table_train_idxs) + len(id_table_test_idxs),
                'n_train': len(id_table_train_idxs),
                'n_test': len(id_table_test_idxs),
                **scores
            }
            # DataFrame.append was removed in pandas 2.0; use concat instead
            results = pd.concat([results, pd.DataFrame([result])],
                                ignore_index=True)

    # Save results
    results.to_csv(csv_filename, index=False, encoding='utf-8')

    # Plot results
    generate_plots(results, image_filename, model_type, label_name)
    print('**********************')
    return csv_filename, image_filename
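The label-remapping block above maps possibly non-contiguous class labels onto 0..k-1 indices via np.where. An equivalent, more direct expression (a sketch, not the author's code) uses np.unique:

import numpy as np

# np.unique returns the sorted classes and, with return_inverse=True,
# each label's index into that sorted array - i.e. contiguous 0..k-1 codes
train_classes, y_train_remapped = np.unique(y_train, return_inverse=True)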
Example #9
def doAll(trainFileName, testFileName):
    # makeListEntries/makeListText/makeListStars (and betterScoring below)
    # are project-local helpers defined elsewhere in this file
    trainSet = makeListEntries(trainFileName)
    testSet = makeListEntries(testFileName)
    """**************************************"""
    # data
    listTrainText = makeListText(trainSet)
    listTestText = makeListText(testSet)

    # target
    listTrainStars = makeListStars(trainSet)
    listTestStars = makeListStars(testSet)
    """*************************************"""
    # could do CountVectorizer
    cv = CountVectorizer(stop_words='english')

    trainCVMatr = cv.fit_transform(listTrainText)
    testCVMatr = cv.transform(listTestText)

    # could do TfidfVectorizer
    # tv = TfidfVectorizer(stop_words = 'english')

    # trainTVMatr = cv.fit_transform(listTrainText)
    # testTVMatr = cv.transform(listTestText)
    """*************************************"""
    # using CountVectorizer
    LR_CV_model = LogisticRegression(multi_class='multinomial',
                                     max_iter=1000,
                                     class_weight='balanced')
    LR_CV_model.fit(trainCVMatr, listTrainStars)

    # get it to predict
    LR_CV_prediction = LR_CV_model.predict(testCVMatr)

    # get accuracy score
    LR_CV_score = metrics.accuracy_score(listTestStars, LR_CV_prediction)
    LR_CV_f1 = metrics.f1_score(listTestStars,
                                LR_CV_prediction,
                                average='micro')
    LR_CV_r2 = metrics.r2_score(listTestStars,
                                LR_CV_prediction,
                                multioutput='variance_weighted')
    LR_my = betterScoring(listTestStars, LR_CV_prediction)
    # this is the bit with the tfidf vectorizer
    # LR_TV_model = LogisticRegression(multi_class = 'multinomial', max_iter=1000)
    # LR_TV_model.fit(trainTVMatr, listTrainStars)

    # get it to predict
    # LR_TV_prediction = LR_TV_model.predict(testTVMatr)

    # get accuracy score
    # LR_TV_score = metrics.accuracy_score(listTestStars, LR_TV_prediction)

    # what do the data say?
    #print("Multiclass, logistic regression, CountVectorizer: " + str(LR_CV_score))
    #print("Multiclass, logistic regression, TfidfVectorizer: " + str(LR_TV_score))
    """*************************************"""
    # using CountVectorizer
    NB_CV_model = MultinomialNB()
    NB_CV_model.fit(trainCVMatr, listTrainStars)

    # get it to predict
    NB_CV_prediction = NB_CV_model.predict(testCVMatr)

    # get accuracy score
    NB_CV_score = metrics.accuracy_score(listTestStars, NB_CV_prediction)
    NB_CV_f1 = metrics.f1_score(listTestStars,
                                NB_CV_prediction,
                                average='micro')
    NB_CV_r2 = metrics.r2_score(listTestStars,
                                NB_CV_prediction,
                                multioutput='variance_weighted')
    NB_my = betterScoring(listTestStars, NB_CV_prediction)
    # this is the bit with the tfidf vectorizer
    # NB_TV_model = MultinomialNB()
    # NB_TV_model.fit(trainCVMatr, listTrainStars)

    # get it to predict
    # NB_TV_prediction = NB_TV_model.predict(testTVMatr)

    # get accuracy score
    # NB_TV_score = metrics.accuracy_score(listTestStars, NB_TV_prediction)

    # what do the data say?
    #print("Naive Bayes, CountVectorizer: " + str(NB_CV_score))
    # print("Naive Bayes, TfidfVectorizer: " + str(NB_TV_score))
    """*************************************"""
    sid = SentimentIntensityAnalyzer()
    listOfRes = []

    with open(testFileName, 'r') as f:
        data2 = [json.loads(line) for line in f]

    for entry in data2:
        listOfRes.append(sid.polarity_scores(entry['review_body'])['compound'])

    # map compound scores to 1-5 stars; the thresholds q0..q5 are defined
    # elsewhere in the script (see the sketch at the end of this example)
    scaledRes = []
    size = len(listOfRes)
    for i in range(size):
        num = listOfRes[i]
        score = -1  # sentinel if num falls outside [q0, q5]
        if q0 <= num < q1:
            score = 1
        elif q1 <= num < q2:
            score = 2
        elif q2 <= num < q3:
            score = 3
        elif q3 <= num < q4:
            score = 4
        elif q4 <= num <= q5:
            score = 5

        # add score back in
        scaledRes.append(score)

    vader_acc = metrics.accuracy_score(listTestStars, scaledRes)
    vader_f1 = metrics.f1_score(listTestStars, scaledRes, average='micro')
    vader_r2 = metrics.r2_score(listTestStars,
                                scaledRes,
                                multioutput='variance_weighted')
    vader_my = betterScoring(listTestStars, scaledRes)
    """*************************************"""
    # dealing with the ordinal regression
    # (OrdinalClassifier is a project-local wrapper defined elsewhere)
    ord_model = OrdinalClassifier(DecisionTreeClassifier())
    ord_model.fit(trainCVMatr, listTrainStars)
    ord_model_prediction = ord_model.predict(testCVMatr)

    # clamp any out-of-range predictions up to the minimum star rating
    size = len(listTestStars)
    for i in range(size):
        if ord_model_prediction[i] < 1:
            ord_model_prediction[i] = 1

    ord_acc = metrics.accuracy_score(listTestStars, ord_model_prediction)
    ord_f1 = metrics.f1_score(listTestStars,
                              ord_model_prediction,
                              average='micro')
    ord_r2 = metrics.r2_score(listTestStars,
                              ord_model_prediction,
                              multioutput='variance_weighted')
    ord_my = betterScoring(listTestStars, ord_model_prediction)
    """*************************************"""
    # trying mord

    arr = np.asarray(listTrainStars)
    clf2 = mord.LogisticAT(alpha=1.)
    clf2.fit(trainCVMatr, arr)
    clf2_prediction = clf2.predict(testCVMatr)

    LAT_acc = metrics.accuracy_score(listTestStars, clf2_prediction)
    LAT_f1 = metrics.f1_score(listTestStars, clf2_prediction, average='micro')
    LAT_r2 = metrics.r2_score(listTestStars,
                              clf2_prediction,
                              multioutput='variance_weighted')
    LAT_my = betterScoring(listTestStars, clf2_prediction)
    #print('AccuracyScore of LogisticAT %s' %
    #metrics.accuracy_score(listTestStars, clf2.predict(testCVMatr)))

    clf3 = mord.LogisticIT(alpha=1.)
    clf3.fit(trainCVMatr, arr)
    clf3_prediction = clf3.predict(testCVMatr)

    LIT_acc = metrics.accuracy_score(listTestStars, clf3_prediction)
    LIT_f1 = metrics.f1_score(listTestStars, clf3_prediction, average='micro')
    LIT_r2 = metrics.r2_score(listTestStars,
                              clf3_prediction,
                              multioutput='variance_weighted')
    LIT_my = betterScoring(listTestStars, clf3_prediction)
    #print('AccuracyScore of LogisticIT %s' %
    #metrics.accuracy_score(listTestStars, clf3.predict(testCVMatr)))

    clf4 = mord.LogisticSE(alpha=1.)
    clf4.fit(trainCVMatr, arr)
    clf4_prediction = clf4.predict(testCVMatr)

    LSE_acc = metrics.accuracy_score(listTestStars, clf4_prediction)
    LSE_f1 = metrics.f1_score(listTestStars, clf4_prediction, average='micro')
    LSE_r2 = metrics.r2_score(listTestStars,
                              clf4_prediction,
                              multioutput='variance_weighted')
    LSE_my = betterScoring(listTestStars, clf4_prediction)
    #print('AccuracyScore of LogisticSE %s' %
    #metrics.accuracy_score(listTestStars, clf4.predict(testCVMatr)))
    """*************************************"""

    # return value
    categoryName = trainFileName.replace("dataset/prodAnalysis/train_", "")
    categoryName = categoryName.replace(".json", "")
    return [
        categoryName,
        LR_CV_score,
        LR_CV_f1,
        LR_CV_r2,
        LR_my,
        NB_CV_score,
        NB_CV_f1,
        NB_CV_r2,
        NB_my,
        vader_acc,
        vader_f1,
        vader_r2,
        vader_my,
        ord_acc,
        ord_f1,
        ord_r2,
        ord_my,
        LAT_acc,
        LAT_f1,
        LAT_r2,
        LAT_my,
        LIT_acc,
        LIT_f1,
        LIT_r2,
        LIT_my,
        LSE_acc,
        LSE_f1,
        LSE_r2,
        LSE_my,
    ]
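The thresholds q0 through q5 used to bin the VADER compound scores are never defined in this snippet. One plausible construction (an assumption, not the author's code) is to take evenly spaced quantiles of the scores so the five star bins are equally populated:

import numpy as np

# hypothetical: derive bin edges q0..q5 from the compound-score distribution
q0, q1, q2, q3, q4, q5 = np.quantile(listOfRes, [0.0, 0.2, 0.4, 0.6, 0.8, 1.0])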