def test_1():
    """Test two models in overfit mode."""
    clf1 = mord.OrdinalRidge(alpha=0.)
    clf1.fit(X, y)

    clf2 = mord.LogisticAT(alpha=0.)
    clf2.fit(X, y)
    # the score is minus the absolute error, 0 is perfect
    # assert clf1.score(X, y) < clf2.score(X, y)

    clf3 = mord.LogisticSE(alpha=0.)
    clf3.fit(X, y)
    pred3 = clf3.predict(X)
    pred2 = clf2.predict(X)

    # check that it predicts better than the surrogate for the other loss
    assert np.abs(pred2 - y).mean() <= np.abs(pred3 - y).mean()

    # # the score is minus the absolute error, 0 is perfect
    # assert_almost_equal(clf.score(X, y), 0., places=2)
    #
    # clf = mord.LogisticIT(alpha=0.)
    # clf.fit(X, y)
    # # the score is classification error, 1 is perfect
    # assert_almost_equal(clf.score(X, y), 1., places=2)

    # test on sparse matrices
    X_sparse = sparse.csr_matrix(X)
    clf4 = mord.LogisticAT(alpha=0.)
    clf4.fit(X_sparse, y)
    pred4 = clf4.predict(X_sparse)
    assert metrics.mean_absolute_error(y, pred4) < 1.
def order_logit_regression():
    data = read_csv(CSV_PATH)
    bunch = Bunch(data=data.iloc[:, 1:-1], target=data.iloc[:, -1])
    d = bunch.data
    train_len = int(0.75 * d.shape[0])
    # .iloc replaces the deprecated .ix indexer; the half-open slice keeps the
    # same rows as the original label-based .ix[:train_len-1, :]
    trainX, trainY = d.iloc[:train_len, :], bunch.target[:train_len]
    testX, testY = d.iloc[train_len:, :], bunch.target[train_len:]

    clf1 = mord.LogisticAT(alpha=0.5)
    clf1.fit(trainX, trainY)
    pred = clf1.predict(testX)
    draw_acc_matrix(testY, pred, train_len)
    print('Accuracy of LogisticAT: %s' % metrics.accuracy_score(testY, pred))
    print('Mean absolute error of LogisticAT: %s' %
          metrics.mean_absolute_error(pred, testY))

    clf2 = mord.LogisticIT(alpha=0.5)
    clf2.fit(trainX, trainY)
    pred2 = clf2.predict(testX)
    draw_acc_matrix(testY, pred2, train_len)
    print('Accuracy of LogisticIT: %s' % metrics.accuracy_score(testY, pred2))
    print('Mean absolute error of LogisticIT: %s' %
          metrics.mean_absolute_error(pred2, testY))

    clf3 = mord.LogisticSE(alpha=0.5)
    clf3.fit(trainX, trainY)
    pred3 = clf3.predict(testX)
    draw_acc_matrix(testY, pred3, train_len)
    print('Accuracy of LogisticSE: %s' % metrics.accuracy_score(testY, pred3))
    print('Mean absolute error of LogisticSE: %s' %
          metrics.mean_absolute_error(pred3, testY))
def test_binary_class():
    Xc, yc = datasets.make_classification(n_classes=2, n_samples=1000)

    clf = linear_model.LogisticRegression(C=1e6)
    clf.fit(Xc[:500], yc[:500])
    pred_lr = clf.predict(Xc[500:])

    clf = mord.LogisticAT(alpha=1e-6)
    clf.fit(Xc[:500], yc[:500])
    pred_at = clf.predict(Xc[500:])
    assert_almost_equal(np.abs(pred_lr - pred_at).mean(), 0.)

    clf2 = mord.LogisticSE(alpha=1e-6)
    clf2.fit(Xc[:500], yc[:500])
    pred_se = clf2.predict(Xc[500:])
    assert_almost_equal(np.abs(pred_lr - pred_se).mean(), 0.)
def test_predict_proba_nonnegative():
    """Test that predict_proba() returns only non-negative probabilities."""
    def check_for_negative_prob(proba):
        for p in np.ravel(proba):
            assert_greater_equal(np.round(p, 7), 0)

    clf = mord.LogisticAT(alpha=0.)
    clf.fit(X, y)
    check_for_negative_prob(clf.predict_proba(X))

    clf2 = mord.LogisticIT(alpha=0.)
    clf2.fit(X, y)
    check_for_negative_prob(clf2.predict_proba(X))

    clf3 = mord.LogisticSE(alpha=0.)
    clf3.fit(X, y)
    check_for_negative_prob(clf3.predict_proba(X))
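# ---------------------------------------------------------------------------
# The tests above rely on module-level imports and a shared X, y dataset that
# are defined elsewhere in the test file. A minimal sketch of a compatible
# setup follows; the synthetic data and the nose.tools assertion helpers are
# assumptions for illustration, not the project's actual fixture.
# ---------------------------------------------------------------------------
import numpy as np
import mord
from scipy import sparse
from sklearn import datasets, linear_model, metrics
from nose.tools import assert_almost_equal, assert_greater_equal

# small dense dataset with ordinal targets in {0, 1, 2, 3}
rng = np.random.RandomState(0)
X = rng.randn(100, 5)
y = np.digitize(X[:, 0], bins=[-1.0, 0.0, 1.0])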
def train_ordinal_logistic(train_features, train_labels, skip_grid_search,
                           evaluation, num_jobs, loss, alpha, cost,
                           ordinal_algorithm):
    """
    Returns the trained ordinal logistic model.

    loss, alpha and cost are ignored if grid search is requested.
    alpha: used only for se, it, at, and ridge and if grid search is not requested
    cost: used only for lad and if grid search is not requested
    loss: used only for lad and if grid search is not requested
    """
    # Grid search requested: find the parameters that achieve the highest
    # average score, then retrain on the full train data below.
    if not skip_grid_search:
        penalty_weights = 'dummy'
        clf = grid_search.grid_search(evaluation, train_features, train_labels,
                                      penalty_weights, ordinal_algorithm,
                                      num_jobs)
        params = clf.best_params_
        if 'loss' in params:  # originally checked 'penalty', which never matches the key read here
            loss = params['loss']
        if 'alpha' in params:
            alpha = params['alpha']
        if 'cost' in params:
            cost = params['cost']

    # Now perform the training on the full train data.
    if ordinal_algorithm == 'logisticse':
        model = mord.LogisticSE(alpha=alpha, max_iter=20000)
    elif ordinal_algorithm == 'logisticit':
        model = mord.LogisticIT(alpha=alpha, max_iter=20000)
    elif ordinal_algorithm == 'logisticat':
        model = mord.LogisticAT(alpha=alpha, max_iter=20000)
    elif ordinal_algorithm == 'ordinalridge':
        model = mord.OrdinalRidge(alpha=alpha)
    elif ordinal_algorithm == 'lad':
        model = mord.LAD(C=cost, loss=loss, max_iter=10000)
    else:
        raise ValueError('Unknown ordinal algorithm: %s' % ordinal_algorithm)
    model = model.fit(train_features, train_labels)
    return model
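# ---------------------------------------------------------------------------
# Hypothetical usage sketch for train_ordinal_logistic: fit a LogisticAT model
# on synthetic ordinal data while skipping the project-specific grid search
# (so the evaluation object is not needed and is passed as None). The data and
# parameter values below are made up for illustration.
# ---------------------------------------------------------------------------
import numpy as np

demo_rng = np.random.RandomState(42)
demo_features = demo_rng.randn(200, 4)
demo_labels = np.digitize(demo_features[:, 0], bins=[-1.0, 0.0, 1.0])

demo_model = train_ordinal_logistic(demo_features, demo_labels,
                                     skip_grid_search=True, evaluation=None,
                                     num_jobs=1, loss='epsilon_insensitive',
                                     alpha=1.0, cost=1.0,
                                     ordinal_algorithm='logisticat')
print(demo_model.predict(demo_features[:5]))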
features.loc[features.Cont == 'Medium', 'Cont'] = 2
features.loc[features.Cont == 'High', 'Cont'] = 3

le = preprocessing.LabelEncoder()
le.fit(features.loc[:, 'Type'])
features.loc[:, 'type_encoded'] = le.transform(features.loc[:, 'Type'])

X, y = features.loc[:, ('Infl', 'Cont', 'type_encoded')], data.target

clf1 = linear_model.LogisticRegression(solver='lbfgs',
                                       multi_class='multinomial')
clf1.fit(X, y)
print('Mean Absolute Error of LogisticRegression: %s' %
      metrics.mean_absolute_error(clf1.predict(X), y))

clf2 = mord.LogisticAT(alpha=1.)
clf2.fit(X, y)
print('Mean Absolute Error of LogisticAT: %s' %
      metrics.mean_absolute_error(clf2.predict(X), y))

clf3 = mord.LogisticIT(alpha=1.)
clf3.fit(X, y)
print('Mean Absolute Error of LogisticIT: %s' %
      metrics.mean_absolute_error(clf3.predict(X), y))

clf4 = mord.LogisticSE(alpha=1.)
clf4.fit(X, y)
print('Mean Absolute Error of LogisticSE: %s' %
      metrics.mean_absolute_error(clf4.predict(X), y))
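# ---------------------------------------------------------------------------
# Minimal alternative sketch for the encoding step above, assuming the 'Infl'
# and 'Cont' columns both take the ordered levels Low < Medium < High: a single
# mapping dict keeps the ordinal meaning explicit instead of repeated .loc
# assignments. This is an illustration, not the original script's code.
# ---------------------------------------------------------------------------
ordinal_levels = {'Low': 1, 'Medium': 2, 'High': 3}
for col in ('Infl', 'Cont'):
    features[col] = features[col].map(ordinal_levels)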
with open('yfile', 'rb') as yfile:
    Y = pickle.load(yfile)

print("Loading x...")
with open('xfile', 'rb') as xfile:
    X = pickle.load(xfile)

X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.4,
                                                    random_state=1)

# model = md.LogisticIT(alpha=alpha)
model = md.LogisticSE(alpha=alpha)
print("Alpha:", alpha)

print("Training model...")
model.fit(X_train, y_train)

print("Saving model")
# 'wb' overwrites any previous model; the original 'ab' would append a second
# pickle object to the same file
with open('model_file', 'wb') as model_file:
    pickle.dump(model, model_file)

print("Making predictions")
# predictions = model.predict(X_train)
predictions = model.predict(X_test)
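# ---------------------------------------------------------------------------
# Hedged follow-up sketch: score the held-out predictions produced above. The
# original script's own reporting is not shown here; scikit-learn's metrics
# module is assumed to be available.
# ---------------------------------------------------------------------------
from sklearn import metrics

print("Mean absolute error:", metrics.mean_absolute_error(y_test, predictions))
print("Accuracy:", metrics.accuracy_score(y_test, predictions))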
def train_user_classification(data, id_table, label_name, model_type, run_id):
    print('Model:', model_type, ', Label:', label_name)

    image_filename = os.path.join(HOME_DIRECTORY, 'output', run_id,
                                  '%s_%s.png' % (model_type, label_name))
    csv_filename = os.path.join(HOME_DIRECTORY, 'output', run_id,
                                '%s_%s.csv' % (model_type, label_name))
    if os.path.exists(image_filename):
        return

    results = pd.DataFrame(columns=[
        'subject_id', 'split_id', 'n_total', 'n_train', 'n_test', 'auc',
        'mse', 'vse', 'null_mse', 'null_vse',
        'mae', 'vae', 'null_mae', 'null_vae',
        'macro_mse', 'macro_vse', 'null_macro_mse', 'null_macro_vse',
        'macro_mae', 'macro_vae', 'null_macro_mae', 'null_macro_vae'
    ])

    sorted_subjects = sorted(id_table.subject_id.unique())
    if DEBUG:
        sorted_subjects = sorted_subjects[:5]

    for subject in sorted_subjects:
        print_debug('--------------')

        # Filter subject's data and generate folds, skipping if not enough data
        subj_id_table, folds = preprocess_data(id_table, subject, label_name)
        if subj_id_table is None:
            continue

        # Go through the folds
        for fold_idx, (id_table_train_idxs, id_table_test_idxs) in enumerate(folds):
            print('Subject: %s Fold: %d' % (subject, fold_idx))

            # Separate train and test IDs
            subj_id_table_train = subj_id_table.iloc[id_table_train_idxs, :]
            subj_id_table_test = subj_id_table.iloc[id_table_test_idxs, :]
            id_train = subj_id_table_train['ID'].values
            id_test = subj_id_table_test['ID'].values

            # Grab corresponding data
            subj_data_train = data[data['ID'].isin(id_train)]
            subj_data_test = data[data['ID'].isin(id_test)]

            # Add labels to the data
            subj_data_train = pd.merge(subj_data_train,
                                       subj_id_table_train[['ID', label_name]],
                                       on='ID', how='left')
            subj_data_test = pd.merge(subj_data_test,
                                      subj_id_table_test[['ID', label_name]],
                                      on='ID', how='left')

            # Separate into (train, validation, test) (features, labels)
            x_train = subj_data_train.drop(['ID', label_name], axis=1).values
            y_train = subj_data_train[label_name].values.astype(int)  # np.int is deprecated
            x_test = subj_data_test.drop(['ID', label_name], axis=1).values
            y_test = subj_data_test[label_name].values.astype(int)
            x_train, x_valid, y_train, y_valid = \
                train_test_split(x_train, y_train,
                                 test_size=FRAC_VALIDATION_DATA,
                                 stratify=y_train, random_state=RANDOM_SEED)
            train_classes, valid_classes, test_classes = \
                np.unique(y_train), np.unique(y_valid), np.unique(y_test)
            num_features = x_train.shape[1]

            # Make sure that folds don't cut the data in a weird way
            if len(train_classes) <= 1:
                print_debug('Not enough classes in train')
                continue
            if len(test_classes) <= 1:
                print_debug('Not enough classes in test')
                continue
            if any([c not in train_classes for c in test_classes]):
                print_debug('There is a test class that is not in train')
                continue

            # Prepare data imputer for missing data
            imputer = IterativeImputer(
                estimator=KNeighborsRegressor(n_neighbors=int(num_features / 10)),
                random_state=RANDOM_SEED)

            # Construct the automatic feature selection method
            feature_selection = SelectPercentile(mutual_info_classif)
            param_grid = {'featsel__percentile': np.arange(25, 101, 25)}

            # Construct the base model
            missing_train_class = any(
                [k != train_classes[k] for k in range(len(train_classes))])
            missing_valid_class = any(
                [k != valid_classes[k] for k in range(len(valid_classes))])
            if model_type == CLASSIF_RANDOM_FOREST:
                base_model = RandomForestClassifier(random_state=RANDOM_SEED)
                param_grid = {'model__n_estimators': np.arange(10, 51, 10),
                              **param_grid}
            elif model_type == CLASSIF_XGBOOST:
                base_model = xgb.XGBClassifier(objective="multi:softprob",
                                               random_state=RANDOM_SEED)
                base_model.set_params(**{'num_class': len(train_classes)})
                param_grid = {'model__n_estimators': np.arange(25, 76, 10),
                              **param_grid}
            elif model_type == CLASSIF_ORDINAL_RANDOM_FOREST:
                base_model = OrdinalRandomForestClassifier(random_state=RANDOM_SEED)
                param_grid = {'model__n_estimators': np.arange(10, 51, 10),
                              **param_grid}
            elif model_type == CLASSIF_ORDINAL_LOGISTIC:
                base_model = mord.LogisticSE()
                param_grid = {'model__alpha': np.logspace(-1, 1, 3),
                              **param_grid}
            elif model_type == CLASSIF_MLP:
                base_model = MLPClassifier(max_iter=1000, random_state=RANDOM_SEED)
                half_x, quart_x = int(num_features / 2), int(num_features / 4)
                # note the trailing comma: (half_x,) is a one-element tuple
                param_grid = {'model__hidden_layer_sizes': [(half_x,), (half_x, quart_x)],
                              **param_grid}
            else:
                raise Exception('Not a valid model type')

            # Create a pipeline
            pipeline = Pipeline([('imputer', make_union(imputer, MissingIndicator())),
                                 ('featsel', feature_selection),
                                 ('model', base_model)])

            # Remap classes to fill in gap if one exists
            if model_type in (CLASSIF_ORDINAL_RANDOM_FOREST,
                              CLASSIF_ORDINAL_LOGISTIC):
                if missing_train_class:
                    print_debug('Forced to remap labels')
                    y_train = np.array(list(map(
                        lambda x: np.where(train_classes == x), y_train))).flatten()
                if missing_valid_class:
                    print_debug('Forced to remap labels')
                    y_valid = np.array(list(map(
                        lambda x: np.where(valid_classes == x), y_valid))).flatten()

            # Identify ideal parameters using stratified k-fold cross-validation
            # on the validation data
            cross_validator = StratifiedKFold(n_splits=PARAM_SEARCH_FOLDS,
                                              random_state=RANDOM_SEED)
            grid_search = GridSearchCV(pipeline, param_grid=param_grid,
                                       cv=cross_validator)
            grid_search.fit(x_valid, y_valid)
            model = pipeline.set_params(**grid_search.best_params_)
            print('Best params:', grid_search.best_params_)

            # Fit the model on train data
            model.fit(x_train, y_train)

            # Predict results on test data
            preds = model.predict(x_test)
            probs = model.predict_proba(x_test)

            # Calculate scores and other subject information
            scores = calculate_scores(y_train, y_test, train_classes,
                                      test_classes, subj_data_test, preds, probs)
            result = {
                'subject_id': subject,
                'split_id': fold_idx,
                'n_total': len(id_table_train_idxs) + len(id_table_test_idxs),
                'n_train': len(id_table_train_idxs),
                'n_test': len(id_table_test_idxs),
                **scores
            }
            results = results.append(result, ignore_index=True)

    # Save results
    results.to_csv(csv_filename, index=False, encoding='utf-8')

    # Plot results
    generate_plots(results, image_filename, model_type, label_name)
    print('**********************')
    return csv_filename, image_filename
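# ---------------------------------------------------------------------------
# Illustrative sketch (not part of the original file): the label-remapping step
# above collapses gaps so that classes become consecutive integers 0..K-1,
# which the ordinal models expect. np.searchsorted over the sorted unique
# classes is an equivalent, vectorised way to express the same mapping.
# ---------------------------------------------------------------------------
import numpy as np

def remap_to_consecutive(labels):
    """Map each label to its index among the sorted unique classes."""
    classes = np.unique(labels)
    return np.searchsorted(classes, labels)

print(remap_to_consecutive(np.array([0, 2, 5, 2])))  # -> [0 1 2 1]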
def doAll(trainFileName, testFileName):
    trainSet = makeListEntries(trainFileName)
    testSet = makeListEntries(testFileName)

    """**************************************"""
    # data
    listTrainText = makeListText(trainSet)
    listTestText = makeListText(testSet)

    # target
    listTrainStars = makeListStars(trainSet)
    listTestStars = makeListStars(testSet)

    """*************************************"""
    # could do CountVectorizer
    cv = CountVectorizer(stop_words='english')
    trainCVMatr = cv.fit_transform(listTrainText)
    testCVMatr = cv.transform(listTestText)

    # could do TfidfVectorizer
    # tv = TfidfVectorizer(stop_words='english')
    # trainTVMatr = tv.fit_transform(listTrainText)
    # testTVMatr = tv.transform(listTestText)

    """*************************************"""
    # using CountVectorizer
    LR_CV_model = LogisticRegression(multi_class='multinomial', max_iter=1000,
                                     class_weight='balanced')
    LR_CV_model.fit(trainCVMatr, listTrainStars)

    # get it to predict
    LR_CV_prediction = LR_CV_model.predict(testCVMatr)

    # get accuracy, F1, and R^2 scores
    LR_CV_score = metrics.accuracy_score(listTestStars, LR_CV_prediction)
    LR_CV_f1 = metrics.f1_score(listTestStars, LR_CV_prediction, average='micro')
    LR_CV_r2 = metrics.r2_score(listTestStars, LR_CV_prediction,
                                multioutput='variance_weighted')
    LR_my = betterScoring(listTestStars, LR_CV_prediction)

    # this is the bit with the tfidf vectorizer
    # LR_TV_model = LogisticRegression(multi_class='multinomial', max_iter=1000)
    # LR_TV_model.fit(trainTVMatr, listTrainStars)
    # get it to predict
    # LR_TV_prediction = LR_TV_model.predict(testTVMatr)
    # get accuracy score
    # LR_TV_score = metrics.accuracy_score(listTestStars, LR_TV_prediction)

    # what do the data say?
    # print("Multiclass, logistic regression, CountVectorizer: " + str(LR_CV_score))
    # print("Multiclass, logistic regression, TfidfVectorizer: " + str(LR_TV_score))

    """*************************************"""
    # using CountVectorizer
    NB_CV_model = MultinomialNB()
    NB_CV_model.fit(trainCVMatr, listTrainStars)

    # get it to predict
    NB_CV_prediction = NB_CV_model.predict(testCVMatr)

    # get accuracy, F1, and R^2 scores
    NB_CV_score = metrics.accuracy_score(listTestStars, NB_CV_prediction)
    NB_CV_f1 = metrics.f1_score(listTestStars, NB_CV_prediction, average='micro')
    NB_CV_r2 = metrics.r2_score(listTestStars, NB_CV_prediction,
                                multioutput='variance_weighted')
    NB_my = betterScoring(listTestStars, NB_CV_prediction)

    # this is the bit with the tfidf vectorizer
    # NB_TV_model = MultinomialNB()
    # NB_TV_model.fit(trainTVMatr, listTrainStars)
    # get it to predict
    # NB_TV_prediction = NB_TV_model.predict(testTVMatr)
    # get accuracy score
    # NB_TV_score = metrics.accuracy_score(listTestStars, NB_TV_prediction)

    # what do the data say?
#print("Naive Bayes, CountVectorizer: " + str(NB_CV_score)) # print("Naive Bayes, TfidfVectorizer: " + str(NB_TV_score)) """*************************************""" sid = SentimentIntensityAnalyzer() listOfRes = [] data2 = [json.loads(line) for line in open(testFileName, 'r')] for entry in data2: listOfRes.append(sid.polarity_scores(entry['review_body'])['compound']) scaledRes = [] size = len(listOfRes) for i in range(size): num = listOfRes[i] score = -1 if num >= q0 and num < q1: score = 1 elif num >= q1 and num < q2: score = 2 elif num >= q2 and num < q3: score = 3 elif num >= q3 and num < q4: score = 4 elif num >= q4 and num <= q5: score = 5 # add score back in scaledRes.append(score) vader_acc = metrics.accuracy_score(listTestStars, scaledRes) vader_f1 = metrics.f1_score(listTestStars, scaledRes, average='micro') vader_r2 = metrics.r2_score(listTestStars, scaledRes, multioutput='variance_weighted') vader_my = betterScoring(listTestStars, scaledRes) """*************************************""" # dealing with the ordinal regression ord_model = OrdinalClassifier(DecisionTreeClassifier()) ord_model.fit(trainCVMatr, listTrainStars) ord_model_prediction = ord_model.predict(testCVMatr) size = len(listTestStars) for i in range(size): if (ord_model_prediction[i] < 1): ord_model_prediction[i] = 1 ord_acc = metrics.accuracy_score(listTestStars, ord_model_prediction) ord_f1 = metrics.f1_score(listTestStars, ord_model_prediction, average='micro') ord_r2 = metrics.r2_score(listTestStars, ord_model_prediction, multioutput='variance_weighted') ord_my = betterScoring(listTestStars, ord_model_prediction) """*************************************""" # trying mord arr = np.asarray(listTrainStars) clf2 = mord.LogisticAT(alpha=1.) clf2.fit(trainCVMatr, arr) clf2_prediction = clf2.predict(testCVMatr) LAT_acc = metrics.accuracy_score(listTestStars, clf2_prediction) LAT_f1 = metrics.f1_score(listTestStars, clf2_prediction, average='micro') LAT_r2 = metrics.r2_score(listTestStars, clf2_prediction, multioutput='variance_weighted') LAT_my = betterScoring(listTestStars, clf2_prediction) #print('AccuracyScore of LogisticAT %s' % #metrics.accuracy_score(listTestStars, clf2.predict(testCVMatr))) clf3 = mord.LogisticIT(alpha=1.) clf3.fit(trainCVMatr, arr) clf3_prediction = clf3.predict(testCVMatr) LIT_acc = metrics.accuracy_score(listTestStars, clf3_prediction) LIT_f1 = metrics.f1_score(listTestStars, clf3_prediction, average='micro') LIT_r2 = metrics.r2_score(listTestStars, clf3_prediction, multioutput='variance_weighted') LIT_my = betterScoring(listTestStars, clf3_prediction) #print('AccuracyScore of LogisticIT %s' % #metrics.accuracy_score(listTestStars, clf3.predict(testCVMatr))) clf4 = mord.LogisticSE(alpha=1.) 
    clf4.fit(trainCVMatr, arr)
    clf4_prediction = clf4.predict(testCVMatr)
    LSE_acc = metrics.accuracy_score(listTestStars, clf4_prediction)
    LSE_f1 = metrics.f1_score(listTestStars, clf4_prediction, average='micro')
    LSE_r2 = metrics.r2_score(listTestStars, clf4_prediction,
                              multioutput='variance_weighted')
    LSE_my = betterScoring(listTestStars, clf4_prediction)
    # print('AccuracyScore of LogisticSE %s' %
    #       metrics.accuracy_score(listTestStars, clf4.predict(testCVMatr)))

    """*************************************"""
    # return value
    categoryName = trainFileName.replace("dataset/prodAnalysis/train_", "")
    categoryName = categoryName.replace(".json", "")

    return [
        categoryName,
        LR_CV_score, LR_CV_f1, LR_CV_r2, LR_my,
        NB_CV_score, NB_CV_f1, NB_CV_r2, NB_my,
        vader_acc, vader_f1, vader_r2, vader_my,
        ord_acc, ord_f1, ord_r2, ord_my,
        LAT_acc, LAT_f1, LAT_r2, LAT_my,
        LIT_acc, LIT_f1, LIT_r2, LIT_my,
        LSE_acc, LSE_f1, LSE_r2, LSE_my,
    ]
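# ---------------------------------------------------------------------------
# Hypothetical usage sketch: call doAll once per product category and tabulate
# the returned rows. The file paths below are placeholders, not the project's
# actual data, and the column names simply mirror the order of the return list.
# ---------------------------------------------------------------------------
import pandas as pd

result_columns = ['category']
for prefix in ('LR', 'NB', 'vader', 'ord', 'LAT', 'LIT', 'LSE'):
    result_columns += ['%s_acc' % prefix, '%s_f1' % prefix,
                       '%s_r2' % prefix, '%s_my' % prefix]

rows = [doAll('dataset/prodAnalysis/train_books.json',
              'dataset/prodAnalysis/test_books.json')]
print(pd.DataFrame(rows, columns=result_columns))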