Example #1
# Imports assumed by this example (not shown in the original snippet).
from sklearn.preprocessing import MinMaxScaler
from sklearn.linear_model import SGDClassifier
from sklearn.svm import SVC
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.feature_selection import SelectKBest
from sklearn.pipeline import Pipeline
from xgboost.sklearn import XGBModel


def final_pipeline(run_modelN):
	if run_modelN == 0:
		k = 'all'
		scaler = MinMaxScaler()
		clf = SGDClassifier(early_stopping=True, max_iter=5000, penalty='l1', loss='log')

	elif run_modelN == 1:
		k = 'all'
		scaler = MinMaxScaler()
		clf = SGDClassifier(early_stopping=True, max_iter=5000, penalty='elasticnet', loss='log')

	elif run_modelN == 2:
		k = 'all'
		scaler = MinMaxScaler()
		clf = SVC(kernel='linear', probability=True)

	elif run_modelN == 3:
		k = 'all'
		scaler = MinMaxScaler()
		clf = ExtraTreesClassifier(n_jobs=-1)

	elif run_modelN == 4:
		k = 'all'
		scaler = MinMaxScaler()
		clf = XGBModel(objective='binary:hinge', n_jobs=-1)

	pipeline = Pipeline([
		('normalization', scaler),
		('feature_selection', SelectKBest(k=k)),
		('clf', clf),
	])
	return pipeline
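
A minimal usage sketch, not part of the original example: assuming X and y are a feature matrix and label vector already in scope, the returned pipeline can be evaluated like any scikit-learn estimator.

from sklearn.model_selection import cross_val_score

pipeline = final_pipeline(run_modelN=2)         # linear-kernel SVC variant
scores = cross_val_score(pipeline, X, y, cv=5)  # X, y assumed to be defined elsewhere
print('mean CV accuracy:', scores.mean())
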
Example #2
def parameters_all_models_final(y, dim_reduction):
    if dim_reduction:
        k = 2
    else:
        k = 'all'
    n_classes = len(np.unique(y))
    parameters = [
        {
            # SGD, train: 0.8278
            # single-option values wrapped in lists so the dict is a valid search grid
            'clf__estimator': [SGDClassifier(
                early_stopping=True,
                max_iter=5000)],  # SVM if hinge loss / logreg if log loss
            'normalization': [MinMaxScaler()],
            'feature_selection__k': ['all'],
            'clf__estimator__penalty': ['l1'],
            'clf__estimator__loss': ['log'],
        },
        {
            'clf__estimator': [SVC(kernel='rbf', probability=False)],
            'normalization': (normalization_both),
            'clf__estimator__C': (0.01, 0.1, 1, 10, 100),
            'clf__estimator__gamma': ('scale', 'auto'),
            'feature_selection__k': k,
        },
        {
            'clf__estimator': [ExtraTreesClassifier()],
            'normalization': normalization_std,
            'clf__estimator__n_estimators': (16, 32, 128),
            'clf__estimator__max_depth': (32, 64, None),
            'feature_selection__k': k,
        },
        # default params: https://stackoverflow.com/questions/34674797/xgboost-xgbclassifier-defaults-in-python
        {
            # 'clf__estimator': [XGBModel(objective='multi:softmax',num_class=n_classes, max_features='auto')],
            'clf__estimator': [XGBModel()],
            'normalization': normalization_std,
            'clf__estimator__n_estimators': (16, 32, 128),
            'clf__estimator__max_depth': (32, 64),
            'clf__estimator__learning_rate': (0.01, 0.1, 0.3),
            'feature_selection__k': k,
        },
        {
            'clf__estimator': [MLPClassifier()],
            'normalization': normalization_std,
            'clf__estimator__batch_size': (512,),  # one-element tuple, not a bare int
            'clf__estimator__hidden_layer_sizes': [(50, 50, 50), (50, 100, 50),
                                                   (100, )],
            'clf__estimator__activation': ['relu'],
            'clf__estimator__alpha': [0.0001, 0.05],
            'clf__estimator__solver': ['adam'],
            'feature_selection__k': k,
        },
    ]
    return parameters
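
A hedged sketch of how such a parameter list might be consumed, not shown in the original: the key prefixes imply a pipeline with steps named 'normalization', 'feature_selection' and 'clf', where 'clf' wraps its estimator (here assumed to be a OneVsRestClassifier); normalization_std and normalization_both are assumed to be lists of candidate scalers defined elsewhere.

from sklearn.pipeline import Pipeline
from sklearn.preprocessing import MinMaxScaler
from sklearn.feature_selection import SelectKBest
from sklearn.multiclass import OneVsRestClassifier
from sklearn.linear_model import SGDClassifier
from sklearn.model_selection import GridSearchCV

pipe = Pipeline([
    ('normalization', MinMaxScaler()),              # overridden by each grid entry
    ('feature_selection', SelectKBest()),
    ('clf', OneVsRestClassifier(SGDClassifier())),  # 'clf__estimator' targets this wrapper
])
search = GridSearchCV(pipe, parameters_all_models_final(y, dim_reduction=False),
                      cv=5, n_jobs=-1)
search.fit(X, y)  # X, y assumed to be defined elsewhere
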
Example #3
# Imports assumed by this example; project-specific feature-engineering helpers
# (fix_data_encoding, add_judge_age, etc.) are defined elsewhere in the source.
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import (RandomForestRegressor, GradientBoostingRegressor,
                              AdaBoostRegressor, ExtraTreesRegressor, BaggingRegressor)
from sklearn.svm import SVR, NuSVR
from sklearn.linear_model import LinearRegression
from xgboost.sklearn import XGBModel


def main():
    #data cleaning and feature engineering section
    data = pd.read_csv('input.csv')
    data = fix_data_encoding(data)
    data = name_mapping(data)
    data = get_duration(data)
    data = get_court_city_type(data)
    data = fix_leading_zeros(data)
    data = add_judge_age(data)
    data = encode_receipt_procedure(data)
    data = add_money_amount_indicator(data)
    data = create_person_business_indicators(data)
    data = encode_case_matter(data)
    data = create_court_indicators(data)
    data = add_loadiness_of_courts(data)
    data = add_not_subject_to_duty_not_zero(data)
    data = add_lives_abroad_over_persons_and_companies_involved(data)
    data = add_date_groups(data)
    data = get_total_persons_and_companies_started(data)
    data = remove_outliers(data)
    data = add_single_person_or_company_started(data)
    data = add_single_person_or_company_answered(data)

    public_data = pd.read_csv('public_data.csv')
    print("Public data columns: ", list(public_data))
    data = add_public_data(data, public_data)
    print("After adding data: ", list(data))

    data = add_court_productivity(data)

    data.pop('start_date')
    data.pop('end_date')
    data.pop('court_name')
    data.pop('case_id')
    data.pop('court_id')
    data.pop('date_of_birth')

    # Depends if start_date will be available in final data
    data.pop('start_date_year')

    data.to_csv("out.csv")

    train, test = train_test_split(data, test_size=0.2, random_state=1)

    #store the name of the variable we want to predict. Separately store the names of all other variables
    target = 'duration_m'
    all_columns_except_target = train.columns.difference([target])

    #    #-----------------------------------------------------------------------------------------------------
    #    #model calibration section
    #    #tree amount calibration
    #    for tree_amount in range(10,60,10):
    #        model = RandomForestRegressor(n_estimators=tree_amount)
    #        score_from_cross_validation = get_score_from_cross_validation(model, train, target, valid_split_size=3)
    #        print('number of trees=', tree_amount)
    #        print("score from cross-validation:", score_from_cross_validation)
    #
    #    #max_features calibration
    #    for max_features in ['auto','sqrt','log2']:
    #        model = RandomForestRegressor(max_features=max_features)
    #        score_from_cross_validation = get_score_from_cross_validation(model, train, target, valid_split_size=5)
    #        print('max_features_type=', max_features)
    #        print("score from cross-validation:", score_from_cross_validation)
    #
    #    #min_samples_leaf calibration
    #    for min_samples_leaf in range(1,5,1):
    #        model = RandomForestRegressor(n_estimators = 60, min_samples_leaf=min_samples_leaf)
    #        score_from_cross_validation = get_score_from_cross_validation(model, train, target, valid_split_size=3)
    #        print('min_samples_leaf=', min_samples_leaf)
    #        print("score from cross-validation:", score_from_cross_validation)

    #default settings vs manually calibrated settings
    print('RandomForestRegressor from scikit-learn')
    model = RandomForestRegressor()
    score_from_cross_validation = get_score_from_cross_validation(
        model, train, target, valid_split_size=5)
    print('default model:')
    print("score from cross-validation on train data:",
          score_from_cross_validation)

    model = RandomForestRegressor(n_estimators=60, min_samples_leaf=2)
    score_from_cross_validation = get_score_from_cross_validation(
        model, train, target, valid_split_size=5)
    print('manually calibrated model:')
    print("score from cross-validation on train data:",
          score_from_cross_validation)

    #    model = RandomForestRegressor()
    #    model.fit(train[all_columns_except_target], train[target])
    #    score_on_test = model.score(test[all_columns_except_target], test[target])
    #    print('default model:')
    #    print("score for test data:", score_on_test)
    #
    #    model = RandomForestRegressor(n_estimators = 60, min_samples_leaf=2)
    #    model.fit(train[all_columns_except_target], train[target])
    #    score_on_test = model.score(test[all_columns_except_target], test[target])
    #    print('manually calibrated model:')
    #    print("score for test data:", score_on_test)

    #-----------------------------------------------------------------------------------------------------
    #trying different models/algorithms
    #default settings for GradientBoostingRegressor ~ a bit better than RandomForestRegressor
    print('GradientBoostingRegressor from scikit-learn')
    model = GradientBoostingRegressor()
    score_from_cross_validation = get_score_from_cross_validation(
        model, train, target, valid_split_size=5)
    print('default model:')
    print("score from cross-validation on train data:",
          score_from_cross_validation)

    #    model = GradientBoostingRegressor()
    #    model.fit(train[all_columns_except_target], train[target])
    #    score_on_test = model.score(test[all_columns_except_target], test[target])
    #    print('default model:')
    #    print("score for test data:", score_on_test)

    #default settings for AdaBoostRegressor - performs poorly
    print('AdaBoostRegressor from scikit-learn')
    model = AdaBoostRegressor()
    score_from_cross_validation = get_score_from_cross_validation(
        model, train, target, valid_split_size=5)
    print('default model:')
    print("score from cross-validation on train data:",
          score_from_cross_validation)

    #    model = AdaBoostRegressor()
    #    model.fit(train[all_columns_except_target], train[target])
    #    score_on_test = model.score(test[all_columns_except_target], test[target])
    #    print('default model:')
    #    print("score for test data:", score_on_test)

    #default settings for ExtraTreesRegressor - performs poorly
    print('ExtraTreesRegressor from scikit-learn')
    model = ExtraTreesRegressor()
    score_from_cross_validation = get_score_from_cross_validation(
        model, train, target, valid_split_size=5)
    print('default model:')
    print("score from cross-validation on train data:",
          score_from_cross_validation)

    #    model = ExtraTreesRegressor()
    #    model.fit(train[all_columns_except_target], train[target])
    #    score_on_test = model.score(test[all_columns_except_target], test[target])
    #    print('default model:')
    #    print("score for test data:", score_on_test)

    #default settings for BaggingRegressor ~ almost like RandomForestRegressor
    print('BaggingRegressor from scikit-learn')
    model = BaggingRegressor()
    score_from_cross_validation = get_score_from_cross_validation(
        model, train, target, valid_split_size=5)
    print('default model:')
    print("score from cross-validation on train data:",
          score_from_cross_validation)

    #    model = BaggingRegressor()
    #    model.fit(train[all_columns_except_target], train[target])
    #    score_on_test = model.score(test[all_columns_except_target], test[target])
    #    print('default model:')
    #    print("score for test data:", score_on_test)

    #default settings for XGBModel ~ 1% better than GradientBoostingRegressor
    print('XGBModel from xgboost (scikit-learn API)')
    model = XGBModel()
    score_from_cross_validation = get_score_from_cross_validation(
        model, train, target,
        valid_split_size=5)  #calculating R^2 score manually
    print('default model:')
    print("score from cross-validation on train data:",
          score_from_cross_validation)

    #default settings for SVR - performs very poorly
    print('SVR from scikit-learn')
    model = SVR()
    score_from_cross_validation = get_score_from_cross_validation(
        model, train, target, valid_split_size=5)
    print('default model:')
    print("score from cross-validation on train data:",
          score_from_cross_validation)

    #    model = SVR()
    #    model.fit(train[all_columns_except_target], train[target])
    #    score_on_test = model.score(test[all_columns_except_target], test[target])
    #    print('default model:')
    #    print("score for test data:", score_on_test)

    #default settings for NuSVR - performs very poorly as well
    print('NuSVR from scikit-learn')
    model = NuSVR()
    score_from_cross_validation = get_score_from_cross_validation(
        model, train, target, valid_split_size=5)
    print('default model:')
    print("score from cross-validation on train data:",
          score_from_cross_validation)

    #    model = NuSVR()
    #    model.fit(train[all_columns_except_target], train[target])
    #    score_on_test = model.score(test[all_columns_except_target], test[target])
    #    print('default model:')
    #    print("score for test data:", score_on_test)

    #default settings for LinearRegression - not too bad for such a simple model
    print('LinearRegression from scikit-learn')
    model = LinearRegression()
    score_from_cross_validation = get_score_from_cross_validation(
        model, train, target, valid_split_size=5)
    print('default model:')
    print("score from cross-validation on train data:",
          score_from_cross_validation)
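
The helper get_score_from_cross_validation used throughout main() is not part of this excerpt; a minimal sketch of what it might look like, assuming it returns the mean R^2 (the default regressor score) over K validation folds:

from sklearn.model_selection import cross_val_score

def get_score_from_cross_validation(model, train, target, valid_split_size):
    # hypothetical reconstruction: mean R^2 over valid_split_size folds
    features = train.columns.difference([target])
    scores = cross_val_score(model, train[features], train[target],
                             cv=valid_split_size)
    return scores.mean()
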
Example #4
def parameters_all_models(y, dim_reduction):
	n_classes = len(np.unique(y))

	if dim_reduction:
		k = (2,8)
		n_neighbors = (16,32,64)
		parameters = [
			{
				'clf__estimator': [SGDClassifier(early_stopping=True, max_iter=5000), ],
				# SVM if hinge loss / logreg if log loss
				'normalization': (normalization_both),
				'umap__n_components': k,
				'umap__n_neighbors': n_neighbors,
				'clf__estimator__penalty': ('l2', 'elasticnet', 'l1'),
				'clf__estimator__loss': ['hinge', 'log'],
			},
			{
				'clf__estimator': [SVC(probability=False)],
				'normalization': (normalization_both),
				'clf__estimator__C': (0.01, 0.1, 1, 10, 100),
				'clf__estimator__kernel': ('rbf',),
				'umap__n_components': k,
				'umap__n_neighbors': n_neighbors,
			},
			{
				'clf__estimator': [
					XGBModel(objective='multi:softmax', num_class=n_classes, max_features='auto', n_jobs=-1)],
				'normalization': normalization_std,
				'clf__estimator__n_estimators': (32, 128),
				'clf__estimator__max_depth': (32, 64, 128),
				'clf__estimator__learning_rate': (0.01, 0.1),
				'umap__n_components': k,
				'umap__n_neighbors': n_neighbors,
			},
			{
				'clf__estimator': [ExtraTreesClassifier(max_features='auto', n_jobs=-1)],
				'normalization': normalization_std,
				'clf__estimator__n_estimators': (32, 128),
				'clf__estimator__max_depth': (32, 64, 128),
				'umap__n_components': k,
				'umap__n_neighbors': n_neighbors,
			},
			{
				'clf__estimator': [MLPClassifier(early_stopping=True, max_iter=200)],
				'normalization': normalization_std,
				'clf__estimator__batch_size': (32, 128, 512),
				'clf__estimator__hidden_layer_sizes': [(64, 16), (16, 16)],
				'clf__estimator__activation': ['relu'],
				'clf__estimator__alpha': [0.0001, 0.05],
				'clf__estimator__solver': ['adam'],
				'umap__n_components': k,
				'umap__n_neighbors': n_neighbors,
			},

		]

	else:

		k = (32, 64, 128, 'all')

		parameters = [
			{
				'clf__estimator': [SGDClassifier(early_stopping=True, max_iter=5000),], # SVM if hinge loss / logreg if log loss
				'normalization': (normalization_both),
				'feature_selection__k': k,
				'clf__estimator__penalty': ('l2', 'elasticnet', 'l1'),
				'clf__estimator__loss': ['hinge','log'],
			},
			{
				'clf__estimator': [SVC(probability=False)],
				'normalization': (normalization_both),
				'clf__estimator__C': (0.01, 0.1, 1, 10, 100),
				'clf__estimator__kernel': ('rbf',),
				'feature_selection__k': k,
			},
			{
				'clf__estimator': [XGBModel(objective='multi:softmax', num_class=n_classes, max_features='auto', n_jobs=-1)],
				'normalization': normalization_std,
				'clf__estimator__n_estimators': (32, 128),
				'clf__estimator__max_depth': (32, 64, 128),
				'clf__estimator__learning_rate': (0.01, 0.1),
				'feature_selection__k': k,
			},
			{
				'clf__estimator': [ExtraTreesClassifier(max_features='auto', n_jobs=-1)],
				'normalization': normalization_std,
				'clf__estimator__n_estimators': (32, 128),
				'clf__estimator__max_depth': (32, 64, 128),
				'feature_selection__k': k,
			},
			{
				'clf__estimator': [MLPClassifier(early_stopping=True, max_iter=200)],
				'normalization': normalization_std,
				'clf__estimator__batch_size': (32, 128, 512),
				'clf__estimator__hidden_layer_sizes': [(256, 32), (64, 32)],
				'clf__estimator__activation': ['relu'],
				'clf__estimator__alpha': [0.0001, 0.05],
				'clf__estimator__solver': ['adam'],
				'feature_selection__k': k,
			},

		]
	return parameters
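
The pipeline these grids are applied to is not included in this excerpt; a minimal sketch of the dimensionality-reduction variant implied by the 'umap__*' keys, assuming the umap-learn package and a OneVsRestClassifier-style 'clf' wrapper:

from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.multiclass import OneVsRestClassifier
from sklearn.linear_model import SGDClassifier
from umap import UMAP  # umap-learn

pipe_umap = Pipeline([
    ('normalization', StandardScaler()),            # candidates supplied by the grids
    ('umap', UMAP()),                               # 'umap__n_components' / 'umap__n_neighbors'
    ('clf', OneVsRestClassifier(SGDClassifier())),  # 'clf__estimator' targets this wrapper
])
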
Example #5
                            epochs=100)
        start = time.time()
        X_train = train[train['is_anomaly'] == 0][model_features].values
        model.fit(X_train)
        end = time.time()
        fit_time = end - start
    elif parsed_args.model == 'xgboost':
        #impute features with mean
        train, test = handle_missing_values(train, test)
        from xgboost.sklearn import XGBModel
        model_features = list(train.columns)
        model_features.remove('is_anomaly')
        num_features = len(model_features)

        model = XGBModel(n_estimators=400,
                         max_depth=5,
                         n_jobs=-1,
                         objective='binary:logistic')
        start = time.time()
        model.fit(train[model_features].values, train['is_anomaly'].values)
        end = time.time()
        fit_time = end - start
    elif parsed_args.model == 'rf':
        #impute features with mean
        train, test = handle_missing_values(train, test)
        from sklearn.ensemble import RandomForestClassifier
        model_features = list(train.columns)
        model_features.remove('is_anomaly')
        num_features = len(model_features)

        model = RandomForestClassifier(n_estimators=400,
                                       max_depth=80,
Example #6
#Train-Test split
# Imports assumed by this snippet; data_X, data_y, shot_ids and
# submission_data_X are prepared earlier in the original script.
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, roc_auc_score, log_loss
from xgboost.sklearn import XGBModel

X_train, X_test, y_train, y_test = train_test_split(data_X,
                                                    data_y,
                                                    random_state=11230,
                                                    test_size=0.3)

# Modelling with Logit
model = LogisticRegression(
    penalty='l2', C=0.5)  # Hyper params found through Grid Search shown below
model.fit(X=X_train, y=y_train)
confusion_matrix(y_true=y_test, y_pred=model.predict(X_test))
roc_auc_score(y_true=y_test, y_score=model.predict_proba(X_test)[:, 1])
log_loss(y_true=y_test, y_pred=model.predict_proba(X_test)[:, 1])

# Modelling with XGBoost
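# Note: XGBModel's default objective is squared-error regression, so predict()
# returns raw regression outputs rather than probabilities; for log_loss and AUC
# the XGBClassifier wrapper with predict_proba() is the more conventional choice.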
model_XGB = XGBModel()
model_XGB.fit(X_train, y_train)
log_loss(y_true=y_test, y_pred=model_XGB.predict(X_test))
roc_auc_score(y_true=y_test, y_score=model_XGB.predict(X_test))

## Grid Search with Logit
#param_grid = {'penalty':['l1','l2'],'C': np.arange(0.1, 2.0, 0.025)}
#GS = GridSearchCV(model, param_grid, cv = 5, scoring = 'roc_auc')
#GS.fit(X_train, y_train)

# Submission
submission = pd.DataFrame({
    'shot_id': shot_ids,
    'shot_made_flag': model.predict(submission_data_X)
})
submission.to_csv(index=False, path_or_buf='submission.csv')
Example #7
#test
X_test = pd.concat([cmft_test], axis=1)  #tgweekday_test,tgworkday_test,
# for u in range(0,21):
# 	for v in range(u,21):
# 		X_test['count_'+str(u)+str(v)] = X_test['count_'+str(u)] * X_test['count_'+str(v)]
# X_test = cmft_test[['avg12','avg1','avg2','davg','ravg','count_13']]#,'median1','median2','median12','sigma']]
'''
multiXGB
'''
from xgboost.sklearn import XGBModel
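# Note: 'silent' and 'seed' are parameter names from older xgboost releases;
# newer versions use 'verbosity' and 'random_state' instead.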
bst = XGBModel(max_depth=8,
               learning_rate=0.04,
               n_estimators=150,
               silent=1,
               gamma=0.1,
               min_child_weight=0.3,
               subsample=0.6,
               colsample_bytree=0.3,
               seed=69)

from sklearn.multioutput import MultiOutputRegressor

multiXGB = MultiOutputRegressor(bst)

multiXGB.fit(X, y)
# with open('./multiXGB_models/multiXGB.pkl','w') as f:cPickle.dump(multiXGB,f)
predtrain = multiXGB.predict(X)
#cv
predcv = multiXGB.predict(X_cv)
#test
predtest = multiXGB.predict(X_test)
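
A hedged evaluation sketch, not in the original: assuming y and y_cv hold the multi-output targets matching X and X_cv, the predictions above can be scored with RMSE.

import numpy as np
from sklearn.metrics import mean_squared_error

rmse_train = np.sqrt(mean_squared_error(y, predtrain))  # y assumed: training targets
rmse_cv = np.sqrt(mean_squared_error(y_cv, predcv))     # y_cv assumed: validation targets
print('train RMSE:', rmse_train, '  cv RMSE:', rmse_cv)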