def final_pipeline(run_modelN):
    # build the final scaling -> feature-selection -> classifier pipeline
    # for the chosen model index
    if run_modelN == 0:
        k = 'all'
        scaler = MinMaxScaler()
        clf = SGDClassifier(early_stopping=True, max_iter=5000,
                            penalty='l1', loss='log')
    elif run_modelN == 1:
        k = 'all'
        scaler = MinMaxScaler()
        clf = SGDClassifier(early_stopping=True, max_iter=5000,
                            penalty='elasticnet', loss='log')
    elif run_modelN == 2:
        k = 'all'
        scaler = MinMaxScaler()
        clf = SVC(kernel='linear', probability=True)
    elif run_modelN == 3:
        k = 'all'
        scaler = MinMaxScaler()
        clf = ExtraTreesClassifier(n_jobs=-1)
    elif run_modelN == 4:
        k = 'all'
        scaler = MinMaxScaler()
        clf = XGBModel(objective='binary:hinge', n_jobs=-1)

    pipeline = Pipeline([
        ('normalization', scaler),
        ('feature_selection', SelectKBest(k=k)),
        ('clf', clf),
    ])
    return pipeline
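# A minimal usage sketch for final_pipeline(), run on synthetic data so it is
# self-contained; make_classification and the demo names are illustrative
# assumptions, not part of the original code.
if __name__ == '__main__':
    from sklearn.datasets import make_classification

    X_demo, y_demo = make_classification(n_samples=200, n_features=10,
                                         random_state=0)
    demo_pipeline = final_pipeline(run_modelN=2)  # MinMaxScaler -> SelectKBest -> linear SVC
    demo_pipeline.fit(X_demo, y_demo)
    print('demo train accuracy:', demo_pipeline.score(X_demo, y_demo))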
def parameters_all_models_final(y, dim_reduction):
    if dim_reduction:
        k = 2
    else:
        k = 'all'
    n_classes = len(np.unique(y))
    parameters = [
        {   # SGD, train: 0.8278
            # SVM if hinge loss / logreg if log loss
            # grid values must be lists/tuples of candidates, hence the wrapping
            'clf__estimator': [SGDClassifier(early_stopping=True, max_iter=5000)],
            'normalization': [MinMaxScaler()],
            'feature_selection__k': ['all'],
            'clf__estimator__penalty': ['l1'],
            'clf__estimator__loss': ['log'],
        },
        {
            'clf__estimator': [SVC(kernel='rbf', probability=False)],
            'normalization': normalization_both,
            'clf__estimator__C': (0.01, 0.1, 1, 10, 100),
            'clf__estimator__gamma': ('scale', 'auto'),
            'feature_selection__k': [k],
        },
        {
            'clf__estimator': [ExtraTreesClassifier()],
            'normalization': normalization_std,
            'clf__estimator__n_estimators': (16, 32, 128),
            'clf__estimator__max_depth': (32, 64, None),
            'feature_selection__k': [k],
        },
        # default params: https://stackoverflow.com/questions/34674797/xgboost-xgbclassifier-defaults-in-python
        {
            # 'clf__estimator': [XGBModel(objective='multi:softmax', num_class=n_classes, max_features='auto')],
            'clf__estimator': [XGBModel()],
            'normalization': normalization_std,
            'clf__estimator__n_estimators': (16, 32, 128),
            'clf__estimator__max_depth': (32, 64),
            'clf__estimator__learning_rate': (0.01, 0.1, 0.3),
            'feature_selection__k': [k],
        },
        {
            'clf__estimator': [MLPClassifier()],
            'normalization': normalization_std,
            'clf__estimator__batch_size': (512,),  # (512) without the comma is just the int 512
            'clf__estimator__hidden_layer_sizes': [(50, 50, 50), (50, 100, 50), (100,)],
            'clf__estimator__activation': ['relu'],
            'clf__estimator__alpha': [0.0001, 0.05],
            'clf__estimator__solver': ['adam'],
            'feature_selection__k': [k],
        },
    ]
    return parameters
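# The 'clf__estimator__*' keys above only make sense if the pipeline's 'clf'
# step is a small wrapper that exposes the actual classifier as an 'estimator'
# parameter, letting GridSearchCV swap whole models. A minimal sketch of such
# a wrapper; ClfSwitcher and the commented usage are assumptions, not code
# from this repo.
from sklearn.base import BaseEstimator


class ClfSwitcher(BaseEstimator):
    # pipeline step whose 'estimator' parameter holds a full classifier
    def __init__(self, estimator=None):
        self.estimator = estimator

    def fit(self, X, y=None):
        self.estimator.fit(X, y)
        return self

    def predict(self, X):
        return self.estimator.predict(X)

    def score(self, X, y):
        return self.estimator.score(X, y)

# assumed usage (normalization_both / normalization_std must be defined elsewhere):
# pipeline = Pipeline([
#     ('normalization', MinMaxScaler()),
#     ('feature_selection', SelectKBest()),
#     ('clf', ClfSwitcher()),
# ])
# search = GridSearchCV(pipeline, parameters_all_models_final(y, dim_reduction=False),
#                       cv=5, n_jobs=-1)
# search.fit(X, y)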
def main():
    # data cleaning and feature engineering section
    data = pd.read_csv('input.csv')
    data = fix_data_encoding(data)
    data = name_mapping(data)
    data = get_duration(data)
    data = get_court_city_type(data)
    data = fix_leading_zeros(data)
    data = add_judge_age(data)
    data = encode_receipt_procedure(data)
    data = add_money_amount_indicator(data)
    data = create_person_business_indicators(data)
    data = encode_case_matter(data)
    data = create_court_indicators(data)
    data = add_loadiness_of_courts(data)
    data = add_not_subject_to_duty_not_zero(data)
    data = add_lives_abroad_over_persons_and_companies_involved(data)
    data = add_date_groups(data)
    data = get_total_persons_and_companies_started(data)
    data = remove_outliers(data)
    data = add_single_person_or_company_started(data)
    data = add_single_person_or_company_answered(data)

    public_data = pd.read_csv('public_data.csv')
    print("Public data columns: ", list(public_data))
    data = add_public_data(data, public_data)
    print("After adding data: ", list(data))
    data = add_court_productivity(data)

    # drop identifier and raw-date columns that must not leak into the model
    data.pop('start_date')
    data.pop('end_date')
    data.pop('court_name')
    data.pop('case_id')
    data.pop('court_id')
    data.pop('date_of_birth')
    # depends on whether start_date will be available in the final data
    data.pop('start_date_year')
    data.to_csv("out.csv")

    train, test = train_test_split(data, test_size=0.2, random_state=1)

    # store the name of the variable we want to predict;
    # separately store the names of all other variables
    target = 'duration_m'
    all_columns_except_target = train.columns.difference([target])

    # #-----------------------------------------------------------------------------------------------------
    # # model calibration section
    # # tree amount calibration
    # for tree_amount in range(10, 60, 10):
    #     model = RandomForestRegressor(n_estimators=tree_amount)
    #     score_from_cross_validation = get_score_from_cross_validation(model, train, target, valid_split_size=3)
    #     print('number of trees=', tree_amount)
    #     print("score from cross-validation:", score_from_cross_validation)
    #
    # # max_features calibration
    # for max_features in ['auto', 'sqrt', 'log2']:
    #     model = RandomForestRegressor(max_features=max_features)
    #     score_from_cross_validation = get_score_from_cross_validation(model, train, target, valid_split_size=5)
    #     print('max_features_type=', max_features)
    #     print("score from cross-validation:", score_from_cross_validation)
    #
    # # min_samples_leaf calibration
    # for min_samples_leaf in range(1, 5, 1):
    #     model = RandomForestRegressor(n_estimators=60, min_samples_leaf=min_samples_leaf)
    #     score_from_cross_validation = get_score_from_cross_validation(model, train, target, valid_split_size=3)
    #     print('min_samples_leaf=', min_samples_leaf)
    #     print("score from cross-validation:", score_from_cross_validation)

    # default settings vs manually calibrated settings
    print('RandomForestRegressor from scikit-learn')
    model = RandomForestRegressor()
    score_from_cross_validation = get_score_from_cross_validation(
        model, train, target, valid_split_size=5)
    print('default model:')
    print("score from cross-validation on train data:", score_from_cross_validation)

    model = RandomForestRegressor(n_estimators=60, min_samples_leaf=2)
    score_from_cross_validation = get_score_from_cross_validation(
        model, train, target, valid_split_size=5)
    print('manually calibrated model:')
    print("score from cross-validation on train data:", score_from_cross_validation)

    # model = RandomForestRegressor()
    # model.fit(train[all_columns_except_target], train[target])
    # score_on_test = model.score(test[all_columns_except_target], test[target])
    # print('default model:')
    # print("score for test data:", score_on_test)
    #
    # model = RandomForestRegressor(n_estimators=60, min_samples_leaf=2)
    # model.fit(train[all_columns_except_target], train[target])
    # score_on_test = model.score(test[all_columns_except_target], test[target])
    # print('manually calibrated model:')
    # print("score for test data:", score_on_test)

    #-----------------------------------------------------------------------------------------------------
    # trying different models/algorithms

    # default settings for GradientBoostingRegressor ~ a bit better than RandomForestRegressor
    print('GradientBoostingRegressor from scikit-learn')
    model = GradientBoostingRegressor()
    score_from_cross_validation = get_score_from_cross_validation(
        model, train, target, valid_split_size=5)
    print('default model:')
    print("score from cross-validation on train data:", score_from_cross_validation)

    # model = GradientBoostingRegressor()
    # model.fit(train[all_columns_except_target], train[target])
    # score_on_test = model.score(test[all_columns_except_target], test[target])
    # print('default model:')
    # print("score for test data:", score_on_test)

    # default settings for AdaBoostRegressor - performs poorly
    print('AdaBoostRegressor from scikit-learn')
    model = AdaBoostRegressor()
    score_from_cross_validation = get_score_from_cross_validation(
        model, train, target, valid_split_size=5)
    print('default model:')
    print("score from cross-validation on train data:", score_from_cross_validation)

    # model = AdaBoostRegressor()
    # model.fit(train[all_columns_except_target], train[target])
    # score_on_test = model.score(test[all_columns_except_target], test[target])
    # print('default model:')
    # print("score for test data:", score_on_test)

    # default settings for ExtraTreesRegressor - performs poorly
    print('ExtraTreesRegressor from scikit-learn')
    model = ExtraTreesRegressor()
    score_from_cross_validation = get_score_from_cross_validation(
        model, train, target, valid_split_size=5)
    print('default model:')
    print("score from cross-validation on train data:", score_from_cross_validation)

    # model = ExtraTreesRegressor()
    # model.fit(train[all_columns_except_target], train[target])
    # score_on_test = model.score(test[all_columns_except_target], test[target])
    # print('default model:')
    # print("score for test data:", score_on_test)

    # default settings for BaggingRegressor ~ almost like RandomForestRegressor
    print('BaggingRegressor from scikit-learn')
    model = BaggingRegressor()
    score_from_cross_validation = get_score_from_cross_validation(
        model, train, target, valid_split_size=5)
    print('default model:')
    print("score from cross-validation on train data:", score_from_cross_validation)

    # model = BaggingRegressor()
    # model.fit(train[all_columns_except_target], train[target])
    # score_on_test = model.score(test[all_columns_except_target], test[target])
    # print('default model:')
    # print("score for test data:", score_on_test)

    # default settings for XGBModel ~ 1% better than GradientBoostingRegressor
    print('XGBModel from xgboost (scikit-learn API)')
    model = XGBModel()
    score_from_cross_validation = get_score_from_cross_validation(
        model, train, target, valid_split_size=5)  # R^2 score calculated manually inside the helper
    print('default model:')
    print("score from cross-validation on train data:", score_from_cross_validation)

    # default settings for SVR - performs very poorly
    print('SVR from scikit-learn')
    model = SVR()
    score_from_cross_validation = get_score_from_cross_validation(
        model, train, target, valid_split_size=5)
    print('default model:')
    print("score from cross-validation on train data:", score_from_cross_validation)

    # model = SVR()
    # model.fit(train[all_columns_except_target], train[target])
    # score_on_test = model.score(test[all_columns_except_target], test[target])
    # print('default model:')
    # print("score for test data:", score_on_test)

    # default settings for NuSVR - also performs very poorly
    print('NuSVR from scikit-learn')
    model = NuSVR()
    score_from_cross_validation = get_score_from_cross_validation(
        model, train, target, valid_split_size=5)
    print('default model:')
    print("score from cross-validation on train data:", score_from_cross_validation)

    # model = NuSVR()
    # model.fit(train[all_columns_except_target], train[target])
    # score_on_test = model.score(test[all_columns_except_target], test[target])
    # print('default model:')
    # print("score for test data:", score_on_test)

    # default settings for LinearRegression - not bad for such a simple model
    print('LinearRegression from scikit-learn')
    model = LinearRegression()
    score_from_cross_validation = get_score_from_cross_validation(
        model, train, target, valid_split_size=5)
    print('default model:')
    print("score from cross-validation on train data:", score_from_cross_validation)
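# get_score_from_cross_validation() is called throughout main() but defined
# elsewhere. A minimal sketch of what the calls above imply, assuming it wraps
# sklearn's cross_val_score and returns the mean R^2 (the default scorer for
# regressors); only the signature is taken from the calls above, the body is
# an assumption and therefore stays commented out:
# from sklearn.model_selection import cross_val_score
#
# def get_score_from_cross_validation(model, train, target, valid_split_size):
#     features = train.columns.difference([target])
#     scores = cross_val_score(model, train[features], train[target],
#                              cv=valid_split_size)
#     return scores.mean()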
def parameters_all_models(y, dim_reduction):
    n_classes = len(np.unique(y))
    if dim_reduction:
        k = (2, 8)
        n_neighbors = (16, 32, 64)
        parameters = [
            {
                # SVM if hinge loss / logreg if log loss
                'clf__estimator': [SGDClassifier(early_stopping=True, max_iter=5000)],
                'normalization': normalization_both,
                'umap__n_components': k,
                'umap__n_neighbors': n_neighbors,
                'clf__estimator__penalty': ('l2', 'elasticnet', 'l1'),
                'clf__estimator__loss': ['hinge', 'log'],
            },
            {
                'clf__estimator': [SVC(probability=False)],
                'normalization': normalization_both,
                'clf__estimator__C': (0.01, 0.1, 1, 10, 100),
                'clf__estimator__kernel': ('rbf',),
                'umap__n_components': k,
                'umap__n_neighbors': n_neighbors,
            },
            {
                # note: max_features is not an XGBoost parameter; it is passed
                # through to the booster, which ignores it (possibly with a warning)
                'clf__estimator': [XGBModel(objective='multi:softmax',
                                            num_class=n_classes,
                                            max_features='auto', n_jobs=-1)],
                'normalization': normalization_std,
                'clf__estimator__n_estimators': (32, 128),
                'clf__estimator__max_depth': (32, 64, 128),
                'clf__estimator__learning_rate': (0.01, 0.1),
                'umap__n_components': k,
                'umap__n_neighbors': n_neighbors,
            },
            {
                'clf__estimator': [ExtraTreesClassifier(max_features='auto', n_jobs=-1)],
                'normalization': normalization_std,
                'clf__estimator__n_estimators': (32, 128),
                'clf__estimator__max_depth': (32, 64, 128),
                'umap__n_components': k,
                'umap__n_neighbors': n_neighbors,
            },
            {
                'clf__estimator': [MLPClassifier(early_stopping=True, max_iter=200)],
                'normalization': normalization_std,
                'clf__estimator__batch_size': (32, 128, 512),
                'clf__estimator__hidden_layer_sizes': [(64, 16), (16, 16)],
                'clf__estimator__activation': ['relu'],
                'clf__estimator__alpha': [0.0001, 0.05],
                'clf__estimator__solver': ['adam'],
                'umap__n_components': k,
                'umap__n_neighbors': n_neighbors,
            },
        ]
    else:
        k = (32, 64, 128, 'all')
        parameters = [
            {
                # SVM if hinge loss / logreg if log loss
                'clf__estimator': [SGDClassifier(early_stopping=True, max_iter=5000)],
                'normalization': normalization_both,
                'feature_selection__k': k,
                'clf__estimator__penalty': ('l2', 'elasticnet', 'l1'),
                'clf__estimator__loss': ['hinge', 'log'],
            },
            {
                'clf__estimator': [SVC(probability=False)],
                'normalization': normalization_both,
                'clf__estimator__C': (0.01, 0.1, 1, 10, 100),
                'clf__estimator__kernel': ('rbf',),
                'feature_selection__k': k,
            },
            {
                'clf__estimator': [XGBModel(objective='multi:softmax',
                                            num_class=n_classes,
                                            max_features='auto', n_jobs=-1)],
                'normalization': normalization_std,
                'clf__estimator__n_estimators': (32, 128),
                'clf__estimator__max_depth': (32, 64, 128),
                'clf__estimator__learning_rate': (0.01, 0.1),
                'feature_selection__k': k,
            },
            {
                'clf__estimator': [ExtraTreesClassifier(max_features='auto', n_jobs=-1)],
                'normalization': normalization_std,
                'clf__estimator__n_estimators': (32, 128),
                'clf__estimator__max_depth': (32, 64, 128),
                'feature_selection__k': k,
            },
            {
                'clf__estimator': [MLPClassifier(early_stopping=True, max_iter=200)],
                'normalization': normalization_std,
                'clf__estimator__batch_size': (32, 128, 512),
                'clf__estimator__hidden_layer_sizes': [(256, 32), (64, 32)],
                'clf__estimator__activation': ['relu'],
                'clf__estimator__alpha': [0.0001, 0.05],
                'clf__estimator__solver': ['adam'],
                'feature_selection__k': k,
            },
        ]
    return parameters
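# The 'umap__*' keys in the dim_reduction branch assume the pipeline carries a
# UMAP step between scaling and the classifier; a minimal sketch of that
# layout, assuming the umap-learn package and the ClfSwitcher wrapper sketched
# earlier (both are assumptions, not code from this repo):
# import umap
#
# pipeline_umap = Pipeline([
#     ('normalization', MinMaxScaler()),
#     ('umap', umap.UMAP()),
#     ('clf', ClfSwitcher()),
# ])
# search = GridSearchCV(pipeline_umap, parameters_all_models(y, dim_reduction=True),
#                       cv=3, n_jobs=-1)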
                      epochs=100)
        start = time.time()
        X_train = train[train['is_anomaly'] == 0][model_features].values
        model.fit(X_train)
        end = time.time()
        fit_time = end - start

    elif parsed_args.model == 'xgboost':
        # impute features with mean
        train, test = handle_missing_values(train, test)
        from xgboost.sklearn import XGBModel

        model_features = list(train.columns)
        model_features.remove('is_anomaly')
        num_features = len(model_features)
        model = XGBModel(n_estimators=400, max_depth=5, n_jobs=-1,
                         objective='binary:logistic')
        start = time.time()
        model.fit(train[model_features].values, train['is_anomaly'].values)
        end = time.time()
        fit_time = end - start

    elif parsed_args.model == 'rf':
        # impute features with mean
        train, test = handle_missing_values(train, test)
        from sklearn.ensemble import RandomForestClassifier

        model_features = list(train.columns)
        model_features.remove('is_anomaly')
        num_features = len(model_features)
        model = RandomForestClassifier(n_estimators=400, max_depth=80,
# Train-test split
X_train, X_test, y_train, y_test = train_test_split(
    data_X, data_y, random_state=11230, test_size=0.3)

# Modelling with Logit
model = LogisticRegression(penalty='l2', C=0.5)  # hyperparams found through the Grid Search shown below
model.fit(X=X_train, y=y_train)
confusion_matrix(y_true=y_test, y_pred=model.predict(X_test))
roc_auc_score(y_true=y_test, y_score=model.predict_proba(X_test)[:, 1])
log_loss(y_true=y_test, y_pred=model.predict_proba(X_test)[:, 1])

# Modelling with XGBoost
# XGBModel defaults to a regression objective; with binary:logistic its
# predict() returns probabilities, which is what log_loss and roc_auc_score
# expect below
model_XGB = XGBModel(objective='binary:logistic')
model_XGB.fit(X_train, y_train)
log_loss(y_true=y_test, y_pred=model_XGB.predict(X_test))
roc_auc_score(y_true=y_test, y_score=model_XGB.predict(X_test))

## Grid Search with Logit
# param_grid = {'penalty': ['l1', 'l2'], 'C': np.arange(0.1, 2.0, 0.025)}
# GS = GridSearchCV(model, param_grid, cv=5, scoring='roc_auc')
# GS.fit(X_train, y_train)

# Submission
submission = pd.DataFrame({
    'shot_id': shot_ids,
    'shot_made_flag': model.predict(submission_data_X),
})
submission.to_csv(index=False, path_or_buf='submission.csv')
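# An alternative sketch: xgboost's XGBClassifier wraps the same booster behind
# the full sklearn classifier interface (predict_proba, class labels), keeping
# the metric calls symmetric with the logistic-regression ones above; this is
# a suggested option, not code from the original.
from xgboost import XGBClassifier

model_XGBc = XGBClassifier()
model_XGBc.fit(X_train, y_train)
log_loss(y_true=y_test, y_pred=model_XGBc.predict_proba(X_test)[:, 1])
roc_auc_score(y_true=y_test, y_score=model_XGBc.predict_proba(X_test)[:, 1])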
# test
X_test = pd.concat([cmft_test], axis=1)  # tgweekday_test, tgworkday_test,
# for u in range(0, 21):
#     for v in range(u, 21):
#         X_test['count_' + str(u) + str(v)] = X_test['count_' + str(u)] * X_test['count_' + str(v)]
# X_test = cmft_test[['avg12', 'avg1', 'avg2', 'davg', 'ravg', 'count_13']]  # , 'median1', 'median2', 'median12', 'sigma']]

'''
multiXGB
'''
from xgboost.sklearn import XGBModel

bst = XGBModel(max_depth=8, learning_rate=0.04, n_estimators=150, silent=1,
               gamma=0.1, min_child_weight=0.3, subsample=0.6,
               colsample_bytree=0.3, seed=69)
multiXGB = MultiOutputRegressor(bst)
multiXGB.fit(X, y)
# with open('./multiXGB_models/multiXGB.pkl', 'w') as f: cPickle.dump(multiXGB, f)

predtrain = multiXGB.predict(X)

# cv
predcv = multiXGB.predict(X_cv)

# test
predtest = multiXGB.predict(X_test)
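# A quick hedged check of the fitted multi-output model; y_cv is an assumption
# (only X_cv appears above), so this stays commented out as a sketch:
# import numpy as np
#
# rmse_train = np.sqrt(np.mean((predtrain - y) ** 2))
# rmse_cv = np.sqrt(np.mean((predcv - y_cv) ** 2))
# print('train RMSE:', rmse_train, 'cv RMSE:', rmse_cv)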