def get_bucketer(method, encoding_method=None, case_id_col=None, cat_cols=None,
                 num_cols=None, n_clusters=None, random_state=None):
    if method == "cluster":
        bucket_encoder = EncoderFactory.get_encoder(method=encoding_method,
                                                    case_id_col=case_id_col,
                                                    dynamic_cat_cols=cat_cols,
                                                    dynamic_num_cols=num_cols)
        clustering = KMeans(n_clusters=n_clusters, random_state=random_state)
        return ClusterBasedBucketer(encoder=bucket_encoder, clustering=clustering)
    elif method == "state":
        bucket_encoder = EncoderFactory.get_encoder(method=encoding_method,
                                                    case_id_col=case_id_col,
                                                    dynamic_cat_cols=cat_cols,
                                                    dynamic_num_cols=num_cols)
        return StateBasedBucketer(encoder=bucket_encoder)
    elif method == "zero":
        return NoBucketer(case_id_col=case_id_col)
    elif method == "prefix":
        return PrefixLengthBucketer(case_id_col=case_id_col)
    else:
        print("Invalid bucketer type")
        return None
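# A minimal usage sketch (hypothetical argument values; "agg" is assumed to be
# one of the encoding methods supported by EncoderFactory, and the column names
# are illustrative):
bucketer = get_bucketer(method="cluster",
                        encoding_method="agg",
                        case_id_col="case_id",
                        cat_cols=["Activity"],
                        num_cols=[],
                        n_clusters=5,
                        random_state=22)
bucket_assignments = bucketer.fit_predict(dt_train_prefixes)  # fit_predict as used elsewhere in this code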
text_transformer_args["nr_selected"] = 500 if text_method == "nb": text_transformer_args[ "pos_label"] = dataset_manager.pos_label elif text_method in ["pv", "lda"]: text_transformer_args["random_seed"] = 22 if dataset_name in ["github"]: text_transformer_args["min_freq"] = 10 elif dataset_name in ["crm2"]: text_transformer_args["min_freq"] = 20 cls_args = args_all["cls_args"] cls_args['n_estimators'] = 500 start = time.time() text_transformer = EncoderFactory.get_encoder( text_method, text_transformer_args=text_transformer_args) dt_train_text = text_transformer.fit_transform( train[dataset_manager.static_text_cols + dataset_manager.dynamic_text_cols], train[dataset_manager.label_col]) time_train += time.time() - start time_text_model = time.time() - start static_text_cols = [] dynamic_text_cols = [] for col in dataset_manager.static_text_cols + dataset_manager.dynamic_text_cols: start = time.time() dt_train_text = text_transformer.transform( train[[col]], train[dataset_manager.label_col]) current_text_cols = [ "%s_%s" % (col, text_col)
                    'dynamic_cat_cols': dataset_manager.dynamic_cat_cols,
                    'dynamic_num_cols': dataset_manager.dynamic_num_cols,
                    'fillna': fillna}

cls_args = {'random_state': random_state,
            'min_cases_for_training': n_min_cases_in_bucket}
for name, value in zip(cls_params_names, cls_params_combo):
    cls_args[name] = value

# Bucketing prefixes based on control flow
print("Bucketing prefixes...")

# initiate the KNN model
bucket_encoder = EncoderFactory.get_encoder(bucket_encoding, **knn_encoder_args)
encoded_train = bucket_encoder.fit_transform(dt_train_prefixes)
bucketer = NearestNeighbors(n_neighbors=bucketer_params_combo[0], algorithm='auto').fit(encoded_train)

# get nearest neighbors for each test case
# NB: fit_transform on the test set is only safe if the encoder is stateless (fit is a no-op)
encoded_test = bucket_encoder.fit_transform(dt_test_prefixes)
_, indices = bucketer.kneighbors(encoded_test)

# use appropriate classifier for each bucket of test cases
# for evaluation, collect predictions from different buckets together
preds = []
test_y = []
def create_and_evaluate_model(args):
    global trial_nr
    trial_nr += 1

    start = time.time()
    score = 0
    for cv_iter in range(n_splits):
        dt_test_prefixes = dt_prefixes[cv_iter]
        dt_train_prefixes = pd.DataFrame()
        for cv_train_iter in range(n_splits):
            if cv_train_iter != cv_iter:
                dt_train_prefixes = pd.concat([dt_train_prefixes, dt_prefixes[cv_train_iter]], axis=0)

        # Bucketing prefixes based on control flow
        bucketer_args = {'encoding_method': bucket_encoding,
                         'case_id_col': dataset_manager.case_id_col,
                         'cat_cols': [dataset_manager.activity_col],
                         'num_cols': [],
                         'random_state': random_state}
        if bucket_method == "cluster":
            bucketer_args["n_clusters"] = args["n_clusters"]
        bucketer = BucketFactory.get_bucketer(bucket_method, **bucketer_args)
        bucket_assignments_train = bucketer.fit_predict(dt_train_prefixes)
        bucket_assignments_test = bucketer.predict(dt_test_prefixes)

        preds_all = []
        test_y_all = []
        if "prefix" in method_name:
            scores = defaultdict(int)
        for bucket in set(bucket_assignments_test):
            relevant_train_cases_bucket = dataset_manager.get_indexes(dt_train_prefixes)[bucket_assignments_train == bucket]
            relevant_test_cases_bucket = dataset_manager.get_indexes(dt_test_prefixes)[bucket_assignments_test == bucket]
            dt_test_bucket = dataset_manager.get_relevant_data_by_indexes(dt_test_prefixes, relevant_test_cases_bucket)
            test_y = dataset_manager.get_label_numeric(dt_test_bucket)
            if len(relevant_train_cases_bucket) == 0:
                preds = [class_ratios[cv_iter]] * len(relevant_test_cases_bucket)
            else:
                # one row per event
                dt_train_bucket = dataset_manager.get_relevant_data_by_indexes(dt_train_prefixes, relevant_train_cases_bucket)
                train_y = dataset_manager.get_label_numeric(dt_train_bucket)
                if len(set(train_y)) < 2:
                    preds = [train_y[0]] * len(relevant_test_cases_bucket)
                else:
                    feature_combiner = FeatureUnion([(method, EncoderFactory.get_encoder(method, **cls_encoder_args))
                                                     for method in methods])
                    if cls_method == "rf":
                        cls = RandomForestClassifier(n_estimators=500,
                                                     max_features=args['max_features'],
                                                     random_state=random_state)
                    elif cls_method == "xgboost":
                        cls = xgb.XGBClassifier(objective='binary:logistic',
                                                n_estimators=500,
                                                learning_rate=args['learning_rate'],
                                                subsample=args['subsample'],
                                                max_depth=int(args['max_depth']),
                                                colsample_bytree=args['colsample_bytree'],
                                                min_child_weight=int(args['min_child_weight']),
                                                seed=random_state)
                    elif cls_method == "logit":
                        cls = LogisticRegression(C=2**args['C'], random_state=random_state)
                    elif cls_method == "svm":
                        cls = SVC(C=2**args['C'], gamma=2**args['gamma'], random_state=random_state)

                    if cls_method == "svm" or cls_method == "logit":
                        pipeline = Pipeline([('encoder', feature_combiner), ('scaler', StandardScaler()), ('cls', cls)])
                    else:
                        pipeline = Pipeline([('encoder', feature_combiner), ('cls', cls)])
                    pipeline.fit(dt_train_bucket, train_y)

                    if cls_method == "svm":
                        preds = pipeline.decision_function(dt_test_bucket)
                    else:
                        preds_pos_label_idx = np.where(cls.classes_ == 1)[0][0]
                        preds = pipeline.predict_proba(dt_test_bucket)[:, preds_pos_label_idx]

            if "prefix" in method_name:
                auc = 0.5
                if len(set(test_y)) == 2:
                    auc = roc_auc_score(test_y, preds)
                scores[bucket] += auc
            preds_all.extend(preds)
            test_y_all.extend(test_y)

        score += roc_auc_score(test_y_all, preds_all)

    if "prefix" in method_name:
        for k, v in args.items():
            for bucket, bucket_score in scores.items():
                fout_all.write("%s;%s;%s;%s;%s;%s;%s;%s\n" % (trial_nr, dataset_name, cls_method, method_name,
                                                              bucket, k, v, bucket_score / n_splits))
        fout_all.write("%s;%s;%s;%s;%s;%s;%s;%s\n" % (trial_nr, dataset_name, cls_method, method_name,
                                                      0, "processing_time", time.time() - start, 0))
    else:
        for k, v in args.items():
            fout_all.write("%s;%s;%s;%s;%s;%s;%s\n" % (trial_nr, dataset_name, cls_method, method_name,
                                                       k, v, score / n_splits))
        fout_all.write("%s;%s;%s;%s;%s;%s;%s\n" % (trial_nr, dataset_name, cls_method, method_name,
                                                   "processing_time", time.time() - start, 0))
    fout_all.flush()

    return {'loss': -score / n_splits, 'status': STATUS_OK, 'model': cls}
# train and fit pipeline for each bucket
for bucket in set(bucket_assignments_train):
    print("Fitting pipeline for bucket %s..." % bucket)
    relevant_cases_bucket = dataset_manager.get_indexes(dt_train_prefixes)[bucket_assignments_train == bucket]
    # one row per event
    dt_train_bucket = dataset_manager.get_relevant_data_by_indexes(dt_train_prefixes, relevant_cases_bucket)
    train_y = dataset_manager.get_label_numeric(dt_train_bucket)

    feature_combiner = FeatureUnion([(method, EncoderFactory.get_encoder(method, **cls_encoder_args))
                                     for method in methods])
    pipelines[bucket] = Pipeline([('encoder', feature_combiner),
                                  ('cls', ClassifierFactory.get_classifier(cls_method, **cls_args))])
    pipelines[bucket].fit(dt_train_bucket, train_y)

# if the bucketing is prefix-length-based, evaluate each prefix length separately;
# otherwise evaluate all prefixes together
max_evaluation_prefix_length = max_prefix_length if bucket_method == "prefix" else min_prefix_length
prefix_lengths_test = dt_test_prefixes.groupby(dataset_manager.case_id_col).size()
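# Sketch of the matching prediction step (assumed by analogy with the bucketed
# evaluation loop earlier in this section; it ignores the per-prefix-length
# evaluation that prefix-based bucketing would require):
for bucket in set(bucket_assignments_test):
    relevant_test_cases_bucket = dataset_manager.get_indexes(dt_test_prefixes)[bucket_assignments_test == bucket]
    dt_test_bucket = dataset_manager.get_relevant_data_by_indexes(dt_test_prefixes, relevant_test_cases_bucket)
    preds = pipelines[bucket].predict_proba(dt_test_bucket)  # routed to the pipeline fitted for this bucket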
# select prefixes for the given bucket
relevant_train_cases_bucket = dataset_manager.get_indexes(dt_train_prefixes)[bucket_assignments_train == bucket]
relevant_test_cases_bucket = dataset_manager.get_indexes(dt_test_prefixes)[bucket_assignments_test == bucket]
dt_train_bucket = dataset_manager.get_relevant_data_by_indexes(dt_train_prefixes, relevant_train_cases_bucket)
dt_test_bucket = dataset_manager.get_relevant_data_by_indexes(dt_test_prefixes, relevant_test_cases_bucket)
train_y = dataset_manager.get_label_numeric(dt_train_bucket)
test_y = dataset_manager.get_label_numeric(dt_test_bucket)

# add data about prefixes in this bucket (class labels and prefix lengths)
nr_events_all.extend(list(dataset_manager.get_prefix_lengths(dt_test_bucket)))
test_y_all.extend(test_y)

# encode the prefixes
feature_combiner = FeatureUnion([(method, EncoderFactory.get_encoder(method, **cls_encoder_args))
                                 for method in methods])
if "svm" in cls_method or "logit" in cls_method:
    feature_combiner = Pipeline([('encoder', feature_combiner), ('scaler', StandardScaler())])
X_train = feature_combiner.fit_transform(dt_train_bucket)
X_test = feature_combiner.transform(dt_test_bucket)

# fit classifier and calibrate
cls = ClassifierFactory.get_classifier(cls_method.replace("_calibrated", ""), current_args, random_state,
                                       min_cases_for_training, overall_class_ratio,
                                       binary=(False if "calibrate" in cls_method else True))
cls.fit(X_train, train_y)

if "calibrate" in cls_method:
    relevant_val_cases_bucket = dataset_manager.get_indexes(dt_val_prefixes)[bucket_assignments_val == bucket]
    dt_val_bucket = dataset_manager.get_relevant_data_by_indexes(dt_val_prefixes, relevant_val_cases_bucket)
    X_val = feature_combiner.transform(dt_val_bucket)
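    # A minimal sketch of how the calibration step could continue, assuming
    # scikit-learn's CalibratedClassifierCV with a prefit base classifier (the
    # repository may instead use its own wrapper; val_y is introduced here for
    # illustration):
    from sklearn.calibration import CalibratedClassifierCV  # cv="prefit" is the older scikit-learn API

    val_y = dataset_manager.get_label_numeric(dt_val_bucket)
    cls = CalibratedClassifierCV(cls, cv="prefit", method="sigmoid")
    cls.fit(X_val, val_y)
    preds = cls.predict_proba(X_test)[:, 1]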
def create_and_evaluate_model(args):
    global trial_nr, all_results
    trial_nr += 1
    print("Trial %s out of %s" % (trial_nr, n_iter))

    start = time.time()
    score = 0
    cls_args = {k: v for k, v in args.items() if k in cls_params}
    text_transformer_args = {k: v for k, v in args.items() if k not in cls_params}
    cls_args['n_estimators'] = 500

    for cv_iter in range(n_splits):
        # read encoded data
        train_chunk = dataset_manager.read_fold(os.path.join(folds_dir, "fold%s_train.csv" % cv_iter))
        test_chunk = dataset_manager.read_fold(os.path.join(folds_dir, "fold%s_test.csv" % cv_iter))

        # fit text models and transform for each event
        if text_method in ["nb", "bong"]:
            if dataset_ref in ["crm2", "github"] and cls_method == "xgboost" and "single" in bucket_enc:
                if "index" in bucket_enc:
                    text_transformer_args["nr_selected"] = 100
                    cls_args['n_estimators'] = 200
                else:
                    text_transformer_args["nr_selected"] = 200
            else:
                text_transformer_args["nr_selected"] = 500
            if 'ngram_max' not in text_transformer_args:
                text_transformer_args['ngram_max'] = 1
        if text_method == "nb":
            text_transformer_args["pos_label"] = dataset_manager.pos_label
        elif text_method in ["pv", "lda"]:
            text_transformer_args["random_seed"] = 22
            if dataset_name in ["github"]:
                text_transformer_args["min_freq"] = 20
            elif dataset_name in ["crm2"]:
                text_transformer_args["min_freq"] = 20

        text_transformer = EncoderFactory.get_encoder(text_method, text_transformer_args=text_transformer_args)
        dt_train_text = text_transformer.fit_transform(
            train_chunk[dataset_manager.static_text_cols + dataset_manager.dynamic_text_cols],
            train_chunk[dataset_manager.label_col])

        static_text_cols = []
        dynamic_text_cols = []
        for col in dataset_manager.static_text_cols + dataset_manager.dynamic_text_cols:
            dt_train_text = text_transformer.transform(train_chunk[[col]], train_chunk[dataset_manager.label_col])
            current_text_cols = ["%s_%s" % (col, text_col) for text_col in dt_train_text.columns]
            dt_train_text.columns = current_text_cols
            dt_test_text = text_transformer.transform(test_chunk[[col]])
            dt_test_text.columns = current_text_cols
            train_chunk = pd.concat([train_chunk.drop(col, axis=1), dt_train_text], axis=1, sort=False)
            test_chunk = pd.concat([test_chunk.drop(col, axis=1), dt_test_text], axis=1, sort=False)
            if col in dataset_manager.static_text_cols:
                static_text_cols.extend(current_text_cols)
            else:
                dynamic_text_cols.extend(current_text_cols)
        del dt_train_text, dt_test_text

        # generate prefixes
        if nr_events is not None:
            dt_train_prefixes = dataset_manager.generate_prefix_data(train_chunk, nr_events, nr_events)
            dt_test_prefixes = dataset_manager.generate_prefix_data(test_chunk, nr_events, nr_events)
        else:
            dt_train_prefixes = dataset_manager.generate_prefix_data(train_chunk, min_prefix_length, max_prefix_length)
            dt_test_prefixes = dataset_manager.generate_prefix_data(test_chunk, min_prefix_length, max_prefix_length)

        train_y = dataset_manager.get_label_numeric(dt_train_prefixes)
        test_y = dataset_manager.get_label_numeric(dt_test_prefixes)

        # set up sequence encoders
        encoders = []
        for method in methods:
            if cls_encoding == text_enc:
                cls_encoder_args = {'case_id_col': dataset_manager.case_id_col,
                                    'static_cat_cols': dataset_manager.static_cat_cols,
                                    'static_num_cols': dataset_manager.static_num_cols + static_text_cols,
                                    'dynamic_cat_cols': dataset_manager.dynamic_cat_cols,
                                    'dynamic_num_cols': dataset_manager.dynamic_num_cols + dynamic_text_cols,
                                    'fillna': True}
            else:
                cls_encoder_args = {'case_id_col': dataset_manager.case_id_col,
                                    'static_cat_cols': dataset_manager.static_cat_cols,
                                    'static_num_cols': dataset_manager.static_num_cols + static_text_cols,
                                    'dynamic_cat_cols': dataset_manager.dynamic_cat_cols,
                                    'dynamic_num_cols': dataset_manager.dynamic_num_cols,
                                    'fillna': True}
            encoders.append((method, EncoderFactory.get_encoder(method, **cls_encoder_args)))
        if cls_encoding != text_enc and text_enc not in methods:
            cls_encoder_args = {'case_id_col': dataset_manager.case_id_col,
                                'static_cat_cols': [],
                                'static_num_cols': [],
                                'dynamic_cat_cols': [],
                                'dynamic_num_cols': dynamic_text_cols,
                                'fillna': True}
            encoders.append((text_enc, EncoderFactory.get_encoder(text_enc, **cls_encoder_args)))
        feature_combiner = FeatureUnion(encoders)

        # fit classifier and predict
        cls = ClassifierFactory.get_classifier(cls_method, cls_args, random_state, min_cases_for_training,
                                               class_ratios[cv_iter])
        if cls_method == "svm" or cls_method == "logit":
            pipeline = Pipeline([('encoder', feature_combiner), ('scaler', StandardScaler()), ('cls', cls)])
        else:
            pipeline = Pipeline([('encoder', feature_combiner), ('cls', cls)])
        pipeline.fit(dt_train_prefixes, train_y)
        preds = pipeline.predict_proba(dt_test_prefixes)

        if len(set(test_y)) >= 2:
            score += roc_auc_score(test_y, preds)

    # save current trial results
    for k, v in cls_args.items():
        all_results.append((trial_nr, k, v, -1, score / n_splits))
    for k, v in text_transformer_args.items():
        all_results.append((trial_nr, k, v, -1, score / n_splits))

    return {'loss': -score / n_splits, 'status': STATUS_OK, 'model': cls}
text_transformer_args = args_all["text_transformer_args"] if text_method in ["nb", "bong"]: text_transformer_args["nr_selected"] = 500 if text_method == "nb": text_transformer_args["pos_label"] = dataset_manager.pos_label elif text_method in ["pv", "lda"]: text_transformer_args["random_seed"] = 22 if dataset_name in ["github"]: text_transformer_args["min_freq"] = 10 elif dataset_name in ["crm2"]: text_transformer_args["min_freq"] = 20 cls_args = args_all["cls_args"] cls_args['n_estimators'] = 500 text_transformer = EncoderFactory.get_encoder(text_method, text_transformer_args=text_transformer_args) dt_train_text = text_transformer.fit_transform(train[dataset_manager.static_text_cols+dataset_manager.dynamic_text_cols], train[dataset_manager.label_col]) static_text_cols = [] dynamic_text_cols = [] for col in dataset_manager.static_text_cols + dataset_manager.dynamic_text_cols: dt_train_text = text_transformer.transform(train[[col]], train[dataset_manager.label_col]) current_text_cols = ["%s_%s" % (col, text_col) for text_col in dt_train_text.columns] dt_train_text.columns = current_text_cols dt_test_text = text_transformer.transform(test[[col]]) dt_test_text.columns = current_text_cols train_current = pd.concat([train.drop(col, axis=1), dt_train_text], axis=1, sort=False) test_current = pd.concat([test.drop(col, axis=1), dt_test_text], axis=1, sort=False) if col in dataset_manager.static_text_cols: static_text_cols.extend(current_text_cols)
def create_and_evaluate_model(args):
    global trial_nr
    trial_nr += 1

    start = time.time()
    score = 0
    for cv_iter in range(n_splits):
        dt_test_prefixes = dt_prefixes[cv_iter]
        dt_train_prefixes = pd.DataFrame()
        for cv_train_iter in range(n_splits):
            if cv_train_iter != cv_iter:
                dt_train_prefixes = pd.concat([dt_train_prefixes, dt_prefixes[cv_train_iter]], axis=0)

        # Bucketing prefixes based on control flow
        knn_encoder_args = {'case_id_col': dataset_manager.case_id_col,
                            'static_cat_cols': [],
                            'static_num_cols': [],
                            'dynamic_cat_cols': [dataset_manager.activity_col],
                            'dynamic_num_cols': [],
                            'fillna': True}

        # initiate the KNN model
        bucket_encoder = EncoderFactory.get_encoder(bucket_encoding, **knn_encoder_args)
        encoded_train = bucket_encoder.fit_transform(dt_train_prefixes)
        bucketer = NearestNeighbors(n_neighbors=args["n_neighbors"], algorithm='auto').fit(encoded_train)

        # get nearest neighbors for each test case
        # NB: fit_transform on the test set is only safe if the encoder is stateless (fit is a no-op)
        encoded_test = bucket_encoder.fit_transform(dt_test_prefixes)
        _, indices = bucketer.kneighbors(encoded_test)

        preds_all = []
        test_y_all = []
        for i in range(len(encoded_test)):
            # retrieve nearest neighbors from training set
            knn_idxs = indices[i]
            relevant_cases_bucket = encoded_train.iloc[knn_idxs].index
            # one row per event
            dt_train_bucket = dataset_manager.get_relevant_data_by_indexes(dt_train_prefixes, relevant_cases_bucket)
            train_y = dataset_manager.get_label_numeric(dt_train_bucket)

            # select current test case
            relevant_test_case = [encoded_test.index[i]]
            dt_test_bucket = dataset_manager.get_relevant_data_by_indexes(dt_test_prefixes, relevant_test_case)
            test_y_all.extend(dataset_manager.get_label_numeric(dt_test_bucket))

            if len(set(train_y)) < 2:
                preds_all.append(train_y[0])
            else:
                feature_combiner = FeatureUnion([(method, EncoderFactory.get_encoder(method, **cls_encoder_args))
                                                 for method in methods])
                if cls_method == "rf":
                    cls = RandomForestClassifier(n_estimators=500,
                                                 max_features=args['max_features'],
                                                 random_state=random_state)
                elif cls_method == "xgboost":
                    cls = xgb.XGBClassifier(objective='binary:logistic',
                                            n_estimators=500,
                                            learning_rate=args['learning_rate'],
                                            subsample=args['subsample'],
                                            max_depth=int(args['max_depth']),
                                            colsample_bytree=args['colsample_bytree'],
                                            min_child_weight=int(args['min_child_weight']),
                                            seed=random_state)
                elif cls_method == "logit":
                    cls = LogisticRegression(C=2**args['C'], random_state=random_state)
                elif cls_method == "svm":
                    cls = SVC(C=2**args['C'], gamma=2**args['gamma'], random_state=random_state)

                if cls_method == "svm" or cls_method == "logit":
                    pipeline = Pipeline([('encoder', feature_combiner), ('scaler', StandardScaler()), ('cls', cls)])
                else:
                    pipeline = Pipeline([('encoder', feature_combiner), ('cls', cls)])
                pipeline.fit(dt_train_bucket, train_y)

                if cls_method == "svm":
                    preds = pipeline.decision_function(dt_test_bucket)
                else:
                    preds_pos_label_idx = np.where(cls.classes_ == 1)[0][0]
                    preds = pipeline.predict_proba(dt_test_bucket)[:, preds_pos_label_idx]
                preds_all.extend(preds)

        score += roc_auc_score(test_y_all, preds_all)

    for k, v in args.items():
        fout_all.write("%s;%s;%s;%s;%s;%s;%s\n" % (trial_nr, dataset_name, cls_method, method_name,
                                                   k, v, score / n_splits))
    fout_all.write("%s;%s;%s;%s;%s;%s;%s\n" % (trial_nr, dataset_name, cls_method, method_name,
                                               "processing_time", time.time() - start, 0))
    fout_all.flush()

    return {'loss': -score / n_splits, 'status': STATUS_OK, 'model': cls}
nr_events_all.extend(list(dataset_manager.get_prefix_lengths(dt_test_bucket)))
test_y_all.extend(test_y)

if cls_encoding in ("waveletLast", "waveletAgg", "waveletIndex"):
    if cls_encoding == "waveletLast":
        encoding = "laststate"
    elif cls_encoding == "waveletAgg":
        encoding = "agg"
    else:
        encoding = "index"

    # initialize pipeline for sequence encoder and classifier
    feature_combiner_last = FeatureUnion(
        [(method, EncoderFactory.get_encoder(bucket_method, method, cls_method, **cls_encoder_args))
         for method in encoding_dict[encoding]], n_jobs=-1)
    feature_combiner_wavelet = FeatureUnion(
        [(method, EncoderFactory.get_encoder(bucket_method, method, cls_method, **cls_encoder_args))
         for method in encoding_dict['wavelet']], n_jobs=-1)
    cls = ClassifierFactory.get_classifier(cls_method, current_args, random_state,
                                           min_cases_for_training, overall_class_ratio)

    # fit pipeline
                'max_depth': 8,
                'colsample_bytree': 0.48,
                'min_child_weight': 2}

# select relevant cases
relevant_cases_bucket = dataset_manager.get_indexes(dt_train_prefixes)[bucket_assignments_train == bucket]
dt_train_bucket = dataset_manager.get_relevant_data_by_indexes(dt_train_prefixes, relevant_cases_bucket)
y_train = dataset_manager.get_label_numeric(dt_train_bucket)
if len(set(y_train)) < 2:
    break

encoders = [(method, EncoderFactory.get_encoder(method, **cls_encoder_args)) for method in methods]
if "act_freqs" in dataset_name2:
    cls_encoder_args2 = {'case_id_col': dataset_manager.case_id_col,
                         'static_cat_cols': [],
                         'static_num_cols': [],
                         'dynamic_cat_cols': [],
                         'dynamic_num_cols': act_freq_cols,
                         'fillna': fillna}
    encoders.append(("act_freqs_last", EncoderFactory.get_encoder("last", **cls_encoder_args2)))
feature_combiners[bucket] = FeatureUnion(encoders)
dt_test_prefixes = dataset_manager.generate_prefix_data(test, min_prefix_length, max_prefix_length)
dt_train_prefixes = dataset_manager.generate_prefix_data(train, min_prefix_length, max_prefix_length)

# Bucketing prefixes based on control flow
knn_encoder_args = {'case_id_col': dataset_manager.case_id_col,
                    'static_cat_cols': [],
                    'static_num_cols': [],
                    'dynamic_cat_cols': [dataset_manager.activity_col],
                    'dynamic_num_cols': [],
                    'fillna': True}

# initiate the KNN model
bucket_encoder = EncoderFactory.get_encoder(bucket_encoding, **knn_encoder_args)
encoded_train = bucket_encoder.fit_transform(dt_train_prefixes)
if "n_neighbors" in args:
    n_neighbors = int(args["n_neighbors"])
else:
    n_neighbors = 50
bucketer = NearestNeighbors(n_neighbors=n_neighbors, algorithm='auto').fit(encoded_train)

for _, dt_test_bucket in dt_test_prefixes.groupby(dataset_manager.case_id_col):
    encoded_case = bucket_encoder.fit_transform(dt_test_bucket)
    _, knn_idxs = bucketer.kneighbors(encoded_case)
    knn_idxs = knn_idxs[0]
    relevant_cases_bucket = encoded_train.iloc[knn_idxs].index
cls_encoder_args = {'case_id_col': dataset_manager.case_id_col,
                    'static_cat_cols': dataset_manager.static_cat_cols,
                    'static_num_cols': dataset_manager.static_num_cols,
                    'dynamic_cat_cols': dataset_manager.dynamic_cat_cols,
                    'dynamic_num_cols': dataset_manager.dynamic_num_cols,
                    'fillna': True}

# split into training and test
train, _ = dataset_manager.split_data_strict(data, train_ratio, split="temporal")

# generate data where each prefix is a separate instance
dt_prefixes = dataset_manager.generate_prefix_data(train, min_prefix_length, max_prefix_length)

# encode all prefixes
feature_combiner = FeatureUnion([(method, EncoderFactory.get_encoder(method, **cls_encoder_args))
                                 for method in ["static", "agg"]])
X_all = feature_combiner.fit_transform(dt_prefixes)
y_all = np.array(dataset_manager.get_label_numeric(dt_prefixes))

# generate dataset that will enable easy splitting for CV -
# guarantees that prefixes of the same case remain in the same chunk
case_ids = dt_prefixes.groupby(dataset_manager.case_id_col).first()["orig_case_id"]
dt_for_splitting = pd.DataFrame({dataset_manager.case_id_col: case_ids,
                                 dataset_manager.label_col: y_all}).drop_duplicates()

print('Optimizing parameters...')
space = {  # 'n_estimators': hp.choice('n_estimators', np.arange(150, 1000, dtype=int)),
         'max_depth': scope.int(hp.quniform('max_depth', 4, 30, 1)),
         'max_features': hp.uniform('max_features', 0, 1)}
trials = Trials()
best = fmin(create_and_evaluate_model, space, algo=tpe.suggest, max_evals=10, trials=trials)
if len(relevant_train_cases_bucket) == 0:
    preds = [dataset_manager.get_class_ratio(train)] * len(relevant_test_cases_bucket)
else:
    # one row per event
    dt_train_bucket = dataset_manager.get_relevant_data_by_indexes(dt_train_prefixes, relevant_train_cases_bucket)
    train_y = dataset_manager.get_label_numeric(dt_train_bucket)

    if len(set(train_y)) < 2:
        preds = [train_y[0]] * len(relevant_test_cases_bucket)
        test_y_all.extend(dataset_manager.get_label_numeric(dt_test_bucket))
    else:
        feature_combiner = FeatureUnion([(method, EncoderFactory.get_encoder(method, **cls_encoder_args))
                                         for method in methods])
        cls = xgb.XGBClassifier(objective='binary:logistic',
                                n_estimators=500,
                                learning_rate=current_args['learning_rate'],
                                subsample=current_args['subsample'],
                                max_depth=int(current_args['max_depth']),
                                colsample_bytree=current_args['colsample_bytree'],
                                min_child_weight=int(current_args['min_child_weight']),
                                seed=random_state)
        pipeline = Pipeline([('encoder', feature_combiner), ('cls', cls)])
        pipeline.fit(dt_train_bucket, train_y)