Example #1
def get_bucketer(method, encoding_method=None, case_id_col=None, cat_cols=None, num_cols=None, n_clusters=None, random_state=None):
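    """Factory helper: return a bucketer instance for the given bucketing method.

    "cluster" groups prefixes by KMeans over their encoded representation,
    "state" derives buckets from the encoded prefix state, "zero" puts all
    prefixes into a single bucket, and "prefix" buckets by prefix length.
    """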

    if method == "cluster":
        bucket_encoder = EncoderFactory.get_encoder(method=encoding_method, case_id_col=case_id_col, dynamic_cat_cols=cat_cols, dynamic_num_cols=num_cols)
        clustering = KMeans(n_clusters=n_clusters, random_state=random_state)
        return ClusterBasedBucketer(encoder=bucket_encoder, clustering=clustering)
        
    elif method == "state":
        bucket_encoder = EncoderFactory.get_encoder(method=encoding_method, case_id_col=case_id_col, dynamic_cat_cols=cat_cols, dynamic_num_cols=num_cols)
        return StateBasedBucketer(encoder=bucket_encoder)
            
    elif method == "zero":
        return NoBucketer(case_id_col=case_id_col)

    elif method == "prefix":
        return PrefixLengthBucketer(case_id_col=case_id_col)

    else:
        raise ValueError("Invalid bucketer type: %s" % method)
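
# Usage sketch (hypothetical argument values; the encoder and bucketer classes come from this repo):
# bucketer = get_bucketer("cluster", encoding_method="agg", case_id_col="case_id",
#                         cat_cols=["Activity"], num_cols=[], n_clusters=10, random_state=22)
# bucket_assignments = bucketer.fit_predict(dt_train_prefixes)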
                text_transformer_args["nr_selected"] = 500
                if text_method == "nb":
                    text_transformer_args[
                        "pos_label"] = dataset_manager.pos_label
            elif text_method in ["pv", "lda"]:
                text_transformer_args["random_seed"] = 22
            if dataset_name in ["github"]:
                text_transformer_args["min_freq"] = 10
            elif dataset_name in ["crm2"]:
                text_transformer_args["min_freq"] = 20

            cls_args = args_all["cls_args"]
            cls_args['n_estimators'] = 500

            start = time.time()
            text_transformer = EncoderFactory.get_encoder(
                text_method, text_transformer_args=text_transformer_args)
            dt_train_text = text_transformer.fit_transform(
                train[dataset_manager.static_text_cols +
                      dataset_manager.dynamic_text_cols],
                train[dataset_manager.label_col])
            time_train += time.time() - start
            time_text_model = time.time() - start

            static_text_cols = []
            dynamic_text_cols = []
            for col in dataset_manager.static_text_cols + dataset_manager.dynamic_text_cols:
                start = time.time()
                dt_train_text = text_transformer.transform(
                    train[[col]], train[dataset_manager.label_col])
                current_text_cols = [
                    "%s_%s" % (col, text_col)
Example #3
                        'dynamic_cat_cols': dataset_manager.dynamic_cat_cols,
                        'dynamic_num_cols': dataset_manager.dynamic_num_cols,
                        'fillna': fillna
                    }

                    cls_args = {
                        'random_state': random_state,
                        'min_cases_for_training': n_min_cases_in_bucket
                    }
                    for i in range(len(cls_params_names)):
                        cls_args[cls_params_names[i]] = cls_params_combo[i]

                    # Bucketing prefixes based on control flow
                    print("Bucketing prefixes...")
                    # initiate the KNN model
                    bucket_encoder = EncoderFactory.get_encoder(
                        bucket_encoding, **knn_encoder_args)
                    encoded_train = bucket_encoder.fit_transform(
                        dt_train_prefixes)
                    bucketer = NearestNeighbors(
                        n_neighbors=bucketer_params_combo[0],
                        algorithm='auto').fit(encoded_train)

                    # get nearest neighbors for each test case
                    encoded_test = bucket_encoder.fit_transform(
                        dt_test_prefixes)
                    _, indices = bucketer.kneighbors(encoded_test)

                    # use appropriate classifier for each bucket of test cases
                    # for evaluation, collect predictions from different buckets together
                    preds = []
                    test_y = []
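
# Hyperopt objective: for each CV fold, bucket the prefixes, train one classifier per
# bucket, and return the negated mean ROC AUC across folds as the hyperopt loss.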
def create_and_evaluate_model(args):
    global trial_nr
    trial_nr += 1

    start = time.time()
    score = 0
    for cv_iter in range(n_splits):

        dt_test_prefixes = dt_prefixes[cv_iter]
        dt_train_prefixes = pd.DataFrame()
        for cv_train_iter in range(n_splits):
            if cv_train_iter != cv_iter:
                dt_train_prefixes = pd.concat(
                    [dt_train_prefixes, dt_prefixes[cv_train_iter]], axis=0)

        # Bucketing prefixes based on control flow
        bucketer_args = {
            'encoding_method': bucket_encoding,
            'case_id_col': dataset_manager.case_id_col,
            'cat_cols': [dataset_manager.activity_col],
            'num_cols': [],
            'random_state': random_state
        }
        if bucket_method == "cluster":
            bucketer_args["n_clusters"] = args["n_clusters"]
        bucketer = BucketFactory.get_bucketer(bucket_method, **bucketer_args)
        bucket_assignments_train = bucketer.fit_predict(dt_train_prefixes)
        bucket_assignments_test = bucketer.predict(dt_test_prefixes)

        preds_all = []
        test_y_all = []
        if "prefix" in method_name:
            scores = defaultdict(int)
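        # train and evaluate a separate classifier for each bucket observed in the test fold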
        for bucket in set(bucket_assignments_test):
            relevant_train_cases_bucket = dataset_manager.get_indexes(
                dt_train_prefixes)[bucket_assignments_train == bucket]
            relevant_test_cases_bucket = dataset_manager.get_indexes(
                dt_test_prefixes)[bucket_assignments_test == bucket]
            dt_test_bucket = dataset_manager.get_relevant_data_by_indexes(
                dt_test_prefixes, relevant_test_cases_bucket)
            test_y = dataset_manager.get_label_numeric(dt_test_bucket)
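            # no training cases in this bucket: fall back to the fold's overall class ratio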
            if len(relevant_train_cases_bucket) == 0:
                preds = [class_ratios[cv_iter]
                         ] * len(relevant_test_cases_bucket)
            else:
                dt_train_bucket = dataset_manager.get_relevant_data_by_indexes(
                    dt_train_prefixes,
                    relevant_train_cases_bucket)  # one row per event
                train_y = dataset_manager.get_label_numeric(dt_train_bucket)
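                # single-class bucket: predict that class directly, otherwise fit a full pipeline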

                if len(set(train_y)) < 2:
                    preds = [train_y[0]] * len(relevant_test_cases_bucket)
                else:
                    feature_combiner = FeatureUnion([
                        (method,
                         EncoderFactory.get_encoder(method,
                                                    **cls_encoder_args))
                        for method in methods
                    ])

                    if cls_method == "rf":
                        cls = RandomForestClassifier(
                            n_estimators=500,
                            max_features=args['max_features'],
                            random_state=random_state)

                    elif cls_method == "xgboost":
                        cls = xgb.XGBClassifier(
                            objective='binary:logistic',
                            n_estimators=500,
                            learning_rate=args['learning_rate'],
                            subsample=args['subsample'],
                            max_depth=int(args['max_depth']),
                            colsample_bytree=args['colsample_bytree'],
                            min_child_weight=int(args['min_child_weight']),
                            seed=random_state)

                    elif cls_method == "logit":
                        cls = LogisticRegression(C=2**args['C'],
                                                 random_state=random_state)

                    elif cls_method == "svm":
                        cls = SVC(C=2**args['C'],
                                  gamma=2**args['gamma'],
                                  random_state=random_state)

                    if cls_method == "svm" or cls_method == "logit":
                        pipeline = Pipeline([('encoder', feature_combiner),
                                             ('scaler', StandardScaler()),
                                             ('cls', cls)])
                    else:
                        pipeline = Pipeline([('encoder', feature_combiner),
                                             ('cls', cls)])
                    pipeline.fit(dt_train_bucket, train_y)

                    if cls_method == "svm":
                        preds = pipeline.decision_function(dt_test_bucket)
                    else:
                        preds_pos_label_idx = np.where(cls.classes_ == 1)[0][0]
                        preds = pipeline.predict_proba(
                            dt_test_bucket)[:, preds_pos_label_idx]

            if "prefix" in method_name:
                auc = 0.5
                if len(set(test_y)) == 2:
                    auc = roc_auc_score(test_y, preds)
                scores[bucket] += auc
            preds_all.extend(preds)
            test_y_all.extend(test_y)

        score += roc_auc_score(test_y_all, preds_all)

    if "prefix" in method_name:
        for k, v in args.items():
            for bucket, bucket_score in scores.items():
                fout_all.write(
                    "%s;%s;%s;%s;%s;%s;%s;%s\n" %
                    (trial_nr, dataset_name, cls_method, method_name, bucket,
                     k, v, bucket_score / n_splits))
        fout_all.write("%s;%s;%s;%s;%s;%s;%s;%s\n" %
                       (trial_nr, dataset_name, cls_method, method_name, 0,
                        "processing_time", time.time() - start, 0))
    else:
        for k, v in args.items():
            fout_all.write("%s;%s;%s;%s;%s;%s;%s\n" %
                           (trial_nr, dataset_name, cls_method, method_name, k,
                            v, score / n_splits))
        fout_all.write("%s;%s;%s;%s;%s;%s;%s\n" %
                       (trial_nr, dataset_name, cls_method, method_name,
                        "processing_time", time.time() - start, 0))
    fout_all.flush()
    return {'loss': -score / n_splits, 'status': STATUS_OK, 'model': cls}
Example #5
                    # train and fit pipeline for each bucket
                    for bucket in set(bucket_assignments_train):
                        print("Fitting pipeline for bucket %s..." % bucket)
                        relevant_cases_bucket = dataset_manager.get_indexes(
                            dt_train_prefixes)[bucket_assignments_train ==
                                               bucket]
                        dt_train_bucket = dataset_manager.get_relevant_data_by_indexes(
                            dt_train_prefixes,
                            relevant_cases_bucket)  # one row per event
                        train_y = dataset_manager.get_label_numeric(
                            dt_train_bucket)

                        feature_combiner = FeatureUnion([
                            (method,
                             EncoderFactory.get_encoder(
                                 method, **cls_encoder_args))
                            for method in methods
                        ])
                        pipelines[bucket] = Pipeline([
                            ('encoder', feature_combiner),
                            ('cls',
                             ClassifierFactory.get_classifier(
                                 cls_method, **cls_args))
                        ])
                        pipelines[bucket].fit(dt_train_bucket, train_y)

                    # if the bucketing is prefix-length-based, then evaluate for each prefix length separately, otherwise evaluate all prefixes together
                    max_evaluation_prefix_length = max_prefix_length if bucket_method == "prefix" else min_prefix_length

                    prefix_lengths_test = dt_test_prefixes.groupby(
                        dataset_manager.case_id_col).size()
Example #6
        # select prefixes for the given bucket
        relevant_train_cases_bucket = dataset_manager.get_indexes(dt_train_prefixes)[bucket_assignments_train == bucket]
        relevant_test_cases_bucket = dataset_manager.get_indexes(dt_test_prefixes)[bucket_assignments_test == bucket]
        
        dt_train_bucket = dataset_manager.get_relevant_data_by_indexes(dt_train_prefixes, relevant_train_cases_bucket)
        dt_test_bucket = dataset_manager.get_relevant_data_by_indexes(dt_test_prefixes, relevant_test_cases_bucket)

        train_y = dataset_manager.get_label_numeric(dt_train_bucket)
        test_y = dataset_manager.get_label_numeric(dt_test_bucket)
            
        # add data about prefixes in this bucket (class labels and prefix lengths)
        nr_events_all.extend(list(dataset_manager.get_prefix_lengths(dt_test_bucket)))
        test_y_all.extend(test_y)

        # encode the prefixes
        feature_combiner = FeatureUnion([(method, EncoderFactory.get_encoder(method, **cls_encoder_args)) for method in methods])
        if "svm" in cls_method or "logit" in cls_method:
            feature_combiner = Pipeline([('encoder', feature_combiner), ('scaler', StandardScaler())])
            
        X_train = feature_combiner.fit_transform(dt_train_bucket)
        X_test = feature_combiner.transform(dt_test_bucket)

        # fit classifier and calibrate
        cls = ClassifierFactory.get_classifier(cls_method.replace("_calibrated", ""), current_args, random_state, min_cases_for_training, overall_class_ratio, binary=(False if "calibrate" in cls_method else True))
        cls.fit(X_train, train_y)

        if "calibrate" in cls_method:
            relevant_val_cases_bucket = dataset_manager.get_indexes(dt_val_prefixes)[bucket_assignments_val == bucket]
            dt_val_bucket = dataset_manager.get_relevant_data_by_indexes(dt_val_prefixes, relevant_val_cases_bucket)

            X_val = feature_combiner.transform(dt_val_bucket)
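
# Hyperopt objective for the text-aware setup: per CV fold, fit the text encoders on the
# training chunk, regenerate prefixes, and score an encoder+classifier pipeline by ROC AUC.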
def create_and_evaluate_model(args):
    global trial_nr, all_results
    trial_nr += 1
    
    print("Trial %s out of %s" % (trial_nr, n_iter))
    
    start = time.time()
    score = 0
    
    cls_args = {k: v for k, v in args.items() if k in cls_params}
    text_transformer_args = {k: v for k, v in args.items() if k not in cls_params}
    cls_args['n_estimators'] = 500
    
    for cv_iter in range(n_splits):
        
        # read encoded data
        train_chunk = dataset_manager.read_fold(os.path.join(folds_dir, "fold%s_train.csv" % cv_iter))
        test_chunk = dataset_manager.read_fold(os.path.join(folds_dir, "fold%s_test.csv" % cv_iter))
        
        # fit text models and transform for each event
        if text_method in ["nb", "bong"]:
            if dataset_ref in ["crm2", "github"] and cls_method == "xgboost" and "single" in bucket_enc:
                if "index" in bucket_enc:
                    text_transformer_args["nr_selected"] = 100
                    cls_args['n_estimators'] = 200
                else:
                    text_transformer_args["nr_selected"] = 200
            else:
                text_transformer_args["nr_selected"] = 500
            if 'ngram_max' not in text_transformer_args:
                text_transformer_args['ngram_max'] = 1
            if text_method == "nb":
                text_transformer_args["pos_label"] = dataset_manager.pos_label
        elif text_method in ["pv", "lda"]:
            text_transformer_args["random_seed"] = 22
        if dataset_name in ["github"]:
            text_transformer_args["min_freq"] = 20
        elif dataset_name in ["crm2"]:
            text_transformer_args["min_freq"] = 20
        
        text_transformer = EncoderFactory.get_encoder(text_method, text_transformer_args=text_transformer_args)
        dt_train_text = text_transformer.fit_transform(train_chunk[dataset_manager.static_text_cols+dataset_manager.dynamic_text_cols], 
                                                       train_chunk[dataset_manager.label_col])
        
        static_text_cols = []
        dynamic_text_cols = []
        for col in dataset_manager.static_text_cols + dataset_manager.dynamic_text_cols:
            dt_train_text = text_transformer.transform(train_chunk[[col]], train_chunk[dataset_manager.label_col])
            current_text_cols = ["%s_%s" % (col, text_col) for text_col in dt_train_text.columns]
            dt_train_text.columns = current_text_cols
            dt_test_text = text_transformer.transform(test_chunk[[col]])
            dt_test_text.columns = current_text_cols
            train_chunk = pd.concat([train_chunk.drop(col, axis=1), dt_train_text], axis=1, sort=False)
            test_chunk = pd.concat([test_chunk.drop(col, axis=1), dt_test_text], axis=1, sort=False)
            if col in dataset_manager.static_text_cols:
                static_text_cols.extend(current_text_cols)
            else:
                dynamic_text_cols.extend(current_text_cols)
            del dt_train_text, dt_test_text
        
        # generate prefixes
        if nr_events is not None:
            dt_train_prefixes = dataset_manager.generate_prefix_data(train_chunk, nr_events, nr_events)
            dt_test_prefixes = dataset_manager.generate_prefix_data(test_chunk, nr_events, nr_events)
        else:
            dt_train_prefixes = dataset_manager.generate_prefix_data(train_chunk, min_prefix_length, max_prefix_length)
            dt_test_prefixes = dataset_manager.generate_prefix_data(test_chunk, min_prefix_length, max_prefix_length)
                
        train_y = dataset_manager.get_label_numeric(dt_train_prefixes)
        test_y = dataset_manager.get_label_numeric(dt_test_prefixes)
            
        # set up sequence encoders
        encoders = []
        for method in methods:
            if cls_encoding == text_enc:
                cls_encoder_args = {'case_id_col': dataset_manager.case_id_col, 
                    'static_cat_cols': dataset_manager.static_cat_cols,
                    'static_num_cols': dataset_manager.static_num_cols + static_text_cols, 
                    'dynamic_cat_cols': dataset_manager.dynamic_cat_cols,
                    'dynamic_num_cols': dataset_manager.dynamic_num_cols + dynamic_text_cols, 
                    'fillna': True}
            else:
                cls_encoder_args = {'case_id_col': dataset_manager.case_id_col, 
                    'static_cat_cols': dataset_manager.static_cat_cols,
                    'static_num_cols': dataset_manager.static_num_cols + static_text_cols, 
                    'dynamic_cat_cols': dataset_manager.dynamic_cat_cols,
                    'dynamic_num_cols': dataset_manager.dynamic_num_cols, 
                    'fillna': True}
            encoders.append((method, EncoderFactory.get_encoder(method, **cls_encoder_args)))
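        # if the text-derived columns are not covered by the chosen encoding, add a dedicated encoder for them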
        if cls_encoding != text_enc and text_enc not in methods:
            cls_encoder_args = {'case_id_col': dataset_manager.case_id_col, 
                    'static_cat_cols': [],
                    'static_num_cols': [], 
                    'dynamic_cat_cols': [],
                    'dynamic_num_cols': dynamic_text_cols, 
                    'fillna': True}
            encoders.append((text_enc, EncoderFactory.get_encoder(text_enc, **cls_encoder_args)))
                
        feature_combiner = FeatureUnion(encoders)
        
        # fit classifier and predict
        cls = ClassifierFactory.get_classifier(cls_method, cls_args, random_state, min_cases_for_training, class_ratios[cv_iter])

        if cls_method == "svm" or cls_method == "logit":
            pipeline = Pipeline([('encoder', feature_combiner), ('scaler', StandardScaler()), ('cls', cls)])
        else:
            pipeline = Pipeline([('encoder', feature_combiner), ('cls', cls)])

        pipeline.fit(dt_train_prefixes, train_y)
        preds = pipeline.predict_proba(dt_test_prefixes)

        if len(set(test_y)) >= 2:
            score += roc_auc_score(test_y, preds)
    
    # save current trial results
    for k, v in cls_args.items():
        all_results.append((trial_nr, k, v, -1, score / n_splits))
    for k, v in text_transformer_args.items():
        all_results.append((trial_nr, k, v, -1, score / n_splits))

    return {'loss': -score / n_splits, 'status': STATUS_OK, 'model': cls}
Example #8
        text_transformer_args = args_all["text_transformer_args"]
        if text_method in ["nb", "bong"]:
            text_transformer_args["nr_selected"] = 500
            if text_method == "nb":
                text_transformer_args["pos_label"] = dataset_manager.pos_label
        elif text_method in ["pv", "lda"]:
            text_transformer_args["random_seed"] = 22
        if dataset_name in ["github"]:
            text_transformer_args["min_freq"] = 10
        elif dataset_name in ["crm2"]:
            text_transformer_args["min_freq"] = 20
            
        cls_args = args_all["cls_args"]
        cls_args['n_estimators'] = 500

        text_transformer = EncoderFactory.get_encoder(text_method, text_transformer_args=text_transformer_args)
        dt_train_text = text_transformer.fit_transform(train[dataset_manager.static_text_cols+dataset_manager.dynamic_text_cols], 
                                                       train[dataset_manager.label_col])
        
        static_text_cols = []
        dynamic_text_cols = []
        for col in dataset_manager.static_text_cols + dataset_manager.dynamic_text_cols:
            dt_train_text = text_transformer.transform(train[[col]], train[dataset_manager.label_col])
            current_text_cols = ["%s_%s" % (col, text_col) for text_col in dt_train_text.columns]
            dt_train_text.columns = current_text_cols
            dt_test_text = text_transformer.transform(test[[col]])
            dt_test_text.columns = current_text_cols
            train_current = pd.concat([train.drop(col, axis=1), dt_train_text], axis=1, sort=False)
            test_current = pd.concat([test.drop(col, axis=1), dt_test_text], axis=1, sort=False)
            if col in dataset_manager.static_text_cols:
                static_text_cols.extend(current_text_cols)
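
# Hyperopt objective for the KNN-based bucketing: every test prefix is scored by a
# classifier trained on its n_neighbors nearest training prefixes.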
def create_and_evaluate_model(args):
    global trial_nr
    trial_nr += 1

    start = time.time()
    score = 0
    for cv_iter in range(n_splits):

        dt_test_prefixes = dt_prefixes[cv_iter]
        dt_train_prefixes = pd.DataFrame()
        for cv_train_iter in range(n_splits):
            if cv_train_iter != cv_iter:
                dt_train_prefixes = pd.concat(
                    [dt_train_prefixes, dt_prefixes[cv_train_iter]], axis=0)

        # Bucketing prefixes based on control flow
        knn_encoder_args = {
            'case_id_col': dataset_manager.case_id_col,
            'static_cat_cols': [],
            'static_num_cols': [],
            'dynamic_cat_cols': [dataset_manager.activity_col],
            'dynamic_num_cols': [],
            'fillna': True
        }
        # initiate the KNN model
        bucket_encoder = EncoderFactory.get_encoder(bucket_encoding,
                                                    **knn_encoder_args)
        encoded_train = bucket_encoder.fit_transform(dt_train_prefixes)
        bucketer = NearestNeighbors(n_neighbors=args["n_neighbors"],
                                    algorithm='auto').fit(encoded_train)

        # get nearest neighbors for each test case
        encoded_test = bucket_encoder.fit_transform(dt_test_prefixes)
        _, indices = bucketer.kneighbors(encoded_test)

        preds_all = []
        test_y_all = []
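        # build and evaluate a dedicated classifier for each test prefix from its nearest neighbours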
        for i in range(len(encoded_test)):

            # retrieve nearest neighbors from training set
            knn_idxs = indices[i]
            relevant_cases_bucket = encoded_train.iloc[knn_idxs].index
            dt_train_bucket = dataset_manager.get_relevant_data_by_indexes(
                dt_train_prefixes, relevant_cases_bucket)  # one row per event
            train_y = dataset_manager.get_label_numeric(dt_train_bucket)

            # select current test case
            relevant_test_case = [encoded_test.index[i]]
            dt_test_bucket = dataset_manager.get_relevant_data_by_indexes(
                dt_test_prefixes, relevant_test_case)
            test_y_all.extend(
                dataset_manager.get_label_numeric(dt_test_bucket))

            if len(set(train_y)) < 2:
                preds_all.append(train_y[0])
            else:
                feature_combiner = FeatureUnion([
                    (method,
                     EncoderFactory.get_encoder(method, **cls_encoder_args))
                    for method in methods
                ])

                if cls_method == "rf":
                    cls = RandomForestClassifier(
                        n_estimators=500,
                        max_features=args['max_features'],
                        random_state=random_state)

                elif cls_method == "xgboost":
                    cls = xgb.XGBClassifier(
                        objective='binary:logistic',
                        n_estimators=500,
                        learning_rate=args['learning_rate'],
                        subsample=args['subsample'],
                        max_depth=int(args['max_depth']),
                        colsample_bytree=args['colsample_bytree'],
                        min_child_weight=int(args['min_child_weight']),
                        seed=random_state)

                elif cls_method == "logit":
                    cls = LogisticRegression(C=2**args['C'],
                                             random_state=random_state)

                elif cls_method == "svm":
                    cls = SVC(C=2**args['C'],
                              gamma=2**args['gamma'],
                              random_state=random_state)

                if cls_method == "svm" or cls_method == "logit":
                    pipeline = Pipeline([('encoder', feature_combiner),
                                         ('scaler', StandardScaler()),
                                         ('cls', cls)])
                else:
                    pipeline = Pipeline([('encoder', feature_combiner),
                                         ('cls', cls)])
                pipeline.fit(dt_train_bucket, train_y)

                if cls_method == "svm":
                    preds = pipeline.decision_function(dt_test_bucket)
                else:
                    preds_pos_label_idx = np.where(cls.classes_ == 1)[0][0]
                    preds = pipeline.predict_proba(
                        dt_test_bucket)[:, preds_pos_label_idx]

                preds_all.extend(preds)

        score += roc_auc_score(test_y_all, preds_all)

    for k, v in args.items():
        fout_all.write("%s;%s;%s;%s;%s;%s;%s\n" %
                       (trial_nr, dataset_name, cls_method, method_name, k, v,
                        score / n_splits))
    fout_all.write("%s;%s;%s;%s;%s;%s;%s\n" %
                   (trial_nr, dataset_name, cls_method, method_name,
                    "processing_time", time.time() - start, 0))
    fout_all.flush()
    return {'loss': -score / n_splits, 'status': STATUS_OK, 'model': cls}
Example #10
                list(dataset_manager.get_prefix_lengths(dt_test_bucket)))
            test_y_all.extend(test_y)

            if cls_encoding in ("waveletLast", "waveletAgg", "waveletIndex"):
                if cls_encoding == "waveletLast":
                    encoding = "laststate"
                elif cls_encoding == "waveletAgg":
                    encoding = "agg"
                else:
                    encoding = "index"

            # initialize pipeline for sequence encoder and classifier
            feature_combiner_last = FeatureUnion(
                [(method,
                  EncoderFactory.get_encoder(bucket_method, method, cls_method,
                                             **cls_encoder_args))
                 for method in encoding_dict[encoding]],
                n_jobs=-1)
            feature_combiner_wavelet = FeatureUnion(
                [(method,
                  EncoderFactory.get_encoder(bucket_method, method, cls_method,
                                             **cls_encoder_args))
                 for method in encoding_dict['wavelet']],
                n_jobs=-1)

            cls = ClassifierFactory.get_classifier(cls_method, current_args,
                                                   random_state,
                                                   min_cases_for_training,
                                                   overall_class_ratio)

            # fit pipeline
                'max_depth': 8,
                'colsample_bytree': 0.48,
                'min_child_weight': 2
            }

        # select relevant cases
        relevant_cases_bucket = dataset_manager.get_indexes(dt_train_prefixes)[
            bucket_assignments_train == bucket]
        dt_train_bucket = dataset_manager.get_relevant_data_by_indexes(
            dt_train_prefixes, relevant_cases_bucket)
        y_train = dataset_manager.get_label_numeric(dt_train_bucket)
        if len(set(y_train)) < 2:
            break

        encoders = [(method,
                     EncoderFactory.get_encoder(method, **cls_encoder_args))
                    for method in methods]
        if "act_freqs" in dataset_name2:
            cls_encoder_args2 = {
                'case_id_col': dataset_manager.case_id_col,
                'static_cat_cols': [],
                'static_num_cols': [],
                'dynamic_cat_cols': [],
                'dynamic_num_cols': act_freq_cols,
                'fillna': fillna
            }
            encoders.append(
                ("act_freqs_last",
                 EncoderFactory.get_encoder("last", **cls_encoder_args2)))
        feature_combiners[bucket] = FeatureUnion(encoders)
    dt_test_prefixes = dataset_manager.generate_prefix_data(
        test, min_prefix_length, max_prefix_length)
    dt_train_prefixes = dataset_manager.generate_prefix_data(
        train, min_prefix_length, max_prefix_length)

    # Bucketing prefixes based on control flow
    knn_encoder_args = {
        'case_id_col': dataset_manager.case_id_col,
        'static_cat_cols': [],
        'static_num_cols': [],
        'dynamic_cat_cols': [dataset_manager.activity_col],
        'dynamic_num_cols': [],
        'fillna': True
    }
    # initiate the KNN model
    bucket_encoder = EncoderFactory.get_encoder(bucket_encoding,
                                                **knn_encoder_args)
    encoded_train = bucket_encoder.fit_transform(dt_train_prefixes)
    if "n_neighbors" in args:
        n_neighbors = int(args["n_neighbors"])
    else:
        n_neighbors = 50
    bucketer = NearestNeighbors(n_neighbors=n_neighbors,
                                algorithm='auto').fit(encoded_train)

    for _, dt_test_bucket in dt_test_prefixes.groupby(
            dataset_manager.case_id_col):
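        # this test prefix's bucket consists of its nearest training prefixes in the encoded space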
        encoded_case = bucket_encoder.fit_transform(dt_test_bucket)
        _, knn_idxs = bucketer.kneighbors(encoded_case)
        knn_idxs = knn_idxs[0]

        relevant_cases_bucket = encoded_train.iloc[knn_idxs].index
Example #13
cls_encoder_args = {'case_id_col': dataset_manager.case_id_col, 
                    'static_cat_cols': dataset_manager.static_cat_cols,
                    'static_num_cols': dataset_manager.static_num_cols, 
                    'dynamic_cat_cols': dataset_manager.dynamic_cat_cols,
                    'dynamic_num_cols': dataset_manager.dynamic_num_cols, 
                    'fillna': True}
    
# split into training and test
train, _ = dataset_manager.split_data_strict(data, train_ratio, split="temporal")
    
# generate data where each prefix is a separate instance
dt_prefixes = dataset_manager.generate_prefix_data(train, min_prefix_length, max_prefix_length)

# encode all prefixes
feature_combiner = FeatureUnion([(method, EncoderFactory.get_encoder(method, **cls_encoder_args)) for method in ["static", "agg"]])
X_all = feature_combiner.fit_transform(dt_prefixes)
y_all = np.array(dataset_manager.get_label_numeric(dt_prefixes))

# generate dataset that will enable easy splitting for CV - to guarantee that prefixes of the same case will remain in the same chunk
case_ids = dt_prefixes.groupby(dataset_manager.case_id_col).first()["orig_case_id"]
dt_for_splitting = pd.DataFrame({dataset_manager.case_id_col: case_ids, dataset_manager.label_col: y_all}).drop_duplicates()

print('Optimizing parameters...')

space = {#'n_estimators': hp.choice('n_estimators', np.arange(150, 1000, dtype=int)),
         'max_depth': scope.int(hp.quniform('max_depth', 4, 30, 1)),
         'max_features': hp.uniform('max_features', 0, 1)}
trials = Trials()
best = fmin(create_and_evaluate_model, space, algo=tpe.suggest, max_evals=10, trials=trials)
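# fmin returns the best-performing parameter values found by TPE over max_evals trials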
    if len(relevant_train_cases_bucket) == 0:
        preds = [dataset_manager.get_class_ratio(train)] * len(relevant_test_cases_bucket)

    else:
        dt_train_bucket = dataset_manager.get_relevant_data_by_indexes(dt_train_prefixes,
                                                                       relevant_train_cases_bucket)  # one row per event
        train_y = dataset_manager.get_label_numeric(dt_train_bucket)

        if len(set(train_y)) < 2:
            preds = [train_y[0]] * len(relevant_test_cases_bucket)

            test_y_all.extend(dataset_manager.get_label_numeric(dt_test_bucket))
        else:

            feature_combiner = FeatureUnion(
                [(method, EncoderFactory.get_encoder(method, **cls_encoder_args)) for method in methods])


            cls = xgb.XGBClassifier(objective='binary:logistic',
                                    n_estimators=500,
                                    learning_rate=current_args['learning_rate'],
                                    subsample=current_args['subsample'],
                                    max_depth=int(current_args['max_depth']),
                                    colsample_bytree=current_args['colsample_bytree'],
                                    min_child_weight=int(current_args['min_child_weight']),
                                    seed=random_state)

            pipeline = Pipeline([('encoder', feature_combiner), ('cls', cls)])

            pipeline.fit(dt_train_bucket, train_y)