Example #1
def load_model_config(model_path, log_name=None):
    import json
    from .utils.config_utils import load_json
    from .utils.log_utils import get_logger
    config = load_json(model_path)
    if log_name is not None:
        logger = get_logger(log_name)
        logger.info(log_name)
        logger.info("\n" + json.dumps(
            config, sort_keys=True, indent=4, separators=(',', ':')))
    return config
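

# Hypothetical usage; the config path and log name below are illustrative only.
config = load_model_config("configs/cascade.json", log_name="cfxgb")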
Example #2
# -*- coding:utf-8 -*-
import os, os.path as osp
import numpy as np

from cfxgb.lib.utils.log_utils import get_logger
from cfxgb.lib.utils.cache_utils import name2path

LOGGER = get_logger("cfxgb")


def check_dir(path):
    """Ensure the parent directory of `path` exists."""
    d = osp.abspath(osp.join(path, osp.pardir))
    if not osp.exists(d):
        os.makedirs(d)
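

# Illustrative usage (hypothetical path): ensures "output/" exists before a save.
check_dir("output/model.pkl")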


class BaseClassifierWrapper(object):
    def __init__(self, name, est_class, est_args):
        """
        name: str)
            Used for debug and as the filename this model may be saved in the disk
        """
        self.name = name
        self.est_class = est_class
        self.est_args = est_args
        self.cache_suffix = ".pkl"
        self.est = None

    def _init_estimator(self):
        """
        You can re-implement this function when inheriting from this class.
        """
        # Assumed default: build the estimator from the stored class and args.
        return self.est_class(**self.est_args)
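

# A minimal usage sketch: the wrapped estimator and its arguments here are
# illustrative, not taken from the source.
from xgboost import XGBClassifier

wrapper = BaseClassifierWrapper("xgb", XGBClassifier, {"n_estimators": 100})
est = wrapper._init_estimator()  # returns an XGBClassifier instance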
Example #3
import sys
import time
import json
import logging
import os.path as osp

import numpy as np
import pandas as pd
from imblearn.under_sampling import RandomUnderSampler
from sklearn import metrics
from sklearn.feature_selection import RFECV
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.model_selection import train_test_split
from xgboost import XGBClassifier

# Assumed import paths for the project's own modules (not shown in this snippet).
from cfxgb import CFXGB
from cfxgb.lib.utils.config_utils import load_json
from cfxgb.lib.utils.log_utils import get_logger


def main(args):
    # Logging
    logger = get_logger("cfxgb")

    ################################################################################################################
    # ARGUMENT CHECK
    ################################################################################################################

    if args.Dataset is None:
        logger.error("Dataset required")
        sys.exit(1)

    if args.ParentCols < 0:
        logger.error("Enter a valid number of parent levels")
        sys.exit(1)

    if args.parameters is None:
        logger.error("Model parameters required")
        sys.exit(1)
    config = load_json(args.parameters)
    logger.info("Loaded JSON")

    logger.info(
        "JSON ----------------------------------------------------------------------------------"
    )
    json1 = json.dumps(config, indent=4, separators=(". ", " = "))
    logger.info(json1)
    logger.info(
        "END OF JSON----------------------------------------------------------------------------"
    )

    ################################################################################################################
    # DATASET
    ################################################################################################################

    if not osp.exists(args.Dataset):
        full_path = osp.join('Datasets', args.Dataset + '.csv')
        if not osp.exists(full_path):
            logger.error("Enter a valid Dataset")
            sys.exit(1)
    else:
        full_path = args.Dataset

    logger.info(args.Dataset + " used")
    data = pd.read_csv(full_path)
    if args.ignore:
        logger.info("First column ignored")
        data = data.iloc[:, 1:]

    logger.info("Data Read Complete")
    ################################################################################################################

    ################################################################################################################
    # Extra Columns
    ################################################################################################################

    if args.ParentCols:
        logger.info("{} level(s) of parent nodes will be added.".format(
            args.ParentCols))
    else:
        logger.info("Parent nodes not considered")
    ################################################################################################################

    ################################################################################################################
    # Sample
    ################################################################################################################

    if args.sample:
        # Each row's weight is the size of its class (count of its label).
        weights = data.groupby(
            data.columns[-1])[data.columns[-1]].transform('count')
        if len(np.unique(weights)) == 1:
            logger.info("Equal weights already.")
            data = data.sample(n=args.sample, random_state=0)
        else:
            # Invert the weights so smaller classes are sampled more heavily.
            total = np.sum(np.unique(weights))
            weights = total - weights
            data = data.sample(n=args.sample, weights=weights, random_state=0)
        logger.info("Distribution after sampling : \n{}".format(
            data.iloc[:, -1].value_counts()))

    ################################################################################################################

    ################################################################################################################
    # X, y
    ################################################################################################################

    X = data.iloc[:, :-1]  # features: all but the last column
    y = data.iloc[:, -1]   # label: the last column

    ################################################################################################################

    ################################################################################################################
    # Feature Selection (Initial)
    ################################################################################################################

    if args.featureSelect:
        logger.info("Feature Selection - Initial")
        # Recursive feature elimination with 5-fold CV, ranked by an XGBoost classifier.
        clf = XGBClassifier(n_estimators=100,
                            learning_rate=0.3,
                            max_depth=4,
                            verbosity=0,
                            random_state=0,
                            n_jobs=-1)
        rfe = RFECV(clf, step=1, cv=5, verbose=0)
        X = rfe.fit_transform(X, y)

    ################################################################################################################

    ################################################################################################################
    # TRAIN TEST SPLIT
    ################################################################################################################

    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=0)  #stratify = y
    logger.info("Train Test Split complete")

    ################################################################################################################

    ################################################################################################################
    # TRAINING
    ################################################################################################################

    ################################################################################################################
    # SAMPLING
    ################################################################################################################

    if args.RandomSamp:
        rus = RandomUnderSampler(random_state=0)
        X_train, y_train = rus.fit_resample(X_train, y_train)
        logger.info("Applied Random Under-Sampling")
    else:
        logger.info("No Random Under-Sampling")

    # Convert to plain numpy arrays for the cascade and XGBoost stages.
    X_train = np.array(X_train)
    X_test = np.array(X_test)
    y_train = np.array(y_train)
    y_test = np.array(y_test)

    ################################################################################################################
    # MODEL
    ################################################################################################################

    # CFXGB: the cascaded-forest + XGBoost model, built from the loaded JSON config.
    cfxgb = CFXGB(config, args)

    ################################################################################################################

    ################################################################################################################
    # CASCADED FOREST AS TRANSFORMER
    ################################################################################################################

    X_train_enc = cfxgb.get_encoded(X_train, y_train)
    X_test_enc = cfxgb.transform(X_test)

    # Final transformation
    X_train_enc, X_test_enc = cfxgb.finalTransform(X_train, X_train_enc,
                                                   X_test, X_test_enc)
    # Optional debug dump of the encoded features:
    #    X_train_enc = pd.DataFrame(X_train_enc)
    #    X_train_enc.to_csv("X_train_enc.csv")
    #    X_test_enc = pd.DataFrame(X_test_enc)
    #    X_test_enc.to_csv("X_test_enc.csv")
    logger.info("X_train_enc.shape={}, X_test_enc.shape={}".format(
        X_train_enc.shape, X_test_enc.shape))

    ################################################################################################################
    # XGBOOST
    ################################################################################################################

    y_pred = cfxgb.classify(X_train_enc, y_train, X_test_enc, y_test)

    logger.info("Confusion Matrix - \n{}".format(
        confusion_matrix(y_test, y_pred)))
    logger.info("\nClassification Report - \n{}".format(
        classification_report(y_test, y_pred)))
    logger.info("Accuracy - {}\n".format(accuracy_score(y_test, y_pred)))
    fpr, tpr, thresholds = metrics.roc_curve(y_test, y_pred)
    auc = metrics.auc(fpr, tpr)
    logger.info("AUC - {}".format(auc))
    logger.info("Time - {}".format(time.time() - t))
    logger.info("Arguments used in this run : {}".format(str(sys.argv)))

    logging.shutdown()
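

# `main` references `args` and a module-level start time `t` that are not shown
# in this snippet. A hypothetical entry point, with flag names inferred from the
# `args.*` accesses above; the defaults here are assumptions, not from the source.
if __name__ == "__main__":
    import argparse
    parser = argparse.ArgumentParser()
    parser.add_argument("--Dataset")
    parser.add_argument("--parameters")
    parser.add_argument("--ParentCols", type=int, default=0)
    parser.add_argument("--ignore", action="store_true")
    parser.add_argument("--sample", type=int, default=0)
    parser.add_argument("--featureSelect", action="store_true")
    parser.add_argument("--RandomSamp", action="store_true")
    t = time.time()  # start time used in main's final timing log
    main(parser.parse_args())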
Example #4
import numpy as np
from scipy.sparse import issparse

from cfxgb.lib.utils.log_utils import get_logger

LOGGER = get_logger('cfxgb')


def load_model_config(model_path, log_name=None):
    import json
    from .utils.config_utils import load_json
    config = load_json(model_path)
    if log_name is not None:
        logger = get_logger(log_name)
        logger.info(log_name)
        logger.info("\n" + json.dumps(
            config, sort_keys=True, indent=4, separators=(',', ':')))
    return config


def concat_datas(datas):
    # Pass single arrays through unchanged; only lists are concatenated.
    if not isinstance(datas, list):
        return datas
    # Flatten each array to 2-D (n_samples, -1), then join along the feature axis.
    for i, data in enumerate(datas):
        datas[i] = data.reshape((data.shape[0], -1))
    return np.concatenate(datas, axis=1)
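

# Quick illustrative check (not from the source): two (4, 2, 3) arrays each
# flatten to (4, 6), so the result has shape (4, 12).
_a, _b = np.ones((4, 2, 3)), np.zeros((4, 2, 3))
assert concat_datas([_a, _b]).shape == (4, 12)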


def data_norm(X_train, X_test):
    # The original body ends after computing the statistics; a standard
    # completion is to z-score both splits with the training mean/std.
    X_mean = np.mean(X_train, axis=0)
    X_std = np.std(X_train, axis=0) + 1e-8  # guard against zero variance
    return (X_train - X_mean) / X_std, (X_test - X_mean) / X_std
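

# Illustrative check of the completion above (not from the source): the
# normalized training split has roughly zero mean per feature.
_Xtr, _Xte = data_norm(np.random.rand(100, 3), np.random.rand(10, 3))
assert np.allclose(_Xtr.mean(axis=0), 0.0, atol=1e-6)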
Example #5
import os, os.path as osp
import numpy as np
from sklearn.model_selection import KFold, StratifiedKFold
from xgboost import XGBClassifier

from cfxgb.lib.utils.log_utils import get_logger
from cfxgb.lib.utils.cache_utils import name2path

LOGGER = get_logger("gcforest.estimators.kfold_wrapper")


class KFoldWrapper(object):
    """
    K-Fold Wrapper
    """
    def __init__(self,
                 name,
                 n_folds,
                 est_class,
                 est_args,
                 args,
                 random_state=None):
        """
        Parameters
        ----------
        n_folds (int):
            Number of folds.
            If n_folds=1, no K-Fold cross-validation is performed.
        est_class (class):
            Class of the estimator.
        args: