Exemplo n.º 1
0
def get_validated_dmatrices(train_path,
                            validate_path,
                            content_type,
                            csv_weights=0):
    """Get training and validation Data Matrices for XGBoost training.

    Check size and format of both training and validation data channels, and return parsed
    Data Matrices. A channel whose path is falsy, or whose reported size is not positive,
    is neither validated nor parsed and yields ``None``.

    :param train_path: Path of the training channel data; falsy to skip the channel
    :param validate_path: Path of the validation channel data; falsy to skip the channel
    :param content_type: Content type of data. Supports 'libsvm' or 'csv'
    :param csv_weights: 1 if instance weights are in the second column of csv data files; otherwise, 0.
        Forwarded to the reads of both the training and the validation channel.
    :return: Tuple ``(train_dmatrix, val_dmatrix)``; each entry is the result of ``get_dmatrix``
        for that channel, or None when the channel was skipped
    """
    train_files_size = get_size(train_path) if train_path else 0
    val_files_size = get_size(validate_path) if validate_path else 0

    # Lazy %-style arguments: the message is only rendered when DEBUG is enabled.
    logging.debug("File size need to be processed in the node: %smb.",
                  round((train_files_size + val_files_size) / (1024 * 1024), 2))

    has_train = train_files_size > 0
    has_val = val_files_size > 0

    # Validate both channels before parsing either one, so a malformed validation
    # channel is reported before any time is spent loading the training data.
    if has_train:
        validate_data_file_path(train_path, content_type)
    if has_val:
        validate_data_file_path(validate_path, content_type)

    train_dmatrix = get_dmatrix(
        train_path, content_type,
        csv_weights=csv_weights) if has_train else None
    # The validation channel must be parsed with the same csv_weights setting as the
    # training channel; otherwise a weight column would be read as an extra feature.
    val_dmatrix = get_dmatrix(
        validate_path, content_type,
        csv_weights=csv_weights) if has_val else None

    return train_dmatrix, val_dmatrix
Exemplo n.º 2
0
def get_validated_dmatrices(train_path,
                            validate_path,
                            content_type,
                            csv_weights=0,
                            is_pipe=False,
                            combine_train_val=False):
    """Build the training, validation and (optionally) combined Data Matrices for XGBoost.

    Each channel is sized first; outside of pipe mode the total size is logged and every
    non-empty channel is format-checked before anything is parsed. A channel with a falsy
    path or a non-positive size yields ``None``.

    :param train_path: Path of the training channel data
    :param validate_path: Path of the validation channel data
    :param content_type: Content type of data. Supports 'libsvm' or 'csv'
    :param csv_weights: 1 if instance weights are in the second column of csv data files; otherwise, 0
    :param is_pipe: Boolean to indicate if data is being read in pipe mode
    :param combine_train_val: Boolean to indicate if a DMatrix combining train and validation
        data should be returned as the third element
    :return: Tuple ``(train_dmatrix, val_dmatrix, train_val_dmatrix)``. The third element is the
        combined matrix when ``combine_train_val`` is set and both channels were parsed;
        otherwise it is the training matrix itself.
    """
    train_files_size = get_size(train_path, is_pipe) if train_path else 0
    val_files_size = get_size(validate_path, is_pipe) if validate_path else 0

    if not is_pipe:
        total_mb = round((train_files_size + val_files_size) / (1024 * 1024), 2)
        logging.debug(
            "File size need to be processed in the node: {}mb.".format(total_mb))

        # Format-check every non-empty channel before parsing any of them.
        if train_files_size > 0:
            validate_data_file_path(train_path, content_type)
        if val_files_size > 0:
            validate_data_file_path(validate_path, content_type)

    # Options shared by every get_dmatrix call below.
    read_options = {'csv_weights': csv_weights, 'is_pipe': is_pipe}

    train_dmatrix = None
    if train_files_size > 0:
        train_dmatrix = get_dmatrix(train_path, content_type, **read_options)

    val_dmatrix = None
    if val_files_size > 0:
        val_dmatrix = get_dmatrix(validate_path, content_type, **read_options)

    if combine_train_val and not (train_dmatrix is None or val_dmatrix is None):
        logging.info("Read both train and validation data into one DMatrix")
        train_val_dmatrix = get_dmatrix([train_path, validate_path],
                                        content_type,
                                        **read_options)
    else:
        # Without a combined read, the third slot falls back to the training matrix.
        train_val_dmatrix = train_dmatrix

    return train_dmatrix, val_dmatrix, train_val_dmatrix
Exemplo n.º 3
0
def get_validated_dmatrices(train_path,
                            validate_path,
                            content_type,
                            csv_weights=0,
                            is_pipe=False,
                            subsample_ratio_on_read=None):
    """Get training and validation Data Matrices for XGBoost training.

    Check size and format of both training and validation data channels, and return parsed
    Data Matrices. A channel whose path is falsy, or whose reported size is not positive,
    is not parsed and yields ``None``. Size logging and format validation are skipped in
    pipe mode.

    :param train_path: Path of the training channel data; falsy to skip the channel
    :param validate_path: Path of the validation channel data; falsy to skip the channel
    :param content_type: Content type of data. Supports 'libsvm', 'csv', 'parquet', and 'recordio-protobuf'.
    :param csv_weights: 1 if instance weights are in the second column of csv data files; otherwise, 0.
        Forwarded to the reads of both the training and the validation channel.
    :param is_pipe: Boolean to indicate if data is being read in pipe mode
    :param subsample_ratio_on_read: None or a value in (0, 1) to indicate how much of the dataset should
            be read into memory. Only forwarded to the training read.
    :return: Tuple ``(train_dmatrix, val_dmatrix)``; each entry is the result of ``get_dmatrix``
        for that channel, or None when the channel was skipped
    """
    train_files_size = get_size(train_path, is_pipe) if train_path else 0
    val_files_size = get_size(validate_path, is_pipe) if validate_path else 0

    has_train = train_files_size > 0
    has_val = val_files_size > 0

    if not is_pipe:
        # Lazy %-style arguments: the message is only rendered when DEBUG is enabled.
        logging.debug("File size need to be processed in the node: %smb.",
                      round((train_files_size + val_files_size) / (1024 * 1024), 2))

        # Validate both channels before parsing either one.
        if has_train:
            validate_data_file_path(train_path, content_type)
        if has_val:
            validate_data_file_path(validate_path, content_type)

    train_dmatrix = get_dmatrix(train_path,
                                content_type,
                                csv_weights=csv_weights,
                                is_pipe=is_pipe,
                                subsample_ratio_on_read=subsample_ratio_on_read) \
        if has_train else None
    # The validation channel must be parsed with the same csv_weights setting as the
    # training channel; otherwise a weight column would be read as an extra feature.
    # subsample_ratio_on_read is intentionally not forwarded here.
    val_dmatrix = get_dmatrix(validate_path,
                              content_type,
                              csv_weights=csv_weights,
                              is_pipe=is_pipe) \
        if has_val else None

    return train_dmatrix, val_dmatrix
Exemplo n.º 4
0
    def test_get_dmatrix(self):
        # Resolve <two levels above this file>/resources/abalone/data.
        project_root = Path(os.path.abspath(__file__)).parent.parent
        abalone_dir = os.path.join(str(project_root), 'resources', 'abalone',
                                   'data')

        # Hand both channel directories to get_dmatrix in a single list.
        channel_dirs = []
        for channel in ['train', 'validation']:
            channel_dirs.append(os.path.join(abalone_dir, channel))

        # NOTE(review): the trailing positional 0 / False are presumably
        # csv_weights and is_pipe -- confirm against data_utils.get_dmatrix.
        combined = data_utils.get_dmatrix(channel_dirs, 'libsvm', 0, False)

        # Expected shape of the matrix built from both channels.
        self.assertEqual(9, combined.num_col())
        self.assertEqual(3548, combined.num_row())
Exemplo n.º 5
0
                        default=os.environ.get('SM_HOSTS'))
    parser.add_argument('--sm_current_host',
                        type=str,
                        default=os.environ.get('SM_CURRENT_HOST'))

    parser.add_argument('--GetFIFlg', type=str, default='N')
    parser.add_argument('--GetTestScoreFlg', type=str, default='N')
    parser.add_argument('--GetTestPredFlg', type=str, default='N')

    args, _ = parser.parse_known_args()

    # Get SageMaker host information from runtime environment variables
    sm_hosts = json.loads(args.sm_hosts)
    sm_current_host = args.sm_current_host

    dtrain = get_dmatrix(args.train, 'csv')

    dtest = get_dmatrix(args.test, 'csv')

    if not (dtest):
        if ((args.GetTestScoreFlg == 'Y') | (args.GetTestPredFlg == 'Y')):
            raise Exception(
                'Please provide test data in a test channel for prediction and scores or set GetTestScoreFlg and GetTestPredFlg to N'
            )

    train_hp = {
        'max_depth': args.max_depth,
        'eta': args.eta,
        'objective': args.objective,
        'booster': args.booster,
        'seed': args.seed,
    # Sagemaker specific arguments. Defaults are set in the environment variables.
    parser.add_argument('--output_data_dir', type=str, default=os.environ.get('SM_OUTPUT_DATA_DIR'))
    parser.add_argument('--model_dir', type=str, default=os.environ.get('SM_MODEL_DIR'))
    parser.add_argument('--train', type=str, default=os.environ.get('SM_CHANNEL_TRAIN'))
    parser.add_argument('--validation', type=str, default=os.environ.get('SM_CHANNEL_VALIDATION'))
    parser.add_argument('--sm_hosts', type=str, default=os.environ.get('SM_HOSTS'))
    parser.add_argument('--sm_current_host', type=str, default=os.environ.get('SM_CURRENT_HOST'))

    args, _ = parser.parse_known_args()

    # Get SageMaker host information from runtime environment variables
    sm_hosts = json.loads(args.sm_hosts)
    sm_current_host = args.sm_current_host

    dtrain = get_dmatrix(args.train, 'libsvm')
    dval = get_dmatrix(args.validation, 'libsvm')
    watchlist = [(dtrain, 'train'), (dval, 'validation')] if dval is not None else [(dtrain, 'train')]

    train_hp = {
        'max_depth': args.max_depth,
        'eta': args.eta,
        'gamma': args.gamma,
        'min_child_weight': args.min_child_weight,
        'subsample': args.subsample,
        'objective': args.objective
        }

    xgb_train_args = dict(
        params=train_hp,
        dtrain=dtrain,
Exemplo n.º 7
0
    # Sagemaker specific arguments. Defaults are set in the environment variables.
    parser.add_argument("--model_dir",
                        type=str,
                        default=os.environ.get("SM_MODEL_DIR",
                                               "/opt/ml/model"))
    parser.add_argument(
        "--train",
        type=str,
        default=os.environ.get("SM_CHANNEL_TRAIN",
                               "/opt/ml/input/data/abalone"),
    )

    args, _ = parser.parse_known_args()

    dtrain = get_dmatrix(args.train, "libsvm")

    params = {
        "max_depth": 5,
        "eta": 0.2,
        "gamma": 4,
        "min_child_weight": 6,
        "subsample": 0.7,
        "verbosity": 2,
        "objective": "reg:squarederror",
        "tree_method": "auto",
        "predictor": "auto",
    }

    booster = xgb.train(params=params, dtrain=dtrain, num_boost_round=50)
    booster.save_model(args.model_dir + "/" + model_filename)
Exemplo n.º 8
0
    
    parser.add_argument('--output_data_dir', type=str, default=os.environ.get('SM_OUTPUT_DATA_DIR'))
    parser.add_argument('--model_dir', type=str, default=os.environ.get('SM_MODEL_DIR'))
    parser.add_argument('--train', type=str, default=os.environ.get('SM_CHANNEL_TRAIN'))
    parser.add_argument('--validation', type=str, default=os.environ.get('SM_CHANNEL_VALIDATION'))
    parser.add_argument('--test', type=str, default=os.environ.get('SM_CHANNEL_TEST'))
    parser.add_argument('--sm_hosts', type=str, default=os.environ.get('SM_HOSTS'))
    parser.add_argument('--sm_current_host', type=str, default=os.environ.get('SM_CURRENT_HOST'))

    args, _ = parser.parse_known_args()

    # Get SageMaker host information from runtime environment variables
    sm_hosts = json.loads(args.sm_hosts)
    sm_current_host = args.sm_current_host

    dtrain = get_dmatrix(args.train, 'csv')
    dval = get_dmatrix(args.validation, 'csv')
    watchlist = [(dtrain, 'train'), (dval, 'validation')] if dval is not None else [(dtrain, 'train')]

      
    dtest = get_dmatrix(args.test, 'csv')
    if not(dtest):
        if ((args.GetTestScoreFlg=='Y') | (args.GetTestPredFlg=='Y')):
            raise Exception('Please provide test data in a test channel for prediction and scores or set GetTestScoreFlg and GetTestPredFlg to N')
            
    train_hp = {
        'max_depth': args.max_depth,
        'eta': args.eta,
        'objective': args.objective,
        'booster': args.booster,
        'seed': args.seed,
Exemplo n.º 9
0
                        type=str,
                        default=os.environ.get("SM_CHANNEL_VALIDATION"))
    parser.add_argument("--sm_hosts",
                        type=str,
                        default=os.environ.get("SM_HOSTS"))
    parser.add_argument("--sm_current_host",
                        type=str,
                        default=os.environ.get("SM_CURRENT_HOST"))

    args, _ = parser.parse_known_args()

    # Get SageMaker host information from runtime environment variables
    sm_hosts = json.loads(args.sm_hosts)
    sm_current_host = args.sm_current_host

    dtrain = get_dmatrix(args.train, "libsvm")
    dval = get_dmatrix(args.validation, "libsvm")
    watchlist = ([(dtrain, "train"),
                  (dval, "validation")] if dval is not None else [(dtrain,
                                                                   "train")])

    train_hp = {
        "max_depth": args.max_depth,
        "eta": args.eta,
        "gamma": args.gamma,
        "min_child_weight": args.min_child_weight,
        "subsample": args.subsample,
        "verbosity": args.verbosity,
        "objective": args.objective,
        "tree_method": args.tree_method,
        "predictor": args.predictor,
Exemplo n.º 10
0
    #     parser.add_argument('--validation', type=str,
    #                         default=os.environ['SM_CHANNEL_VALIDATION'])
    parser.add_argument('--sm_hosts', type=str, default=os.environ['SM_HOSTS'])
    parser.add_argument('--sm_current_host',
                        type=str,
                        default=os.environ['SM_CURRENT_HOST'])

    args, _ = parser.parse_known_args()

    # Get SageMaker host information from runtime environment variables
    sm_hosts = json.loads(os.environ['SM_HOSTS'])
    sm_current_host = args.sm_current_host

    print("hello, i get data")

    dtrain = get_dmatrix(args.train, 'csv')
    #     dval = get_dmatrix(args.validation, 'csv')
    #     watchlist = [(dtrain, 'train'), (dval, 'validation')
    #                  ] if dval is not None else [(dtrain, 'train')]
    watchlist = [(dtrain, 'train')]

    train_hp = {
        'max_depth': args.max_depth,
        'eta': args.eta,
        'gamma': args.gamma,
        'min_child_weight': args.min_child_weight,
        'subsample': args.subsample,
        'verbose': args.verbose,
        'objective': args.objective,
        'eval_metric': args.eval_metric
    }