Example #1
def train():
    try:
        print("starting training...")
        hyperparameters = load_json_object(hyperparameters_file_path)
        print("\nHyperparameters configuration:")
        print_json_object(hyperparameters)

        input_data_config = load_json_object(inputdataconfig_file_path)
        print("\nInput data configuration:")
        print_json_object(input_data_config)

        for key in input_data_config:
            print("\nList of files in {0} channel: ".format(key))
            channel_path = data_files_path + key + "/"
            print_files_in_path(channel_path)

        if os.path.exists(resource_file_path):
            resource_config = load_json_object(resource_file_path)
            print("\nResource configuration:")
            print_json_object(resource_config)

        # Take the set of files and read them all into a single pandas dataframe
        input_files = [
            os.path.join(data_files_path + "train/", file)
            for file in os.listdir(data_files_path + "train/")
        ]
        if len(input_files) == 0:
            raise ValueError(
                "There are no files in {}.\n"
                "This usually indicates that the channel ({}) was incorrectly specified,\n"
                "the data specification in S3 was incorrectly specified or the role specified\n"
                "does not have permission to access the data.".format(
                    data_files_path + "train/", "train"))

        concat_data = load_raw(input_files, [label_column, feature_column])

        print(concat_data.info())

        # Callable analyzer: set() maps each document to its set of
        # unique tokens (characters, if the column holds plain strings).
        preprocessor = CountVectorizer(analyzer=set)
        print("fitting...")
        preprocessor.fit(concat_data[feature_column])
        print("finished fitting...")

        # get_feature_names() was removed in scikit-learn 1.2;
        # get_feature_names_out() is its replacement (available since 1.0).
        feature_column_names = preprocessor.get_feature_names_out()
        print(feature_column_names)

        le = LabelEncoder()
        le.fit(concat_data[label_column])
        print("le classes: ", le.classes_)

        dump(preprocessor, os.path.join(model_artifacts_path, "model.joblib"))
        dump(le, os.path.join(model_artifacts_path, "label.joblib"))

        print("saved model!")
    except Exception as e:
        write_failure_file(failure_file_path, str(e))
        print(e, file=sys.stderr)
        sys.exit(1)
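
Example #1 (and the config-printing blocks in Example #4) relies on helpers and path constants that are not shown. A minimal sketch, assuming the standard SageMaker container layout under /opt/ml; load_raw's headerless-CSV format is an assumption:

import json
import os

import pandas as pd

# Standard SageMaker container paths assumed by the examples.
prefix = "/opt/ml/"
hyperparameters_file_path = prefix + "input/config/hyperparameters.json"
inputdataconfig_file_path = prefix + "input/config/inputdataconfig.json"
resource_file_path = prefix + "input/config/resourceconfig.json"
data_files_path = prefix + "input/data/"
model_artifacts_path = prefix + "model/"
failure_file_path = prefix + "output/failure"

# Environment variables SageMaker sets in the training container.
training_job_name_env = "TRAINING_JOB_NAME"
training_job_arn_env = "TRAINING_JOB_ARN"


def load_json_object(json_file_path):
    # Read a JSON config file into a dict.
    with open(json_file_path, "r") as f:
        return json.load(f)


def print_json_object(json_object):
    # Pretty-print a config dict.
    print(json.dumps(json_object, sort_keys=True, indent=4))


def print_files_in_path(path):
    # Recursively list every file under `path`.
    for root, _, files in os.walk(path):
        for name in files:
            print(os.path.join(root, name))


def write_failure_file(failure_file_path, failure_reason):
    # SageMaker reports the contents of /opt/ml/output/failure
    # as the job's FailureReason.
    with open(failure_file_path, "w") as f:
        f.write(failure_reason)


def load_raw(input_files, columns):
    # Read every channel file into one DataFrame; headerless CSV
    # input is an assumption here.
    frames = (pd.read_csv(f, header=None, names=columns) for f in input_files)
    return pd.concat(frames, ignore_index=True)
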
Example #2
def main(args):
    """
    SM_CHANNEL paths do not end with a trailing slash:
        SM_CHANNEL_TRAIN=/opt/ml/input/data/train
        SM_CHANNEL_VALIDATION=/opt/ml/input/data/validation

    Training job name:
        script-mode-container-xgb-2020-08-10-13-29-15-756

    """
    train_channel, validation_channel, model_dir = args.train, args.validation, args.model_dir

    print("\nList of files in train channel: ")
    print_files_in_path(train_channel)

    print("\nList of files in validation channel: ")
    print_files_in_path(validation_channel)
    use_cuda = torch.cuda.is_available()
    torch.manual_seed(args.seed)

    device = torch.device("cuda" if use_cuda else "cpu")
    print("Device:", device)
    kwargs = {"num_workers": 8, "pin_memory": True} if use_cuda else {}

    input_features = 5
    n_samples = 5000
    dataset = MyDataset(n_samples, input_features, 3)
    train_len = int(n_samples * 0.7)
    test_len = n_samples - train_len
    train_set, val_set = torch.utils.data.random_split(dataset,
                                                       [train_len, test_len])
    train_loader = DataLoader(train_set,
                              batch_size=args.batch_size,
                              shuffle=True,
                              **kwargs)
    test_loader = DataLoader(val_set,
                             batch_size=args.batch_size,
                             shuffle=True,
                             **kwargs)

    model = Net(input_features).to(device)
    # optimizer = optim.Adadelta(model.parameters(), lr=args.lr)
    optimizer = optim.Adam(model.parameters(), lr=args.lr)
    # scheduler = StepLR(optimizer, step_size=1, gamma=args.gamma)
    for epoch in range(1, args.epochs + 1):
        train(args, model, device, train_loader, optimizer, epoch)
        # scheduler.step()
        test(model, device, test_loader)

    if args.save_model:
        save_model(model, model_dir)
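
Example #2 references MyDataset, Net, train, test, save_model, and an args object, none of which are shown above. A minimal sketch consistent with how main() uses them; the layer sizes, loss, logging interval, and every argument default are assumptions:

import argparse
import os

import torch
import torch.nn.functional as F
from torch import nn
from torch.utils.data import Dataset


def parse_args():
    # Typical SageMaker script-mode argument wiring.
    parser = argparse.ArgumentParser()
    parser.add_argument("--batch-size", type=int, default=64)
    parser.add_argument("--epochs", type=int, default=5)
    parser.add_argument("--lr", type=float, default=1e-3)
    parser.add_argument("--seed", type=int, default=1)
    parser.add_argument("--save-model", action="store_true", default=True)
    parser.add_argument("--train", default=os.environ.get("SM_CHANNEL_TRAIN"))
    parser.add_argument("--validation",
                        default=os.environ.get("SM_CHANNEL_VALIDATION"))
    parser.add_argument("--model-dir", default=os.environ.get("SM_MODEL_DIR"))
    return parser.parse_args()


class MyDataset(Dataset):
    # Synthetic classification data: random features with integer
    # labels in [0, n_classes).
    def __init__(self, n_samples, input_features, n_classes):
        self.x = torch.randn(n_samples, input_features)
        self.y = torch.randint(0, n_classes, (n_samples,))

    def __len__(self):
        return len(self.x)

    def __getitem__(self, idx):
        return self.x[idx], self.y[idx]


class Net(nn.Module):
    # Small fully connected classifier; the hidden width and the
    # default of 3 classes are assumptions.
    def __init__(self, input_features, n_classes=3):
        super().__init__()
        self.fc1 = nn.Linear(input_features, 32)
        self.fc2 = nn.Linear(32, n_classes)

    def forward(self, x):
        return self.fc2(F.relu(self.fc1(x)))


def train(args, model, device, train_loader, optimizer, epoch):
    # One optimization pass over the training split using cross-entropy
    # on the raw logits; `args` mirrors the call in main() but is unused.
    model.train()
    for batch_idx, (data, target) in enumerate(train_loader):
        data, target = data.to(device), target.to(device)
        optimizer.zero_grad()
        loss = F.cross_entropy(model(data), target)
        loss.backward()
        optimizer.step()
        if batch_idx % 10 == 0:
            print("epoch {} batch {} loss {:.4f}".format(
                epoch, batch_idx, loss.item()))


def test(model, device, test_loader):
    # Average loss and accuracy on the held-out split.
    model.eval()
    correct, total, loss_sum = 0, 0, 0.0
    with torch.no_grad():
        for data, target in test_loader:
            data, target = data.to(device), target.to(device)
            output = model(data)
            loss_sum += F.cross_entropy(output, target,
                                        reduction="sum").item()
            correct += (output.argmax(dim=1) == target).sum().item()
            total += target.size(0)
    print("test loss {:.4f}, accuracy {}/{}".format(
        loss_sum / total, correct, total))


def save_model(model, model_dir):
    # SageMaker packages everything under SM_MODEL_DIR into model.tar.gz.
    torch.save(model.state_dict(), os.path.join(model_dir, "model.pth"))
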
Example #3
def train(hp1, hp2, hp3, train_channel, validation_channel):

    print("\nList of files in train channel: ")
    print_files_in_path(os.environ["SM_CHANNEL_TRAIN"])

    print("\nList of files in validation channel: ")
    print_files_in_path(os.environ["SM_CHANNEL_VALIDATION"])

    # Dummy net.
    net = None

    # Run training loop.
    epochs = 5
    for x in range(epochs):
        print("\nRunning epoch {0}...".format(x))

        time.sleep(30)

        print("Completed epoch {0}.".format(x))

    # At the end of the training loop, we have to save model artifacts.
    model_dir = os.environ["SM_MODEL_DIR"]
    save_model_artifacts(model_dir + "/", net)
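
Examples #3 and #4 both call save_model_artifacts, which is not shown. A minimal sketch; since net is a dummy None in both examples, a placeholder file (name assumed) stands in for real weights:

import os


def save_model_artifacts(model_artifacts_path, net):
    # Anything written under /opt/ml/model (SM_MODEL_DIR) is packaged
    # into model.tar.gz when the job ends. `net` is a dummy in these
    # examples, so a placeholder file stands in for real weights.
    if os.path.exists(model_artifacts_path):
        with open(os.path.join(model_artifacts_path, "model.dummy"), "w") as f:
            f.write("Dummy model.")
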
Example #4
def train():
    try:
        print("\nRunning training...")

        if os.path.exists(hyperparameters_file_path):
            hyperparameters = load_json_object(hyperparameters_file_path)
            print("\nHyperparameters configuration:")
            print_json_object(hyperparameters)

        if os.path.exists(inputdataconfig_file_path):
            input_data_config = load_json_object(inputdataconfig_file_path)
            print("\nInput data configuration:")
            print_json_object(input_data_config)

            for key in input_data_config:
                print("\nList of files in {0} channel: ".format(key))
                channel_path = data_files_path + key + "/"
                print_files_in_path(channel_path)

        if os.path.exists(resource_file_path):
            resource_config = load_json_object(resource_file_path)
            print("\nResource configuration:")
            print_json_object(resource_config)

        if training_job_name_env in os.environ:
            print("\nTraining job name: ")
            print(os.environ[training_job_name_env])

        if training_job_arn_env in os.environ:
            print("\nTraining job ARN: ")
            print(os.environ[training_job_arn_env])

        # This object is used to handle SIGTERM and SIGINT signals
        # (SIGKILL cannot be caught).
        signal_handler = ExitSignalHandler()

        # Dummy net.
        net = None

        # Run training loop.
        epochs = 1
        for x in range(epochs):
            print("\nRunning epoch {0}...".format(x))

            time.sleep(10)

            if signal_handler.exit_now:
                print(
                    "Received SIGTERM/SIGINT. Saving training state and exiting."
                )
                # Save state here.
                save_model_artifacts(model_artifacts_path, net)
                sys.exit(0)

            print("Completed epoch {0}.".format(x))

        # At the end of the training loop, we have to save model artifacts.
        save_model_artifacts(model_artifacts_path, net)

        print("\nTraining completed!")
    except Exception as e:
        write_failure_file(failure_file_path, str(e))
        print(e, file=sys.stderr)
        sys.exit(1)
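
Example #4's ExitSignalHandler is not shown. A minimal sketch that traps SIGTERM and SIGINT so the training loop can checkpoint and exit cleanly:

import signal


class ExitSignalHandler:
    # Sets exit_now when SIGTERM or SIGINT arrives. SIGKILL cannot be
    # trapped, which is why only these two signals are handled.
    def __init__(self):
        self.exit_now = False
        signal.signal(signal.SIGINT, self.exit_gracefully)
        signal.signal(signal.SIGTERM, self.exit_gracefully)

    def exit_gracefully(self, signum, frame):
        self.exit_now = True
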
Example #5
def train(train_channel, validation_channel, model_dir, epochs):
    """
    SM_CHANNEL paths do not end with a trailing slash:
        SM_CHANNEL_TRAIN=/opt/ml/input/data/train
        SM_CHANNEL_VALIDATION=/opt/ml/input/data/validation

    Training job name:
        script-mode-container-xgb-2020-08-10-13-29-15-756

    """
    print("\nList of files in train channel: ")
    print_files_in_path(train_channel)

    print("\nList of files in validation channel: ")
    print_files_in_path(validation_channel)

    X_train, X_test, y_train, y_test = get_data(train_channel,
                                                validation_channel)

    n_jobs = cpu_count() - 1

    parameters = {
        "min_child_weight": 5,
        "max_depth": 5,
        "learning_rate": 0.0001,
        "objective": "multi:softprob",
        "n_estimators": epochs,
    }

    model = XGBClassifier(
        base_score=0.5,
        booster="gbtree",
        colsample_bylevel=1,
        colsample_bynode=1,
        colsample_bytree=1,
        gamma=0,
        max_delta_step=0,
        missing=None,
        n_jobs=n_jobs,  # From version 1.1.1, can't use -1 for all cores
        nthread=None,
        random_state=0,
        reg_alpha=0,
        reg_lambda=1,
        # scale_pos_weight=1,
        subsample=1,
        verbosity=1,
        **parameters,
    )
    print(model)
    fit_params = {
        # "sample_weight": df_train_w["sample_weight"],
        # In xgboost >= 2.0, early_stopping_rounds and eval_metric are
        # set on the estimator instead of being passed to fit().
        "early_stopping_rounds": 10,
        "eval_metric": "mlogloss",
        "eval_set": [(X_train, y_train), (X_test, y_test)],
    }
    model.fit(X_train, y_train, **fit_params)
    # model.fit(X_train, y_train)

    # Evaluation
    preds = model.predict(X_test)
    print(classification_report(y_test, preds))
    print(confusion_matrix(y_test, preds, labels=[0, 1, 2]))
    print(precision_score(y_test, preds, average="weighted"))

    save_model(model, model_dir)
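
Example #5 depends on get_data and save_model, neither of which is shown. A plausible sketch, assuming each channel holds headerless CSVs with the label in the first column (both the file layout and the artifact name are assumptions):

import glob
import os

import joblib
import pandas as pd


def get_data(train_channel, validation_channel):
    # Assumes headerless CSVs with the label in column 0, the layout
    # used by SageMaker's built-in XGBoost; adjust to match your data.
    def read_channel(path):
        files = sorted(glob.glob(os.path.join(path, "*")))
        df = pd.concat((pd.read_csv(f, header=None) for f in files),
                       ignore_index=True)
        return df.iloc[:, 1:], df.iloc[:, 0]

    X_train, y_train = read_channel(train_channel)
    X_test, y_test = read_channel(validation_channel)
    return X_train, X_test, y_train, y_test


def save_model(model, model_dir):
    # Persist the fitted classifier under SM_MODEL_DIR; SageMaker
    # packages the directory into model.tar.gz.
    joblib.dump(model, os.path.join(model_dir, "model.joblib"))
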