Example #1
 def on_epoch_begin(self, epoch, logs=None):
     log_metric("MO_learning_rate", eval(self.model.optimizer.lr))
Example #2
    enc = OneHotEncoder()
    Y = enc.fit_transform(y[:, np.newaxis]).toarray()

    # Scale data to have mean 0 and variance 1
    # which is important for convergence of the neural network
    scaler = StandardScaler()
    X_scaled = scaler.fit_transform(X)

    x_train, x_test, y_train, y_test = train_test_split(X_scaled,
                                                        Y,
                                                        test_size=0.2,
                                                        random_state=42)
    with mlflow.start_run():
        model = tf.keras.Sequential([
            tf.keras.layers.Dense(10, activation=tf.nn.relu,
                                  input_shape=(4, )),
            tf.keras.layers.Dense(10, activation=tf.nn.relu),
            tf.keras.layers.Dense(3, activation="softmax")
        ])
        model.compile(optimizer='adam',
                      loss=tf.keras.losses.CategoricalCrossentropy(),
                      metrics=['accuracy'])
        model.fit(x_train, y_train, epochs=1000)
        test_loss, test_acc = model.evaluate(x_test, y_test)

        log_metric('acc', test_acc)
        log_model(keras_model=model,
                  registered_model_name='Keras-Iris-Model',
                  artifact_path='model_artifact',
                  conda_env=conda_env)

if __name__ == '__main__':
    with open('test.file', 'wb') as f:
        test = range(10)
        joblib.dump(test, f)

    local_registry = "sqlite:///mlruns.db"
    print(f"Running local model registry={local_registry}")
    mlflow.set_tracking_uri(local_registry)

    clt = MlflowClient()
    exp_a_uri = "file:///tmp/exp_A"
    exp_a_id = get_exp_id("experiment_A", local_registry, local_registry,
                          exp_a_uri)
    exp_b_uri = "file:///tmp/exp_B"
    exp_b_id = get_exp_id("experiment_B", local_registry, local_registry,
                          exp_b_uri)

    for i in range(3):
        with mlflow.start_run(experiment_id=exp_a_id):
            mlflow.log_metric("MEAN SQUARE ERROR", 0.25 * random())
            mlflow.log_artifact('test.file')
            print(f"artifact_uri={mlflow.get_artifact_uri()}")
    print("-" * 75)
    for i in range(3):
        with mlflow.start_run(experiment_id=exp_b_id):
            mlflow.log_metric("MEAN SQUARE ERROR", 0.25 * random())
            mlflow.log_artifact('test.file')
            print(f"artifact_uri={mlflow.get_artifact_uri()}")
Example #4
                'parameters': str(rand()),
                'in': str(rand()),
                'this': str(rand()),
                'experiement': str(rand()),
                'run': str(rand()),
                'because': str(rand()),
                'we': str(rand()),
                'need': str(rand()),
                'to': str(rand()),
                'check': str(rand()),
                'how': str(rand()),
                'it': str(rand()),
                'handles': str(rand()),
            }
            log_params(parameters)
            mlflow.log_metric('test_metric', 1)

        with mlflow.start_run(run_name='child_metrics.py', nested=True):
            metrics = {
                'lot': [rand()],
                'of': [rand()],
                'parameters': [rand()],
                'in': [rand()],
                'this': [rand()],
                'experiement': [rand()],
                'run': [rand()],
                'because': [rand()],
                'we': [rand()],
                'need': [rand()],
                'to': [rand()],
                'check': [rand()],
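The (truncated) example above logs a long list of dummy parameters and metrics one key at a time inside nested runs; MLflow also offers batch APIs for the same purpose. A short sketch, assuming plain scalar values (the run names and dictionary contents are placeholders):

import mlflow
from numpy.random import rand

with mlflow.start_run(run_name='parent'):
    with mlflow.start_run(run_name='child_params', nested=True):
        # log_params takes a flat dict of parameter name -> value
        mlflow.log_params({'lot': rand(), 'of': rand(), 'parameters': rand()})
    with mlflow.start_run(run_name='child_metrics', nested=True):
        # log_metrics takes a flat dict of metric name -> float
        mlflow.log_metrics({'lot': float(rand()), 'of': float(rand())})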
Example #5
def train(in_alpha, in_l1_ratio, masterid):
    import os
    import warnings
    import sys

    import pandas as pd
    import numpy as np
    from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
    from sklearn.model_selection import train_test_split
    from sklearn.linear_model import ElasticNet

    import mlflow
    import mlflow.sklearn

    import logging
    ""
    logging.basicConfig(level=logging.WARN)
    logger = logging.getLogger(__name__)

    def eval_metrics(actual, pred):
        rmse = np.sqrt(mean_squared_error(actual, pred))
        mae = mean_absolute_error(actual, pred)
        r2 = r2_score(actual, pred)
        return rmse, mae, r2

    warnings.filterwarnings("ignore")
    np.random.seed(40)
    #print("xxxxxxxxxxxxxxxxxxxxx")
    # Read data from hive table prepared before
    data = {}
    try:
        print(masterid)
        data = query_hive_data(masterid)[[
            "sequence", "x_basic_hour", "x_basic_horizon", "i_set", "ec_ws",
            "ec_wd", "ec_tmp", "ec_press", "ec_rho", "ec_dist", "gfs_ws",
            "gfs_wd", "gfs_tmp", "gfs_press", "gfs_rho", "gfs_dist", "speed",
            "power"
        ]]
        data = data.fillna('1')
        #print(data)
    except Exception as e:
        logger.exception("Get Hive Data Error: %s", e)

    print(data)
    # Split the data into training and test sets. (0.75, 0.25) split.
    train, test = train_test_split(data)

    # The predicted column is "power"
    train_x = train.drop(["power", "speed"], axis=1)
    test_x = test.drop(["power", "speed"], axis=1)
    train_y = train[["power"]]
    test_y = test[["power"]]

    # Set default values if no alpha is provided
    if in_alpha is None:
        alpha = 0.5
    else:
        alpha = float(in_alpha)

    # Set default values if no l1_ratio is provided
    if in_l1_ratio is None:
        l1_ratio = 0.5
    else:
        l1_ratio = float(in_l1_ratio)

    # Useful for multiple runs (only doing one run in this sample notebook)
    with mlflow.start_run():
        # Execute ElasticNet
        lr = ElasticNet(alpha=alpha, l1_ratio=l1_ratio, random_state=42)
        lr.fit(train_x, train_y)

        #print(test_x)
        # Evaluate Metrics
        predicted_powers = lr.predict(test_x)
        (rmse, mae, r2) = eval_metrics(test_y, predicted_powers)

        # Print out metrics
        print("Elasticnet model (alpha=%f, l1_ratio=%f):" % (alpha, l1_ratio))
        print("  RMSE: %s" % rmse)
        print("  MAE: %s" % mae)
        print("  R2: %s" % r2)

        # Log parameter, metrics, and model to MLflow
        mlflow.log_param("alpha", alpha)
        mlflow.log_param("l1_ratio", l1_ratio)
        mlflow.log_metric("rmse", rmse)
        mlflow.log_metric("r2", r2)
        mlflow.log_metric("mae", mae)

        mlflow.sklearn.log_model(lr, "model")
Example #6
def eval_Slim(params, cfg, train_mat, eval_mat, experiment):
    # This function is what Hyperopt is going to optimize (minimize 'loss' value)
    print(experiment)
    with mlflow.start_run(experiment_id=experiment):

        # Log the config
        utils.config_helpers.log_config(dict(cfg.model))

        n_users, n_items = train_mat.shape
        np.random.seed(seed=cfg.model.seed)

        # Log relevant parameters for this run.
        mlflow.log_param("alpha", params['alpha'])
        mlflow.log_param("l1_ratio", params['l1_ratio'])
        mlflow.log_param("max_iter", params['max_iter'])
        mlflow.log_param("tol", params['tol'])

        # Log this run
        log.info(
            f"Testing  alpha: {params['alpha']},  l1_ratio: {params['l1_ratio']}, max_iter: {params['max_iter']} and tol: {params['tol']}"
        )

        start = time.time()
        # Create model
        slim = RecModel.Slim(num_items=n_items, num_users=n_users)

        # Train Model
        slim.train(X=train_mat.copy(),
                   alpha=params['alpha'],
                   l1_ratio=params['l1_ratio'],
                   max_iter=params['max_iter'],
                   tolerance=params['tol'],
                   cores=1,
                   verbose=int(cfg.model.verbose))

        # Log run-time
        mlflow.log_metric("Runtime", int(round(time.time() - start, 0)))

        # Evaluate model
        perf_all = slim.eval_topn(eval_mat.copy(),
                                  rand_sampled=int(cfg.model.rand_sampled),
                                  topn=np.array(cfg.model.top_n_performances,
                                                dtype=np.int32),
                                  random_state=int(cfg.model.seed),
                                  cores=int(cfg.model.cores))

        # Log the performance of the model
        for pos in range(len(cfg.model.top_n_performances)):
            mlflow.log_metric(
                f"recallAT{cfg.model.top_n_performances[pos]}_of_{cfg.model.rand_sampled}",
                perf_all[f"Recall@{cfg.model.top_n_performances[pos]}"])
        mlflow.log_metric('MAE_train', slim.eval_prec(train_mat.copy()))
        mlflow.log_metric('MAE_eval', slim.eval_prec(eval_mat.copy()))

        # We always choose the first top-n performance; hopefully it is also the smallest and the most relevant one for us.
        rel_topn_perf = perf_all[f"Recall@{cfg.model.top_n_performances[0]}"]

        log.info(
            f"Current recallAT{cfg.model.top_n_performances[0]}_of_{cfg.model.rand_sampled} performance was {rel_topn_perf}"
        )
        loss = -rel_topn_perf
        return {'loss': loss, 'status': hp.STATUS_OK, 'eval_time': time.time()}
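Hyperopt minimizes the 'loss' value that an objective like eval_Slim returns. A sketch of how such an objective is typically handed to fmin; the search-space bounds and the functools.partial binding are illustrative assumptions:

from functools import partial
from hyperopt import fmin, hp, tpe

space = {
    'alpha': hp.loguniform('alpha', -6, 0),
    'l1_ratio': hp.uniform('l1_ratio', 0.0, 1.0),
    'max_iter': hp.choice('max_iter', [50, 100, 200]),
    'tol': hp.loguniform('tol', -9, -4),
}

# best = fmin(fn=partial(eval_Slim, cfg=cfg, train_mat=train_mat,
#                        eval_mat=eval_mat, experiment=experiment),
#             space=space, algo=tpe.suggest, max_evals=50)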
Example #7
def log_metric(metric_name, value, step=None):
    print(f"[INFO] Logging metric: {metric_name}")
    mlflow.log_metric(metric_name, value, step=step)
Example #8
 def log_metric(self, metric: str, value: Union[int, float]):
     mlflow.log_metric(metric, value)
Example #9
    if batch % log_step == 0:
        mlflow.log_metric('training_loss', logs.get('loss'))
        mlflow.log_metric('training_acc', logs.get('acc'))


batch_callback = LambdaCallback(
    on_batch_end=lambda batch, logs: track_loss_acc(logs))
epoch_callback = LambdaCallback(
    on_epoch_end=lambda epoch, logs: print("Epoch Complete"))
complete_callback = LambdaCallback(on_train_end=lambda logs: print(logs))

# def main():
inputs = Input(shape=(image_pixels, ))
# hidden = Dense(int(image_pixels * 0.37), activation='relu')(inputs)
outputs = Dense(num_labels, activation='softmax')(inputs)
model = Model(inputs=inputs, outputs=outputs)

sgd = SGD(lr=learning_rate)
model.compile(optimizer=sgd, loss=loss_func, metrics=metrics)

fit_resp = model.fit(x=x_train, y=y_train, callbacks=[batch_callback])
mlflow.log_metric('training_loss', fit_resp.history.get('loss')[0])
mlflow.log_metric('training_acc', fit_resp.history.get('acc')[0])

loss, acc = model.evaluate(x_test, y_test)
mlflow.log_metric('loss', loss)
mlflow.log_metric('acc', acc)

# if __name__ == '__main__':
#     main()
Example #10
def track_loss_acc(logs, log_step=100):
    batch = logs.get('batch')
    if batch % log_step == 0:
        mlflow.log_metric('training_loss', logs.get('loss'))
        mlflow.log_metric('training_acc', logs.get('acc'))
Example #11
def valid_lr(race_results_df_processed_valid, model_lr, parameters):
    # mlflow
    print('FILE_DIR: ' + FILE_DIR)
    mlflow.set_tracking_uri(FILE_DIR + '/../../../logs/mlruns/')
    mlflow.set_experiment('forecast_keiba_valid')
    run_info = mlflow.start_run()
    mlflow.set_tag('model', 'lr')

    # Prepare the validation data
    race_results_df_processed_valid = race_results_df_processed_valid
    # Explanatory variables (features)
    X_valid = race_results_df_processed_valid.drop(['rank'], axis=1)
    # Target variable
    y_valid = race_results_df_processed_valid['rank']

    # Run inference
    y_valid_pred = model_lr.predict(X_valid)

    # Reshape results for aggregation
    valid_results_df = pd.DataFrame({'pred':y_valid_pred,'actual':y_valid})
    race_id_list = list(set(list(valid_results_df.index)))
    valid_results_list = valid_results_df.reset_index().values.tolist()
    # Shuffle
    random.shuffle(valid_results_list)

    # Aggregate (exacta)
    correct_count = 0
    for race_id in race_id_list:
        pred_cnt_by_race = 0
        cnt_by_race = 0
        for rank in [1]:
            for i in range(len(valid_results_list)):
                # Horses predicted to finish in position {rank} for this race id
                if valid_results_list[i][0] == race_id and valid_results_list[i][1] == rank:
                    pred_cnt_by_race += 1
                    if pred_cnt_by_race <= 1 and (valid_results_list[i][2] == 1):
                        cnt_by_race += 1
        if cnt_by_race == 1:
            correct_count += 1
    acc_exacta_1 = correct_count/100
    print('acc_exacta_1: ' + str(acc_exacta_1))

    # Aggregate (quinella)
    correct_count = 0
    for race_id in race_id_list:
        pred_cnt_by_race = 0
        cnt_by_race = 0
        for rank in [1, 2]:
            for i in range(len(valid_results_list)):
                # Horses predicted to finish in position {rank} for this race id
                if valid_results_list[i][0] == race_id and valid_results_list[i][1] == rank:
                    pred_cnt_by_race += 1
                    if pred_cnt_by_race <= 2 and (valid_results_list[i][2] == 1 or valid_results_list[i][2] == 2):
                        cnt_by_race += 1
        if cnt_by_race == 2:
            correct_count += 1
    acc_quinella_2 = correct_count/100
    print('acc_quinella_2: ' + str(acc_quinella_2))

    # Aggregate (trio)
    correct_count = 0
    for race_id in race_id_list:
        pred_cnt_by_race = 0
        cnt_by_race = 0
        for rank in [1, 2, 3]:
            for i in range(len(valid_results_list)):
                # Horses predicted to finish in position {rank} for this race id
                if valid_results_list[i][0] == race_id and valid_results_list[i][1] == rank:
                    pred_cnt_by_race += 1
                    if pred_cnt_by_race <= 3 and (valid_results_list[i][2] == 1 or valid_results_list[i][2] == 2 or valid_results_list[i][2] == 3):
                        cnt_by_race += 1
        if cnt_by_race == 3:
            correct_count += 1
    acc_trio_3 = correct_count/100
    print('acc_trio_3: ' + str(acc_trio_3))

    mlflow.log_metric("acc_exacta_1", acc_exacta_1)
    mlflow.log_metric("acc_quinella_2", acc_quinella_2)
    mlflow.log_metric("acc_trio_3", acc_trio_3)

    # Notification
    if parameters['is_notify']:
        run_result_dict = mlflow.get_run(run_info.info.run_id).to_dictionary()
        run_result_str = json.dumps(run_result_dict, indent=4)

        conf_paths = [FILE_DIR + "/../../../conf/base", FILE_DIR + "/../../../conf/local"]
        conf_loader = ConfigLoader(conf_paths)
        credentials = conf_loader.get("credentials*", "credentials*/**")
        token = credentials['dev_line']['access_token']

        url = "https://notify-api.line.me/api/notify"
        headers = {"Authorization": "Bearer " + token}
        payload = {"message": "model_lr" + run_result_str}
        requests.post(url, headers=headers, data=payload)

    mlflow.end_run()
    max_iterations = 150
    # log parameters
    mlflow.log_param("random_state", random_state)
    mlflow.log_param("solver", solver)
    mlflow.log_param("multi_class", multi_class)
    mlflow.log_param("max_iterations", max_iterations)
    mlflow.set_tag("framework", "sklearn")
    # create a model
    model = LogisticRegression(random_state=random_state,
                               solver=solver,
                               multi_class=multi_class,
                               max_iter=max_iterations)
    model = model.fit(X, y)
    # log metrics and persist the model via mlflow.<flavour>.log_model
    score = model.score(X, y)
    mlflow.log_metric("score", model.score(X, y))
    mlflow.sklearn.log_model(model, artifact_path)
    # obtain run_id for next steps
    run_id = mlflow.active_run().info.run_id
    print("run_id: {}".format(run_id))
    print("Score: {}".format(model.score(X, y)))

# COMMAND ----------

# MAGIC %md ### Load model

# COMMAND ----------

# MAGIC %md
# MAGIC In this step we read the model back from the persistent artifact storage created in the previous step. This demonstrates retrieving saved models by `run_id`, which we will need in the next step when deploying to AzureML.
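A minimal sketch of that load step (hypothetical cell, assuming run_id and artifact_path hold the values produced above):

# COMMAND ----------

import mlflow.sklearn

model_uri = 'runs:/{}/{}'.format(run_id, artifact_path)
loaded_model = mlflow.sklearn.load_model(model_uri)
print('Reloaded model score: {}'.format(loaded_model.score(X, y)))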
Example #13
        X = pd.DataFrame(boston.data, columns=boston.feature_names)
        y = pd.Series(boston.target)

        X_train, X_test, y_train, y_test = train_test_split(X,
                                                            y,
                                                            test_size=0.25)
        mlflow.set_tag(EVALUATION_SET_TAG, "25 percent Split")

        print("building regressor")
        regressor = xgb.XGBRegressor(n_estimators=100,
                                     reg_lambda=1,
                                     gamma=0,
                                     max_depth=3)

        print("fitting model")
        regressor.fit(X_train, y_train)

        print("predicting on eval set")
        y_pred = regressor.predict(X_test)

        # Log the mean squared error
        print("calculating MSE")
        mse = mean_squared_error(y_test, y_pred)
        mlflow.log_metric("mse", mse)

        # Save the model artifact
        print("Saving model")
        regressor.save_model(f"{BASE_DIR}/outputs-xgb/model.json")
        mlflow.log_artifacts(f"{BASE_DIR}/outputs-xgb")
    print("done.")
Example #14
def log_learning_curve(model_name: str, model: Any, fold=0):
    '''
    Function to log learning curve.
    For GBDT models, the schema of evals_result is uniform like below:
    evals_result = {
        'validation_0': {'logloss': ['0.604835', '0.531479']},
        'validation_1': {'logloss': ['0.41965', '0.17686']}
        }
    example key for output: fold0_valid0-logloss
    '''
    if model_name == 'XGBClassifier':
        evals_result = model.evals_result()
        for eval_idx in range(len(evals_result)):
            validation_X_raw = f'validation_{eval_idx}'  # this is the raw expression from model
            metricdict = evals_result[validation_X_raw]
            for metricname, scorelist in metricdict.items():  # this loops only once
                metricname = get_normalized_metricname(metricname)
                for i, score in enumerate(scorelist):
                    mlflow.log_metric(
                        f'fold{fold}_valid{eval_idx}-{metricname}_vsstep',
                        score, i)

    elif model_name == 'LGBMClassifier':
        evals_result = model.evals_result_
        for eval_idx in range(len(evals_result)):
            validation_X_raw = 'training' if eval_idx == 0 else f'valid_{eval_idx}'  # this is the raw expression from model
            metricdict = evals_result[validation_X_raw]
            for metricname, scorelist in metricdict.items():  # this loops only once
                metricname = get_normalized_metricname(metricname)
                for i, score in enumerate(scorelist):
                    mlflow.log_metric(
                        f'fold{fold}_valid{eval_idx}-{metricname}_vsstep',
                        score, i)

    elif model_name == 'CatBoostClassifier':
        evals_result = model.get_evals_result()
        # skip key 'learn', which contains the same values as validation_0
        for eval_idx in range(len(evals_result) - 1):
            validation_X_raw = f'validation_{eval_idx}'
            metricdict = evals_result[validation_X_raw]
            for metricname, scorelist in metricdict.items():  # this loops only once
                metricname = get_normalized_metricname(metricname)
                for i, score in enumerate(scorelist):
                    mlflow.log_metric(
                        f'fold{fold}_valid{eval_idx}-{metricname}_vsstep',
                        score, i)

    elif model_name == 'RandomForestClassifier2':
        evals_result = model.get_evals_result()
        for eval_idx in range(len(evals_result)):
            validation_X_raw = f'valid{eval_idx}'
            metricdict = evals_result[validation_X_raw]
            for metricname, scorelist in metricdict.items():  # this loops only once
                metricname = get_normalized_metricname(metricname)
                for i, score in enumerate(scorelist):
                    mlflow.log_metric(
                        f'fold{fold}_valid{eval_idx}-{metricname}_vsstep',
                        score, i)
    else:
        raise ValueError(f'Invalid model_name: {model_name}')
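A sketch of how log_learning_curve might be driven for the XGBClassifier branch, assuming a small synthetic dataset and that get_normalized_metricname is importable alongside the function above (the eval_metric placement varies by xgboost version):

import mlflow
import xgboost as xgb
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split

X, y = make_classification(n_samples=500, random_state=0)
X_tr, X_val, y_tr, y_val = train_test_split(X, y, random_state=0)

# Two eval sets populate evals_result() with validation_0 and validation_1,
# matching the schema described in the docstring above.
clf = xgb.XGBClassifier(n_estimators=50, eval_metric='logloss')
clf.fit(X_tr, y_tr, eval_set=[(X_tr, y_tr), (X_val, y_val)], verbose=False)

with mlflow.start_run():
    log_learning_curve('XGBClassifier', clf, fold=0)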
Example #15
def main(args):

    data_dir = args.data_dir
    figure_path = args.figure_dir
    model_path = args.model_dir

    file_name = "data.hdf5"

    # Set skip_training to False if the model has to be trained, to True if the model has to be loaded.
    skip_training = False

    # Set the torch device
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    print("Device = {}".format(device))

    # Initialize parameters
    parameters = Params_cross(
        subject_n=args.sub,
        hand=args.hand,
        batch_size=args.batch_size,
        valid_batch_size=args.batch_size_valid,
        test_batch_size=args.batch_size_test,
        epochs=args.epochs,
        lr=args.learning_rate,
        wd=args.weight_decay,
        patience=args.patience,
        device=device,
        y_measure=args.y_measure,
        desc=args.desc,
    )
    # Import data and generate train-, valid- and test-set
    # Set if generate with RPS values or not (check network architecture used later)

    print("Testing: {} ".format(parameters.desc))

    dataset = MEG_Cross_Dataset_no_bp(
        data_dir,
        file_name,
        parameters.subject_n,
        mode="train",
        y_measure=parameters.y_measure,
    )
    test_dataset = MEG_Cross_Dataset_no_bp(
        data_dir,
        file_name,
        parameters.subject_n,
        mode="test",
        y_measure=parameters.y_measure,
    )

    # split the dataset in train, test and valid sets.
    train_len, valid_len = len_split_cross(len(dataset))

    # train_dataset, valid_test, test_dataset = random_split(dataset, [train_len, valid_len, test_len],
    #                                                        generator=torch.Generator().manual_seed(42))
    train_dataset, valid_dataset = random_split(
        dataset, [train_len, valid_len]
    )

    print(
        "Train dataset len {}, valid dataset len {}, test dataset len {}".format(
            len(train_dataset), len(valid_dataset), len(test_dataset)
        )
    )

    # Initialize the dataloaders
    trainloader = DataLoader(
        train_dataset,
        batch_size=parameters.batch_size,
        shuffle=True,
        num_workers=4,
    )
    validloader = DataLoader(
        valid_dataset,
        batch_size=parameters.valid_batch_size,
        shuffle=True,
        num_workers=4,
    )
    testloader = DataLoader(
        test_dataset,
        batch_size=parameters.test_batch_size,
        shuffle=False,
        num_workers=4,
    )

    # Initialize network

    with torch.no_grad():
        sample, y = next(iter(trainloader))

    n_times = sample.shape[-1]
    net = MNet(n_times)

    print(net)
    # Training loop or model loading
    if not skip_training:
        print("Begin training....")

        # Check the optimizer before running (different from model to model)
        # optimizer = Adam(net.parameters(), lr=parameters.lr, weight_decay=5e-4)
        optimizer = SGD(net.parameters(), lr=parameters.lr, weight_decay=5e-4)

        scheduler = ReduceLROnPlateau(optimizer, mode="min", factor=0.5,
                                      patience=15)

        print("scheduler : ", scheduler)

        loss_function = torch.nn.MSELoss()
        start_time = timer.time()

        net, train_loss, valid_loss = train(
            net,
            trainloader,
            validloader,
            optimizer,
            scheduler,
            loss_function,
            parameters.device,
            parameters.epochs,
            parameters.patience,
            parameters.hand,
            model_path,
        )

        train_time = timer.time() - start_time
        print("Training done in {:.4f}".format(train_time))

        # visualize the loss as the network trained
        fig = plt.figure(figsize=(10, 4))
        plt.plot(
            range(1, len(train_loss) + 1), train_loss, label="Training Loss"
        )
        plt.plot(
            range(1, len(valid_loss) + 1), valid_loss, label="Validation Loss"
        )

        # find position of lowest validation loss
        minposs = valid_loss.index(min(valid_loss)) + 1
        plt.axvline(
            minposs,
            linestyle="--",
            color="r",
            label="Early Stopping Checkpoint",
        )

        plt.xlabel("epochs")
        plt.ylabel("loss")
        # plt.ylim(0, 0.5) # consistent scale
        # plt.xlim(0, len(train_loss)+1) # consistent scale
        plt.grid(True)
        plt.legend()
        plt.tight_layout()
        plt.show()
        image1 = fig
        plt.savefig(os.path.join(figure_path, "loss_plot.pdf"))

    if not skip_training:
        # Save the trained model
        save_pytorch_model(net, model_path, "model.pth")
    else:
        # Load the model (properly select the model architecture)
        net = MNet(n_times)
        net = load_pytorch_model(
            net, os.path.join(model_path, "model.pth"), parameters.device
        )

    # Evaluation
    print("Evaluation...")
    net.eval()
    y_pred = []
    y = []
    y_pred_valid = []
    y_valid = []

    with torch.no_grad():
        for data, labels in testloader:
            data, labels = (
                data.to(parameters.device),
                labels.to(parameters.device),
            )
            y.extend(list(labels[:, parameters.hand]))
            y_pred.extend((list(net(data))))

        for data, labels in validloader:
            data, labels = (
                data.to(parameters.device),
                labels.to(parameters.device),
            )
            y_valid.extend(list(labels[:, parameters.hand]))
            y_pred_valid.extend((list(net(data))))

    # Calculate Evaluation measures
    print("Evaluation measures")
    mse = mean_squared_error(y, y_pred)
    rmse = mean_squared_error(y, y_pred, squared=False)
    mae = mean_absolute_error(y, y_pred)
    r2 = r2_score(y, y_pred)

    rmse_valid = mean_squared_error(y_valid, y_pred_valid, squared=False)
    r2_valid = r2_score(y_valid, y_pred_valid)
    valid_loss_last = min(valid_loss)

    print("Test set ")
    print("mean squared error {}".format(mse))
    print("root mean squared error {}".format(rmse))
    print("mean absolute error {}".format(mae))
    print("r2 score {}".format(r2))

    print("Validation set")
    print("root mean squared error valid {}".format(rmse_valid))
    print("r2 score valid {}".format(r2_valid))
    print("last value of the validation loss:".format(valid_loss_last))

    # plot y_new against the true value focus on 100 timepoints
    fig, ax = plt.subplots(1, 1, figsize=[10, 4])
    times = np.arange(200)
    ax.plot(times, y_pred[0:200], color="b", label="Predicted")
    ax.plot(times, y[0:200], color="r", label="True")
    ax.set_xlabel("Times")
    ax.set_ylabel("{}".format(parameters.y_measure))
    ax.set_title(
        "Sub {}, hand {}, {} prediction".format(
            str(parameters.subject_n),
            "sx" if parameters.hand == 0 else "dx",
            parameters.y_measure,
        )
    )
    plt.legend()
    plt.savefig(os.path.join(figure_path, "Times_prediction_focus.pdf"))
    plt.show()

    # plot y_new against the true value
    fig, ax = plt.subplots(1, 1, figsize=[10, 4])
    times = np.arange(len(y_pred))
    ax.plot(times, y_pred, color="b", label="Predicted")
    ax.plot(times, y, color="r", label="True")
    ax.set_xlabel("Times")
    ax.set_ylabel("{}".format(parameters.y_measure))
    ax.set_title(
        "Sub {}, hand {}, {} prediction".format(
            str(parameters.subject_n),
            "sx" if parameters.hand == 0 else "dx",
            parameters.y_measure,
        )
    )
    plt.legend()
    plt.savefig(os.path.join(figure_path, "Times_prediction.pdf"))
    plt.show()

    # scatterplot y predicted against the true value
    fig, ax = plt.subplots(1, 1, figsize=[10, 4])
    ax.scatter(np.array(y), np.array(y_pred), color="b", label="Predicted")
    ax.set_xlabel("True")
    ax.set_ylabel("Predicted")
    # plt.legend()
    plt.savefig(os.path.join(figure_path, "Scatter.pdf"))
    plt.show()

    # scatterplot y predicted against the true value
    fig, ax = plt.subplots(1, 1, figsize=[10, 4])
    ax.scatter(
        np.array(y_valid), np.array(y_pred_valid), color="b", label="Predicted"
    )
    ax.set_xlabel("True")
    ax.set_ylabel("Predicted")
    # plt.legend()
    plt.savefig(os.path.join(figure_path, "Scatter_valid.pdf"))
    plt.show()

    # log the model and parameters using mlflow tracker
    with mlflow.start_run(experiment_id=args.experiment) as run:
        for key, value in vars(parameters).items():
            mlflow.log_param(key, value)

        mlflow.log_param("Time", train_time)

        mlflow.log_metric("MSE", mse)
        mlflow.log_metric("RMSE", rmse)
        mlflow.log_metric("MAE", mae)
        mlflow.log_metric("R2", r2)

        mlflow.log_metric("RMSE_Valid", rmse_valid)
        mlflow.log_metric("R2_Valid", r2_valid)
        mlflow.log_metric("Valid_loss", valid_loss_last)

        mlflow.log_artifact(os.path.join(figure_path, "Times_prediction.pdf"))
        mlflow.log_artifact(
            os.path.join(figure_path, "Times_prediction_focus.pdf")
        )
        mlflow.log_artifact(os.path.join(figure_path, "loss_plot.pdf"))
        mlflow.log_artifact(os.path.join(figure_path, "Scatter.pdf"))
        mlflow.log_artifact(os.path.join(figure_path, "Scatter_valid.pdf"))
        mlflow.pytorch.log_model(net, "models")
Example #16
                           average_method='arithmetic'))
    clf.fit(X_train, y_train)

    i = 0
    for p in clf.cv_results_['params']:
        # Log best params
        run = client.create_run(experiment_id=experiment.experiment_id,
                                source_version=ver_name,
                                run_name='run_{0}'.format(str(i)))

        with mlflow.start_run(run_uuid=run.info.run_uuid, nested=True):

            for k, v in p.items():
                mlflow.log_param(k, v)

            for k in [
                    'rank_test_score', 'mean_test_score', 'std_test_score',
                    'mean_fit_time', 'std_fit_time'
            ]:
                mlflow.log_metric(k, clf.cv_results_[k][i])

            for cvix in range(clf.n_splits_):
                keyname = 'split{0}_train_score'.format(cvix)
                mlflow.log_metric('train_score_cv',
                                  clf.cv_results_[keyname][i])

                keyname = 'split{0}_test_score'.format(cvix)
                mlflow.log_metric('test_score_cv', clf.cv_results_[keyname][i])

        i = i + 1
Example #17
def run(data_url, target, tracking_uri, model_name, exp_id, output_path, msg):

    print("msg", msg)

    class XGBWrapper(mlflow.pyfunc.PythonModel):
        def load_context(self, context):
            import xgboost as xgb
            self.xgb_model = xgb.Booster()
            self.xgb_model.load_model(context.artifacts["xgb_model"])

        def predict(self, context, model_input):
            input_matrix = xgb.DMatrix(model_input.values)
            return {
                'prediction': self.xgb_model.predict(input_matrix),
                'msg': msg
            }

    print("xgboost version", xgb.__version__)
    data = pd.read_csv(data_url, sep=";")
    label_encoder = preprocessing.LabelEncoder()
    label_encoder.fit(data[target].values)
    data[target] = label_encoder.transform(data[target])
    train, test = train_test_split(data)
    train_x = train.drop([target], axis=1)
    test_x = test.drop([target], axis=1)
    train_y = train[target]
    test_y = test[target]

    learning_rate = np.random.choice([0.01, 0.1, 0.2])
    colsample_bytree = 0.2 + np.random.rand() * 0.4
    subsample = 0.2 + np.random.rand() * 0.4

    mlflow.set_tracking_uri(tracking_uri)
    mlflow.xgboost.autolog()
    dtrain = xgb.DMatrix(train_x, label=train_y)
    dtest = xgb.DMatrix(test_x, label=test_y)
    with mlflow.start_run(run_name=model_name, experiment_id=exp_id):
        params = {
            'max_depth': 5,
            "objective": "multi:softprob",
            "num_class": len(np.unique(train_y)),
            "learning_rate": learning_rate,
            "eval_metric": "mlogloss",
            "colsample_bytree": colsample_bytree,
            "subsample": subsample,
        }
        model = xgb.train(params,
                          dtrain,
                          num_boost_round=int(5 / learning_rate),
                          evals=[(dtrain, "train")],
                          verbose_eval=False)
        model.save_model(xgb_model_path)
        mlflow.pyfunc.log_model(artifact_path='model',
                                conda_env=conda_env,
                                python_model=XGBWrapper(),
                                artifacts=artifacts,
                                registered_model_name=model_name)
        # evaluate model
        train_y_proba = model.predict(dtrain)
        train_y_pred = train_y_proba.argmax(axis=1)
        test_y_proba = model.predict(dtest)
        test_y_pred = test_y_proba.argmax(axis=1)

        mlflow.log_metric("train_accuracy",
                          accuracy_score(train_y_pred, train_y))
        mlflow.log_metric("test_accuracy", accuracy_score(test_y_pred, test_y))
        # mlflow.xgboost.log_model(model, 'model', registered_model_name=model_name)

    client = MlflowClient(tracking_uri)
    model_uri = client.search_model_versions(f"name='{model_name}'")[-1].source
    info = {"model_uri": model_uri}
    print(f"Artifacts saved at {model_uri}.")
    pathlib.Path(os.path.dirname(output_path)).mkdir(parents=True,
                                                     exist_ok=True)
    with open(output_path, 'w') as f:
        json.dump(info, f)
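A hedged sketch of a downstream consumer that reads the JSON file written above and loads the registered pyfunc model (output_path is a placeholder matching the argument name in the snippet):

import json
import mlflow.pyfunc

output_path = 'model_info.json'  # placeholder for the path written above
with open(output_path) as f:
    model_uri = json.load(f)['model_uri']

loaded = mlflow.pyfunc.load_model(model_uri)
# loaded.predict(some_dataframe)  # returns the dict built by XGBWrapper.predict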
Example #18
def main(params: dict, output_dir: str):
    import mlflow
    print("start params={}".format(params))
    df = pd.read_feather(
        "../../riiid_takoi/notebook/data/train_sort.feather").head(len_train)
    # df = pd.read_pickle("../input/riiid-test-answer-prediction/split10/train_0.pickle").sort_values(["user_id", "timestamp"]).reset_index(drop=True)
    if is_debug:
        df = df.head(30000)
    for d in load_feature_dir:
        df_ = pd.read_feather(d).head(len_train)
        if is_debug:
            df_ = df_.head(30000)
        df = pd.concat([df, df_], axis=1)

    print(df.isnull().sum())
    # ====================
    # preprocess
    # ====================
    df["content_id"] = df["content_id"] + 2
    df["prior_question_had_explanation"] = df[
        "prior_question_had_explanation"].fillna(-1) + 2
    df["content_id_with_lecture"] = df["content_id"]
    df.loc[df["content_type_id"] == 1,
           "content_id_with_lecture"] = df["content_id"] + 14000
    df["answered_correctly"] += 3
    df["task_container_id"] += 1
    df["part"] += 1
    df["prior_question_elapsed_time"] = df[
        "prior_question_elapsed_time"].fillna(0)
    df["timestamp_delta"] = df["timestamp_delta"].fillna(0)
    df["uid_win_rate"] = df["uid_win_rate"].fillna(
        0.65)  # target_encoding(user_id)

    # The following were already handled during feature creation
    # df["content_id_delta"] = df["content_id_delta"].fillna(-1) + 2
    # df["last_content_id_acc"] = df["last_content_id_acc"].fillna(-1) + 2

    # ====================
    # data prepare
    # ====================
    agg_dict = {
        "content_id_with_lecture": list,
        "prior_question_had_explanation": list,
        "prior_question_elapsed_time": list,
        "answered_correctly": list,
        "task_container_id": list,
        "part": list,
        "content_id_delta": list,
        "last_content_id_acc": list,
        "uid_win_rate": list,
        "is_val": list,
        "timestamp_delta": list,
    }
    df_val_row = pd.read_feather(
        "../../riiid_takoi/notebook/fe/validation_row_id.feather").head(
            len_train // 10)
    if is_debug:
        df_val_row = df_val_row.head(3000)
    df_val_row["is_val"] = 1

    df = pd.merge(df, df_val_row, how="left", on="row_id")
    df["is_val"] = df["is_val"].fillna(0)

    print(df["is_val"].value_counts())

    if not load_pickle or is_debug:
        # Build groups of 100 rows each; e.g. with 950 rows, split as 1-50, 51-150, 151-250, ... so that the remainder ends up at the head
        w_df = df[df["is_val"] == 0]
        w_df["group"] = (
            w_df.groupby("user_id")["user_id"].transform("count") -
            w_df.groupby("user_id").cumcount()) // params["max_seq"]

        group = w_df.groupby(["user_id", "group"]).agg(agg_dict).T.to_dict()

        dataset_train = SAKTDataset(group,
                                    n_skill=60000,
                                    max_seq=params["max_seq"])

        del w_df
        gc.collect()

        group = df[df["content_type_id"] == 0].groupby("user_id").agg(
            agg_dict).T.to_dict()
        dataset_val = SAKTDataset(group,
                                  is_test=True,
                                  n_skill=60000,
                                  max_seq=params["max_seq"])

    os.makedirs("../input/feature_engineering/model200", exist_ok=True)
    if not is_debug and not load_pickle:
        with open(f"../input/feature_engineering/model200/train.pickle",
                  "wb") as f:
            pickle.dump(dataset_train, f)
        with open(f"../input/feature_engineering/model200/val.pickle",
                  "wb") as f:
            pickle.dump(dataset_val, f)

    if not is_debug and load_pickle:
        with open(f"../input/feature_engineering/model200/train.pickle",
                  "rb") as f:
            dataset_train = pickle.load(f)
        with open(f"../input/feature_engineering/model200/val.pickle",
                  "rb") as f:
            dataset_val = pickle.load(f)
        print("loaded!")
    dataloader_train = DataLoader(dataset_train,
                                  batch_size=params["batch_size"],
                                  shuffle=True,
                                  num_workers=1)
    dataloader_val = DataLoader(dataset_val,
                                batch_size=params["batch_size"],
                                shuffle=False,
                                num_workers=1)

    model = SAKTModel(n_skill=60000,
                      embed_dim=params["embed_dim"],
                      max_seq=params["max_seq"],
                      dropout=dropout,
                      cont_emb=params["cont_emb"])

    param_optimizer = list(model.named_parameters())
    no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
    optimizer_grouped_parameters = [{
        'params':
        [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
        'weight_decay':
        0.01
    }, {
        'params':
        [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
        'weight_decay':
        0.0
    }]

    optimizer = AdaBelief(
        optimizer_grouped_parameters,
        lr=params["lr"],
    )
    num_train_optimization_steps = int(len(dataloader_train) * 20)
    scheduler = get_linear_schedule_with_warmup(
        optimizer,
        num_warmup_steps=params["num_warmup_steps"],
        num_training_steps=num_train_optimization_steps)
    criterion = nn.BCEWithLogitsLoss()

    model.to(device)
    criterion.to(device)

    for epoch in range(epochs):
        loss, acc, auc, auc_val = train_epoch(model, dataloader_train,
                                              dataloader_val, optimizer,
                                              criterion, scheduler, epoch,
                                              device)
        print("epoch - {} train_loss - {:.3f} auc - {:.4f} auc-val: {:.4f}".
              format(epoch, loss, auc, auc_val))
        torch.save(
            model.state_dict(),
            f"{output_dir}/transformers_epoch{epoch}_auc{round(auc_val, 4)}.pth"
        )

    preds = []
    labels = []
    with torch.no_grad():
        for item in tqdm(dataloader_val):
            label = item["label"].to(device).float()

            output = model(item, device)

            preds.extend(torch.nn.Sigmoid()(
                output[:, -1]).view(-1).data.cpu().numpy().tolist())
            labels.extend(label[:, -1].view(-1).data.cpu().numpy().tolist())

    auc_transformer = roc_auc_score(labels, preds)
    print("single transformer: {:.4f}".format(auc_transformer))
    df_oof = pd.DataFrame()
    # df_oof["row_id"] = df.loc[val_idx].index
    print(len(dataloader_val))
    print(len(preds))
    df_oof["predict"] = preds
    df_oof["target"] = labels

    df_oof.to_csv(f"{output_dir}/transformers1.csv", index=False)
    """
    df_oof2 = pd.read_csv("../output/ex_237/20201213110353/oof_train_0_lgbm.csv")
    df_oof2.columns = ["row_id", "predict_lgbm", "target"]
    df_oof2 = pd.merge(df_oof, df_oof2, how="inner")

    auc_lgbm = roc_auc_score(df_oof2["target"].values, df_oof2["predict_lgbm"].values)
    print("lgbm: {:.4f}".format(auc_lgbm))

    print("ensemble")
    max_auc = 0
    max_nn_ratio = 0
    for r in np.arange(0, 1.05, 0.05):
        auc = roc_auc_score(df_oof2["target"].values, df_oof2["predict_lgbm"].values*(1-r) + df_oof2["predict"].values*r)
        print("[nn_ratio: {:.2f}] AUC: {:.4f}".format(r, auc))

        if max_auc < auc:
            max_auc = auc
            max_nn_ratio = r
    print(len(df_oof2))
    """
    if not is_debug:
        mlflow.start_run(experiment_id=10, run_name=os.path.basename(__file__))

        for key, value in params.items():
            mlflow.log_param(key, value)
        mlflow.log_metric("auc_val", auc_transformer)
        mlflow.end_run()
    torch.save(model.state_dict(), f"{output_dir}/transformers.pth")
    del model
    torch.cuda.empty_cache()
    with open(f"{output_dir}/transformer_param.json", "w") as f:
        json.dump(params, f)
Example #19
                               weight_decay=args.weight_decay)

        logging.info("Training classifier")
        for epoch in trange(args.epochs):
            model.train()
            running_loss = []
            for idx, batch in enumerate(tqdm(train_loader)):
                optimizer.zero_grad()
                data = batch["data"].to(device)
                target = batch["target"].to(device)
                output = model(data)
                loss_value = loss(output, target)
                loss_value.backward()
                optimizer.step()
                running_loss.append(loss_value.item())
            mlflow.log_metric("train_loss",
                              sum(running_loss) / len(running_loss), epoch)

            if validation_dataset:
                logging.info("Evaluating model on validation")
                model.eval()
                running_loss = []
                targets = []
                predictions = []
                with torch.no_grad():
                    for batch in tqdm(validation_loader):
                        data = batch["data"].to(device)
                        target = batch["target"].to(device)
                        output = model(data)
                        running_loss.append(loss(output, target).item())
                        targets.extend(batch["target"].numpy())
                        predictions.extend(
def train(ngram, nb_hash, word2vec, maxiter, oversampling):
    maxIter = maxiter
    warnings.filterwarnings("ignore")
    np.random.seed(40)


    spark = SparkSession.builder \
        .master("local[*]") \
        .appName("ML") \
        .getOrCreate()
    #wine_path = os.path.join(os.path.dirname(os.path.abspath(__file__)),'data/cdiscount_train.csv')

    RowDF = spark.read.format('com.databricks.spark.csv').options(
        header='true', inferschema='true',
        sep=",").load('data/operation bancaire.csv')
    train = RowDF.dropna(subset='lib4')
    print("nbr de classe", train.select(train.columns[4]).distinct().count())
    """
        df = train.fillna(0,subset='credit')
        def trasf(ligne):
            if ligne > 0:
                return 1
            else :
                  return 0
            
          
    udftrasform =  udf(lambda x: trasf(x),)
    dataClean = df.withColumn("credit_oui_non", udftrasform(df['credit']))
    dataClean.show(n=20,truncate=True)
            
    """
    df = train.select('lib1',
                      train.credit.isNull().cast('float').alias('credit_o_n'),
                      'lib4')

    #train.show(22,truncate=True)

    # Data subsampling rate, used to test the preparation program
    # on a small dataset
    taux_donnees = [0.7, 0.3]

    dataTrain, DataTest = df.randomSplit(taux_donnees, seed=42)
    n_train = dataTrain.count()
    n_test = DataTest.count()
    print("DataTrain : size = %d, DataTest : size = %d" % (n_train, n_test))
    if oversampling:

        def sur_echant(df, p=0.7):
            counts = df.groupBy('lib4').count().collect()
            categories = [i[0] for i in counts]
            values = [i[1] for i in counts]
            max_v = max(values)
            indx = values.index(max_v)
            dic = {j: 1 for i, j in enumerate(categories) if i != indx}
            dic[categories[indx]] = p
            df = df.sampleBy("lib4", fractions=dic)
            p = int(max_v * p)
            for i, cat in enumerate(categories):
                if i != indx:
                    a = df
                    data = a.sampleBy("lib4", fractions={
                        cat: 1
                    }).toPandas().sample(p - values[i], replace=True)
                    spark_df = spark.createDataFrame(data)
                    df = df.union(spark_df)
            return df

        dataTrain = sur_echant(dataTrain, p=0.8)

        print("DataTrain apres le sur_echantillonage: size = %d" %
              (dataTrain.count()))

    opm = "mp"
    with mlflow.start_run():

        pipeline = Pipeline_model(ngram,
                                  nb_hash,
                                  data=df,
                                  opm=opm,
                                  vec=word2vec,
                                  maxIter=maxIter)

        time_start = time.time()
        # Apply every pipeline stage to the training DataFrame.
        model = pipeline.fit(dataTrain)
        time_end = time.time()
        time_lrm = (time_end - time_start)
        print(
            "LR prend %d s pour un echantillon d'apprentissage de taille : n = %d"
            % (time_lrm, n_train))

        predictionsDF = model.transform(DataTest)

        #labelsAndPredictions = predictionsDF.select("categoryIndex","prediction").collect()
        #nb_good_prediction = sum([r[0]==r[1] for r in labelsAndPredictions])
        #testScore =nb_good_prediction/n_test
        #print('Test score = , pour un echantillon test de taille n = %d' + str(testScore))

        # Select (prediction, true label) and compute test error
        #evaluator1 = MulticlassClassificationEvaluator(labelCol="categoryIndex", predictionCol="prediction", metricName="accuracy")
        #accuracy = evaluator1.evaluate(predictionsDF)

        evaluator = MulticlassClassificationEvaluator(
            labelCol="categoryIndex", predictionCol="prediction")
        f1 = evaluator.evaluate(predictionsDF, {evaluator.metricName: "f1"})
        accuracy = evaluator.evaluate(predictionsDF,
                                      {evaluator.metricName: "accuracy"})

        print("Test scor accuracy= %g" % (accuracy))
        print("Test scor f1score = %g" % (f1))
        mlflow.log_param("word2vec", word2vec)
        mlflow.log_param("ngram", ngram)
        mlflow.log_param("nb_hash", nb_hash)
        mlflow.log_param("maxIter", maxIter)
        mlflow.log_param("oversampling", oversampling)
        mlflow.log_metric("accuracy", accuracy)
        mlflow.log_metric("f1Score", f1)
        mlflow.spark.log_model(model, "spark-model")
Example #21
def main(params: dict):
    print("start params={}".format(params))
    df = pd.read_pickle(
        "../input/riiid-test-answer-prediction/split10/train_0.pickle"
    ).sort_values(["user_id", "timestamp"]).reset_index(drop=True)
    if is_debug:
        df = df.head(10000)
    df = df[df.content_type_id == False]

    train_idx = []
    val_idx = []
    np.random.seed(0)
    for _, w_df in df.groupby("user_id"):
        if np.random.random() < 0.1:
            # all val
            val_idx.extend(w_df.index.tolist())
        else:
            train_num = int(len(w_df) * 0.9)
            train_idx.extend(w_df[:train_num].index.tolist())
            val_idx.extend(w_df[train_num:].index.tolist())

    df["is_val"] = 0
    df["is_val"].loc[val_idx] = 1
    df["group"] = (df.groupby("user_id")["user_id"].transform("count") -
                   df.groupby("user_id").cumcount()) // max_len
    print(df["group"].value_counts())

    print(len(train_idx))
    print(len(val_idx))
    group = df[[
        'user_id', 'content_id', 'answered_correctly', "group", "part",
        'is_val'
    ]].groupby([
        'user_id', "group"
    ]).apply(lambda r: (r['content_id'].values, r['part'].values, r[
        'answered_correctly'].values, r["is_val"].values))
    dataset_train = SAKTDataset(group, 13523, max_seq=params["max_seq"])

    group = df[[
        'user_id', 'content_id', 'answered_correctly', "part", 'is_val'
    ]].groupby(['user_id']).apply(lambda r: (r['content_id'].values, r[
        'part'].values, r['answered_correctly'].values, r["is_val"].values))
    dataset_val = SAKTDataset(group,
                              13523,
                              is_test=True,
                              max_seq=params["max_seq"])

    dataloader_train = DataLoader(dataset_train,
                                  batch_size=1024,
                                  shuffle=True,
                                  num_workers=1)
    dataloader_val = DataLoader(dataset_val,
                                batch_size=1024,
                                shuffle=False,
                                num_workers=1)

    device = torch.device("cuda")

    model = SAKTModel(13523,
                      embed_dim=params["embed_dim"],
                      max_seq=params["max_seq"])
    # optimizer = torch.optim.SGD(model.parameters(), lr=1e-3, momentum=0.99, weight_decay=0.005)
    optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)
    criterion = nn.BCEWithLogitsLoss()

    model.to(device)
    criterion.to(device)

    epochs = 20
    for epoch in range(epochs):
        loss, acc, auc, auc_val = train_epoch(model, dataloader_train,
                                              dataloader_val, optimizer,
                                              criterion, device)
        print("epoch - {} train_loss - {:.3f} auc - {:.4f} auc-val: {:.4f}".
              format(epoch, loss, auc, auc_val))

    preds = []
    labels = []
    for d in tqdm(dataloader_val):
        x = d[0].to(device).long()
        target_id = d[1].to(device).long()
        part = d[2].to(device).long()
        label = d[3].to(device).long()

        output, atten_weight = model(x, target_id, part)

        preds.extend(torch.nn.Sigmoid()(
            output[:, -1]).view(-1).data.cpu().numpy().tolist())
        labels.extend(label[:, -1].view(-1).data.cpu().numpy())
    df_oof = pd.DataFrame()
    df_oof["row_id"] = df.loc[val_idx].index
    df_oof["predict"] = preds
    df_oof["target"] = df.loc[val_idx]["answered_correctly"].values
    df_oof.to_csv(f"{output_dir}/transformers1.csv", index=False)

    df_oof2 = pd.read_csv(
        "../output/ex_172/20201202080625/oof_train_0_lgbm.csv")
    df_oof2.columns = ["row_id", "predict_lgbm", "target"]
    df_oof2 = pd.merge(df_oof, df_oof2, how="inner")

    auc_transformer = roc_auc_score(df_oof2["target"].values,
                                    df_oof2["predict"].values)
    auc_lgbm = roc_auc_score(df_oof2["target"].values,
                             df_oof2["predict_lgbm"].values)
    print("single transformer: {:.4f}".format(auc_transformer))
    print("lgbm: {:.4f}".format(auc_lgbm))

    print("ensemble")
    max_auc = 0
    max_nn_ratio = 0
    for r in np.arange(0, 1.05, 0.05):
        auc = roc_auc_score(
            df_oof2["target"].values, df_oof2["predict_lgbm"].values *
            (1 - r) + df_oof2["predict"].values * r)
        print("[nn_ratio: {:.2f}] AUC: {:.4f}".format(r, auc))

        if max_auc < auc:
            max_auc = auc
            max_nn_ratio = r
    print(len(df_oof2))

    if not is_debug:
        mlflow.start_run(experiment_id=10, run_name=os.path.basename(__file__))

        mlflow.log_param("count_row", len(df))

        for key, value in params.items():
            mlflow.log_param(key, value)
        mlflow.log_metric("auc_val", auc_transformer)
        mlflow.log_metric("auc_lgbm", auc_lgbm)
        mlflow.log_metric("auc_ensemble", max_auc)
        mlflow.log_metric("ensemble_nn_ratio", max_nn_ratio)
        mlflow.end_run()
Example #22
def embeddings_quality(embeddings_file, configuration):
    """Function used to test the quality of the embeddings provided in file 'embeddings_file'. The tests must already
    be provided in the directory 'test_dir' to be executed. Please refer to the readme for info about the test format.

    :param embeddings_file: path to the embeddings file to be tested
    :param configuration: configuration dict whose 'test_dir' entry points to the directory that contains all the tests.
    """
    print('# Executing EQ tests.')
    test_dir = configuration['test_dir']
    test_dir = test_dir.rstrip('/') + '/'

    if configuration['training_algorithm'] == 'fasttext':
        wv = models.KeyedVectors.load(embeddings_file, mmap='r')
    else:
        wv = models.KeyedVectors.load_word2vec_format(embeddings_file, unicode_errors='ignore')
    sum_total = 0
    count_tests = 0
    result_col = {}
    result_con = {}
    result_row = {}

    nmr_tests = []
    nmcon_tests = []
    nmc_tests = []
    for fin in os.listdir(test_dir):
        if fin.startswith('nmr'):
            nmr_tests.append(test_dir + fin)
        elif fin.startswith('nmcon'):
            nmcon_tests.append(test_dir + fin)
        elif fin.startswith('nmc'):
            nmc_tests.append(test_dir + fin)
    if len(nmc_tests) == len(nmcon_tests) == len(nmr_tests) == 0:
        raise ValueError('No valid test files found. Exiting. ')

    if len(nmc_tests) > 0:
        print('# Testing columns.')
        result_col = _test_no_match_columns(wv, nmc_tests)
        sum_total += result_col['nmc_avg']
        count_tests += 1
        print('# nmc_avg: {:.2f}'.format(result_col['nmc_avg']))
    else:
        warnings.warn('No valid nmc tests found. ')

    if len(nmr_tests) > 0:
        print('# Testing rows.')
        result_row = _test_no_match_rows(wv, nmr_tests)
        sum_total += result_row['nmr_avg']
        count_tests += 1
        print('# nmr_avg: {:.2f}'.format(result_row['nmr_avg']))
    else:
        warnings.warn('No valid nmr tests found. ')
    if len(nmcon_tests) > 0:
        print('# Testing concepts.')
        result_con = _test_no_match_concept(wv, nmcon_tests)
        sum_total += result_con['nmcon_avg']
        count_tests += 1
        print('# nmcon_avg: {:.2f}'.format(result_con['nmcon_avg']))
    else:
        warnings.warn('No valid nmcon tests found. ')
    try:
        avg_results = sum_total / count_tests

    except ZeroDivisionError:
        print('No tests were executed.')
        avg_results = 0

    print('# EQ average: {:.2f}'.format(avg_results))

    if configuration['mlflow']:
        with mlflow.active_run():
            mlflow.log_metric('eq_avg', avg_results)
            mlflow.log_metric('nmc_avg', result_col['nmc_avg'])
            mlflow.log_metric('nmcon_avg', result_con['nmcon_avg'])
            mlflow.log_metric('nmr_avg', result_row['nmr_avg'])
            for k in result_col:
                mlflow.log_metric(k, result_col[k])
            for k in result_row:
                mlflow.log_metric(k, result_row[k])
            for k in result_con:
                mlflow.log_metric(k, result_con[k])

    result_dict = dict(chain.from_iterable(d.items() for d in (result_row, result_col, result_con)))
    _r = ['nmc_avg', 'nmr_avg', 'nmcon_avg', 'eq_avg']
    result_dict['eq_avg'] = avg_results
    print('\t'.join(_r))
    for k in _r:
        print(result_dict[k], end='\t')
    print()

    return result_dict
def run(epochs, lr):
    """
    Train and test CNN with given parameters
    """

    # ==============
    #  Load Data
    # ==============

    mixdata = h5py.File(
        "../train/scsn_p_2000_2017_6sec_0.5r_pick_train_mix.hdf5", "r")
    testdata = h5py.File(
        "../test/scsn_p_2000_2017_6sec_0.5r_pick_test_mix.hdf5", "r")

    batch_size = 500

    train_size = 1 * 10**5
    train_ratio = 0.5
    test_size = 1 * 10**5

    train_val_data = mixdata["X"][:train_size]
    train_val_labels = mixdata["pwave"][:train_size]

    (trainset, trainlabels), (valset, val_labels) = split_trainset(
        train_val_data, train_val_labels, train_ratio)

    trainset = list(zip(trainset, trainlabels))

    valset = list(zip(valset, val_labels))

    trainloader = DataLoader(trainset, batch_size=batch_size, shuffle=True)
    val_loader = DataLoader(valset, batch_size=batch_size, shuffle=True)

    # ======================
    #  Use GPU, if available
    # ======================

    if torch.cuda.is_available():
        device = "cuda"
    else:
        device = "cpu"

    model = CNN()

    model = parallelize(model)

    model.to(device)

    #lr = float(args.lr)

    # ===================================
    #  Define Optimizer and Loss Function
    # ===================================

    criterion = nn.NLLLoss()
    optimizer = optim.Adam(model.parameters(), lr=lr)

    mlflow.set_tracking_uri("file:.\mlruns")
    mlflow.start_run()

    #epochs = int(args.epochs)

    train_losses = []
    val_losses = []

    min_val_loss = float("inf")

    for epoch in range(epochs):
        model.train()
        train_loss = 0

        for batch, labels in trainloader:
            # ============================================
            #            TRAINING
            # ============================================
            batch, labels = batch.to(device), labels.to(device)
            # Clear gradients in optimizer
            optimizer.zero_grad()
            # Forward pass
            output = model(batch)
            # Calculate loss (NLLLoss expects class-index targets of shape (N,))
            loss = criterion(output, labels.long().view(-1))
            train_loss += loss.item()
            # Backpropagation
            loss.backward()
            # Update weights
            optimizer.step()
        else:
            with torch.no_grad():
                model.eval()
                val_loss = 0

                for batch, labels in val_loader:
                    # ============================================
                    #            VALIDATION
                    # ============================================
                    batch, labels = batch.to(device), labels.to(device)
                    # Forward pass
                    output = model(batch)
                    # Calculate loss
                    loss = criterion(output, labels.long().view(-1))
                    val_loss += loss.item()

        # Print epoch summary
        t_loss_avg = train_loss / len(trainloader)
        v_loss_avg = val_loss / len(val_loader)

        if v_loss_avg < min_val_loss:
            min_val_loss = v_loss_avg
            torch.save(model.state_dict(), "./artifacts/model.pth")
            mlflow.log_artifact("./artifacts/model.pth")

        mlflow.log_metric("train_loss", t_loss_avg)
        mlflow.log_metric("val_loss", v_loss_avg)

        train_losses.append(t_loss_avg)
        val_losses.append(v_loss_avg)

        print(
            'Epoch [{:5d}/{:5d}] | train loss: {:6.4f} | validation loss: {:6.4f}'
            .format(epoch + 1, epochs, t_loss_avg, v_loss_avg))

    # ==============
    #  Test model
    # ==============

    model.load_state_dict(torch.load("./artifacts/model.pth"))

    test_path = "../test/scsn_p_2000_2017_6sec_0.5r_pick_test_mix.hdf5"

    y_true, y_pred, y_probs = test_model(model,
                                         test_path,
                                         test_size,
                                         device=device)

    report = classification_report(y_true,
                                   y_pred,
                                   target_names=["pwave", "noise"])
    report_dict = classification_report(y_true,
                                        y_pred,
                                        target_names=["pwave", "noise"],
                                        output_dict=True)
    accuracy = accuracy_score(y_true, y_pred)
    roc_score = roc_auc_score(y_true, y_probs)

    print(report)
    print("Accuracy: {:.4}%".format(accuracy * 100))
    print("ROC Score: ", roc_score)

    mlflow.log_param("epochs", epochs)
    mlflow.log_param("learning_rate", lr)
    mlflow.log_param("device", device)

    mlflow.log_metric("accuracy", accuracy)
    mlflow.log_metric("auc_roc_score", roc_score)

    for category, category_metrics in report_dict.items():
        # "accuracy" is a plain float in the report dict, not a nested dict
        if not isinstance(category_metrics, dict):
            mlflow.log_metric(category, category_metrics)
            continue
        for metric, value in category_metrics.items():
            metric_name = category + "_" + metric
            mlflow.log_metric(metric_name, value)

    mlflow.end_run()
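
The commented-out `args.epochs` / `args.lr` lines suggest the script was driven from the command line; a minimal sketch of such an entry point (argument names and defaults are assumptions, not from the original source):

if __name__ == "__main__":
    import argparse

    parser = argparse.ArgumentParser(description="Train and evaluate the P-wave CNN")
    parser.add_argument("--epochs", type=int, default=10)   # assumed default
    parser.add_argument("--lr", type=float, default=1e-3)   # assumed default
    args = parser.parse_args()

    run(args.epochs, args.lr)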
Example #24
0
from sklearn.tree import export_graphviz
import pydotplus


def export_tree(model, local_image_path):
    # Render the fitted decision tree to a PNG via graphviz/pydotplus
    dot_data = export_graphviz(model, out_file=None, filled=True, rounded=True)
    graph = pydotplus.graph_from_dot_data(dot_data)
    graph.write_png(local_image_path)


# Load Data
iris = load_iris()
X_train, X_test, y_train, y_test = train_test_split(iris.data,
                                                    iris.target,
                                                    test_size=0.2,
                                                    random_state=RANDOM_STATE)

# Set MLflow tags
mlflow.set_tags({'platform': 'local-mlrun'})

# Log params
max_depth = int(sys.argv[1]) if len(sys.argv) > 1 else 1
mlflow.log_param("max_depth", max_depth)

# Model training
model = DecisionTreeClassifier(max_depth=max_depth)
model.fit(X_train, y_train)

# Log model
mlflow.sklearn.log_model(model, "model")

# Log metrics
accuracy = model.score(X_test, y_test)
mlflow.log_metric("accuracy", accuracy)

# Log artifact
export_tree(model, TREE_IMAGE_PATH)
mlflow.log_artifact(TREE_IMAGE_PATH)
Example #25
0
import mlflow
import os
from mlflow.sklearn import save_model, log_model
from sklearn import metrics
from sklearn.datasets import load_iris
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split

mlflow.set_tracking_uri(os.environ["MLFLOW_HOST"])
mlflow.set_experiment("iris-exp")

# Ingest data
iris = load_iris()

# Prepare training data
X, y = iris.data, iris.target
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)

with mlflow.start_run():
    # Train model
    n_estimators = 150
    mlflow.log_param("n_estimators", n_estimators)
    clf = RandomForestClassifier(n_estimators=n_estimators)
    clf.fit(X_train, y_train)

    # Evaluate
    y_pred = clf.predict(X_test)

    score = metrics.accuracy_score(y_test, y_pred)
    print(f"Accuracy: {score}")
    mlflow.log_metric(key="accuracy", value=score)

    # Save model
    # save_model(clf, 'iris_model')  # uncomment to save the model locally
    log_model(clf, 'iris_model')
Example #26
0
def log_metrics(metrics):
    for k, values in metrics.items():
        for v in values:
            mlflow.log_metric(k, v)
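
A minimal usage sketch for this helper (metric names and values are illustrative); each value in a list is logged as a separate entry of the same metric within the active run:

import mlflow

with mlflow.start_run():
    log_metrics({
        "train_loss": [0.9, 0.5, 0.3],
        "val_loss": [1.0, 0.7, 0.6],
    })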
Example #27
0
def main():

    os.environ['MLFLOW_TRACKING_URI'] = "http://localhost:5000"
    os.environ['MLFLOW_TRACKING_USERNAME'] = "******"
    os.environ['MLFLOW_TRACKING_PASSWORD'] = "******"

    mlflow.create_experiment("diabetes2", artifact_location="../tmp/artifacts")
    mlflow.set_experiment("diabetes2")
    mlflow.start_run(run_name=None)

    # Load Diabetes datasets
    diabetes = datasets.load_diabetes()
    X = diabetes.data
    y = diabetes.target

    # Create pandas DataFrame for sklearn ElasticNet linear_model
    Y = np.array([y]).transpose()
    d = np.concatenate((X, Y), axis=1)
    cols = diabetes.feature_names + ["progression"]
    data = pd.DataFrame(d, columns=cols)

    np.random.seed(40)

    # Split the data into training and test sets. (0.75, 0.25) split.
    train, test = train_test_split(data)

    # The predicted column is "progression" which is a quantitative
    # measure of disease progression one year after baseline
    train_x = train.drop(["progression"], axis=1)
    test_x = test.drop(["progression"], axis=1)
    train_y = train[["progression"]]
    test_y = test[["progression"]]

    alpha = float(sys.argv[1]) if len(sys.argv) > 1 else 0.05
    l1_ratio = float(sys.argv[2]) if len(sys.argv) > 2 else 0.05

    # Run ElasticNet
    lr = ElasticNet(alpha=alpha, l1_ratio=l1_ratio, random_state=42)
    lr.fit(train_x, train_y)
    predicted_qualities = lr.predict(test_x)
    (rmse, mae, r2) = eval_metrics(test_y, predicted_qualities)

    # Print out ElasticNet model metrics
    print("Elasticnet model (alpha=%f, l1_ratio=%f):" % (alpha, l1_ratio))
    print("  RMSE: %s" % rmse)
    print("  MAE: %s" % mae)
    print("  R2: %s" % r2)

    # Log mlflow attributes for mlflow UI
    mlflow.log_param("alpha", alpha)
    mlflow.log_param("l1_ratio", l1_ratio)
    mlflow.log_metric("rmse", rmse)
    mlflow.log_metric("r2", r2)
    mlflow.log_metric("mae", mae)
    mlflow.sklearn.log_model(lr, "model")

    # Compute paths
    eps = 5e-3  # the smaller it is the longer is the path

    print("Computing regularization path using the elastic net.")
    alphas_enet, coefs_enet, _ = enet_path(X,
                                           y,
                                           eps=eps,
                                           l1_ratio=l1_ratio,
                                           fit_intercept=False)

    # Display results
    fig = plt.figure(1)
    plt.gca()

    colors = cycle(["b", "r", "g", "c", "k"])
    neg_log_alphas_enet = -np.log10(alphas_enet)
    for coef_e, c in zip(coefs_enet, colors):
        l2 = plt.plot(neg_log_alphas_enet, coef_e, linestyle="--", c=c)

    plt.xlabel("-Log(alpha)")
    plt.ylabel("coefficients")
    title = "ElasticNet Path by alpha for l1_ratio = " + str(l1_ratio)
    plt.title(title)
    plt.axis("tight")

    # Save figures
    fig.savefig("ElasticNet-paths.png")

    # Close plot
    plt.close(fig)

    # Log artifacts (output files)
    mlflow.log_artifact("ElasticNet-paths.png")
    mlflow.end_run()


if __name__ == "__main__":
    main()
Example #28
0
import os
from random import random, randint
import mlflow

if __name__ == "__main__":
    print("Running the test script ...")

    mlflow.set_tracking_uri("http://127.0.0.1:5000")

    if not os.path.exists("artifact_folder"):
        os.makedirs("artifact_folder")

    mlflow.log_param("param1", randint(0, 100))

    mlflow.log_metric("foo", random())
    mlflow.log_metric("foo", random() + 1)
    mlflow.log_metric("foo", random() + 2)

    with open("artifact_folder/test.txt", "w") as f:
        f.write("hello world!")

    mlflow.log_artifacts("artifact_folder")
Example #29
0
def main(params: dict, output_dir: str):
    import mlflow
    print("start params={}".format(params))
    model_id = "train_0"
    logger = get_logger()
    # df = pd.read_pickle("../input/riiid-test-answer-prediction/train_merged.pickle")
    df = pd.read_pickle(
        "../input/riiid-test-answer-prediction/split10/train_0.pickle"
    ).sort_values(["user_id", "timestamp"]).reset_index(drop=True)
    if is_debug:
        df = df.head(30000)
    df["prior_question_had_explanation"] = df[
        "prior_question_had_explanation"].fillna(-1)
    column_config = {
        ("content_id", "content_type_id"): {
            "type": "category"
        },
        "user_answer": {
            "type": "leakage_feature"
        },
        "answered_correctly": {
            "type": "leakage_feature"
        },
        "part": {
            "type": "category"
        },
        "prior_question_elapsed_time_bin300": {
            "type": "category"
        },
        "duration_previous_content_bin300": {
            "type": "category"
        },
        "prior_question_had_explanation": {
            "type": "category"
        },
        "rating_diff_content_user_id": {
            "type": "numeric"
        },
        "task_container_id_bin300": {
            "type": "category"
        }
    }

    if not load_pickle or is_debug:
        feature_factory_dict = {"user_id": {}}
        feature_factory_dict["user_id"][
            "DurationPreviousContent"] = DurationPreviousContent()
        feature_factory_dict["user_id"][
            "ElapsedTimeBinningEncoder"] = ElapsedTimeBinningEncoder()
        feature_factory_dict["user_id"][
            "UserContentRateEncoder"] = UserContentRateEncoder(
                rate_func="elo", column="user_id")
        feature_factory_manager = FeatureFactoryManager(
            feature_factory_dict=feature_factory_dict,
            logger=logger,
            split_num=1,
            model_id="train_0",
            load_feature=not is_debug,
            save_feature=not is_debug)

        print("all_predict")
        df = feature_factory_manager.all_predict(df)
        df["task_container_id_bin300"] = [
            x if x < 300 else 300 for x in df["task_container_id"]
        ]
        df = df[[
            "user_id", "content_id", "content_type_id", "part", "user_answer",
            "answered_correctly", "prior_question_elapsed_time_bin300",
            "duration_previous_content_bin300",
            "prior_question_had_explanation", "rating_diff_content_user_id",
            "task_container_id_bin300"
        ]]
        print(df.head(10))

        print("data preprocess")

        train_idx = []
        val_idx = []
        np.random.seed(0)
        for _, w_df in df[df["content_type_id"] == 0].groupby("user_id"):
            if np.random.random() < 0.01:
                # all val
                val_idx.extend(w_df.index.tolist())
            else:
                train_num = int(len(w_df) * 0.95)
                train_idx.extend(w_df[:train_num].index.tolist())
                val_idx.extend(w_df[train_num:].index.tolist())
    ff_for_transformer = FeatureFactoryForTransformer(
        column_config=column_config,
        dict_path="../feature_engineering/",
        sequence_length=params["max_seq"],
        logger=logger)
    ff_for_transformer.make_dict(df=df)
    n_skill = len(ff_for_transformer.embbed_dict[("content_id",
                                                  "content_type_id")])
    if not load_pickle or is_debug:
        df["is_val"] = 0
        df["is_val"].loc[val_idx] = 1
        w_df = df[df["is_val"] == 0]
        w_df["group"] = (
            w_df.groupby("user_id")["user_id"].transform("count") -
            w_df.groupby("user_id").cumcount()) // params["max_seq"]
        w_df["user_id"] = w_df["user_id"].astype(
            str) + "_" + w_df["group"].astype(str)

        group = ff_for_transformer.all_predict(w_df)

        dataset_train = SAKTDataset(group,
                                    n_skill=n_skill,
                                    max_seq=params["max_seq"])

        del w_df
        gc.collect()

    ff_for_transformer = FeatureFactoryForTransformer(
        column_config=column_config,
        dict_path="../feature_engineering/",
        sequence_length=params["max_seq"],
        logger=logger)
    if not load_pickle or is_debug:
        group = ff_for_transformer.all_predict(df[df["content_type_id"] == 0])
        dataset_val = SAKTDataset(group,
                                  is_test=True,
                                  n_skill=n_skill,
                                  max_seq=params["max_seq"])

    os.makedirs("../input/feature_engineering/model142", exist_ok=True)
    if not is_debug and not load_pickle:
        with open(f"../input/feature_engineering/model142/train.pickle",
                  "wb") as f:
            pickle.dump(dataset_train, f)
        with open(f"../input/feature_engineering/model142/val.pickle",
                  "wb") as f:
            pickle.dump(dataset_val, f)

    if not is_debug and load_pickle:
        with open(f"../input/feature_engineering/model142/train.pickle",
                  "rb") as f:
            dataset_train = pickle.load(f)
        with open(f"../input/feature_engineering/model142/val.pickle",
                  "rb") as f:
            dataset_val = pickle.load(f)
        print("loaded!")
    dataloader_train = DataLoader(dataset_train,
                                  batch_size=params["batch_size"],
                                  shuffle=True,
                                  num_workers=1)
    dataloader_val = DataLoader(dataset_val,
                                batch_size=params["batch_size"],
                                shuffle=False,
                                num_workers=1)

    model = SAKTModel(n_skill,
                      embed_dim=params["embed_dim"],
                      max_seq=params["max_seq"],
                      dropout=dropout)

    param_optimizer = list(model.named_parameters())
    no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
    optimizer_grouped_parameters = [{
        'params':
        [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
        'weight_decay':
        0.01
    }, {
        'params':
        [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
        'weight_decay':
        0.0
    }]

    optimizer = AdamW(
        optimizer_grouped_parameters,
        lr=params["lr"],
        weight_decay=0.01,
    )
    num_train_optimization_steps = int(len(dataloader_train) * epochs)
    scheduler = get_linear_schedule_with_warmup(
        optimizer,
        num_warmup_steps=params["num_warmup_steps"],
        num_training_steps=num_train_optimization_steps)
    criterion = nn.BCEWithLogitsLoss()

    model.to(device)
    criterion.to(device)

    for epoch in range(epochs):
        loss, acc, auc, auc_val = train_epoch(model, dataloader_train,
                                              dataloader_val, optimizer,
                                              criterion, scheduler, device)
        print("epoch - {} train_loss - {:.3f} auc - {:.4f} auc-val: {:.4f}".
              format(epoch, loss, auc, auc_val))

    preds = []
    labels = []
    with torch.no_grad():
        for item in tqdm(dataloader_val):
            label = item["label"].to(device).float()

            output = model(item, device)

            preds.extend(torch.nn.Sigmoid()(
                output[:, -1]).view(-1).data.cpu().numpy().tolist())
            labels.extend(label[:, -1].view(-1).data.cpu().numpy().tolist())

    auc_transformer = roc_auc_score(labels, preds)
    print("single transformer: {:.4f}".format(auc_transformer))
    df_oof = pd.DataFrame()
    # df_oof["row_id"] = df.loc[val_idx].index
    print(len(dataloader_val))
    print(len(preds))
    df_oof["predict"] = preds
    df_oof["target"] = labels

    df_oof.to_csv(f"{output_dir}/transformers1.csv", index=False)
    """
    df_oof2 = pd.read_csv("../output/ex_237/20201213110353/oof_train_0_lgbm.csv")
    df_oof2.columns = ["row_id", "predict_lgbm", "target"]
    df_oof2 = pd.merge(df_oof, df_oof2, how="inner")

    auc_lgbm = roc_auc_score(df_oof2["target"].values, df_oof2["predict_lgbm"].values)
    print("lgbm: {:.4f}".format(auc_lgbm))

    print("ensemble")
    max_auc = 0
    max_nn_ratio = 0
    for r in np.arange(0, 1.05, 0.05):
        auc = roc_auc_score(df_oof2["target"].values, df_oof2["predict_lgbm"].values*(1-r) + df_oof2["predict"].values*r)
        print("[nn_ratio: {:.2f}] AUC: {:.4f}".format(r, auc))

        if max_auc < auc:
            max_auc = auc
            max_nn_ratio = r
    print(len(df_oof2))
    """
    if not is_debug:
        mlflow.start_run(experiment_id=10, run_name=os.path.basename(__file__))

        for key, value in params.items():
            mlflow.log_param(key, value)
        mlflow.log_metric("auc_val", auc_transformer)
        mlflow.end_run()
    torch.save(model.state_dict(), f"{output_dir}/transformers.pth")
    del model
    torch.cuda.empty_cache()
    with open(f"{output_dir}/transformer_param.json", "w") as f:
        json.dump(params, f)
    if is_make_feature_factory:
        # feature factory
        feature_factory_dict = {"user_id": {}}
        feature_factory_dict["user_id"][
            "DurationPreviousContent"] = DurationPreviousContent(
                is_partial_fit=True)
        feature_factory_dict["user_id"][
            "ElapsedTimeBinningEncoder"] = ElapsedTimeBinningEncoder()
        feature_factory_manager = FeatureFactoryManager(
            feature_factory_dict=feature_factory_dict,
            logger=logger,
            split_num=1,
            model_id="all",
            load_feature=not is_debug,
            save_feature=not is_debug)

        ff_for_transformer = FeatureFactoryForTransformer(
            column_config=column_config,
            dict_path="../feature_engineering/",
            sequence_length=params["max_seq"],
            logger=logger)
        df = pd.read_pickle(
            "../input/riiid-test-answer-prediction/train_merged.pickle")
        if is_debug:
            df = df.head(10000)
        df = df.sort_values(["user_id", "timestamp"]).reset_index(drop=True)
        feature_factory_manager.fit(df)
        df = feature_factory_manager.all_predict(df)
        for dicts in feature_factory_manager.feature_factory_dict.values():
            for factory in dicts.values():
                factory.logger = None
        feature_factory_manager.logger = None
        with open(f"{output_dir}/feature_factory_manager.pickle", "wb") as f:
            pickle.dump(feature_factory_manager, f)

        ff_for_transformer.fit(df)
        ff_for_transformer.logger = None
        with open(
                f"{output_dir}/feature_factory_manager_for_transformer.pickle",
                "wb") as f:
            pickle.dump(ff_for_transformer, f)
Example #30
0
File: main.py Project: Daiver/jff
import mlflow
import os
import time
from mlflow import log_metric, log_param, log_artifact


if __name__ == "__main__":
    mlflow.set_experiment("First")
    with mlflow.start_run():
        # Log a parameter (key-value pair)
        log_param("param1", 5)

        # Log a metric; metrics can be updated throughout the run
        for i in range(200):
            time.sleep(0.1)
            log_metric("foo1", 1 * i)
            log_metric("foo2", 2 * i)
            # foo3 .. foo16 all follow the same 3 * i series
            for n in range(3, 17):
                log_metric(f"foo{n}", 3 * i)
Example #31
0
def train_torch_KFold(train_df: pd.DataFrame, feat_cols: List[str],
                      target_cols: List[str], model_name: str,
                      model_param: DictConfig, train_param: DictConfig,
                      cv_param: DictConfig, optimizer: DictConfig,
                      scheduler: DictConfig, loss_function: DictConfig,
                      OUT_DIR: str) -> None:
    '''
    1. Create model
    2. Split training data into folds
    3. Train model
    4. Calculate validation metrics
    5. Calculate average validation metrics
    '''
    # store scores in schema: {'train-acc': {'vsfold': [0.1, 0.8, ...], 'avg': 0.005}, ...}
    metrics = ['train-acc', 'valid-acc', 'train-auc', 'valid-auc']
    scores: dict = {}
    for metric in metrics:
        scores[metric] = {'vsfold': [], 'avg': None}

    device = get_device()
    target = target_cols[0]

    kf = KFold(**cv_param)
    for fold, (tr, te) in enumerate(
            kf.split(train_df[target].values, train_df[target].values)):
        print(
            f'Starting fold: {fold}, train size: {len(tr)}, validation size: {len(te)}'
        )

        # split data
        train = train_df.loc[tr, :]
        valid = train_df.loc[te, :]
        y_tr = train_df.loc[tr, target]
        y_val = train_df.loc[te, target]

        # create dataset
        train_set = TitanicDataset(train, feat_cols, target_cols)
        train_loader = DataLoader(train_set,
                                  batch_size=train_param.batch_size,
                                  shuffle=True,
                                  num_workers=4)
        valid_set = TitanicDataset(valid, feat_cols, target_cols)
        valid_loader = DataLoader(valid_set,
                                  batch_size=train_param.batch_size,
                                  shuffle=False,
                                  num_workers=4)

        torch.cuda.empty_cache()
        model = get_model(model_name,
                          model_param,
                          optimizer,
                          scheduler,
                          loss_function,
                          feat_cols=feat_cols,
                          target_cols=target_cols,
                          fold=fold,
                          device=device)

        early_stop_callback = EarlyStopping(
            'val-loss_vsepoch',
            patience=train_param.early_stopping_rounds,
            mode='min')  # the monitored quantity is a loss, so stop when it stops decreasing

        trainer = pl.Trainer(
            max_epochs=train_param.epochs,
            fast_dev_run=False,  # TODO: pass from param!
            callbacks=[early_stop_callback],
        )
        trainer.fit(model, train_loader, valid_loader)

        # log model
        torch.save(model.state_dict(), f'{OUT_DIR}/model_{fold}.pth')

        # log metrics per fold
        pred_tr = inference_fn(model, train_loader, device, target_cols)
        pred_val = inference_fn(model, valid_loader, device, target_cols)
        pred_tr = np.where(pred_tr >= 0.5, 1, 0).astype(int)
        pred_val = np.where(pred_val >= 0.5, 1, 0).astype(int)
        score = {
            metrics[0]: accuracy_score(y_tr, pred_tr),
            metrics[1]: accuracy_score(y_val, pred_val),
            metrics[2]: roc_auc_score(y_tr, pred_tr),
            metrics[3]: roc_auc_score(y_val, pred_val)
        }
        for metric in metrics:
            scores[metric]['vsfold'].append(score[metric])
            mlflow.log_metric(f'{metric}_vsfold', score[metric], step=fold)

    # log metrics averaged over folds
    for metric in metrics:
        scores[metric]['avg'] = np.array(scores[metric]['vsfold']).mean()
        mlflow.log_metric(f'{metric}_foldavg', scores[metric]['avg'])

    print('End training')
    return None
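
The function only touches a few fields of the config objects it receives: `cv_param` is unpacked into `KFold(**cv_param)`, and `train_param` supplies `batch_size`, `epochs` and `early_stopping_rounds`. A hedged sketch of assembling those two pieces with OmegaConf (values illustrative; the model/optimizer/scheduler/loss configs depend on `get_model`, which is not shown):

from omegaconf import OmegaConf

cv_param = OmegaConf.create({"n_splits": 5, "shuffle": True, "random_state": 42})
train_param = OmegaConf.create({
    "batch_size": 64,            # illustrative
    "epochs": 30,                # illustrative
    "early_stopping_rounds": 5,  # illustrative
})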