Example #1
	def test_split_data(self):

		input_file = 'data/test_questions.txt'
		x_train, y_train, x_test, y_test = split_data(0.8)
		self.assertNotEqual(x_train, {})
		self.assertNotEqual(y_train, {})
		self.assertNotEqual(x_test, {})
		self.assertNotEqual(y_test, {})
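Example #1 only pins down `split_data` loosely: called with a train fraction, it should return four non-empty collections (note that `input_file` is defined but never passed). A minimal sketch of a function satisfying those assertions, assuming the questions live in a tab-separated "label<TAB>question" text file; the file format, the default path, and the shuffling step are assumptions, not taken from the example:

import random

def split_data(train_ratio, path='data/test_questions.txt'):
    # Assumed file format: one "label<TAB>question" pair per line.
    with open(path, encoding='utf-8') as f:
        pairs = [line.rstrip('\n').split('\t', 1) for line in f if line.strip()]
    random.shuffle(pairs)
    cut = int(len(pairs) * train_ratio)
    x_train = [question for _, question in pairs[:cut]]
    y_train = [label for label, _ in pairs[:cut]]
    x_test = [question for _, question in pairs[cut:]]
    y_test = [label for label, _ in pairs[cut:]]
    return x_train, y_train, x_test, y_test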
Example #2
def train_loop(exe, train_progm, init, num_iters, train_data, dev_count,
               sum_cost, avg_cost, lr_scheduler, token_num, predict):

    data_input_names = encoder_data_input_fields + \
        decoder_data_input_fields[:-1] + label_data_input_fields
    util_input_names = encoder_util_input_fields + decoder_util_input_fields

    start_time = time.time()
    exec_time = 0.0
    for batch_id, data in enumerate(train_data()):
        if batch_id >= num_iters:
            break
        feed_list = []
        total_num_token = 0
        for place_id, data_buffer in enumerate(
                split_data(data, num_part=dev_count)):
            data_input_dict, util_input_dict, num_token = prepare_batch_input(
                data_buffer, data_input_names, util_input_names,
                ModelHyperParams.eos_idx, ModelHyperParams.eos_idx,
                ModelHyperParams.n_head, ModelHyperParams.d_model)
            total_num_token += num_token
            feed_kv_pairs = list(data_input_dict.items()) + list(
                util_input_dict.items())
            lr_rate = lr_scheduler.update_learning_rate()
            feed_kv_pairs += [(lr_scheduler.learning_rate.name, lr_rate)]
            feed_list.append(dict(feed_kv_pairs))

            if not init:
                for pos_enc_param_name in pos_enc_param_names:
                    pos_enc = position_encoding_init(
                        ModelHyperParams.max_length + 1,
                        ModelHyperParams.d_model)
                    feed_list[place_id][pos_enc_param_name] = pos_enc
        for feed_dict in feed_list:
            feed_dict[sum_cost.name + "@GRAD"] = 1. / total_num_token

        exe_start_time = time.time()
        if dev_count > 1:
            # parallel executor
            outs = exe.run(fetch_list=[sum_cost.name, token_num.name],
                           feed=feed_list)
        else:
            # executor
            outs = exe.run(fetch_list=[sum_cost, token_num], feed=feed_list[0])
        exec_time += time.time() - exe_start_time

        sum_cost_val, token_num_val = np.array(outs[0]), np.array(outs[1])
        total_sum_cost = sum_cost_val.sum()  # sum the cost from multi-devices
        total_token_num = token_num_val.sum()
        total_avg_cost = total_sum_cost / total_token_num
        print("batch: %d, sum loss: %f, avg loss: %f, ppl: %f" %
              (batch_id, total_sum_cost, total_avg_cost,
               np.exp([min(total_avg_cost, 100)])))
        init = True
    return time.time() - start_time, exec_time
Example #3
    def test_split(self):
        '''Tests the split of the data into validation and training sets'''
        raw_data = load_data()
        pp_data = pre_processing(raw_data)
        (x_tr, y_tr, x_vl, y_vl) = split_data(pp_data)

        self.assertIsNotNone(x_tr)
        self.assertIsNotNone(x_vl)
        self.assertIsNotNone(y_tr)
        self.assertIsNotNone(y_vl)

        ratio_x = len(x_vl) / (len(x_vl) + len(x_tr))
        ratio_y = len(y_vl) / (len(y_vl) + len(y_tr))
        self.assertAlmostEqual(ratio_x, 0.2, places=1)
        self.assertAlmostEqual(ratio_y, 0.2, places=1)
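The ratio assertions in Example #3 constrain `split_data` to an 80/20 train/validation split returned as `(x_train, y_train, x_val, y_val)`. A minimal sketch consistent with those checks, assuming `pre_processing` yields a `(features, labels)` pair of equal-length NumPy arrays (an assumption; the example does not show that function):

import numpy as np

def split_data(pp_data, val_ratio=0.2, seed=0):
    # pp_data is assumed to be a (features, labels) pair of equal-length arrays.
    x, y = pp_data
    rng = np.random.default_rng(seed)
    idx = rng.permutation(len(x))
    n_val = int(len(x) * val_ratio)
    val_idx, tr_idx = idx[:n_val], idx[n_val:]
    return x[tr_idx], y[tr_idx], x[val_idx], y[val_idx]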
Example #4
def test_split_data():
    test_data = {
        'id': [0, 1, 2, 3, 4],
        'target': [0, 0, 1, 0, 1],
        'col1': [1, 2, 3, 4, 5],
        'col2': [2, 1, 1, 2, 1]
    }

    data_df = pd.DataFrame(data=test_data)
    data = split_data(data_df)

    # verify that columns were removed correctly
    assert "target" not in data[0].data.columns
    assert "id" not in data[0].data.columns
    assert "col1" in data[0].data.columns

    # verify that data was split as desired
    assert data[0].data.shape == (4, 2)
    assert data[1].data.shape == (1, 2)
Example #5
def test_split_data():
    test_data = {
        'id': [0, 1, 2, 3, 4],
        'target': [0, 0, 1, 0, 1],
        'col1': [1, 2, 3, 4, 5],
        'col2': [2, 1, 1, 2, 1]
    }

    data_df = pd.DataFrame(data=test_data)
    data = split_data(data_df)

    # verify that columns were removed correctly
    assert "target" not in data[0].data.columns
    assert "id" not in data[0].data.columns
    assert "col1" in data[0].data.columns

    # verify that data was split as desired
    assert data[0].data.shape == (4, 2)
    assert data[1].data.shape == (1, 2)

    # the valid_data set's raw data is used for metric calculation, so
    # free_raw_data should be False
    assert not data[1].free_raw_data
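Examples #4 and #5 describe yet another `split_data`: it drops the `id` and `target` columns, splits the five rows 4/1, and wraps both pieces in LightGBM `Dataset` objects, keeping the raw validation frame available for metric calculation. A minimal sketch satisfying those assertions; the use of `train_test_split` and the fixed random seed are assumptions:

import lightgbm as lgb
from sklearn.model_selection import train_test_split

def split_data(data_df):
    # Drop the identifier and label columns from the features.
    features = data_df.drop(columns=['id', 'target'])
    labels = data_df['target']
    x_train, x_valid, y_train, y_valid = train_test_split(
        features, labels, test_size=0.2, random_state=0)
    train_data = lgb.Dataset(x_train, label=y_train)
    # Keep the raw validation frame around so metrics can be computed on it.
    valid_data = lgb.Dataset(x_valid, label=y_valid,
                             reference=train_data, free_raw_data=False)
    return train_data, valid_data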
Example #6
def main():
    print("Running train_aml.py")

    parser = argparse.ArgumentParser("train")
    parser.add_argument(
        "--model_name",
        type=str,
        help="Name of the Model",
        default="insure_model_model.pkl",
    )

    parser.add_argument("--step_output",
                        type=str,
                        help=("output for passing data to next step"))

    parser.add_argument("--dataset_version",
                        type=str,
                        help=("dataset version"))

    parser.add_argument("--data_file_path",
                        type=str,
                        help=("data file path, if specified,\
               a new version of the dataset will be registered"))

    parser.add_argument(
        "--caller_run_id",
        type=str,
        help=("caller run id, for example ADF pipeline run id"))

    parser.add_argument("--dataset_name",
                        type=str,
                        help=("Dataset name. Dataset must be passed by name\
              to always get the desired dataset version\
              rather than the one used while the pipeline creation"))

    args = parser.parse_args()

    print("Argument [model_name]: %s" % args.model_name)
    print("Argument [step_output]: %s" % args.step_output)
    print("Argument [dataset_version]: %s" % args.dataset_version)
    print("Argument [data_file_path]: %s" % args.data_file_path)
    print("Argument [caller_run_id]: %s" % args.caller_run_id)
    print("Argument [dataset_name]: %s" % args.dataset_name)

    model_name = args.model_name
    step_output_path = args.step_output
    dataset_version = args.dataset_version
    data_file_path = args.data_file_path
    dataset_name = args.dataset_name

    run = Run.get_context()

    print("Getting training parameters")

    # Load the training parameters from the parameters file
    with open("parameters.json") as f:
        pars = json.load(f)
    try:
        train_args = pars["training"]
    except KeyError:
        print("Could not load training values from file")
        train_args = {}

    # Log the training parameters
    print(f"Parameters: {train_args}")
    for (k, v) in train_args.items():
        run.log(k, v)
        run.parent.log(k, v)

    # Get the dataset
    if (dataset_name):
        if (data_file_path == 'none'):
            dataset = Dataset.get_by_name(run.experiment.workspace,
                                          dataset_name,
                                          dataset_version)  # NOQA: E402, E501
        else:
            dataset = register_dataset(run.experiment.workspace, dataset_name,
                                       os.environ.get("DATASTORE_NAME"),
                                       data_file_path)
    else:
        e = ("No dataset provided")
        print(e)
        raise Exception(e)

    # Link dataset to the step run so it is trackable in the UI
    run.input_datasets['training_data'] = dataset
    run.parent.tag("dataset_id", value=dataset.id)

    # Split the data into test/train
    df = dataset.to_pandas_dataframe()
    data = split_data(df)

    # Train the model
    model = train_model(data, train_args)

    # Evaluate and log the metrics returned from the train function
    metrics = get_model_metrics(model, data[1])
    for (k, v) in metrics.items():
        run.log(k, v)
        run.parent.log(k, v)

    # Pass model file to next step
    os.makedirs(step_output_path, exist_ok=True)
    model_output_path = os.path.join(step_output_path, model_name)
    joblib.dump(value=model, filename=model_output_path)

    # Also upload model file to run outputs for history
    os.makedirs('outputs', exist_ok=True)
    output_path = os.path.join('outputs', model_name)
    joblib.dump(value=model, filename=output_path)

    run.tag("run_type", value="train")
    print(f"tags now present for run: {run.tags}")

    run.complete()
Example #7
def main():
    print("Running train_aml.py")

    parser = argparse.ArgumentParser("train")
    parser.add_argument("--model_name", type=str, help="Name of the Model")

    parser.add_argument("--step_output",
                        type=str,
                        help=("output for passing data to next step"))

    parser.add_argument("--data_file_path",
                        type=str,
                        help=("data file path, if specified,\
               a new version of the dataset will be registered"))

    parser.add_argument("--dataset_name",
                        type=str,
                        help=("Dataset name. Dataset must be passed by name\
              to always get the desired dataset version\
              rather than the one used while the pipeline creation"))

    parser.add_argument("--datastore_name", type=str, help=("Datastore name."))

    parser.add_argument(
        "--ml_params",
        type=str,
        help=("Parameters for the ML pipeline in JSON format, "
              "with defaults defined in parameters.json"),
    )
    args = parser.parse_args()

    print("Argument [model_name]: %s" % args.model_name)
    print("Argument [step_output]: %s" % args.step_output)
    print("Argument [data_file_path]: %s" % args.data_file_path)
    print("Argument [dataset_name]: %s" % args.dataset_name)
    print("Argument [datastore_name]: %s" % args.datastore_name)
    print("Argument [ml_params]: %s" % args.ml_params)

    model_name = args.model_name
    step_output_path = args.step_output
    data_file_path = args.data_file_path
    dataset_name = args.dataset_name
    datastore_name = args.datastore_name

    run = Run.get_context()
    training_args, preprocessing_args = parse_ml_params(run, args.ml_params)

    # Get the dataset
    dataset = get_or_register_dataset(dataset_name=dataset_name,
                                      datastore_name=datastore_name,
                                      data_file_path=data_file_path,
                                      aml_workspace=run.experiment.workspace)

    # Link dataset to the step run so it is trackable in the UI
    run.input_datasets['training_data'] = dataset
    run.parent.tag("dataset_id", value=dataset.id)

    # Train the model
    # mount the dynamic version of the dataset, which can't be determined at pipeline publish time  # NOQA: E501
    mount_context = dataset.mount()
    mount_context.start()
    print(f"mount_point is: {mount_context.mount_point}")
    data = split_data(mount_context.mount_point, preprocessing_args)
    model, history = train_model(data, training_args, preprocessing_args)
    mount_context.stop()

    # Evaluate and log the metrics returned from the train function
    metrics = get_model_metrics(history)
    for (k, v) in metrics.items():
        run.log(k, v)
        run.parent.log(k, v)

    # Pass model file to next step
    os.makedirs(step_output_path, exist_ok=True)
    model_output_path = os.path.join(step_output_path, model_name)
    model.save(model_output_path)
    with open(os.path.join(step_output_path, "run_id.txt"), "w") as text_file:
        print(f"{run.id}", file=text_file)

    # Also upload model file to run outputs for history
    os.makedirs('outputs', exist_ok=True)
    output_path = os.path.join('outputs', model_name)
    model.save(output_path)

    run.tag("run_type", value="train")
    print(f"tags now present for run: {run.tags}")

    run.complete()
Example #8

if __name__ == "__main__":

    print('\nCARREGANDO DADOS...')
    data = load_data()
    start = time.time()
    min_max(data)
    for exam_type, name in [(0, 'IGG'), (1, 'IGM'), (2, 'PCR')]:

        print('\n------------------------\n')
        print('[REDE ' + name + '] SEPARANDO DADOS PARA TREINAMENTO...')
        processed_data = process_data(data, exam_type)

        print('[REDE ' + name + '] ADEQUANDO DADOS PARA TREINAMENTO...')
        splitted_data = split_data(processed_data)

        print('[REDE ' + name + '] MONTANDO MODELO...')
        x_train = splitted_data[0]
        y_train = splitted_data[2]

        print('[REDE ' + name + '] ' + str(len(x_train)) + \
                ' ELEMENTOS NO CONJUNTO DE TREINAMENTO...')
        print('[REDE ' + name + '] ' + str(len(y_train)) + \
                ' ELEMENTOS NO CONJUNTO DE VALIDAÇÃO...')

        print('[REDE ' + name + '] INSTANCIANDO REDES...')
        network = NN2(x_train.shape[1])
        optimizer = torch.optim.Adam(network.parameters(), lr=LEARNING_RATE)
        criterion = torch.nn.BCELoss()
Example #9
def main():

    # test = os.environ.get("TEST_DATA")
    # train_data = os.environ.get("TRAINING_DATA")

    TRAINING_DATA_DIR = os.environ.get("TRAINING_DATA")
    TEST_DATA = os.environ.get("TEST_DATA")

    train_data = pd.read_csv(TRAINING_DATA_DIR)
    test = pd.read_csv(TEST_DATA)

    add_columns = train.addingColumns(train_data, test)
    data, country_dict, all_data = train.addingWolrd(add_columns)

    # le = preprocessing.LabelEncoder()

    # Select train (real) data from March 1 to March 22nd

    dates_list = [
        '2020-03-01', '2020-03-02', '2020-03-03', '2020-03-04', '2020-03-05',
        '2020-03-06', '2020-03-07', '2020-03-08', '2020-03-09', '2020-03-10',
        '2020-03-11', '2020-03-12', '2020-03-13', '2020-03-14', '2020-03-15',
        '2020-03-16', '2020-03-17', '2020-03-18', '2020-03-19', '2020-03-20',
        '2020-03-21', '2020-03-22', '2020-03-23', '2020-03-24'
    ]

    # Filter Spain, run the Linear Regression workflow
    # country_name = "Spain"	country_name = "Spain"
    #
    country_name = os.environ.get("COUNTRY")

    day_start = 39
    data_country = data[data['Country/Region'] == country_dict[country_name]]
    data_country = data_country.loc[data_country['Day_num'] >= day_start]
    X_train, Y_train_1, Y_train_2, X_test = train.split_data(data_country)
    model, pred = train.lin_reg(X_train, Y_train_1, X_test)

    # Create a df with both real cases and predictions (predictions starting on March 12th)
    X_train_check = X_train.copy()
    X_train_check['Target'] = Y_train_1

    X_test_check = X_test.copy()
    X_test_check['Target'] = pred

    X_final_check = pd.concat([X_train_check, X_test_check])

    # Select predictions from March 1st to March 24th
    predicted_data = X_final_check.loc[(X_final_check['Day_num'].isin(
        list(range(day_start, day_start + len(dates_list)))))].Target
    real_data = train_data.loc[
        (train_data['Country/Region'] == country_name)
        & (train_data['Date'].isin(dates_list))]['ConfirmedCases']
    dates_list_num = list(range(0, len(dates_list)))

    # Plot results
    fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(15, 6))

    ax1.plot(dates_list_num, np.exp(predicted_data))
    ax1.plot(dates_list_num, real_data)
    ax1.axvline(10, linewidth=2, ls=':', color='grey', alpha=0.5)
    ax1.legend(['Predicted cases', 'Actual cases', 'Train-test split'],
               loc='upper left')
    ax1.set_xlabel("Day count (from March 1st to March 22nd)")
    ax1.set_ylabel("Confirmed Cases")

    ax2.plot(dates_list_num, predicted_data)
    ax2.plot(dates_list_num, np.log(real_data))
    ax2.axvline(10, linewidth=2, ls=':', color='grey', alpha=0.5)
    ax2.legend(['Predicted cases', 'Actual cases', 'Train-test split'],
               loc='upper left')
    ax2.set_xlabel("Day count (from March 1st to March 22nd)")
    ax2.set_ylabel("Log Confirmed Cases")

    plt.suptitle(
        ("ConfirmedCases predictions based on Linear Regression for " +
         country_name))

    plt.show()
Example #10
def main():
    print("Running train_aml.py")

    parser = argparse.ArgumentParser("train")
    parser.add_argument(
        "--model_name",
        type=str,
        help="Name of the Model",
        default="insurance_model.pkl",
    )

    parser.add_argument(
        "--data_file_path",
        type=str,
        help=("data file path, if specified, "
              "a new version of the dataset will be registered"),
        default="insurance",
    )

    parser.add_argument(
        "--dataset_name",
        type=str,
        help="Dataset name",
        default="insurance_dataset",
    )

    args = parser.parse_args()

    print("Argument [model_name]: %s" % args.model_name)
    print("Argument [data_file_path]: %s" % args.data_file_path)
    print("Argument [dataset_name]: %s" % args.dataset_name)

    model_name = args.model_name
    data_file_path = args.data_file_path
    dataset_name = args.dataset_name

    run = Run.get_context()

    print("Getting training parameters")

    # Load the training parameters from the parameters file
    with open("parameters.json") as f:
        pars = json.load(f)
    try:
        train_args = pars["training"]
    except KeyError:
        print("Could not load training values from file")
        train_args = {}

    # Log the training parameters
    print(f"Parameters: {train_args}")
    for (k, v) in train_args.items():
        run.log(k, v)
        #run.parent.log(k, v)

    # Get the dataset
    if (dataset_name):
        if (data_file_path == 'none'):
            dataset = Dataset.get_by_name(run.experiment.workspace,
                                          dataset_name)  # NOQA: E402, E501
        else:
            dataset = register_dataset(run.experiment.workspace, dataset_name,
                                       "workspaceblobstore", data_file_path)
    else:
        e = ("No dataset provided")
        print(e)
        raise Exception(e)

    # Link dataset to the step run so it is trackable in the UI
    run.input_datasets['training_data'] = dataset
    #run.parent.tag("dataset_id", value=dataset.id)

    # Split the data into test/train
    df = dataset.to_pandas_dataframe()
    data = split_data(df)

    # Train the model
    model = train_model(data, train_args)

    # Evaluate and log the metrics returned from the train function
    metrics = get_model_metrics(model, data)
    for (k, v) in metrics.items():
        run.log(k, v)
        #run.parent.log(k, v)

    # Also upload model file to run outputs for history
    os.makedirs('outputs', exist_ok=True)
    output_path = os.path.join('outputs', model_name)
    joblib.dump(value=model, filename=output_path)

    run.tag("run_type", value="train")
    print(f"tags now present for run: {run.tags}")

    # upload the model file explicitly into artifacts
    print("Uploading the model into run artifacts...")
    run.upload_file(name="./outputs/models/" + model_name,
                    path_or_stream=output_path)
    print("Uploaded the model {} to experiment {}".format(
        model_name, run.experiment.name))
    dirpath = os.getcwd()
    print(dirpath)
    print("Following files are uploaded ")
    print(run.get_file_names())

    run.complete()