Example #1
def split_data(
        x_path: InputPath(),
        y_path: InputPath(),
        train_x_path: OutputPath(str),
        test_x_path: OutputPath(str),
        train_y_path: OutputPath(str),
        test_y_path: OutputPath(str),
):
    import numpy
    from sklearn.model_selection import train_test_split
    iris_x = numpy.load(x_path)
    iris_y = numpy.load(y_path)
    train_x, test_x, train_y, test_y = train_test_split(iris_x,
                                                        iris_y,
                                                        test_size=0.2)
    print(f"Training data size - X: {train_x.size}, y: {train_y.size}")
    print(f"Testing data size - X: {test_x.size}, y: {test_y.size}")

    with open(train_x_path, 'wb') as f:
        numpy.save(f, train_x)
    with open(test_x_path, 'wb') as f:
        numpy.save(f, test_x)
    with open(train_y_path, 'wb') as f:
        numpy.save(f, train_y)
    with open(test_y_path, 'wb') as f:
        numpy.save(f, test_y)
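On their own these functions are plain Python; they only become pipeline steps once wrapped as components. A minimal sketch, assuming the KFP v1 SDK and that numpy and scikit-learn are the only extra dependencies (the op name, base image, and package list are illustrative):
import kfp.components as comp

# Hypothetical wrapping of split_data; image and packages are placeholders.
split_data_op = comp.create_component_from_func(
    split_data,
    base_image='python:3.9',
    packages_to_install=['numpy', 'scikit-learn'],
)
When the component runs, the SDK supplies concrete container paths for every InputPath/OutputPath argument and strips the _path suffix from the corresponding input and output names.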
Example #2
def fetch_data_op(anomalous_data_path: OutputPath(str),
                  non_anomalous_data_path: OutputPath(str)):
    import sys, subprocess
    subprocess.run([sys.executable, '-m', 'pip', 'install', 'pandas'])

    import os
    import pandas as pd

    master_url_root = "https://raw.githubusercontent.com/numenta/NAB/master/data/"

    df_small_noise_url_suffix = "artificialNoAnomaly/art_daily_small_noise.csv"
    df_small_noise_url = master_url_root + df_small_noise_url_suffix
    df_small_noise = pd.read_csv(df_small_noise_url,
                                 parse_dates=True,
                                 index_col="timestamp")

    df_daily_jumpsup_url_suffix = "artificialWithAnomaly/art_daily_jumpsup.csv"
    df_daily_jumpsup_url = master_url_root + df_daily_jumpsup_url_suffix
    df_daily_jumpsup = pd.read_csv(df_daily_jumpsup_url,
                                   parse_dates=True,
                                   index_col="timestamp")

    print("Non-anomalous Data")
    print(df_small_noise.describe())

    print("Anomalous Data")
    print(df_daily_jumpsup.describe())

    df_daily_jumpsup.to_csv(anomalous_data_path)
    df_small_noise.to_csv(non_anomalous_data_path)

    print(f'Anomalous data saved to {anomalous_data_path}')
    print(f'Non-anomalous data saved to {non_anomalous_data_path}')
Example #3
def consume_file_path(
        number: int,
        number_1a_path: str,
        number_1b_file: str,
        number_1c_file_path: str,
        number_1d_path_file: str,
        number_2a_path: InputPath(str),
        number_2b_file: InputPath(str),
        number_2c_file_path: InputPath(str),
        number_2d_path_file: InputPath(str),
        number_3a_path: InputTextFile(str),
        number_3b_file: InputTextFile(str),
        number_3c_file_path: InputTextFile(str),
        number_3d_path_file: InputTextFile(str),
        number_4a_path: InputBinaryFile(str),
        number_4b_file: InputBinaryFile(str),
        number_4c_file_path: InputBinaryFile(str),
        number_4d_path_file: InputBinaryFile(str),
        output_number_2a_path: OutputPath(str),
        output_number_2b_file: OutputPath(str),
        output_number_2c_file_path: OutputPath(str),
        output_number_2d_path_file: OutputPath(str),
        output_number_3a_path: OutputTextFile(str),
        output_number_3b_file: OutputTextFile(str),
        output_number_3c_file_path: OutputTextFile(str),
        output_number_3d_path_file: OutputTextFile(str),
        output_number_4a_path: OutputBinaryFile(str),
        output_number_4b_file: OutputBinaryFile(str),
        output_number_4c_file_path: OutputBinaryFile(str),
        output_number_4d_path_file: OutputBinaryFile(str),
):
    pass
Example #4
def preprocess(uri: str, some_int: int, output_parameter_one: OutputPath(int),
               output_dataset_one: OutputPath('Dataset')):
    '''Dummy Preprocess Step.'''
    with open(output_dataset_one, 'w') as f:
        f.write('Output dataset')
    with open(output_parameter_one, 'w') as f:
        f.write("{}".format(1234))
def fetch_data(x_path: OutputPath(str), y_path: OutputPath(str)):
    # download data locally
    from sklearn import datasets

    x, y = datasets.load_breast_cancer(return_X_y=True, as_frame=True)
    x.to_parquet(x_path)
    y.to_frame().to_parquet(y_path)
Example #6
def split_dataset_huggingface(
        dataset_dict_path: InputPath('HuggingFaceDatasetDict'),
        dataset_split_path: OutputPath('HuggingFaceDataset'),
        dataset_path: OutputPath('HuggingFaceArrowDataset'),
        # dataset_indices_path: OutputPath('HuggingFaceArrowDataset'),
        dataset_info_path: OutputPath(dict),
        dataset_state_path: OutputPath(dict),
        split_name: str = None,
):
    import os
    import shutil
    from datasets import config as datasets_config

    print(f'DatasetDict contents: {os.listdir(dataset_dict_path)}')
    shutil.copytree(os.path.join(dataset_dict_path, split_name),
                    dataset_split_path)
    print(
        f'Dataset contents: {os.listdir(os.path.join(dataset_dict_path, split_name))}'
    )
    shutil.copy(
        os.path.join(dataset_dict_path, split_name,
                     datasets_config.DATASET_ARROW_FILENAME), dataset_path)
    # shutil.copy(os.path.join(dataset_dict_path, split_name, datasets_config.DATASET_INDICES_FILENAME), dataset_indices_path)
    shutil.copy(
        os.path.join(dataset_dict_path, split_name,
                     datasets_config.DATASET_INFO_FILENAME), dataset_info_path)
    shutil.copy(
        os.path.join(dataset_dict_path, split_name,
                     datasets_config.DATASET_STATE_JSON_FILENAME),
        dataset_state_path)
def preprocess(
    # An input parameter of type string.
    message: str,
    # Use Output[T] to get a metadata-rich handle to the output artifact
    # of type `Dataset`.
    output_dataset_one: Output[Dataset],
    # A locally accessible filepath for another output artifact of type
    # `Dataset`.
    output_dataset_two_path: OutputPath('Dataset'),
    # A locally accessible filepath for an output parameter of type string.
    output_parameter_path: OutputPath(str)):
    '''Dummy preprocessing step'''

    # Use Dataset.path to access a local file path for writing.
    # One can also use Dataset.uri to access the actual URI file path.
    with open(output_dataset_one.path, 'w') as f:
        f.write(message)

    # OutputPath is used to just pass the local file path of the output artifact
    # to the function.
    with open(output_dataset_two_path, 'w') as f:
        f.write(message)

    with open(output_parameter_path, 'w') as f:
        f.write(message)
Example #8
def load_data(x_path: OutputPath(str), y_path: OutputPath(str)):
    import numpy
    from sklearn import datasets
    iris_x, iris_y = datasets.load_iris(return_X_y=True)
    with open(x_path, 'wb') as f:
        numpy.save(f, iris_x)
    with open(y_path, 'wb') as f:
        numpy.save(f, iris_y)
def sum_multiply_numbers(numbers_path: InputPath(str),
                         sum_path: OutputPath(str),
                         product_path: OutputPath(str)):
    sum = 0
    product = 1
    with open(numbers_path, 'r') as reader:
        for line in reader:
            sum = sum + int(line)
            product = product * int(line)
    with open(sum_path, 'w') as writer:
        writer.write(str(sum))
    with open(product_path, 'w') as writer:
        writer.write(str(product))
Example #10
def split_text_lines(source_path: InputPath(str),
                     odd_lines_path: OutputPath(str),
                     even_lines_path: OutputPath(str)):
    with open(source_path, 'r') as reader:
        with open(odd_lines_path, 'w') as odd_writer:
            with open(even_lines_path, 'w') as even_writer:
                while True:
                    line = reader.readline()
                    if line == "":
                        break
                    odd_writer.write(line)
                    line = reader.readline()
                    if line == "":
                        break
                    even_writer.write(line)
Example #11
def load_data(output_data_path: OutputPath('ApacheParquet')):
    '''
        Get GCP raw data
    '''

    import pandas as pd
    from sklearn import preprocessing

    from google.cloud import bigquery, storage

    def build_storage_client(project_id):
        '''
        Build Storage client to perform requests to GCP buckets
        Params:
            project_id: the respective project of GCP
        '''

        # step.apply(gcp.use_gcp_secret('user-gcp-sa')) in the dsl.ContainerOP()
        storage_client = storage.Client(project_id)
        return storage_client

    def get_bucket_data(storage_client, bucket_name, file_name):
        '''
            Get file from bucket and save locally
        '''

        bucket = storage_client.get_bucket(bucket_name)
        blob = bucket.blob(file_name)
        blob.download_to_filename(file_name)

    # Get raw data
    storage_client = build_storage_client('beto-cloud')
    raw_data_bucket_name = 'stroke-parquet'
    get_bucket_data(storage_client, raw_data_bucket_name, output_data_path)
Example #12
def Pandas_Transform_DataFrame_in_ApacheParquet_format(
    table_path: InputPath('ApacheParquet'),
    transformed_table_path: OutputPath('ApacheParquet'),
    transform_code: 'PythonCode',
):
    '''Transform DataFrame loaded from an ApacheParquet file.

    Inputs:
        table: DataFrame to transform.
        transform_code: Transformation code. Code is written in Python and can consist of multiple lines.
            The DataFrame variable is called "df".
            Examples:
            - `df['prod'] = df['X'] * df['Y']`
            - `df = df[['X', 'prod']]`
            - `df.insert(0, "is_positive", df["X"] > 0)`

    Outputs:
        transformed_table: Transformed DataFrame.

    Annotations:
        author: Alexey Volkov <*****@*****.**>
    '''
    import pandas

    df = pandas.read_parquet(table_path)
    # The namespace is needed so that the code can replace `df`. For example df = df[['X']]
    namespace = locals()
    exec(transform_code, namespace)
    namespace['df'].to_parquet(transformed_table_path)
Example #13
def Pandas_Transform_DataFrame_in_CSV_format(
        table_path: InputPath('CSV'),
        transformed_table_path: OutputPath('CSV'),
        transform_code: 'PythonCode',
):
    '''Transform DataFrame loaded from a CSV file.

    Inputs:
        table: Table to transform.
        transform_code: Transformation code. Code is written in Python and can consist of multiple lines.
            The DataFrame variable is called "df".
            Examples:
            - `df['prod'] = df['X'] * df['Y']`
            - `df = df[['X', 'prod']]`
            - `df.insert(0, "is_positive", df["X"] > 0)`

    Outputs:
        transformed_table: Transformed table.

    Annotations:
        author: Alexey Volkov <*****@*****.**>
    '''
    import pandas

    df = pandas.read_csv(table_path)
    # Run the code in an explicit namespace so that it can also rebind `df`
    # (for example `df = df[['X']]`), mirroring the ApacheParquet variant above.
    namespace = locals()
    exec(transform_code, namespace)
    namespace['df'].to_csv(
        transformed_table_path,
        index=False,
    )
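Because transform_code is executed against a DataFrame named df, the transform components above can be smoke-tested locally before being wrapped. A minimal sketch, assuming a file sample.csv with numeric columns X and Y exists (both file names are illustrative):
# Local call of the function defined above; paths are placeholders.
Pandas_Transform_DataFrame_in_CSV_format(
    table_path='sample.csv',
    transformed_table_path='transformed.csv',
    transform_code="df['prod'] = df['X'] * df['Y']",
)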
Example #14
def convert_CatBoostModel_to_AppleCoreMLModel(
        model_path: InputPath('CatBoostModel'),
        converted_model_path: OutputPath('AppleCoreMLModel'),
):
    '''Convert CatBoost model to Apple CoreML format.

    Args:
        model_path: Path of a trained model in binary CatBoost model format.
        converted_model_path: Output path for the converted model.

    Outputs:
        converted_model: Model in Apple CoreML format.

    Annotations:
        author: Alexey Volkov <*****@*****.**>
    '''
    from catboost import CatBoost

    model = CatBoost()
    model.load_model(model_path)
    model.save_model(
        converted_model_path,
        format="coreml",
        # export_parameters={'prediction_type': 'probability'},
        # export_parameters={'prediction_type': 'raw'},
    )
def prepro_class(dataset_path: InputPath(str), wikiqa_path: OutputPath(str)):
    import nltk

    from wikiqa.prepro_class import prepro

    nltk.download("punkt")

    def get_args():
        from types import SimpleNamespace

        source_dir = dataset_path + "/WikiQACorpus"
        target_dir = wikiqa_path + "/wikiqa-class"
        glove_dir = dataset_path + "/glove"
        args = SimpleNamespace(
            source_dir=source_dir,
            target_dir=target_dir,
            debug=False,
            glove_corpus="6B",
            glove_dir=glove_dir,
            glove_vec_size="100",
            tokenizer="PTB",
        )
        return args

    args = get_args()
    prepro(args)
Example #16
def train_mnist(data_path: InputPath(), model_output: OutputPath()):
    import tensorflow as tf
    import numpy as np
    with np.load(data_path, allow_pickle=True) as f:
        x_train, y_train = f['x_train'], f['y_train']
        x_test, y_test = f['x_test'], f['y_test']
    print(x_train.shape)
    print(y_train.shape)

    model = tf.keras.models.Sequential([
        tf.keras.layers.Flatten(input_shape=(28, 28)),
        tf.keras.layers.Dense(128, activation='relu'),
        tf.keras.layers.Dense(10)
    ])
    model.compile(
        optimizer=tf.keras.optimizers.Adam(0.001),
        loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True),
        metrics=[tf.keras.metrics.SparseCategoricalAccuracy()],
    )

    model.fit(
        x_train,
        y_train,
    )
    model.evaluate(x_test, y_test)

    model.save(model_output)
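train_mnist expects data_path to point to an .npz archive containing x_train, y_train, x_test and y_test arrays. A minimal sketch of an upstream producer that writes that layout, assuming tf.keras.datasets is available in the image (the function name is illustrative):
def fetch_mnist_data(data_output_path: OutputPath()):
    import numpy as np
    import tensorflow as tf

    # Download MNIST and save it in the .npz layout that train_mnist reads.
    (x_train, y_train), (x_test, y_test) = tf.keras.datasets.mnist.load_data()
    # Passing a file object avoids numpy appending an '.npz' extension to the path.
    with open(data_output_path, 'wb') as f:
        np.savez(f, x_train=x_train, y_train=y_train,
                 x_test=x_test, y_test=y_test)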
Example #17
def create_fully_connected_pytorch_network(
    layer_sizes: list,
    network_path: OutputPath('PyTorchScriptModule'),
    activation_name: str = 'relu',
    random_seed: int = 0,
):
    '''Creates fully-connected network in PyTorch ScriptModule format'''
    import torch
    torch.manual_seed(random_seed)

    activation = getattr(torch, activation_name, None) or getattr(
        torch.nn.functional, activation_name, None)
    if not activation:
        raise ValueError(f'Activation "{activation_name}" was not found.')

    class ActivationLayer(torch.nn.Module):
        def forward(self, input):
            return activation(input)

    layers = []
    for layer_idx in range(len(layer_sizes) - 1):
        layer = torch.nn.Linear(layer_sizes[layer_idx],
                                layer_sizes[layer_idx + 1])
        layers.append(layer)
        if layer_idx < len(layer_sizes) - 2:
            layers.append(ActivationLayer())

    network = torch.nn.Sequential(*layers)
    script_module = torch.jit.script(network)
    print(script_module)
    script_module.save(network_path)
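A quick local sanity check of the component above; the layer sizes and file name are illustrative:
import torch

create_fully_connected_pytorch_network(
    layer_sizes=[4, 16, 3],
    network_path='network.pt',
)
module = torch.jit.load('network.pt')
print(module(torch.randn(1, 4)).shape)  # expected: torch.Size([1, 3])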
Example #18
def wikiqa_train(
    dataset_path: InputPath(str),
    wikiqa_path: InputPath(str),
    load_path,
    shared_path,
    run_id,
    sent_size_th,
    ques_size_th,
    num_epochs,
    num_steps,
    eval_period,
    save_period,
    device,
    device_type,
    num_gpus,
    model_path: OutputPath(str),
):
    import tensorflow as tf

    from basic.cli import main

    input_dir = wikiqa_path + "/wikiqa-class"
    output_dir = model_path + "/out/wikiqa"
    full_load_path = dataset_path + load_path
    full_shared_path = dataset_path + shared_path
    tf.app.run(
        main,
        argv=[
            "./basic/cli.py",
            "--data_dir",
            input_dir,
            "--out_base_dir",
            output_dir,
            "--load_path",
            full_load_path,
            "--shared_path",
            full_shared_path,
            "--load_trained_model",
            "--run_id",
            run_id,
            "--sent_size_th",
            sent_size_th,
            "--ques_size_th",
            ques_size_th,
            "--num_epochs",
            num_epochs,
            "--num_steps",
            num_steps,
            "--eval_period",
            eval_period,
            "--save_period",
            save_period,
            "--device",
            device,
            "--device_type",
            device_type,
            "--num_gpus",
            num_gpus,
        ],
    )
Example #19
def xgboost_predict(
        data_path: InputPath('CSV'),  # Also supports LibSVM
        model_path: InputPath('XGBoostModel'),
        predictions_path: OutputPath('Text'),
        label_column: int = None,
):
    '''Make predictions using a trained XGBoost model.

    Args:
        data_path: Path for the feature data in CSV format.
        model_path: Path for the trained model in binary XGBoost format.
        predictions_path: Output path for the predictions.
        label_column: Column containing the label data.

    Annotations:
        author: Alexey Volkov <*****@*****.**>
    '''
    from pathlib import Path

    import numpy
    import xgboost

    csv_data_spec = data_path + '?format=csv'
    # Only specifying the column if it's passed.
    if label_column is not None:
        csv_data_spec += '&label_column=' + str(label_column)
    testing_data = xgboost.DMatrix(csv_data_spec)

    model = xgboost.Booster(model_file=model_path)

    predictions = model.predict(testing_data)

    Path(predictions_path).parent.mkdir(parents=True, exist_ok=True)
    numpy.savetxt(predictions_path, predictions)
Example #20
def download_data(url: str, output_text_path: OutputPath(str)):
    import requests

    req = requests.get(url)
    url_content = req.content

    with open(output_text_path, 'wb') as writer:
        writer.write(url_content)
Example #21
def convert_values_to_int(text_path: InputPath('Text'),
                          output_path: OutputPath('Text')):
    """Returns the number of values in a CSV column."""
    import numpy as np

    result = np.loadtxt(text_path)

    np.savetxt(output_path, result, fmt='%d')
def produce_dir_with_files_python_op(
        output_dir_path: OutputPath(), num_files: int = 10):
    import os
    os.makedirs(output_dir_path, exist_ok=True)
    for i in range(num_files):
        file_path = os.path.join(output_dir_path, str(i) + '.txt')
        with open(file_path, 'w') as f:
            f.write(str(i))
def test_build_function_with_input_output_artifacts(self):
    nb_file = get_tmp_notebook(notebook_source)
    builder = NbComponentBuilder('op1', inject_notebook_path=nb_file.name)
    builder.add_input_artifact('a_in')
    builder.add_output_artifact('a_out')
    func = builder.build_component_function()
    self.assertEqual(type(func.__annotations__['a_in']), type(InputPath()))
    self.assertEqual(type(func.__annotations__['a_out']), type(OutputPath()))
Example #24
def preprocess(
    # An input parameter of type string.
    message: str,
    # Use Output[T] to get a metadata-rich handle to the output artifact
    # of type `Dataset`.
    output_dataset_one: Output[Dataset],
    # A locally accessible filepath for another output artifact of type
    # `Dataset`.
    output_dataset_two_path: OutputPath('Dataset'),
    # A locally accessible filepath for an output parameter of type string.
    output_parameter_path: OutputPath(str),
    # A locally accessible filepath for an output parameter of type bool.
    output_bool_parameter_path: OutputPath(bool),
    # A locally accessible filepath for an output parameter of type dict.
    output_dict_parameter_path: OutputPath(Dict[str, int]),
    # A locally accessible filepath for an output parameter of type list.
    output_list_parameter_path: OutputPath(List[str]),
    # An input message that defaults to empty.
    empty_message: str = "",
):
    """Dummy preprocessing step"""

    # Use Dataset.path to access a local file path for writing.
    # One can also use Dataset.uri to access the actual URI file path.
    with open(output_dataset_one.path, 'w') as f:
        f.write(message)

    # OutputPath is used to just pass the local file path of the output artifact
    # to the function.
    with open(output_dataset_two_path, 'w') as f:
        f.write(message)

    with open(output_parameter_path, 'w') as f:
        f.write(message)

    with open(output_bool_parameter_path, 'w') as f:
        f.write(
            str(True))  # use either `str()` or `json.dumps()` for bool values.

    import json
    with open(output_dict_parameter_path, 'w') as f:
        f.write(json.dumps({'A': 1, 'B': 2}))

    with open(output_list_parameter_path, 'w') as f:
        f.write(json.dumps(['a', 'b', 'c']))
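Downstream steps receive these JSON-serialized parameters as deserialized Python values and the artifacts as Input[T] handles. A minimal sketch of a consumer, assuming the KFP v2 dsl types Input and Model are imported alongside Output and Dataset (the function and parameter names are illustrative):
def train(
    dataset_one: Input[Dataset],
    message: str,
    input_dict: Dict[str, int],
    input_list: List[str],
    model: Output[Model],
):
    """Dummy training step that echoes its inputs into a model artifact."""
    with open(dataset_one.path) as f:
        dataset_contents = f.read()
    with open(model.path, 'w') as f:
        f.write(f'{message} {dataset_contents} {input_dict} {input_list}')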
Example #25
def test_func(output_parameter_path: OutputPath(str)):
  # Test that output parameters just use the passed in filename.
  self.assertEqual(output_parameter_path,
                   'gs://some-bucket/some_task/nested/output_parameter')

  # Test writing to the path succeeds. This fails if parent directories
  # don't exist.
  with open(output_parameter_path, 'w') as f:
    f.write('Hello, World!')
Example #26
def download(squad_url, dataset_path: OutputPath()):
    import os
    import tempfile
    import zipfile

    import requests

    from tqdm import tqdm

    # Download WikiQA
    r = requests.get(
        "https://download.microsoft.com/download/E/5/F/E5FCFCEE-7005-4814-853D-DAA7C66507E0/WikiQACorpus.zip",
        stream=True,
    )
    total_size_in_bytes = int(r.headers.get("content-length", 0))
    progress_bar = tqdm(total=total_size_in_bytes, unit="iB", unit_scale=True)
    with tempfile.TemporaryFile() as tf:
        for chunk in r.iter_content(chunk_size=128):
            progress_bar.update(len(chunk))
            tf.write(chunk)
        with zipfile.ZipFile(tf, "r") as f:
            f.extractall(dataset_path)
    progress_bar.close()
    if total_size_in_bytes != 0 and progress_bar.n != total_size_in_bytes:
        print("ERROR, something went wrong")

    # Download GloVe
    GLOVE_DIR = dataset_path + "/glove"
    os.makedirs(GLOVE_DIR, exist_ok=True)
    r = requests.get("http://nlp.stanford.edu/data/glove.6B.zip", stream=True)
    total_size_in_bytes = int(r.headers.get("content-length", 0))
    progress_bar = tqdm(total=total_size_in_bytes, unit="iB", unit_scale=True)
    with tempfile.TemporaryFile() as tf:
        for chunk in r.iter_content(chunk_size=1024):
            progress_bar.update(len(chunk))
            tf.write(chunk)
        with zipfile.ZipFile(tf, "r") as f:
            f.extractall(GLOVE_DIR)
    progress_bar.close()
    if total_size_in_bytes != 0 and progress_bar.n != total_size_in_bytes:
        print("ERROR, something went wrong")

    # Download Squad
    r_squad = requests.get(squad_url)
    total_size_in_bytes = int(r_squad.headers.get("content-length", 0))
    progress_bar = tqdm(total=total_size_in_bytes, unit="iB", unit_scale=True)
    with tempfile.TemporaryFile() as tf:
        for chunk in r_squad.iter_content(chunk_size=1024):
            progress_bar.update(len(chunk))
            tf.write(chunk)
        with zipfile.ZipFile(tf, "r") as f:
            f.extractall(dataset_path)
    progress_bar.close()
    if total_size_in_bytes != 0 and progress_bar.n != total_size_in_bytes:
        print("ERROR, something went wrong")

    print(os.listdir(dataset_path))
Example #27
def convert_to_tensorflow_saved_model_from_onnx_model(
    model_path: InputPath('OnnxModel'),
    converted_model_path: OutputPath('TensorflowSavedModel'),
):
    import onnx
    import onnx_tf

    onnx_model = onnx.load(model_path)
    tf_rep = onnx_tf.backend.prepare(onnx_model)
    tf_rep.export_graph(converted_model_path)
Example #28
def train(dataset: InputPath('Dataset'),
          model: OutputPath('Model'),
          num_steps: int = 100):
  '''Dummy Training Step.'''

  with open(dataset, 'r') as input_file:
    input_string = input_file.read()
    with open(model, 'w') as output_file:
      for i in range(num_steps):
        output_file.write("Step {}\n{}\n=====\n".format(i, input_string))
def _get_gpu(is_used: str, gpu_path: OutputPath(str)):
    import os
    import json

    print("is_used: {}".format(is_used))
    with open(gpu_path, 'w') as f:
        if is_used == 'yes':
            f.write('yes')
        else:
            f.write('no')
Example #30
def train_knn(train_x_path: InputPath(), train_y_path: InputPath(),
              model_path: OutputPath()):
    import numpy
    import pickle
    from sklearn.neighbors import KNeighborsClassifier
    train_x = numpy.load(train_x_path)
    train_y = numpy.load(train_y_path)
    print(f"Training data size - X: {train_x.size}, y: {train_y.size}")
    knn = KNeighborsClassifier()
    knn.fit(train_x, train_y)
    with open(model_path, 'wb') as f:
        pickle.dump(knn, f)
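Taken together, load_data (Example #8), split_data (Example #1) and train_knn above make up a small iris pipeline. A minimal wiring sketch, assuming each function has been wrapped with create_component_from_func as sketched after Example #1 (the *_op factory names are illustrative; the SDK strips the _path suffix from input and output names):
from kfp import dsl

@dsl.pipeline(name='iris-knn-pipeline')
def iris_pipeline():
    load_task = load_data_op()
    split_task = split_data_op(
        x=load_task.outputs['x'],
        y=load_task.outputs['y'],
    )
    train_knn_op(
        train_x=split_task.outputs['train_x'],
        train_y=split_task.outputs['train_y'],
    )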