def split_data(
    x_path: InputPath(),
    y_path: InputPath(),
    train_x_path: OutputPath(str),
    test_x_path: OutputPath(str),
    train_y_path: OutputPath(str),
    test_y_path: OutputPath(str),
):
    import numpy
    from sklearn.model_selection import train_test_split

    iris_x = numpy.load(x_path)
    iris_y = numpy.load(y_path)

    train_x, test_x, train_y, test_y = train_test_split(iris_x, iris_y, test_size=0.2)

    print(f"Training data size - X: {train_x.size}, y: {train_y.size}")
    print(f"Testing data size - X: {test_x.size}, y: {test_y.size}")

    with open(train_x_path, 'wb') as f:
        numpy.save(f, train_x)
    with open(test_x_path, 'wb') as f:
        numpy.save(f, test_x)
    with open(train_y_path, 'wb') as f:
        numpy.save(f, train_y)
    with open(test_y_path, 'wb') as f:
        numpy.save(f, test_y)
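A minimal usage sketch, assuming this is a Kubeflow Pipelines (v1 SDK) lightweight component function; the base image and package list below are illustrative assumptions, not taken from the original source:

# Hedged sketch: wrapping split_data as a reusable component with the KFP v1 SDK.
from kfp.components import create_component_from_func

split_data_op = create_component_from_func(
    split_data,
    base_image='python:3.9',                        # assumed base image
    packages_to_install=['numpy', 'scikit-learn'],  # assumed dependencies
)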
def fetch_data_op(anomalous_data_path: OutputPath(str), non_anomalous_data_path: OutputPath(str)):
    import sys, subprocess
    subprocess.run([sys.executable, '-m', 'pip', 'install', 'pandas'])

    import os
    import pandas as pd

    master_url_root = "https://raw.githubusercontent.com/numenta/NAB/master/data/"

    df_small_noise_url_suffix = "artificialNoAnomaly/art_daily_small_noise.csv"
    df_small_noise_url = master_url_root + df_small_noise_url_suffix
    df_small_noise = pd.read_csv(df_small_noise_url, parse_dates=True, index_col="timestamp")

    df_daily_jumpsup_url_suffix = "artificialWithAnomaly/art_daily_jumpsup.csv"
    df_daily_jumpsup_url = master_url_root + df_daily_jumpsup_url_suffix
    df_daily_jumpsup = pd.read_csv(df_daily_jumpsup_url, parse_dates=True, index_col="timestamp")

    print("Non-anomalous Data")
    print(df_small_noise.describe())
    print("Anomalous Data")
    print(df_daily_jumpsup.describe())

    df_daily_jumpsup.to_csv(anomalous_data_path)
    df_small_noise.to_csv(non_anomalous_data_path)
    print(f'Anomalous data saved to {anomalous_data_path}')
    print(f'Non-anomalous data saved to {non_anomalous_data_path}')
def consume_file_path(
    number: int,
    number_1a_path: str,
    number_1b_file: str,
    number_1c_file_path: str,
    number_1d_path_file: str,
    number_2a_path: InputPath(str),
    number_2b_file: InputPath(str),
    number_2c_file_path: InputPath(str),
    number_2d_path_file: InputPath(str),
    number_3a_path: InputTextFile(str),
    number_3b_file: InputTextFile(str),
    number_3c_file_path: InputTextFile(str),
    number_3d_path_file: InputTextFile(str),
    number_4a_path: InputBinaryFile(str),
    number_4b_file: InputBinaryFile(str),
    number_4c_file_path: InputBinaryFile(str),
    number_4d_path_file: InputBinaryFile(str),
    output_number_2a_path: OutputPath(str),
    output_number_2b_file: OutputPath(str),
    output_number_2c_file_path: OutputPath(str),
    output_number_2d_path_file: OutputPath(str),
    output_number_3a_path: OutputTextFile(str),
    output_number_3b_file: OutputTextFile(str),
    output_number_3c_file_path: OutputTextFile(str),
    output_number_3d_path_file: OutputTextFile(str),
    output_number_4a_path: OutputBinaryFile(str),
    output_number_4b_file: OutputBinaryFile(str),
    output_number_4c_file_path: OutputBinaryFile(str),
    output_number_4d_path_file: OutputBinaryFile(str),
):
    pass
def preprocess(uri: str, some_int: int, output_parameter_one: OutputPath(int),
               output_dataset_one: OutputPath('Dataset')):
    '''Dummy Preprocess Step.'''
    with open(output_dataset_one, 'w') as f:
        f.write('Output dataset')
    with open(output_parameter_one, 'w') as f:
        f.write("{}".format(1234))
def fetch_data(x_path: OutputPath(str), y_path: OutputPath(str)):
    # download data locally
    from sklearn import datasets

    x, y = datasets.load_breast_cancer(return_X_y=True, as_frame=True)
    x.to_parquet(x_path)
    y.to_frame().to_parquet(y_path)
def split_dataset_huggingface(
    dataset_dict_path: InputPath('HuggingFaceDatasetDict'),
    dataset_split_path: OutputPath('HuggingFaceDataset'),
    dataset_path: OutputPath('HuggingFaceArrowDataset'),
    # dataset_indices_path: OutputPath('HuggingFaceArrowDataset'),
    dataset_info_path: OutputPath(dict),
    dataset_state_path: OutputPath(dict),
    split_name: str = None,
):
    import os
    import shutil
    from datasets import config as datasets_config

    print(f'DatasetDict contents: {os.listdir(dataset_dict_path)}')
    shutil.copytree(os.path.join(dataset_dict_path, split_name), dataset_split_path)
    print(
        f'Dataset contents: {os.listdir(os.path.join(dataset_dict_path, split_name))}'
    )
    shutil.copy(
        os.path.join(dataset_dict_path, split_name, datasets_config.DATASET_ARROW_FILENAME),
        dataset_path)
    # shutil.copy(os.path.join(dataset_dict_path, split_name, datasets_config.DATASET_INDICES_FILENAME), dataset_indices_path)
    shutil.copy(
        os.path.join(dataset_dict_path, split_name, datasets_config.DATASET_INFO_FILENAME),
        dataset_info_path)
    shutil.copy(
        os.path.join(dataset_dict_path, split_name, datasets_config.DATASET_STATE_JSON_FILENAME),
        dataset_state_path)
def preprocess(
        # An input parameter of type string.
        message: str,
        # Use Output[T] to get a metadata-rich handle to the output artifact
        # of type `Dataset`.
        output_dataset_one: Output[Dataset],
        # A locally accessible filepath for another output artifact of type
        # `Dataset`.
        output_dataset_two_path: OutputPath('Dataset'),
        # A locally accessible filepath for an output parameter of type string.
        output_parameter_path: OutputPath(str)):
    '''Dummy preprocessing step'''

    # Use Dataset.path to access a local file path for writing.
    # One can also use Dataset.uri to access the actual URI file path.
    with open(output_dataset_one.path, 'w') as f:
        f.write(message)

    # OutputPath is used to just pass the local file path of the output artifact
    # to the function.
    with open(output_dataset_two_path, 'w') as f:
        f.write(message)

    with open(output_parameter_path, 'w') as f:
        f.write(message)
def load_data(x_path: OutputPath(str), y_path: OutputPath(str)):
    import numpy
    from sklearn import datasets

    iris_x, iris_y = datasets.load_iris(return_X_y=True)

    with open(x_path, 'wb') as f:
        numpy.save(f, iris_x)
    with open(y_path, 'wb') as f:
        numpy.save(f, iris_y)
def sum_multiply_numbers(numbers_path: InputPath(str), sum_path: OutputPath(str), product_path: OutputPath(str)):
    sum = 0
    product = 1
    with open(numbers_path, 'r') as reader:
        for line in reader:
            sum = sum + int(line)
            product = product * int(line)

    with open(sum_path, 'w') as writer:
        writer.write(str(sum))
    with open(product_path, 'w') as writer:
        writer.write(str(product))
def split_text_lines(source_path: InputPath(str), odd_lines_path: OutputPath(str), even_lines_path: OutputPath(str)):
    with open(source_path, 'r') as reader:
        with open(odd_lines_path, 'w') as odd_writer:
            with open(even_lines_path, 'w') as even_writer:
                while True:
                    line = reader.readline()
                    if line == "":
                        break
                    odd_writer.write(line)
                    line = reader.readline()
                    if line == "":
                        break
                    even_writer.write(line)
def load_data(output_data_path: OutputPath('ApacheParquet')):
    '''Get GCP raw data'''
    import pandas as pd
    from sklearn import preprocessing
    from google.cloud import bigquery, storage

    def build_storage_client(project_id):
        '''Build Storage client to perform requests to GCP buckets.

        Params:
            project_id: the respective project of GCP
        '''
        # step.apply(gcp.use_gcp_secret('user-gcp-sa')) in the dsl.ContainerOP()
        storage_client = storage.Client(project_id)
        return storage_client

    def get_bucket_data(storage_client, bucket_name, file_name):
        '''Get file from bucket and save locally'''
        bucket = storage_client.get_bucket(bucket_name)
        blob = bucket.blob(file_name)
        blob.download_to_filename(file_name)

    # Get raw data
    storage_client = build_storage_client('beto-cloud')
    raw_data_bucket_name = 'stroke-parquet'
    get_bucket_data(storage_client, raw_data_bucket_name, output_data_path)
def Pandas_Transform_DataFrame_in_ApacheParquet_format(
    table_path: InputPath('ApacheParquet'),
    transformed_table_path: OutputPath('ApacheParquet'),
    transform_code: 'PythonCode',
):
    '''Transform DataFrame loaded from an ApacheParquet file.

    Inputs:
        table: DataFrame to transform.
        transform_code: Transformation code. Code is written in Python and can
            consist of multiple lines. The DataFrame variable is called "df".
            Examples:
            - `df['prod'] = df['X'] * df['Y']`
            - `df = df[['X', 'prod']]`
            - `df.insert(0, "is_positive", df["X"] > 0)`

    Outputs:
        transformed_table: Transformed DataFrame.

    Annotations:
        author: Alexey Volkov <*****@*****.**>
    '''
    import pandas

    df = pandas.read_parquet(table_path)
    # The namespace is needed so that the code can replace `df`. For example: df = df[['X']]
    namespace = locals()
    exec(transform_code, namespace)
    namespace['df'].to_parquet(transformed_table_path)
def Pandas_Transform_DataFrame_in_CSV_format(
    table_path: InputPath('CSV'),
    transformed_table_path: OutputPath('CSV'),
    transform_code: 'PythonCode',
):
    '''Transform DataFrame loaded from a CSV file.

    Inputs:
        table: Table to transform.
        transform_code: Transformation code. Code is written in Python and can
            consist of multiple lines. The DataFrame variable is called "df".
            Examples:
            - `df['prod'] = df['X'] * df['Y']`
            - `df = df[['X', 'prod']]`
            - `df.insert(0, "is_positive", df["X"] > 0)`

    Outputs:
        transformed_table: Transformed table.

    Annotations:
        author: Alexey Volkov <*****@*****.**>
    '''
    import pandas

    df = pandas.read_csv(table_path)
    # Execute the transformation in an explicit namespace so that code which rebinds
    # `df` (e.g. `df = df[['X']]`) takes effect, mirroring the ApacheParquet variant.
    # A bare `exec(transform_code)` would silently drop such reassignments.
    namespace = locals()
    exec(transform_code, namespace)
    namespace['df'].to_csv(transformed_table_path, index=False)
def convert_CatBoostModel_to_AppleCoreMLModel(
    model_path: InputPath('CatBoostModel'),
    converted_model_path: OutputPath('AppleCoreMLModel'),
):
    '''Convert CatBoost model to Apple CoreML format.

    Args:
        model_path: Path of a trained model in binary CatBoost model format.
        converted_model_path: Output path for the converted model.

    Outputs:
        converted_model: Model in Apple CoreML format.

    Annotations:
        author: Alexey Volkov <*****@*****.**>
    '''
    from catboost import CatBoost

    model = CatBoost()
    model.load_model(model_path)
    model.save_model(
        converted_model_path,
        format="coreml",
        # export_parameters={'prediction_type': 'probability'},
        # export_parameters={'prediction_type': 'raw'},
    )
def prepro_class(dataset_path: InputPath(str), wikiqa_path: OutputPath(str)):
    import nltk
    from wikiqa.prepro_class import prepro

    nltk.download("punkt")

    def get_args():
        from types import SimpleNamespace

        source_dir = dataset_path + "/WikiQACorpus"
        target_dir = wikiqa_path + "/wikiqa-class"
        glove_dir = dataset_path + "/glove"
        args = SimpleNamespace(
            source_dir=source_dir,
            target_dir=target_dir,
            debug=False,
            glove_corpus="6B",
            glove_dir=glove_dir,
            glove_vec_size="100",
            tokenizer="PTB",
        )
        return args

    args = get_args()
    prepro(args)
def train_mnist(data_path: InputPath(), model_output: OutputPath()):
    import tensorflow as tf
    import numpy as np

    with np.load(data_path, allow_pickle=True) as f:
        x_train, y_train = f['x_train'], f['y_train']
        x_test, y_test = f['x_test'], f['y_test']

    print(x_train.shape)
    print(y_train.shape)

    model = tf.keras.models.Sequential([
        tf.keras.layers.Flatten(input_shape=(28, 28)),
        tf.keras.layers.Dense(128, activation='relu'),
        tf.keras.layers.Dense(10)
    ])
    model.compile(
        optimizer=tf.keras.optimizers.Adam(0.001),
        loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True),
        metrics=[tf.keras.metrics.SparseCategoricalAccuracy()],
    )

    model.fit(x_train, y_train)
    model.evaluate(x_test, y_test)
    model.save(model_output)
def create_fully_connected_pytorch_network(
    layer_sizes: list,
    network_path: OutputPath('PyTorchScriptModule'),
    activation_name: str = 'relu',
    random_seed: int = 0,
):
    '''Creates fully-connected network in PyTorch ScriptModule format'''
    import torch

    torch.manual_seed(random_seed)

    activation = getattr(torch, activation_name, None) or getattr(
        torch.nn.functional, activation_name, None)
    if not activation:
        raise ValueError(f'Activation "{activation_name}" was not found.')

    class ActivationLayer(torch.nn.Module):
        def forward(self, input):
            return activation(input)

    layers = []
    for layer_idx in range(len(layer_sizes) - 1):
        layer = torch.nn.Linear(layer_sizes[layer_idx], layer_sizes[layer_idx + 1])
        layers.append(layer)
        if layer_idx < len(layer_sizes) - 2:
            layers.append(ActivationLayer())

    network = torch.nn.Sequential(*layers)
    script_module = torch.jit.script(network)
    print(script_module)
    script_module.save(network_path)
def wikiqa_train(
    dataset_path: InputPath(str),
    wikiqa_path: InputPath(str),
    load_path,
    shared_path,
    run_id,
    sent_size_th,
    ques_size_th,
    num_epochs,
    num_steps,
    eval_period,
    save_period,
    device,
    device_type,
    num_gpus,
    model_path: OutputPath(str),
):
    import tensorflow as tf
    from basic.cli import main

    input_dir = wikiqa_path + "/wikiqa-class"
    output_dir = model_path + "/out/wikiqa"
    full_load_path = dataset_path + load_path
    full_shared_path = dataset_path + shared_path

    tf.app.run(
        main,
        argv=[
            "./basic/cli.py",
            "--data_dir", input_dir,
            "--out_base_dir", output_dir,
            "--load_path", full_load_path,
            "--shared_path", full_shared_path,
            "--load_trained_model",
            "--run_id", run_id,
            "--sent_size_th", sent_size_th,
            "--ques_size_th", ques_size_th,
            "--num_epochs", num_epochs,
            "--num_steps", num_steps,
            "--eval_period", eval_period,
            "--save_period", save_period,
            "--device", device,
            "--device_type", device_type,
            "--num_gpus", num_gpus,
        ],
    )
def xgboost_predict(
    data_path: InputPath('CSV'),  # Also supports LibSVM
    model_path: InputPath('XGBoostModel'),
    predictions_path: OutputPath('Text'),
    label_column: int = None,
):
    '''Make predictions using a trained XGBoost model.

    Args:
        data_path: Path for the feature data in CSV format.
        model_path: Path for the trained model in binary XGBoost format.
        predictions_path: Output path for the predictions.
        label_column: Column containing the label data.

    Annotations:
        author: Alexey Volkov <*****@*****.**>
    '''
    from pathlib import Path

    import numpy
    import xgboost

    csv_data_spec = data_path + '?format=csv'
    # Only specifying the column if it's passed.
    if label_column is not None:
        csv_data_spec += '&label_column=' + str(label_column)
    testing_data = xgboost.DMatrix(csv_data_spec)

    model = xgboost.Booster(model_file=model_path)

    predictions = model.predict(testing_data)

    Path(predictions_path).parent.mkdir(parents=True, exist_ok=True)
    numpy.savetxt(predictions_path, predictions)
def download_data(url: str, output_text_path: OutputPath(str)):
    import requests

    req = requests.get(url)
    url_content = req.content
    with open(output_text_path, 'wb') as writer:
        writer.write(url_content)
def convert_values_to_int(text_path: InputPath('Text'), output_path: OutputPath('Text')):
    """Reads numeric values from a text file and writes them back as integers."""
    import numpy as np

    result = np.loadtxt(text_path)
    np.savetxt(output_path, result, fmt='%d')
def produce_dir_with_files_python_op(output_dir_path: OutputPath(), num_files: int = 10):
    import os
    os.makedirs(output_dir_path, exist_ok=True)
    for i in range(num_files):
        file_path = os.path.join(output_dir_path, str(i) + '.txt')
        with open(file_path, 'w') as f:
            f.write(str(i))
def test_build_function_with_input_output_artifacts(self):
    nb_file = get_tmp_notebook(notebook_source)
    builder = NbComponentBuilder('op1', inject_notebook_path=nb_file.name)
    builder.add_input_artifact('a_in')
    builder.add_output_artifact('a_out')
    func = builder.build_component_function()
    self.assertEqual(type(func.__annotations__['a_in']), type(InputPath()))
    self.assertEqual(type(func.__annotations__['a_out']), type(OutputPath()))
def preprocess(
    # An input parameter of type string.
    message: str,
    # Use Output[T] to get a metadata-rich handle to the output artifact
    # of type `Dataset`.
    output_dataset_one: Output[Dataset],
    # A locally accessible filepath for another output artifact of type
    # `Dataset`.
    output_dataset_two_path: OutputPath('Dataset'),
    # A locally accessible filepath for an output parameter of type string.
    output_parameter_path: OutputPath(str),
    # A locally accessible filepath for an output parameter of type bool.
    output_bool_parameter_path: OutputPath(bool),
    # A locally accessible filepath for an output parameter of type dict.
    output_dict_parameter_path: OutputPath(Dict[str, int]),
    # A locally accessible filepath for an output parameter of type list.
    output_list_parameter_path: OutputPath(List[str]),
    # An input message that defaults to empty.
    empty_message: str = "",
):
    """Dummy preprocessing step"""

    # Use Dataset.path to access a local file path for writing.
    # One can also use Dataset.uri to access the actual URI file path.
    with open(output_dataset_one.path, 'w') as f:
        f.write(message)

    # OutputPath is used to just pass the local file path of the output artifact
    # to the function.
    with open(output_dataset_two_path, 'w') as f:
        f.write(message)

    with open(output_parameter_path, 'w') as f:
        f.write(message)

    with open(output_bool_parameter_path, 'w') as f:
        f.write(str(True))  # use either `str()` or `json.dumps()` for bool values.

    import json
    with open(output_dict_parameter_path, 'w') as f:
        f.write(json.dumps({'A': 1, 'B': 2}))

    with open(output_list_parameter_path, 'w') as f:
        f.write(json.dumps(['a', 'b', 'c']))
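A hedged sketch of how this style of function is typically declared with the KFP v2-style SDK; the component name, base image, and import path are assumptions (they vary between SDK releases) and are not taken from the original source:

# Hedged sketch: Output[Dataset] / OutputPath annotations are normally paired with
# the @dsl.component decorator so the SDK can generate a component spec.
# `make_dataset` is a hypothetical component used only for illustration.
from kfp import dsl
from kfp.dsl import Dataset, Output, OutputPath


@dsl.component(base_image='python:3.9')  # assumed base image
def make_dataset(message: str, dataset: Output[Dataset], length_path: OutputPath(int)):
    # Artifact outputs expose a writable local path via `.path`.
    with open(dataset.path, 'w') as f:
        f.write(message)
    # Parameter outputs are plain local file paths.
    with open(length_path, 'w') as f:
        f.write(str(len(message)))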
def test_func(output_parameter_path: OutputPath(str)):
    # Test that output parameters just use the passed in filename.
    self.assertEqual(output_parameter_path,
                     'gs://some-bucket/some_task/nested/output_parameter')

    # Test writing to the path succeeds. This fails if parent directories
    # don't exist.
    with open(output_parameter_path, 'w') as f:
        f.write('Hello, World!')
def download(squad_url, dataset_path: OutputPath()):
    import os
    import tempfile
    import zipfile

    import requests
    from tqdm import tqdm

    # Download WikiQA
    r = requests.get(
        "https://download.microsoft.com/download/E/5/F/E5FCFCEE-7005-4814-853D-DAA7C66507E0/WikiQACorpus.zip",
        stream=True,
    )
    total_size_in_bytes = int(r.headers.get("content-length", 0))
    progress_bar = tqdm(total=total_size_in_bytes, unit="iB", unit_scale=True)
    with tempfile.TemporaryFile() as tf:
        for chunk in r.iter_content(chunk_size=128):
            progress_bar.update(len(chunk))
            tf.write(chunk)
        with zipfile.ZipFile(tf, "r") as f:
            f.extractall(dataset_path)
    progress_bar.close()
    if total_size_in_bytes != 0 and progress_bar.n != total_size_in_bytes:
        print("ERROR, something went wrong")

    # Download GloVe
    GLOVE_DIR = dataset_path + "/glove"
    os.makedirs(GLOVE_DIR, exist_ok=True)
    r = requests.get("http://nlp.stanford.edu/data/glove.6B.zip", stream=True)
    total_size_in_bytes = int(r.headers.get("content-length", 0))
    progress_bar = tqdm(total=total_size_in_bytes, unit="iB", unit_scale=True)
    with tempfile.TemporaryFile() as tf:
        for chunk in r.iter_content(chunk_size=1024):
            progress_bar.update(len(chunk))
            tf.write(chunk)
        with zipfile.ZipFile(tf, "r") as f:
            f.extractall(GLOVE_DIR)
    progress_bar.close()
    if total_size_in_bytes != 0 and progress_bar.n != total_size_in_bytes:
        print("ERROR, something went wrong")

    # Download SQuAD
    r_squad = requests.get(squad_url)
    total_size_in_bytes = int(r_squad.headers.get("content-length", 0))
    progress_bar = tqdm(total=total_size_in_bytes, unit="iB", unit_scale=True)
    with tempfile.TemporaryFile() as tf:
        for chunk in r_squad.iter_content(chunk_size=1024):
            progress_bar.update(len(chunk))
            tf.write(chunk)
        with zipfile.ZipFile(tf, "r") as f:
            f.extractall(dataset_path)
    progress_bar.close()
    if total_size_in_bytes != 0 and progress_bar.n != total_size_in_bytes:
        print("ERROR, something went wrong")

    print(os.listdir(dataset_path))
def convert_to_tensorflow_saved_model_from_onnx_model(
    model_path: InputPath('OnnxModel'),
    converted_model_path: OutputPath('TensorflowSavedModel'),
):
    import onnx
    import onnx_tf

    onnx_model = onnx.load(model_path)
    tf_rep = onnx_tf.backend.prepare(onnx_model)
    tf_rep.export_graph(converted_model_path)
def train(dataset: InputPath('Dataset'), model: OutputPath('Model'), num_steps: int = 100):
    '''Dummy Training Step.'''

    with open(dataset, 'r') as input_file:
        input_string = input_file.read()
        with open(model, 'w') as output_file:
            for i in range(num_steps):
                output_file.write("Step {}\n{}\n=====\n".format(i, input_string))
def _get_gpu(is_used: str, gpu_path: OutputPath(str)):
    import os
    import json

    print("is_used: {}".format(is_used))
    with open(gpu_path, 'w') as f:
        if is_used == 'yes':
            f.write('yes')
        else:
            f.write('no')
def train_knn(train_x_path: InputPath(), train_y_path: InputPath(), model_path: OutputPath()):
    import numpy
    import pickle
    from sklearn.neighbors import KNeighborsClassifier

    train_x = numpy.load(train_x_path)
    train_y = numpy.load(train_y_path)
    print(f"Training data size - X: {train_x.size}, y: {train_y.size}")

    knn = KNeighborsClassifier()
    knn.fit(train_x, train_y)

    with open(model_path, 'wb') as f:
        pickle.dump(knn, f)
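A hedged sketch of how the iris functions above (load_data, split_data, train_knn) could be composed into a pipeline with the KFP v1 SDK. The pipeline name, base images, and package lists are illustrative assumptions; the input/output names rely on KFP's convention of stripping the `_path` suffix from InputPath/OutputPath parameters:

# Hedged sketch: wiring load_data -> split_data -> train_knn in a KFP v1 pipeline.
import kfp
from kfp import dsl
from kfp.components import create_component_from_func

# Assumed dependency lists; adjust to the images actually used in the project.
load_data_op = create_component_from_func(
    load_data, packages_to_install=['numpy', 'scikit-learn'])
split_data_op = create_component_from_func(
    split_data, packages_to_install=['numpy', 'scikit-learn'])
train_knn_op = create_component_from_func(
    train_knn, packages_to_install=['numpy', 'scikit-learn'])


@dsl.pipeline(name='iris-knn-pipeline')  # assumed pipeline name
def iris_knn_pipeline():
    load_task = load_data_op()
    # `x_path`/`y_path` outputs are exposed as `x`/`y` (the `_path` suffix is dropped).
    split_task = split_data_op(x=load_task.outputs['x'], y=load_task.outputs['y'])
    train_knn_op(
        train_x=split_task.outputs['train_x'],
        train_y=split_task.outputs['train_y'])


kfp.compiler.Compiler().compile(iris_knn_pipeline, 'iris_knn_pipeline.yaml')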