def xgboost_predict(
    data_path: InputPath('CSV'),  # Also supports LibSVM
    model_path: InputPath('XGBoostModel'),
    predictions_path: OutputPath('Text'),
    label_column: int = None,
):
    '''Make predictions using a trained XGBoost model.

    Args:
        data_path: Path for the feature data in CSV format.
        model_path: Path for the trained model in binary XGBoost format.
        predictions_path: Output path for the predictions.
        label_column: Column containing the label data.

    Annotations:
        author: Alexey Volkov <*****@*****.**>
    '''
    from pathlib import Path

    import numpy
    import xgboost

    csv_data_spec = data_path + '?format=csv'
    # Only specifying the column if it's passed.
    if label_column is not None:
        csv_data_spec += '&label_column=' + str(label_column)
    testing_data = xgboost.DMatrix(csv_data_spec)

    model = xgboost.Booster(model_file=model_path)

    predictions = model.predict(testing_data)

    Path(predictions_path).parent.mkdir(parents=True, exist_ok=True)
    numpy.savetxt(predictions_path, predictions)
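# A minimal sketch, not part of the original component: functions written
# against InputPath/OutputPath like xgboost_predict above are typically turned
# into reusable pipeline components with the KFP v1 SDK. The base image and
# package list below are illustrative assumptions, not a published
# configuration; assumes the kfp package is installed.
from kfp.components import create_component_from_func

xgboost_predict_op = create_component_from_func(
    xgboost_predict,
    base_image='python:3.8',
    packages_to_install=['xgboost', 'numpy'],
)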
def split_data(
    x_path: InputPath(),
    y_path: InputPath(),
    train_x_path: OutputPath(str),
    test_x_path: OutputPath(str),
    train_y_path: OutputPath(str),
    test_y_path: OutputPath(str),
):
    import numpy
    from sklearn.model_selection import train_test_split

    iris_x = numpy.load(x_path)
    iris_y = numpy.load(y_path)

    train_x, test_x, train_y, test_y = train_test_split(iris_x, iris_y, test_size=0.2)
    print(f"Training data size - X: {train_x.size}, y: {train_y.size}")
    print(f"Testing data size - X: {test_x.size}, y: {test_y.size}")

    with open(train_x_path, 'wb') as f:
        numpy.save(f, train_x)
    with open(test_x_path, 'wb') as f:
        numpy.save(f, test_x)
    with open(train_y_path, 'wb') as f:
        numpy.save(f, train_y)
    with open(test_y_path, 'wb') as f:
        numpy.save(f, test_y)
def wikiqa_train(
    dataset_path: InputPath(str),
    wikiqa_path: InputPath(str),
    load_path,
    shared_path,
    run_id,
    sent_size_th,
    ques_size_th,
    num_epochs,
    num_steps,
    eval_period,
    save_period,
    device,
    device_type,
    num_gpus,
    model_path: OutputPath(str),
):
    import tensorflow as tf
    from basic.cli import main

    input_dir = wikiqa_path + "/wikiqa-class"
    output_dir = model_path + "/out/wikiqa"
    full_load_path = dataset_path + load_path
    full_shared_path = dataset_path + shared_path

    tf.app.run(
        main,
        argv=[
            "./basic/cli.py",
            "--data_dir", input_dir,
            "--out_base_dir", output_dir,
            "--load_path", full_load_path,
            "--shared_path", full_shared_path,
            "--load_trained_model",
            "--run_id", run_id,
            "--sent_size_th", sent_size_th,
            "--ques_size_th", ques_size_th,
            "--num_epochs", num_epochs,
            "--num_steps", num_steps,
            "--eval_period", eval_period,
            "--save_period", save_period,
            "--device", device,
            "--device_type", device_type,
            "--num_gpus", num_gpus,
        ],
    )
def consume_file_path(
    number: int,
    number_1a_path: str,
    number_1b_file: str,
    number_1c_file_path: str,
    number_1d_path_file: str,
    number_2a_path: InputPath(str),
    number_2b_file: InputPath(str),
    number_2c_file_path: InputPath(str),
    number_2d_path_file: InputPath(str),
    number_3a_path: InputTextFile(str),
    number_3b_file: InputTextFile(str),
    number_3c_file_path: InputTextFile(str),
    number_3d_path_file: InputTextFile(str),
    number_4a_path: InputBinaryFile(str),
    number_4b_file: InputBinaryFile(str),
    number_4c_file_path: InputBinaryFile(str),
    number_4d_path_file: InputBinaryFile(str),
    output_number_2a_path: OutputPath(str),
    output_number_2b_file: OutputPath(str),
    output_number_2c_file_path: OutputPath(str),
    output_number_2d_path_file: OutputPath(str),
    output_number_3a_path: OutputTextFile(str),
    output_number_3b_file: OutputTextFile(str),
    output_number_3c_file_path: OutputTextFile(str),
    output_number_3d_path_file: OutputTextFile(str),
    output_number_4a_path: OutputBinaryFile(str),
    output_number_4b_file: OutputBinaryFile(str),
    output_number_4c_file_path: OutputBinaryFile(str),
    output_number_4d_path_file: OutputBinaryFile(str),
):
    pass
def calculate_regression_metrics_from_csv(
    true_values_path: InputPath(),
    predicted_values_path: InputPath(),
) -> NamedTuple('Outputs', [
    ('number_of_items', int),
    ('max_absolute_error', float),
    ('mean_absolute_error', float),
    ('mean_squared_error', float),
    ('root_mean_squared_error', float),
    ('metrics', dict),
]):
    '''Calculates regression metrics.

    Annotations:
        author: Alexey Volkov <*****@*****.**>
    '''
    import math
    import numpy

    true_values = numpy.loadtxt(true_values_path, dtype=numpy.float64)
    predicted_values = numpy.loadtxt(predicted_values_path, dtype=numpy.float64)

    if len(predicted_values.shape) != 1:
        raise NotImplementedError('Only single prediction values are supported.')
    if len(true_values.shape) != 1:
        raise NotImplementedError('Only single true values are supported.')

    if predicted_values.shape != true_values.shape:
        raise ValueError('Input shapes are different: {} != {}'.format(
            predicted_values.shape, true_values.shape))

    number_of_items = true_values.size
    errors = (true_values - predicted_values)
    abs_errors = numpy.abs(errors)
    squared_errors = errors**2
    max_absolute_error = numpy.max(abs_errors)
    mean_absolute_error = numpy.average(abs_errors)
    mean_squared_error = numpy.average(squared_errors)
    root_mean_squared_error = math.sqrt(mean_squared_error)

    metrics = dict(
        number_of_items=number_of_items,
        max_absolute_error=max_absolute_error,
        mean_absolute_error=mean_absolute_error,
        mean_squared_error=mean_squared_error,
        root_mean_squared_error=root_mean_squared_error,
    )

    return (
        number_of_items,
        max_absolute_error,
        mean_absolute_error,
        mean_squared_error,
        root_mean_squared_error,
        metrics,
    )
def train_knn(train_x_path: InputPath(), train_y_path: InputPath(), model_path: OutputPath()):
    import numpy
    import pickle
    from sklearn.neighbors import KNeighborsClassifier

    train_x = numpy.load(train_x_path)
    train_y = numpy.load(train_y_path)
    print(f"Training data size - X: {train_x.size}, y: {train_y.size}")

    knn = KNeighborsClassifier()
    knn.fit(train_x, train_y)

    with open(model_path, 'wb') as f:
        pickle.dump(knn, f)
def prep_data_op(anomalous_data_path: InputPath(str), non_anomalous_data_path: InputPath(str)):
    import sys, subprocess
    subprocess.run([sys.executable, '-m', 'pip', 'install', 'pandas'])

    import os
    import pandas as pd

    df_small_noise = pd.read_csv(non_anomalous_data_path)
    df_daily_jumpsup = pd.read_csv(anomalous_data_path)

    print(df_small_noise.head())
    print(df_daily_jumpsup.head())
def train_mnist(data_path: InputPath(), model_output: OutputPath()):
    import tensorflow as tf
    import numpy as np

    with np.load(data_path, allow_pickle=True) as f:
        x_train, y_train = f['x_train'], f['y_train']
        x_test, y_test = f['x_test'], f['y_test']

    print(x_train.shape)
    print(y_train.shape)

    model = tf.keras.models.Sequential([
        tf.keras.layers.Flatten(input_shape=(28, 28)),
        tf.keras.layers.Dense(128, activation='relu'),
        tf.keras.layers.Dense(10)
    ])
    model.compile(
        optimizer=tf.keras.optimizers.Adam(0.001),
        loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True),
        metrics=[tf.keras.metrics.SparseCategoricalAccuracy()],
    )

    model.fit(
        x_train,
        y_train,
    )

    model.evaluate(x_test, y_test)
    model.save(model_output)
def export_pickle(data_path: InputPath('Pickle')):
    '''
    Export pickle to bucket
    '''
    # Client library for Google Cloud Storage uploads.
    from google.cloud import storage

    def build_storage_client(project_id):
        '''
        Build Storage client to perform requests to GCP buckets

        Params:
            project_id: the respective project of GCP
        '''
        # step.apply(gcp.use_gcp_secret('user-gcp-sa')) in the dsl.ContainerOP()
        storage_client = storage.Client(project_id)
        return storage_client

    def load_to_bucket(storage_client, bucket_name, file_name):
        '''
        Load file to GCP bucket
        '''
        bucket = storage_client.get_bucket(bucket_name)
        blob = bucket.blob(file_name)
        with open(file_name, "rb") as f:
            blob.upload_from_file(f)

    storage_client = build_storage_client('beto-cloud')
    bucket_name = 'stroke'
    load_to_bucket(storage_client, bucket_name, data_path)
def prepro_class(dataset_path: InputPath(str), wikiqa_path: OutputPath(str)):
    import nltk
    from wikiqa.prepro_class import prepro

    nltk.download("punkt")

    def get_args():
        from types import SimpleNamespace

        source_dir = dataset_path + "/WikiQACorpus"
        target_dir = wikiqa_path + "/wikiqa-class"
        glove_dir = dataset_path + "/glove"

        args = SimpleNamespace(
            source_dir=source_dir,
            target_dir=target_dir,
            debug=False,
            glove_corpus="6B",
            glove_dir=glove_dir,
            glove_vec_size="100",
            tokenizer="PTB",
        )
        return args

    args = get_args()
    prepro(args)
def convert_CatBoostModel_to_AppleCoreMLModel(
    model_path: InputPath('CatBoostModel'),
    converted_model_path: OutputPath('AppleCoreMLModel'),
):
    '''Convert CatBoost model to Apple CoreML format.

    Args:
        model_path: Path of a trained model in binary CatBoost model format.
        converted_model_path: Output path for the converted model.

    Outputs:
        converted_model: Model in Apple CoreML format.

    Annotations:
        author: Alexey Volkov <*****@*****.**>
    '''
    from catboost import CatBoost

    model = CatBoost()
    model.load_model(model_path)
    model.save_model(
        converted_model_path,
        format="coreml",
        # export_parameters={'prediction_type': 'probability'},
        # export_parameters={'prediction_type': 'raw'},
    )
def print_text(
    text_path: InputPath()
):  # The "text" input is untyped so that any data can be printed
    '''Print text'''
    with open(text_path, 'r') as reader:
        for line in reader:
            print(line, end='')
def Pandas_Transform_DataFrame_in_ApacheParquet_format(
    table_path: InputPath('ApacheParquet'),
    transformed_table_path: OutputPath('ApacheParquet'),
    transform_code: 'PythonCode',
):
    '''Transform DataFrame loaded from an ApacheParquet file.

    Inputs:
        table: DataFrame to transform.
        transform_code: Transformation code. Code is written in Python and can consist of multiple lines.
            The DataFrame variable is called "df".
            Examples:
            - `df['prod'] = df['X'] * df['Y']`
            - `df = df[['X', 'prod']]`
            - `df.insert(0, "is_positive", df["X"] > 0)`

    Outputs:
        transformed_table: Transformed DataFrame.

    Annotations:
        author: Alexey Volkov <*****@*****.**>
    '''
    import pandas

    df = pandas.read_parquet(table_path)
    # The namespace is needed so that the code can replace `df`. For example: df = df[['X']]
    namespace = locals()
    exec(transform_code, namespace)
    namespace['df'].to_parquet(transformed_table_path)
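# A minimal local sketch, not part of the original component: it shows how the
# transform_code parameter is meant to be used. The /tmp file paths and the
# "X"/"Y" columns are hypothetical; assumes pandas with parquet support
# (pyarrow) is installed and that the function above is defined, i.e. kfp's
# InputPath/OutputPath were imported when it was evaluated.
if __name__ == '__main__':
    import pandas

    pandas.DataFrame({'X': [1, 2, 3], 'Y': [4, 5, 6]}).to_parquet('/tmp/table.parquet')
    Pandas_Transform_DataFrame_in_ApacheParquet_format(
        table_path='/tmp/table.parquet',
        transformed_table_path='/tmp/transformed_table.parquet',
        transform_code="df['prod'] = df['X'] * df['Y']",
    )
    print(pandas.read_parquet('/tmp/transformed_table.parquet'))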
def split_dataset_huggingface(
    dataset_dict_path: InputPath('HuggingFaceDatasetDict'),
    dataset_split_path: OutputPath('HuggingFaceDataset'),
    dataset_path: OutputPath('HuggingFaceArrowDataset'),
    # dataset_indices_path: OutputPath('HuggingFaceArrowDataset'),
    dataset_info_path: OutputPath(dict),
    dataset_state_path: OutputPath(dict),
    split_name: str = None,
):
    import os
    import shutil
    from datasets import config as datasets_config

    print(f'DatasetDict contents: {os.listdir(dataset_dict_path)}')
    shutil.copytree(os.path.join(dataset_dict_path, split_name), dataset_split_path)
    print(
        f'Dataset contents: {os.listdir(os.path.join(dataset_dict_path, split_name))}'
    )
    shutil.copy(
        os.path.join(dataset_dict_path, split_name,
                     datasets_config.DATASET_ARROW_FILENAME), dataset_path)
    # shutil.copy(os.path.join(dataset_dict_path, split_name, datasets_config.DATASET_INDICES_FILENAME), dataset_indices_path)
    shutil.copy(
        os.path.join(dataset_dict_path, split_name,
                     datasets_config.DATASET_INFO_FILENAME), dataset_info_path)
    shutil.copy(
        os.path.join(dataset_dict_path, split_name,
                     datasets_config.DATASET_STATE_JSON_FILENAME), dataset_state_path)
def train(
    # Use InputPath to get a locally accessible path for the input artifact
    # of type `Dataset`.
    dataset_one_path: InputPath('Dataset'),
    # Use Input[T] to get a metadata-rich handle to the input artifact
    # of type `Dataset`.
    dataset_two: Input[Dataset],
    # An input parameter of type string.
    message: str,
    # Use Output[T] to get a metadata-rich handle to the output artifact
    # of type `Dataset`.
    model: Output[Model],
    # An input parameter of type int with a default value.
    num_steps: int = 100):
    '''Dummy Training step'''
    with open(dataset_one_path, 'r') as input_file:
        dataset_one_contents = input_file.read()

    with open(dataset_two.path, 'r') as input_file:
        dataset_two_contents = input_file.read()

    line = "dataset_one_contents: {} || dataset_two_contents: {} || message: {}\n".format(
        dataset_one_contents, dataset_two_contents, message)

    with open(model.path, 'w') as output_file:
        for i in range(num_steps):
            output_file.write("Step {}\n{}\n=====\n".format(i, line))

    # Use `model` to get a Model artifact, which has a .metadata dictionary
    # to store arbitrary metadata for the output artifact.
    model.metadata['accuracy'] = 0.9
def Pandas_Transform_DataFrame_in_CSV_format(
    table_path: InputPath('CSV'),
    transformed_table_path: OutputPath('CSV'),
    transform_code: 'PythonCode',
):
    '''Transform DataFrame loaded from a CSV file.

    Inputs:
        table: Table to transform.
        transform_code: Transformation code. Code is written in Python and can consist of multiple lines.
            The DataFrame variable is called "df".
            Examples:
            - `df['prod'] = df['X'] * df['Y']`
            - `df = df[['X', 'prod']]`
            - `df.insert(0, "is_positive", df["X"] > 0)`

    Outputs:
        transformed_table: Transformed table.

    Annotations:
        author: Alexey Volkov <*****@*****.**>
    '''
    import pandas

    df = pandas.read_csv(table_path)
    # The namespace is needed so that the code can replace `df`. For example: df = df[['X']]
    namespace = locals()
    exec(transform_code, namespace)
    namespace['df'].to_csv(
        transformed_table_path,
        index=False,
    )
def consume_file_path(number_file_path: InputPath(int) = None) -> int:
    result = -1
    if number_file_path:
        with open(number_file_path) as f:
            string_data = f.read()
            result = int(string_data)
    return result
def catboost_predict_class_probabilities(
    data_path: InputPath('CSV'),
    model_path: InputPath('CatBoostModel'),
    predictions_path: OutputPath(),
    label_column: int = None,
):
    '''Predict class probabilities with a CatBoost model.

    Args:
        data_path: Path for the data in CSV format.
        model_path: Path for the trained model in binary CatBoostModel format.
        label_column: Column containing the label data.
        predictions_path: Output path for the predictions.

    Outputs:
        predictions: Predictions in text format.

    Annotations:
        author: Alexey Volkov <*****@*****.**>
    '''
    import tempfile

    from catboost import CatBoost, Pool
    import numpy

    if label_column:
        column_descriptions = {label_column: 'Label'}
        column_description_path = tempfile.NamedTemporaryFile(delete=False).name
        with open(column_description_path, 'w') as column_description_file:
            for idx, kind in column_descriptions.items():
                column_description_file.write('{}\t{}\n'.format(idx, kind))
    else:
        column_description_path = None

    eval_data = Pool(
        data_path,
        column_description=column_description_path,
        has_header=True,
        delimiter=',',
    )

    model = CatBoost()
    model.load_model(model_path)

    predictions = model.predict(eval_data, prediction_type='Probability')

    numpy.savetxt(predictions_path, predictions)
def test_build_function_with_input_output_artifacts(self):
    nb_file = get_tmp_notebook(notebook_source)
    builder = NbComponentBuilder('op1', inject_notebook_path=nb_file.name)
    builder.add_input_artifact('a_in')
    builder.add_output_artifact('a_out')
    func = builder.build_component_function()
    self.assertEqual(type(func.__annotations__['a_in']), type(InputPath()))
    self.assertEqual(type(func.__annotations__['a_out']), type(OutputPath()))
def prepro_class(
    dataset_path: InputPath(str),
    class_dir: InputPath(str),
    train_ratio,
    glove_vec_size,
    mode,
    tokenizer,
    url,
    port,
    prepro_squad_dir: OutputPath(str),
):
    import nltk
    from squad.prepro_class import prepro

    nltk.download("punkt")

    train_ratio = float(train_ratio)
    glove_vec_size = int(glove_vec_size)
    port = int(port)

    def get_args():
        from types import SimpleNamespace

        source_dir = class_dir + "/data/squad-class"
        target_dir = prepro_squad_dir + "/squad-class"
        glove_dir = dataset_path + "/data/glove"

        args = SimpleNamespace(
            source_dir=source_dir,
            target_dir=target_dir,
            debug=False,
            train_ratio=train_ratio,
            glove_corpus="6B",
            glove_dir=glove_dir,
            glove_vec_size=glove_vec_size,
            mode=mode,
            single_path="",
            tokenizer=tokenizer,
            url=url,
            port=port,
            split=False,
        )
        return args

    args = get_args()
    prepro(args)
def test_model(test_x_path: InputPath(), test_y_path: InputPath(), model_path: InputPath()):
    import numpy
    import pickle
    import random
    from sklearn.metrics import classification_report

    # Fail randomly about half of the time.
    p = random.random()
    print(p)
    if p > 0.5:
        raise Exception()

    test_x = numpy.load(test_x_path)
    test_y = numpy.load(test_y_path)
    print(f"Testing data size - X: {test_x.size}, y: {test_y.size}")

    with open(model_path, 'rb') as f:
        model = pickle.load(f)

    pred_y = model.predict(test_x)
    print(classification_report(test_y, pred_y))
def train_logistics(train_x_path: InputPath(), train_y_path: InputPath(), model_path: OutputPath()):
    import numpy
    import pickle
    import random
    from sklearn.linear_model import LogisticRegression

    # Fail randomly about half of the time.
    p = random.random()
    print(p)
    if p > 0.5:
        raise Exception()

    train_x = numpy.load(train_x_path)
    train_y = numpy.load(train_y_path)
    print(f"Training data size - X: {train_x.size}, y: {train_y.size}")

    model = LogisticRegression()
    model.fit(train_x, train_y)

    with open(model_path, 'wb') as f:
        pickle.dump(model, f)
def convert_values_to_int(text_path: InputPath('Text'), output_path: OutputPath('Text')):
    """Reads numeric values from a text file and writes them back as integers."""
    import numpy as np

    result = np.loadtxt(text_path)
    np.savetxt(output_path, result, fmt='%d')
def read_csv(input_csv_path: InputPath("CSV")):
    import subprocess
    subprocess.run(["pip", "install", "pandas"])

    import pandas as pd

    df = pd.read_csv(input_csv_path, index_col=0)
    print(f"input_csv_path: {input_csv_path}")
    print(f"type: {type(input_csv_path)}")
    print(df.head())
def convert_to_tensorflow_saved_model_from_onnx_model(
    model_path: InputPath('OnnxModel'),
    converted_model_path: OutputPath('TensorflowSavedModel'),
):
    import onnx
    import onnx_tf

    onnx_model = onnx.load(model_path)
    tf_rep = onnx_tf.backend.prepare(onnx_model)
    tf_rep.export_graph(converted_model_path)
def train(dataset: InputPath('Dataset'), model: OutputPath('Model'), num_steps: int = 100):
    '''Dummy Training Step.'''
    with open(dataset, 'r') as input_file:
        input_string = input_file.read()

    with open(model, 'w') as output_file:
        for i in range(num_steps):
            output_file.write("Step {}\n{}\n=====\n".format(i, input_string))
def train(x_path: InputPath(str), y_path: InputPath(str), model_path: OutputPath(str)):
    import pandas as pd
    from sklearn.linear_model import LogisticRegression
    import joblib

    x = pd.read_parquet(x_path)
    y = pd.read_parquet(y_path)

    model = LogisticRegression()
    model.fit(x, y)
    joblib.dump(model, model_path)

    # TODO: output artifact of model stats
    coefs = {
        feature: round(value, 2)
        for feature, value in zip(x.columns, model.coef_.flatten())
    }
    coefs['intercept'] = round(model.intercept_[0], 2)
    return coefs
def prepare_embeddings(
    gcp_bucket: str,
    num_words: int,
    w2v_model_path: str,
    embedding_dim: int,
    json_tokenizer_path: InputPath(str),
    num_classes: int,
    output_emb_matrix_path: OutputPath(str)
) -> NamedTuple('PrepareEmbOutput', [('vocabulary_size', int)]):
    from gensim.models import Word2Vec
    from google.cloud import storage
    from tensorflow.keras.preprocessing.text import tokenizer_from_json
    import os
    import json
    import numpy as np
    from collections import namedtuple

    # Storage client for loading the w2v model
    storage_client = storage.Client()
    bucket = storage_client.bucket(gcp_bucket)

    # Load w2v model
    model = Word2Vec()
    blob_w2v = bucket.get_blob(w2v_model_path)
    destination_uri = '{}/{}'.format(".", blob_w2v.name)
    if not os.path.exists(destination_uri):
        os.mkdir("/model")
    blob_w2v.download_to_filename(destination_uri)
    w2v_model = model.wv.load(destination_uri)
    word_vectors = w2v_model.wv

    # Load JSON tokenizer
    # blob_tok = bucket.get_blob(json_tokenizer_path)
    with open(json_tokenizer_path) as f:
        json_token = json.load(f)
    tokenizer = tokenizer_from_json(json_token)

    word_index = tokenizer.word_index
    vocabulary_size = min(len(word_index) + 1, num_words)
    embedding_matrix = np.zeros((vocabulary_size, embedding_dim), dtype=np.int32)

    for word, i in word_index.items():
        if i >= num_words:
            continue
        try:
            embedding_vector = word_vectors[word]
            embedding_matrix[i] = embedding_vector
        except KeyError:
            embedding_matrix[i] = np.random.normal(0, np.sqrt(0.25), embedding_dim)

    del word_vectors

    # Save the matrix to output_emb_matrix_path
    embedding_matrix.tofile(output_emb_matrix_path)

    PrepareEmbOutput = namedtuple('PrepareEmbOutput', ['vocabulary_size'])
    return PrepareEmbOutput(vocabulary_size)
def split_text_lines(source_path: InputPath(str),
                     odd_lines_path: OutputPath(str),
                     even_lines_path: OutputPath(str)):
    with open(source_path, 'r') as reader:
        with open(odd_lines_path, 'w') as odd_writer:
            with open(even_lines_path, 'w') as even_writer:
                while True:
                    line = reader.readline()
                    if line == "":
                        break
                    odd_writer.write(line)

                    line = reader.readline()
                    if line == "":
                        break
                    even_writer.write(line)
def split_data(
    x_path: InputPath(str),
    y_path: InputPath(str),
    x_train_path: OutputPath(str),
    y_train_path: OutputPath(str),
    x_test_path: OutputPath(str),
    y_test_path: OutputPath(str),
    test_size: float = 0.2
):
    # Split into training and test sets
    import pandas as pd
    from sklearn.model_selection import train_test_split

    x = pd.read_parquet(x_path)
    y = pd.read_parquet(y_path)

    x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=test_size, random_state=1)

    x_train.to_parquet(x_train_path)
    x_test.to_parquet(x_test_path)
    y_train.to_parquet(y_train_path)
    y_test.to_parquet(y_test_path)