def get_converted_data(dataset_task, conf_file):
    if dataset_task:
        dataset_upload_task = Dataset.get(dataset_id=dataset_task)
    else:
        dataset_upload_task = Dataset.get(
            dataset_project="Nvidia TLT examples with ClearML", dataset_name="Example data"
        )
    image_directory_path = (
        get_field_from_config(conf_file, "image_directory_path")
        .strip()
        .strip('"')
        .rpartition("/")[0]
    )
    # noinspection PyBroadException
    try:
        os.makedirs(image_directory_path)
    except Exception:
        pass
    # download the artifact and open it
    saved_dataset = dataset_upload_task.get_local_copy()
    dataset_name = os.listdir(saved_dataset)[0]
    dataset_path = Path(os.path.join(saved_dataset, dataset_name))
    # note: Path.suffix alone reports ".gz" for ".tar.gz" archives, so join all suffixes
    dataset_suffix = "".join(dataset_path.suffixes)
    if not dataset_path.is_dir() and dataset_suffix in (".zip", ".tgz", ".tar.gz"):
        if dataset_suffix == ".zip":
            from zipfile import ZipFile

            ZipFile(dataset_path.as_posix()).extractall(path=image_directory_path)
        elif dataset_suffix == ".tar.gz":
            import tarfile

            with tarfile.open(dataset_path.as_posix()) as file:
                file.extractall(image_directory_path)
        elif dataset_suffix == ".tgz":
            import tarfile

            with tarfile.open(dataset_path.as_posix(), mode="r:gz") as file:
                file.extractall(image_directory_path)
        saved_dataset = str(dataset_path)
    else:
        os.system("cp -R {}/* {}".format(saved_dataset, image_directory_path))
    print(saved_dataset)
)
valid_aug = albumentations.Compose(
    [
        albumentations.Normalize(
            mean=[0.485, 0.456, 0.406],
            std=[0.229, 0.224, 0.225],
            max_pixel_value=255.0,
            p=1.0,
        ),
    ],
    p=1.0,
)

# download dataset (cached!)
dataset_folder = Dataset.get(dataset_id=cfg.dataset_id).get_local_copy()
train_image_paths = glob.glob(
    os.path.join(dataset_folder, f"jpeg-{cfg.image_size}x{cfg.image_size}", "train", "**", "*.jpeg"),
    recursive=True,
)
valid_image_paths = glob.glob(
    os.path.join(dataset_folder, f"jpeg-{cfg.image_size}x{cfg.image_size}", "val", "**", "*.jpeg"),
    recursive=True,
)
train_targets = [x.split("/")[-2] for x in train_image_paths]
valid_targets = [x.split("/")[-2] for x in valid_image_paths]
task = Task.init(project_name="Image Example", task_name="Image classification CIFAR10") params = { "number_of_epochs": 20, "batch_size": 64, "dropout": 0.25, "base_lr": 0.001, "momentum": 0.9, "loss_report": 100, } params = task.connect(params) # enabling configuration override by clearml/ print(params) # printing actual configuration (after override in remote mode) # The below gets the dataset and stores in the cache. If you want to download the dataset regardless if it's in the # cache, use the Dataset.get(dataset_name, dataset_project).get_mutable_local_copy(path to download) dataset_path = Dataset.get(dataset_name=dataset_name, dataset_project=dataset_project).get_local_copy() # Dataset and Dataloader initializations transform = transforms.Compose([transforms.ToTensor()]) trainset = datasets.CIFAR10(root=dataset_path, train=True, download=False, transform=transform) trainloader = torch.utils.data.DataLoader(trainset, batch_size=params.get( "batch_size", 4), shuffle=True, num_workers=10) testset = datasets.CIFAR10(root=dataset_path,
def trigger_dataset_func(dataset_id):
    dataset = Dataset.get(dataset_id=dataset_id)
    print('dataset id {} created'.format(dataset.id))
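# For context: a callback like this is typically registered with ClearML's TriggerScheduler so it
# fires whenever a new dataset version appears. A minimal sketch (the polling frequency and the
# trigger_project name are assumptions; verify against your clearml version):
from clearml.automation import TriggerScheduler

trigger = TriggerScheduler(pooling_frequency_minutes=3)
trigger.add_dataset_trigger(
    schedule_function=trigger_dataset_func,  # called with the new dataset's id
    name='trigger on new dataset',
    trigger_project='example_project',       # hypothetical project to watch
)
trigger.start()  # or trigger.start_remotely() to run the scheduler on a services queue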
def main(): task = Task.init(project_name="Nvidia Clara examples with ClearML", task_name="Training with Clara") task.set_base_docker( "nvcr.io/nvidia/clara-train-sdk:v3.1.01 --shm-size=1g --ulimit memlock=-1 --ulimit stack=67108864" ) parser = argparse.ArgumentParser() parser.add_argument("--mmar", "-m", type=str, help="MMAR_ROOT folder") parser.add_argument("--train_config", "-c", type=str, help="train config file", required=True) parser.add_argument("--env", "-e", type=str, help="environment file") parser.add_argument("--log_config", "-l", type=str, help="log config file") parser.add_argument("--write_train_stats", action="store_true") parser.add_argument("--set", metavar="KEY=VALUE", nargs="*") parser.add_argument("--parse_data", action="store_true", help="copy the artifact data") parser.add_argument( "--images_dir", type=str, help="Name of the images folder, will be store as a folder in DATA_ROOT." "Should be the same to the artifact name in the dataset task") parser.add_argument( "--labels_dir", type=str, help="Name of the labels folder, will be store as a folder in DATA_ROOT." "Should be the same to the artifact name in the dataset task") parser.add_argument( "--dataset_task", type=str, help= "The dataset task id, if not provided, a task named `Example data` will be chosen" ) set_env_vars() args = parser.parse_args() mmar = args.mmar or os.environ["MMAR_ROOT"] train_config = args.train_config env = args.env log_config = args.log_config kv = args.set images_dir = args.images_dir or "" labels_dir = args.labels_dir or "" dataset_task = args.dataset_task if dataset_task: dataset_task = Dataset.get(dataset_id=dataset_task) else: dataset_task = Dataset.get( dataset_project="Nvidia Clara examples with ClearML", dataset_name="Example data") updated_kv = [] if dataset_task: local_data = dataset_task.get_local_copy() for elem in kv: if elem.startswith("DATASET_JSON"): dataset_name = elem.rpartition("/")[2] updated_kv.append("DATASET_JSON={}".format( os.path.join(local_data, dataset_name))) else: updated_kv.append(elem) train_conf = task.connect_configuration(train_config, name="train", description="train config file") if env: env_conf = task.connect_configuration(env, name="env", description="environment file") with open(env_conf, "r") as env_file: import json env_dict = json.load(env_file) data_root = env_dict.get("DATA_ROOT", "/") # noinspection PyBroadException try: os.makedirs(os.path.join(mmar, data_root)) except Exception: pass dataset_json = env_dict.get("DATASET_JSON", "/") try: dataset_json_file = task.connect_configuration( os.path.join(mmar, dataset_json), name="dataset_json", description="dataset file") # noinspection PyBroadException try: os.makedirs(dataset_json.rpartition("/")[0]) except Exception: pass os.system("cp -R {} {}".format( dataset_json_file, os.path.join(mmar, dataset_json))) except Exception as ex: print("Can not connect dataset config file {},\n{}".format( dataset_json, ex)) for artifact in os.listdir(local_data): os.system("cp -R {} {}".format(os.path.join(local_data, artifact), str(os.path.join(mmar, data_root)))) if (artifact == images_dir and images_dir) or (artifact == labels_dir and labels_dir): os.system("mv {} {}".format( os.path.join(local_data, artifact), os.path.join(mmar, data_root, artifact))) else: env_conf = env log_conf = task.connect_configuration( log_config, name="log config", description="log config file") if log_config else log_config # noinspection PyBroadException try: os.makedirs(os.path.join(mmar, train_config.rpartition("/")[0])) except 
Exception: pass os.system("cp -R {} {}".format(train_conf, os.path.join(mmar, train_config))) # noinspection PyBroadException try: os.makedirs(os.path.join(mmar, env.rpartition("/")[0])) except Exception: pass os.system("cp -R {} {}".format(env_conf, os.path.join(mmar, env))) # noinspection PyBroadException try: os.makedirs(os.path.join(mmar, log_config.rpartition("/")[0])) except Exception: pass os.system("cp -R {} {}".format(log_conf, os.path.join(mmar, log_config))) train_mmar()
import pandas as pd
import pickle
import joblib
import matplotlib.pyplot as plt
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from clearml import Task, Dataset

# Connecting ClearML
task = Task.init(project_name="assignment1", task_name="training_dirty")

# get the dataset with the train/test split
dataset = Dataset.get(dataset_project='assignment1', dataset_name='dirty_data_split')

# get a read-only version of the data
dataset_folder = dataset.get_local_copy()

# open the dataset pickle file
with open(dataset_folder + '/dirty_train.pkl', 'rb') as f:
    X_train, X_test, y_train, y_test = pickle.load(f)

# train the model
rf = RandomForestRegressor(max_depth=2, random_state=0)
rf.fit(X_train, y_train)

# store the trained model
joblib.dump(rf, 'rf_dirty.pkl', compress=True)

# print model prediction results
result = rf.score(X_test, y_test)
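# The snippet above ends right after computing the held-out score; a natural follow-up (a sketch,
# not part of the original script - the series and artifact names are assumptions) is to report the
# score to the ClearML task and attach the serialized model file as an artifact:
task.get_logger().report_scalar(title='validation', series='r2_score', value=result, iteration=0)
task.upload_artifact(name='rf_dirty_model', artifact_object='rf_dirty.pkl')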
import os
import pickle
import pandas as pd
from clearml import Task, Dataset
from sklearn.model_selection import train_test_split

# Connecting ClearML
task = Task.init(project_name="assignment1", task_name="split_transformed")

# get the original dataset
dataset = Dataset.get(dataset_project='assignment1', dataset_name='transformed_dataset')

# create a copy that we can change
dataset_folder = dataset.get_mutable_local_copy(
    target_folder='/Users/guardi/MSCA/MLOps/ClearML/working_dataset',
    overwrite=True)
print(f"dataset_folder: {dataset_folder}")

df = pd.read_csv(dataset_folder + '/transformed_dataset.csv')

X = df[[
    'GDP per capita', 'Social support', 'Freedom to make life choices',
    'Generosity', 'Perceptions of corruption', 'Social Generosity'
]]
# target
y = df['Healthy life expectancy']

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42  # random_state assumed for reproducibility
)
def main(): task = Task.init(project_name="Nvidia Clara examples with ClearML", task_name="Validate Clara") task.set_base_docker( "nvcr.io/nvidia/clara-train-sdk:v3.1.01 --shm-size=1g --ulimit memlock=-1 --ulimit stack=67108864" ) parser = argparse.ArgumentParser() parser.add_argument('--mmar', '-m', type=str, help='MMAR_ROOT folder', required=True) parser.add_argument('--config', '-c', type=str, help='evaluate config file', required=True) parser.add_argument('--env', '-e', type=str, help='environment file') parser.add_argument('--log_config', '-l', type=str, help='log config file') parser.add_argument('--set', metavar='KEY=VALUE', nargs='*') parser.add_argument('--models_task', type=str, help='The training task id') parser.add_argument("--dataset_task", type=str, help="The dataset task id, if not provided, a task named `Example data` will be chosen") set_env_vars() args = parser.parse_args() mmar = args.mmar or os.environ["MMAR_ROOT"] evaluate_config = args.config env = args.env log_config = args.log_config kv = args.set dataset_task = args.dataset_task evaluate_conf = task.connect_configuration(evaluate_config, name="evaluate", description="evaluate config file") if env: env_conf = task.connect_configuration(env, name="env", description="environment file") if dataset_task: dataset_task = Dataset.get(dataset_id=dataset_task) else: dataset_task = Dataset.get(dataset_project="Nvidia Clara examples with ClearML", dataset_name="Example data") with open(env_conf, "r") as env_file: import json env_dict = json.load(env_file) data_root = env_dict.get("DATA_ROOT", "/") # noinspection PyBroadException try: os.makedirs(os.path.join(mmar, data_root)) except Exception: pass dataset_json = env_dict.get("DATASET_JSON", "/") try: dataset_json_file = task.connect_configuration(os.path.join(mmar, dataset_json), name="dataset_json", description="dataset file") # noinspection PyBroadException try: os.makedirs(os.path.join(mmar, dataset_json.rpartition("/")[0])) except Exception: pass os.system("cp -R {} {}".format(dataset_json_file, os.path.join(mmar, dataset_json))) except Exception as ex: print("Can not connect dataset config file {},\n{}".format(dataset_json, ex)) local_data = dataset_task.get_local_copy() for artifact in os.listdir(local_data): os.system("cp -R {} {}".format(os.path.join(local_data, artifact), str(os.path.join(mmar, data_root)))) os.system("mv {} {}".format(os.path.join(local_data, artifact), os.path.join(mmar, data_root, artifact))) else: env_conf = env log_conf = task.connect_configuration(log_config, name="log config", description="log config file") if log_config \ else log_config # noinspection PyBroadException try: os.makedirs(os.path.join(mmar, evaluate_config.rpartition("/")[0])) except Exception: pass os.system("cp -R {} {}".format(evaluate_conf, os.path.join(mmar, evaluate_config))) # noinspection PyBroadException try: os.makedirs(os.path.join(mmar, env.rpartition("/")[0])) except Exception: pass os.system("cp -R {} {}".format(env_conf, os.path.join(mmar, env))) # noinspection PyBroadException try: os.makedirs(os.path.join(mmar, log_config.rpartition("/")[0])) except Exception: pass os.system("cp -R {} {}".format(log_conf, os.path.join(mmar, log_config))) if args.models_task: m_task = Task.get_task(task_id=args.models_task) output_models = m_task.get_models().get("output") script_path = Path(__file__).parent.absolute() dest = [elem.partition("=")[2] for elem in kv if elem.startswith("MMAR_CKPT_DIR")][0] # noinspection PyBroadException try: os.makedirs(dest) except Exception: 
pass for mdl in output_models: m_output = mdl.get_weights_package() for model in m_output: os.system("mv {} {}".format(os.path.join(script_path, model), dest)) evaluate_mmar() # noinspection PyBroadException try: for f in Path(os.path.join(mmar, env_dict.get("MMAR_EVAL_OUTPUT_PATH", "/"))).rglob('*'): task.upload_artifact(f.name, artifact_object=f) except Exception: pass
import os
import pickle
import pandas as pd
from clearml import Task, Dataset
from sklearn.model_selection import train_test_split

# Connecting ClearML
task = Task.init(project_name="assignment1", task_name="split_clean")

# get the original dataset
dataset = Dataset.get(dataset_project='assignment1', dataset_name='clean_dataset')

# create a copy that we can change
dataset_folder = dataset.get_mutable_local_copy(
    target_folder='/Users/guardi/MSCA/MLOps/ClearML/working_dataset',
    overwrite=True)
print(f"dataset_folder: {dataset_folder}")

df = pd.read_csv(dataset_folder + '/clean_data.csv')

X = df[[
    'GDP per capita', 'Social support', 'Freedom to make life choices',
    'Generosity', 'Perceptions of corruption'
]]
# target
y = df['Healthy life expectancy']

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42  # random_state assumed for reproducibility
)
)
valid_aug = albumentations.Compose(
    [
        albumentations.Normalize(
            mean=[0.485, 0.456, 0.406],
            std=[0.229, 0.224, 0.225],
            max_pixel_value=255.0,
            p=1.0,
        ),
    ],
    p=1.0,
)

# download dataset (cached!) <---
dataset_folder = Dataset.get(dataset_id=DATASET_ID).get_local_copy()
train_image_paths = glob.glob(
    os.path.join(dataset_folder, f"jpeg-{IMAGE_SIZE}x{IMAGE_SIZE}", "train", "**", "*.jpeg"),
    recursive=True,
)
valid_image_paths = glob.glob(
    os.path.join(dataset_folder, f"jpeg-{IMAGE_SIZE}x{IMAGE_SIZE}", "val", "**", "*.jpeg"),
    recursive=True,
)
train_targets = [x.split("/")[-2] for x in train_image_paths]
valid_targets = [x.split("/")[-2] for x in valid_image_paths]
import os
import pickle
from clearml import Task, Dataset
from sklearn.model_selection import train_test_split

# Connecting ClearML
task = Task.init(project_name="uchicago", task_name="process dataset")

# get the original dataset
dataset = Dataset.get(dataset_project='uchicago', dataset_name='dataset1')

# create a copy that we can change
dataset_folder = dataset.get_mutable_local_copy(
    target_folder='working_dataset',
    overwrite=True)
print(f"dataset_folder: {dataset_folder}")

# open the dataset pickle file
with open(dataset_folder + '/iris_dataset.pkl', 'rb') as f:
    iris = pickle.load(f)

# "process" data (i.e. we split it into train/test)
X = iris.data
y = iris.target
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# store the dataset split into a pickle file
with open(dataset_folder + '/iris_dataset.pkl', 'wb') as f:
    pickle.dump([X_train, X_test, y_train, y_test], f)
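# The script above changes the mutable local copy but never registers the result with ClearML.
# A hedged follow-up sketch (the new dataset name is an assumption) creates a child dataset from
# the processed folder and uploads it, so downstream tasks can Dataset.get() the new version:
new_dataset = Dataset.create(
    dataset_project='uchicago',
    dataset_name='dataset1_processed',   # hypothetical name for the processed version
    parent_datasets=[dataset.id],
)
new_dataset.add_files(dataset_folder)    # picks up the updated iris_dataset.pkl
new_dataset.upload()                     # push the files to the configured storage
new_dataset.finalize()                   # lock this version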
# override numpy version for colab
Task.add_requirements('numpy', '1.19.5')

task = Task.init(
    project_name=project_name,
    task_name='Orig dataset split to sizes',
    task_type=Task.TaskTypes.data_processing,
    output_uri=True,  # auto save everything to ClearML Free
)
cfg = DataSplitConf()
task.connect(cfg, 'dataset split config')

if cfg.cloud_queue is not None and len(cfg.cloud_queue):
    task.execute_remotely(cfg.cloud_queue)

input_dataset = Dataset.get(dataset_id=cfg.input_dataset_id)
input_dataset_folder = input_dataset.get_local_copy()

# going to do some pruning relative to this folder
all_subfolders = [d for d in Path(input_dataset_folder).iterdir() if d.is_dir()]
all_subfolders_rel = [d.relative_to(input_dataset_folder) for d in all_subfolders]

# prepare an artifact for upload
results = {image_size: {'train': '', 'val': '', 'norm_info': {}} for image_size in cfg.image_size_values}

for image_size in cfg.image_size_values:
    dataset_name = f"{cfg.dataset_name}_{image_size}x{image_size}_"
    train_files, validation_files = \
        extract_relevant_filenames(input_dataset_folder, image_size)
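# The loop above presumably ends by filling `results` with the train/val dataset ids and norm_info
# per image size; handing that dict to downstream tasks as an artifact is one plausible next step.
# A hedged one-liner (the artifact-name config field is an assumption, borrowed from the field the
# next snippet indexes with):
task.upload_artifact(name=cfg.dataset_metadata_artifact_name, artifact_object=results)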
    cfg.dataset_metadata_artifact_name]
metadata = artifact.get()

for image_size, meta in metadata.items():
    print(f'processing {image_size}...')
    # get augmentations - including mean pixel value
    norm_info = meta['norm_info']
    # get dataset id's
    train_dataset_id = meta.get('train', "")
    valid_dataset_id = meta.get('val', "")
    if not len(train_dataset_id) or not len(valid_dataset_id):
        raise ValueError('Preprocess error: could not find'
                         f' datasets for image size {image_size}')
    # download dataset (cached!)
    try:
        train_dataset_folder = Dataset.get(
            dataset_id=train_dataset_id).get_local_copy()
        valid_dataset_folder = Dataset.get(
            dataset_id=valid_dataset_id).get_local_copy()
    except ValueError as ex:
        raise ValueError(
            f'Preprocess error for datasets for image size {image_size}\n{ex}'
        )
    train_image_paths = [
        f for f in Path(train_dataset_folder).glob('**/*.jp*g')
    ]
    valid_image_paths = [
        f for f in Path(valid_dataset_folder).glob('**/*.jp*g')
    ]
    # show some images