# create a mutable local copy of the dataset that we can modify
dataset_folder = dataset.get_mutable_local_copy(
    target_folder='/Users/guardi/MSCA/MLOps/ClearML/working_dataset',
    overwrite=True)
print(f"dataset_folder: {dataset_folder}")

df = pd.read_csv(dataset_folder + '/transformed_dataset.csv')

# features
X = df[[
    'GDP per capita',
    'Social support',
    'Freedom to make life choices',
    'Generosity',
    'Perceptions of corruption',
    'Social Generosity'
]]
# target
y = df['Healthy life expectancy']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# store the dataset split into a pickle file
with open(dataset_folder + '/transformed_train.pkl', 'wb') as f:
    pickle.dump([X_train, X_test, y_train, y_test], f)

# create a new version of the dataset with the pickle file
new_dataset = Dataset.create(dataset_project='assignment1',
                             dataset_name='transformed_data_split',
                             parent_datasets=[dataset])
new_dataset.sync_folder(local_path=dataset_folder)
new_dataset.upload()
new_dataset.finalize()

print('we are done')
# create a mutable local copy of the dataset that we can modify
dataset_folder = dataset.get_mutable_local_copy(
    target_folder='/Users/guardi/MSCA/MLOps/ClearML/working_dataset',
    overwrite=True)
print(f"dataset_folder: {dataset_folder}")

df = pd.read_csv(dataset_folder + '/clean_data.csv')

# features
X = df[[
    'GDP per capita',
    'Social support',
    'Freedom to make life choices',
    'Generosity',
    'Perceptions of corruption'
]]
# target
y = df['Healthy life expectancy']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# store the dataset split into a pickle file
with open(dataset_folder + '/clean_train.pkl', 'wb') as f:
    pickle.dump([X_train, X_test, y_train, y_test], f)

# create a new version of the dataset with the pickle file
new_dataset = Dataset.create(dataset_project='assignment1',
                             dataset_name='clean_data_split',
                             parent_datasets=[dataset])
new_dataset.sync_folder(local_path=dataset_folder)
new_dataset.upload()
new_dataset.finalize()

print('we are done')
# create a copy that we can change
dataset_folder = dataset.get_mutable_local_copy(
    target_folder='working_dataset',
    overwrite=True)
print(f"dataset_folder: {dataset_folder}")

# open the dataset pickle file
with open(dataset_folder + '/iris_dataset.pkl', 'rb') as f:
    iris = pickle.load(f)

# "process" data (i.e. we split it into train/test)
X = iris.data
y = iris.target
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# store the dataset split into a pickle file
with open(dataset_folder + '/iris_dataset.pkl', 'wb') as f:
    pickle.dump([X_train, X_test, y_train, y_test], f)

# create a new version of the dataset with the pickle file
new_dataset = Dataset.create(dataset_project='uchicago',
                             dataset_name='dataset2',
                             parent_datasets=[dataset])
new_dataset.sync_folder(local_path=dataset_folder)
new_dataset.upload()
new_dataset.finalize()

print('we are done')
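For context, a downstream task would typically pull this new version back down by project and dataset name. The sketch below is not part of the original script; it assumes the same 'uchicago' / 'dataset2' names used above and relies only on the standard Dataset.get() and get_local_copy() calls.

# Sketch of a downstream consumer (assumed usage, not from the original script):
# fetch the finalized split by project/name and load the pickled arrays.
import pickle
from clearml import Dataset

dataset2 = Dataset.get(dataset_project='uchicago', dataset_name='dataset2')
local_folder = dataset2.get_local_copy()  # read-only cached copy

with open(local_folder + '/iris_dataset.pkl', 'rb') as f:
    X_train, X_test, y_train, y_test = pickle.load(f)

print(X_train.shape, X_test.shape)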
parser.add_argument(
    '--clearml-project',
    dest='clearml_project',
    type=str,
    help='The name of the ClearML project that the dataset will be stored and published to.',
    default='Caltech Birds/Datasets')
parser.add_argument(
    '--clearml-dataset-url',
    dest='clearml_dataset_url',
    type=str,
    help='Location where the dataset files should be stored. Default is Azure Blob Storage. Format is azure://storage_account/container',
    default='azure://clearmllibrary/datasets')
args = parser.parse_args()

for task_type in ['train', 'test']:
    print('[INFO] Versioning and uploading {0} dataset for CUB200 2011'.format(task_type))

    dataset = Dataset.create('cub200_2011_{0}_dataset'.format(task_type),
                             dataset_project=args.clearml_project)

    dataset.add_files(path=os.path.join(args.dataset_basedir, task_type), verbose=False)

    dataset.upload(output_url=args.clearml_dataset_url)

    print('[INFO] {0} Dataset finalized....'.format(task_type), end='')
    dataset.finalize()
    print('done.')

    print('[INFO] {0} Dataset published....'.format(task_type), end='')
    dataset.publish()
    print('done.')
# Download CIFAR dataset and create a dataset with ClearML's Dataset class
from clearml import StorageManager, Dataset

manager = StorageManager()

dataset_path = manager.get_local_copy(
    remote_url="https://www.cs.toronto.edu/~kriz/cifar-10-python.tar.gz")

dataset = Dataset.create(dataset_name="cifar_dataset",
                         dataset_project="dataset_examples")

# Prepare and clean data here before it is added to the dataset

dataset.add_files(path=dataset_path)

# Dataset is uploaded to the ClearML Server by default
dataset.upload()
dataset.finalize()
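As an optional sanity check (an assumed addition, not part of the example above), the files registered in the version can be listed with the standard list_files() call before or after finalizing:

# Assumed sanity check: confirm what was registered in this dataset version
registered = dataset.list_files()
for file_name in registered:
    print(' -', file_name)
print(f'{len(registered)} file(s) registered')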
if cfg.delete_earlier_versions:
    for t in test_if_exists:
        try:
            Dataset.delete(t['id'])
            print(f'Deleted {t}')
        except ValueError:
            print('Could not delete dataset - has children?')

print(f'Now with {dataset_name}, creating!')

if cfg.use_lineage:
    new_dataset = Dataset.create(
        dataset_name=dataset_name + stage,
        dataset_project=project_name,
        parent_datasets=[cfg.input_dataset_id]
    )
print('...Done')

# remove other sizes
for other_folder_rel in all_subfolders_rel:
    if other_folder_rel != rel_folder:
        new_dataset.remove_files(str(other_folder_rel) + "/*", verbose=False)

# remove other stages
for not_stage in ['train', 'val', 'test']:
    if not_stage != stage:
        new_dataset.remove_files(str(rel_folder / not_stage) + "/*", verbose=False)

# upload should be a no-op in this case
rmed = new_dataset.list_removed_files(cfg.input_dataset_id)