# Module-level imports assumed by the functions below. The solvebio import
# paths follow the public solvebio-python client and may need adjusting to
# the local module layout; helpers such as should_exclude() and
# _create_template_from_file() are defined elsewhere in the CLI module and
# are not reproduced here.
import base64
import binascii
import gzip
import json
import mimetypes
import os
import re
import sys

import requests
from urllib3.util.retry import Retry

import solvebio
from solvebio import (Dataset, DatasetImport, DatasetTemplate, Object,
                      SolveError, Vault)
from solvebio.client import client
from solvebio.errors import FileUploadError, NotFoundError
from solvebio.utils.files import check_gzip_path
from solvebio.utils.md5sum import md5sum


def upload(args):
    """
    Given a folder or file, upload all the folders and files contained
    within it, skipping ones that already exist on the remote.
    """
    base_remote_path, path_dict = Object.validate_full_path(
        args.full_path, vault=args.vault, path=args.path)

    # Assert the vault exists and is accessible
    vault = Vault.get_by_full_path(path_dict['vault_full_path'])

    # If not the vault root, validate remote path exists and is a folder
    if path_dict['path'] != '/':
        Object.get_by_full_path(base_remote_path, assert_type='folder')

    for local_path in args.local_path:
        local_path = local_path.rstrip('/')
        local_start = os.path.basename(local_path)

        if os.path.isdir(local_path):
            _upload_folder(path_dict['domain'], vault, base_remote_path,
                           local_path, local_start)
        else:
            Object.upload_file(local_path, path_dict['path'],
                               vault.full_path)


def _create_folder(vault, full_path, tags=None):
    """Create a folder if it does not already exist."""
    full_path, path_dict = Object.validate_full_path(full_path)
    folder_name = path_dict['filename']

    try:
        new_obj = Object.get_by_full_path(full_path)
        if not new_obj.is_folder:
            raise SolveError('Object type {} already exists at location: {}'
                             .format(new_obj.object_type, full_path))
    except NotFoundError:
        # Create the folder
        if path_dict['parent_path'] == '/':
            parent_object_id = None
        else:
            parent = Object.get_by_full_path(path_dict['parent_full_path'],
                                             assert_type='folder')
            parent_object_id = parent.id

        # Make the API call
        new_obj = Object.create(
            vault_id=vault.id,
            parent_object_id=parent_object_id,
            object_type='folder',
            filename=folder_name,
            tags=tags or []
        )
        print('Notice: Folder created for {0} at {1}'
              .format(folder_name, new_obj.path))

    return new_obj


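# Usage sketch (hypothetical names): ensure a leaf folder exists before
# writing into it. Assumes an authenticated solvebio session and an
# existing vault at 'mydomain:myvault'; the parent folder '/raw' must
# already exist, since _create_folder() only creates the final component.
def _example_create_folder():
    vault = Vault.get_by_full_path('mydomain:myvault')
    # No-op if '/raw/2024' is already a folder; raises SolveError if a
    # non-folder object occupies that path.
    return _create_folder(vault, 'mydomain:myvault:/raw/2024',
                          tags=['incoming'])

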
def import_file(args):
    """
    Given a dataset and a local path, upload and import the file(s).

    Command arguments (args):

    * create_dataset
    * template_id
    * full_path
    * vault (optional, overrides the vault in full_path)
    * path (optional, overrides the path in full_path)
    * commit_mode
    * capacity
    * file (list)
    * follow (default: False)
    """
    full_path, path_dict = Object.validate_full_path(
        args.full_path, vault=args.vault, path=args.path)

    # Ensure the dataset exists. Create if necessary.
    if args.create_dataset:
        dataset = create_dataset(args)
    else:
        try:
            dataset = solvebio.Dataset.get_by_full_path(full_path)
        except solvebio.SolveError as e:
            if e.status_code != 404:
                raise e
            print("Dataset not found: {0}".format(full_path))
            print("Tip: use the --create-dataset flag "
                  "to create one from a template")
            sys.exit(1)

    # Generate a manifest from the local files
    manifest = solvebio.Manifest()
    manifest.add(*args.file)

    # Create the manifest-based import
    imp = solvebio.DatasetImport.create(
        dataset_id=dataset.id,
        manifest=manifest.manifest,
        commit_mode=args.commit_mode
    )

    if args.follow:
        imp.follow()
    else:
        mesh_url = 'https://my.solvebio.com/activity/'
        print("Your import has been submitted, view details at: {0}"
              .format(mesh_url))


def get_or_create_by_full_path(cls, full_path, **kwargs):
    from solvebio import Vault
    from solvebio import Object

    _client = kwargs.pop('client', None) or cls._client or client
    create_vault = kwargs.pop('create_vault', False)
    create_folders = kwargs.pop('create_folders', True)

    # Check for an object type assertion. If none was explicitly passed,
    # fall back to any object_type the user provided, since their intent
    # was to get/create an object of that type.
    assert_type = kwargs.pop('assert_type', kwargs.get('object_type', None))

    try:
        return cls.get_by_full_path(full_path, assert_type=assert_type,
                                    client=_client)
    except NotFoundError:
        pass

    # Object type is required when creating an Object
    object_type = kwargs.get('object_type')
    if not object_type:
        raise Exception("'object_type' is required when creating a new "
                        "Object. Pass one of: file, folder, dataset")

    # TODO: should we require file contents?
    # Technically a user could then use this object to call upload_file()
    # if object_type == 'file' and not kwargs.get('content'):
    #     raise Exception('')

    # Object not found, create it step-by-step
    full_path, parts = Object.validate_full_path(full_path, client=_client)

    if create_vault:
        vault = Vault.get_or_create_by_full_path('{0}:{1}'.format(
            parts['domain'], parts['vault']), client=_client)
    else:
        vaults = Vault.all(account_domain=parts['domain'],
                           name=parts['vault'], client=_client)
        if len(vaults.solve_objects()) == 0:
            raise Exception('Vault with name {0}:{1} does not exist. Pass '
                            'create_vault=True to auto-create'.format(
                                parts['domain'], parts['vault']))
        vault = vaults.solve_objects()[0]

    # Create the folders to hold the object if they do not already exist.
    object_path = parts['path']
    curr_path = os.path.dirname(object_path)
    folders_to_create = []
    new_folders = []
    id_map = {'/': None}
    while curr_path != '/':
        try:
            obj = Object.get_by_path(curr_path, vault_id=vault.id,
                                     assert_type='folder', client=_client)
            id_map[curr_path] = obj.id
            break
        except NotFoundError:
            if not create_folders:
                raise Exception('Folder {} does not exist. Pass '
                                'create_folders=True to auto-create '
                                'missing folders'.format(curr_path))
            folders_to_create.append(curr_path)
            curr_path = '/'.join(curr_path.split('/')[:-1])
            if curr_path == '':
                break

    for folder in reversed(folders_to_create):
        new_folder = Object.create(
            object_type='folder',
            vault_id=vault.id,
            filename=os.path.basename(folder),
            parent_object_id=id_map[os.path.dirname(folder)],
            client=_client)
        new_folders.append(new_folder)
        id_map[folder] = new_folder.id

    if os.path.dirname(object_path) == '/':
        parent_folder_id = None
    elif new_folders:
        parent_folder_id = new_folders[-1].id
    else:
        parent_folder_id = id_map[os.path.dirname(object_path)]

    return Object.create(filename=os.path.basename(object_path),
                         vault_id=vault.id,
                         parent_object_id=parent_folder_id,
                         client=_client,
                         **kwargs)


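# Usage sketch (hypothetical path): get_or_create_by_full_path() above is
# written as an Object classmethod (note the cls parameter), so it is
# normally invoked as Object.get_or_create_by_full_path(). The vault and
# path values are illustrative.
def _example_get_or_create_object():
    # Creates the vault, any intermediate folders, and the file record
    # as needed; 'object_type' is required when the object is new.
    return Object.get_or_create_by_full_path(
        'mydomain:myvault:/staging/2024/notes.txt',
        object_type='file',
        create_vault=True,
        create_folders=True,
    )

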
def _upload_folder(domain, vault, base_remote_path, base_local_path,
                   local_start, exclude_paths=None, dry_run=False):
    # Create the upload root folder if it does not exist on the remote
    try:
        upload_root_path, _ = Object.validate_full_path(
            os.path.join(base_remote_path, local_start)
        )
        Object.get_by_full_path(upload_root_path, assert_type='folder')
    except NotFoundError:
        base_remote_path, path_dict = \
            Object.validate_full_path(base_remote_path)
        base_folder_path = os.path.join(base_remote_path, local_start)
        if dry_run:
            print('[Dry Run] Creating folder {}'.format(base_folder_path))
        else:
            _create_folder(vault, base_folder_path)

    # Escape the local base path so it is stripped as a literal prefix
    # (it may contain regex metacharacters).
    local_base_prefix = re.escape(os.path.dirname(base_local_path))

    # Create folders and upload files
    for abs_local_parent_path, folders, files in os.walk(base_local_path):
        # Strip off the local base path, leaving the path relative to
        # the upload root at each step of the walk
        local_parent_path = re.sub(
            '^' + local_base_prefix, '', abs_local_parent_path
        ).lstrip('/')

        if should_exclude(abs_local_parent_path, exclude_paths,
                          dry_run=dry_run):
            continue

        remote_folder_full_path = \
            os.path.join(base_remote_path, local_parent_path)

        # Create folders
        for folder in folders:
            new_folder_path = os.path.join(abs_local_parent_path, folder)
            if should_exclude(new_folder_path, exclude_paths,
                              dry_run=dry_run):
                continue

            remote_path = os.path.join(remote_folder_full_path, folder)
            if dry_run:
                print('[Dry Run] Creating folder {}'.format(remote_path))
            else:
                _create_folder(vault, remote_path)

        # Upload the files that do not yet exist on the remote
        for f in files:
            local_file_path = os.path.join(abs_local_parent_path, f)
            if should_exclude(local_file_path, exclude_paths,
                              dry_run=dry_run):
                continue

            if dry_run:
                print('[Dry Run] Uploading {} to {}'
                      .format(local_file_path, remote_folder_full_path))
            else:
                remote_parent = Object.get_by_full_path(
                    remote_folder_full_path, assert_type='folder')
                Object.upload_file(local_file_path, remote_parent.path,
                                   vault.full_path)


def import_file(args):
    """
    Given a dataset and a local path, upload and import the file(s).

    Command arguments (args):

    * create_dataset (and its args)
        * capacity
        * template_id
        * template_file
        * tag
        * metadata
        * metadata_json_file
        * create_vault
    * full_path
    * commit_mode
    * remote_source
    * dry_run
    * follow
    * file (list)
    """
    if args.dry_run:
        print("NOTE: Running import command in dry run mode")

    full_path, path_dict = Object.validate_full_path(args.full_path)

    files_list = []
    if args.remote_source:
        # Validate remote files
        for file_fp in args.file:
            files_ = list(Object.all(glob=file_fp, limit=1000))
            if not files_:
                print("Did not find any {}files at path {}".format(
                    'remote ' if args.remote_source else '', file_fp))
            else:
                for file_ in files_:
                    print("Found file: {}".format(file_.full_path))
                    files_list.append(file_)
    else:
        # Local files
        # Note: if these are globs or folders, then this will
        # create a multi-file manifest, which is deprecated
        # and should be updated to one file per import.
        files_list = [fp for fp in args.file]

    if not files_list:
        print("Exiting. No files were found at the following {}paths: {}"
              .format('remote ' if args.remote_source else '',
                      ', '.join(args.file)))
        sys.exit(1)

    if args.template_id:
        try:
            template = DatasetTemplate.retrieve(args.template_id)
        except SolveError as e:
            if e.status_code != 404:
                raise e
            print("No template with ID {0} found!".format(args.template_id))
            sys.exit(1)
    elif args.template_file:
        template = _create_template_from_file(args.template_file,
                                              args.dry_run)
    else:
        template = None

    # Ensure the dataset exists. Create if necessary.
    if args.create_dataset:
        dataset = create_dataset(args, template=template)
    else:
        try:
            dataset = Object.get_by_full_path(full_path,
                                              assert_type='dataset')
        except solvebio.errors.NotFoundError:
            print("Dataset not found: {0}".format(full_path))
            print("Tip: use the --create-dataset flag "
                  "to create one from a template")
            sys.exit(1)

    if args.dry_run:
        print("Importing the following files/paths into dataset: {}"
              .format(full_path))
        for file_ in files_list:
            if args.remote_source:
                print(file_.full_path)
            else:
                print(file_)
        return

    # Create one import per file (local files are wrapped in a manifest)
    imports = []
    for file_ in files_list:
        if args.remote_source:
            kwargs = dict(object_id=file_.id)
        else:
            manifest = solvebio.Manifest()
            manifest.add(file_)
            kwargs = dict(manifest=manifest.manifest)

        # Add template params
        if template:
            kwargs.update(template.import_params)

        # Create the import
        import_ = DatasetImport.create(
            dataset_id=dataset.id,
            commit_mode=args.commit_mode,
            **kwargs
        )
        imports.append(import_)

    if args.follow:
        dataset.activity(follow=True)
    else:
        mesh_url = 'https://my.solvebio.com/activity/'
        print("Your import has been submitted, view details at: {0}"
              .format(mesh_url))

    return imports, dataset


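# Usage sketch (hypothetical values): driving the extended import_file()
# above programmatically with an argparse-style namespace that mirrors the
# CLI flags. All vault, dataset, and file values are illustrative.
def _example_import_file():
    from argparse import Namespace
    imports, dataset = import_file(Namespace(
        full_path='mydomain:myvault:/datasets/variants',
        file=['./variants.vcf.gz'],
        remote_source=False,
        create_dataset=True,
        create_vault=False,
        capacity='small',
        template_id=None,
        template_file=None,
        tag=None,
        metadata=None,
        metadata_json_file=None,
        commit_mode='append',
        dry_run=False,
        follow=False,
    ))
    return imports, dataset

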
def upload(args):
    """
    Given a folder or file, upload all the folders and files contained
    within it, skipping ones that already exist on the remote.
    """
    base_remote_path, path_dict = Object.validate_full_path(args.full_path)

    # Assert the vault exists and is accessible
    vault = Vault.get_by_full_path(path_dict['vault_full_path'])

    # If not the vault root, validate remote path exists and is a folder
    if path_dict['path'] != '/':
        try:
            Object.get_by_full_path(base_remote_path, assert_type='folder')
        except NotFoundError:
            if not args.create_full_path:
                raise

            if args.dry_run:
                print('[Dry Run] Creating {}'.format(base_remote_path))
            else:
                # Create the destination path (including subfolders)
                # if not found
                parent_folder_path = vault.full_path + ':'
                folders = path_dict['path'].lstrip('/').split('/')
                for folder in folders:
                    folder_full_path = os.path.join(parent_folder_path,
                                                    folder)
                    parent_folder = _create_folder(vault, folder_full_path)
                    parent_folder_path = parent_folder.full_path

    # Exit if there are multiple local paths and the
    # exclude paths are not absolute
    base_exclude_paths = args.exclude or []
    if base_exclude_paths and len(args.local_path) > 1:
        rel_exclude_paths = [p for p in base_exclude_paths
                             if not os.path.isabs(p)]
        local_path_parents = set([os.path.dirname(os.path.abspath(p))
                                  for p in args.local_path])
        if rel_exclude_paths and len(local_path_parents) > 1:
            sys.exit('Exiting. Cannot apply the --exclude relative paths '
                     'when multiple upload paths with different parent '
                     'directories are specified. Make --exclude paths '
                     'absolute or run upload paths one at a time.')

    for local_path in args.local_path:
        # Expand the local path and strip any trailing slash
        local_path = os.path.abspath(local_path).rstrip('/')
        local_name = os.path.basename(local_path)

        # Anchor relative exclude paths at this upload path
        # (os.path.join leaves absolute excludes unchanged)
        exclude_paths = [
            os.path.join(local_path, os.path.normpath(exclude_path))
            for exclude_path in base_exclude_paths
        ]

        if os.path.isdir(local_path):
            _upload_folder(path_dict['domain'], vault, base_remote_path,
                           local_path, local_name,
                           exclude_paths=exclude_paths,
                           dry_run=args.dry_run)
        else:
            if args.dry_run:
                print('[Dry Run] Uploading {} to {}'
                      .format(local_path, path_dict['path']))
            else:
                Object.upload_file(local_path, path_dict['path'],
                                   vault.full_path)


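# Usage sketch (hypothetical vault/paths): a dry run of the extended
# upload() above via an argparse-style namespace, mirroring the CLI flags.
# 'mydomain:myvault:/incoming' and './data' are illustrative only.
def _example_upload_dry_run():
    from argparse import Namespace
    upload(Namespace(
        full_path='mydomain:myvault:/incoming',
        local_path=['./data'],
        exclude=['scratch'],     # relative to each local path
        create_full_path=True,   # create /incoming if missing
        dry_run=True,            # print actions without uploading
    ))

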
def create_dataset(args, template=None):
    """
    Attempt to create a new dataset given the following params:

    * template_id
    * template_file
    * capacity
    * tag
    * metadata
    * metadata_json_file
    * create_vault
    * full_path
    * dry_run
    """
    if args.dry_run:
        print("NOTE: Running create-dataset command in dry run mode")

    full_path, path_dict = Object.validate_full_path(args.full_path)

    try:
        # Fail if a dataset already exists.
        Object.get_by_full_path(full_path, assert_type='dataset')
        print('A dataset already exists at path: {0}'.format(full_path))
        sys.exit(1)
    except NotFoundError:
        pass

    # Accept a template_id or a template_file
    if template:
        # The template has already been validated/created
        # by the import command that called this function
        pass
    elif args.template_id:
        try:
            template = DatasetTemplate.retrieve(args.template_id)
        except SolveError as e:
            if e.status_code != 404:
                raise e
            print("No template with ID {0} found!".format(args.template_id))
            sys.exit(1)
    elif args.template_file:
        template = _create_template_from_file(args.template_file,
                                              args.dry_run)
    else:
        template = None

    if template:
        print("Creating new dataset {0} using the template '{1}'."
              .format(full_path, template.name))
        fields = template.fields
        description = 'Created with dataset template: {0}' \
            .format(str(template.id))
    else:
        fields = []
        description = None

    # Create dataset metadata.
    # Looks at --metadata-json-file first, then updates those values
    # with any key/value pairs passed to --metadata.
    metadata = {}
    if args.metadata and args.metadata_json_file:
        print('WARNING: Received --metadata and --metadata-json-file. '
              'Will update the JSON file values with the --metadata values')

    if args.metadata_json_file:
        with open(args.metadata_json_file, 'r') as fp:
            try:
                metadata = json.load(fp)
            except ValueError:
                print('Metadata JSON file {0} could not be loaded. Please '
                      'pass valid JSON'.format(args.metadata_json_file))
                sys.exit(1)

    if args.metadata:
        metadata.update(args.metadata)

    if args.dry_run:
        print("Creating new '{}' capacity dataset at {}"
              .format(args.capacity, full_path))
        if description:
            print("Description: {}".format(description))
        if fields:
            print("Fields: {}".format(fields))
        if args.tag:
            print("Tags: {}".format(args.tag))
        if metadata:
            print("Metadata: {}".format(metadata))
        return

    return Dataset.get_or_create_by_full_path(
        full_path,
        capacity=args.capacity,
        fields=fields,
        description=description,
        tags=args.tag or [],
        metadata=metadata,
        create_vault=args.create_vault,
    )


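# Usage sketch (hypothetical values): create_dataset() above in dry run
# mode with inline metadata. The vault path and tag values are
# illustrative.
def _example_create_dataset_dry_run():
    from argparse import Namespace
    create_dataset(Namespace(
        full_path='mydomain:myvault:/datasets/samples',
        capacity='small',
        template_id=None,
        template_file=None,
        tag=['demo'],
        metadata={'project': 'demo'},
        metadata_json_file=None,
        create_vault=False,
        dry_run=True,   # only prints what would be created
    ))

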
def _upload_folder(domain, vault, base_remote_path, base_local_path,
                   local_start):
    # Create the upload root folder if it does not exist on the remote
    try:
        upload_root_path, _ = Object.validate_full_path(
            os.path.join(base_remote_path, local_start))
        obj = Object.get_by_full_path(upload_root_path,
                                      assert_type='folder')
    except NotFoundError:
        base_remote_path, path_dict = \
            Object.validate_full_path(base_remote_path)
        if path_dict['path'] == '/':
            parent_object_id = None
        else:
            obj = Object.get_by_full_path(base_remote_path,
                                          assert_type='folder')
            parent_object_id = obj.id

        # Create the base folder
        new_folder = Object.create(vault_id=vault.id,
                                   parent_object_id=parent_object_id,
                                   object_type='folder',
                                   filename=local_start)
        print('Notice: Folder created for {0} at {1}'.format(
            base_local_path, new_folder.path))

    # Escape the local base path so it is stripped as a literal prefix
    # (it may contain regex metacharacters).
    local_base_prefix = re.escape(os.path.dirname(base_local_path))

    for root, dirs, files in os.walk(base_local_path):
        # Create the sub-folders that do not exist on the remote
        for d in dirs:
            dirpath = os.path.join(
                base_remote_path,
                re.sub('^' + local_base_prefix, '', root).lstrip('/'),
                d)
            try:
                Object.get_by_full_path(dirpath, object_type='folder')
            except NotFoundError:
                # Create the folder
                if os.path.dirname(dirpath.split(':')[-1]) == '/':
                    parent_object_id = None
                else:
                    parent_full_path = os.path.dirname(dirpath)
                    parent = Object.get_by_full_path(parent_full_path,
                                                     assert_type='folder')
                    parent_object_id = parent.id

                # Make the API call
                new_obj = Object.create(
                    vault_id=vault.id,
                    parent_object_id=parent_object_id,
                    object_type='folder',
                    filename=d,
                )
                print('Notice: Folder created for {0} at {1}'.format(
                    os.path.join(root, d), new_obj.path))

        # Upload the files that do not yet exist on the remote
        for f in files:
            file_full_path = os.path.join(
                base_remote_path,
                re.sub('^' + local_base_prefix, '', root).lstrip('/'),
                f)
            try:
                Object.get_by_full_path(file_full_path)
            except NotFoundError:
                parent_full_path = os.path.dirname(file_full_path)
                parent = Object.get_by_full_path(parent_full_path,
                                                 assert_type='folder')
                Object.upload_file(os.path.join(root, f), parent.path,
                                   vault.full_path)


def create_dataset(args):
    """
    Attempt to create a new dataset given the following params:

    * template_id
    * template_file
    * capacity
    * create_vault
    * [argument] dataset name or full path

    NOTE: genome_build has been deprecated and is no longer used.
    """
    # For backwards compatibility, the "full_path" argument
    # can be a dataset filename, but only if vault and path
    # are set. If vault/path are both provided and there
    # are no forward-slashes in the "full_path", assume
    # the user has provided a dataset filename.
    if '/' not in args.full_path and args.vault and args.path:
        full_path, path_dict = Object.validate_full_path(
            '{0}:/{1}/{2}'.format(args.vault, args.path, args.full_path))
    else:
        full_path, path_dict = Object.validate_full_path(
            args.full_path, vault=args.vault, path=args.path)

    # Accept a template_id or a template_file
    if args.template_id:
        # Validate the template ID
        try:
            tpl = solvebio.DatasetTemplate.retrieve(args.template_id)
        except solvebio.SolveError as e:
            if e.status_code != 404:
                raise e
            print("No template with ID {0} found!"
                  .format(args.template_id))
            sys.exit(1)
    elif args.template_file:
        mode = 'r'
        fopen = open
        if check_gzip_path(args.template_file):
            mode = 'rb'
            fopen = gzip.open

        # Validate the template file
        with fopen(args.template_file, mode) as fp:
            try:
                tpl_json = json.load(fp)
            except ValueError:
                print('Template file {0} could not be loaded. Please '
                      'pass valid JSON'.format(args.template_file))
                sys.exit(1)

        tpl = solvebio.DatasetTemplate.create(**tpl_json)
        print("A new dataset template was created with id: {0}"
              .format(tpl.id))
    else:
        print("Creating a new dataset {0} without a template."
              .format(full_path))
        tpl = None

    fields = []
    entity_type = None
    description = None
    if tpl:
        print("Creating new dataset {0} using the template '{1}'."
              .format(full_path, tpl.name))
        fields = tpl.fields
        entity_type = tpl.entity_type
        # Record the template used to create the dataset
        description = 'Created with dataset template: {0}'.format(
            str(tpl.id))

    return solvebio.Dataset.get_or_create_by_full_path(
        full_path,
        capacity=args.capacity,
        entity_type=entity_type,
        fields=fields,
        description=description,
        create_vault=args.create_vault,
    )


def get_or_create_by_full_path(cls, full_path, **kwargs):
    from solvebio import Vault
    from solvebio import Object

    _client = kwargs.pop('client', None) or cls._client or client
    create_vault = kwargs.pop('create_vault', False)
    create_folders = kwargs.pop('create_folders', True)

    try:
        return Dataset.get_by_full_path(full_path, assert_type='dataset',
                                        client=_client)
    except NotFoundError:
        pass

    # Dataset not found, create it step-by-step
    full_path, parts = Object.validate_full_path(full_path, client=_client)

    if create_vault:
        vault = Vault.get_or_create_by_full_path('{0}:{1}'.format(
            parts['domain'], parts['vault']), client=_client)
    else:
        vaults = Vault.all(account_domain=parts['domain'],
                           name=parts['vault'], client=_client)
        if len(vaults.solve_objects()) == 0:
            raise Exception(
                'Vault does not exist with name {0}:{1}'.format(
                    parts['domain'], parts['vault']))
        vault = vaults.solve_objects()[0]

    # Create the folders to hold the dataset if they do not already exist.
    object_path = parts['path']
    curr_path = os.path.dirname(object_path)
    folders_to_create = []
    new_folders = []
    id_map = {'/': None}
    while curr_path != '/':
        try:
            obj = Object.get_by_path(curr_path, vault_id=vault.id,
                                     assert_type='folder', client=_client)
            id_map[curr_path] = obj.id
            break
        except NotFoundError:
            if not create_folders:
                raise Exception('Folder {} does not exist. Pass '
                                'create_folders=True to auto-create '
                                'missing folders'.format(curr_path))
            folders_to_create.append(curr_path)
            curr_path = '/'.join(curr_path.split('/')[:-1])
            if curr_path == '':
                break

    for folder in reversed(folders_to_create):
        new_folder = Object.create(
            object_type='folder',
            vault_id=vault.id,
            filename=os.path.basename(folder),
            parent_object_id=id_map[os.path.dirname(folder)],
            client=_client)
        new_folders.append(new_folder)
        id_map[folder] = new_folder.id

    if os.path.dirname(object_path) == '/':
        parent_folder_id = None
    elif new_folders:
        parent_folder_id = new_folders[-1].id
    else:
        parent_folder_id = id_map[os.path.dirname(object_path)]

    return Dataset.create(name=os.path.basename(object_path),
                          vault_id=vault.id,
                          vault_parent_object_id=parent_folder_id,
                          client=_client,
                          **kwargs)


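# Usage sketch (hypothetical path): the Dataset classmethod above in a
# single call; missing folders (and optionally the vault) are created on
# the way to the new dataset. Values are illustrative.
def _example_get_or_create_dataset():
    return Dataset.get_or_create_by_full_path(
        'mydomain:myvault:/studies/2024/cohort_a',
        capacity='small',
        create_vault=True,
        create_folders=True,
    )

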
def upload_file(cls, local_path, remote_path, vault_full_path, **kwargs):
    from solvebio import Vault
    from solvebio import Object

    _client = kwargs.pop('client', None) or cls._client or client
    local_path = os.path.expanduser(local_path)

    if os.stat(local_path).st_size == 0:
        print('Notice: Cannot upload empty file {0}'.format(local_path))
        return

    # Get the vault
    vault = Vault.get_by_full_path(vault_full_path, client=_client)

    # Get the MD5, mimetype, and file size for the object
    local_md5, _ = md5sum(local_path, multipart_threshold=None)
    _, mimetype = mimetypes.guess_type(local_path)
    size = os.path.getsize(local_path)

    # Check if the object already exists and compare md5sums
    full_path, path_dict = Object.validate_full_path(os.path.join(
        '{}:{}'.format(vault.full_path, remote_path),
        os.path.basename(local_path)), client=_client)

    try:
        obj = cls.get_by_full_path(full_path, client=_client)
        if not obj.is_file:
            print('WARNING: A {} currently exists at {}'.format(
                obj.object_type, full_path))
        else:
            # Check against the md5sum of the remote file
            if obj.md5 == local_md5:
                print('WARNING: File {} (md5sum {}) already exists, '
                      'not uploading'.format(full_path, local_md5))
                return obj
            else:
                print('WARNING: File {} exists on SolveBio with a '
                      'different md5sum (local: {} vs remote: {}). '
                      'Uploading anyway, but not overwriting.'.format(
                          full_path, local_md5, obj.md5))
    except NotFoundError:
        pass

    # Look up the parent object
    if path_dict['parent_path'] == '/':
        parent_object_id = None
    else:
        parent_obj = Object.get_by_full_path(path_dict['parent_full_path'],
                                             assert_type='folder',
                                             client=_client)
        parent_object_id = parent_obj.id

    description = kwargs.get('description')

    # Create the file record, then upload its contents to the upload URL
    obj = Object.create(vault_id=vault.id,
                        parent_object_id=parent_object_id,
                        object_type='file',
                        filename=os.path.basename(local_path),
                        md5=local_md5,
                        mimetype=mimetype,
                        size=size,
                        description=description,
                        tags=kwargs.get('tags', []) or [],
                        client=_client)
    print('Notice: File created for {0} at {1}'.format(local_path,
                                                       obj.path))
    print('Notice: Upload initialized')

    upload_url = obj.upload_url
    headers = {
        'Content-MD5': base64.b64encode(binascii.unhexlify(local_md5)),
        'Content-Type': mimetype,
        'Content-Length': str(size),
    }

    # Use a session with a retry policy to handle connection errors.
    session = requests.Session()
    max_retries = 5
    retry = Retry(
        total=max_retries,
        read=max_retries,
        connect=max_retries,
        backoff_factor=0.3,
        status_forcelist=(500, 502, 504, 400),
    )
    session.mount('https://',
                  requests.adapters.HTTPAdapter(max_retries=retry))

    # Stream the file contents; the context manager closes the handle
    # (the original left the file object open).
    with open(local_path, 'rb') as fp:
        upload_resp = session.put(upload_url, data=fp, headers=headers)

    if upload_resp.status_code != 200:
        print('WARNING: Upload status code for {0} was {1}'.format(
            local_path, upload_resp.status_code))
        # Clean up the failed upload
        obj.delete(force=True)
        raise FileUploadError(upload_resp.content)
    else:
        print('Notice: Successfully uploaded {0} to {1}'.format(
            local_path, obj.path))

    return obj


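# Usage sketch (hypothetical paths): upload_file() above is an Object
# classmethod, matching the Object.upload_file() calls in the upload
# helpers earlier in this module. It skips the upload when the remote md5
# matches and removes the object record if the PUT fails.
def _example_upload_file():
    return Object.upload_file('~/data/reads.fastq.gz',  # local file
                              '/incoming',              # remote folder path
                              'mydomain:myvault',       # vault full path
                              tags=['raw'],
                              description='Example upload')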