def to_path(path_or_url):
    is_url = path_or_url.find('://') != -1  # todo fixme
    if is_url:
        try:
            temp_name = sniff.stream_to_file(urlopen(path_or_url), prefix='url_paste')
        except Exception as e:
            raise UploadProblemException('Unable to fetch %s\n%s' % (path_or_url, unicodify(e)))
        return temp_name, is_url

    return path_or_url, is_url
def add_composite_file(dataset, output_path, files_path):
    if dataset.composite_files:
        os.mkdir(files_path)
        for name, value in dataset.composite_files.items():
            value = util.bunch.Bunch(**value)
            if dataset.composite_file_paths[value.name] is None and not value.optional:
                raise UploadProblemException('A required composite data file was not provided (%s)' % name)
            elif dataset.composite_file_paths[value.name] is not None:
                dp = dataset.composite_file_paths[value.name]['path']
                isurl = dp.find('://') != -1  # todo fixme
                if isurl:
                    try:
                        temp_name = sniff.stream_to_file(urlopen(dp), prefix='url_paste')
                    except Exception as e:
                        raise UploadProblemException('Unable to fetch %s\n%s' % (dp, str(e)))
                    dataset.path = temp_name
                    dp = temp_name
                if not value.is_binary:
                    tmpdir = output_adjacent_tmpdir(output_path)
                    tmp_prefix = 'data_id_%s_convert_' % dataset.dataset_id
                    if dataset.composite_file_paths[value.name].get('space_to_tab', value.space_to_tab):
                        sniff.convert_newlines_sep2tabs(dp, tmp_dir=tmpdir, tmp_prefix=tmp_prefix)
                    else:
                        sniff.convert_newlines(dp, tmp_dir=tmpdir, tmp_prefix=tmp_prefix)
                shutil.move(dp, os.path.join(files_path, name))

    # Move the dataset to its "real" path
    shutil.move(dataset.primary_file, output_path)

    # Write the job info
    return dict(type='dataset',
                dataset_id=dataset.dataset_id,
                stdout='uploaded %s file' % dataset.file_type)
def to_path(path_or_url):
    is_url = path_or_url.find('://') != -1  # todo fixme
    if is_url:
        try:
            temp_name = sniff.stream_url_to_file(path_or_url, file_sources=get_file_sources())
        except Exception as e:
            raise UploadProblemException('Unable to fetch %s\n%s' % (path_or_url, unicodify(e)))
        return temp_name, is_url

    return path_or_url, is_url
def to_path(path_or_url):
    isa_url = is_url(path_or_url)
    file_sources = get_file_sources()
    if isa_url or file_sources and file_sources.looks_like_uri(path_or_url):
        try:
            temp_name = sniff.stream_url_to_file(path_or_url, file_sources=file_sources)
        except Exception as e:
            raise UploadProblemException('Unable to fetch %s\n%s' % (path_or_url, unicodify(e)))
        return temp_name, isa_url

    return path_or_url, isa_url
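# The "# todo fixme" in the earlier to_path variants flags the fragile substring test
# (path_or_url.find('://') != -1); the final variant above delegates to an is_url() helper
# plus file_sources.looks_like_uri() instead. Below is a minimal, self-contained sketch of
# such a scheme check -- an illustration only, not Galaxy's actual galaxy.util.is_url
# implementation, and the accepted scheme list is an assumption.
from urllib.parse import urlparse

def looks_like_url(path_or_url, schemes=('http', 'https', 'ftp', 'ftps')):
    # Accept the string only when it carries an explicit, allow-listed scheme.
    return urlparse(path_or_url).scheme in schemes

assert looks_like_url('https://example.org/data.tsv')
assert not looks_like_url('/data/galaxy/files/dataset_1.dat')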
def add_file(dataset, registry, output_path):
    ext = None
    compression_type = None
    line_count = None
    link_data_only_str = dataset.get('link_data_only', 'copy_files')
    if link_data_only_str not in ['link_to_files', 'copy_files']:
        raise UploadProblemException("Invalid setting '%s' for option link_data_only - upload request misconfigured" % link_data_only_str)
    link_data_only = link_data_only_str == 'link_to_files'

    # run_as_real_user is estimated from galaxy config (external chmod indicated of inputs executed)
    # If this is True we always purge supplied upload inputs so they are cleaned up and we reuse their
    # paths during data conversions since this user already owns that path.
    # Older in_place check for upload jobs created before 18.01, TODO remove in 19.XX. xref #5206
    run_as_real_user = dataset.get('run_as_real_user', False) or dataset.get("in_place", False)

    # purge_source defaults to True unless this is an FTP import and
    # ftp_upload_purge has been overridden to False in Galaxy's config.
    # We set purge_source to False if:
    # - the job does not have write access to the file, e.g. when running as the
    #   real user
    # - the files are uploaded from external paths.
    purge_source = dataset.get('purge_source', True) and not run_as_real_user and dataset.type not in ('server_dir', 'path_paste')

    # in_place is True unless we are running as a real user or importing external paths (i.e.
    # this is a real upload and not a path paste or ftp import).
    # in_place should always be False if running as real user because the uploaded file will
    # be owned by Galaxy and not the user and it should be False for external paths so Galaxy doesn't
    # modify files not controlled by Galaxy.
    in_place = not run_as_real_user and dataset.type not in ('server_dir', 'path_paste', 'ftp_import')

    # Based on the check_upload_content Galaxy config option and on by default, this enables some
    # security related checks on the uploaded content, but can prevent uploads from working in some cases.
    check_content = dataset.get('check_content', True)

    # auto_decompress is a request flag that can be swapped off to prevent Galaxy from automatically
    # decompressing archive files before sniffing.
    auto_decompress = dataset.get('auto_decompress', True)
    try:
        dataset.file_type
    except AttributeError:
        raise UploadProblemException('Unable to process uploaded file, missing file_type parameter.')

    if dataset.type == 'url':
        try:
            dataset.path = sniff.stream_url_to_file(dataset.path, file_sources=get_file_sources())
        except Exception as e:
            raise UploadProblemException('Unable to fetch %s\n%s' % (dataset.path, unicodify(e)))

    # See if we have an empty file
    if not os.path.exists(dataset.path):
        raise UploadProblemException('Uploaded temporary file (%s) does not exist.' % dataset.path)

    stdout, ext, datatype, is_binary, converted_path = handle_upload(
        registry=registry,
        path=dataset.path,
        requested_ext=dataset.file_type,
        name=dataset.name,
        tmp_prefix='data_id_%s_upload_' % dataset.dataset_id,
        tmp_dir=output_adjacent_tmpdir(output_path),
        check_content=check_content,
        link_data_only=link_data_only,
        in_place=in_place,
        auto_decompress=auto_decompress,
        convert_to_posix_lines=dataset.to_posix_lines,
        convert_spaces_to_tabs=dataset.space_to_tab,
    )

    # Strip compression extension from name
    if compression_type and not getattr(datatype, 'compressed', False) and dataset.name.endswith('.' + compression_type):
        dataset.name = dataset.name[:-len('.' + compression_type)]

    # Move dataset
    if link_data_only:
        # Never alter a file that will not be copied to Galaxy's local file store.
        if datatype.dataset_content_needs_grooming(dataset.path):
            err_msg = 'The uploaded files need grooming, so change your <b>Copy data into Galaxy?</b> selection to be ' + \
                      '<b>Copy files into Galaxy</b> instead of <b>Link to files without copying into Galaxy</b> so grooming can be performed.'
            raise UploadProblemException(err_msg)
    if not link_data_only:
        # Move the dataset to its "real" path. converted_path is a tempfile so we move it even if purge_source is False.
        if purge_source or converted_path:
            try:
                # If the user has indicated that the original file is to be purged and we have a converted_path tempfile
                if purge_source and converted_path:
                    shutil.move(converted_path, output_path)
                    os.remove(dataset.path)
                else:
                    shutil.move(converted_path or dataset.path, output_path)
            except OSError as e:
                # We may not have permission to remove the input
                if e.errno != errno.EACCES:
                    raise
        else:
            shutil.copy(dataset.path, output_path)

    # Write the job info
    stdout = stdout or 'uploaded %s file' % ext
    info = dict(type='dataset',
                dataset_id=dataset.dataset_id,
                ext=ext,
                stdout=stdout,
                name=dataset.name,
                line_count=line_count)
    if dataset.get('uuid', None) is not None:
        info['uuid'] = dataset.get('uuid')

    # FIXME: does this belong here? also not output-adjacent-tmpdir aware =/
    if not link_data_only and datatype and datatype.dataset_content_needs_grooming(output_path):
        # Groom the dataset content if necessary
        datatype.groom_dataset_content(output_path)

    return info
def add_composite_file(dataset, registry, output_path, files_path):
    datatype = None

    # Find data type
    if dataset.file_type is not None:
        datatype = registry.get_datatype_by_extension(dataset.file_type)

    def to_path(path_or_url):
        is_url = path_or_url.find('://') != -1  # todo fixme
        if is_url:
            try:
                temp_name = sniff.stream_url_to_file(path_or_url, file_sources=get_file_sources())
            except Exception as e:
                raise UploadProblemException('Unable to fetch %s\n%s' % (path_or_url, unicodify(e)))
            return temp_name, is_url

        return path_or_url, is_url

    def make_files_path():
        safe_makedirs(files_path)

    def stage_file(name, composite_file_path, is_binary=False):
        dp = composite_file_path['path']
        path, is_url = to_path(dp)
        if is_url:
            dataset.path = path
            dp = path

        auto_decompress = composite_file_path.get('auto_decompress', True)
        if auto_decompress and not datatype.composite_type and CompressedFile.can_decompress(dp):
            # It isn't an explicitly composite datatype, so these are just extra files to attach
            # as composite data. It'd be better if Galaxy was communicating this to the tool
            # a little more explicitly so we didn't need to dispatch on the datatype and so we
            # could attach arbitrary extra composite data to an existing composite datatype if
            # need be? Perhaps that would be a mistake though.
            CompressedFile(dp).extract(files_path)
        else:
            tmpdir = output_adjacent_tmpdir(output_path)
            tmp_prefix = 'data_id_%s_convert_' % dataset.dataset_id
            sniff.handle_composite_file(
                datatype,
                dp,
                files_path,
                name,
                is_binary,
                tmpdir,
                tmp_prefix,
                composite_file_path,
            )

    # Do we have pre-defined composite files from the datatype definition.
    if dataset.composite_files:
        make_files_path()
        for name, value in dataset.composite_files.items():
            value = bunch.Bunch(**value)
            if value.name not in dataset.composite_file_paths:
                raise UploadProblemException("Failed to find file_path %s in %s" % (value.name, dataset.composite_file_paths))
            if dataset.composite_file_paths[value.name] is None and not value.optional:
                raise UploadProblemException('A required composite data file was not provided (%s)' % name)
            elif dataset.composite_file_paths[value.name] is not None:
                composite_file_path = dataset.composite_file_paths[value.name]
                stage_file(name, composite_file_path, value.is_binary)
    # Do we have ad-hoc user supplied composite files.
    elif dataset.composite_file_paths:
        make_files_path()
        for key, composite_file in dataset.composite_file_paths.items():
            stage_file(key, composite_file)  # TODO: replace these defaults

    # Move the dataset to its "real" path
    primary_file_path, _ = to_path(dataset.primary_file)
    shutil.move(primary_file_path, output_path)

    # Write the job info
    return dict(type='dataset',
                dataset_id=dataset.dataset_id,
                stdout='uploaded %s file' % dataset.file_type)
def add_file(dataset, registry, output_path):
    ext = None
    compression_type = None
    line_count = None
    converted_path = None
    stdout = None
    link_data_only_str = dataset.get('link_data_only', 'copy_files')
    if link_data_only_str not in ['link_to_files', 'copy_files']:
        raise UploadProblemException("Invalid setting '%s' for option link_data_only - upload request misconfigured" % link_data_only_str)
    link_data_only = link_data_only_str == 'link_to_files'

    # run_as_real_user is estimated from galaxy config (external chmod indicated of inputs executed)
    # If this is True we always purge supplied upload inputs so they are cleaned up and we reuse their
    # paths during data conversions since this user already owns that path.
    # Older in_place check for upload jobs created before 18.01, TODO remove in 19.XX. xref #5206
    run_as_real_user = dataset.get('run_as_real_user', False) or dataset.get("in_place", False)

    # purge_source defaults to True unless this is an FTP import and
    # ftp_upload_purge has been overridden to False in Galaxy's config.
    # We set purge_source to False if:
    # - the job does not have write access to the file, e.g. when running as the
    #   real user
    # - the files are uploaded from external paths.
    purge_source = dataset.get('purge_source', True) and not run_as_real_user and dataset.type not in ('server_dir', 'path_paste')

    # in_place is True unless we are running as a real user or importing external paths (i.e.
    # this is a real upload and not a path paste or ftp import).
    # in_place should always be False if running as real user because the uploaded file will
    # be owned by Galaxy and not the user and it should be False for external paths so Galaxy doesn't
    # modify files not controlled by Galaxy.
    in_place = not run_as_real_user and dataset.type not in ('server_dir', 'path_paste', 'ftp_import')

    # Based on the check_upload_content Galaxy config option and on by default, this enables some
    # security related checks on the uploaded content, but can prevent uploads from working in some cases.
    check_content = dataset.get('check_content', True)

    # auto_decompress is a request flag that can be swapped off to prevent Galaxy from automatically
    # decompressing archive files before sniffing.
    auto_decompress = dataset.get('auto_decompress', True)
    try:
        dataset.file_type
    except AttributeError:
        raise UploadProblemException('Unable to process uploaded file, missing file_type parameter.')

    if dataset.type == 'url':
        try:
            dataset.path = sniff.stream_url_to_file(dataset.path)
        except Exception as e:
            raise UploadProblemException('Unable to fetch %s\n%s' % (dataset.path, str(e)))

    # See if we have an empty file
    if not os.path.exists(dataset.path):
        raise UploadProblemException('Uploaded temporary file (%s) does not exist.' % dataset.path)

    if not os.path.getsize(dataset.path) > 0:
        raise UploadProblemException('The uploaded file is empty')

    # Does the first 1K contain a null?
    is_binary = check_binary(dataset.path)

    # Decompress if needed/desired and determine/validate filetype. If a keep-compressed datatype is explicitly selected
    # or if autodetection is selected and the file sniffs as a keep-compressed datatype, it will not be decompressed.
    if not link_data_only:
        if is_zip(dataset.path) and not is_single_file_zip(dataset.path):
            stdout = 'ZIP file contained more than one file, only the first file was added to Galaxy.'
        try:
            ext, converted_path, compression_type = sniff.handle_uploaded_dataset_file(
                dataset.path,
                registry,
                ext=dataset.file_type,
                tmp_prefix='data_id_%s_upload_' % dataset.dataset_id,
                tmp_dir=output_adjacent_tmpdir(output_path),
                in_place=in_place,
                check_content=check_content,
                is_binary=is_binary,
                auto_decompress=auto_decompress,
                uploaded_file_ext=os.path.splitext(dataset.name)[1].lower().lstrip('.'),
                convert_to_posix_lines=dataset.to_posix_lines,
                convert_spaces_to_tabs=dataset.space_to_tab,
            )
        except sniff.InappropriateDatasetContentError as exc:
            raise UploadProblemException(str(exc))
    elif dataset.file_type == 'auto':
        # Link mode can't decompress anyway, so enable sniffing for keep-compressed datatypes even when auto_decompress
        # is enabled
        os.environ['GALAXY_SNIFFER_VALIDATE_MODE'] = '1'
        ext = sniff.guess_ext(dataset.path, registry.sniff_order, is_binary=is_binary)
        os.environ.pop('GALAXY_SNIFFER_VALIDATE_MODE')

    # The converted path will be the same as the input path if no conversion was done (or in-place conversion is used)
    converted_path = None if converted_path == dataset.path else converted_path

    # Validate datasets where the filetype was explicitly set using the filetype's sniffer (if any)
    if dataset.file_type != 'auto':
        datatype = registry.get_datatype_by_extension(dataset.file_type)
        # Enable sniffer "validate mode" (prevents certain sniffers from disabling themselves)
        os.environ['GALAXY_SNIFFER_VALIDATE_MODE'] = '1'
        if hasattr(datatype, 'sniff') and not datatype.sniff(dataset.path):
            stdout = ("Warning: The file 'Type' was set to '{ext}' but the file does not appear to be of that"
                      " type".format(ext=dataset.file_type))
        os.environ.pop('GALAXY_SNIFFER_VALIDATE_MODE')

    # Handle unsniffable binaries
    if is_binary and ext == 'binary':
        upload_ext = os.path.splitext(dataset.name)[1].lower().lstrip('.')
        if registry.is_extension_unsniffable_binary(upload_ext):
            stdout = ("Warning: The file's datatype cannot be determined from its contents and was guessed based on"
                      " its extension, to avoid this warning, manually set the file 'Type' to '{ext}' when uploading"
                      " this type of file".format(ext=upload_ext))
            ext = upload_ext
        else:
            stdout = ("The uploaded binary file format cannot be determined automatically, please set the file 'Type'"
                      " manually")

    datatype = registry.get_datatype_by_extension(ext)

    # Strip compression extension from name
    if compression_type and not getattr(datatype, 'compressed', False) and dataset.name.endswith('.' + compression_type):
        dataset.name = dataset.name[:-len('.' + compression_type)]

    # Move dataset
    if link_data_only:
        # Never alter a file that will not be copied to Galaxy's local file store.
        if datatype.dataset_content_needs_grooming(dataset.path):
            err_msg = 'The uploaded files need grooming, so change your <b>Copy data into Galaxy?</b> selection to be ' + \
                      '<b>Copy files into Galaxy</b> instead of <b>Link to files without copying into Galaxy</b> so grooming can be performed.'
            raise UploadProblemException(err_msg)
    if not link_data_only:
        # Move the dataset to its "real" path. converted_path is a tempfile so we move it even if purge_source is False.
        if purge_source or converted_path:
            try:
                shutil.move(converted_path or dataset.path, output_path)
            except OSError as e:
                # We may not have permission to remove the input
                if e.errno != errno.EACCES:
                    raise
        else:
            shutil.copy(dataset.path, output_path)

    # Write the job info
    stdout = stdout or 'uploaded %s file' % ext
    info = dict(type='dataset',
                dataset_id=dataset.dataset_id,
                ext=ext,
                stdout=stdout,
                name=dataset.name,
                line_count=line_count)
    if dataset.get('uuid', None) is not None:
        info['uuid'] = dataset.get('uuid')

    # FIXME: does this belong here? also not output-adjacent-tmpdir aware =/
    if not link_data_only and datatype and datatype.dataset_content_needs_grooming(output_path):
        # Groom the dataset content if necessary
        datatype.groom_dataset_content(output_path)

    return info
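# Hedged sketch of the move-vs-copy decision used at the end of add_file above: converted
# tempfiles are always moved into place, originals are moved only when purge_source is set,
# and link_to_files uploads are left untouched. The helper name and signature below are
# illustrative only, not part of Galaxy's API.
import shutil

def place_output(source_path, output_path, converted_path=None, purge_source=True, link_data_only=False):
    if link_data_only:
        # Never alter a file that will not be copied to Galaxy's local file store.
        return source_path
    if purge_source or converted_path:
        shutil.move(converted_path or source_path, output_path)
    else:
        shutil.copy(source_path, output_path)
    return output_path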
def _resolve_src(item):
    converted_path = None
    name, path = _has_src_to_path(item)
    dbkey = item.get("dbkey", "?")
    requested_ext = item.get("ext", "auto")
    info = item.get("info", None)
    object_id = item.get("object_id", None)
    link_data_only = upload_config.link_data_only
    if "link_data_only" in item:
        # Allow overriding this on a per file basis.
        link_data_only = _link_data_only(item)
    to_posix_lines = upload_config.get_option(item, "to_posix_lines")
    space_to_tab = upload_config.get_option(item, "space_to_tab")
    in_place = item.get("in_place", False)
    purge_source = item.get("purge_source", True)

    # Follow upload.py logic but without the auto-decompress logic.
    registry = upload_config.registry
    check_content = upload_config.check_content
    data_type, ext = None, requested_ext
    is_binary = check_binary(path)
    if is_binary:
        data_type, ext = handle_sniffable_binary_check(data_type, ext, path, registry)
    if data_type is None:
        root_datatype = registry.get_datatype_by_extension(ext)
        if getattr(root_datatype, 'compressed', False):
            data_type = 'compressed archive'
            ext = ext
        elif is_binary:
            data_type, ext = handle_unsniffable_binary_check(data_type, ext, path, name, is_binary, requested_ext, check_content, registry)
    if not data_type and check_content and check_html(path):
        raise UploadProblemException('The uploaded file contains inappropriate HTML content')

    if data_type != 'binary':
        if not link_data_only:
            if to_posix_lines:
                if space_to_tab:
                    line_count, converted_path = sniff.convert_newlines_sep2tabs(path, in_place=in_place, tmp_dir=".")
                else:
                    line_count, converted_path = sniff.convert_newlines(path, in_place=in_place, tmp_dir=".")
            else:
                if space_to_tab:
                    line_count, converted_path = sniff.sep2tabs(path, in_place=in_place, tmp_dir=".")
        if requested_ext == 'auto':
            ext = sniff.guess_ext(converted_path or path, registry.sniff_order)
        else:
            ext = requested_ext
        data_type = ext

    if ext == 'auto' and data_type == 'binary':
        ext = 'data'
    if ext == 'auto' and requested_ext:
        ext = requested_ext
    if ext == 'auto':
        ext = 'data'

    datatype = registry.get_datatype_by_extension(ext)
    if link_data_only:
        # Never alter a file that will not be copied to Galaxy's local file store.
        if datatype.dataset_content_needs_grooming(path):
            err_msg = 'The uploaded files need grooming, so change your <b>Copy data into Galaxy?</b> selection to be ' + \
                      '<b>Copy files into Galaxy</b> instead of <b>Link to files without copying into Galaxy</b> so grooming can be performed.'
            raise UploadProblemException(err_msg)

    # If this file is not in the workdir make sure it gets there.
    if not link_data_only and converted_path:
        path = upload_config.ensure_in_working_directory(converted_path, purge_source, in_place)
    elif not link_data_only:
        path = upload_config.ensure_in_working_directory(path, purge_source, in_place)

    if not link_data_only and datatype and datatype.dataset_content_needs_grooming(path):
        # Groom the dataset content if necessary
        datatype.groom_dataset_content(path)

    rval = {"name": name, "filename": path, "dbkey": dbkey, "ext": ext, "link_data_only": link_data_only}
    if info is not None:
        rval["info"] = info
    if object_id is not None:
        rval["object_id"] = object_id
    return rval
def _resolve_item_with_primary(item):
    error_message = None
    converted_path = None

    name, path = _has_src_to_path(upload_config, item, is_dataset=True)
    sources = []

    url = item.get("url")
    if url:
        sources.append({"source_uri": url})
    hashes = item.get("hashes", [])
    for hash_dict in hashes:
        hash_function = hash_dict.get("hash_function")
        hash_value = hash_dict.get("hash_value")
        try:
            _handle_hash_validation(upload_config, hash_function, hash_value, path)
        except Exception as e:
            error_message = str(e)
            item["error_message"] = error_message

    dbkey = item.get("dbkey", "?")
    link_data_only = upload_config.link_data_only
    if "link_data_only" in item:
        # Allow overriding this on a per file basis.
        link_data_only = _link_data_only(item)

    ext = "data"
    staged_extra_files = None
    if not error_message:
        requested_ext = item.get("ext", "auto")
        to_posix_lines = upload_config.get_option(item, "to_posix_lines")
        space_to_tab = upload_config.get_option(item, "space_to_tab")
        auto_decompress = upload_config.get_option(item, "auto_decompress")
        in_place = item.get("in_place", False)
        purge_source = item.get("purge_source", True)

        registry = upload_config.registry
        check_content = upload_config.check_content

        stdout, ext, datatype, is_binary, converted_path = handle_upload(
            registry=registry,
            path=path,
            requested_ext=requested_ext,
            name=name,
            tmp_prefix='data_fetch_upload_',
            tmp_dir=".",
            check_content=check_content,
            link_data_only=link_data_only,
            in_place=in_place,
            auto_decompress=auto_decompress,
            convert_to_posix_lines=to_posix_lines,
            convert_spaces_to_tabs=space_to_tab,
        )

        if link_data_only:
            # Never alter a file that will not be copied to Galaxy's local file store.
            if datatype.dataset_content_needs_grooming(path):
                err_msg = 'The uploaded files need grooming, so change your <b>Copy data into Galaxy?</b> selection to be ' + \
                          '<b>Copy files into Galaxy</b> instead of <b>Link to files without copying into Galaxy</b> so grooming can be performed.'
                raise UploadProblemException(err_msg)

        # If this file is not in the workdir make sure it gets there.
        if not link_data_only and converted_path:
            path = upload_config.ensure_in_working_directory(converted_path, purge_source, in_place)
        elif not link_data_only:
            path = upload_config.ensure_in_working_directory(path, purge_source, in_place)

        extra_files = item.get("extra_files")
        if extra_files:
            # TODO: optimize to just copy the whole directory to extra files instead.
            assert not upload_config.link_data_only, "linking composite dataset files not yet implemented"
            extra_files_path = path + "_extra"
            staged_extra_files = extra_files_path
            os.mkdir(extra_files_path)

            def walk_extra_files(items, prefix=""):
                for item in items:
                    if "elements" in item:
                        name = item.get("name")
                        if not prefix:
                            item_prefix = name
                        else:
                            item_prefix = os.path.join(prefix, name)
                        walk_extra_files(item.get("elements"), prefix=item_prefix)
                    else:
                        name, src_path = _has_src_to_path(upload_config, item)
                        if prefix:
                            rel_path = os.path.join(prefix, name)
                        else:
                            rel_path = name
                        file_output_path = os.path.join(staged_extra_files, rel_path)
                        parent_dir = os.path.dirname(file_output_path)
                        if not os.path.exists(parent_dir):
                            safe_makedirs(parent_dir)
                        shutil.move(src_path, file_output_path)

            walk_extra_files(extra_files.get("elements", []))

        # TODO:
        # in galaxy json add 'extra_files' and point at target derived from extra_files:
        if not link_data_only and datatype and datatype.dataset_content_needs_grooming(path):
            # Groom the dataset content if necessary
            datatype.groom_dataset_content(path)

    rval = {"name": name, "filename": path, "dbkey": dbkey, "ext": ext, "link_data_only": link_data_only, "sources": sources, "hashes": hashes}
    if staged_extra_files:
        rval["extra_files"] = os.path.abspath(staged_extra_files)
    return _copy_and_validate_simple_attributes(item, rval)
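# Hedged illustration of how the walk_extra_files() helper above maps a nested "elements"
# tree onto relative paths under the staged extra-files directory. The payload and helper
# below are made up for illustration: the real code resolves leaf names through
# _has_src_to_path() and moves the files, whereas this sketch only computes the paths.
import os

def collect_extra_file_paths(items, prefix=""):
    paths = []
    for element in items:
        if "elements" in element:
            child_prefix = os.path.join(prefix, element.get("name")) if prefix else element.get("name")
            paths.extend(collect_extra_file_paths(element["elements"], prefix=child_prefix))
        else:
            paths.append(os.path.join(prefix, element["name"]) if prefix else element["name"])
    return paths

example = [{"name": "tables", "elements": [{"name": "a.tsv"}, {"name": "b.tsv"}]}, {"name": "readme.txt"}]
assert collect_extra_file_paths(example) == [os.path.join("tables", "a.tsv"), os.path.join("tables", "b.tsv"), "readme.txt"]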
def _resolve_src(item):
    converted_path = None
    name, path = _has_src_to_path(item)
    dbkey = item.get("dbkey", "?")
    requested_ext = item.get("ext", "auto")
    info = item.get("info", None)
    object_id = item.get("object_id", None)
    link_data_only = upload_config.link_data_only
    if "link_data_only" in item:
        # Allow overriding this on a per file basis.
        link_data_only = _link_data_only(item)
    to_posix_lines = upload_config.get_option(item, "to_posix_lines")
    space_to_tab = upload_config.get_option(item, "space_to_tab")
    auto_decompress = upload_config.get_option(item, "auto_decompress")
    in_place = item.get("in_place", False)
    purge_source = item.get("purge_source", True)

    registry = upload_config.registry
    check_content = upload_config.check_content

    stdout, ext, datatype, is_binary, converted_path = handle_upload(
        registry=registry,
        path=path,
        requested_ext=requested_ext,
        name=name,
        tmp_prefix='data_fetch_upload_',
        tmp_dir=".",
        check_content=check_content,
        link_data_only=link_data_only,
        in_place=in_place,
        auto_decompress=auto_decompress,
        convert_to_posix_lines=to_posix_lines,
        convert_spaces_to_tabs=space_to_tab,
    )

    if link_data_only:
        # Never alter a file that will not be copied to Galaxy's local file store.
        if datatype.dataset_content_needs_grooming(path):
            err_msg = 'The uploaded files need grooming, so change your <b>Copy data into Galaxy?</b> selection to be ' + \
                      '<b>Copy files into Galaxy</b> instead of <b>Link to files without copying into Galaxy</b> so grooming can be performed.'
            raise UploadProblemException(err_msg)

    # If this file is not in the workdir make sure it gets there.
    if not link_data_only and converted_path:
        path = upload_config.ensure_in_working_directory(converted_path, purge_source, in_place)
    elif not link_data_only:
        path = upload_config.ensure_in_working_directory(path, purge_source, in_place)

    if not link_data_only and datatype and datatype.dataset_content_needs_grooming(path):
        # Groom the dataset content if necessary
        datatype.groom_dataset_content(path)

    rval = {"name": name, "filename": path, "dbkey": dbkey, "ext": ext, "link_data_only": link_data_only}
    if info is not None:
        rval["info"] = info
    if object_id is not None:
        rval["object_id"] = object_id
    return rval