def set_meta_with_tool_provided(dataset_instance, file_dict, set_meta_kwds, datatypes_registry, max_metadata_value_size):
    # This method is somewhat odd, in that we set the metadata attributes from tool,
    # then call set_meta, then set metadata attributes from tool again.
    # This is intentional due to interplay of overwrite kwd, the fact that some metadata
    # parameters may rely on the values of others, and that we are accepting the
    # values provided by the tool as Truth.
    extension = dataset_instance.extension
    if extension == "_sniff_":
        try:
            from galaxy.datatypes import sniff
            extension = sniff.handle_uploaded_dataset_file(dataset_instance.dataset.external_filename, datatypes_registry)
            # We need to both set the extension so it is available to set_meta
            # and record it in the metadata so it can be reloaded on the server
            # side and the model updated (see MetadataCollection.{from,to}_JSON_dict)
            dataset_instance.extension = extension
            # Set special metadata property that will reload this on server side.
            setattr(dataset_instance.metadata, "__extension__", extension)
        except Exception:
            log.exception("Problem sniffing datatype.")

    for metadata_name, metadata_value in file_dict.get('metadata', {}).items():
        setattr(dataset_instance.metadata, metadata_name, metadata_value)

    dataset_instance.datatype.set_meta(dataset_instance, **set_meta_kwds)

    for metadata_name, metadata_value in file_dict.get('metadata', {}).items():
        setattr(dataset_instance.metadata, metadata_name, metadata_value)

    if max_metadata_value_size:
        for k, v in list(dataset_instance.metadata.items()):
            if total_size(v) > max_metadata_value_size:
                log.info("Key %s too large for metadata, discarding" % k)
                dataset_instance.metadata.remove_key(k)
def set_meta_with_tool_provided( dataset_instance, file_dict, set_meta_kwds, datatypes_registry ):
    # This method is somewhat odd, in that we set the metadata attributes from tool,
    # then call set_meta, then set metadata attributes from tool again.
    # This is intentional due to interplay of overwrite kwd, the fact that some metadata
    # parameters may rely on the values of others, and that we are accepting the
    # values provided by the tool as Truth.
    extension = dataset_instance.extension
    if extension == "_sniff_":
        try:
            from galaxy.datatypes import sniff
            extension = sniff.handle_uploaded_dataset_file( dataset_instance.dataset.external_filename, datatypes_registry )
            # We need to both set the extension so it is available to set_meta
            # and record it in the metadata so it can be reloaded on the server
            # side and the model updated (see MetadataCollection.{from,to}_JSON_dict)
            dataset_instance.extension = extension
            # Set special metadata property that will reload this on server side.
            setattr( dataset_instance.metadata, "__extension__", extension )
        except Exception:
            # TODO: log this when metadata can log stuff...
            # https://trello.com/c/Nrwodu9d
            pass

    for metadata_name, metadata_value in file_dict.get( 'metadata', {} ).iteritems():
        setattr( dataset_instance.metadata, metadata_name, metadata_value )

    dataset_instance.datatype.set_meta( dataset_instance, **set_meta_kwds )

    for metadata_name, metadata_value in file_dict.get( 'metadata', {} ).iteritems():
        setattr( dataset_instance.metadata, metadata_name, metadata_value )
def set_meta_with_tool_provided(dataset_instance, file_dict, set_meta_kwds, datatypes_registry):
    # This method is somewhat odd, in that we set the metadata attributes from tool,
    # then call set_meta, then set metadata attributes from tool again.
    # This is intentional due to interplay of overwrite kwd, the fact that some metadata
    # parameters may rely on the values of others, and that we are accepting the
    # values provided by the tool as Truth.
    extension = dataset_instance.extension
    if extension == "_sniff_":
        try:
            from galaxy.datatypes import sniff
            extension = sniff.handle_uploaded_dataset_file(dataset_instance.dataset.external_filename, datatypes_registry)
            # We need to both set the extension so it is available to set_meta
            # and record it in the metadata so it can be reloaded on the server
            # side and the model updated (see MetadataCollection.{from,to}_JSON_dict)
            dataset_instance.extension = extension
            # Set special metadata property that will reload this on server side.
            setattr(dataset_instance.metadata, "__extension__", extension)
        except Exception:
            # TODO: log this when metadata can log stuff...
            # https://trello.com/c/Nrwodu9d
            pass

    for metadata_name, metadata_value in file_dict.get('metadata', {}).iteritems():
        setattr(dataset_instance.metadata, metadata_name, metadata_value)

    dataset_instance.datatype.set_meta(dataset_instance, **set_meta_kwds)

    for metadata_name, metadata_value in file_dict.get('metadata', {}).iteritems():
        setattr(dataset_instance.metadata, metadata_name, metadata_value)
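The comments in the variants above describe an intentional "set, call set_meta, set again" ordering so that tool-provided values win over anything the datatype's set_meta() computes, while set_meta() can still read them. A minimal, self-contained sketch of that ordering follows; the Mock* classes and the sample metadata keys are hypothetical stand-ins, not Galaxy classes.

# Standalone sketch (no Galaxy imports) of the "set, set_meta, set again" ordering.
class MockMetadata(dict):
    def __setattr__(self, name, value):
        self[name] = value

class MockDatatype:
    def set_meta(self, dataset, **kwds):
        # A datatype may derive one value from a tool-provided value...
        dataset.metadata['data_lines'] = dataset.metadata.get('sequences', 0) * 2
        # ...and may also clobber a tool-provided value, which is why the
        # tool-provided values are re-applied afterwards.
        dataset.metadata['sequences'] = 0

class MockDataset:
    metadata = MockMetadata()
    datatype = MockDatatype()

file_dict = {'metadata': {'sequences': 10}}   # hypothetical tool-provided values
dataset = MockDataset()

for name, value in file_dict['metadata'].items():    # first pass: tool values
    setattr(dataset.metadata, name, value)
dataset.datatype.set_meta(dataset)                    # may read and overwrite them
for name, value in file_dict['metadata'].items():     # second pass: tool values win
    setattr(dataset.metadata, name, value)

print(dataset.metadata)   # {'sequences': 10, 'data_lines': 20}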
def sniff_and_handle_data_type(file_path, datatypes_registry):
    """
    The sniff.handle_uploaded_dataset_file() method in Galaxy performs dual
    functions: it sniffs the filetype and, if it's a compressed archive for a
    non-compressed datatype such as fasta, it will be unpacked.
    """
    ext = sniff.handle_uploaded_dataset_file(file_path, datatypes_registry)
    if not ext or ext == "data":
        is_binary = check_binary(file_path)
        ext = sniff.guess_ext(file_path, datatypes_registry.sniff_order, is_binary=is_binary)
    return ext
def sniff_and_handle_data_type(json_params, output_file):
    """
    The sniff.handle_uploaded_dataset_file() method in Galaxy performs dual
    functions: it sniffs the filetype and, if it's a compressed archive for a
    non-compressed datatype such as fasta, it will be unpacked.
    """
    try:
        datatypes_registry = Registry()
        datatypes_registry.load_datatypes(
            root_dir=json_params['job_config']['GALAXY_ROOT_DIR'],
            config=json_params['job_config']['GALAXY_DATATYPES_CONF_FILE'])
        file_type = sniff.handle_uploaded_dataset_file(output_file, datatypes_registry)
        return file_type
    except Exception:
        return None
def sniff_and_handle_data_type(json_params, output_file):
    """
    The sniff.handle_uploaded_dataset_file() method in Galaxy performs dual
    functions: it sniffs the filetype and, if it's a compressed archive for a
    non-compressed datatype such as fasta, it will be unpacked.
    """
    try:
        datatypes_registry = Registry()
        datatypes_registry.load_datatypes(
            root_dir=json_params['job_config']['GALAXY_ROOT_DIR'],
            config=json_params['job_config']['GALAXY_DATATYPES_CONF_FILE'])
        file_type = sniff.handle_uploaded_dataset_file(
            output_file, datatypes_registry)
        return file_type
    except:
        return None
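The docstrings above describe sniff.handle_uploaded_dataset_file() as both a sniffer and an unpacker. Below is a hedged sketch of calling it directly, following the same Registry setup the helpers use; it assumes a Galaxy source tree is importable, and the two config paths and the input file are placeholders, not real defaults.

# Hedged sketch: assumes lib/galaxy from a Galaxy checkout is on sys.path.
from galaxy.datatypes.registry import Registry
from galaxy.datatypes import sniff

datatypes_registry = Registry()
datatypes_registry.load_datatypes(
    root_dir='/path/to/galaxy',                          # placeholder
    config='/path/to/galaxy/config/datatypes_conf.xml')  # placeholder

# Sniffs the datatype of the file and, per the docstring above, may also
# decompress it when it is a compressed archive of a non-compressed datatype.
ext = sniff.handle_uploaded_dataset_file('/tmp/uploaded_file', datatypes_registry)
print(ext)   # e.g. 'fasta', or 'data' if nothing matched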
def __main__():
    filename = sys.argv[1]
    try:
        max_file_size = int( sys.argv[2] )
    except:
        max_file_size = 0
    job_params, params = load_input_parameters( filename )
    if job_params is None:  # using an older tabular file
        enhanced_handling = False
        job_params = dict( param_dict=params )
        job_params[ 'output_data' ] = [ dict( out_data_name='output',
                                              ext='data',
                                              file_name=filename,
                                              extra_files_path=None ) ]
        job_params[ 'job_config' ] = dict( GALAXY_ROOT_DIR=GALAXY_ROOT_DIR, GALAXY_DATATYPES_CONF_FILE=GALAXY_DATATYPES_CONF_FILE, TOOL_PROVIDED_JOB_METADATA_FILE=TOOL_PROVIDED_JOB_METADATA_FILE )
    else:
        enhanced_handling = True
        json_file = open( job_params[ 'job_config' ][ 'TOOL_PROVIDED_JOB_METADATA_FILE' ], 'w' )  # specially named file for output junk to pass onto set metadata

    datatypes_registry = Registry()
    datatypes_registry.load_datatypes( root_dir=job_params[ 'job_config' ][ 'GALAXY_ROOT_DIR' ], config=job_params[ 'job_config' ][ 'GALAXY_DATATYPES_CONF_FILE' ] )

    URL = params.get( 'URL', None )  # using exactly URL indicates that only one dataset is being downloaded
    URL_method = params.get( 'URL_method', None )

    # The Python support for fetching resources from the web is layered. urllib uses the httplib
    # library, which in turn uses the socket library. As of Python 2.3 you can specify how long
    # a socket should wait for a response before timing out. By default the socket module has no
    # timeout and can hang. Currently, the socket timeout is not exposed at the httplib or urllib2
    # levels. However, you can set the default timeout ( in seconds ) globally for all sockets by
    # doing the following.
    socket.setdefaulttimeout( 600 )

    for data_dict in job_params[ 'output_data' ]:
        cur_filename = data_dict.get( 'file_name', filename )
        cur_URL = params.get( '%s|%s|URL' % ( GALAXY_PARAM_PREFIX, data_dict[ 'out_data_name' ] ), URL )
        if not cur_URL:
            open( cur_filename, 'w' ).write( "" )
            stop_err( 'The remote data source application has not sent back a URL parameter in the request.' )

        # The following calls to urlopen() will use the above default timeout
        try:
            if not URL_method or URL_method == 'get':
                page = urlopen( cur_URL )
            elif URL_method == 'post':
                page = urlopen( cur_URL, urlencode( params ) )
        except Exception as e:
            stop_err( 'The remote data source application may be off line, please try again later. Error: %s' % str( e ) )
        if max_file_size:
            file_size = int( page.info().get( 'Content-Length', 0 ) )
            if file_size > max_file_size:
                stop_err( 'The size of the data (%d bytes) you have requested exceeds the maximum allowed (%d bytes) on this server.' % ( file_size, max_file_size ) )
        # do sniff stream for multi_byte
        try:
            cur_filename, is_multi_byte = sniff.stream_to_open_named_file( page, os.open( cur_filename, os.O_WRONLY | os.O_CREAT ), cur_filename, source_encoding=get_charset_from_http_headers( page.headers ) )
        except Exception as e:
            stop_err( 'Unable to fetch %s:\n%s' % ( cur_URL, e ) )

        # here import checks that upload tool performs
        if enhanced_handling:
            try:
                ext = sniff.handle_uploaded_dataset_file( filename, datatypes_registry, ext=data_dict[ 'ext' ], is_multi_byte=is_multi_byte )
            except Exception as e:
                stop_err( str( e ) )
            info = dict( type='dataset',
                         dataset_id=data_dict[ 'dataset_id' ],
                         ext=ext )

            json_file.write( "%s\n" % dumps( info ) )
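As a small standalone illustration of how the per-output URL override key used above ('%s|%s|URL' % ( GALAXY_PARAM_PREFIX, out_data_name )) is resolved against the incoming params: GALAXY_PARAM_PREFIX is assumed here to be the module constant 'GALAXY', and the URLs are made up.

# Illustrative only; GALAXY_PARAM_PREFIX value and URLs are assumptions.
GALAXY_PARAM_PREFIX = 'GALAXY'
params = {
    'URL': 'https://datasource.example.org/export?session=abc',                 # fallback for all outputs
    'GALAXY|output2|URL': 'https://datasource.example.org/export?session=abc&part=2',
}
for out_data_name in ('output', 'output2'):
    cur_URL = params.get('%s|%s|URL' % (GALAXY_PARAM_PREFIX, out_data_name), params.get('URL'))
    print(out_data_name, '->', cur_URL)
# output  -> the fallback URL; output2 -> its per-output override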
def download_from_genomespace_file_browser( json_parameter_file, genomespace_site ):
    json_params = json.loads( open( json_parameter_file, 'r' ).read() )
    datasource_params = json_params.get( 'param_dict' )
    username = datasource_params.get( "gs-username", None )
    token = datasource_params.get( "gs-token", None )
    assert None not in [ username, token ], "Missing GenomeSpace username or token."
    output_filename = datasource_params.get( "output", None )
    dataset_id = json_params['output_data'][0]['dataset_id']
    hda_id = json_params['output_data'][0]['hda_id']
    url_opener = get_cookie_opener( username, token )
    #load and set genomespace format ids to galaxy exts
    genomespace_site_dict = get_genomespace_site_urls()[ genomespace_site ]
    set_genomespace_format_identifiers( url_opener, genomespace_site_dict['dmServer'] )
    file_url_prefix = "fileUrl"
    file_type_prefix = "fileFormat"
    metadata_parameter_file = open( json_params['job_config']['TOOL_PROVIDED_JOB_METADATA_FILE'], 'wb' )
    #setup datatypes registry for sniffing
    datatypes_registry = Registry()
    datatypes_registry.load_datatypes( root_dir=json_params[ 'job_config' ][ 'GALAXY_ROOT_DIR' ], config=json_params[ 'job_config' ][ 'GALAXY_DATATYPES_CONF_FILE' ] )
    file_numbers = []
    for name in datasource_params.keys():
        if name.startswith( file_url_prefix ):
            name = name[len( file_url_prefix ):]
            file_numbers.append( int( name ) )
    if not file_numbers:
        if output_filename:
            open( output_filename, 'wb' )  #erase contents of file
        raise Exception( "You must select at least one file to import into Galaxy." )
    file_numbers.sort()
    used_filenames = []
    for file_num in file_numbers:
        url_key = "%s%i" % ( file_url_prefix, file_num )
        download_url = datasource_params.get( url_key, None )
        if download_url is None:
            break
        filetype_key = "%s%i" % ( file_type_prefix, file_num )
        filetype_url = datasource_params.get( filetype_key, None )
        galaxy_ext = get_galaxy_ext_from_genomespace_format_url( url_opener, filetype_url )
        formated_download_url = "%s?%s" % ( download_url, urllib.urlencode( [ ( 'dataformat', filetype_url ) ] ) )
        new_file_request = urllib2.Request( formated_download_url )
        new_file_request.get_method = lambda: 'GET'
        target_download_url = url_opener.open( new_file_request )
        filename = None
        if 'Content-Disposition' in target_download_url.info():
            # If the response has Content-Disposition, try to get filename from it
            content_disposition = dict( map( lambda x: x.strip().split('=') if '=' in x else ( x.strip(), '' ), target_download_url.info()['Content-Disposition'].split( ';' ) ) )
            if 'filename' in content_disposition:
                filename = content_disposition[ 'filename' ].strip( "\"'" )
        if not filename:
            parsed_url = urlparse.urlparse( download_url )
            query_params = urlparse.parse_qs( parsed_url[4] )
            filename = urllib.unquote_plus( parsed_url[2].split( '/' )[-1] )
        if not filename:
            filename = download_url
        metadata_dict = None
        original_filename = filename
        if output_filename is None:
            filename = ''.join( c in VALID_CHARS and c or '-' for c in filename )
            while filename in used_filenames:
                filename = "-%s" % filename
            used_filenames.append( filename )
            output_filename = os.path.join( os.getcwd(), 'primary_%i_%s_visible_%s' % ( hda_id, filename, galaxy_ext ) )
            metadata_dict = dict( type = 'new_primary_dataset',
                                  base_dataset_id = dataset_id,
                                  ext = galaxy_ext,
                                  filename = output_filename,
                                  name = "GenomeSpace import on %s" % ( original_filename ) )
        else:
            if dataset_id is not None:
                metadata_dict = dict( type = 'dataset',
                                      dataset_id = dataset_id,
                                      ext = galaxy_ext,
                                      name = "GenomeSpace import on %s" % ( filename ) )
        output_file = open( output_filename, 'wb' )
        chunk_write( target_download_url, output_file )
        output_file.close()

        if ( galaxy_ext == AUTO_GALAXY_EXT or filetype_url == GENOMESPACE_FORMAT_IDENTIFIER_UNKNOWN ) and metadata_dict:
            #try to sniff datatype
            try:
                galaxy_ext = sniff.handle_uploaded_dataset_file( output_filename, datatypes_registry )
            except:
                #sniff failed
                galaxy_ext = original_filename.rsplit( '.', 1 )[-1]
                if galaxy_ext not in datatypes_registry.datatypes_by_extension:
                    galaxy_ext = DEFAULT_GALAXY_EXT
            metadata_dict[ 'ext' ] = galaxy_ext

        output_filename = None  #only have one filename available

        #write out metadata info
        if metadata_dict:
            metadata_parameter_file.write( "%s\n" % json.dumps( metadata_dict ) )

    metadata_parameter_file.close()
    return True
def download_from_genomespace_importer(username, token, json_parameter_file, genomespace_site, gs_toolname):
    json_params = json.loads(open(json_parameter_file, 'r').read())
    datasource_params = json_params.get('param_dict')
    assert None not in [username, token], "Missing GenomeSpace username or token."
    output_filename = datasource_params.get("output_file1", None)
    dataset_id = base_dataset_id = json_params['output_data'][0]['dataset_id']
    hda_id = json_params['output_data'][0]['hda_id']
    url_opener = get_cookie_opener(username, token, gs_toolname=gs_toolname)
    #load and set genomespace format ids to galaxy exts
    genomespace_site_dict = get_genomespace_site_urls()[genomespace_site]
    set_genomespace_format_identifiers(url_opener, genomespace_site_dict['dmServer'])
    file_url_name = "URL"
    metadata_parameter_file = open(
        json_params['job_config']['TOOL_PROVIDED_JOB_METADATA_FILE'], 'wb')
    #setup datatypes registry for sniffing
    datatypes_registry = Registry()
    datatypes_registry.load_datatypes(
        root_dir=json_params['job_config']['GALAXY_ROOT_DIR'],
        config=json_params['job_config']['GALAXY_DATATYPES_CONF_FILE'])
    url_param = datasource_params.get(file_url_name, None)
    used_filenames = []
    for download_url in url_param.split(','):
        using_temp_file = False
        parsed_url = urlparse.urlparse(download_url)
        query_params = urlparse.parse_qs(parsed_url[4])
        #write file to disk
        new_file_request = urllib2.Request(download_url)
        new_file_request.get_method = lambda: 'GET'
        target_download_url = url_opener.open(new_file_request)
        filename = None
        if 'Content-Disposition' in target_download_url.info():
            content_disposition = dict(map(
                lambda x: x.strip().split('=') if '=' in x else (x.strip(), ''),
                target_download_url.info()['Content-Disposition'].split(';')))
            if 'filename' in content_disposition:
                filename = content_disposition['filename'].strip("\"'")
        if not filename:
            parsed_url = urlparse.urlparse(download_url)
            query_params = urlparse.parse_qs(parsed_url[4])
            filename = urllib.unquote_plus(parsed_url[2].split('/')[-1])
        if not filename:
            filename = download_url
        if output_filename is None:
            #need to use a temp file here, because we do not know the ext yet
            using_temp_file = True
            output_filename = tempfile.NamedTemporaryFile(prefix='tmp-genomespace-importer-').name
        output_file = open(output_filename, 'wb')
        chunk_write(target_download_url, output_file)
        output_file.close()
        #determine file format
        file_type = None
        if 'dataformat' in query_params:
            #this is a converted dataset
            file_type = query_params['dataformat'][0]
            file_type = get_galaxy_ext_from_genomespace_format_url(url_opener, file_type)
        else:
            try:
                #get and use GSMetadata object
                download_file_path = download_url.split("%s/file/" % (genomespace_site_dict['dmServer']), 1)[-1]
                #FIXME: This is a very bad way to get the path for determining metadata.
                # There needs to be a way to query the API using the download URL to get to the metadata object.
                metadata_request = urllib2.Request("%s/%s/filemetadata/%s" % (genomespace_site_dict['dmServer'], GENOMESPACE_API_VERSION_STRING, download_file_path))
                metadata_request.get_method = lambda: 'GET'
                metadata_url = url_opener.open(metadata_request)
                file_metadata_dict = json.loads(metadata_url.read())
                metadata_url.close()
                file_type = file_metadata_dict.get('dataFormat', None)
                if file_type and file_type.get('url'):
                    file_type = file_type.get('url')
                    file_type = get_galaxy_ext_from_genomespace_format_url(url_opener, file_type, default=None)
            except:
                pass
        if file_type is None:
            #try to sniff datatype
            try:
                file_type = sniff.handle_uploaded_dataset_file(output_filename, datatypes_registry)
            except:
                pass  #sniff failed
        if file_type is None and '.' in parsed_url[2]:
            #still no known datatype, fall back to using extension
            file_type = parsed_url[2].rsplit('.', 1)[-1]
            file_type = GENOMESPACE_EXT_TO_GALAXY_EXT.get(file_type, file_type)
        if file_type is None:
            #use default extension (e.g. 'data')
            file_type = DEFAULT_GALAXY_EXT

        #save json info for single primary dataset
        if dataset_id is not None:
            metadata_parameter_file.write("%s\n" % json.dumps(
                dict(type='dataset',
                     dataset_id=dataset_id,
                     ext=file_type,
                     name="GenomeSpace importer on %s" % (filename))))
        #if using tmp file, move the file to the new file path dir to get scooped up later
        if using_temp_file:
            original_filename = filename
            filename = ''.join(c in VALID_CHARS and c or '-' for c in filename)
            while filename in used_filenames:
                filename = "-%s" % filename
            used_filenames.append(filename)
            target_output_filename = os.path.join(
                os.getcwd(),
                'primary_%i_%s_visible_%s' % (hda_id, filename, file_type))
            shutil.move(output_filename, target_output_filename)
            metadata_parameter_file.write("%s\n" % json.dumps(
                dict(type='new_primary_dataset',
                     base_dataset_id=base_dataset_id,
                     ext=file_type,
                     filename=target_output_filename,
                     name="GenomeSpace importer on %s" % (original_filename))))
        dataset_id = None  #only one primary dataset available
        output_filename = None  #only have one filename available
    metadata_parameter_file.close()
    return True
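Both GenomeSpace importers write one JSON object per line into TOOL_PROVIDED_JOB_METADATA_FILE: the first (or only) file reuses the pre-allocated output dataset, and any additional file becomes a new primary dataset discovered from the working directory. For orientation, the two record shapes produced by the json.dumps() calls above look roughly like the following; all field values are illustrative.

# Illustrative only; ids, extensions, and paths are made up.
primary_record = dict(type='dataset', dataset_id=42, ext='gtf',
                      name="GenomeSpace importer on example.gtf")
extra_record = dict(type='new_primary_dataset', base_dataset_id=42, ext='gtf',
                    filename='/job/working/primary_7_example-gtf_visible_gtf',
                    name="GenomeSpace importer on example.gtf")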
def add_file(dataset, registry, output_path):
    ext = None
    compression_type = None
    line_count = None
    converted_path = None
    stdout = None
    link_data_only_str = dataset.get('link_data_only', 'copy_files')
    if link_data_only_str not in ['link_to_files', 'copy_files']:
        raise UploadProblemException(
            "Invalid setting '%s' for option link_data_only - upload request misconfigured" % link_data_only_str)
    link_data_only = link_data_only_str == 'link_to_files'

    # run_as_real_user is estimated from galaxy config (external chmod indicated of inputs executed)
    # If this is True we always purge supplied upload inputs so they are cleaned up and we reuse their
    # paths during data conversions since this user already owns that path.
    # Older in_place check for upload jobs created before 18.01, TODO remove in 19.XX. xref #5206
    run_as_real_user = dataset.get('run_as_real_user', False) or dataset.get("in_place", False)

    # purge_source defaults to True unless this is an FTP import and
    # ftp_upload_purge has been overridden to False in Galaxy's config.
    # We set purge_source to False if:
    # - the job does not have write access to the file, e.g. when running as the real user
    # - the files are uploaded from external paths.
    purge_source = dataset.get('purge_source', True) and not run_as_real_user and dataset.type not in ('server_dir', 'path_paste')

    # in_place is True unless we are running as a real user or importing external paths (i.e.
    # this is a real upload and not a path paste or ftp import).
    # in_place should always be False if running as real user because the uploaded file will
    # be owned by Galaxy and not the user and it should be False for external paths so Galaxy doesn't
    # modify files not controlled by Galaxy.
    in_place = not run_as_real_user and dataset.type not in ('server_dir', 'path_paste', 'ftp_import')

    # Based on the check_upload_content Galaxy config option and on by default, this enables some
    # security related checks on the uploaded content, but can prevent uploads from working in some cases.
    check_content = dataset.get('check_content', True)

    # auto_decompress is a request flag that can be swapped off to prevent Galaxy from automatically
    # decompressing archive files before sniffing.
    auto_decompress = dataset.get('auto_decompress', True)

    try:
        dataset.file_type
    except AttributeError:
        raise UploadProblemException('Unable to process uploaded file, missing file_type parameter.')

    if dataset.type == 'url':
        try:
            dataset.path = sniff.stream_url_to_file(dataset.path)
        except Exception as e:
            raise UploadProblemException('Unable to fetch %s\n%s' % (dataset.path, str(e)))

    # See if we have an empty file
    if not os.path.exists(dataset.path):
        raise UploadProblemException('Uploaded temporary file (%s) does not exist.' % dataset.path)

    if not os.path.getsize(dataset.path) > 0:
        raise UploadProblemException('The uploaded file is empty')

    # Does the first 1K contain a null?
    is_binary = check_binary(dataset.path)

    # Decompress if needed/desired and determine/validate filetype. If a keep-compressed datatype is explicitly selected
    # or if autodetection is selected and the file sniffs as a keep-compressed datatype, it will not be decompressed.
    if not link_data_only:
        if is_zip(dataset.path) and not is_single_file_zip(dataset.path):
            stdout = 'ZIP file contained more than one file, only the first file was added to Galaxy.'
        try:
            ext, converted_path, compression_type = sniff.handle_uploaded_dataset_file(
                dataset.path,
                registry,
                ext=dataset.file_type,
                tmp_prefix='data_id_%s_upload_' % dataset.dataset_id,
                tmp_dir=output_adjacent_tmpdir(output_path),
                in_place=in_place,
                check_content=check_content,
                is_binary=is_binary,
                auto_decompress=auto_decompress,
                uploaded_file_ext=os.path.splitext(dataset.name)[1].lower().lstrip('.'),
                convert_to_posix_lines=dataset.to_posix_lines,
                convert_spaces_to_tabs=dataset.space_to_tab,
            )
        except sniff.InappropriateDatasetContentError as exc:
            raise UploadProblemException(str(exc))
    elif dataset.file_type == 'auto':
        # Link mode can't decompress anyway, so enable sniffing for keep-compressed datatypes even when auto_decompress
        # is enabled
        os.environ['GALAXY_SNIFFER_VALIDATE_MODE'] = '1'
        ext = sniff.guess_ext(dataset.path, registry.sniff_order, is_binary=is_binary)
        os.environ.pop('GALAXY_SNIFFER_VALIDATE_MODE')

    # The converted path will be the same as the input path if no conversion was done (or in-place conversion is used)
    converted_path = None if converted_path == dataset.path else converted_path

    # Validate datasets where the filetype was explicitly set using the filetype's sniffer (if any)
    if dataset.file_type != 'auto':
        datatype = registry.get_datatype_by_extension(dataset.file_type)
        # Enable sniffer "validate mode" (prevents certain sniffers from disabling themselves)
        os.environ['GALAXY_SNIFFER_VALIDATE_MODE'] = '1'
        if hasattr(datatype, 'sniff') and not datatype.sniff(dataset.path):
            stdout = ("Warning: The file 'Type' was set to '{ext}' but the file does not appear to be of that"
                      " type".format(ext=dataset.file_type))
        os.environ.pop('GALAXY_SNIFFER_VALIDATE_MODE')

    # Handle unsniffable binaries
    if is_binary and ext == 'binary':
        upload_ext = os.path.splitext(dataset.name)[1].lower().lstrip('.')
        if registry.is_extension_unsniffable_binary(upload_ext):
            stdout = ("Warning: The file's datatype cannot be determined from its contents and was guessed based on"
                      " its extension, to avoid this warning, manually set the file 'Type' to '{ext}' when uploading"
                      " this type of file".format(ext=upload_ext))
            ext = upload_ext
        else:
            stdout = ("The uploaded binary file format cannot be determined automatically, please set the file 'Type'"
                      " manually")

    datatype = registry.get_datatype_by_extension(ext)

    # Strip compression extension from name
    if compression_type and not getattr(datatype, 'compressed', False) and dataset.name.endswith('.' + compression_type):
        dataset.name = dataset.name[:-len('.' + compression_type)]

    # Move dataset
    if link_data_only:
        # Never alter a file that will not be copied to Galaxy's local file store.
        if datatype.dataset_content_needs_grooming(dataset.path):
            err_msg = 'The uploaded files need grooming, so change your <b>Copy data into Galaxy?</b> selection to be ' + \
                      '<b>Copy files into Galaxy</b> instead of <b>Link to files without copying into Galaxy</b> so grooming can be performed.'
            raise UploadProblemException(err_msg)
    if not link_data_only:
        # Move the dataset to its "real" path. converted_path is a tempfile so we move it even if purge_source is False.
        if purge_source or converted_path:
            try:
                shutil.move(converted_path or dataset.path, output_path)
            except OSError as e:
                # We may not have permission to remove the input
                if e.errno != errno.EACCES:
                    raise
        else:
            shutil.copy(dataset.path, output_path)

    # Write the job info
    stdout = stdout or 'uploaded %s file' % ext
    info = dict(type='dataset',
                dataset_id=dataset.dataset_id,
                ext=ext,
                stdout=stdout,
                name=dataset.name,
                line_count=line_count)

    if dataset.get('uuid', None) is not None:
        info['uuid'] = dataset.get('uuid')

    # FIXME: does this belong here? also not output-adjacent-tmpdir aware =/
    if not link_data_only and datatype and datatype.dataset_content_needs_grooming(output_path):
        # Groom the dataset content if necessary
        datatype.groom_dataset_content(output_path)

    return info
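The comments at the top of add_file() explain how run_as_real_user, purge_source, and in_place interact for different upload types. Below is a small standalone sketch of just that derivation, with dataset modelled as a plain dict and a few representative upload types; the 'type' values mirror those checked above.

# Standalone sketch of the flag derivation at the top of add_file().
def derive_flags(dataset):
    run_as_real_user = dataset.get('run_as_real_user', False) or dataset.get('in_place', False)
    purge_source = dataset.get('purge_source', True) and not run_as_real_user \
        and dataset['type'] not in ('server_dir', 'path_paste')
    in_place = not run_as_real_user and dataset['type'] not in ('server_dir', 'path_paste', 'ftp_import')
    return purge_source, in_place

print(derive_flags({'type': 'file'}))                             # (True, True)  - regular upload
print(derive_flags({'type': 'ftp_import'}))                       # (True, False) - ftp import: purge, but don't convert in place
print(derive_flags({'type': 'path_paste'}))                       # (False, False) - external path: never modify or purge
print(derive_flags({'type': 'file', 'run_as_real_user': True}))   # (False, False) - real user owns the input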
def __main__():
    filename = sys.argv[1]
    try:
        max_file_size = int(sys.argv[2])
    except Exception:
        max_file_size = 0

    job_params, params = load_input_parameters(filename)
    if job_params is None:  # using an older tabular file
        enhanced_handling = False
        job_params = dict(param_dict=params)
        job_params['output_data'] = [dict(out_data_name='output',
                                          ext='data',
                                          file_name=filename,
                                          extra_files_path=None)]
        job_params['job_config'] = dict(GALAXY_ROOT_DIR=GALAXY_ROOT_DIR, GALAXY_DATATYPES_CONF_FILE=GALAXY_DATATYPES_CONF_FILE, TOOL_PROVIDED_JOB_METADATA_FILE=TOOL_PROVIDED_JOB_METADATA_FILE)
    else:
        enhanced_handling = True
        json_file = open(job_params['job_config']['TOOL_PROVIDED_JOB_METADATA_FILE'], 'w')  # specially named file for output junk to pass onto set metadata

    datatypes_registry = Registry()
    datatypes_registry.load_datatypes(root_dir=job_params['job_config']['GALAXY_ROOT_DIR'], config=job_params['job_config']['GALAXY_DATATYPES_CONF_FILE'])

    URL = params.get('URL', None)  # using exactly URL indicates that only one dataset is being downloaded
    URL_method = params.get('URL_method', None)

    # The Python support for fetching resources from the web is layered. urllib uses the httplib
    # library, which in turn uses the socket library. As of Python 2.3 you can specify how long
    # a socket should wait for a response before timing out. By default the socket module has no
    # timeout and can hang. Currently, the socket timeout is not exposed at the httplib or urllib2
    # levels. However, you can set the default timeout ( in seconds ) globally for all sockets by
    # doing the following.
    socket.setdefaulttimeout(600)

    for data_dict in job_params['output_data']:
        cur_filename = data_dict.get('file_name', filename)
        cur_URL = params.get('%s|%s|URL' % (GALAXY_PARAM_PREFIX, data_dict['out_data_name']), URL)
        if not cur_URL or urlparse(cur_URL).scheme not in ('http', 'https', 'ftp'):
            open(cur_filename, 'w').write("")
            stop_err('The remote data source application has not sent back a URL parameter in the request.')

        # The following calls to urlopen() will use the above default timeout
        try:
            if not URL_method or URL_method == 'get':
                page = urlopen(cur_URL)
            elif URL_method == 'post':
                page = urlopen(cur_URL, urlencode(params).encode("utf-8"))
        except Exception as e:
            stop_err('The remote data source application may be off line, please try again later. Error: %s' % str(e))
        if max_file_size:
            file_size = int(page.info().get('Content-Length', 0))
            if file_size > max_file_size:
                stop_err('The size of the data (%d bytes) you have requested exceeds the maximum allowed (%d bytes) on this server.' % (file_size, max_file_size))
        try:
            cur_filename = sniff.stream_to_open_named_file(page, os.open(cur_filename, os.O_WRONLY | os.O_CREAT), cur_filename, source_encoding=get_charset_from_http_headers(page.headers))
        except Exception as e:
            stop_err('Unable to fetch %s:\n%s' % (cur_URL, e))

        # here import checks that upload tool performs
        if enhanced_handling:
            try:
                ext = sniff.handle_uploaded_dataset_file(filename, datatypes_registry, ext=data_dict['ext'])
            except Exception as e:
                stop_err(str(e))
            info = dict(type='dataset',
                        dataset_id=data_dict['dataset_id'],
                        ext=ext)

            json_file.write("%s\n" % dumps(info))
                         % (file_size, max_file_size))
        # do sniff stream for multi_byte
        try:
            cur_filename, is_multi_byte = sniff.stream_to_open_named_file(
                page, os.open(cur_filename, os.O_WRONLY | os.O_CREAT), cur_filename,
                source_encoding=get_charset_from_http_headers(page.headers))
        except Exception, e:
            stop_err('Unable to fetch %s:\n%s' % (cur_URL, e))

        # here import checks that upload tool performs
        if enhanced_handling:
            try:
                ext = sniff.handle_uploaded_dataset_file(filename, datatypes_registry, ext=data_dict['ext'], is_multi_byte=is_multi_byte)
            except Exception, e:
                stop_err(str(e))
            info = dict(type='dataset',
                        dataset_id=data_dict['dataset_id'],
                        ext=ext)

            json_file.write("%s\n" % dumps(info))


if __name__ == "__main__":
    __main__()
        try:
            if not URL_method or URL_method == 'get':
                page = urllib.urlopen( cur_URL )
            elif URL_method == 'post':
                page = urllib.urlopen( cur_URL, urllib.urlencode( params ) )
        except Exception, e:
            stop_err( 'The remote data source application may be off line, please try again later. Error: %s' % str( e ) )
        if max_file_size:
            file_size = int( page.info().get( 'Content-Length', 0 ) )
            if file_size > max_file_size:
                stop_err( 'The size of the data (%d bytes) you have requested exceeds the maximum allowed (%d bytes) on this server.' % ( file_size, max_file_size ) )
        #do sniff stream for multi_byte
        try:
            cur_filename, is_multi_byte = sniff.stream_to_open_named_file( page, os.open( cur_filename, os.O_WRONLY | os.O_CREAT ), cur_filename, source_encoding=get_charset_from_http_headers( page.headers ) )
        except Exception, e:
            stop_err( 'Unable to fetch %s:\n%s' % ( cur_URL, e ) )
        #here import checks that upload tool performs
        if enhanced_handling:
            try:
                ext = sniff.handle_uploaded_dataset_file( filename, datatypes_registry, ext = data_dict[ 'ext' ], is_multi_byte = is_multi_byte )
            except Exception, e:
                stop_err( str( e ) )
            info = dict( type = 'dataset',
                         dataset_id = data_dict[ 'dataset_id' ],
                         ext = ext )

            json_file.write( "%s\n" % dumps( info ) )


if __name__ == "__main__":
    __main__()
def download_from_genomespace_importer( username, token, json_parameter_file, genomespace_site ):
    json_params = simplejson.loads( open( json_parameter_file, 'r' ).read() )
    datasource_params = json_params.get( 'param_dict' )
    assert None not in [ username, token ], "Missing GenomeSpace username or token."
    output_filename = datasource_params.get( "output_file1", None )
    dataset_id = json_params['output_data'][0]['dataset_id']
    hda_id = json_params['output_data'][0]['hda_id']
    url_opener = get_cookie_opener( username, token )
    #load and set genomespace format ids to galaxy exts
    genomespace_site_dict = get_genomespace_site_urls()[ genomespace_site ]
    set_genomespace_format_identifiers( url_opener, genomespace_site_dict['dmServer'] )
    file_url_name = "URL"
    metadata_parameter_file = open( json_params['job_config']['TOOL_PROVIDED_JOB_METADATA_FILE'], 'wb' )
    #setup datatypes registry for sniffing
    datatypes_registry = Registry()
    datatypes_registry.load_datatypes( root_dir = json_params[ 'job_config' ][ 'GALAXY_ROOT_DIR' ], config = json_params[ 'job_config' ][ 'GALAXY_DATATYPES_CONF_FILE' ] )
    url_param = datasource_params.get( file_url_name, None )
    for download_url in url_param.split( ',' ):
        using_temp_file = False
        parsed_url = urlparse.urlparse( download_url )
        query_params = urlparse.parse_qs( parsed_url[4] )
        #write file to disk
        new_file_request = urllib2.Request( download_url )
        new_file_request.get_method = lambda: 'GET'
        target_download_url = url_opener.open( new_file_request )
        filename = None
        if 'Content-Disposition' in target_download_url.info():
            content_disposition = dict( map( lambda x: x.strip().split('=') if '=' in x else ( x.strip(), '' ), target_download_url.info()['Content-Disposition'].split( ';' ) ) )
            if 'filename' in content_disposition:
                filename = content_disposition[ 'filename' ].strip( "\"'" )
        if not filename:
            parsed_url = urlparse.urlparse( download_url )
            query_params = urlparse.parse_qs( parsed_url[4] )
            filename = urllib.unquote_plus( parsed_url[2].split( '/' )[-1] )
        if output_filename is None:
            #need to use a temp file here, because we do not know the ext yet
            using_temp_file = True
            output_filename = tempfile.NamedTemporaryFile( prefix='tmp-genomespace-importer-' ).name
        output_file = open( output_filename, 'wb' )
        chunk_write( target_download_url, output_file )
        output_file.close()
        #determine file format
        file_type = None
        if 'dataformat' in query_params:
            #this is a converted dataset
            file_type = query_params[ 'dataformat' ][0]
            file_type = get_galaxy_ext_from_genomespace_format_url( url_opener, file_type )
        else:
            try:
                #get and use GSMetadata object
                download_file_path = download_url.split( "%s/file/" % ( genomespace_site_dict['dmServer'] ), 1)[-1]
                #FIXME: This is a very bad way to get the path for determining metadata.
                # There needs to be a way to query the API using the download URL to get to the metadata object.
                metadata_request = urllib2.Request( "%s/%s/filemetadata/%s" % ( genomespace_site_dict['dmServer'], GENOMESPACE_API_VERSION_STRING, download_file_path ) )
                metadata_request.get_method = lambda: 'GET'
                metadata_url = url_opener.open( metadata_request )
                file_metadata_dict = simplejson.loads( metadata_url.read() )
                metadata_url.close()
                file_type = file_metadata_dict.get( 'dataFormat', None )
                if file_type and file_type.get( 'url' ):
                    file_type = file_type.get( 'url' )
                    file_type = get_galaxy_ext_from_genomespace_format_url( url_opener, file_type, default = None )
            except:
                pass
        if file_type is None:
            #try to sniff datatype
            try:
                file_type = sniff.handle_uploaded_dataset_file( output_filename, datatypes_registry )
            except:
                pass  #sniff failed
        if file_type is None and '.' in parsed_url[2]:
            #still no known datatype, fall back to using extension
            file_type = parsed_url[2].rsplit( '.', 1 )[-1]
            file_type = GENOMESPACE_EXT_TO_GALAXY_EXT.get( file_type, file_type )
        if file_type is None:
            #use default extension (e.g. 'data')
            file_type = DEFAULT_GALAXY_EXT

        #save json info for single primary dataset
        if dataset_id is not None:
            metadata_parameter_file.write( "%s\n" % simplejson.dumps( dict( type = 'dataset',
                                                                            dataset_id = dataset_id,
                                                                            ext = file_type,
                                                                            name = "GenomeSpace importer on %s" % ( filename ) ) ) )
        #if using tmp file, move the file to the new file path dir to get scooped up later
        if using_temp_file:
            shutil.move( output_filename, os.path.join( datasource_params['__new_file_path__'], 'primary_%i_output%s_visible_%s' % ( hda_id, ''.join( c in VALID_CHARS and c or '-' for c in filename ), file_type ) ) )
        dataset_id = None  #only one primary dataset available
        output_filename = None  #only have one filename available
    metadata_parameter_file.close()
    return True
def download_from_genomespace_file_browser(json_parameter_file, genomespace_site):
    json_params = json.loads(open(json_parameter_file, 'r').read())
    datasource_params = json_params.get('param_dict')
    username = datasource_params.get("gs-username", None)
    token = datasource_params.get("gs-token", None)
    assert None not in [username, token], "Missing GenomeSpace username or token."
    output_filename = datasource_params.get("output", None)
    dataset_id = json_params['output_data'][0]['dataset_id']
    hda_id = json_params['output_data'][0]['hda_id']
    url_opener = get_cookie_opener(username, token)
    #load and set genomespace format ids to galaxy exts
    genomespace_site_dict = get_genomespace_site_urls()[genomespace_site]
    set_genomespace_format_identifiers(url_opener, genomespace_site_dict['dmServer'])
    file_url_prefix = "fileUrl"
    file_type_prefix = "fileFormat"
    metadata_parameter_file = open(
        json_params['job_config']['TOOL_PROVIDED_JOB_METADATA_FILE'], 'wb')
    #setup datatypes registry for sniffing
    datatypes_registry = Registry()
    datatypes_registry.load_datatypes(
        root_dir=json_params['job_config']['GALAXY_ROOT_DIR'],
        config=json_params['job_config']['GALAXY_DATATYPES_CONF_FILE'])
    file_numbers = []
    for name in datasource_params.keys():
        if name.startswith(file_url_prefix):
            name = name[len(file_url_prefix):]
            file_numbers.append(int(name))
    if not file_numbers:
        if output_filename:
            open(output_filename, 'wb')  #erase contents of file
        raise Exception("You must select at least one file to import into Galaxy.")
    file_numbers.sort()
    used_filenames = []
    for file_num in file_numbers:
        url_key = "%s%i" % (file_url_prefix, file_num)
        download_url = datasource_params.get(url_key, None)
        if download_url is None:
            break
        filetype_key = "%s%i" % (file_type_prefix, file_num)
        filetype_url = datasource_params.get(filetype_key, None)
        galaxy_ext = get_galaxy_ext_from_genomespace_format_url(url_opener, filetype_url)
        formated_download_url = "%s?%s" % (download_url, urllib.urlencode([('dataformat', filetype_url)]))
        new_file_request = urllib2.Request(formated_download_url)
        new_file_request.get_method = lambda: 'GET'
        target_download_url = url_opener.open(new_file_request)
        filename = None
        if 'Content-Disposition' in target_download_url.info():
            # If the response has Content-Disposition, try to get filename from it
            content_disposition = dict(map(
                lambda x: x.strip().split('=') if '=' in x else (x.strip(), ''),
                target_download_url.info()['Content-Disposition'].split(';')))
            if 'filename' in content_disposition:
                filename = content_disposition['filename'].strip("\"'")
        if not filename:
            parsed_url = urlparse.urlparse(download_url)
            query_params = urlparse.parse_qs(parsed_url[4])
            filename = urllib.unquote_plus(parsed_url[2].split('/')[-1])
        if not filename:
            filename = download_url
        metadata_dict = None
        original_filename = filename
        if output_filename is None:
            filename = ''.join(c in VALID_CHARS and c or '-' for c in filename)
            while filename in used_filenames:
                filename = "-%s" % filename
            used_filenames.append(filename)
            output_filename = os.path.join(
                os.getcwd(),
                'primary_%i_%s_visible_%s' % (hda_id, filename, galaxy_ext))
            metadata_dict = dict(type='new_primary_dataset',
                                 base_dataset_id=dataset_id,
                                 ext=galaxy_ext,
                                 filename=output_filename,
                                 name="GenomeSpace import on %s" % (original_filename))
        else:
            if dataset_id is not None:
                metadata_dict = dict(type='dataset',
                                     dataset_id=dataset_id,
                                     ext=galaxy_ext,
                                     name="GenomeSpace import on %s" % (filename))
        output_file = open(output_filename, 'wb')
        chunk_write(target_download_url, output_file)
        output_file.close()

        if (galaxy_ext == AUTO_GALAXY_EXT or filetype_url == GENOMESPACE_FORMAT_IDENTIFIER_UNKNOWN) and metadata_dict:
            #try to sniff datatype
            try:
                galaxy_ext = sniff.handle_uploaded_dataset_file(output_filename, datatypes_registry)
            except:
                #sniff failed
                galaxy_ext = original_filename.rsplit('.', 1)[-1]
                if galaxy_ext not in datatypes_registry.datatypes_by_extension:
                    galaxy_ext = DEFAULT_GALAXY_EXT
            metadata_dict['ext'] = galaxy_ext

        output_filename = None  #only have one filename available

        #write out metadata info
        if metadata_dict:
            metadata_parameter_file.write("%s\n" % json.dumps(metadata_dict))

    metadata_parameter_file.close()
    return True