Example #1
def set_meta_with_tool_provided(dataset_instance, file_dict, set_meta_kwds,
                                datatypes_registry, max_metadata_value_size):
    # This method is somewhat odd, in that we set the metadata attributes from the tool,
    # then call set_meta, then set the metadata attributes from the tool again.
    # This is intentional due to the interplay of the overwrite kwd, the fact that some
    # metadata parameters may rely on the values of others, and that we are accepting
    # the values provided by the tool as truth.
    extension = dataset_instance.extension
    if extension == "_sniff_":
        try:
            from galaxy.datatypes import sniff
            extension = sniff.handle_uploaded_dataset_file(
                dataset_instance.dataset.external_filename, datatypes_registry)
            # We need to both set the extension so it is available to set_meta
            # and record it in the metadata so it can be reloaded on the server
            # side and the model updated (see MetadataCollection.{from,to}_JSON_dict)
            dataset_instance.extension = extension
            # Set special metadata property that will reload this on server side.
            setattr(dataset_instance.metadata, "__extension__", extension)
        except Exception:
            log.exception("Problem sniffing datatype.")

    for metadata_name, metadata_value in file_dict.get('metadata', {}).items():
        setattr(dataset_instance.metadata, metadata_name, metadata_value)
    dataset_instance.datatype.set_meta(dataset_instance, **set_meta_kwds)
    for metadata_name, metadata_value in file_dict.get('metadata', {}).items():
        setattr(dataset_instance.metadata, metadata_name, metadata_value)

    if max_metadata_value_size:
        for k, v in list(dataset_instance.metadata.items()):
            if total_size(v) > max_metadata_value_size:
                log.info("Key %s too large for metadata, discarding" % k)
                dataset_instance.metadata.remove_key(k)
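
A minimal sketch of the set / set_meta / set-again pattern in action, driving the function above with stand-in objects rather than real Galaxy model instances. The {'metadata': {...}} shape of file_dict is taken from the code above; the stub class names are invented for illustration, and passing 0 for max_metadata_value_size skips the size-trimming branch so no other module-level helpers are needed.

import json


class _StubMetadata(dict):
    # setattr() stores into the dict so both the attribute-style writes in
    # set_meta_with_tool_provided and .items() operate on the same data
    def __setattr__(self, name, value):
        self[name] = value


class _StubDatatype:
    def set_meta(self, dataset_instance, **kwds):
        # a real datatype would inspect the file on disk here
        dataset_instance.metadata.setdefault('data_lines', 0)


class _StubDataset:
    def __init__(self):
        self.extension = 'tabular'      # not '_sniff_', so the sniffing branch is skipped
        self.metadata = _StubMetadata()
        self.datatype = _StubDatatype()


file_dict = json.loads('{"metadata": {"columns": 3, "comment_lines": 1}}')
dataset = _StubDataset()
set_meta_with_tool_provided(dataset, file_dict, {}, None, 0)
print(dict(dataset.metadata))  # {'columns': 3, 'comment_lines': 1, 'data_lines': 0}
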
Example #2
def set_meta_with_tool_provided( dataset_instance, file_dict, set_meta_kwds, datatypes_registry ):
    # This method is somewhat odd, in that we set the metadata attributes from the tool,
    # then call set_meta, then set the metadata attributes from the tool again.
    # This is intentional due to the interplay of the overwrite kwd, the fact that some
    # metadata parameters may rely on the values of others, and that we are accepting
    # the values provided by the tool as truth.
    extension = dataset_instance.extension
    if extension == "_sniff_":
        try:
            from galaxy.datatypes import sniff
            extension = sniff.handle_uploaded_dataset_file( dataset_instance.dataset.external_filename, datatypes_registry )
            # We need to both set the extension so it is available to set_meta
            # and record it in the metadata so it can be reloaded on the server
            # side and the model updated (see MetadataCollection.{from,to}_JSON_dict)
            dataset_instance.extension = extension
            # Set special metadata property that will reload this on server side.
            setattr( dataset_instance.metadata, "__extension__", extension )
        except Exception:
            # TODO: log this when metadata can log stuff...
            # https://trello.com/c/Nrwodu9d
            pass

    for metadata_name, metadata_value in file_dict.get( 'metadata', {} ).iteritems():
        setattr( dataset_instance.metadata, metadata_name, metadata_value )
    dataset_instance.datatype.set_meta( dataset_instance, **set_meta_kwds )
    for metadata_name, metadata_value in file_dict.get( 'metadata', {} ).iteritems():
        setattr( dataset_instance.metadata, metadata_name, metadata_value )
Example #3
def set_meta_with_tool_provided(dataset_instance, file_dict, set_meta_kwds,
                                datatypes_registry):
    # This method is somewhat odd, in that we set the metadata attributes from the tool,
    # then call set_meta, then set the metadata attributes from the tool again.
    # This is intentional due to the interplay of the overwrite kwd, the fact that some
    # metadata parameters may rely on the values of others, and that we are accepting
    # the values provided by the tool as truth.
    extension = dataset_instance.extension
    if extension == "_sniff_":
        try:
            from galaxy.datatypes import sniff
            extension = sniff.handle_uploaded_dataset_file(
                dataset_instance.dataset.external_filename, datatypes_registry)
            # We need to both set the extension so it is available to set_meta
            # and record it in the metadata so it can be reloaded on the server
            # side and the model updated (see MetadataCollection.{from,to}_JSON_dict)
            dataset_instance.extension = extension
            # Set special metadata property that will reload this on server side.
            setattr(dataset_instance.metadata, "__extension__", extension)
        except Exception:
            # TODO: log this when metadata can log stuff...
            # https://trello.com/c/Nrwodu9d
            pass

    for metadata_name, metadata_value in file_dict.get('metadata',
                                                       {}).iteritems():
        setattr(dataset_instance.metadata, metadata_name, metadata_value)
    dataset_instance.datatype.set_meta(dataset_instance, **set_meta_kwds)
    for metadata_name, metadata_value in file_dict.get('metadata',
                                                       {}).iteritems():
        setattr(dataset_instance.metadata, metadata_name, metadata_value)
def sniff_and_handle_data_type(file_path, datatypes_registry):
    """
    The sniff.handle_uploaded_dataset_file() method in Galaxy performs dual
    functions: it sniffs the filetype and, if the file is a compressed archive
    for a non-compressed datatype such as fasta, unpacks it.
    """
    ext = sniff.handle_uploaded_dataset_file(file_path, datatypes_registry)
    if not ext or ext == "data":
        is_binary = check_binary(file_path)
        ext = sniff.guess_ext(file_path,
                              datatypes_registry.sniff_order,
                              is_binary=is_binary)
    return ext
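
A hypothetical driver for the helper above, assuming the module-level imports the original file relies on (Registry, sniff, check_binary) plus placeholder paths; it only does something useful inside a Galaxy environment with a real datatypes configuration.

from galaxy.datatypes import sniff
from galaxy.datatypes.registry import Registry
from galaxy.util.checkers import check_binary  # import location assumed; older releases ship it elsewhere

registry = Registry()
registry.load_datatypes(root_dir='/path/to/galaxy',            # placeholder
                        config='/path/to/datatypes_conf.xml')  # placeholder
ext = sniff_and_handle_data_type('uploaded_file.dat', registry)
print(ext)  # e.g. 'fasta', or 'data' if nothing more specific could be sniffed
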
def sniff_and_handle_data_type(json_params, output_file):
    """
    The sniff.handle_uploaded_dataset_file() method in Galaxy performs dual
    functions: it sniffs the filetype and, if the file is a compressed archive
    for a non-compressed datatype such as fasta, unpacks it.
    """
    try:
        datatypes_registry = Registry()
        datatypes_registry.load_datatypes(
            root_dir=json_params['job_config']['GALAXY_ROOT_DIR'],
            config=json_params['job_config']['GALAXY_DATATYPES_CONF_FILE'])
        file_type = sniff.handle_uploaded_dataset_file(output_file,
                                                       datatypes_registry)
        return file_type
    except Exception:
        return None
Example #6
def sniff_and_handle_data_type(json_params, output_file):
    """
    The sniff.handle_uploaded_dataset_file() method in Galaxy performs dual
    functions: it sniffs the filetype and, if the file is a compressed archive
    for a non-compressed datatype such as fasta, unpacks it.
    """
    try:
        datatypes_registry = Registry()
        datatypes_registry.load_datatypes(
            root_dir=json_params['job_config']['GALAXY_ROOT_DIR'],
            config=json_params['job_config']['GALAXY_DATATYPES_CONF_FILE'])
        file_type = sniff.handle_uploaded_dataset_file(
            output_file,
            datatypes_registry)
        return file_type
    except:
        return None
Example #7
def __main__():
    filename = sys.argv[1]
    try:
        max_file_size = int( sys.argv[2] )
    except:
        max_file_size = 0

    job_params, params = load_input_parameters( filename )
    if job_params is None:  # using an older tabular file
        enhanced_handling = False
        job_params = dict( param_dict=params )
        job_params[ 'output_data' ] = [ dict( out_data_name='output',
                                              ext='data',
                                              file_name=filename,
                                              extra_files_path=None ) ]
        job_params[ 'job_config' ] = dict( GALAXY_ROOT_DIR=GALAXY_ROOT_DIR, GALAXY_DATATYPES_CONF_FILE=GALAXY_DATATYPES_CONF_FILE, TOOL_PROVIDED_JOB_METADATA_FILE=TOOL_PROVIDED_JOB_METADATA_FILE )
    else:
        enhanced_handling = True
        json_file = open( job_params[ 'job_config' ][ 'TOOL_PROVIDED_JOB_METADATA_FILE' ], 'w' )  # specially named file for output junk to pass onto set metadata

    datatypes_registry = Registry()
    datatypes_registry.load_datatypes( root_dir=job_params[ 'job_config' ][ 'GALAXY_ROOT_DIR' ], config=job_params[ 'job_config' ][ 'GALAXY_DATATYPES_CONF_FILE' ] )

    URL = params.get( 'URL', None )  # a parameter named exactly 'URL' indicates that only one dataset is being downloaded
    URL_method = params.get( 'URL_method', None )

    # The Python support for fetching resources from the web is layered. urllib uses the httplib
    # library, which in turn uses the socket library.  As of Python 2.3 you can specify how long
    # a socket should wait for a response before timing out. By default the socket module has no
    # timeout and can hang. Currently, the socket timeout is not exposed at the httplib or urllib2
    # levels. However, you can set the default timeout ( in seconds ) globally for all sockets by
    # doing the following.
    socket.setdefaulttimeout( 600 )

    for data_dict in job_params[ 'output_data' ]:
        cur_filename = data_dict.get( 'file_name', filename )
        cur_URL = params.get( '%s|%s|URL' % ( GALAXY_PARAM_PREFIX, data_dict[ 'out_data_name' ] ), URL )
        if not cur_URL:
            open( cur_filename, 'w' ).write( "" )
            stop_err( 'The remote data source application has not sent back a URL parameter in the request.' )

        # The following calls to urlopen() will use the above default timeout
        try:
            if not URL_method or URL_method == 'get':
                page = urlopen( cur_URL )
            elif URL_method == 'post':
                page = urlopen( cur_URL, urlencode( params ) )
        except Exception as e:
            stop_err( 'The remote data source application may be off line, please try again later. Error: %s' % str( e ) )
        if max_file_size:
            file_size = int( page.info().get( 'Content-Length', 0 ) )
            if file_size > max_file_size:
                stop_err( 'The size of the data (%d bytes) you have requested exceeds the maximum allowed (%d bytes) on this server.' % ( file_size, max_file_size ) )
        # do sniff stream for multi_byte
        try:
            cur_filename, is_multi_byte = sniff.stream_to_open_named_file( page, os.open( cur_filename, os.O_WRONLY | os.O_CREAT ), cur_filename, source_encoding=get_charset_from_http_headers( page.headers ) )
        except Exception as e:
            stop_err( 'Unable to fetch %s:\n%s' % ( cur_URL, e ) )

        # here import checks that upload tool performs
        if enhanced_handling:
            try:
                ext = sniff.handle_uploaded_dataset_file( filename, datatypes_registry, ext=data_dict[ 'ext' ], is_multi_byte=is_multi_byte )
            except Exception as e:
                stop_err( str( e ) )
            info = dict( type='dataset',
                         dataset_id=data_dict[ 'dataset_id' ],
                         ext=ext)

            json_file.write( "%s\n" % dumps( info ) )
def download_from_genomespace_file_browser( json_parameter_file, genomespace_site ):
    json_params = json.loads( open( json_parameter_file, 'r' ).read() )
    datasource_params = json_params.get( 'param_dict' )
    username = datasource_params.get( "gs-username", None )
    token = datasource_params.get( "gs-token", None )
    assert None not in [ username, token ], "Missing GenomeSpace username or token."
    output_filename = datasource_params.get( "output", None )
    dataset_id = json_params['output_data'][0]['dataset_id']
    hda_id = json_params['output_data'][0]['hda_id']
    url_opener = get_cookie_opener( username, token )
    #load and set genomespace format ids to galaxy exts
    genomespace_site_dict = get_genomespace_site_urls()[ genomespace_site ]
    set_genomespace_format_identifiers( url_opener, genomespace_site_dict['dmServer'] )
    
    file_url_prefix = "fileUrl"
    file_type_prefix = "fileFormat"
    metadata_parameter_file = open( json_params['job_config']['TOOL_PROVIDED_JOB_METADATA_FILE'], 'wb' )
    
    #setup datatypes registry for sniffing
    datatypes_registry = Registry()
    datatypes_registry.load_datatypes( root_dir = json_params[ 'job_config' ][ 'GALAXY_ROOT_DIR' ], config = json_params[ 'job_config' ][ 'GALAXY_DATATYPES_CONF_FILE' ] )
    
    file_numbers = []
    for name in datasource_params.keys():
        if name.startswith( file_url_prefix ):
            name = name[len( file_url_prefix ):]
            file_numbers.append( int( name ) )
    if not file_numbers:
        if output_filename:
            open( output_filename, 'wb' ) #erase contents of file
        raise Exception( "You must select at least one file to import into Galaxy." )
    file_numbers.sort()
    used_filenames = []
    for file_num in file_numbers:
        url_key = "%s%i" % ( file_url_prefix, file_num )
        download_url = datasource_params.get( url_key, None )
        if download_url is None:
            break
        filetype_key = "%s%i" % ( file_type_prefix, file_num )
        filetype_url = datasource_params.get( filetype_key, None )
        galaxy_ext = get_galaxy_ext_from_genomespace_format_url( url_opener, filetype_url )
        formated_download_url = "%s?%s" % ( download_url, urllib.urlencode( [ ( 'dataformat', filetype_url ) ] ) )
        new_file_request = urllib2.Request( formated_download_url )
        new_file_request.get_method = lambda: 'GET'
        target_download_url = url_opener.open( new_file_request )
        filename = None
        if 'Content-Disposition' in target_download_url.info():
            # If the response has Content-Disposition, try to get filename from it
            content_disposition = dict( map( lambda x: x.strip().split('=') if '=' in x else ( x.strip(),'' ), target_download_url.info()['Content-Disposition'].split( ';' ) ) )
            if 'filename' in content_disposition:
                filename = content_disposition[ 'filename' ].strip( "\"'" )
        if not filename:
            parsed_url = urlparse.urlparse( download_url )
            query_params = urlparse.parse_qs( parsed_url[4] )
            filename = urllib.unquote_plus( parsed_url[2].split( '/' )[-1] )
        if not filename:
            filename = download_url
        metadata_dict = None
        original_filename = filename
        if output_filename is None:
            filename = ''.join( c in VALID_CHARS and c or '-' for c in filename )
            while filename in used_filenames:
                filename = "-%s" % filename
            used_filenames.append( filename )
            output_filename = os.path.join( os.getcwd(),  'primary_%i_%s_visible_%s' % ( hda_id, filename, galaxy_ext ) )
            
            metadata_dict = dict( type = 'new_primary_dataset',
                                base_dataset_id = dataset_id,
                                ext = galaxy_ext,
                                filename = output_filename,
                                name = "GenomeSpace import on %s" % ( original_filename ) )
        else:
            if dataset_id is not None:
                metadata_dict = dict( type = 'dataset',
                                dataset_id = dataset_id,
                                ext = galaxy_ext,
                                name = "GenomeSpace import on %s" % ( filename ) )
        output_file = open( output_filename, 'wb' )
        chunk_write( target_download_url, output_file )
        output_file.close()
        
        if ( galaxy_ext == AUTO_GALAXY_EXT or filetype_url == GENOMESPACE_FORMAT_IDENTIFIER_UNKNOWN ) and metadata_dict:
            #try to sniff datatype
            try:
                galaxy_ext = sniff.handle_uploaded_dataset_file( output_filename, datatypes_registry )
            except:
                #sniff failed
                galaxy_ext = original_filename.rsplit( '.', 1 )[-1]
                if galaxy_ext not in datatypes_registry.datatypes_by_extension:
                    galaxy_ext = DEFAULT_GALAXY_EXT
            metadata_dict[ 'ext' ] = galaxy_ext
        
        output_filename = None #only have one filename available
        
        #write out metadata info
        if metadata_dict:
            metadata_parameter_file.write( "%s\n" % json.dumps( metadata_dict ) )
        
    metadata_parameter_file.close()
    return True
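
The filename extraction above hand-parses the Content-Disposition header by splitting on ';' and '='. For comparison, a sketch of an equivalent parse with the standard library's cgi.parse_header (available in the Python 2 stdlib this code targets); the header value is a made-up sample.

import cgi

header_value = 'attachment; filename="example_data.fasta"'  # sample header
_disposition, disposition_params = cgi.parse_header(header_value)
filename = disposition_params.get('filename', '').strip("\"'")
print(filename)  # example_data.fasta
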
def download_from_genomespace_importer(username, token, json_parameter_file,
                                       genomespace_site, gs_toolname):
    json_params = json.loads(open(json_parameter_file, 'r').read())
    datasource_params = json_params.get('param_dict')
    assert None not in [username,
                        token], "Missing GenomeSpace username or token."
    output_filename = datasource_params.get("output_file1", None)
    dataset_id = base_dataset_id = json_params['output_data'][0]['dataset_id']
    hda_id = json_params['output_data'][0]['hda_id']
    url_opener = get_cookie_opener(username, token, gs_toolname=gs_toolname)
    #load and set genomespace format ids to galaxy exts
    genomespace_site_dict = get_genomespace_site_urls()[genomespace_site]
    set_genomespace_format_identifiers(url_opener,
                                       genomespace_site_dict['dmServer'])
    file_url_name = "URL"
    metadata_parameter_file = open(
        json_params['job_config']['TOOL_PROVIDED_JOB_METADATA_FILE'], 'wb')
    #setup datatypes registry for sniffing
    datatypes_registry = Registry()
    datatypes_registry.load_datatypes(
        root_dir=json_params['job_config']['GALAXY_ROOT_DIR'],
        config=json_params['job_config']['GALAXY_DATATYPES_CONF_FILE'])
    url_param = datasource_params.get(file_url_name, None)
    used_filenames = []
    for download_url in url_param.split(','):
        using_temp_file = False
        parsed_url = urlparse.urlparse(download_url)
        query_params = urlparse.parse_qs(parsed_url[4])
        #write file to disk
        new_file_request = urllib2.Request(download_url)
        new_file_request.get_method = lambda: 'GET'
        target_download_url = url_opener.open(new_file_request)
        filename = None
        if 'Content-Disposition' in target_download_url.info():
            content_disposition = dict(
                map(
                    lambda x: x.strip().split('=')
                    if '=' in x else (x.strip(), ''),
                    target_download_url.info()['Content-Disposition'].split(
                        ';')))
            if 'filename' in content_disposition:
                filename = content_disposition['filename'].strip("\"'")
        if not filename:
            parsed_url = urlparse.urlparse(download_url)
            query_params = urlparse.parse_qs(parsed_url[4])
            filename = urllib.unquote_plus(parsed_url[2].split('/')[-1])
        if not filename:
            filename = download_url
        if output_filename is None:
            #need to use a temp file here, because we do not know the ext yet
            using_temp_file = True
            output_filename = tempfile.NamedTemporaryFile(
                prefix='tmp-genomespace-importer-').name
        output_file = open(output_filename, 'wb')
        chunk_write(target_download_url, output_file)
        output_file.close()

        #determine file format
        file_type = None
        if 'dataformat' in query_params:  #this is a converted dataset
            file_type = query_params['dataformat'][0]
            file_type = get_galaxy_ext_from_genomespace_format_url(
                url_opener, file_type)
        else:
            try:
                #get and use GSMetadata object
                download_file_path = download_url.split(
                    "%s/file/" % (genomespace_site_dict['dmServer']), 1
                )[-1]  # FIXME: This is a very bad way to get the path for determining metadata. There needs to be a way to query the API using the download URL to get to the metadata object
                metadata_request = urllib2.Request(
                    "%s/%s/filemetadata/%s" %
                    (genomespace_site_dict['dmServer'],
                     GENOMESPACE_API_VERSION_STRING, download_file_path))
                metadata_request.get_method = lambda: 'GET'
                metadata_url = url_opener.open(metadata_request)
                file_metadata_dict = json.loads(metadata_url.read())
                metadata_url.close()
                file_type = file_metadata_dict.get('dataFormat', None)
                if file_type and file_type.get('url'):
                    file_type = file_type.get('url')
                    file_type = get_galaxy_ext_from_genomespace_format_url(
                        url_opener, file_type, default=None)
            except:
                pass
        if file_type is None:
            #try to sniff datatype
            try:
                file_type = sniff.handle_uploaded_dataset_file(
                    output_filename, datatypes_registry)
            except:
                pass  #sniff failed
        if file_type is None and '.' in parsed_url[2]:
            #still no known datatype, fall back to using extension
            file_type = parsed_url[2].rsplit('.', 1)[-1]
            file_type = GENOMESPACE_EXT_TO_GALAXY_EXT.get(file_type, file_type)
        if file_type is None:
            #use default extension (e.g. 'data')
            file_type = DEFAULT_GALAXY_EXT

        #save json info for single primary dataset
        if dataset_id is not None:
            metadata_parameter_file.write("%s\n" % json.dumps(
                dict(type='dataset',
                     dataset_id=dataset_id,
                     ext=file_type,
                     name="GenomeSpace importer on %s" % (filename))))
        #if using tmp file, move the file to the new file path dir to get scooped up later
        if using_temp_file:
            original_filename = filename
            filename = ''.join(c in VALID_CHARS and c or '-' for c in filename)
            while filename in used_filenames:
                filename = "-%s" % filename
            used_filenames.append(filename)
            target_output_filename = os.path.join(
                os.getcwd(),
                'primary_%i_%s_visible_%s' % (hda_id, filename, file_type))
            shutil.move(output_filename, target_output_filename)
            metadata_parameter_file.write("%s\n" % json.dumps(
                dict(type='new_primary_dataset',
                     base_dataset_id=base_dataset_id,
                     ext=file_type,
                     filename=target_output_filename,
                     name="GenomeSpace importer on %s" % (original_filename))))
        dataset_id = None  #only one primary dataset available
        output_filename = None  #only have one filename available
    metadata_parameter_file.close()
    return True
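
A small self-contained sketch of the filename sanitising and de-duplication logic used above when the temp file is promoted to a primary dataset. VALID_CHARS here is an assumed stand-in; the real constant is defined elsewhere in the tool.

import string

VALID_CHARS = set(string.ascii_letters + string.digits + '._-')  # assumption


def make_unique_name(filename, used_filenames):
    # replace disallowed characters, then prefix '-' until the name is unused
    filename = ''.join(c if c in VALID_CHARS else '-' for c in filename)
    while filename in used_filenames:
        filename = "-%s" % filename
    used_filenames.append(filename)
    return filename


used = []
print(make_unique_name('my data (v2).txt', used))   # my-data--v2-.txt
print(make_unique_name('my-data--v2-.txt', used))   # -my-data--v2-.txt
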
Example #10
def add_file(dataset, registry, output_path):
    ext = None
    compression_type = None
    line_count = None
    converted_path = None
    stdout = None
    link_data_only_str = dataset.get('link_data_only', 'copy_files')
    if link_data_only_str not in ['link_to_files', 'copy_files']:
        raise UploadProblemException(
            "Invalid setting '%s' for option link_data_only - upload request misconfigured"
            % link_data_only_str)
    link_data_only = link_data_only_str == 'link_to_files'

    # run_as_real_user is estimated from the Galaxy config (an external chmod of the inputs indicates the job is executed as the real user)
    # If this is True we always purge supplied upload inputs so they are cleaned up and we reuse their
    # paths during data conversions since this user already owns that path.
    # Older in_place check for upload jobs created before 18.01, TODO remove in 19.XX. xref #5206
    run_as_real_user = dataset.get('run_as_real_user', False) or dataset.get(
        "in_place", False)

    # purge_source defaults to True unless this is an FTP import and
    # ftp_upload_purge has been overridden to False in Galaxy's config.
    # We set purge_source to False if:
    # - the job does not have write access to the file, e.g. when running as the
    #   real user
    # - the files are uploaded from external paths.
    purge_source = dataset.get(
        'purge_source',
        True) and not run_as_real_user and dataset.type not in ('server_dir',
                                                                'path_paste')

    # in_place is True unless we are running as a real user or importing external paths (i.e.
    # this is a real upload and not a path paste or ftp import).
    # in_place should always be False if running as real user because the uploaded file will
    # be owned by Galaxy and not the user and it should be False for external paths so Galaxy doesn't
    # modify files not controlled by Galaxy.
    in_place = not run_as_real_user and dataset.type not in ('server_dir',
                                                             'path_paste',
                                                             'ftp_import')

    # Based on the check_upload_content Galaxy config option and on by default, this enables some
    # security-related checks on the uploaded content, but can prevent uploads from working in some cases.
    check_content = dataset.get('check_content', True)

    # auto_decompress is a request flag that can be swapped off to prevent Galaxy from automatically
    # decompressing archive files before sniffing.
    auto_decompress = dataset.get('auto_decompress', True)
    try:
        dataset.file_type
    except AttributeError:
        raise UploadProblemException(
            'Unable to process uploaded file, missing file_type parameter.')

    if dataset.type == 'url':
        try:
            dataset.path = sniff.stream_url_to_file(dataset.path)
        except Exception as e:
            raise UploadProblemException('Unable to fetch %s\n%s' %
                                         (dataset.path, str(e)))

    # See if we have an empty file
    if not os.path.exists(dataset.path):
        raise UploadProblemException(
            'Uploaded temporary file (%s) does not exist.' % dataset.path)

    if not os.path.getsize(dataset.path) > 0:
        raise UploadProblemException('The uploaded file is empty')

    # Does the first 1K contain a null?
    is_binary = check_binary(dataset.path)

    # Decompress if needed/desired and determine/validate filetype. If a keep-compressed datatype is explicitly selected
    # or if autodetection is selected and the file sniffs as a keep-compressed datatype, it will not be decompressed.
    if not link_data_only:
        if is_zip(dataset.path) and not is_single_file_zip(dataset.path):
            stdout = 'ZIP file contained more than one file, only the first file was added to Galaxy.'
        try:
            ext, converted_path, compression_type = sniff.handle_uploaded_dataset_file(
                dataset.path,
                registry,
                ext=dataset.file_type,
                tmp_prefix='data_id_%s_upload_' % dataset.dataset_id,
                tmp_dir=output_adjacent_tmpdir(output_path),
                in_place=in_place,
                check_content=check_content,
                is_binary=is_binary,
                auto_decompress=auto_decompress,
                uploaded_file_ext=os.path.splitext(
                    dataset.name)[1].lower().lstrip('.'),
                convert_to_posix_lines=dataset.to_posix_lines,
                convert_spaces_to_tabs=dataset.space_to_tab,
            )
        except sniff.InappropriateDatasetContentError as exc:
            raise UploadProblemException(str(exc))
    elif dataset.file_type == 'auto':
        # Link mode can't decompress anyway, so enable sniffing for keep-compressed datatypes even when auto_decompress
        # is enabled
        os.environ['GALAXY_SNIFFER_VALIDATE_MODE'] = '1'
        ext = sniff.guess_ext(dataset.path,
                              registry.sniff_order,
                              is_binary=is_binary)
        os.environ.pop('GALAXY_SNIFFER_VALIDATE_MODE')

    # The converted path will be the same as the input path if no conversion was done (or in-place conversion is used)
    converted_path = None if converted_path == dataset.path else converted_path

    # Validate datasets where the filetype was explicitly set using the filetype's sniffer (if any)
    if dataset.file_type != 'auto':
        datatype = registry.get_datatype_by_extension(dataset.file_type)
        # Enable sniffer "validate mode" (prevents certain sniffers from disabling themselves)
        os.environ['GALAXY_SNIFFER_VALIDATE_MODE'] = '1'
        if hasattr(datatype, 'sniff') and not datatype.sniff(dataset.path):
            stdout = (
                "Warning: The file 'Type' was set to '{ext}' but the file does not appear to be of that"
                " type".format(ext=dataset.file_type))
        os.environ.pop('GALAXY_SNIFFER_VALIDATE_MODE')

    # Handle unsniffable binaries
    if is_binary and ext == 'binary':
        upload_ext = os.path.splitext(dataset.name)[1].lower().lstrip('.')
        if registry.is_extension_unsniffable_binary(upload_ext):
            stdout = (
                "Warning: The file's datatype cannot be determined from its contents and was guessed based on"
                " its extension, to avoid this warning, manually set the file 'Type' to '{ext}' when uploading"
                " this type of file".format(ext=upload_ext))
            ext = upload_ext
        else:
            stdout = (
                "The uploaded binary file format cannot be determined automatically, please set the file 'Type'"
                " manually")

    datatype = registry.get_datatype_by_extension(ext)

    # Strip compression extension from name
    if compression_type and not getattr(
            datatype, 'compressed',
            False) and dataset.name.endswith('.' + compression_type):
        dataset.name = dataset.name[:-len('.' + compression_type)]

    # Move dataset
    if link_data_only:
        # Never alter a file that will not be copied to Galaxy's local file store.
        if datatype.dataset_content_needs_grooming(dataset.path):
            err_msg = 'The uploaded files need grooming, so change your <b>Copy data into Galaxy?</b> selection to be ' + \
                '<b>Copy files into Galaxy</b> instead of <b>Link to files without copying into Galaxy</b> so grooming can be performed.'
            raise UploadProblemException(err_msg)
    if not link_data_only:
        # Move the dataset to its "real" path. converted_path is a tempfile so we move it even if purge_source is False.
        if purge_source or converted_path:
            try:
                shutil.move(converted_path or dataset.path, output_path)
            except OSError as e:
                # We may not have permission to remove the input
                if e.errno != errno.EACCES:
                    raise
        else:
            shutil.copy(dataset.path, output_path)

    # Write the job info
    stdout = stdout or 'uploaded %s file' % ext
    info = dict(type='dataset',
                dataset_id=dataset.dataset_id,
                ext=ext,
                stdout=stdout,
                name=dataset.name,
                line_count=line_count)
    if dataset.get('uuid', None) is not None:
        info['uuid'] = dataset.get('uuid')
    # FIXME: does this belong here? also not output-adjacent-tmpdir aware =/
    if not link_data_only and datatype and datatype.dataset_content_needs_grooming(
            output_path):
        # Groom the dataset content if necessary
        datatype.groom_dataset_content(output_path)
    return info
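
The three comment blocks near the top of add_file interact, so a restatement of that flag logic as a tiny standalone helper may help when reasoning about the combinations; the dataset type strings are the ones the code above checks for, everything else is illustrative.

def upload_flags(dataset_type, run_as_real_user, purge_source_requested=True):
    # external paths (server_dir/path_paste) are never purged or modified in place
    external = dataset_type in ('server_dir', 'path_paste')
    purge_source = purge_source_requested and not run_as_real_user and not external
    in_place = not run_as_real_user and dataset_type not in ('server_dir', 'path_paste', 'ftp_import')
    return purge_source, in_place


print(upload_flags('file', run_as_real_user=False))        # (True, True)
print(upload_flags('ftp_import', run_as_real_user=False))  # (True, False)
print(upload_flags('path_paste', run_as_real_user=True))   # (False, False)
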
Example #11
def __main__():
    filename = sys.argv[1]
    try:
        max_file_size = int(sys.argv[2])
    except Exception:
        max_file_size = 0

    job_params, params = load_input_parameters(filename)
    if job_params is None:  # using an older tabular file
        enhanced_handling = False
        job_params = dict(param_dict=params)
        job_params['output_data'] = [dict(out_data_name='output',
                                          ext='data',
                                          file_name=filename,
                                          extra_files_path=None)]
        job_params['job_config'] = dict(GALAXY_ROOT_DIR=GALAXY_ROOT_DIR, GALAXY_DATATYPES_CONF_FILE=GALAXY_DATATYPES_CONF_FILE, TOOL_PROVIDED_JOB_METADATA_FILE=TOOL_PROVIDED_JOB_METADATA_FILE)
    else:
        enhanced_handling = True
        json_file = open(job_params['job_config']['TOOL_PROVIDED_JOB_METADATA_FILE'], 'w')  # specially named file for output junk to pass onto set metadata

    datatypes_registry = Registry()
    datatypes_registry.load_datatypes(root_dir=job_params['job_config']['GALAXY_ROOT_DIR'], config=job_params['job_config']['GALAXY_DATATYPES_CONF_FILE'])

    URL = params.get('URL', None)  # a parameter named exactly 'URL' indicates that only one dataset is being downloaded
    URL_method = params.get('URL_method', None)

    # The Python support for fetching resources from the web is layered. urllib uses the httplib
    # library, which in turn uses the socket library.  As of Python 2.3 you can specify how long
    # a socket should wait for a response before timing out. By default the socket module has no
    # timeout and can hang. Currently, the socket timeout is not exposed at the httplib or urllib2
    # levels. However, you can set the default timeout ( in seconds ) globally for all sockets by
    # doing the following.
    socket.setdefaulttimeout(600)

    for data_dict in job_params['output_data']:
        cur_filename = data_dict.get('file_name', filename)
        cur_URL = params.get('%s|%s|URL' % (GALAXY_PARAM_PREFIX, data_dict['out_data_name']), URL)
        if not cur_URL or urlparse(cur_URL).scheme not in ('http', 'https', 'ftp'):
            open(cur_filename, 'w').write("")
            stop_err('The remote data source application has not sent back a URL parameter in the request.')

        # The following calls to urlopen() will use the above default timeout
        try:
            if not URL_method or URL_method == 'get':
                page = urlopen(cur_URL)
            elif URL_method == 'post':
                page = urlopen(cur_URL, urlencode(params).encode("utf-8"))
        except Exception as e:
            stop_err('The remote data source application may be off line, please try again later. Error: %s' % str(e))
        if max_file_size:
            file_size = int(page.info().get('Content-Length', 0))
            if file_size > max_file_size:
                stop_err('The size of the data (%d bytes) you have requested exceeds the maximum allowed (%d bytes) on this server.' % (file_size, max_file_size))
        try:
            cur_filename = sniff.stream_to_open_named_file(page, os.open(cur_filename, os.O_WRONLY | os.O_CREAT), cur_filename, source_encoding=get_charset_from_http_headers(page.headers))
        except Exception as e:
            stop_err('Unable to fetch %s:\n%s' % (cur_URL, e))

        # here import checks that upload tool performs
        if enhanced_handling:
            try:
                ext = sniff.handle_uploaded_dataset_file(filename, datatypes_registry, ext=data_dict['ext'])
            except Exception as e:
                stop_err(str(e))
            info = dict(type='dataset',
                        dataset_id=data_dict['dataset_id'],
                        ext=ext)

            json_file.write("%s\n" % dumps(info))
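
A minimal sketch of the GET/POST branch above together with the imports this Python 3 variant presumably relies on (urllib.request / urllib.parse); passing a data payload is what makes urlopen issue a POST.

from urllib.parse import urlencode
from urllib.request import urlopen


def fetch(url, url_method=None, params=None):
    # mirrors the URL_method handling above: default/'get' -> GET, 'post' -> form-encoded POST
    if not url_method or url_method == 'get':
        return urlopen(url)
    if url_method == 'post':
        return urlopen(url, urlencode(params or {}).encode('utf-8'))
    raise ValueError('Unsupported URL_method: %s' % url_method)
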
Example #12
                    % (file_size, max_file_size))
        # do sniff stream for multi_byte
        try:
            cur_filename, is_multi_byte = sniff.stream_to_open_named_file(
                page,
                os.open(cur_filename, os.O_WRONLY | os.O_CREAT),
                cur_filename,
                source_encoding=get_charset_from_http_headers(page.headers))
        except Exception as e:
            stop_err('Unable to fetch %s:\n%s' % (cur_URL, e))

        # here import checks that upload tool performs
        if enhanced_handling:
            try:
                ext = sniff.handle_uploaded_dataset_file(
                    filename,
                    datatypes_registry,
                    ext=data_dict['ext'],
                    is_multi_byte=is_multi_byte)
            except Exception as e:
                stop_err(str(e))
            info = dict(type='dataset',
                        dataset_id=data_dict['dataset_id'],
                        ext=ext)

            json_file.write("%s\n" % dumps(info))


if __name__ == "__main__":
    __main__()
Example #13
        try:
            if not URL_method or URL_method == 'get':
                page = urllib.urlopen( cur_URL )
            elif URL_method == 'post':
                page = urllib.urlopen( cur_URL, urllib.urlencode( params ) )
        except Exception as e:
            stop_err( 'The remote data source application may be off line, please try again later. Error: %s' % str( e ) )
        if max_file_size:
            file_size = int( page.info().get( 'Content-Length', 0 ) )
            if file_size > max_file_size:
                stop_err( 'The size of the data (%d bytes) you have requested exceeds the maximum allowed (%d bytes) on this server.' % ( file_size, max_file_size ) )
        #do sniff stream for multi_byte
        try:
            cur_filename, is_multi_byte = sniff.stream_to_open_named_file( page, os.open( cur_filename, os.O_WRONLY | os.O_CREAT ), cur_filename, source_encoding=get_charset_from_http_headers( page.headers ) )
        except Exception as e:
            stop_err( 'Unable to fetch %s:\n%s' % ( cur_URL, e ) )

        #here import checks that upload tool performs
        if enhanced_handling:
            try:
                ext = sniff.handle_uploaded_dataset_file( filename, datatypes_registry, ext = data_dict[ 'ext' ], is_multi_byte = is_multi_byte )
            except Exception as e:
                stop_err( str( e ) )
            info = dict( type = 'dataset',
                         dataset_id = data_dict[ 'dataset_id' ],
                         ext = ext)

            json_file.write( "%s\n" % dumps( info ) )

if __name__ == "__main__": __main__()
def download_from_genomespace_importer( username, token, json_parameter_file, genomespace_site ):
    json_params = simplejson.loads( open( json_parameter_file, 'r' ).read() )
    datasource_params = json_params.get( 'param_dict' )
    assert None not in [ username, token ], "Missing GenomeSpace username or token."
    output_filename = datasource_params.get( "output_file1", None )
    dataset_id = json_params['output_data'][0]['dataset_id']
    hda_id = json_params['output_data'][0]['hda_id']
    url_opener = get_cookie_opener( username, token )
    #load and set genomespace format ids to galaxy exts
    genomespace_site_dict = get_genomespace_site_urls()[ genomespace_site ]
    set_genomespace_format_identifiers( url_opener, genomespace_site_dict['dmServer'] )
    file_url_name = "URL"
    metadata_parameter_file = open( json_params['job_config']['TOOL_PROVIDED_JOB_METADATA_FILE'], 'wb' )
    #setup datatypes registry for sniffing
    datatypes_registry = Registry()
    datatypes_registry.load_datatypes( root_dir = json_params[ 'job_config' ][ 'GALAXY_ROOT_DIR' ], config = json_params[ 'job_config' ][ 'GALAXY_DATATYPES_CONF_FILE' ] )
    url_param = datasource_params.get( file_url_name, None )
    for download_url in url_param.split( ',' ):
        using_temp_file = False
        parsed_url = urlparse.urlparse( download_url )
        query_params = urlparse.parse_qs( parsed_url[4] )
        #write file to disk
        new_file_request = urllib2.Request( download_url )
        new_file_request.get_method = lambda: 'GET'
        target_download_url = url_opener.open( new_file_request )
        filename = None
        if 'Content-Disposition' in target_download_url.info():
            content_disposition = dict( map( lambda x: x.strip().split('=') if '=' in x else ( x.strip(),'' ), target_download_url.info()['Content-Disposition'].split( ';' ) ) )
            if 'filename' in content_disposition:
                filename = content_disposition[ 'filename' ].strip( "\"'" )
        if not filename:
            parsed_url = urlparse.urlparse( download_url )
            query_params = urlparse.parse_qs( parsed_url[4] )
            filename = urllib.unquote_plus( parsed_url[2].split( '/' )[-1] )
        if output_filename is None:
            #need to use a temp file here, because we do not know the ext yet
            using_temp_file = True
            output_filename = tempfile.NamedTemporaryFile( prefix='tmp-genomespace-importer-' ).name
        output_file = open( output_filename, 'wb' )
        chunk_write( target_download_url, output_file )
        output_file.close()
        
        #determine file format
        file_type = None
        if 'dataformat' in query_params: #this is a converted dataset
            file_type = query_params[ 'dataformat' ][0]
            file_type = get_galaxy_ext_from_genomespace_format_url( url_opener, file_type )
        else:
            try:
                #get and use GSMetadata object
                download_file_path = download_url.split( "%s/file/" % ( genomespace_site_dict['dmServer'] ), 1)[-1] #FIXME: This is a very bad way to get the path for determining metadata. There needs to be a way to query the API using the download URL to get to the metadata object
                metadata_request = urllib2.Request( "%s/%s/filemetadata/%s" % ( genomespace_site_dict['dmServer'], GENOMESPACE_API_VERSION_STRING, download_file_path ) )
                metadata_request.get_method = lambda: 'GET'
                metadata_url = url_opener.open( metadata_request )
                file_metadata_dict = simplejson.loads( metadata_url.read() )
                metadata_url.close()
                file_type = file_metadata_dict.get( 'dataFormat', None )
                if file_type and file_type.get( 'url' ):
                    file_type = file_type.get( 'url' )
                    file_type = get_galaxy_ext_from_genomespace_format_url( url_opener, file_type, default = None )
            except:
                pass
        if file_type is None:
            #try to sniff datatype
            try:
                file_type = sniff.handle_uploaded_dataset_file( output_filename, datatypes_registry )
            except:
                pass #sniff failed
        if file_type is None and '.' in parsed_url[2]:
            #still no known datatype, fall back to using extension
            file_type = parsed_url[2].rsplit( '.', 1 )[-1]
            file_type = GENOMESPACE_EXT_TO_GALAXY_EXT.get( file_type, file_type )
        if file_type is None:
            #use default extension (e.g. 'data')
            file_type = DEFAULT_GALAXY_EXT
        
        #save json info for single primary dataset
        if dataset_id is not None:
            metadata_parameter_file.write( "%s\n" % simplejson.dumps( dict( type = 'dataset',
                                 dataset_id = dataset_id,
                                 ext = file_type,
                                 name = "GenomeSpace importer on %s" % ( filename ) ) ) )
        #if using tmp file, move the file to the new file path dir to get scooped up later
        if using_temp_file:
            shutil.move( output_filename, os.path.join( datasource_params['__new_file_path__'],  'primary_%i_output%s_visible_%s' % ( hda_id, ''.join( c in VALID_CHARS and c or '-' for c in filename ), file_type ) ) )
        
        dataset_id = None #only one primary dataset available
        output_filename = None #only have one filename available
    metadata_parameter_file.close()
    return True
Example #15
def download_from_genomespace_file_browser(json_parameter_file,
                                           genomespace_site):
    json_params = json.loads(open(json_parameter_file, 'r').read())
    datasource_params = json_params.get('param_dict')
    username = datasource_params.get("gs-username", None)
    token = datasource_params.get("gs-token", None)
    assert None not in [username,
                        token], "Missing GenomeSpace username or token."
    output_filename = datasource_params.get("output", None)
    dataset_id = json_params['output_data'][0]['dataset_id']
    hda_id = json_params['output_data'][0]['hda_id']
    url_opener = get_cookie_opener(username, token)
    #load and set genomespace format ids to galaxy exts
    genomespace_site_dict = get_genomespace_site_urls()[genomespace_site]
    set_genomespace_format_identifiers(url_opener,
                                       genomespace_site_dict['dmServer'])

    file_url_prefix = "fileUrl"
    file_type_prefix = "fileFormat"
    metadata_parameter_file = open(
        json_params['job_config']['TOOL_PROVIDED_JOB_METADATA_FILE'], 'wb')

    #setup datatypes registry for sniffing
    datatypes_registry = Registry()
    datatypes_registry.load_datatypes(
        root_dir=json_params['job_config']['GALAXY_ROOT_DIR'],
        config=json_params['job_config']['GALAXY_DATATYPES_CONF_FILE'])

    file_numbers = []
    for name in datasource_params.keys():
        if name.startswith(file_url_prefix):
            name = name[len(file_url_prefix):]
            file_numbers.append(int(name))
    if not file_numbers:
        if output_filename:
            open(output_filename, 'wb')  #erase contents of file
        raise Exception(
            "You must select at least one file to import into Galaxy.")
    file_numbers.sort()
    used_filenames = []
    for file_num in file_numbers:
        url_key = "%s%i" % (file_url_prefix, file_num)
        download_url = datasource_params.get(url_key, None)
        if download_url is None:
            break
        filetype_key = "%s%i" % (file_type_prefix, file_num)
        filetype_url = datasource_params.get(filetype_key, None)
        galaxy_ext = get_galaxy_ext_from_genomespace_format_url(
            url_opener, filetype_url)
        formated_download_url = "%s?%s" % (
            download_url, urllib.urlencode([('dataformat', filetype_url)]))
        new_file_request = urllib2.Request(formated_download_url)
        new_file_request.get_method = lambda: 'GET'
        target_download_url = url_opener.open(new_file_request)
        filename = None
        if 'Content-Disposition' in target_download_url.info():
            # If the response has Content-Disposition, try to get filename from it
            content_disposition = dict(
                map(
                    lambda x: x.strip().split('=')
                    if '=' in x else (x.strip(), ''),
                    target_download_url.info()['Content-Disposition'].split(
                        ';')))
            if 'filename' in content_disposition:
                filename = content_disposition['filename'].strip("\"'")
        if not filename:
            parsed_url = urlparse.urlparse(download_url)
            query_params = urlparse.parse_qs(parsed_url[4])
            filename = urllib.unquote_plus(parsed_url[2].split('/')[-1])
        if not filename:
            filename = download_url
        metadata_dict = None
        original_filename = filename
        if output_filename is None:
            filename = ''.join(c in VALID_CHARS and c or '-' for c in filename)
            while filename in used_filenames:
                filename = "-%s" % filename
            used_filenames.append(filename)
            output_filename = os.path.join(
                os.getcwd(),
                'primary_%i_%s_visible_%s' % (hda_id, filename, galaxy_ext))

            metadata_dict = dict(type='new_primary_dataset',
                                 base_dataset_id=dataset_id,
                                 ext=galaxy_ext,
                                 filename=output_filename,
                                 name="GenomeSpace import on %s" %
                                 (original_filename))
        else:
            if dataset_id is not None:
                metadata_dict = dict(type='dataset',
                                     dataset_id=dataset_id,
                                     ext=galaxy_ext,
                                     name="GenomeSpace import on %s" %
                                     (filename))
        output_file = open(output_filename, 'wb')
        chunk_write(target_download_url, output_file)
        output_file.close()

        if (galaxy_ext == AUTO_GALAXY_EXT or filetype_url
                == GENOMESPACE_FORMAT_IDENTIFIER_UNKNOWN) and metadata_dict:
            #try to sniff datatype
            try:
                galaxy_ext = sniff.handle_uploaded_dataset_file(
                    output_filename, datatypes_registry)
            except:
                #sniff failed
                galaxy_ext = original_filename.rsplit('.', 1)[-1]
                if galaxy_ext not in datatypes_registry.datatypes_by_extension:
                    galaxy_ext = DEFAULT_GALAXY_EXT
            metadata_dict['ext'] = galaxy_ext

        output_filename = None  #only have one filename available

        #write out metadata info
        if metadata_dict:
            metadata_parameter_file.write("%s\n" % json.dumps(metadata_dict))

    metadata_parameter_file.close()
    return True
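
For reference, a sketch of the two record shapes this family of tools writes, one JSON object per line, into the TOOL_PROVIDED_JOB_METADATA_FILE that Galaxy's metadata setting consumes later; all field values below are invented placeholders.

import json

primary = dict(type='dataset', dataset_id=42, ext='fasta',
               name='GenomeSpace import on example.fasta')
extra = dict(type='new_primary_dataset', base_dataset_id=42, ext='bed',
             filename='/tmp/primary_7_example-bed_visible_bed',
             name='GenomeSpace import on example.bed')
with open('galaxy.json', 'w') as fh:  # placeholder path for the metadata file
    for record in (primary, extra):
        fh.write('%s\n' % json.dumps(record))
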