Exemplo n.º 1
0
def looks_like_xml(path, regex=TOOL_REGEX):
    full_path = os.path.abspath(path)

    if not full_path.endswith(".xml"):
        return False

    if not os.path.getsize(full_path):
        return False

    if(checkers.check_binary(full_path) or
       checkers.check_image(full_path) or
       checkers.is_gzip(full_path) or
       checkers.is_bz2(full_path) or
       checkers.is_zip(full_path)):
        return False

    with open(path, encoding='utf-8') as f:
        try:
            start_contents = f.read(5 * 1024)
        except UnicodeDecodeError:
            return False
        if regex.search(start_contents):
            return True

    return False
Exemplo n.º 2
0
def handle_uploaded_dataset_file(filename,
                                 datatypes_registry,
                                 ext='auto',
                                 is_multi_byte=False):
    is_valid, ext = handle_compressed_file(filename,
                                           datatypes_registry,
                                           ext=ext)

    if not is_valid:
        raise InappropriateDatasetContentError(
            'The compressed uploaded file contains inappropriate content.')

    if ext in AUTO_DETECT_EXTENSIONS:
        ext = guess_ext(filename,
                        sniff_order=datatypes_registry.sniff_order,
                        is_multi_byte=is_multi_byte)

    if check_binary(filename):
        if not Binary.is_ext_unsniffable(
                ext) and not datatypes_registry.get_datatype_by_extension(
                    ext).sniff(filename):
            raise InappropriateDatasetContentError(
                'The binary uploaded file contains inappropriate content.')
    elif check_html(filename):
        raise InappropriateDatasetContentError(
            'The uploaded file contains inappropriate HTML content.')
    return ext
Exemplo n.º 3
0
def handle_uploaded_dataset_file_internal(
        filename,
        datatypes_registry,
        ext='auto',
        tmp_prefix='sniff_upload_',
        tmp_dir=None,
        in_place=False,
        check_content=True,
        is_binary=None,
        auto_decompress=True,
        uploaded_file_ext=None,
        convert_to_posix_lines=None,
        convert_spaces_to_tabs=None,
):
    is_valid, ext, converted_path, compressed_type = handle_compressed_file(
        filename,
        datatypes_registry,
        ext=ext,
        tmp_prefix=tmp_prefix,
        tmp_dir=tmp_dir,
        in_place=in_place,
        check_content=check_content,
        auto_decompress=auto_decompress,
    )
    try:
        if not is_valid:
            if is_tar(converted_path):
                raise InappropriateDatasetContentError('TAR file uploads are not supported')
            raise InappropriateDatasetContentError('The uploaded compressed file contains invalid content')

        # This needs to be checked again after decompression
        is_binary = check_binary(converted_path)

        if not is_binary and (convert_to_posix_lines or convert_spaces_to_tabs):
            # Convert universal line endings to Posix line endings, spaces to tabs (if desired)
            if convert_spaces_to_tabs:
                convert_fxn = convert_newlines_sep2tabs
            else:
                convert_fxn = convert_newlines
            line_count, _converted_path = convert_fxn(converted_path, in_place=in_place, tmp_dir=tmp_dir, tmp_prefix=tmp_prefix)
            if not in_place:
                if converted_path and filename != converted_path:
                    os.unlink(converted_path)
                converted_path = _converted_path

        if ext in AUTO_DETECT_EXTENSIONS:
            ext = guess_ext(converted_path, sniff_order=datatypes_registry.sniff_order, is_binary=is_binary)

        if not is_binary and check_content and check_html(converted_path):
            raise InappropriateDatasetContentError('The uploaded file contains invalid HTML content')
    except Exception:
        if filename != converted_path:
            os.unlink(converted_path)
        raise
    return ext, converted_path, compressed_type
Exemplo n.º 4
0
def handle_uploaded_dataset_file_internal(
        filename,
        datatypes_registry,
        ext='auto',
        tmp_prefix='sniff_upload_',
        tmp_dir=None,
        in_place=False,
        check_content=True,
        is_binary=None,
        auto_decompress=True,
        uploaded_file_ext=None,
        convert_to_posix_lines=None,
        convert_spaces_to_tabs=None,
):
    is_valid, ext, converted_path, compressed_type = handle_compressed_file(
        filename,
        datatypes_registry,
        ext=ext,
        tmp_prefix=tmp_prefix,
        tmp_dir=tmp_dir,
        in_place=in_place,
        check_content=check_content,
        auto_decompress=auto_decompress,
    )
    try:
        if not is_valid:
            if is_tar(converted_path):
                raise InappropriateDatasetContentError('TAR file uploads are not supported')
            raise InappropriateDatasetContentError('The uploaded compressed file contains invalid content')

        # This needs to be checked again after decompression
        is_binary = check_binary(converted_path)

        if not is_binary and (convert_to_posix_lines or convert_spaces_to_tabs):
            # Convert universal line endings to Posix line endings, spaces to tabs (if desired)
            if convert_spaces_to_tabs:
                convert_fxn = convert_newlines_sep2tabs
            else:
                convert_fxn = convert_newlines
            line_count, _converted_path = convert_fxn(converted_path, in_place=in_place, tmp_dir=tmp_dir, tmp_prefix=tmp_prefix)
            if not in_place:
                if converted_path and filename != converted_path:
                    os.unlink(converted_path)
                converted_path = _converted_path

        if ext in AUTO_DETECT_EXTENSIONS:
            ext = guess_ext(converted_path, sniff_order=datatypes_registry.sniff_order, is_binary=is_binary)

        if not is_binary and check_content and check_html(converted_path):
            raise InappropriateDatasetContentError('The uploaded file contains invalid HTML content')
    except Exception:
        if filename != converted_path:
            os.unlink(converted_path)
        raise
    return ext, converted_path, compressed_type
def sniff_and_handle_data_type(file_path, datatypes_registry):
    """
    The sniff.handle_uploaded_dataset_file() method in Galaxy performs dual
    functions: it sniffs the filetype and if it's a compressed archive for
    a non compressed datatype such as fasta, it will be unpacked.
    """
    ext = sniff.handle_uploaded_dataset_file(file_path, datatypes_registry)
    if not ext or ext == "data":
        is_binary = check_binary(file_path)
        ext = sniff.guess_ext(file_path,
                              datatypes_registry.sniff_order,
                              is_binary=is_binary)
    return ext
Exemplo n.º 6
0
def handle_uploaded_dataset_file(filename, datatypes_registry, ext='auto', is_multi_byte=False):
    is_valid, ext = handle_compressed_file(filename, datatypes_registry, ext=ext)

    if not is_valid:
        raise InappropriateDatasetContentError('The compressed uploaded file contains inappropriate content.')

    if ext in AUTO_DETECT_EXTENSIONS:
        ext = guess_ext(filename, sniff_order=datatypes_registry.sniff_order, is_multi_byte=is_multi_byte)

    if check_binary(filename):
        if not Binary.is_ext_unsniffable(ext) and not datatypes_registry.get_datatype_by_extension(ext).sniff(filename):
            raise InappropriateDatasetContentError('The binary uploaded file contains inappropriate content.')
    elif check_html(filename):
        raise InappropriateDatasetContentError('The uploaded file contains inappropriate HTML content.')
    return ext
Exemplo n.º 7
0
 def detect_datatype(self, trans, dataset_assoc):
     """Sniff and assign the datatype to a given dataset association (ldda or hda)"""
     data = trans.sa_session.query(self.model_class).get(dataset_assoc.id)
     if data.datatype.is_datatype_change_allowed():
         if not data.ok_to_edit_metadata():
             raise exceptions.ItemAccessibilityException('This dataset is currently being used as input or output. You cannot change datatype until the jobs have completed or you have canceled them.')
         else:
             path = data.dataset.file_name
             is_binary = check_binary(path)
             datatype = sniff.guess_ext(path, trans.app.datatypes_registry.sniff_order, is_binary=is_binary)
             trans.app.datatypes_registry.change_datatype(data, datatype)
             trans.sa_session.flush()
             self.set_metadata(trans, dataset_assoc)
     else:
         raise exceptions.InsufficientPermissionsException(f'Changing datatype "{data.extension}" is not allowed.')
Exemplo n.º 8
0
def get_repository_file_contents(app,
                                 file_path,
                                 repository_id,
                                 is_admin=False):
    """Return the display-safe contents of a repository file for display in a browser."""
    safe_str = ''
    if not is_path_browsable(app, file_path, repository_id, is_admin):
        log.warning(
            'Request tries to access a file outside of the repository location. File path: %s',
            file_path)
        return 'Invalid file path'
    # Symlink targets are checked by is_path_browsable
    if os.path.islink(file_path):
        safe_str = 'link to: ' + basic_util.to_html_string(
            os.readlink(file_path))
        return safe_str
    elif checkers.is_gzip(file_path):
        return '<br/>gzip compressed file<br/>'
    elif checkers.is_bz2(file_path):
        return '<br/>bz2 compressed file<br/>'
    elif checkers.check_zip(file_path):
        return '<br/>zip compressed file<br/>'
    elif checkers.check_binary(file_path):
        return '<br/>Binary file<br/>'
    else:
        for i, line in enumerate(open(file_path)):
            safe_str = '%s%s' % (safe_str, basic_util.to_html_string(line))
            # Stop reading after string is larger than MAX_CONTENT_SIZE.
            if len(safe_str) > MAX_CONTENT_SIZE:
                large_str = \
                    '<br/>File contents truncated because file size is larger than maximum viewing size of %s<br/>' % \
                    util.nice_size( MAX_CONTENT_SIZE )
                safe_str = '%s%s' % (safe_str, large_str)
                break

        if len(safe_str) > basic_util.MAX_DISPLAY_SIZE:
            # Eliminate the middle of the file to display a file no larger than basic_util.MAX_DISPLAY_SIZE.
            # This may not be ideal if the file is larger than MAX_CONTENT_SIZE.
            join_by_str = \
                "<br/><br/>...some text eliminated here because file size is larger than maximum viewing size of %s...<br/><br/>" % \
                util.nice_size( basic_util.MAX_DISPLAY_SIZE )
            safe_str = util.shrink_string_by_size(safe_str,
                                                  basic_util.MAX_DISPLAY_SIZE,
                                                  join_by=join_by_str,
                                                  left_larger=True,
                                                  beginning_on_size_error=True)
        return safe_str
Exemplo n.º 9
0
def get_repository_file_contents(app, file_path, repository_id, is_admin=False):
    """Return the display-safe contents of a repository file for display in a browser."""
    safe_str = ''
    if not is_path_browsable(app, file_path, repository_id, is_admin):
        log.warning('Request tries to access a file outside of the repository location. File path: %s', file_path)
        return 'Invalid file path'
    # Symlink targets are checked by is_path_browsable
    if os.path.islink(file_path):
        safe_str = 'link to: ' + basic_util.to_html_string(os.readlink(file_path))
        return safe_str
    elif checkers.is_gzip(file_path):
        return '<br/>gzip compressed file<br/>'
    elif checkers.is_bz2(file_path):
        return '<br/>bz2 compressed file<br/>'
    elif checkers.is_zip(file_path):
        return '<br/>zip compressed file<br/>'
    elif checkers.check_binary(file_path):
        return '<br/>Binary file<br/>'
    else:
        for i, line in enumerate(open(file_path)):
            safe_str = '%s%s' % (safe_str, basic_util.to_html_string(line))
            # Stop reading after string is larger than MAX_CONTENT_SIZE.
            if len(safe_str) > MAX_CONTENT_SIZE:
                large_str = \
                    '<br/>File contents truncated because file size is larger than maximum viewing size of %s<br/>' % \
                    util.nice_size(MAX_CONTENT_SIZE)
                safe_str = '%s%s' % (safe_str, large_str)
                break

        if len(safe_str) > basic_util.MAX_DISPLAY_SIZE:
            # Eliminate the middle of the file to display a file no larger than basic_util.MAX_DISPLAY_SIZE.
            # This may not be ideal if the file is larger than MAX_CONTENT_SIZE.
            join_by_str = \
                "<br/><br/>...some text eliminated here because file size is larger than maximum viewing size of %s...<br/><br/>" % \
                util.nice_size(basic_util.MAX_DISPLAY_SIZE)
            safe_str = util.shrink_string_by_size(safe_str,
                                                  basic_util.MAX_DISPLAY_SIZE,
                                                  join_by=join_by_str,
                                                  left_larger=True,
                                                  beginning_on_size_error=True)
        return safe_str
Exemplo n.º 10
0
def looks_like_a_tool_xml(path):
    full_path = os.path.abspath(path)

    if not full_path.endswith(".xml"):
        return False

    if not os.path.getsize(full_path):
        return False

    if (checkers.check_binary(full_path) or checkers.check_image(full_path)
            or checkers.check_gzip(full_path)[0]
            or checkers.check_bz2(full_path)[0]
            or checkers.check_zip(full_path)):
        return False

    with open(path, "r") as f:
        start_contents = f.read(5 * 1024)
        if TOOL_REGEX.search(start_contents):
            return True

    return False
Exemplo n.º 11
0
def looks_like_a_tool_xml(path):
    """Quick check to see if a file looks like it may be a Galaxy XML tool file."""
    full_path = os.path.abspath(path)

    if not full_path.endswith(".xml"):
        return False

    if not os.path.getsize(full_path):
        return False

    if (checkers.check_binary(full_path) or checkers.check_image(full_path)
            or checkers.is_gzip(full_path) or checkers.is_bz2(full_path)
            or checkers.is_zip(full_path)):
        return False

    with open(path, "r") as f:
        start_contents = f.read(5 * 1024)
        if TOOL_REGEX.search(start_contents):
            return True

    return False
Exemplo n.º 12
0
def looks_like_xml(path, regex=TOOL_REGEX):
    full_path = os.path.abspath(path)

    if not full_path.endswith(".xml"):
        return False

    if not os.path.getsize(full_path):
        return False

    if(checkers.check_binary(full_path) or
       checkers.check_image(full_path) or
       checkers.is_gzip(full_path) or
       checkers.is_bz2(full_path) or
       checkers.is_zip(full_path)):
        return False

    with open(path, "r") as f:
        start_contents = f.read(5 * 1024)
        if regex.search(start_contents):
            return True

    return False
Exemplo n.º 13
0
def looks_like_a_tool_xml(path):
    """Quick check to see if a file looks like it may be a Galaxy XML tool file."""
    full_path = os.path.abspath(path)

    if not full_path.endswith(".xml"):
        return False

    if not os.path.getsize(full_path):
        return False

    if(checkers.check_binary(full_path) or
       checkers.check_image(full_path) or
       checkers.check_gzip(full_path)[0] or
       checkers.check_bz2(full_path)[0] or
       checkers.check_zip(full_path)):
        return False

    with open(path, "r") as f:
        start_contents = f.read(5 * 1024)
        if TOOL_REGEX.search(start_contents):
            return True

    return False
Exemplo n.º 14
0
def is_data_index_sample_file( file_path ):
    """
    Attempt to determine if a .sample file is appropriate for copying to ~/tool-data when
    a tool shed repository is being installed into a Galaxy instance.
    """
    # Currently most data index files are tabular, so check that first.  We'll assume that
    # if the file is tabular, it's ok to copy.
    if is_column_based( file_path ):
        return True
    # If the file is any of the following, don't copy it.
    if checkers.check_html( file_path ):
        return False
    if checkers.check_image( file_path ):
        return False
    if checkers.check_binary( name=file_path ):
        return False
    if checkers.is_bz2( file_path ):
        return False
    if checkers.is_gzip( file_path ):
        return False
    if checkers.check_zip( file_path ):
        return False
    # Default to copying the file if none of the above are true.
    return True
Exemplo n.º 15
0
def is_data_index_sample_file(file_path):
    """
    Attempt to determine if a .sample file is appropriate for copying to ~/tool-data when
    a tool shed repository is being installed into a Galaxy instance.
    """
    # Currently most data index files are tabular, so check that first.  We'll assume that
    # if the file is tabular, it's ok to copy.
    if is_column_based(file_path):
        return True
    # If the file is any of the following, don't copy it.
    if checkers.check_html(file_path):
        return False
    if checkers.check_image(file_path):
        return False
    if checkers.check_binary(name=file_path):
        return False
    if checkers.is_bz2(file_path):
        return False
    if checkers.is_gzip(file_path):
        return False
    if checkers.is_zip(file_path):
        return False
    # Default to copying the file if none of the above are true.
    return True
Exemplo n.º 16
0
def handle_upload(
    registry,
    path,  # dataset.path
    requested_ext,  # dataset.file_type
    name,  # dataset.name,
    tmp_prefix,
    tmp_dir,
    check_content,
    link_data_only,
    in_place,
    auto_decompress,
    convert_to_posix_lines,
    convert_spaces_to_tabs,
):
    stdout = None
    converted_path = None
    multi_file_zip = False

    # Does the first 1K contain a null?
    is_binary = check_binary(path)

    # Decompress if needed/desired and determine/validate filetype. If a keep-compressed datatype is explicitly selected
    # or if autodetection is selected and the file sniffs as a keep-compressed datatype, it will not be decompressed.
    if not link_data_only:
        if auto_decompress and is_zip(path) and not is_single_file_zip(path):
            multi_file_zip = True
        try:
            ext, converted_path, compression_type = sniff.handle_uploaded_dataset_file_internal(
                path,
                registry,
                ext=requested_ext,
                tmp_prefix=tmp_prefix,
                tmp_dir=tmp_dir,
                in_place=in_place,
                check_content=check_content,
                is_binary=is_binary,
                auto_decompress=auto_decompress,
                uploaded_file_ext=os.path.splitext(name)[1].lower().lstrip('.'),
                convert_to_posix_lines=convert_to_posix_lines,
                convert_spaces_to_tabs=convert_spaces_to_tabs,
            )
        except sniff.InappropriateDatasetContentError as exc:
            raise UploadProblemException(str(exc))
    elif requested_ext == 'auto':
        ext = sniff.guess_ext(path, registry.sniff_order, is_binary=is_binary)
    else:
        ext = requested_ext

    # The converted path will be the same as the input path if no conversion was done (or in-place conversion is used)
    converted_path = None if converted_path == path else converted_path

    # Validate datasets where the filetype was explicitly set using the filetype's sniffer (if any)
    if requested_ext != 'auto':
        datatype = registry.get_datatype_by_extension(requested_ext)
        # Enable sniffer "validate mode" (prevents certain sniffers from disabling themselves)
        if check_content and hasattr(datatype, 'sniff') and not datatype.sniff(path):
            stdout = ("Warning: The file 'Type' was set to '{ext}' but the file does not appear to be of that"
                      " type".format(ext=requested_ext))

    # Handle unsniffable binaries
    if is_binary and ext == 'binary':
        upload_ext = os.path.splitext(name)[1].lower().lstrip('.')
        if registry.is_extension_unsniffable_binary(upload_ext):
            stdout = ("Warning: The file's datatype cannot be determined from its contents and was guessed based on"
                     " its extension, to avoid this warning, manually set the file 'Type' to '{ext}' when uploading"
                     " this type of file".format(ext=upload_ext))
            ext = upload_ext
        else:
            stdout = ("The uploaded binary file format cannot be determined automatically, please set the file 'Type'"
                      " manually")

    datatype = registry.get_datatype_by_extension(ext)
    if multi_file_zip and not getattr(datatype, 'compressed', False):
        stdout = 'ZIP file contained more than one file, only the first file was added to Galaxy.'

    return stdout, ext, datatype, is_binary, converted_path
Exemplo n.º 17
0
def add_file(dataset, registry, json_file, output_path):
    data_type = None
    line_count = None
    converted_path = None
    stdout = None
    link_data_only = dataset.get('link_data_only', 'copy_files')
    run_as_real_user = in_place = dataset.get('in_place', True)
    purge_source = dataset.get('purge_source', True)
    # in_place is True if there is no external chmod in place,
    # however there are other instances where modifications should not occur in_place:
    # when a file is added from a directory on the local file system (ftp import folder or any other path).
    if dataset.type in ('server_dir', 'path_paste', 'ftp_import'):
        in_place = False
    check_content = dataset.get('check_content' , True)
    auto_decompress = dataset.get('auto_decompress', True)
    try:
        ext = dataset.file_type
    except AttributeError:
        file_err('Unable to process uploaded file, missing file_type parameter.', dataset, json_file)
        return

    if dataset.type == 'url':
        try:
            page = urlopen(dataset.path)  # page will be .close()ed by sniff methods
            temp_name, dataset.is_multi_byte = sniff.stream_to_file(page, prefix='url_paste', source_encoding=util.get_charset_from_http_headers(page.headers))
        except Exception as e:
            file_err('Unable to fetch %s\n%s' % (dataset.path, str(e)), dataset, json_file)
            return
        dataset.path = temp_name
    # See if we have an empty file
    if not os.path.exists(dataset.path):
        file_err('Uploaded temporary file (%s) does not exist.' % dataset.path, dataset, json_file)
        return
    if not os.path.getsize(dataset.path) > 0:
        file_err('The uploaded file is empty', dataset, json_file)
        return
    if not dataset.type == 'url':
        # Already set is_multi_byte above if type == 'url'
        try:
            dataset.is_multi_byte = multi_byte.is_multi_byte(codecs.open(dataset.path, 'r', 'utf-8').read(100))
        except UnicodeDecodeError as e:
            dataset.is_multi_byte = False
    # Is dataset an image?
    i_ext = get_image_ext(dataset.path)
    if i_ext:
        ext = i_ext
        data_type = ext
    # Is dataset content multi-byte?
    elif dataset.is_multi_byte:
        data_type = 'multi-byte char'
        ext = sniff.guess_ext(dataset.path, registry.sniff_order, is_multi_byte=True)
    # Is dataset content supported sniffable binary?
    else:
        # FIXME: This ignores the declared sniff order in datatype_conf.xml
        # resulting in improper behavior
        type_info = Binary.is_sniffable_binary(dataset.path)
        if type_info:
            data_type = type_info[0]
            ext = type_info[1]
    if not data_type:
        root_datatype = registry.get_datatype_by_extension(dataset.file_type)
        if getattr(root_datatype, 'compressed', False):
            data_type = 'compressed archive'
            ext = dataset.file_type
        else:
            # See if we have a gzipped file, which, if it passes our restrictions, we'll uncompress
            is_gzipped, is_valid = check_gzip(dataset.path, check_content=check_content)
            if is_gzipped and not is_valid:
                file_err('The gzipped uploaded file contains inappropriate content', dataset, json_file)
                return
            elif is_gzipped and is_valid and auto_decompress:
                if link_data_only == 'copy_files':
                    # We need to uncompress the temp_name file, but BAM files must remain compressed in the BGZF format
                    CHUNK_SIZE = 2 ** 20  # 1Mb
                    fd, uncompressed = tempfile.mkstemp(prefix='data_id_%s_upload_gunzip_' % dataset.dataset_id, dir=os.path.dirname(output_path), text=False)
                    gzipped_file = gzip.GzipFile(dataset.path, 'rb')
                    while 1:
                        try:
                            chunk = gzipped_file.read(CHUNK_SIZE)
                        except IOError:
                            os.close(fd)
                            os.remove(uncompressed)
                            file_err('Problem decompressing gzipped data', dataset, json_file)
                            return
                        if not chunk:
                            break
                        os.write(fd, chunk)
                    os.close(fd)
                    gzipped_file.close()
                    # Replace the gzipped file with the decompressed file if it's safe to do so
                    if not in_place:
                        dataset.path = uncompressed
                    else:
                        shutil.move(uncompressed, dataset.path)
                    os.chmod(dataset.path, 0o644)
                dataset.name = dataset.name.rstrip('.gz')
                data_type = 'gzip'
            if not data_type:
                # See if we have a bz2 file, much like gzip
                is_bzipped, is_valid = check_bz2(dataset.path, check_content)
                if is_bzipped and not is_valid:
                    file_err('The gzipped uploaded file contains inappropriate content', dataset, json_file)
                    return
                elif is_bzipped and is_valid and auto_decompress:
                    if link_data_only == 'copy_files':
                        # We need to uncompress the temp_name file
                        CHUNK_SIZE = 2 ** 20  # 1Mb
                        fd, uncompressed = tempfile.mkstemp(prefix='data_id_%s_upload_bunzip2_' % dataset.dataset_id, dir=os.path.dirname(output_path), text=False)
                        bzipped_file = bz2.BZ2File(dataset.path, 'rb')
                        while 1:
                            try:
                                chunk = bzipped_file.read(CHUNK_SIZE)
                            except IOError:
                                os.close(fd)
                                os.remove(uncompressed)
                                file_err('Problem decompressing bz2 compressed data', dataset, json_file)
                                return
                            if not chunk:
                                break
                            os.write(fd, chunk)
                        os.close(fd)
                        bzipped_file.close()
                        # Replace the bzipped file with the decompressed file if it's safe to do so
                        if not in_place:
                            dataset.path = uncompressed
                        else:
                            shutil.move(uncompressed, dataset.path)
                        os.chmod(dataset.path, 0o644)
                    dataset.name = dataset.name.rstrip('.bz2')
                    data_type = 'bz2'
            if not data_type:
                # See if we have a zip archive
                is_zipped = check_zip(dataset.path)
                if is_zipped and auto_decompress:
                    if link_data_only == 'copy_files':
                        CHUNK_SIZE = 2 ** 20  # 1Mb
                        uncompressed = None
                        uncompressed_name = None
                        unzipped = False
                        z = zipfile.ZipFile(dataset.path)
                        for name in z.namelist():
                            if name.endswith('/'):
                                continue
                            if unzipped:
                                stdout = 'ZIP file contained more than one file, only the first file was added to Galaxy.'
                                break
                            fd, uncompressed = tempfile.mkstemp(prefix='data_id_%s_upload_zip_' % dataset.dataset_id, dir=os.path.dirname(output_path), text=False)
                            if sys.version_info[:2] >= (2, 6):
                                zipped_file = z.open(name)
                                while 1:
                                    try:
                                        chunk = zipped_file.read(CHUNK_SIZE)
                                    except IOError:
                                        os.close(fd)
                                        os.remove(uncompressed)
                                        file_err('Problem decompressing zipped data', dataset, json_file)
                                        return
                                    if not chunk:
                                        break
                                    os.write(fd, chunk)
                                os.close(fd)
                                zipped_file.close()
                                uncompressed_name = name
                                unzipped = True
                            else:
                                # python < 2.5 doesn't have a way to read members in chunks(!)
                                try:
                                    outfile = open(uncompressed, 'wb')
                                    outfile.write(z.read(name))
                                    outfile.close()
                                    uncompressed_name = name
                                    unzipped = True
                                except IOError:
                                    os.close(fd)
                                    os.remove(uncompressed)
                                    file_err('Problem decompressing zipped data', dataset, json_file)
                                    return
                        z.close()
                        # Replace the zipped file with the decompressed file if it's safe to do so
                        if uncompressed is not None:
                            if not in_place:
                                dataset.path = uncompressed
                            else:
                                shutil.move(uncompressed, dataset.path)
                            os.chmod(dataset.path, 0o644)
                            dataset.name = uncompressed_name
                    data_type = 'zip'
            if not data_type:
                # TODO refactor this logic.  check_binary isn't guaranteed to be
                # correct since it only looks at whether the first 100 chars are
                # printable or not.  If someone specifies a known unsniffable
                # binary datatype and check_binary fails, the file gets mangled.
                if check_binary(dataset.path) or Binary.is_ext_unsniffable(dataset.file_type):
                    # We have a binary dataset, but it is not Bam, Sff or Pdf
                    data_type = 'binary'
                    # binary_ok = False
                    parts = dataset.name.split(".")
                    if len(parts) > 1:
                        ext = parts[-1].strip().lower()
                        if check_content and not Binary.is_ext_unsniffable(ext):
                            file_err('The uploaded binary file contains inappropriate content', dataset, json_file)
                            return
                        elif Binary.is_ext_unsniffable(ext) and dataset.file_type != ext:
                            err_msg = "You must manually set the 'File Format' to '%s' when uploading %s files." % (ext.capitalize(), ext)
                            file_err(err_msg, dataset, json_file)
                            return
            if not data_type:
                # We must have a text file
                if check_content and check_html(dataset.path):
                    file_err('The uploaded file contains inappropriate HTML content', dataset, json_file)
                    return
            if data_type != 'binary':
                if link_data_only == 'copy_files' and data_type not in ('gzip', 'bz2', 'zip'):
                    # Convert universal line endings to Posix line endings if to_posix_lines is True
                    # and the data is not binary or gzip-, bz2- or zip-compressed.
                    if dataset.to_posix_lines:
                        tmpdir = output_adjacent_tmpdir(output_path)
                        tmp_prefix = 'data_id_%s_convert_' % dataset.dataset_id
                        if dataset.space_to_tab:
                            line_count, converted_path = sniff.convert_newlines_sep2tabs(dataset.path, in_place=in_place, tmp_dir=tmpdir, tmp_prefix=tmp_prefix)
                        else:
                            line_count, converted_path = sniff.convert_newlines(dataset.path, in_place=in_place, tmp_dir=tmpdir, tmp_prefix=tmp_prefix)
                if dataset.file_type == 'auto':
                    ext = sniff.guess_ext(dataset.path, registry.sniff_order)
                else:
                    ext = dataset.file_type
                data_type = ext
    # Save job info for the framework
    if ext == 'auto' and data_type == 'binary':
        ext = 'data'
    if ext == 'auto' and dataset.ext:
        ext = dataset.ext
    if ext == 'auto':
        ext = 'data'
    datatype = registry.get_datatype_by_extension(ext)
    if dataset.type in ('server_dir', 'path_paste') and link_data_only == 'link_to_files':
        # Never alter a file that will not be copied to Galaxy's local file store.
        if datatype.dataset_content_needs_grooming(dataset.path):
            err_msg = 'The uploaded files need grooming, so change your <b>Copy data into Galaxy?</b> selection to be ' + \
                '<b>Copy files into Galaxy</b> instead of <b>Link to files without copying into Galaxy</b> so grooming can be performed.'
            file_err(err_msg, dataset, json_file)
            return
    if link_data_only == 'copy_files' and converted_path:
        # Move the dataset to its "real" path
        try:
            shutil.move(converted_path, output_path)
        except OSError as e:
            # We may not have permission to remove converted_path
            if e.errno != errno.EACCES:
                raise
    elif link_data_only == 'copy_files':
        if purge_source and not run_as_real_user:
            # if the upload tool runs as a real user the real user
            # can't move dataset.path as this path is owned by galaxy.
            shutil.move(dataset.path, output_path)
        else:
            shutil.copy(dataset.path, output_path)
    # Write the job info
    stdout = stdout or 'uploaded %s file' % data_type
    info = dict(type='dataset',
                dataset_id=dataset.dataset_id,
                ext=ext,
                stdout=stdout,
                name=dataset.name,
                line_count=line_count)
    if dataset.get('uuid', None) is not None:
        info['uuid'] = dataset.get('uuid')
    json_file.write(dumps(info) + "\n")
    if link_data_only == 'copy_files' and datatype and datatype.dataset_content_needs_grooming(output_path):
        # Groom the dataset content if necessary
        datatype.groom_dataset_content(output_path)
Exemplo n.º 18
0
             z.close()
             # Replace the zipped file with the decompressed file if it's safe to do so
             if uncompressed is not None:
                 if dataset.type in ( 'server_dir', 'path_paste' ) or not in_place:
                     dataset.path = uncompressed
                 else:
                     shutil.move( uncompressed, dataset.path )
                 os.chmod(dataset.path, 0644)
                 dataset.name = uncompressed_name
         data_type = 'zip'
 if not data_type:
     # TODO refactor this logic.  check_binary isn't guaranteed to be
     # correct since it only looks at whether the first 100 chars are
     # printable or not.  If someone specifies a known unsniffable
     # binary datatype and check_binary fails, the file gets mangled.
     if check_binary( dataset.path ) or Binary.is_ext_unsniffable(dataset.file_type):
         # We have a binary dataset, but it is not Bam, Sff or Pdf
         data_type = 'binary'
         # binary_ok = False
         parts = dataset.name.split( "." )
         if len( parts ) > 1:
             ext = parts[-1].strip().lower()
             if not Binary.is_ext_unsniffable(ext):
                 file_err( 'The uploaded binary file contains inappropriate content', dataset, json_file )
                 return
             elif Binary.is_ext_unsniffable(ext) and dataset.file_type != ext:
                 err_msg = "You must manually set the 'File Format' to '%s' when uploading %s files." % ( ext.capitalize(), ext )
                 file_err( err_msg, dataset, json_file )
                 return
 if not data_type:
     # We must have a text file
Exemplo n.º 19
0
def handle_upload(
    registry,
    path,  # dataset.path
    requested_ext,  # dataset.file_type
    name,  # dataset.name,
    tmp_prefix,
    tmp_dir,
    check_content,
    link_data_only,
    in_place,
    auto_decompress,
    convert_to_posix_lines,
    convert_spaces_to_tabs,
):
    stdout = None
    converted_path = None

    # Does the first 1K contain a null?
    is_binary = check_binary(path)

    # Decompress if needed/desired and determine/validate filetype. If a keep-compressed datatype is explicitly selected
    # or if autodetection is selected and the file sniffs as a keep-compressed datatype, it will not be decompressed.
    if not link_data_only:
        if auto_decompress and is_zip(path) and not is_single_file_zip(path):
            stdout = 'ZIP file contained more than one file, only the first file was added to Galaxy.'
        try:
            ext, converted_path, compression_type = sniff.handle_uploaded_dataset_file_internal(
                path,
                registry,
                ext=requested_ext,
                tmp_prefix=tmp_prefix,
                tmp_dir=tmp_dir,
                in_place=in_place,
                check_content=check_content,
                is_binary=is_binary,
                auto_decompress=auto_decompress,
                uploaded_file_ext=os.path.splitext(name)[1].lower().lstrip(
                    '.'),
                convert_to_posix_lines=convert_to_posix_lines,
                convert_spaces_to_tabs=convert_spaces_to_tabs,
            )
        except sniff.InappropriateDatasetContentError as exc:
            raise UploadProblemException(str(exc))
    elif requested_ext == 'auto':
        ext = sniff.guess_ext(path, registry.sniff_order, is_binary=is_binary)
    else:
        ext = requested_ext

    # The converted path will be the same as the input path if no conversion was done (or in-place conversion is used)
    converted_path = None if converted_path == path else converted_path

    # Validate datasets where the filetype was explicitly set using the filetype's sniffer (if any)
    if requested_ext != 'auto':
        datatype = registry.get_datatype_by_extension(requested_ext)
        # Enable sniffer "validate mode" (prevents certain sniffers from disabling themselves)
        if check_content and hasattr(datatype,
                                     'sniff') and not datatype.sniff(path):
            stdout = (
                "Warning: The file 'Type' was set to '{ext}' but the file does not appear to be of that"
                " type".format(ext=requested_ext))

    # Handle unsniffable binaries
    if is_binary and ext == 'binary':
        upload_ext = os.path.splitext(name)[1].lower().lstrip('.')
        if registry.is_extension_unsniffable_binary(upload_ext):
            stdout = (
                "Warning: The file's datatype cannot be determined from its contents and was guessed based on"
                " its extension, to avoid this warning, manually set the file 'Type' to '{ext}' when uploading"
                " this type of file".format(ext=upload_ext))
            ext = upload_ext
        else:
            stdout = (
                "The uploaded binary file format cannot be determined automatically, please set the file 'Type'"
                " manually")

    datatype = registry.get_datatype_by_extension(ext)

    return stdout, ext, datatype, is_binary, converted_path
Exemplo n.º 20
0
             # Replace the zipped file with the decompressed file if it's safe to do so
             if uncompressed is not None:
                 if dataset.type in ('server_dir',
                                     'path_paste') or not in_place:
                     dataset.path = uncompressed
                 else:
                     shutil.move(uncompressed, dataset.path)
                 os.chmod(dataset.path, 0644)
                 dataset.name = uncompressed_name
         data_type = 'zip'
 if not data_type:
     # TODO refactor this logic.  check_binary isn't guaranteed to be
     # correct since it only looks at whether the first 100 chars are
     # printable or not.  If someone specifies a known unsniffable
     # binary datatype and check_binary fails, the file gets mangled.
     if check_binary(dataset.path) or Binary.is_ext_unsniffable(
             dataset.file_type):
         # We have a binary dataset, but it is not Bam, Sff or Pdf
         data_type = 'binary'
         # binary_ok = False
         parts = dataset.name.split(".")
         if len(parts) > 1:
             ext = parts[-1].strip().lower()
             if not Binary.is_ext_unsniffable(ext):
                 file_err(
                     'The uploaded binary file contains inappropriate content',
                     dataset, json_file)
                 return
             elif Binary.is_ext_unsniffable(
                     ext) and dataset.file_type != ext:
                 err_msg = "You must manually set the 'File Format' to '%s' when uploading %s files." % (
Exemplo n.º 21
0
def handle_uploaded_dataset_file_internal(
    filename,
    datatypes_registry,
    ext='auto',
    tmp_prefix='sniff_upload_',
    tmp_dir=None,
    in_place=False,
    check_content=True,
    is_binary=None,
    auto_decompress=True,
    uploaded_file_ext=None,
    convert_to_posix_lines=None,
    convert_spaces_to_tabs=None,
):
    is_valid, ext, converted_path, compressed_type = handle_compressed_file(
        filename,
        datatypes_registry,
        ext=ext,
        tmp_prefix=tmp_prefix,
        tmp_dir=tmp_dir,
        in_place=in_place,
        check_content=check_content,
        auto_decompress=auto_decompress,
    )
    try:
        if not is_valid:
            if is_tar(converted_path):
                raise InappropriateDatasetContentError(
                    'TAR file uploads are not supported')
            raise InappropriateDatasetContentError(
                'The uploaded compressed file contains invalid content')

        # This needs to be checked again after decompression
        is_binary = check_binary(converted_path)
        guessed_ext = ext
        if ext in AUTO_DETECT_EXTENSIONS:
            guessed_ext = guess_ext(converted_path,
                                    sniff_order=datatypes_registry.sniff_order,
                                    is_binary=is_binary)
            guessed_datatype = datatypes_registry.get_datatype_by_extension(
                guessed_ext)
            if not is_binary and guessed_datatype.is_binary:
                # It's possible to have a datatype that is binary but not within the first 1024 bytes,
                # so check_binary might return a false negative. This is for instance true for PDF files
                is_binary = True

        if not is_binary and (convert_to_posix_lines
                              or convert_spaces_to_tabs):
            # Convert universal line endings to Posix line endings, spaces to tabs (if desired)
            if convert_spaces_to_tabs:
                convert_fxn = convert_newlines_sep2tabs
            else:
                convert_fxn = convert_newlines
            line_count, _converted_path = convert_fxn(converted_path,
                                                      in_place=in_place,
                                                      tmp_dir=tmp_dir,
                                                      tmp_prefix=tmp_prefix)
            if not in_place:
                if converted_path and filename != converted_path:
                    os.unlink(converted_path)
                converted_path = _converted_path
            if ext in AUTO_DETECT_EXTENSIONS:
                ext = guess_ext(converted_path,
                                sniff_order=datatypes_registry.sniff_order,
                                is_binary=is_binary)
        else:
            ext = guessed_ext

        # AMP Commenting this out.  Does not seem necessary.  check_html returns true for valid HTML, but the error
        # thrown is "invalid html".  Similarly, if we change it to not check_html, then it throws an error with JSON
        # or other non-binary data types
        #if not is_binary and check_content and check_html(converted_path):
        #raise InappropriateDatasetContentError('The uploaded file contains invalid HTML content')
    except Exception:
        if filename != converted_path:
            os.unlink(converted_path)
        raise
    return ext, converted_path, compressed_type
Exemplo n.º 22
0
def add_file(dataset, registry, output_path):
    ext = None
    compression_type = None
    line_count = None
    converted_path = None
    stdout = None
    link_data_only_str = dataset.get('link_data_only', 'copy_files')
    if link_data_only_str not in ['link_to_files', 'copy_files']:
        raise UploadProblemException(
            "Invalid setting '%s' for option link_data_only - upload request misconfigured"
            % link_data_only_str)
    link_data_only = link_data_only_str == 'link_to_files'

    # run_as_real_user is estimated from galaxy config (external chmod indicated of inputs executed)
    # If this is True we always purge supplied upload inputs so they are cleaned up and we reuse their
    # paths during data conversions since this user already owns that path.
    # Older in_place check for upload jobs created before 18.01, TODO remove in 19.XX. xref #5206
    run_as_real_user = dataset.get('run_as_real_user', False) or dataset.get(
        "in_place", False)

    # purge_source defaults to True unless this is an FTP import and
    # ftp_upload_purge has been overridden to False in Galaxy's config.
    # We set purge_source to False if:
    # - the job does not have write access to the file, e.g. when running as the
    #   real user
    # - the files are uploaded from external paths.
    purge_source = dataset.get(
        'purge_source',
        True) and not run_as_real_user and dataset.type not in ('server_dir',
                                                                'path_paste')

    # in_place is True unless we are running as a real user or importing external paths (i.e.
    # this is a real upload and not a path paste or ftp import).
    # in_place should always be False if running as real user because the uploaded file will
    # be owned by Galaxy and not the user and it should be False for external paths so Galaxy doesn't
    # modify files not controlled by Galaxy.
    in_place = not run_as_real_user and dataset.type not in ('server_dir',
                                                             'path_paste',
                                                             'ftp_import')

    # Base on the check_upload_content Galaxy config option and on by default, this enables some
    # security related checks on the uploaded content, but can prevent uploads from working in some cases.
    check_content = dataset.get('check_content', True)

    # auto_decompress is a request flag that can be swapped off to prevent Galaxy from automatically
    # decompressing archive files before sniffing.
    auto_decompress = dataset.get('auto_decompress', True)
    try:
        dataset.file_type
    except AttributeError:
        raise UploadProblemException(
            'Unable to process uploaded file, missing file_type parameter.')

    if dataset.type == 'url':
        try:
            dataset.path = sniff.stream_url_to_file(dataset.path)
        except Exception as e:
            raise UploadProblemException('Unable to fetch %s\n%s' %
                                         (dataset.path, str(e)))

    # See if we have an empty file
    if not os.path.exists(dataset.path):
        raise UploadProblemException(
            'Uploaded temporary file (%s) does not exist.' % dataset.path)

    if not os.path.getsize(dataset.path) > 0:
        raise UploadProblemException('The uploaded file is empty')

    # Does the first 1K contain a null?
    is_binary = check_binary(dataset.path)

    # Decompress if needed/desired and determine/validate filetype. If a keep-compressed datatype is explicitly selected
    # or if autodetection is selected and the file sniffs as a keep-compressed datatype, it will not be decompressed.
    if not link_data_only:
        if is_zip(dataset.path) and not is_single_file_zip(dataset.path):
            stdout = 'ZIP file contained more than one file, only the first file was added to Galaxy.'
        try:
            ext, converted_path, compression_type = sniff.handle_uploaded_dataset_file(
                dataset.path,
                registry,
                ext=dataset.file_type,
                tmp_prefix='data_id_%s_upload_' % dataset.dataset_id,
                tmp_dir=output_adjacent_tmpdir(output_path),
                in_place=in_place,
                check_content=check_content,
                is_binary=is_binary,
                auto_decompress=auto_decompress,
                uploaded_file_ext=os.path.splitext(
                    dataset.name)[1].lower().lstrip('.'),
                convert_to_posix_lines=dataset.to_posix_lines,
                convert_spaces_to_tabs=dataset.space_to_tab,
            )
        except sniff.InappropriateDatasetContentError as exc:
            raise UploadProblemException(str(exc))
    elif dataset.file_type == 'auto':
        # Link mode can't decompress anyway, so enable sniffing for keep-compressed datatypes even when auto_decompress
        # is enabled
        os.environ['GALAXY_SNIFFER_VALIDATE_MODE'] = '1'
        ext = sniff.guess_ext(dataset.path,
                              registry.sniff_order,
                              is_binary=is_binary)
        os.environ.pop('GALAXY_SNIFFER_VALIDATE_MODE')

    # The converted path will be the same as the input path if no conversion was done (or in-place conversion is used)
    converted_path = None if converted_path == dataset.path else converted_path

    # Validate datasets where the filetype was explicitly set using the filetype's sniffer (if any)
    if dataset.file_type != 'auto':
        datatype = registry.get_datatype_by_extension(dataset.file_type)
        # Enable sniffer "validate mode" (prevents certain sniffers from disabling themselves)
        os.environ['GALAXY_SNIFFER_VALIDATE_MODE'] = '1'
        if hasattr(datatype, 'sniff') and not datatype.sniff(dataset.path):
            stdout = (
                "Warning: The file 'Type' was set to '{ext}' but the file does not appear to be of that"
                " type".format(ext=dataset.file_type))
        os.environ.pop('GALAXY_SNIFFER_VALIDATE_MODE')

    # Handle unsniffable binaries
    if is_binary and ext == 'binary':
        upload_ext = os.path.splitext(dataset.name)[1].lower().lstrip('.')
        if registry.is_extension_unsniffable_binary(upload_ext):
            stdout = (
                "Warning: The file's datatype cannot be determined from its contents and was guessed based on"
                " its extension, to avoid this warning, manually set the file 'Type' to '{ext}' when uploading"
                " this type of file".format(ext=upload_ext))
            ext = upload_ext
        else:
            stdout = (
                "The uploaded binary file format cannot be determined automatically, please set the file 'Type'"
                " manually")

    datatype = registry.get_datatype_by_extension(ext)

    # Strip compression extension from name
    if compression_type and not getattr(
            datatype, 'compressed',
            False) and dataset.name.endswith('.' + compression_type):
        dataset.name = dataset.name[:-len('.' + compression_type)]

    # Move dataset
    if link_data_only:
        # Never alter a file that will not be copied to Galaxy's local file store.
        if datatype.dataset_content_needs_grooming(dataset.path):
            err_msg = 'The uploaded files need grooming, so change your <b>Copy data into Galaxy?</b> selection to be ' + \
                '<b>Copy files into Galaxy</b> instead of <b>Link to files without copying into Galaxy</b> so grooming can be performed.'
            raise UploadProblemException(err_msg)
    if not link_data_only:
        # Move the dataset to its "real" path. converted_path is a tempfile so we move it even if purge_source is False.
        if purge_source or converted_path:
            try:
                shutil.move(converted_path or dataset.path, output_path)
            except OSError as e:
                # We may not have permission to remove the input
                if e.errno != errno.EACCES:
                    raise
        else:
            shutil.copy(dataset.path, output_path)

    # Write the job info
    stdout = stdout or 'uploaded %s file' % ext
    info = dict(type='dataset',
                dataset_id=dataset.dataset_id,
                ext=ext,
                stdout=stdout,
                name=dataset.name,
                line_count=line_count)
    if dataset.get('uuid', None) is not None:
        info['uuid'] = dataset.get('uuid')
    # FIXME: does this belong here? also not output-adjacent-tmpdir aware =/
    if not link_data_only and datatype and datatype.dataset_content_needs_grooming(
            output_path):
        # Groom the dataset content if necessary
        datatype.groom_dataset_content(output_path)
    return info
Exemplo n.º 23
0
def add_file(dataset, registry, json_file, output_path):
    data_type = None
    line_count = None
    converted_path = None
    stdout = None
    link_data_only = dataset.get('link_data_only', 'copy_files')
    in_place = dataset.get('in_place', True)
    purge_source = dataset.get('purge_source', True)
    try:
        ext = dataset.file_type
    except AttributeError:
        file_err(
            'Unable to process uploaded file, missing file_type parameter.',
            dataset, json_file)
        return

    if dataset.type == 'url':
        try:
            page = urlopen(
                dataset.path)  # page will be .close()ed by sniff methods
            temp_name, dataset.is_multi_byte = sniff.stream_to_file(
                page,
                prefix='url_paste',
                source_encoding=util.get_charset_from_http_headers(
                    page.headers))
        except Exception as e:
            file_err('Unable to fetch %s\n%s' % (dataset.path, str(e)),
                     dataset, json_file)
            return
        dataset.path = temp_name
    # See if we have an empty file
    if not os.path.exists(dataset.path):
        file_err('Uploaded temporary file (%s) does not exist.' % dataset.path,
                 dataset, json_file)
        return
    if not os.path.getsize(dataset.path) > 0:
        file_err('The uploaded file is empty', dataset, json_file)
        return
    if not dataset.type == 'url':
        # Already set is_multi_byte above if type == 'url'
        try:
            dataset.is_multi_byte = multi_byte.is_multi_byte(
                codecs.open(dataset.path, 'r', 'utf-8').read(100))
        except UnicodeDecodeError as e:
            dataset.is_multi_byte = False
    # Is dataset an image?
    i_ext = get_image_ext(dataset.path)
    if i_ext:
        ext = i_ext
        data_type = ext
    # Is dataset content multi-byte?
    elif dataset.is_multi_byte:
        data_type = 'multi-byte char'
        ext = sniff.guess_ext(dataset.path,
                              registry.sniff_order,
                              is_multi_byte=True)
    # Is dataset content supported sniffable binary?
    else:
        # FIXME: This ignores the declared sniff order in datatype_conf.xml
        # resulting in improper behavior
        type_info = Binary.is_sniffable_binary(dataset.path)
        if type_info:
            data_type = type_info[0]
            ext = type_info[1]
    if not data_type:
        root_datatype = registry.get_datatype_by_extension(dataset.file_type)
        if getattr(root_datatype, 'compressed', False):
            data_type = 'compressed archive'
            ext = dataset.file_type
        else:
            # See if we have a gzipped file, which, if it passes our restrictions, we'll uncompress
            is_gzipped, is_valid = check_gzip(dataset.path)
            if is_gzipped and not is_valid:
                file_err(
                    'The gzipped uploaded file contains inappropriate content',
                    dataset, json_file)
                return
            elif is_gzipped and is_valid:
                if link_data_only == 'copy_files':
                    # We need to uncompress the temp_name file, but BAM files must remain compressed in the BGZF format
                    CHUNK_SIZE = 2**20  # 1Mb
                    fd, uncompressed = tempfile.mkstemp(
                        prefix='data_id_%s_upload_gunzip_' %
                        dataset.dataset_id,
                        dir=os.path.dirname(output_path),
                        text=False)
                    gzipped_file = gzip.GzipFile(dataset.path, 'rb')
                    while 1:
                        try:
                            chunk = gzipped_file.read(CHUNK_SIZE)
                        except IOError:
                            os.close(fd)
                            os.remove(uncompressed)
                            file_err('Problem decompressing gzipped data',
                                     dataset, json_file)
                            return
                        if not chunk:
                            break
                        os.write(fd, chunk)
                    os.close(fd)
                    gzipped_file.close()
                    # Replace the gzipped file with the decompressed file if it's safe to do so
                    if dataset.type in ('server_dir',
                                        'path_paste') or not in_place:
                        dataset.path = uncompressed
                    else:
                        shutil.move(uncompressed, dataset.path)
                    os.chmod(dataset.path, 0o644)
                dataset.name = dataset.name.rstrip('.gz')
                data_type = 'gzip'
            if not data_type and bz2 is not None:
                # See if we have a bz2 file, much like gzip
                is_bzipped, is_valid = check_bz2(dataset.path)
                if is_bzipped and not is_valid:
                    file_err(
                        'The gzipped uploaded file contains inappropriate content',
                        dataset, json_file)
                    return
                elif is_bzipped and is_valid:
                    if link_data_only == 'copy_files':
                        # We need to uncompress the temp_name file
                        CHUNK_SIZE = 2**20  # 1Mb
                        fd, uncompressed = tempfile.mkstemp(
                            prefix='data_id_%s_upload_bunzip2_' %
                            dataset.dataset_id,
                            dir=os.path.dirname(output_path),
                            text=False)
                        bzipped_file = bz2.BZ2File(dataset.path, 'rb')
                        while 1:
                            try:
                                chunk = bzipped_file.read(CHUNK_SIZE)
                            except IOError:
                                os.close(fd)
                                os.remove(uncompressed)
                                file_err(
                                    'Problem decompressing bz2 compressed data',
                                    dataset, json_file)
                                return
                            if not chunk:
                                break
                            os.write(fd, chunk)
                        os.close(fd)
                        bzipped_file.close()
                        # Replace the bzipped file with the decompressed file if it's safe to do so
                        if dataset.type in ('server_dir',
                                            'path_paste') or not in_place:
                            dataset.path = uncompressed
                        else:
                            shutil.move(uncompressed, dataset.path)
                        os.chmod(dataset.path, 0o644)
                    dataset.name = dataset.name.rstrip('.bz2')
                    data_type = 'bz2'
            if not data_type:
                # See if we have a zip archive
                is_zipped = check_zip(dataset.path)
                if is_zipped:
                    if link_data_only == 'copy_files':
                        CHUNK_SIZE = 2**20  # 1Mb
                        uncompressed = None
                        uncompressed_name = None
                        unzipped = False
                        z = zipfile.ZipFile(dataset.path)
                        for name in z.namelist():
                            if name.endswith('/'):
                                continue
                            if unzipped:
                                stdout = 'ZIP file contained more than one file, only the first file was added to Galaxy.'
                                break
                            fd, uncompressed = tempfile.mkstemp(
                                prefix='data_id_%s_upload_zip_' %
                                dataset.dataset_id,
                                dir=os.path.dirname(output_path),
                                text=False)
                            if sys.version_info[:2] >= (2, 6):
                                zipped_file = z.open(name)
                                while 1:
                                    try:
                                        chunk = zipped_file.read(CHUNK_SIZE)
                                    except IOError:
                                        os.close(fd)
                                        os.remove(uncompressed)
                                        file_err(
                                            'Problem decompressing zipped data',
                                            dataset, json_file)
                                        return
                                    if not chunk:
                                        break
                                    os.write(fd, chunk)
                                os.close(fd)
                                zipped_file.close()
                                uncompressed_name = name
                                unzipped = True
                            else:
                                # python < 2.5 doesn't have a way to read members in chunks(!)
                                try:
                                    outfile = open(uncompressed, 'wb')
                                    outfile.write(z.read(name))
                                    outfile.close()
                                    uncompressed_name = name
                                    unzipped = True
                                except IOError:
                                    os.close(fd)
                                    os.remove(uncompressed)
                                    file_err(
                                        'Problem decompressing zipped data',
                                        dataset, json_file)
                                    return
                        z.close()
                        # Replace the zipped file with the decompressed file if it's safe to do so
                        if uncompressed is not None:
                            if dataset.type in ('server_dir',
                                                'path_paste') or not in_place:
                                dataset.path = uncompressed
                            else:
                                shutil.move(uncompressed, dataset.path)
                            os.chmod(dataset.path, 0o644)
                            dataset.name = uncompressed_name
                    data_type = 'zip'
            if not data_type:
                # TODO refactor this logic.  check_binary isn't guaranteed to be
                # correct since it only looks at whether the first 100 chars are
                # printable or not.  If someone specifies a known unsniffable
                # binary datatype and check_binary fails, the file gets mangled.
                if check_binary(dataset.path) or Binary.is_ext_unsniffable(
                        dataset.file_type):
                    # We have a binary dataset, but it is not Bam, Sff or Pdf
                    data_type = 'binary'
                    # binary_ok = False
                    parts = dataset.name.split(".")
                    if len(parts) > 1:
                        ext = parts[-1].strip().lower()
                        if not Binary.is_ext_unsniffable(ext):
                            file_err(
                                'The uploaded binary file contains inappropriate content',
                                dataset, json_file)
                            return
                        elif Binary.is_ext_unsniffable(
                                ext) and dataset.file_type != ext:
                            err_msg = "You must manually set the 'File Format' to '%s' when uploading %s files." % (
                                ext.capitalize(), ext)
                            file_err(err_msg, dataset, json_file)
                            return
            if not data_type:
                # We must have a text file
                if check_html(dataset.path):
                    file_err(
                        'The uploaded file contains inappropriate HTML content',
                        dataset, json_file)
                    return
            if data_type != 'binary':
                if link_data_only == 'copy_files':
                    if dataset.type in ('server_dir',
                                        'path_paste') and data_type not in [
                                            'gzip', 'bz2', 'zip'
                                        ]:
                        in_place = False
                    # Convert universal line endings to Posix line endings, but allow the user to turn it off,
                    # so that is becomes possible to upload gzip, bz2 or zip files with binary data without
                    # corrupting the content of those files.
                    if dataset.to_posix_lines:
                        tmpdir = output_adjacent_tmpdir(output_path)
                        tmp_prefix = 'data_id_%s_convert_' % dataset.dataset_id
                        if dataset.space_to_tab:
                            line_count, converted_path = sniff.convert_newlines_sep2tabs(
                                dataset.path,
                                in_place=in_place,
                                tmp_dir=tmpdir,
                                tmp_prefix=tmp_prefix)
                        else:
                            line_count, converted_path = sniff.convert_newlines(
                                dataset.path,
                                in_place=in_place,
                                tmp_dir=tmpdir,
                                tmp_prefix=tmp_prefix)
                if dataset.file_type == 'auto':
                    ext = sniff.guess_ext(dataset.path, registry.sniff_order)
                else:
                    ext = dataset.file_type
                data_type = ext
    # Save job info for the framework
    if ext == 'auto' and dataset.ext:
        ext = dataset.ext
    if ext == 'auto':
        ext = 'data'
    datatype = registry.get_datatype_by_extension(ext)
    if dataset.type in ('server_dir',
                        'path_paste') and link_data_only == 'link_to_files':
        # Never alter a file that will not be copied to Galaxy's local file store.
        if datatype.dataset_content_needs_grooming(dataset.path):
            err_msg = 'The uploaded files need grooming, so change your <b>Copy data into Galaxy?</b> selection to be ' + \
                '<b>Copy files into Galaxy</b> instead of <b>Link to files without copying into Galaxy</b> so grooming can be performed.'
            file_err(err_msg, dataset, json_file)
            return
    if link_data_only == 'copy_files' and dataset.type in (
            'server_dir',
            'path_paste') and data_type not in ['gzip', 'bz2', 'zip']:
        # Move the dataset to its "real" path
        if converted_path is not None:
            shutil.copy(converted_path, output_path)
            try:
                os.remove(converted_path)
            except:
                pass
        else:
            # This should not happen, but it's here just in case
            shutil.copy(dataset.path, output_path)
    elif link_data_only == 'copy_files':
        if purge_source:
            shutil.move(dataset.path, output_path)
        else:
            shutil.copy(dataset.path, output_path)
    # Write the job info
    stdout = stdout or 'uploaded %s file' % data_type
    info = dict(type='dataset',
                dataset_id=dataset.dataset_id,
                ext=ext,
                stdout=stdout,
                name=dataset.name,
                line_count=line_count)
    if dataset.get('uuid', None) is not None:
        info['uuid'] = dataset.get('uuid')
    json_file.write(dumps(info) + "\n")

    if link_data_only == 'copy_files' and datatype.dataset_content_needs_grooming(
            output_path):
        # Groom the dataset content if necessary
        datatype.groom_dataset_content(output_path)
Exemplo n.º 24
0
def handle_uploaded_dataset_file_internal(
    filename: str,
    datatypes_registry,
    ext: str = 'auto',
    tmp_prefix: Optional[str] = 'sniff_upload_',
    tmp_dir: Optional[str] = None,
    in_place: bool = False,
    check_content: bool = True,
    is_binary: Optional[bool] = None,
    auto_decompress: bool = True,
    uploaded_file_ext: Optional[str] = None,
    convert_to_posix_lines: Optional[bool] = None,
    convert_spaces_to_tabs: Optional[bool] = None,
) -> HandleUploadedDatasetFileInternalResponse:
    is_valid, ext, converted_path, compressed_type = handle_compressed_file(
        filename,
        datatypes_registry,
        ext=ext,
        tmp_prefix=tmp_prefix,
        tmp_dir=tmp_dir,
        in_place=in_place,
        check_content=check_content,
        auto_decompress=auto_decompress,
    )
    converted_newlines = False
    converted_spaces = False
    try:
        if not is_valid:
            if is_tar(converted_path):
                raise InappropriateDatasetContentError(
                    'TAR file uploads are not supported')
            raise InappropriateDatasetContentError(
                'The uploaded compressed file contains invalid content')

        # This needs to be checked again after decompression
        is_binary = check_binary(converted_path)
        guessed_ext = ext
        if ext in AUTO_DETECT_EXTENSIONS:
            guessed_ext = guess_ext(converted_path,
                                    sniff_order=datatypes_registry.sniff_order,
                                    is_binary=is_binary)
            guessed_datatype = datatypes_registry.get_datatype_by_extension(
                guessed_ext)
            if not is_binary and guessed_datatype.is_binary:
                # It's possible to have a datatype that is binary but not within the first 1024 bytes,
                # so check_binary might return a false negative. This is for instance true for PDF files
                is_binary = True

        if not is_binary and (convert_to_posix_lines
                              or convert_spaces_to_tabs):
            # Convert universal line endings to Posix line endings, spaces to tabs (if desired)
            convert_fxn = convert_function(convert_to_posix_lines,
                                           convert_spaces_to_tabs)
            line_count, _converted_path, converted_newlines, converted_spaces = convert_fxn(
                converted_path,
                in_place=in_place,
                tmp_dir=tmp_dir,
                tmp_prefix=tmp_prefix)
            if not in_place:
                if converted_path and filename != converted_path:
                    os.unlink(converted_path)
                assert _converted_path
                converted_path = _converted_path
            if ext in AUTO_DETECT_EXTENSIONS:
                ext = guess_ext(converted_path,
                                sniff_order=datatypes_registry.sniff_order,
                                is_binary=is_binary)
        else:
            ext = guessed_ext

        if not is_binary and check_content and check_html(converted_path):
            raise InappropriateDatasetContentError(
                'The uploaded file contains invalid HTML content')
    except Exception:
        if filename != converted_path:
            os.unlink(converted_path)
        raise
    return HandleUploadedDatasetFileInternalResponse(ext, converted_path,
                                                     compressed_type,
                                                     converted_newlines,
                                                     converted_spaces)
Exemplo n.º 25
0
def add_file(dataset, registry, json_file, output_path):
    data_type = None
    line_count = None
    converted_path = None
    stdout = None
    link_data_only = dataset.get('link_data_only', 'copy_files') != 'copy_files'

    # run_as_real_user is estimated from galaxy config (external chmod indicated of inputs executed)
    # If this is True we always purge supplied upload inputs so they are cleaned up and we reuse their
    # paths during data conversions since this user already owns that path.
    # Older in_place check for upload jobs created before 18.01, TODO remove in 19.XX. xref #5206
    run_as_real_user = dataset.get('run_as_real_user', False) or dataset.get("in_place", False)

    # purge_source defaults to True unless this is an FTP import and
    # ftp_upload_purge has been overridden to False in Galaxy's config.
    # We set purge_source to False if:
    # - the job does not have write access to the file, e.g. when running as the
    #   real user
    # - the files are uploaded from external paths.
    purge_source = dataset.get('purge_source', True) and not run_as_real_user and dataset.type not in ('server_dir', 'path_paste')

    # in_place is True unless we are running as a real user or importing external paths (i.e.
    # this is a real upload and not a path paste or ftp import).
    # in_place should always be False if running as real user because the uploaded file will
    # be owned by Galaxy and not the user and it should be False for external paths so Galaxy doesn't
    # modify files not controlled by Galaxy.
    in_place = not run_as_real_user and dataset.type not in ('server_dir', 'path_paste', 'ftp_import')

    # Base on the check_upload_content Galaxy config option and on by default, this enables some
    # security related checks on the uploaded content, but can prevent uploads from working in some cases.
    check_content = dataset.get('check_content' , True)

    # auto_decompress is a request flag that can be swapped off to prevent Galaxy from automatically
    # decompressing archive files before sniffing.
    auto_decompress = dataset.get('auto_decompress', True)
    try:
        ext = dataset.file_type
    except AttributeError:
        raise UploadProblemException('Unable to process uploaded file, missing file_type parameter.')

    if dataset.type == 'url':
        try:
            page = urlopen(dataset.path)  # page will be .close()ed by sniff methods
            temp_name = sniff.stream_to_file(page, prefix='url_paste', source_encoding=util.get_charset_from_http_headers(page.headers))
        except Exception as e:
            raise UploadProblemException('Unable to fetch %s\n%s' % (dataset.path, str(e)))
        dataset.path = temp_name
    # See if we have an empty file
    if not os.path.exists(dataset.path):
        raise UploadProblemException('Uploaded temporary file (%s) does not exist.' % dataset.path)
    if not os.path.getsize(dataset.path) > 0:
        raise UploadProblemException('The uploaded file is empty')
    # Is dataset content supported sniffable binary?
    is_binary = check_binary(dataset.path)
    if is_binary:
        # Sniff the data type
        guessed_ext = sniff.guess_ext(dataset.path, registry.sniff_order)
        # Set data_type only if guessed_ext is a binary datatype
        datatype = registry.get_datatype_by_extension(guessed_ext)
        if isinstance(datatype, Binary):
            data_type = guessed_ext
            ext = guessed_ext
    if not data_type:
        root_datatype = registry.get_datatype_by_extension(dataset.file_type)
        if getattr(root_datatype, 'compressed', False):
            data_type = 'compressed archive'
            ext = dataset.file_type
        else:
            # See if we have a gzipped file, which, if it passes our restrictions, we'll uncompress
            is_gzipped, is_valid = check_gzip(dataset.path, check_content=check_content)
            if is_gzipped and not is_valid:
                raise UploadProblemException('The gzipped uploaded file contains inappropriate content')
            elif is_gzipped and is_valid and auto_decompress:
                if not link_data_only:
                    # We need to uncompress the temp_name file, but BAM files must remain compressed in the BGZF format
                    CHUNK_SIZE = 2 ** 20  # 1Mb
                    fd, uncompressed = tempfile.mkstemp(prefix='data_id_%s_upload_gunzip_' % dataset.dataset_id, dir=os.path.dirname(output_path), text=False)
                    gzipped_file = gzip.GzipFile(dataset.path, 'rb')
                    while 1:
                        try:
                            chunk = gzipped_file.read(CHUNK_SIZE)
                        except IOError:
                            os.close(fd)
                            os.remove(uncompressed)
                            raise UploadProblemException('Problem decompressing gzipped data')
                        if not chunk:
                            break
                        os.write(fd, chunk)
                    os.close(fd)
                    gzipped_file.close()
                    # Replace the gzipped file with the decompressed file if it's safe to do so
                    if not in_place:
                        dataset.path = uncompressed
                    else:
                        shutil.move(uncompressed, dataset.path)
                    os.chmod(dataset.path, 0o644)
                dataset.name = dataset.name.rstrip('.gz')
                data_type = 'gzip'
            if not data_type:
                # See if we have a bz2 file, much like gzip
                is_bzipped, is_valid = check_bz2(dataset.path, check_content)
                if is_bzipped and not is_valid:
                    raise UploadProblemException('The gzipped uploaded file contains inappropriate content')
                elif is_bzipped and is_valid and auto_decompress:
                    if not link_data_only:
                        # We need to uncompress the temp_name file
                        CHUNK_SIZE = 2 ** 20  # 1Mb
                        fd, uncompressed = tempfile.mkstemp(prefix='data_id_%s_upload_bunzip2_' % dataset.dataset_id, dir=os.path.dirname(output_path), text=False)
                        bzipped_file = bz2.BZ2File(dataset.path, 'rb')
                        while 1:
                            try:
                                chunk = bzipped_file.read(CHUNK_SIZE)
                            except IOError:
                                os.close(fd)
                                os.remove(uncompressed)
                                raise UploadProblemException('Problem decompressing bz2 compressed data')
                            if not chunk:
                                break
                            os.write(fd, chunk)
                        os.close(fd)
                        bzipped_file.close()
                        # Replace the bzipped file with the decompressed file if it's safe to do so
                        if not in_place:
                            dataset.path = uncompressed
                        else:
                            shutil.move(uncompressed, dataset.path)
                        os.chmod(dataset.path, 0o644)
                    dataset.name = dataset.name.rstrip('.bz2')
                    data_type = 'bz2'
            if not data_type:
                # See if we have a zip archive
                is_zipped = check_zip(dataset.path)
                if is_zipped and auto_decompress:
                    if not link_data_only:
                        CHUNK_SIZE = 2 ** 20  # 1Mb
                        uncompressed = None
                        uncompressed_name = None
                        unzipped = False
                        z = zipfile.ZipFile(dataset.path)
                        for name in z.namelist():
                            if name.endswith('/'):
                                continue
                            if unzipped:
                                stdout = 'ZIP file contained more than one file, only the first file was added to Galaxy.'
                                break
                            fd, uncompressed = tempfile.mkstemp(prefix='data_id_%s_upload_zip_' % dataset.dataset_id, dir=os.path.dirname(output_path), text=False)
                            if sys.version_info[:2] >= (2, 6):
                                zipped_file = z.open(name)
                                while 1:
                                    try:
                                        chunk = zipped_file.read(CHUNK_SIZE)
                                    except IOError:
                                        os.close(fd)
                                        os.remove(uncompressed)
                                        raise UploadProblemException('Problem decompressing zipped data')
                                    if not chunk:
                                        break
                                    os.write(fd, chunk)
                                os.close(fd)
                                zipped_file.close()
                                uncompressed_name = name
                                unzipped = True
                            else:
                                # python < 2.5 doesn't have a way to read members in chunks(!)
                                try:
                                    with open(uncompressed, 'wb') as outfile:
                                        outfile.write(z.read(name))
                                    uncompressed_name = name
                                    unzipped = True
                                except IOError:
                                    os.close(fd)
                                    os.remove(uncompressed)
                                    raise UploadProblemException('Problem decompressing zipped data')
                        z.close()
                        # Replace the zipped file with the decompressed file if it's safe to do so
                        if uncompressed is not None:
                            if not in_place:
                                dataset.path = uncompressed
                            else:
                                shutil.move(uncompressed, dataset.path)
                            os.chmod(dataset.path, 0o644)
                            dataset.name = uncompressed_name
                    data_type = 'zip'
            if not data_type:
                if is_binary or registry.is_extension_unsniffable_binary(dataset.file_type):
                    # We have a binary dataset, but it is not Bam, Sff or Pdf
                    data_type = 'binary'
                    parts = dataset.name.split(".")
                    if len(parts) > 1:
                        ext = parts[-1].strip().lower()
                        is_ext_unsniffable_binary = registry.is_extension_unsniffable_binary(ext)
                        if check_content and not is_ext_unsniffable_binary:
                            raise UploadProblemException('The uploaded binary file contains inappropriate content')
                        elif is_ext_unsniffable_binary and dataset.file_type != ext:
                            err_msg = "You must manually set the 'File Format' to '%s' when uploading %s files." % (ext, ext)
                            raise UploadProblemException(err_msg)
            if not data_type:
                # We must have a text file
                if check_content and check_html(dataset.path):
                    raise UploadProblemException('The uploaded file contains inappropriate HTML content')
            if data_type != 'binary':
                if not link_data_only and data_type not in ('gzip', 'bz2', 'zip'):
                    # Convert universal line endings to Posix line endings if to_posix_lines is True
                    # and the data is not binary or gzip-, bz2- or zip-compressed.
                    if dataset.to_posix_lines:
                        tmpdir = output_adjacent_tmpdir(output_path)
                        tmp_prefix = 'data_id_%s_convert_' % dataset.dataset_id
                        if dataset.space_to_tab:
                            line_count, converted_path = sniff.convert_newlines_sep2tabs(dataset.path, in_place=in_place, tmp_dir=tmpdir, tmp_prefix=tmp_prefix)
                        else:
                            line_count, converted_path = sniff.convert_newlines(dataset.path, in_place=in_place, tmp_dir=tmpdir, tmp_prefix=tmp_prefix)
                if dataset.file_type == 'auto':
                    ext = sniff.guess_ext(converted_path or dataset.path, registry.sniff_order)
                else:
                    ext = dataset.file_type
                data_type = ext
    # Save job info for the framework
    if ext == 'auto' and data_type == 'binary':
        ext = 'data'
    if ext == 'auto' and dataset.ext:
        ext = dataset.ext
    if ext == 'auto':
        ext = 'data'
    datatype = registry.get_datatype_by_extension(ext)
    if dataset.type in ('server_dir', 'path_paste') and link_data_only:
        # Never alter a file that will not be copied to Galaxy's local file store.
        if datatype.dataset_content_needs_grooming(dataset.path):
            err_msg = 'The uploaded files need grooming, so change your <b>Copy data into Galaxy?</b> selection to be ' + \
                '<b>Copy files into Galaxy</b> instead of <b>Link to files without copying into Galaxy</b> so grooming can be performed.'
            raise UploadProblemException(err_msg)
    if not link_data_only and converted_path:
        # Move the dataset to its "real" path
        try:
            shutil.move(converted_path, output_path)
        except OSError as e:
            # We may not have permission to remove converted_path
            if e.errno != errno.EACCES:
                raise
    elif not link_data_only:
        if purge_source:
            shutil.move(dataset.path, output_path)
        else:
            shutil.copy(dataset.path, output_path)
    # Write the job info
    stdout = stdout or 'uploaded %s file' % data_type
    info = dict(type='dataset',
                dataset_id=dataset.dataset_id,
                ext=ext,
                stdout=stdout,
                name=dataset.name,
                line_count=line_count)
    if dataset.get('uuid', None) is not None:
        info['uuid'] = dataset.get('uuid')
    json_file.write(dumps(info) + "\n")
    if not link_data_only and datatype and datatype.dataset_content_needs_grooming(output_path):
        # Groom the dataset content if necessary
        datatype.groom_dataset_content(output_path)
Exemplo n.º 26
0
    def _resolve_src(item):
        converted_path = None

        name, path = _has_src_to_path(item)
        dbkey = item.get("dbkey", "?")
        requested_ext = item.get("ext", "auto")
        info = item.get("info", None)
        object_id = item.get("object_id", None)
        link_data_only = upload_config.link_data_only
        if "link_data_only" in item:
            # Allow overriding this on a per file basis.
            link_data_only = _link_data_only(item)
        to_posix_lines = upload_config.get_option(item, "to_posix_lines")
        space_to_tab = upload_config.get_option(item, "space_to_tab")
        in_place = item.get("in_place", False)
        purge_source = item.get("purge_source", True)

        # Follow upload.py logic but without the auto-decompress logic.
        registry = upload_config.registry
        check_content = upload_config.check_content
        data_type, ext = None, requested_ext

        is_binary = check_binary(path)
        if is_binary:
            data_type, ext = handle_sniffable_binary_check(
                data_type, ext, path, registry)
        if data_type is None:
            root_datatype = registry.get_datatype_by_extension(ext)
            if getattr(root_datatype, 'compressed', False):
                data_type = 'compressed archive'
                ext = ext
            elif is_binary:
                data_type, ext = handle_unsniffable_binary_check(
                    data_type, ext, path, name, is_binary, requested_ext,
                    check_content, registry)
        if not data_type and check_content and check_html(path):
            raise UploadProblemException(
                'The uploaded file contains inappropriate HTML content')

        if data_type != 'binary':
            if not link_data_only:
                if to_posix_lines:
                    if space_to_tab:
                        line_count, converted_path = sniff.convert_newlines_sep2tabs(
                            path, in_place=in_place, tmp_dir=".")
                    else:
                        line_count, converted_path = sniff.convert_newlines(
                            path, in_place=in_place, tmp_dir=".")
                else:
                    if space_to_tab:
                        line_count, converted_path = sniff.sep2tabs(
                            path, in_place=in_place, tmp_dir=".")

            if requested_ext == 'auto':
                ext = sniff.guess_ext(converted_path or path,
                                      registry.sniff_order)
            else:
                ext = requested_ext

            data_type = ext

        if ext == 'auto' and data_type == 'binary':
            ext = 'data'
        if ext == 'auto' and requested_ext:
            ext = requested_ext
        if ext == 'auto':
            ext = 'data'

        datatype = registry.get_datatype_by_extension(ext)
        if link_data_only:
            # Never alter a file that will not be copied to Galaxy's local file store.
            if datatype.dataset_content_needs_grooming(path):
                err_msg = 'The uploaded files need grooming, so change your <b>Copy data into Galaxy?</b> selection to be ' + \
                    '<b>Copy files into Galaxy</b> instead of <b>Link to files without copying into Galaxy</b> so grooming can be performed.'
                raise UploadProblemException(err_msg)

        # If this file is not in the workdir make sure it gets there.
        if not link_data_only and converted_path:
            path = upload_config.ensure_in_working_directory(
                converted_path, purge_source, in_place)
        elif not link_data_only:
            path = upload_config.ensure_in_working_directory(
                path, purge_source, in_place)

        if not link_data_only and datatype and datatype.dataset_content_needs_grooming(
                path):
            # Groom the dataset content if necessary
            datatype.groom_dataset_content(path)

        rval = {
            "name": name,
            "filename": path,
            "dbkey": dbkey,
            "ext": ext,
            "link_data_only": link_data_only
        }
        if info is not None:
            rval["info"] = info
        if object_id is not None:
            rval["object_id"] = object_id
        return rval