Exemplo n.º 1
0
def handle_uploaded_dataset_file_internal(
        filename,
        datatypes_registry,
        ext='auto',
        tmp_prefix='sniff_upload_',
        tmp_dir=None,
        in_place=False,
        check_content=True,
        is_binary=None,
        auto_decompress=True,
        uploaded_file_ext=None,
        convert_to_posix_lines=None,
        convert_spaces_to_tabs=None,
):
    """Decompress, sniff and optionally normalize an uploaded dataset file.

    The file is first handed to ``handle_compressed_file``.  If the result is
    not binary and a conversion was requested, line endings (and optionally
    spaces) are converted.  When ``ext`` is one of ``AUTO_DETECT_EXTENSIONS``
    the extension is sniffed with ``guess_ext``.  Finally, non-binary content
    is rejected when ``check_content`` is set and ``check_html`` flags it.

    :param filename: path of the uploaded file.
    :param datatypes_registry: registry object; its ``sniff_order`` and
        ``get_datatype_by_extension`` are used here.
    :param ext: requested extension; sniffed when it is in
        ``AUTO_DETECT_EXTENSIONS``.
    :param tmp_prefix: forwarded to ``handle_compressed_file`` and to the
        conversion function.
    :param tmp_dir: forwarded to ``handle_compressed_file`` and to the
        conversion function.
    :param in_place: forwarded to the helpers; when false, the conversion
        result replaces ``converted_path`` and the previous intermediate file
        is removed if it is not ``filename`` itself.
    :param check_content: forwarded to ``handle_compressed_file``; also gates
        the HTML check.
    :param is_binary: ignored; the value is recomputed with ``check_binary``
        after decompression.
    :param auto_decompress: forwarded to ``handle_compressed_file``.
    :param uploaded_file_ext: not used by this function.
    :param convert_to_posix_lines: request line ending conversion.
    :param convert_spaces_to_tabs: request line ending conversion plus
        spaces-to-tabs conversion.
    :returns: the tuple ``(ext, converted_path, compressed_type)``.
    :raises InappropriateDatasetContentError: for TAR uploads, invalid
        compressed content, or content flagged by ``check_html``.
    """
    is_valid, ext, converted_path, compressed_type = handle_compressed_file(
        filename,
        datatypes_registry,
        ext=ext,
        tmp_prefix=tmp_prefix,
        tmp_dir=tmp_dir,
        in_place=in_place,
        check_content=check_content,
        auto_decompress=auto_decompress,
    )
    try:
        if not is_valid:
            if is_tar(converted_path):
                raise InappropriateDatasetContentError('TAR file uploads are not supported')
            raise InappropriateDatasetContentError('The uploaded compressed file contains invalid content')

        # This needs to be checked again after decompression
        is_binary = check_binary(converted_path)

        auto_detect = ext in AUTO_DETECT_EXTENSIONS
        guessed_ext = ext
        if auto_detect:
            guessed_ext = guess_ext(converted_path, sniff_order=datatypes_registry.sniff_order, is_binary=is_binary)
            if not is_binary:
                # check_binary can return a false negative (e.g. for PDF files), so
                # also trust the sniffed datatype: converting line endings or spaces
                # in a binary file would corrupt it.  getattr() keeps the previous
                # behavior when the registry does not know the extension (None) or
                # the datatype does not expose ``is_binary``.
                guessed_datatype = datatypes_registry.get_datatype_by_extension(guessed_ext)
                if getattr(guessed_datatype, 'is_binary', False):
                    is_binary = True

        if not is_binary and (convert_to_posix_lines or convert_spaces_to_tabs):
            # Convert universal line endings to Posix line endings, spaces to tabs (if desired)
            if convert_spaces_to_tabs:
                convert_fxn = convert_newlines_sep2tabs
            else:
                convert_fxn = convert_newlines
            _line_count, _converted_path = convert_fxn(converted_path, in_place=in_place, tmp_dir=tmp_dir, tmp_prefix=tmp_prefix)
            if not in_place:
                # Drop the intermediate (decompressed) file, but never the caller's upload.
                if converted_path and filename != converted_path:
                    os.unlink(converted_path)
                converted_path = _converted_path
            if auto_detect:
                # The content changed, so sniff the converted file.
                ext = guess_ext(converted_path, sniff_order=datatypes_registry.sniff_order, is_binary=is_binary)
        else:
            ext = guessed_ext

        if not is_binary and check_content and check_html(converted_path):
            raise InappropriateDatasetContentError('The uploaded file contains invalid HTML content')
    except Exception:
        # Remove any file created here before propagating the error.
        if filename != converted_path:
            os.unlink(converted_path)
        raise
    return ext, converted_path, compressed_type
Exemplo n.º 2
0
def handle_uploaded_dataset_file_internal(
        filename,
        datatypes_registry,
        ext='auto',
        tmp_prefix='sniff_upload_',
        tmp_dir=None,
        in_place=False,
        check_content=True,
        is_binary=None,
        auto_decompress=True,
        uploaded_file_ext=None,
        convert_to_posix_lines=None,
        convert_spaces_to_tabs=None,
):
    """Process an uploaded file: decompress, sniff, convert and content-check.

    Steps, in order: ``handle_compressed_file`` is applied to ``filename``;
    the result is tested with ``check_binary``; when ``ext`` is in
    ``AUTO_DETECT_EXTENSIONS`` the extension is sniffed with ``guess_ext``;
    non-binary content is converted if a conversion was requested; and
    non-binary content is rejected when ``check_content`` is set and
    ``check_html`` flags it.

    :param filename: path of the uploaded file.
    :param datatypes_registry: registry object; ``sniff_order`` and
        ``get_datatype_by_extension`` are read from it.
    :param ext: requested extension; replaced by the sniffed one when it is
        in ``AUTO_DETECT_EXTENSIONS``.
    :param tmp_prefix: passed through to ``handle_compressed_file`` and the
        conversion function.
    :param tmp_dir: passed through to ``handle_compressed_file`` and the
        conversion function.
    :param in_place: passed through to the helpers; when false the converted
        file becomes the new ``converted_path`` and the previous intermediate
        file is deleted unless it is ``filename``.
    :param check_content: passed through to ``handle_compressed_file``; also
        enables the HTML check.
    :param is_binary: ignored; overwritten by ``check_binary`` once the file
        has been decompressed.
    :param auto_decompress: passed through to ``handle_compressed_file``.
    :param uploaded_file_ext: not used by this function.
    :param convert_to_posix_lines: ask for line ending conversion.
    :param convert_spaces_to_tabs: ask for line ending conversion plus
        spaces-to-tabs conversion.
    :returns: ``(ext, converted_path, compressed_type)``.
    :raises InappropriateDatasetContentError: on TAR uploads, invalid
        compressed content, or content flagged by ``check_html``.
    """
    is_valid, ext, converted_path, compressed_type = handle_compressed_file(
        filename,
        datatypes_registry,
        ext=ext,
        tmp_prefix=tmp_prefix,
        tmp_dir=tmp_dir,
        in_place=in_place,
        check_content=check_content,
        auto_decompress=auto_decompress,
    )
    try:
        if not is_valid:
            if is_tar(converted_path):
                raise InappropriateDatasetContentError('TAR file uploads are not supported')
            raise InappropriateDatasetContentError('The uploaded compressed file contains invalid content')

        # This needs to be checked again after decompression
        is_binary = check_binary(converted_path)

        auto_detect = ext in AUTO_DETECT_EXTENSIONS
        guessed_ext = ext
        if auto_detect:
            guessed_ext = guess_ext(converted_path, sniff_order=datatypes_registry.sniff_order, is_binary=is_binary)
            if not is_binary:
                # A file can belong to a binary datatype even though check_binary
                # reported text (this happens for instance with PDF files).  Such a
                # file must not go through newline/space conversion, which would
                # damage it.  getattr() tolerates an unknown extension (None) or a
                # datatype without an ``is_binary`` attribute.
                guessed_datatype = datatypes_registry.get_datatype_by_extension(guessed_ext)
                if getattr(guessed_datatype, 'is_binary', False):
                    is_binary = True

        if not is_binary and (convert_to_posix_lines or convert_spaces_to_tabs):
            # Convert universal line endings to Posix line endings, spaces to tabs (if desired)
            if convert_spaces_to_tabs:
                convert_fxn = convert_newlines_sep2tabs
            else:
                convert_fxn = convert_newlines
            _line_count, _converted_path = convert_fxn(converted_path, in_place=in_place, tmp_dir=tmp_dir, tmp_prefix=tmp_prefix)
            if not in_place:
                # The intermediate file is no longer needed; keep the original upload.
                if converted_path and filename != converted_path:
                    os.unlink(converted_path)
                converted_path = _converted_path
            if auto_detect:
                # Sniff again because the conversion changed the content.
                ext = guess_ext(converted_path, sniff_order=datatypes_registry.sniff_order, is_binary=is_binary)
        else:
            ext = guessed_ext

        if not is_binary and check_content and check_html(converted_path):
            raise InappropriateDatasetContentError('The uploaded file contains invalid HTML content')
    except Exception:
        # Clean up any file produced by this function, then re-raise.
        if filename != converted_path:
            os.unlink(converted_path)
        raise
    return ext, converted_path, compressed_type
Exemplo n.º 3
0
def handle_uploaded_dataset_file_internal(
    filename: str,
    datatypes_registry,
    ext: str = 'auto',
    tmp_prefix: Optional[str] = 'sniff_upload_',
    tmp_dir: Optional[str] = None,
    in_place: bool = False,
    check_content: bool = True,
    is_binary: Optional[bool] = None,
    auto_decompress: bool = True,
    uploaded_file_ext: Optional[str] = None,
    convert_to_posix_lines: Optional[bool] = None,
    convert_spaces_to_tabs: Optional[bool] = None,
) -> HandleUploadedDatasetFileInternalResponse:
    """Decompress, sniff, optionally convert and content-check an upload.

    ``handle_compressed_file`` is applied first.  The resulting file is then
    classified as binary or not, its extension is sniffed when ``ext`` is in
    ``AUTO_DETECT_EXTENSIONS``, requested conversions are applied to
    non-binary content, and non-binary content is rejected when
    ``check_content`` is set and ``check_html`` flags it.

    The incoming ``is_binary`` value is overwritten after decompression and
    ``uploaded_file_ext`` is not used by this function.

    :returns: a ``HandleUploadedDatasetFileInternalResponse`` built from the
        final extension, the path of the processed file, the compressed type
        and the two flags returned by the conversion function (both ``False``
        when no conversion ran).
    :raises InappropriateDatasetContentError: for TAR uploads, invalid
        compressed content, or content flagged by ``check_html``.
    """
    is_valid, ext, converted_path, compressed_type = handle_compressed_file(
        filename,
        datatypes_registry,
        ext=ext,
        tmp_prefix=tmp_prefix,
        tmp_dir=tmp_dir,
        in_place=in_place,
        check_content=check_content,
        auto_decompress=auto_decompress,
    )
    converted_newlines = False
    converted_spaces = False
    try:
        if not is_valid:
            if is_tar(converted_path):
                rejection = 'TAR file uploads are not supported'
            else:
                rejection = 'The uploaded compressed file contains invalid content'
            raise InappropriateDatasetContentError(rejection)

        # Binary detection has to be repeated on the decompressed file.
        is_binary = check_binary(converted_path)

        auto_detect = ext in AUTO_DETECT_EXTENSIONS
        sniffed_ext = ext
        if auto_detect:
            sniffed_ext = guess_ext(
                converted_path,
                sniff_order=datatypes_registry.sniff_order,
                is_binary=is_binary,
            )
            sniffed_datatype = datatypes_registry.get_datatype_by_extension(sniffed_ext)
            # A datatype may be binary without that showing in the first 1024
            # bytes, so check_binary can report a false negative (PDF files
            # are one such case).  Trust the sniffed datatype in that case.
            if not is_binary and sniffed_datatype.is_binary:
                is_binary = True

        conversion_requested = convert_to_posix_lines or convert_spaces_to_tabs
        if is_binary or not conversion_requested:
            ext = sniffed_ext
        else:
            # Universal line endings become Posix ones; spaces become tabs if asked.
            converter = convert_function(convert_to_posix_lines, convert_spaces_to_tabs)
            line_count, new_path, converted_newlines, converted_spaces = converter(
                converted_path,
                in_place=in_place,
                tmp_dir=tmp_dir,
                tmp_prefix=tmp_prefix,
            )
            if not in_place:
                # Discard the intermediate file unless it is the upload itself.
                if converted_path and filename != converted_path:
                    os.unlink(converted_path)
                assert new_path
                converted_path = new_path
            if auto_detect:
                # Sniff the converted content.
                ext = guess_ext(
                    converted_path,
                    sniff_order=datatypes_registry.sniff_order,
                    is_binary=is_binary,
                )

        scan_for_html = check_content and not is_binary
        if scan_for_html and check_html(converted_path):
            raise InappropriateDatasetContentError('The uploaded file contains invalid HTML content')
    except Exception:
        # Remove whatever file this function produced before re-raising.
        if filename != converted_path:
            os.unlink(converted_path)
        raise
    return HandleUploadedDatasetFileInternalResponse(
        ext,
        converted_path,
        compressed_type,
        converted_newlines,
        converted_spaces,
    )
Exemplo n.º 4
0
def handle_uploaded_dataset_file_internal(
    filename,
    datatypes_registry,
    ext='auto',
    tmp_prefix='sniff_upload_',
    tmp_dir=None,
    in_place=False,
    check_content=True,
    is_binary=None,
    auto_decompress=True,
    uploaded_file_ext=None,
    convert_to_posix_lines=None,
    convert_spaces_to_tabs=None,
):
    """Decompress, sniff and optionally convert an uploaded dataset file.

    ``handle_compressed_file`` runs first.  The resulting file is classified
    as binary or not, its extension is sniffed when ``ext`` is in
    ``AUTO_DETECT_EXTENSIONS``, and requested line ending / spaces-to-tabs
    conversions are applied to non-binary content.

    In this variant the HTML content check is disabled; ``check_content`` is
    only forwarded to ``handle_compressed_file``.  The incoming ``is_binary``
    value is overwritten after decompression and ``uploaded_file_ext`` is not
    used by this function.

    :returns: the tuple ``(ext, converted_path, compressed_type)``.
    :raises InappropriateDatasetContentError: for TAR uploads or invalid
        compressed content.
    """
    is_valid, ext, converted_path, compressed_type = handle_compressed_file(
        filename,
        datatypes_registry,
        ext=ext,
        tmp_prefix=tmp_prefix,
        tmp_dir=tmp_dir,
        in_place=in_place,
        check_content=check_content,
        auto_decompress=auto_decompress,
    )
    try:
        if not is_valid:
            message = (
                'TAR file uploads are not supported'
                if is_tar(converted_path)
                else 'The uploaded compressed file contains invalid content'
            )
            raise InappropriateDatasetContentError(message)

        # Re-run binary detection now that the file has been decompressed.
        is_binary = check_binary(converted_path)

        sniff_needed = ext in AUTO_DETECT_EXTENSIONS
        first_guess = ext
        if sniff_needed:
            first_guess = guess_ext(
                converted_path,
                sniff_order=datatypes_registry.sniff_order,
                is_binary=is_binary,
            )
            datatype = datatypes_registry.get_datatype_by_extension(first_guess)
            # check_binary may miss binary content that does not appear in the
            # first 1024 bytes (PDF files, for instance), so fall back on the
            # sniffed datatype.
            if not is_binary and datatype.is_binary:
                is_binary = True

        do_convert = not is_binary and (convert_to_posix_lines or convert_spaces_to_tabs)
        if do_convert:
            # Universal line endings become Posix ones; spaces become tabs if asked.
            converter = convert_newlines_sep2tabs if convert_spaces_to_tabs else convert_newlines
            line_count, output_path = converter(
                converted_path,
                in_place=in_place,
                tmp_dir=tmp_dir,
                tmp_prefix=tmp_prefix,
            )
            if not in_place:
                # Drop the intermediate file unless it is the upload itself.
                if converted_path and filename != converted_path:
                    os.unlink(converted_path)
                converted_path = output_path
            if sniff_needed:
                # Sniff the converted content.
                ext = guess_ext(
                    converted_path,
                    sniff_order=datatypes_registry.sniff_order,
                    is_binary=is_binary,
                )
        else:
            ext = first_guess

        # AMP: the HTML content check (reject non-binary content when
        # check_content is set and check_html() is true) is deliberately
        # disabled here.  The note left with the change reports that
        # check_html returns true for valid HTML although the error raised
        # says "invalid HTML", and that negating the test instead rejected
        # JSON and other non-binary data types.
    except Exception:
        # Remove whatever file this function produced before re-raising.
        if filename != converted_path:
            os.unlink(converted_path)
        raise
    return ext, converted_path, compressed_type