def handle_uploaded_dataset_file_internal(
    filename,
    datatypes_registry,
    ext='auto',
    tmp_prefix='sniff_upload_',
    tmp_dir=None,
    in_place=False,
    check_content=True,
    is_binary=None,
    auto_decompress=True,
    uploaded_file_ext=None,
    convert_to_posix_lines=None,
    convert_spaces_to_tabs=None,
):
    """Validate, optionally convert and sniff an uploaded dataset file.

    The file is first passed through ``handle_compressed_file``; the path it
    reports is then checked for binary content, optionally run through a
    newline / whitespace conversion, sniffed for its extension and finally
    checked for HTML content.

    :param filename: path of the uploaded file.
    :param datatypes_registry: registry forwarded to ``handle_compressed_file``;
        its ``sniff_order`` is used when guessing the extension.
    :param ext: requested extension. When it is one of
        ``AUTO_DETECT_EXTENSIONS`` the extension is guessed with ``guess_ext``.
    :param tmp_prefix: forwarded to ``handle_compressed_file`` and to the
        conversion function.
    :param tmp_dir: forwarded to ``handle_compressed_file`` and to the
        conversion function.
    :param in_place: forwarded to ``handle_compressed_file`` and to the
        conversion function. When false, the path returned by the conversion
        replaces the current one and the previous intermediate file is removed.
    :param check_content: forwarded to ``handle_compressed_file``; also enables
        the HTML check on non-binary files.
    :param is_binary: ignored -- the value is always recomputed with
        ``check_binary`` on the processed path.
    :param auto_decompress: forwarded to ``handle_compressed_file``.
    :param uploaded_file_ext: not used by this function.
    :param convert_to_posix_lines: request line-ending conversion
        (``convert_newlines``) for non-binary files.
    :param convert_spaces_to_tabs: request ``convert_newlines_sep2tabs`` for
        non-binary files; takes precedence over ``convert_to_posix_lines``.
    :returns: tuple ``(ext, converted_path, compressed_type)``.
    :raises InappropriateDatasetContentError: if ``handle_compressed_file``
        reports invalid content (with a dedicated message for TAR files) or if
        the HTML check fires.
    """
    is_valid, ext, converted_path, compressed_type = handle_compressed_file(
        filename,
        datatypes_registry,
        ext=ext,
        tmp_prefix=tmp_prefix,
        tmp_dir=tmp_dir,
        in_place=in_place,
        check_content=check_content,
        auto_decompress=auto_decompress,
    )
    try:
        if not is_valid:
            if is_tar(converted_path):
                raise InappropriateDatasetContentError('TAR file uploads are not supported')
            raise InappropriateDatasetContentError('The uploaded compressed file contains invalid content')

        # This needs to be checked again after decompression
        is_binary = check_binary(converted_path)

        if not is_binary and (convert_to_posix_lines or convert_spaces_to_tabs):
            # Convert universal line endings to Posix line endings, spaces to tabs (if desired)
            if convert_spaces_to_tabs:
                convert_fxn = convert_newlines_sep2tabs
            else:
                convert_fxn = convert_newlines
            # The first element of the result (a line count) is not needed here.
            _, _converted_path = convert_fxn(
                converted_path,
                in_place=in_place,
                tmp_dir=tmp_dir,
                tmp_prefix=tmp_prefix,
            )
            if not in_place:
                # Drop the previous intermediate file, but never the caller's upload.
                if converted_path and filename != converted_path:
                    os.unlink(converted_path)
                converted_path = _converted_path

        if ext in AUTO_DETECT_EXTENSIONS:
            ext = guess_ext(converted_path, sniff_order=datatypes_registry.sniff_order, is_binary=is_binary)

        if not is_binary and check_content and check_html(converted_path):
            raise InappropriateDatasetContentError('The uploaded file contains invalid HTML content')
    except Exception:
        # Clean up the intermediate file we created, never the caller's upload.
        # Guarded like the unlink in the conversion branch so that a falsy path
        # cannot make os.unlink raise and mask the exception being propagated.
        if converted_path and filename != converted_path:
            os.unlink(converted_path)
        raise
    return ext, converted_path, compressed_type
def handle_uploaded_dataset_file_internal(
    filename: str,
    datatypes_registry,
    ext: str = 'auto',
    tmp_prefix: Optional[str] = 'sniff_upload_',
    tmp_dir: Optional[str] = None,
    in_place: bool = False,
    check_content: bool = True,
    is_binary: Optional[bool] = None,
    auto_decompress: bool = True,
    uploaded_file_ext: Optional[str] = None,
    convert_to_posix_lines: Optional[bool] = None,
    convert_spaces_to_tabs: Optional[bool] = None,
) -> HandleUploadedDatasetFileInternalResponse:
    """Validate, optionally convert and sniff an uploaded dataset file.

    The file is first passed through ``handle_compressed_file``; the path it
    reports is then checked for binary content, sniffed, optionally run through
    a newline / whitespace conversion (text files only) and finally checked
    for HTML content.

    :param filename: path of the uploaded file.
    :param datatypes_registry: registry forwarded to ``handle_compressed_file``;
        its ``sniff_order`` and ``get_datatype_by_extension`` are used when
        guessing the extension.
    :param ext: requested extension. When it is one of
        ``AUTO_DETECT_EXTENSIONS`` the extension is guessed with ``guess_ext``.
    :param tmp_prefix: forwarded to ``handle_compressed_file`` and to the
        conversion function.
    :param tmp_dir: forwarded to ``handle_compressed_file`` and to the
        conversion function.
    :param in_place: forwarded to ``handle_compressed_file`` and to the
        conversion function. When false, the path returned by the conversion
        replaces the current one and the previous intermediate file is removed.
    :param check_content: forwarded to ``handle_compressed_file``; also enables
        the HTML check on non-binary files.
    :param is_binary: ignored -- the value is always recomputed with
        ``check_binary`` on the processed path.
    :param auto_decompress: forwarded to ``handle_compressed_file``.
    :param uploaded_file_ext: not used by this function.
    :param convert_to_posix_lines: passed to ``convert_function`` to select the
        conversion applied to non-binary files.
    :param convert_spaces_to_tabs: passed to ``convert_function`` to select the
        conversion applied to non-binary files.
    :returns: ``HandleUploadedDatasetFileInternalResponse(ext, converted_path,
        compressed_type, converted_newlines, converted_spaces)``. The last two
        values are ``False`` unless a conversion ran, in which case they are
        whatever the conversion function reported.
    :raises InappropriateDatasetContentError: if ``handle_compressed_file``
        reports invalid content (with a dedicated message for TAR files) or if
        the HTML check fires.
    """
    is_valid, ext, converted_path, compressed_type = handle_compressed_file(
        filename,
        datatypes_registry,
        ext=ext,
        tmp_prefix=tmp_prefix,
        tmp_dir=tmp_dir,
        in_place=in_place,
        check_content=check_content,
        auto_decompress=auto_decompress,
    )
    converted_newlines = False
    converted_spaces = False
    try:
        if not is_valid:
            if is_tar(converted_path):
                raise InappropriateDatasetContentError(
                    'TAR file uploads are not supported')
            raise InappropriateDatasetContentError(
                'The uploaded compressed file contains invalid content')

        # This needs to be checked again after decompression
        is_binary = check_binary(converted_path)
        guessed_ext = ext
        if ext in AUTO_DETECT_EXTENSIONS:
            guessed_ext = guess_ext(converted_path, sniff_order=datatypes_registry.sniff_order, is_binary=is_binary)
            guessed_datatype = datatypes_registry.get_datatype_by_extension(
                guessed_ext)
            if not is_binary and guessed_datatype.is_binary:
                # It's possible to have a datatype that is binary but not within the first 1024 bytes,
                # so check_binary might return a false negative. This is for instance true for PDF files
                is_binary = True

        if not is_binary and (convert_to_posix_lines or convert_spaces_to_tabs):
            # Convert universal line endings to Posix line endings, spaces to tabs (if desired)
            convert_fxn = convert_function(convert_to_posix_lines, convert_spaces_to_tabs)
            # The first element of the result (a line count) is not needed here.
            _, _converted_path, converted_newlines, converted_spaces = convert_fxn(
                converted_path,
                in_place=in_place,
                tmp_dir=tmp_dir,
                tmp_prefix=tmp_prefix,
            )
            if not in_place:
                # Check the new path *before* deleting the old one: if this
                # fails, converted_path still names an existing file and the
                # cleanup handler below can remove it without raising itself.
                assert _converted_path
                # Drop the previous intermediate file, but never the caller's upload.
                if converted_path and filename != converted_path:
                    os.unlink(converted_path)
                converted_path = _converted_path
            if ext in AUTO_DETECT_EXTENSIONS:
                # The content may have changed, so sniff the converted file again.
                ext = guess_ext(converted_path, sniff_order=datatypes_registry.sniff_order, is_binary=is_binary)
        else:
            ext = guessed_ext

        if not is_binary and check_content and check_html(converted_path):
            raise InappropriateDatasetContentError(
                'The uploaded file contains invalid HTML content')
    except Exception:
        # Clean up the intermediate file we created, never the caller's upload.
        # Guarded like the unlink in the conversion branch so that a falsy path
        # cannot make os.unlink raise and mask the exception being propagated.
        if converted_path and filename != converted_path:
            os.unlink(converted_path)
        raise
    return HandleUploadedDatasetFileInternalResponse(
        ext,
        converted_path,
        compressed_type,
        converted_newlines,
        converted_spaces,
    )
def handle_uploaded_dataset_file_internal(
    filename,
    datatypes_registry,
    ext='auto',
    tmp_prefix='sniff_upload_',
    tmp_dir=None,
    in_place=False,
    check_content=True,
    is_binary=None,
    auto_decompress=True,
    uploaded_file_ext=None,
    convert_to_posix_lines=None,
    convert_spaces_to_tabs=None,
):
    """Validate, optionally convert and sniff an uploaded dataset file.

    The file is first passed through ``handle_compressed_file``; the path it
    reports is then checked for binary content, sniffed and optionally run
    through a newline / whitespace conversion (text files only). No HTML
    content check is performed by this variant.

    :param filename: path of the uploaded file.
    :param datatypes_registry: registry forwarded to ``handle_compressed_file``;
        its ``sniff_order`` and ``get_datatype_by_extension`` are used when
        guessing the extension.
    :param ext: requested extension. When it is one of
        ``AUTO_DETECT_EXTENSIONS`` the extension is guessed with ``guess_ext``.
    :param tmp_prefix: forwarded to ``handle_compressed_file`` and to the
        conversion function.
    :param tmp_dir: forwarded to ``handle_compressed_file`` and to the
        conversion function.
    :param in_place: forwarded to ``handle_compressed_file`` and to the
        conversion function. When false, the path returned by the conversion
        replaces the current one and the previous intermediate file is removed.
    :param check_content: only forwarded to ``handle_compressed_file``.
    :param is_binary: ignored -- the value is always recomputed with
        ``check_binary`` on the processed path.
    :param auto_decompress: forwarded to ``handle_compressed_file``.
    :param uploaded_file_ext: not used by this function.
    :param convert_to_posix_lines: request line-ending conversion
        (``convert_newlines``) for non-binary files.
    :param convert_spaces_to_tabs: request ``convert_newlines_sep2tabs`` for
        non-binary files; takes precedence over ``convert_to_posix_lines``.
    :returns: tuple ``(ext, converted_path, compressed_type)``.
    :raises InappropriateDatasetContentError: if ``handle_compressed_file``
        reports invalid content (with a dedicated message for TAR files).
    """
    is_valid, ext, converted_path, compressed_type = handle_compressed_file(
        filename,
        datatypes_registry,
        ext=ext,
        tmp_prefix=tmp_prefix,
        tmp_dir=tmp_dir,
        in_place=in_place,
        check_content=check_content,
        auto_decompress=auto_decompress,
    )
    try:
        if not is_valid:
            if is_tar(converted_path):
                raise InappropriateDatasetContentError(
                    'TAR file uploads are not supported')
            raise InappropriateDatasetContentError(
                'The uploaded compressed file contains invalid content')

        # This needs to be checked again after decompression
        is_binary = check_binary(converted_path)
        guessed_ext = ext
        if ext in AUTO_DETECT_EXTENSIONS:
            guessed_ext = guess_ext(converted_path, sniff_order=datatypes_registry.sniff_order, is_binary=is_binary)
            guessed_datatype = datatypes_registry.get_datatype_by_extension(
                guessed_ext)
            if not is_binary and guessed_datatype.is_binary:
                # It's possible to have a datatype that is binary but not within the first 1024 bytes,
                # so check_binary might return a false negative. This is for instance true for PDF files
                is_binary = True

        if not is_binary and (convert_to_posix_lines or convert_spaces_to_tabs):
            # Convert universal line endings to Posix line endings, spaces to tabs (if desired)
            if convert_spaces_to_tabs:
                convert_fxn = convert_newlines_sep2tabs
            else:
                convert_fxn = convert_newlines
            # The first element of the result (a line count) is not needed here.
            _, _converted_path = convert_fxn(
                converted_path,
                in_place=in_place,
                tmp_dir=tmp_dir,
                tmp_prefix=tmp_prefix,
            )
            if not in_place:
                # Drop the previous intermediate file, but never the caller's upload.
                if converted_path and filename != converted_path:
                    os.unlink(converted_path)
                converted_path = _converted_path
            if ext in AUTO_DETECT_EXTENSIONS:
                # The content may have changed, so sniff the converted file again.
                ext = guess_ext(converted_path, sniff_order=datatypes_registry.sniff_order, is_binary=is_binary)
        else:
            ext = guessed_ext

        # NOTE(AMP): the check that rejected non-binary uploads for which
        # check_html(converted_path) is true (when check_content is set) is
        # intentionally disabled. Rationale recorded when it was turned off:
        # check_html returns true for *valid* HTML while the error raised said
        # "invalid HTML content", and inverting the test instead rejected JSON
        # and other non-binary data types.
    except Exception:
        # Clean up the intermediate file we created, never the caller's upload.
        # Guarded like the unlink in the conversion branch so that a falsy path
        # cannot make os.unlink raise and mask the exception being propagated.
        if converted_path and filename != converted_path:
            os.unlink(converted_path)
        raise
    return ext, converted_path, compressed_type