def any_tableset(fileobj, mimetype=None, extension=None):
    """Load *fileobj* with the best-matching TableSet implementation.

    Dispatch is driven by *mimetype* (e.g. mimetype='text/csv') or by
    *extension* (e.g. extension='tsv').  When the caller supplies neither,
    the file type is sniffed from the first bytes of the stream with the
    ``magic`` library — a heuristic that is often wrong, so prefer passing
    an explicit hint.  Consult the source for the recognized MIME types
    and file extensions.  Raises ValueError when no reader matches.
    """
    if mimetype is None and extension is None:
        # No hint at all from the caller: sniff the stream.  Peeking
        # requires seeking back afterwards, so wrap non-seekable input
        # (seekable_stream slurps into a StringIO when necessary).
        import magic
        fileobj = messytables.seekable_stream(fileobj)
        head = fileobj.read(1024)
        mimetype = magic.from_buffer(head, mime=True)
        fileobj.seek(0)

    low_ext = extension.lower() if extension else None

    def is_type(mimetypes, extensions):
        # True when either hint selects this format.
        return mimetype in mimetypes or low_ext in extensions

    # ZIP is tested first: an extension hint describes the *inner* files
    # of an archive, so the container check must win over content checks.
    if is_type(('application/x-zip-compressed', 'application/zip'), ('zip',)):
        return ZIPTableSet(fileobj)
    if is_type(('text/csv', 'text/comma-separated-values'), ('csv',)):
        return CSVTableSet(fileobj)  # guess delimiter
    if is_type(('text/tsv', 'text/tab-separated-values'), ('tsv',)):
        return CSVTableSet(fileobj, delimiter='\t')
    if is_type(('application/ms-excel', 'application/vnd.ms-excel',
                'application/xls'), ('xls',)):
        return XLSTableSet(fileobj)
    if is_type(('application/vnd.openxmlformats-officedocument.spreadsheetml.sheet',),
               ('xlsx',)):
        return XLSXTableSet(fileobj)
    if is_type(('text/html',), ('htm', 'html')):
        return HTMLTableSet(fileobj)

    if mimetype:
        raise ValueError("Unrecognized MIME type: {mimetype}".format(
            mimetype=mimetype))
    if extension:
        raise ValueError('''Could not determine MIME type and unrecognized extension: {extension}'''.format(extension=extension))
    raise ValueError("Could not determine MIME type and no extension given.")
def test_read_simple_zip(self):
    """A ZIP container yields a readable inner table with typed cells."""
    zip_stream = horror_fobj('simple.zip')
    tables = ZIPTableSet(zip_stream)
    rows = tables.tables[0]
    # The inner file contributes exactly seven rows.
    assert_equal(7, len(list(rows)))
    # The first sampled row carries the expected header labels.
    first = list(rows.sample)[0]
    assert_equal(first[0].value, 'date')
    assert_equal(first[1].value, 'temperature')
    # Every row is three cells wide and leads with a string cell.
    for current in list(rows):
        assert_equal(3, len(current))
        assert_equal(current[0].type, StringType())
def from_fileobj(cls, fileobj, mimetype=None, extension=None):
    """ Opens whatever sort of file is passed in, using the MIME type
        (e.g mimetype='text/csv') or file extension (e.g. extension='tsv'),
        or otherwise autodetecting the file format. Consult the source for
        recognized MIME types and file extensions.

        Raises ValueError when the type cannot be determined or is not
        supported.
    """
    # Fixed: "mimetype == None" -> "is None" (PEP 8 E711: comparisons to
    # None must use identity, not equality).
    # NOTE(review): unlike any_tableset(), sniffing runs whenever mimetype
    # is missing, even when an extension hint was supplied — confirm intended.
    if mimetype is None:
        import magic
        # Since we need to peek the start of the stream, make sure we can
        # seek back later. If not, slurp in the contents into a StringIO.
        fileobj = messytables.seekable_stream(fileobj)
        header = fileobj.read(1024)
        mimetype = magic.from_buffer(header, mime=True)
        fileobj.seek(0)

    if mimetype in ('application/x-zip-compressed', 'application/zip') \
            or (extension and extension.lower() in ('zip',)):
        # Do this first because the extension applies to the content
        # type of the inner files, so don't check them before we check
        # for a ZIP file.
        return ZIPTableSet.from_fileobj(fileobj)

    if mimetype in ('text/csv', 'text/comma-separated-values') or \
            (extension and extension.lower() in ('csv',)):
        return CSVTableSet.from_fileobj(fileobj)  # guess delimiter

    if mimetype in ('text/tsv', 'text/tab-separated-values') or \
            (extension and extension.lower() in ('tsv',)):
        return CSVTableSet.from_fileobj(fileobj, delimiter='\t')

    if mimetype in ('application/ms-excel', 'application/vnd.ms-excel',
                    'application/xls') or (extension and
                                           extension.lower() in ('xls',)):
        return XLSTableSet.from_fileobj(fileobj)

    if mimetype in ('application/vnd.openxmlformats-officedocument.spreadsheetml.sheet',) \
            or (extension and extension.lower() in ('xlsx',)):
        return XLSXTableSet.from_fileobj(fileobj)

    if mimetype:
        raise ValueError("Unrecognized MIME type: " + mimetype)
    if extension:
        raise ValueError("Could not determine MIME type and " +
                         "unrecognized extension: " + extension)
    raise ValueError("Could not determine MIME type and no extension given.")
def from_fileobj(cls, fileobj, mimetype=None, extension=None):
    """ Opens whatever sort of file is passed in, using the MIME type
        (e.g mimetype='text/csv') or file extension (e.g. extension='tsv'),
        or otherwise autodetecting the file format. Consult the source for
        recognized MIME types and file extensions.

        Raises ValueError when the type cannot be determined or is not
        supported.
    """
    # Fixed: "mimetype == None" -> "is None" (PEP 8 E711: comparisons to
    # None must use identity, not equality).
    # NOTE(review): sniffing runs whenever mimetype is missing, even when
    # an extension hint was supplied — confirm intended.
    if mimetype is None:
        import magic
        # Since we need to peek the start of the stream, make sure we can
        # seek back later. If not, slurp in the contents into a StringIO.
        fileobj = messytables.seekable_stream(fileobj)
        header = fileobj.read(1024)
        mimetype = magic.from_buffer(header, mime=True)
        fileobj.seek(0)

    if mimetype in ('application/x-zip-compressed', 'application/zip') \
            or (extension and extension.lower() in ('zip',)):
        # Do this first because the extension applies to the content
        # type of the inner files, so don't check them before we check
        # for a ZIP file.
        return ZIPTableSet.from_fileobj(fileobj)

    if mimetype in ('text/csv', 'text/comma-separated-values') or \
            (extension and extension.lower() in ('csv',)):
        return CSVTableSet.from_fileobj(fileobj, delimiter=',')

    if mimetype in ('text/tsv', 'text/tab-separated-values') or \
            (extension and extension.lower() in ('tsv',)):
        return CSVTableSet.from_fileobj(fileobj, delimiter='\t')

    if mimetype in ('application/ms-excel', 'application/vnd.ms-excel',
                    'application/xls', 'application/excel') or \
            (extension and extension.lower() in ('xls',)):
        return XLSTableSet.from_fileobj(fileobj)

    if mimetype in ('application/vnd.openxmlformats-officedocument.spreadsheetml.sheet',) \
            or (extension and extension.lower() in ('xlsx',)):
        return XLSXTableSet.from_fileobj(fileobj)

    if mimetype:
        raise ValueError("Unrecognized MIME type: " + mimetype)
    if extension:
        raise ValueError("Could not determine MIME type and " +
                         "unrecognized extension: " + extension)
    raise ValueError("Could not determine MIME type and no extension given.")
def parse_resource(resource_file, mime_type, fileext, default_schema):
    """Parse resource data and return a tuple of:

    * the schema used to load the file
    * the resource as a table, as given by messytables (an iterator
      over rows)

    mime_type and fileext are hints used only when the schema does not pin
    down the format; default_schema seeds the returned schema.  Raises
    UserError for invalid schemas or unrecognizable files.

    Fixes applied: "== None"/"!= None" replaced with identity tests
    (PEP 8 E711); unused imported names (headers_processor,
    types_processor) dropped from the local imports.
    """
    # Schema defaults. We'll build up the schema with defaults
    # from the actual resource so that it is easy for the data
    # owner to customize the schema later.
    schema = {
        "_format": 1,
    }

    # Update defaults from the provided schema.
    if default_schema:
        schema.update(default_schema)
    else:
        schema["auto"] = True

    # Utility function that's like dict.get() but works on nested
    # dicts and takes a path through to dicts as arguments. Returns
    # None (or the "default" keyword argument) if no value is found.
    # e.g. schema_get('format', 'name')
    #   This returns schema["format"]["name"], or None if
    #   "format" isn't in schema or "name" isn't in schema["format"].
    def schema_get(*path, **kwargs):
        if len(path) < 1:
            raise ValueError()
        v = schema
        for p in path:
            v = v.get(p, {})
        if v == {}:
            # {} is the "missing" sentinel left behind by the loop above.
            v = kwargs.get("default", None)
        return v

    # Utility function that sets a value in a set of nested dicts.
    # Takes a path plus a value as arguments.
    # e.g. schema_set('format', 'name', 'zip')
    #   This is equivalent to:
    #     schema["format"]["name"] = "zip"
    #   but creating dicts as necessary along the way.
    def schema_set(*path_and_value):
        if len(path_and_value) < 2:
            raise ValueError()
        path = path_and_value[0:-2]
        field = path_and_value[-2]
        value = path_and_value[-1]
        container = schema
        for p in path:
            container = container.setdefault(p, {})
        container[field] = value

    # Parse the resource_file.

    # Get the table data format.
    if schema_get('format', 'name') is None:
        # Auto-detect format.
        from messytables import AnyTableSet as data_format
        data_format_args = {}
    elif schema_get('format', 'name') in ("csv", "tsv"):
        # "format" = {
        #   "name": "csv" | "tsv",
        #   "delimiter": ",",
        #   "quotechar": "\"",
        #   "encoding": "utf-8"
        # }
        # Load as CSV/TSV.
        from messytables import CSVTableSet as data_format
        # Default load parameters.
        data_format_args = {
            "delimiter": "," if schema_get('format', 'name') == "csv" else "\t",
            "quotechar": '"',
            "encoding": None,
        }
        # Override parameters from the schema.
        for n in ("delimiter", "quotechar", "encoding"):
            v = schema_get("format", n)
            if v:
                data_format_args[n] = str(v)  # csv module requires str's not unicode
    else:
        raise UserError("Invalid format name in schema. Allowed values are: csv, tsv.")

    # If the user specifies a ZIP container, then parse the
    # resource_file as a ZIP file and pass the format parameters
    # into ZIPTableSet so it knows how to parse the inner files.
    if schema_get("container", "name") == "zip":
        # "container = {
        #   "name": "zip"
        # }
        from messytables import ZIPTableSet
        table_set = ZIPTableSet.from_fileobj(resource_file,
                                             inner_data_format=data_format,
                                             inner_parser_args=data_format_args)
    elif schema_get("container", "name") is not None:
        raise UserError("Invalid container name in schema. Allowed values are: zip.")

    # If format parameters were given explicity, do not use a container.
    # Just parse according to the specified format.
    elif schema_get('format', 'name') is not None:
        table_set = data_format.from_fileobj(resource_file, **data_format_args)

    # If no container and no format was specified, auto-detect everything.
    else:
        # Use the AnyTableSet to guess all parsing parameters.
        from messytables import AnyTableSet
        try:
            table_set = AnyTableSet.from_fileobj(resource_file,
                                                 mimetype=mime_type,
                                                 extension=fileext)
        except Exception as e:
            raise UserError("The file format could not be recognized: %s" % str(e))

    # Provide the container information that may have been guessed.
    if type(table_set).__name__ == "ZIPTableSet":
        schema_set("container", "name", "zip")

    table = table_set.tables[0]

    # Provide the CSV parser settings that may have been guessed.
    if type(table).__name__ == "CSVRowSet":
        schema_set("format", "name", "tsv" if table.delimiter == "\t" else "csv")
        schema_set("format", "delimiter", table.delimiter)
        schema_set("format", "quotechar", table.quotechar)
        schema_set("format", "encoding", table.encoding)

    # Get the column header names and the row offset to the data.
    # Start by guessing.
    from messytables import headers_guess
    offset, headers = headers_guess(table.sample)

    # Overwrite the has_headers and offset values with the schema, if present.
    # NOTE(review): has_headers is read here but never applied below — the
    # schema is always written with "present": True. Confirm whether
    # schema_set("header", "present", ...) should use has_headers instead.
    has_headers = schema_get("header", "present", default=True)
    offset = schema_get("header", "offset", default=offset)

    # Set the header information back into the schema.
    schema_set("header", "present", True)
    schema_set("header", "offset", offset)

    # Override the header names with what is specified in the schema.
    for cidx, col in enumerate(schema_get("columns", default=[])):
        try:
            headers[cidx] = col.get("name", headers[cidx])
        except IndexError:
            pass  # ignore schema problems?

    # Since SQL column names are not case sensitive, prevent any
    # uniqueness clashes by converting all to lowercase. While
    # we're at it, also trim spaces.
    headers = [h.lower().strip() for h in headers]

    # Ensure the headers are valid SQL-ish & datastore column names:
    #   1st character: letter
    #   subsequent characters: letter, number, or underscore
    for i, header in enumerate(headers):
        # To play nice with international characters, convert to ASCII
        # by replacing extended characters with their closest ASCII
        # equivalent where possible.
        header = u"".join(c for c in unicodedata.normalize('NFKD', header)
                          if not unicodedata.combining(c))

        # Replace any invalid characters with "".
        header = re.sub("[^a-z0-9_]", "", header)

        # If there is an invalid 1st character, prepend a valid start.
        if not re.match("^[a-z]", header):
            header = "field_" + header

        # And force to an ASCII byte string, which should be possible by now.
        headers[i] = str(header)

    # TODO: Check that there is not an insane number of columns.
    # That could crash headers_make_unique.

    # Ensure the headers are unique, and not too long. Postgres
    # supports 63 (64?)-char-long col names, but that's ridiculous.
    from messytables import headers_make_unique
    headers = headers_make_unique(headers, max_length=24)

    # Skip the header row.
    # (Add one to begin with content, not the header.)
    from messytables import offset_processor
    table.register_processor(offset_processor(offset + 1))

    # Try to guess the datatypes.
    import messytables.types
    from messytables import type_guess
    datatypes = type_guess(
        table.sample,
        [
            messytables.types.StringType,
            messytables.types.IntegerType,
            messytables.types.FloatType,
            messytables.types.DecimalType,
            messytables.types.DateType
        ],
        strict=True
    )

    if len(datatypes) != len(headers):
        raise UserError("Could not guess data types. Column header count does not match rows found during type guessing.")

    # Map each messytables type to the corresponding datastore type string.
    messytable_datastore_type_mapping = {
        messytables.types.StringType: lambda t: 'text',
        messytables.types.IntegerType: lambda t: 'bigint',  # 'int' may not be big enough,
                                                            # and type detection may not realize it needs to be big
        messytables.types.FloatType: lambda t: 'float',
        messytables.types.DecimalType: lambda t: 'numeric',
        messytables.types.DateType: lambda t: 'timestamp:' + t.format,
    }
    datatypes = [messytable_datastore_type_mapping[type(t)](t)
                 for t in datatypes]  # convert objects to strings

    # Override the datatypes from the schema.
    for cidx, col in enumerate(schema_get("columns", default=[])):
        try:
            datatypes[cidx] = col.get("type", datatypes[cidx])
        except IndexError:
            pass  # ignore schema problems?

    # Provide the header names and types back to the user in the schema.
    schema["columns"] = []
    for i in xrange(len(headers)):
        schema["columns"].append({
            "name": headers[i],
            "type": datatypes[i],
        })

    # Validate that the datatypes are all legit.
    for dt in datatypes:
        if dt.split(":")[0] not in ("text", "int", "bigint", "float", "bool",
                                    "numeric", "date", "time", "timestamp",
                                    "json"):
            raise UserError("Invalid data type in schema: %s" % dt)

    return schema, table
def parse_resource(resource_file, mime_type, fileext, default_schema):
    """Parse resource data and return a tuple of:

    * the schema used to load the file
    * the resource as a table, as given by messytables (an iterator
      over rows)

    mime_type and fileext are hints used only when the schema does not pin
    down the format; default_schema seeds the returned schema.  Raises
    UserError for invalid schemas or unrecognizable files.

    Fixes applied: "== None"/"!= None" replaced with identity tests
    (PEP 8 E711); unused imported names (headers_processor,
    types_processor) dropped from the local imports.
    """
    # Schema defaults. We'll build up the schema with defaults
    # from the actual resource so that it is easy for the data
    # owner to customize the schema later.
    schema = {
        "_format": 1,
    }

    # Update defaults from the provided schema.
    if default_schema:
        schema.update(default_schema)
    else:
        schema["auto"] = True

    # Utility function that's like dict.get() but works on nested
    # dicts and takes a path through to dicts as arguments. Returns
    # None (or the "default" keyword argument) if no value is found.
    # e.g. schema_get('format', 'name')
    #   This returns schema["format"]["name"], or None if
    #   "format" isn't in schema or "name" isn't in schema["format"].
    def schema_get(*path, **kwargs):
        if len(path) < 1:
            raise ValueError()
        v = schema
        for p in path:
            v = v.get(p, {})
        if v == {}:
            # {} is the "missing" sentinel left behind by the loop above.
            v = kwargs.get("default", None)
        return v

    # Utility function that sets a value in a set of nested dicts.
    # Takes a path plus a value as arguments.
    # e.g. schema_set('format', 'name', 'zip')
    #   This is equivalent to:
    #     schema["format"]["name"] = "zip"
    #   but creating dicts as necessary along the way.
    def schema_set(*path_and_value):
        if len(path_and_value) < 2:
            raise ValueError()
        path = path_and_value[0:-2]
        field = path_and_value[-2]
        value = path_and_value[-1]
        container = schema
        for p in path:
            container = container.setdefault(p, {})
        container[field] = value

    # Parse the resource_file.

    # Get the table data format.
    if schema_get('format', 'name') is None:
        # Auto-detect format.
        from messytables import AnyTableSet as data_format
        data_format_args = {}
    elif schema_get('format', 'name') in ("csv", "tsv"):
        # "format" = {
        #   "name": "csv" | "tsv",
        #   "delimiter": ",",
        #   "quotechar": "\"",
        #   "encoding": "utf-8"
        # }
        # Load as CSV/TSV.
        from messytables import CSVTableSet as data_format
        # Default load parameters.
        data_format_args = {
            "delimiter": "," if schema_get('format', 'name') == "csv" else "\t",
            "quotechar": '"',
            "encoding": None,
        }
        # Override parameters from the schema.
        for n in ("delimiter", "quotechar", "encoding"):
            v = schema_get("format", n)
            if v:
                data_format_args[n] = str(
                    v)  # csv module requires str's not unicode
    else:
        raise UserError(
            "Invalid format name in schema. Allowed values are: csv, tsv.")

    # If the user specifies a ZIP container, then parse the
    # resource_file as a ZIP file and pass the format parameters
    # into ZIPTableSet so it knows how to parse the inner files.
    if schema_get("container", "name") == "zip":
        # "container = {
        #   "name": "zip"
        # }
        from messytables import ZIPTableSet
        table_set = ZIPTableSet.from_fileobj(
            resource_file,
            inner_data_format=data_format,
            inner_parser_args=data_format_args)
    elif schema_get("container", "name") is not None:
        raise UserError(
            "Invalid container name in schema. Allowed values are: zip.")

    # If format parameters were given explicity, do not use a container.
    # Just parse according to the specified format.
    elif schema_get('format', 'name') is not None:
        table_set = data_format.from_fileobj(resource_file, **data_format_args)

    # If no container and no format was specified, auto-detect everything.
    else:
        # Use the AnyTableSet to guess all parsing parameters.
        from messytables import AnyTableSet
        try:
            table_set = AnyTableSet.from_fileobj(resource_file,
                                                 mimetype=mime_type,
                                                 extension=fileext)
        except Exception as e:
            raise UserError("The file format could not be recognized: %s"
                            % str(e))

    # Provide the container information that may have been guessed.
    if type(table_set).__name__ == "ZIPTableSet":
        schema_set("container", "name", "zip")

    table = table_set.tables[0]

    # Provide the CSV parser settings that may have been guessed.
    if type(table).__name__ == "CSVRowSet":
        schema_set("format", "name",
                   "tsv" if table.delimiter == "\t" else "csv")
        schema_set("format", "delimiter", table.delimiter)
        schema_set("format", "quotechar", table.quotechar)
        schema_set("format", "encoding", table.encoding)

    # Get the column header names and the row offset to the data.
    # Start by guessing.
    from messytables import headers_guess
    offset, headers = headers_guess(table.sample)

    # Overwrite the has_headers and offset values with the schema, if present.
    # NOTE(review): has_headers is read here but never applied below — the
    # schema is always written with "present": True. Confirm whether
    # schema_set("header", "present", ...) should use has_headers instead.
    has_headers = schema_get("header", "present", default=True)
    offset = schema_get("header", "offset", default=offset)

    # Set the header information back into the schema.
    schema_set("header", "present", True)
    schema_set("header", "offset", offset)

    # Override the header names with what is specified in the schema.
    for cidx, col in enumerate(schema_get("columns", default=[])):
        try:
            headers[cidx] = col.get("name", headers[cidx])
        except IndexError:
            pass  # ignore schema problems?

    # Since SQL column names are not case sensitive, prevent any
    # uniqueness clashes by converting all to lowercase. While
    # we're at it, also trim spaces.
    headers = [h.lower().strip() for h in headers]

    # Ensure the headers are valid SQL-ish & datastore column names:
    #   1st character: letter
    #   subsequent characters: letter, number, or underscore
    for i, header in enumerate(headers):
        # To play nice with international characters, convert to ASCII
        # by replacing extended characters with their closest ASCII
        # equivalent where possible.
        header = u"".join(c for c in unicodedata.normalize('NFKD', header)
                          if not unicodedata.combining(c))

        # Replace any invalid characters with "".
        header = re.sub("[^a-z0-9_]", "", header)

        # If there is an invalid 1st character, prepend a valid start.
        if not re.match("^[a-z]", header):
            header = "field_" + header

        # And force to an ASCII byte string, which should be possible by now.
        headers[i] = str(header)

    # TODO: Check that there is not an insane number of columns.
    # That could crash headers_make_unique.

    # Ensure the headers are unique, and not too long. Postgres
    # supports 63 (64?)-char-long col names, but that's ridiculous.
    from messytables import headers_make_unique
    headers = headers_make_unique(headers, max_length=24)

    # Skip the header row.
    # (Add one to begin with content, not the header.)
    from messytables import offset_processor
    table.register_processor(offset_processor(offset + 1))

    # Try to guess the datatypes.
    import messytables.types
    from messytables import type_guess
    datatypes = type_guess(table.sample, [
        messytables.types.StringType,
        messytables.types.IntegerType,
        messytables.types.FloatType,
        messytables.types.DecimalType,
        messytables.types.DateType
    ], strict=True)

    if len(datatypes) != len(headers):
        raise UserError(
            "Could not guess data types. Column header count does not match rows found during type guessing."
        )

    # Map each messytables type to the corresponding datastore type string.
    messytable_datastore_type_mapping = {
        messytables.types.StringType: lambda t: 'text',
        messytables.types.IntegerType: lambda t: 'bigint',
        # 'int' may not be big enough,
        # and type detection may not realize it needs to be big
        messytables.types.FloatType: lambda t: 'float',
        messytables.types.DecimalType: lambda t: 'numeric',
        messytables.types.DateType: lambda t: 'timestamp:' + t.format,
    }
    datatypes = [
        messytable_datastore_type_mapping[type(t)](t) for t in datatypes
    ]  # convert objects to strings

    # Override the datatypes from the schema.
    for cidx, col in enumerate(schema_get("columns", default=[])):
        try:
            datatypes[cidx] = col.get("type", datatypes[cidx])
        except IndexError:
            pass  # ignore schema problems?

    # Provide the header names and types back to the user in the schema.
    schema["columns"] = []
    for i in xrange(len(headers)):
        schema["columns"].append({
            "name": headers[i],
            "type": datatypes[i],
        })

    # Validate that the datatypes are all legit.
    for dt in datatypes:
        if dt.split(":")[0] not in ("text", "int", "bigint", "float", "bool",
                                    "numeric", "date", "time", "timestamp",
                                    "json"):
            raise UserError("Invalid data type in schema: %s" % dt)

    return schema, table