import re
import unicodedata

import messytables  # provides seekable_stream, used below


def from_fileobj(cls, fileobj, mimetype=None, extension=None):
    """Opens whatever sort of file is passed in, using the MIME type
    (e.g. mimetype='text/csv') or file extension (e.g. extension='tsv'),
    or otherwise autodetecting the file format. Consult the source for
    recognized MIME types and file extensions."""
    if mimetype is None:
        import magic
        # Since we need to peek at the start of the stream, make sure we
        # can seek back later. If not, slurp the contents into a StringIO.
        fileobj = messytables.seekable_stream(fileobj)
        header = fileobj.read(1024)
        mimetype = magic.from_buffer(header, mime=True)
        fileobj.seek(0)

    if mimetype in ('application/x-zip-compressed', 'application/zip') \
            or (extension and extension.lower() == 'zip'):
        # Check for ZIP files first: any MIME type or extension hint
        # applies to the content of the inner files, so don't check for
        # those formats before ruling out a ZIP file.
        return ZIPTableSet.from_fileobj(fileobj)

    if mimetype in ('text/csv', 'text/comma-separated-values') \
            or (extension and extension.lower() == 'csv'):
        return CSVTableSet.from_fileobj(fileobj, delimiter=',')

    if mimetype in ('text/tsv', 'text/tab-separated-values') \
            or (extension and extension.lower() == 'tsv'):
        return CSVTableSet.from_fileobj(fileobj, delimiter='\t')

    if mimetype in ('application/ms-excel', 'application/vnd.ms-excel',
                    'application/xls', 'application/excel') \
            or (extension and extension.lower() == 'xls'):
        return XLSTableSet.from_fileobj(fileobj)

    if mimetype == 'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet' \
            or (extension and extension.lower() == 'xlsx'):
        return XLSXTableSet.from_fileobj(fileobj)

    if mimetype:
        raise ValueError("Unrecognized MIME type: " + mimetype)
    if extension:
        raise ValueError("Could not determine MIME type and "
                         "unrecognized extension: " + extension)
    raise ValueError("Could not determine MIME type and no extension given.")
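
# A minimal usage sketch (not part of the original module). It assumes
# from_fileobj is bound as a classmethod on a table-set class such as
# messytables' AnyTableSet, and "data.csv" is a hypothetical input file:
#
#     from messytables import AnyTableSet
#
#     with open("data.csv", "rb") as f:
#         table_set = AnyTableSet.from_fileobj(f, extension="csv")
#         for row in table_set.tables[0]:
#             print row  # each row is a list of messytables Cell objects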
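
# For reference, a sketch of the nested "schema" dict that parse_resource
# (below) accepts as default_schema and fills in. The values are
# illustrative, not from the original source:
#
#     default_schema = {
#         "_format": 1,
#         "container": {"name": "zip"},  # optional; "zip" is the only value
#         "format": {
#             "name": "csv",             # "csv" or "tsv"
#             "delimiter": ",",
#             "quotechar": "\"",
#             "encoding": "utf-8",
#         },
#         "header": {"present": True, "offset": 0},
#         "columns": [
#             {"name": "city", "type": "text"},  # hypothetical column
#         ],
#     }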
def parse_resource(resource_file, mime_type, fileext, default_schema):
    # Given resource data, returns a tuple of:
    #   * the schema used to load the file
    #   * the resource as a table, as given by messytables (an iterator
    #     over rows)

    # Schema defaults. We'll build up the schema with defaults from the
    # actual resource so that it is easy for the data owner to customize
    # the schema later.
    schema = {
        "_format": 1,
    }

    # Update defaults from the provided schema.
    if default_schema:
        schema.update(default_schema)
    else:
        schema["auto"] = True

    # Utility function that's like dict.get() but works on nested dicts,
    # taking a path through the dicts as arguments. Returns None (or the
    # "default" keyword argument) if no value is found.
    # e.g. schema_get('format', 'name')
    # returns schema["format"]["name"], or None if "format" isn't in
    # schema or "name" isn't in schema["format"].
    def schema_get(*path, **kwargs):
        if len(path) < 1:
            raise ValueError("A path is required.")
        v = schema
        for p in path:
            v = v.get(p, {})
        if v == {}:
            v = kwargs.get("default", None)
        return v

    # Utility function that sets a value in a set of nested dicts, taking
    # a path plus a value as arguments.
    # e.g. schema_set('format', 'name', 'zip')
    # is equivalent to:
    #     schema["format"]["name"] = "zip"
    # but creates the intermediate dicts as necessary along the way.
    def schema_set(*path_and_value):
        if len(path_and_value) < 2:
            raise ValueError("A path and a value are required.")
        path = path_and_value[0:-2]
        field = path_and_value[-2]
        value = path_and_value[-1]
        container = schema
        for p in path:
            container = container.setdefault(p, {})
        container[field] = value

    # Parse the resource_file.

    # Get the table data format.
    if schema_get('format', 'name') is None:
        # Auto-detect format.
        from messytables import AnyTableSet as data_format
        data_format_args = {}

    elif schema_get('format', 'name') in ("csv", "tsv"):
        # "format" = {
        #     "name": "csv" | "tsv",
        #     "delimiter": ",",
        #     "quotechar": "\"",
        #     "encoding": "utf-8"
        # }

        # Load as CSV/TSV.
        from messytables import CSVTableSet as data_format

        # Default load parameters.
        data_format_args = {
            "delimiter": "," if schema_get('format', 'name') == "csv" else "\t",
            "quotechar": '"',
            "encoding": None,
        }

        # Override parameters from the schema.
        for n in ("delimiter", "quotechar", "encoding"):
            v = schema_get("format", n)
            if v:
                # The csv module requires strs, not unicode.
                data_format_args[n] = str(v)

    else:
        raise UserError(
            "Invalid format name in schema. Allowed values are: csv, tsv.")

    # If the user specifies a ZIP container, parse the resource_file as a
    # ZIP file and pass the format parameters into ZIPTableSet so it knows
    # how to parse the inner files.
    if schema_get("container", "name") == "zip":
        # "container" = {
        #     "name": "zip"
        # }
        from messytables import ZIPTableSet
        table_set = ZIPTableSet.from_fileobj(
            resource_file,
            inner_data_format=data_format,
            inner_parser_args=data_format_args)

    elif schema_get("container", "name") is not None:
        raise UserError(
            "Invalid container name in schema. Allowed values are: zip.")

    # If format parameters were given explicitly, do not use a container.
    # Just parse according to the specified format.
    elif schema_get('format', 'name') is not None:
        table_set = data_format.from_fileobj(resource_file,
                                             **data_format_args)

    # If no container and no format was specified, auto-detect everything.
    else:
        # Use AnyTableSet to guess all parsing parameters.
        from messytables import AnyTableSet
        try:
            table_set = AnyTableSet.from_fileobj(resource_file,
                                                 mimetype=mime_type,
                                                 extension=fileext)
        except Exception as e:
            raise UserError(
                "The file format could not be recognized: %s" % str(e))

        # Record the container information that may have been guessed.
        if type(table_set).__name__ == "ZIPTableSet":
            schema_set("container", "name", "zip")

    table = table_set.tables[0]

    # Record the CSV parser settings that may have been guessed.
    if type(table).__name__ == "CSVRowSet":
        schema_set("format", "name",
                   "tsv" if table.delimiter == "\t" else "csv")
        schema_set("format", "delimiter", table.delimiter)
        schema_set("format", "quotechar", table.quotechar)
        schema_set("format", "encoding", table.encoding)

    # Get the column header names and the row offset to the data, starting
    # with a guess.
    from messytables import headers_guess
    offset, headers = headers_guess(table.sample)

    # Overwrite the has_headers and offset values with the schema, if
    # present.
    has_headers = schema_get("header", "present", default=True)
    offset = schema_get("header", "offset", default=offset)

    # Set the header information back into the schema.
    schema_set("header", "present", has_headers)
    schema_set("header", "offset", offset)

    # Override the header names with what is specified in the schema.
    for cidx, col in enumerate(schema_get("columns", default=[])):
        try:
            headers[cidx] = col.get("name", headers[cidx])
        except IndexError:
            pass  # ignore schema problems?

    # Since SQL column names are not case sensitive, prevent any
    # uniqueness clashes by converting all to lowercase. While we're at
    # it, also trim spaces.
    headers = [h.lower().strip() for h in headers]

    # Ensure the headers are valid SQL-ish & datastore column names:
    #   1st character: letter
    #   subsequent characters: letter, number, or underscore
    for i, header in enumerate(headers):
        # To play nice with international characters, convert to ASCII by
        # replacing extended characters with their closest ASCII
        # equivalent, where possible.
        header = u"".join(c for c in unicodedata.normalize('NFKD', header)
                          if not unicodedata.combining(c))

        # Replace any remaining invalid characters with "".
        header = re.sub("[^a-z0-9_]", "", header)

        # If the 1st character is invalid, prepend a valid start.
        if not re.match("^[a-z]", header):
            header = "field_" + header

        # And force to an ASCII byte string, which should be possible by
        # now.
        headers[i] = str(header)

    # TODO: Check that there is not an insane number of columns.
    # That could crash headers_make_unique.

    # Ensure the headers are unique and not too long. Postgres supports
    # column names up to 63 characters, but that's ridiculous.
    from messytables import headers_make_unique
    headers = headers_make_unique(headers, max_length=24)

    # Skip the header row.
    # (Add one to begin with content, not the header.)
    from messytables import offset_processor
    table.register_processor(offset_processor(offset + 1))

    # Try to guess the datatypes.
    import messytables.types
    from messytables import type_guess
    datatypes = type_guess(
        table.sample,
        [
            messytables.types.StringType,
            messytables.types.IntegerType,
            messytables.types.FloatType,
            messytables.types.DecimalType,
            messytables.types.DateType,
        ],
        strict=True)

    if len(datatypes) != len(headers):
        raise UserError("Could not guess data types. Column header count "
                        "does not match rows found during type guessing.")

    messytable_datastore_type_mapping = {
        messytables.types.StringType: lambda t: 'text',
        # 'int' may not be big enough, and type detection may not realize
        # it needs to be big.
        messytables.types.IntegerType: lambda t: 'bigint',
        messytables.types.FloatType: lambda t: 'float',
        messytables.types.DecimalType: lambda t: 'numeric',
        messytables.types.DateType: lambda t: 'timestamp:' + t.format,
    }

    # Convert the type objects to strings.
    datatypes = [messytable_datastore_type_mapping[type(t)](t)
                 for t in datatypes]

    # Override the datatypes from the schema.
    for cidx, col in enumerate(schema_get("columns", default=[])):
        try:
            datatypes[cidx] = col.get("type", datatypes[cidx])
        except IndexError:
            pass  # ignore schema problems?

    # Provide the header names and types back to the user in the schema.
    schema["columns"] = []
    for i in xrange(len(headers)):
        schema["columns"].append({
            "name": headers[i],
            "type": datatypes[i],
        })

    # Validate that the datatypes are all legit.
    for dt in datatypes:
        if dt.split(":")[0] not in ("text", "int", "bigint", "float",
                                    "bool", "numeric", "date", "time",
                                    "timestamp", "json"):
            raise UserError("Invalid data type in schema: %s" % dt)

    return schema, table
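
# A minimal usage sketch (illustrative only; UserError is assumed to be an
# exception class defined elsewhere in this module, and "data.csv" is a
# hypothetical input file):
#
#     with open("data.csv", "rb") as f:
#         schema, table = parse_resource(f, "text/csv", "csv", None)
#         print schema["columns"]   # [{"name": ..., "type": ...}, ...]
#         for row in table:
#             print [cell.value for cell in row]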