Example #1
def any_tableset(fileobj, mimetype=None, extension=None):
    """Reads any supported table type according to a specified
    MIME type or file extension or automatically detecting the
    type.

    The best-matching TableSet, loaded from the file object, is returned.
    Matching is done by MIME type (e.g. mimetype='text/csv') or file
    extension (e.g. extension='tsv'); otherwise the file format is
    autodetected with the magic library, which looks at the first few
    bytes of the file BUT is often wrong. Consult the source for the
    recognized MIME types and file extensions.

    On error it raises ValueError.
    """
    # Auto-detect only if the caller has offered no clue, because the
    # auto-detection routine is pretty poor.
    if mimetype is None and extension is None:
        import magic
        # Since we need to peek the start of the stream, make sure we can
        # seek back later. If not, slurp in the contents into a StringIO.
        fileobj = messytables.seekable_stream(fileobj)
        header = fileobj.read(1024)
        mimetype = magic.from_buffer(header, mime=True)
        fileobj.seek(0)

    if (mimetype in ('application/x-zip-compressed', 'application/zip')
            or (extension and extension.lower() in ('zip',))):
        # Do this first because the extension applies to the content
        # type of the inner files, so don't check them before we check
        # for a ZIP file.
        return ZIPTableSet(fileobj)

    if (mimetype in ('text/csv', 'text/comma-separated-values') or
            (extension and extension.lower() in ('csv',))):
        return CSVTableSet(fileobj)  # guess delimiter
    if (mimetype in ('text/tsv', 'text/tab-separated-values') or
            (extension and extension.lower() in ('tsv',))):
        return CSVTableSet(fileobj, delimiter='\t')
    if (mimetype in ('application/ms-excel', 'application/vnd.ms-excel',
                     'application/xls')
            or (extension and extension.lower() in ('xls',))):
        return XLSTableSet(fileobj)
    if (mimetype in (
        'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet',)
            or (extension and extension.lower() in ('xlsx',))):
        return XLSXTableSet(fileobj)
    if (mimetype in ('text/html',)
            or (extension and extension.lower() in ('htm', 'html',))):
        return HTMLTableSet(fileobj)
    if mimetype:
        raise ValueError("Unrecognized MIME type: {mimetype}".format(
            mimetype=mimetype))
    if extension:
        raise ValueError("Could not determine MIME type and "
                         "unrecognized extension: {extension}".format(
                             extension=extension))
    raise ValueError("Could not determine MIME type and no extension given.")
Example #2
    def test_read_simple_zip(self):
        fh = horror_fobj('simple.zip')
        table_set = ZIPTableSet(fh)
        row_set = table_set.tables[0]
        assert_equal(7, len(list(row_set)))
        row = list(row_set.sample)[0]
        assert_equal(row[0].value, 'date')
        assert_equal(row[1].value, 'temperature')

        for row in list(row_set):
            assert_equal(3, len(row))
            assert_equal(row[0].type, StringType())
Example #3
    def from_fileobj(cls, fileobj, mimetype=None, extension=None):
        """ Opens whatever sort of file is passed in, using the MIME
        type (e.g mimetype='text/csv') or file extension (e.g.
        extension='tsv'), or otherwise autodetecting the file format.
        Consult the source for recognized MIME types and file
        extensions."""
        if mimetype is None:
            import magic
            # Since we need to peek the start of the stream, make sure we can
            # seek back later. If not, slurp in the contents into a StringIO.
            fileobj = messytables.seekable_stream(fileobj)
            header = fileobj.read(1024)
            mimetype = magic.from_buffer(header, mime=True)
            fileobj.seek(0)

        if (mimetype in ('application/x-zip-compressed', 'application/zip')
                or (extension and extension.lower() in ('zip',))):
            # Do this first because the extension applies to the content
            # type of the inner files, so don't check them before we check
            # for a ZIP file.
            return ZIPTableSet.from_fileobj(fileobj)

        if (mimetype in ('text/csv', 'text/comma-separated-values')
                or (extension and extension.lower() in ('csv',))):
            return CSVTableSet.from_fileobj(fileobj)  # guess delimiter
        if (mimetype in ('text/tsv', 'text/tab-separated-values')
                or (extension and extension.lower() in ('tsv',))):
            return CSVTableSet.from_fileobj(fileobj, delimiter='\t')
        if (mimetype in ('application/ms-excel', 'application/vnd.ms-excel',
                         'application/xls')
                or (extension and extension.lower() in ('xls',))):
            return XLSTableSet.from_fileobj(fileobj)
        if (mimetype in (
            'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet',)
                or (extension and extension.lower() in ('xlsx',))):
            return XLSXTableSet.from_fileobj(fileobj)

        if mimetype:
            raise ValueError("Unrecognized MIME type: " + mimetype)
        if extension:
            raise ValueError("Could not determine MIME type and "
             + "unrecognized extension: " + extension)
        raise ValueError("Could not determine MIME type and no extension given.")
Example #4
    def from_fileobj(cls, fileobj, mimetype=None, extension=None):
        """ Opens whatever sort of file is passed in, using the MIME
        type (e.g mimetype='text/csv') or file extension (e.g.
        extension='tsv'), or otherwise autodetecting the file format.
        Consult the source for recognized MIME types and file
        extensions."""
        if mimetype is None:
            import magic
            # Since we need to peek the start of the stream, make sure we can
            # seek back later. If not, slurp in the contents into a StringIO.
            fileobj = messytables.seekable_stream(fileobj)
            header = fileobj.read(1024)
            mimetype = magic.from_buffer(header, mime=True)
            fileobj.seek(0)

        if (mimetype in ('application/x-zip-compressed', 'application/zip')
                or (extension and extension.lower() in ('zip',))):
            # Do this first because the extension applies to the content
            # type of the inner files, so don't check them before we check
            # for a ZIP file.
            return ZIPTableSet.from_fileobj(fileobj)

        if (mimetype in ('text/csv', 'text/comma-separated-values')
                or (extension and extension.lower() in ('csv',))):
            return CSVTableSet.from_fileobj(fileobj, delimiter=',')
        if (mimetype in ('text/tsv', 'text/tab-separated-values')
                or (extension and extension.lower() in ('tsv',))):
            return CSVTableSet.from_fileobj(fileobj, delimiter='\t')
        if (mimetype in ('application/ms-excel', 'application/vnd.ms-excel',
                         'application/xls', 'application/excel')
                or (extension and extension.lower() in ('xls',))):
            return XLSTableSet.from_fileobj(fileobj)
        if (mimetype in (
            'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet',)
                or (extension and extension.lower() in ('xlsx',))):
            return XLSXTableSet.from_fileobj(fileobj)

        if mimetype:
            raise ValueError("Unrecognized MIME type: " + mimetype)
        if extension:
            raise ValueError("Could not determine MIME type and "
             + "unrecognized extension: " + extension)
        raise ValueError("Could not determine MIME type and no extension given.")
Example #5
def parse_resource(resource_file, mime_type, fileext, default_schema):
	# Given resource data, returns a tuple of
	#  * the schema used to load the file
	#  * the resource as a table, as given by messytables (an iterator over rows) 
	
	# Schema defaults. We'll build up the schema with defaults
	# from the actual resource so that it is easy for the data
	# owner to customize the schema later.
	schema = {
		"_format": 1,
	}
	
	# Update defaults from the provided schema.
	if default_schema:
		schema.update(default_schema)
	else:
		schema["auto"] = True
	
	# Utility function that's like dict.get() but works on nested
	# dicts and takes a path through to dicts as arguments. Returns
	# None if no value is found.
	#   e.g. schema_get('format', 'name')
	#        This returns schema["format"]["name"], or None if
	#        "format" isn't in schema or "name" isn't in schema["format"].
	def schema_get(*path, **kwargs):
		if len(path) < 1:
			raise ValueError()
		v = schema
		for p in path:
			v = v.get(p, {})
		if v == {}:
			v = kwargs.get("default", None)
		return v
	
	# Utility function that sets a value in a set of nested dicts.
	# Takes a path plus a value as arguments.
	#   e.g. schema_set('format', 'name', 'zip')
	#        This is equivalent to:
	#          schema["format"]["name"] = "zip"
	#        but creating dicts as necessary along the way.
	def schema_set(*path_and_value):
		if len(path_and_value) < 2:
			raise ValueError()
		path = path_and_value[0:-2]
		field = path_and_value[-2]
		value = path_and_value[-1]
		container = schema
		for p in path:
			container = container.setdefault(p, {})
		container[field] = value
	
	# Parse the resource_file.
	
	# Get the table data format.
	
	if schema_get('format', 'name') is None:
		# Auto-detect format.
		from messytables import AnyTableSet as data_format
		data_format_args = { }
		
	elif schema_get('format', 'name') in ("csv", "tsv"):
		# "format" = {
		#   "name": "csv" | "tsv",
		#   "delimiter": ",",
		#   "quotechar": "\"",
		#   "encoding": "utf-8"
		# }
		
		# Load as CSV/TSV.
		from messytables import CSVTableSet as data_format
		
		# Default load parameters.
		data_format_args = {
			"delimiter": "," if schema_get('format', 'name') == "csv" else "\t",
			"quotechar": '"',
			"encoding": None,
		}
		
		# Override parameters from the schema.
		for n in ("delimiter", "quotechar", "encoding"):
			v = schema_get("format", n)
			if v:
				data_format_args[n] = str(v)  # csv module requires str, not unicode
		
	else:
		raise UserError("Invalid format name in schema. Allowed values are: csv, tsv.")
		
	# If the user specifies a ZIP container, then parse the
	# resource_file as a ZIP file and pass the format parameters
	# into ZIPTableSet so it knows how to parse the inner files.
	
	if schema_get("container", "name") == "zip":
		# "container = {
		#   "name": "zip"
		# }
		
		from messytables import ZIPTableSet
		table_set = ZIPTableSet.from_fileobj(resource_file,
			inner_data_format=data_format,
			inner_parser_args=data_format_args)

	elif schema_get("container", "name") != None:
		raise UserError("Invalid container name in schema. Allowed values are: zip.")

	# If format parameters were given explicitly, do not use a container.
	# Just parse according to the specified format.

	elif schema_get('format', 'name') is not None:
		table_set = data_format.from_fileobj(resource_file, **data_format_args)
		
	# If no container and no format was specified, auto-detect everything.
	
	else:
		# Use the AnyTableSet to guess all parsing parameters.
		from messytables import AnyTableSet
		try:
			table_set = AnyTableSet.from_fileobj(resource_file, mimetype=mime_type, extension=fileext)
		except Exception as e:
			raise UserError("The file format could not be recognized: %s" % str(e))
		
		# Provide the container information that may have been guessed.
		if type(table_set).__name__ == "ZIPTableSet":
			schema_set("container", "name", "zip")
		
	table = table_set.tables[0]

	# Provide the CSV parser settings that may have been guessed.
	if type(table).__name__ == "CSVRowSet":
		schema_set("format", "name", "tsv" if table.delimiter == "\t" else "csv")
		schema_set("format", "delimiter", table.delimiter)
		schema_set("format", "quotechar", table.quotechar)
		schema_set("format", "encoding", table.encoding)
        
	# Get the column header names and the row offset to the data.
	
	# Start by guessing.
	from messytables import headers_guess, headers_processor
	offset, headers = headers_guess(table.sample)
	
	# Overwrite the has_headers and offset values with the schema, if present.
	has_headers = schema_get("header", "present", default=True)
	offset = schema_get("header", "offset", default=offset)
	
	# Set the header information back into the schema.
	schema_set("header", "present", True)
	schema_set("header", "offset", offset)
	
	# Override the header names with what is specified in the schema.
	for cidx, col in enumerate(schema_get("columns", default=[])):
		try:
			headers[cidx] = col.get("name", headers[cidx])
		except IndexError:
			pass # ignore schema problems?
	
	# Since SQL column names are not case sensitive, prevent any
	# uniqueness clashes by converting all to lowercase. While
	# we're at it, also trim spaces.
	headers = [h.lower().strip() for h in headers]
	
	# Ensure the headers are valid SQL-ish & datastore column names:
	#  1st character: letter
	#  subsequent characters: letter, number, or underscore
	for i, header in enumerate(headers):
		# To play nice with international characters, convert to ASCII
		# by replacing extended characters with their closest ASCII
		# equivalent where possible.
		header = u"".join(c for c in unicodedata.normalize('NFKD', header)
			if not unicodedata.combining(c))
		
		# Replace any invalid characters with "".
		header = re.sub("[^a-z0-9_]", "", header)
		
		# If there is an invalid 1st character, prepend a valid start.
		if not re.match("^[a-z]", header):
			header = "field_" + header
			
		# And force to an ASCII byte string, which should be possible by now.
		headers[i] = str(header)
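		# For example, a header like u"Año de venta" comes out of these
		# steps as "anodeventa": the NFKD pass strips the accent and the
		# regex removes the spaces.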

	# TODO: Check that there is not an insane number of columns.
	# That could crash headers_make_unique. 

	# Ensure the headers are unique, and not too long. Postgres
	# supports 63-byte column names (NAMEDATALEN is 64, counting the
	# trailing NUL), but that's ridiculous.
	from messytables import headers_make_unique
	headers = headers_make_unique(headers, max_length=24)
	
	# Skip the header row.
	# (Add one to begin with content, not the header.)
	from messytables import offset_processor
	table.register_processor(offset_processor(offset + 1))
	
	# Try to guess the datatypes.
	import messytables.types
	from messytables import type_guess, types_processor
	datatypes = type_guess(
		table.sample,
		[
			messytables.types.StringType,
			messytables.types.IntegerType,
			messytables.types.FloatType,
			messytables.types.DecimalType,
			messytables.types.DateType
		],
		strict=True
		)
	 
	if len(datatypes) != len(headers):
		raise UserError("Could not guess data types. Column header count does not match rows found during type guessing.")
	messytable_datastore_type_mapping = {
		messytables.types.StringType: lambda t : 'text',
		messytables.types.IntegerType: lambda t : 'bigint',  # 'int' may not be big enough,
						# and type detection may not realize it needs to be big
		messytables.types.FloatType: lambda t : 'float',
		messytables.types.DecimalType: lambda t : 'numeric',
		messytables.types.DateType: lambda t : 'timestamp:' + t.format,
	}
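	# For example, a DateType whose guessed format is '%Y-%m-%d' is mapped
	# to the datastore type string 'timestamp:%Y-%m-%d'.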
	datatypes = [messytable_datastore_type_mapping[type(t)](t) for t in datatypes] # convert objects to strings
	
	# Override the datatypes from the schema.
	for cidx, col in enumerate(schema_get("columns", default=[])):
		try:
			datatypes[cidx] = col.get("type", datatypes[cidx])
		except IndexError:
			pass # ignore schema problems?
	
	# Provide the header names and types back to the user in the schema.
	schema["columns"] = []
	for i in xrange(len(headers)):
		schema["columns"].append({
			"name": headers[i],
			"type": datatypes[i],
		})
		
	# Validate that the datatypes are all legit.
	for dt in datatypes:
		if dt.split(":")[0] not in ("text", "int", "bigint", "float", "bool", "numeric", "date", "time", "timestamp", "json"):
			raise UserError("Invalid data type in schema: %s" % dt)
			
	return schema, table
Example #6
def parse_resource(resource_file, mime_type, fileext, default_schema):
    # Given resource data, returns a tuple of
    #  * the schema used to load the file
    #  * the resource as a table, as given by messytables (an iterator over rows)

    # Schema defaults. We'll build up the schema with defaults
    # from the actual resource so that it is easy for the data
    # owner to customize the schema later.
    schema = {
        "_format": 1,
    }

    # Update defaults from the provided schema.
    if default_schema:
        schema.update(default_schema)
    else:
        schema["auto"] = True

    # Utility function that's like dict.get() but works on nested
    # dicts and takes a path through to dicts as arguments. Returns
    # None if no value is found.
    #   e.g. schema_get('format', 'name')
    #        This returns schema["format"]["name"], or None if
    #        "format" isn't in schema or "name" isn't in schema["format"].
    def schema_get(*path, **kwargs):
        if len(path) < 1:
            raise ValueError()
        v = schema
        for p in path:
            v = v.get(p, {})
        if v == {}:
            v = kwargs.get("default", None)
        return v

    # Utility function that sets a value in a set of nested dicts.
    # Takes a path plus a value as arguments.
    #   e.g. schema_set('format', 'name', 'zip')
    #        This is equivalent to:
    #          schema["format"]["name"] = "zip"
    #        but creating dicts as necessary along the way.
    def schema_set(*path_and_value):
        if len(path_and_value) < 2:
            raise ValueError()
        path = path_and_value[0:-2]
        field = path_and_value[-2]
        value = path_and_value[-1]
        container = schema
        for p in path:
            container = container.setdefault(p, {})
        container[field] = value

    # Parse the resource_file.

    # Get the table data format.

    if schema_get('format', 'name') is None:
        # Auto-detect format.
        from messytables import AnyTableSet as data_format
        data_format_args = {}

    elif schema_get('format', 'name') in ("csv", "tsv"):
        # "format" = {
        #   "name": "csv" | "tsv",
        #   "delimiter": ",",
        #   "quotechar": "\"",
        #   "encoding": "utf-8"
        # }

        # Load as CSV/TSV.
        from messytables import CSVTableSet as data_format

        # Default load parameters.
        data_format_args = {
            "delimiter": "," if schema_get('format', 'name') == "csv" else "\t",
            "quotechar": '"',
            "encoding": None,
        }

        # Override parameters from the schema.
        for n in ("delimiter", "quotechar", "encoding"):
            v = schema_get("format", n)
            if v:
                data_format_args[n] = str(v)  # csv module requires str, not unicode

    else:
        raise UserError(
            "Invalid format name in schema. Allowed values are: csv, tsv.")

    # If the user specifies a ZIP container, then parse the
    # resource_file as a ZIP file and pass the format parameters
    # into ZIPTableSet so it knows how to parse the inner files.

    if schema_get("container", "name") == "zip":
        # "container = {
        #   "name": "zip"
        # }

        from messytables import ZIPTableSet
        table_set = ZIPTableSet.from_fileobj(
            resource_file,
            inner_data_format=data_format,
            inner_parser_args=data_format_args)

    elif schema_get("container", "name") != None:
        raise UserError(
            "Invalid container name in schema. Allowed values are: zip.")

    # If format parameters were given explicitly, do not use a container.
    # Just parse according to the specified format.

    elif schema_get('format', 'name') is not None:
        table_set = data_format.from_fileobj(resource_file, **data_format_args)

    # If no container and no format was specified, auto-detect everything.

    else:
        # Use the AnyTableSet to guess all parsing parameters.
        from messytables import AnyTableSet
        try:
            table_set = AnyTableSet.from_fileobj(resource_file,
                                                 mimetype=mime_type,
                                                 extension=fileext)
        except Exception as e:
            raise UserError("The file format could not be recognized: %s" %
                            str(e))

        # Provide the container information that may have been guessed.
        if type(table_set).__name__ == "ZIPTableSet":
            schema_set("container", "name", "zip")

    table = table_set.tables[0]

    # Provide the CSV parser settings that may have been guessed.
    if type(table).__name__ == "CSVRowSet":
        schema_set("format", "name",
                   "tsv" if table.delimiter == "\t" else "csv")
        schema_set("format", "delimiter", table.delimiter)
        schema_set("format", "quotechar", table.quotechar)
        schema_set("format", "encoding", table.encoding)

    # Get the column header names and the row offset to the data.

    # Start by guessing.
    from messytables import headers_guess, headers_processor
    offset, headers = headers_guess(table.sample)

    # Overwrite the has_headers and offset values with the schema, if present.
    has_headers = schema_get("header", "present", default=True)
    offset = schema_get("header", "offset", default=offset)

    # Set the header information back into the schema.
    schema_set("header", "present", True)
    schema_set("header", "offset", offset)

    # Override the header names with what is specified in the schema.
    for cidx, col in enumerate(schema_get("columns", default=[])):
        try:
            headers[cidx] = col.get("name", headers[cidx])
        except IndexError:
            pass  # ignore schema problems?

    # Since SQL column names are not case sensitive, prevent any
    # uniqueness clashes by converting all to lowercase. While
    # we're at it, also trim spaces.
    headers = [h.lower().strip() for h in headers]

    # Ensure the headers are valid SQL-ish & datastore column names:
    #  1st character: letter
    #  subsequent characters: letter, number, or underscore
    for i, header in enumerate(headers):
        # To play nice with international characters, convert to ASCII
        # by replacing extended characters with their closest ASCII
        # equivalent where possible.
        header = u"".join(c for c in unicodedata.normalize('NFKD', header)
                          if not unicodedata.combining(c))

        # Replace any invalid characters with "".
        header = re.sub("[^a-z0-9_]", "", header)

        # If there is an invalid 1st character, prepend a valid start.
        if not re.match("^[a-z]", header):
            header = "field_" + header

        # And force to an ASCII byte string, which should be possible by now.
        headers[i] = str(header)

    # TODO: Check that there is not an insane number of columns.
    # That could crash headers_make_unique.

    # Ensure the headers are unique, and not too long. Postgres
    # supports 63-byte column names (NAMEDATALEN is 64, counting the
    # trailing NUL), but that's ridiculous.
    from messytables import headers_make_unique
    headers = headers_make_unique(headers, max_length=24)

    # Skip the header row.
    # (Add one to begin with content, not the header.)
    from messytables import offset_processor
    table.register_processor(offset_processor(offset + 1))

    # Try to guess the datatypes.
    import messytables.types
    from messytables import type_guess, types_processor
    datatypes = type_guess(
        table.sample,
        [
            messytables.types.StringType,
            messytables.types.IntegerType,
            messytables.types.FloatType,
            messytables.types.DecimalType,
            messytables.types.DateType,
        ],
        strict=True)

    if len(datatypes) != len(headers):
        raise UserError(
            "Could not guess data types. Column header count does not match rows found during type guessing."
        )
    messytable_datastore_type_mapping = {
        messytables.types.StringType: lambda t: 'text',
        # 'int' may not be big enough, and type detection may not realize
        # it needs to be big.
        messytables.types.IntegerType: lambda t: 'bigint',
        messytables.types.FloatType: lambda t: 'float',
        messytables.types.DecimalType: lambda t: 'numeric',
        messytables.types.DateType: lambda t: 'timestamp:' + t.format,
    }
    datatypes = [
        messytable_datastore_type_mapping[type(t)](t) for t in datatypes
    ]  # convert objects to strings

    # Override the datatypes from the schema.
    for cidx, col in enumerate(schema_get("columns", default=[])):
        try:
            datatypes[cidx] = col.get("type", datatypes[cidx])
        except IndexError:
            pass  # ignore schema problems?

    # Provide the header names and types back to the user in the schema.
    schema["columns"] = []
    for i in xrange(len(headers)):
        schema["columns"].append({
            "name": headers[i],
            "type": datatypes[i],
        })

    # Validate that the datatypes are all legit.
    for dt in datatypes:
        if dt.split(":")[0] not in ("text", "int", "bigint", "float", "bool",
                                    "numeric", "date", "time", "timestamp",
                                    "json"):
            raise UserError("Invalid data type in schema: %s" % dt)

    return schema, table
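
# A minimal usage sketch (not part of the original source): 'sales.csv' and
# the MIME type/extension arguments are hypothetical, and UserError, re,
# unicodedata and messytables must be provided by the surrounding module.
# Note the function body uses xrange, so this is Python 2 code.
if __name__ == '__main__':
    with open('sales.csv', 'rb') as f:
        schema, table = parse_resource(f, 'text/csv', 'csv', None)
        print(schema["columns"])
        for row in table:
            print([cell.value for cell in row])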