import re
import unicodedata

import messytables  # provides seekable_stream, used below


def from_fileobj(cls, fileobj, mimetype=None, extension=None):
    """Opens whatever sort of file is passed in, using the MIME type
    (e.g. mimetype='text/csv') or file extension (e.g. extension='tsv'),
    or otherwise autodetecting the file format. Consult the source for
    recognized MIME types and file extensions."""
    if mimetype is None:
        import magic
        # Since we need to peek at the start of the stream, make sure we
        # can seek back later. If not, slurp the contents into a StringIO.
        fileobj = messytables.seekable_stream(fileobj)
        header = fileobj.read(1024)
        mimetype = magic.from_buffer(header, mime=True)
        fileobj.seek(0)

    if mimetype in ('application/x-zip-compressed', 'application/zip') \
            or (extension and extension.lower() == 'zip'):
        # Check for ZIP files first: any MIME type or extension hint
        # applies to the content of the inner files, so don't check for
        # those formats before ruling out a ZIP file.
        return ZIPTableSet.from_fileobj(fileobj)

    if mimetype in ('text/csv', 'text/comma-separated-values') \
            or (extension and extension.lower() == 'csv'):
        return CSVTableSet.from_fileobj(fileobj, delimiter=',')

    if mimetype in ('text/tsv', 'text/tab-separated-values') \
            or (extension and extension.lower() == 'tsv'):
        return CSVTableSet.from_fileobj(fileobj, delimiter='\t')

    if mimetype in ('application/ms-excel', 'application/vnd.ms-excel',
                    'application/xls', 'application/excel') \
            or (extension and extension.lower() == 'xls'):
        return XLSTableSet.from_fileobj(fileobj)

    if mimetype == 'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet' \
            or (extension and extension.lower() == 'xlsx'):
        return XLSXTableSet.from_fileobj(fileobj)

    if mimetype:
        raise ValueError("Unrecognized MIME type: " + mimetype)
    if extension:
        raise ValueError("Could not determine MIME type and "
                         "unrecognized extension: " + extension)
    raise ValueError("Could not determine MIME type and no extension given.")
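
# A minimal usage sketch (not part of the original module). It assumes
# from_fileobj is bound as a classmethod on a table-set class such as
# messytables' AnyTableSet, and "data.csv" is a hypothetical input file:
#
#     from messytables import AnyTableSet
#
#     with open("data.csv", "rb") as f:
#         table_set = AnyTableSet.from_fileobj(f, extension="csv")
#         for row in table_set.tables[0]:
#             print row  # each row is a list of messytables Cell objects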
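
# For reference, a sketch of the nested "schema" dict that parse_resource
# (below) accepts as default_schema and fills in. The values are
# illustrative, not from the original source:
#
#     default_schema = {
#         "_format": 1,
#         "container": {"name": "zip"},  # optional; "zip" is the only value
#         "format": {
#             "name": "csv",             # "csv" or "tsv"
#             "delimiter": ",",
#             "quotechar": "\"",
#             "encoding": "utf-8",
#         },
#         "header": {"present": True, "offset": 0},
#         "columns": [
#             {"name": "city", "type": "text"},  # hypothetical column
#         ],
#     }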
def parse_resource(resource_file, mime_type, fileext, default_schema):
    # Given resource data, returns a tuple of:
    #   * the schema used to load the file
    #   * the resource as a table, as given by messytables (an iterator
    #     over rows)

    # Schema defaults. We'll build up the schema with defaults from the
    # actual resource so that it is easy for the data owner to customize
    # the schema later.
    schema = {
        "_format": 1,
    }

    # Update defaults from the provided schema.
    if default_schema:
        schema.update(default_schema)
    else:
        schema["auto"] = True

    # Utility function that's like dict.get() but works on nested dicts,
    # taking a path through the dicts as arguments. Returns None (or the
    # "default" keyword argument) if no value is found.
    # e.g. schema_get('format', 'name')
    # returns schema["format"]["name"], or None if "format" isn't in
    # schema or "name" isn't in schema["format"].
    def schema_get(*path, **kwargs):
        if len(path) < 1:
            raise ValueError("A path is required.")
        v = schema
        for p in path:
            v = v.get(p, {})
        if v == {}:
            v = kwargs.get("default", None)
        return v

    # Utility function that sets a value in a set of nested dicts, taking
    # a path plus a value as arguments.
    # e.g. schema_set('format', 'name', 'zip')
    # is equivalent to:
    #     schema["format"]["name"] = "zip"
    # but creates the intermediate dicts as necessary along the way.
    def schema_set(*path_and_value):
        if len(path_and_value) < 2:
            raise ValueError("A path and a value are required.")
        path = path_and_value[0:-2]
        field = path_and_value[-2]
        value = path_and_value[-1]
        container = schema
        for p in path:
            container = container.setdefault(p, {})
        container[field] = value

    # Parse the resource_file.

    # Get the table data format.
    if schema_get('format', 'name') is None:
        # Auto-detect format.
        from messytables import AnyTableSet as data_format
        data_format_args = {}

    elif schema_get('format', 'name') in ("csv", "tsv"):
        # "format" = {
        #     "name": "csv" | "tsv",
        #     "delimiter": ",",
        #     "quotechar": "\"",
        #     "encoding": "utf-8"
        # }

        # Load as CSV/TSV.
        from messytables import CSVTableSet as data_format

        # Default load parameters.
        data_format_args = {
            "delimiter": "," if schema_get('format', 'name') == "csv" else "\t",
            "quotechar": '"',
            "encoding": None,
        }

        # Override parameters from the schema.
        for n in ("delimiter", "quotechar", "encoding"):
            v = schema_get("format", n)
            if v:
                # The csv module requires strs, not unicode.
                data_format_args[n] = str(v)

    else:
        raise UserError(
            "Invalid format name in schema. Allowed values are: csv, tsv.")

    # If the user specifies a ZIP container, parse the resource_file as a
    # ZIP file and pass the format parameters into ZIPTableSet so it knows
    # how to parse the inner files.
    if schema_get("container", "name") == "zip":
        # "container" = {
        #     "name": "zip"
        # }
        from messytables import ZIPTableSet
        table_set = ZIPTableSet.from_fileobj(
            resource_file,
            inner_data_format=data_format,
            inner_parser_args=data_format_args)

    elif schema_get("container", "name") is not None:
        raise UserError(
            "Invalid container name in schema. Allowed values are: zip.")

    # If format parameters were given explicitly, do not use a container.
    # Just parse according to the specified format.
    elif schema_get('format', 'name') is not None:
        table_set = data_format.from_fileobj(resource_file,
                                             **data_format_args)

    # If no container and no format was specified, auto-detect everything.
    else:
        # Use AnyTableSet to guess all parsing parameters.
        from messytables import AnyTableSet
        try:
            table_set = AnyTableSet.from_fileobj(resource_file,
                                                 mimetype=mime_type,
                                                 extension=fileext)
        except Exception as e:
            raise UserError(
                "The file format could not be recognized: %s" % str(e))

        # Record the container information that may have been guessed.
        if type(table_set).__name__ == "ZIPTableSet":
            schema_set("container", "name", "zip")

    table = table_set.tables[0]

    # Record the CSV parser settings that may have been guessed.
    if type(table).__name__ == "CSVRowSet":
        schema_set("format", "name",
                   "tsv" if table.delimiter == "\t" else "csv")
        schema_set("format", "delimiter", table.delimiter)
        schema_set("format", "quotechar", table.quotechar)
        schema_set("format", "encoding", table.encoding)

    # Get the column header names and the row offset to the data, starting
    # with a guess.
    from messytables import headers_guess
    offset, headers = headers_guess(table.sample)

    # Overwrite the has_headers and offset values with the schema, if
    # present.
    has_headers = schema_get("header", "present", default=True)
    offset = schema_get("header", "offset", default=offset)

    # Set the header information back into the schema.
    schema_set("header", "present", has_headers)
    schema_set("header", "offset", offset)

    # Override the header names with what is specified in the schema.
    for cidx, col in enumerate(schema_get("columns", default=[])):
        try:
            headers[cidx] = col.get("name", headers[cidx])
        except IndexError:
            pass  # ignore schema problems?

    # Since SQL column names are not case sensitive, prevent any
    # uniqueness clashes by converting all to lowercase. While we're at
    # it, also trim spaces.
    headers = [h.lower().strip() for h in headers]

    # Ensure the headers are valid SQL-ish & datastore column names:
    #   1st character: letter
    #   subsequent characters: letter, number, or underscore
    for i, header in enumerate(headers):
        # To play nice with international characters, convert to ASCII by
        # replacing extended characters with their closest ASCII
        # equivalent, where possible.
        header = u"".join(c for c in unicodedata.normalize('NFKD', header)
                          if not unicodedata.combining(c))

        # Replace any remaining invalid characters with "".
        header = re.sub("[^a-z0-9_]", "", header)

        # If the 1st character is invalid, prepend a valid start.
        if not re.match("^[a-z]", header):
            header = "field_" + header

        # And force to an ASCII byte string, which should be possible by
        # now.
        headers[i] = str(header)

    # TODO: Check that there is not an insane number of columns.
    # That could crash headers_make_unique.

    # Ensure the headers are unique and not too long. Postgres supports
    # column names up to 63 characters, but that's ridiculous.
    from messytables import headers_make_unique
    headers = headers_make_unique(headers, max_length=24)

    # Skip the header row.
    # (Add one to begin with content, not the header.)
    from messytables import offset_processor
    table.register_processor(offset_processor(offset + 1))

    # Try to guess the datatypes.
    import messytables.types
    from messytables import type_guess
    datatypes = type_guess(
        table.sample,
        [
            messytables.types.StringType,
            messytables.types.IntegerType,
            messytables.types.FloatType,
            messytables.types.DecimalType,
            messytables.types.DateType,
        ],
        strict=True)

    if len(datatypes) != len(headers):
        raise UserError("Could not guess data types. Column header count "
                        "does not match rows found during type guessing.")

    messytable_datastore_type_mapping = {
        messytables.types.StringType: lambda t: 'text',
        # 'int' may not be big enough, and type detection may not realize
        # it needs to be big.
        messytables.types.IntegerType: lambda t: 'bigint',
        messytables.types.FloatType: lambda t: 'float',
        messytables.types.DecimalType: lambda t: 'numeric',
        messytables.types.DateType: lambda t: 'timestamp:' + t.format,
    }

    # Convert the type objects to strings.
    datatypes = [messytable_datastore_type_mapping[type(t)](t)
                 for t in datatypes]

    # Override the datatypes from the schema.
    for cidx, col in enumerate(schema_get("columns", default=[])):
        try:
            datatypes[cidx] = col.get("type", datatypes[cidx])
        except IndexError:
            pass  # ignore schema problems?

    # Provide the header names and types back to the user in the schema.
    schema["columns"] = []
    for i in xrange(len(headers)):
        schema["columns"].append({
            "name": headers[i],
            "type": datatypes[i],
        })

    # Validate that the datatypes are all legit.
    for dt in datatypes:
        if dt.split(":")[0] not in ("text", "int", "bigint", "float",
                                    "bool", "numeric", "date", "time",
                                    "timestamp", "json"):
            raise UserError("Invalid data type in schema: %s" % dt)

    return schema, table
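
# A minimal usage sketch (illustrative only; UserError is assumed to be an
# exception class defined elsewhere in this module, and "data.csv" is a
# hypothetical input file):
#
#     with open("data.csv", "rb") as f:
#         schema, table = parse_resource(f, "text/csv", "csv", None)
#         print schema["columns"]   # [{"name": ..., "type": ...}, ...]
#         for row in table:
#             print [cell.value for cell in row]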