def __init__(self, name, fileobj, delimiter=None, quotechar=None,
             encoding='utf-8', window=None, doublequote=None,
             lineterminator=None, skipinitialspace=None):
    """Build the row set and eagerly buffer a sample of its first lines.

    ``fileobj`` goes through ``messytables.seekable_stream`` and is then
    wrapped in a ``UTF8Recoder`` using ``encoding``; ``ilines`` produces
    the line iterator kept in ``self.lines``.  At most ``window`` lines
    (1000 if ``window`` is falsy) are pulled from that iterator into
    ``self._sample``.  The dialect arguments (``delimiter``,
    ``quotechar``, ``doublequote``, ``lineterminator``,
    ``skipinitialspace``) are stored exactly as given.
    """
    self.name = name

    # Dialect settings are kept untouched; None stays None here.
    self.delimiter = delimiter
    self.quotechar = quotechar
    self.doublequote = doublequote
    self.lineterminator = lineterminator
    self.skipinitialspace = skipinitialspace
    self.window = window or 1000

    recoded = UTF8Recoder(messytables.seekable_stream(fileobj), encoding)
    self.fileobj = recoded
    self.lines = ilines(recoded)

    # Read ahead up to `window` lines; a shorter input simply ends early.
    sample = self._sample = []
    try:
        for _ in xrange(self.window):
            sample.append(self.lines.next())
    except StopIteration:
        pass

    super(CSVRowSet, self).__init__()
def __init__(self, fileobj, delimiter=None, quotechar=None, name=None,
             encoding=None, window=None):
    """Remember the input stream and the CSV options for later use.

    The stream is stored after passing through
    ``messytables.seekable_stream``; a falsy ``name`` becomes ``'table'``.
    ``delimiter``, ``quotechar``, ``encoding`` and ``window`` are kept
    exactly as given (``None`` is not replaced by a default here).
    """
    stream = messytables.seekable_stream(fileobj)
    self.fileobj = stream
    self.name = name if name else 'table'

    # Parsing options are stored verbatim.
    self.encoding = encoding
    self.window = window
    self.delimiter = delimiter
    self.quotechar = quotechar
def __init__(self, name, fileobj, delimiter=None, quotechar=None,
             encoding="utf-8", window=None, doublequote=None,
             lineterminator=None, skipinitialspace=None):
    """Initialise the row set and pre-read up to ``window`` lines.

    The input is made seekable with ``messytables.seekable_stream``,
    wrapped in ``UTF8Recoder(stream, encoding)`` and split into lines by
    ``ilines``.  ``self._sample`` receives the first ``window`` lines
    (``window`` defaults to 1000 when falsy), or fewer if the input ends
    sooner.  All dialect arguments are stored as passed.
    """
    stream = messytables.seekable_stream(fileobj)
    self.name = name
    self.fileobj = UTF8Recoder(stream, encoding)
    self.lines = ilines(self.fileobj)

    (self.delimiter, self.quotechar, self.doublequote,
     self.lineterminator, self.skipinitialspace) = (
        delimiter, quotechar, doublequote,
        lineterminator, skipinitialspace)
    self.window = window if window else 1000

    # Fill the sample buffer; StopIteration just means a short input.
    self._sample = []
    pull_line = self.lines.next
    try:
        for _unused in xrange(self.window):
            self._sample.append(pull_line())
    except StopIteration:
        pass

    super(CSVRowSet, self).__init__()
def __init__(self, fileobj, delimiter=None, quotechar=None, name=None,
             encoding=None, window=None):
    """Store the input stream together with the CSV settings.

    Falsy values fall back to defaults for three arguments: ``name``
    becomes ``'table'``, ``delimiter`` becomes ``','`` and ``quotechar``
    becomes ``'"'``.  ``encoding`` and ``window`` are kept as given.  The
    stream is stored after ``messytables.seekable_stream`` processed it.
    """
    self.fileobj = messytables.seekable_stream(fileobj)

    # Arguments with a fallback when nothing truthy was supplied.
    self.name = name if name else 'table'
    self.delimiter = delimiter if delimiter else ','
    self.quotechar = quotechar if quotechar else '"'

    # Arguments stored verbatim.
    self.window = window
    self.encoding = encoding
def any_tableset(fileobj, mimetype=None, extension=None):
    """Return the table set that best matches the given file object.

    The format is chosen from ``mimetype`` (e.g. ``'text/csv'``) or from
    ``extension`` (e.g. ``'tsv'``; matched case-insensitively, and a
    leading dot as in ``'.tsv'`` is tolerated).  Only when neither hint is
    supplied is the type sniffed with the ``magic`` library from the first
    1024 bytes of the stream -- which is often wrong, so pass a hint
    whenever one is available.

    Consult the source for recognized MIME types and file extensions.

    Raises:
        ValueError: if the MIME type / extension is not recognized, or no
            usable hint could be determined.  (The table-set constructors
            called here may raise their own errors; earlier documentation
            mentions ``messytables.ReadError`` -- not verifiable from this
            function alone.)
    """
    # Auto-detect only if the caller has offered no clue, because the
    # auto-detection routine is pretty poor.
    if mimetype is None and extension is None:
        import magic
        # Since we need to peek the start of the stream, make sure we can
        # seek back later. If not, slurp in the contents into a StringIO.
        fileobj = messytables.seekable_stream(fileobj)
        header = fileobj.read(1024)
        mimetype = magic.from_buffer(header, mime=True)
        fileobj.seek(0)

    # Normalise the extension once: case-insensitive, optional leading dot
    # (os.path.splitext returns '.csv', not 'csv').
    ext = extension.lower().lstrip('.') if extension else None

    # Do this first because the extension applies to the content type of
    # the inner files, so don't check them before we check for a ZIP file.
    if (mimetype in ('application/x-zip-compressed', 'application/zip')
            or ext == 'zip'):
        return ZIPTableSet(fileobj)

    if mimetype in ('text/csv', 'text/comma-separated-values') or ext == 'csv':
        return CSVTableSet(fileobj)  # guess delimiter

    if mimetype in ('text/tsv', 'text/tab-separated-values') or ext == 'tsv':
        return CSVTableSet(fileobj, delimiter='\t')

    if (mimetype in ('application/ms-excel', 'application/vnd.ms-excel',
                     'application/xls')
            or ext == 'xls'):
        return XLSTableSet(fileobj)

    if (mimetype in (
            'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet',)
            or ext == 'xlsx'):
        return XLSXTableSet(fileobj)

    if mimetype in ('text/html',) or ext in ('htm', 'html'):
        return HTMLTableSet(fileobj)

    # Nothing matched: report the most specific hint we were given.
    if mimetype:
        raise ValueError("Unrecognized MIME type: {mimetype}".format(
            mimetype=mimetype))
    if extension:
        raise ValueError(
            'Could not determine MIME type and unrecognized extension: '
            '{extension}'.format(extension=extension))
    raise ValueError("Could not determine MIME type and no extension given.")
def get_mime(fileobj):
    """Guess the MIME type of ``fileobj`` from its first 4096 bytes.

    The leading bytes are handed to ``magic.from_buffer(..., mime=True)``
    and its answer is returned, with one correction: data that starts
    with the ZIP signature ``PK`` but is reported as
    ``application/vnd.ms-excel`` is returned as the XLSX MIME type.

    NOTE(review): the rewind (``seek(0)``) is performed on the object
    returned by ``messytables.seekable_stream``; whether that is the
    caller's own stream depends on that helper -- confirm.
    """
    import magic

    # Since we need to peek the start of the stream, make sure we can
    # seek back later. If not, slurp in the contents into a StringIO.
    fileobj = messytables.seekable_stream(fileobj)
    header = fileobj.read(4096)
    mimetype = magic.from_buffer(header, mime=True)
    fileobj.seek(0)

    # There's an issue with vnd.ms-excel being returned from XLSX files, too.
    # Compare against a literal of the header's own type: on Python 3 a
    # bytes header never equals the text literal 'PK'.
    zip_signature = b'PK' if isinstance(header, bytes) else 'PK'
    if mimetype == 'application/vnd.ms-excel' and header[:2] == zip_signature:
        return 'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet'
    return mimetype
def __init__(self, fileobj, delimiter=None, quotechar=None, name=None,
             encoding=None, window=None, doublequote=None,
             lineterminator=None, skipinitialspace=None):
    """Keep the input stream and every CSV option on the instance.

    ``fileobj`` is stored after going through
    ``messytables.seekable_stream``.  ``name`` falls back to ``'table'``
    when falsy; every other argument is stored unchanged, including
    ``None``.
    """
    self.fileobj = messytables.seekable_stream(fileobj)
    self.name = name if name else 'table'

    # Options stored verbatim, one attribute per argument.
    verbatim_options = (
        ('delimiter', delimiter),
        ('quotechar', quotechar),
        ('encoding', encoding),
        ('window', window),
        ('doublequote', doublequote),
        ('lineterminator', lineterminator),
        ('skipinitialspace', skipinitialspace),
    )
    for attribute, value in verbatim_options:
        setattr(self, attribute, value)
def get_mime(fileobj):
    """Return the MIME type detected from the start of ``fileobj``.

    Reads up to 4096 bytes, asks ``magic.from_buffer(..., mime=True)`` for
    the type, and seeks the (seekable) stream back to position 0.  When
    ``magic`` reports ``application/vnd.ms-excel`` for data beginning with
    the ZIP signature ``PK``, the XLSX MIME type is returned instead.

    NOTE(review): ``seek(0)`` acts on whatever
    ``messytables.seekable_stream`` returned; confirm that callers reuse
    that object rather than the stream they passed in.
    """
    import magic

    # Since we need to peek the start of the stream, make sure we can
    # seek back later. If not, slurp in the contents into a StringIO.
    fileobj = messytables.seekable_stream(fileobj)
    header = fileobj.read(4096)
    mimetype = magic.from_buffer(header, mime=True)
    fileobj.seek(0)

    # There's an issue with vnd.ms-excel being returned from XLSX files, too.
    # The signature literal must have the same type as the header, otherwise
    # the comparison is always False for bytes input on Python 3.
    if isinstance(header, bytes):
        starts_with_zip_signature = header[:2] == b'PK'
    else:
        starts_with_zip_signature = header[:2] == 'PK'
    if mimetype == 'application/vnd.ms-excel' and starts_with_zip_signature:
        return 'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet'
    return mimetype
def __init__(self, fileobj, delimiter=None, quotechar=None, name=None,
             encoding=None, window=None, doublequote=None,
             lineterminator=None, skipinitialspace=None, **kw):
    """Record the input stream and the CSV options on the instance.

    The stream is stored after ``messytables.seekable_stream`` processed
    it and a falsy ``name`` is replaced by ``'table'``.  The remaining
    named arguments are stored as given.  Extra keyword arguments
    (``**kw``) are accepted and not used by this constructor.
    """
    seekable = messytables.seekable_stream(fileobj)
    self.fileobj = seekable
    self.name = name if name else 'table'

    # Dialect-related options.
    self.delimiter = delimiter
    self.quotechar = quotechar
    self.doublequote = doublequote
    self.lineterminator = lineterminator
    self.skipinitialspace = skipinitialspace

    # Reading-related options.
    self.encoding = encoding
    self.window = window
def get_mime(fileobj):
    """Return the MIME type detected from the first 4096 bytes of ``fileobj``.

    The type reported by ``magic.from_buffer(..., mime=True)`` is returned
    as text, with two corrections that both yield the XLSX MIME type:

    * the type maps to ``'ZIP'`` in ``MIMELOOKUP`` and the header contains
      ``[Content_Types].xml``;
    * the type is ``application/vnd.ms-excel`` but the data starts with
      the ZIP signature ``PK``.

    The header is expected to be bytes (it is searched with bytes
    literals).
    """
    import magic

    # Since we need to peek the start of the stream, make sure we can
    # seek back later. If not, slurp in the contents into a StringIO.
    fileobj = messytables.seekable_stream(fileobj)
    header = fileobj.read(4096)
    mimetype = magic.from_buffer(header, mime=True)
    # Depending on the python-magic version the result is bytes or already
    # text; only bytes need (and support) decoding.
    if isinstance(mimetype, bytes):
        mimetype = mimetype.decode('utf-8')
    fileobj.seek(0)

    if MIMELOOKUP.get(mimetype) == 'ZIP':
        # consider whether it's a Microsoft Office document
        if b"[Content_Types].xml" in header:
            return 'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet'

    # There's an issue with vnd.ms-excel being returned from XLSX files, too.
    if mimetype == 'application/vnd.ms-excel' and header[:2] == b'PK':
        return 'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet'
    return mimetype
def __init__(self, name, fileobj, delimiter=None, quotechar=None,
             encoding='utf-8', window=None):
    """Prepare the row set and buffer the first lines of the input.

    ``fileobj`` is made seekable via ``messytables.seekable_stream`` and
    wrapped in a ``UTF8Recoder`` for ``encoding``; ``ilines`` supplies the
    line iterator stored in ``self.lines``.  Falsy arguments fall back to
    defaults: ``delimiter`` to ``','``, ``quotechar`` to ``'"'`` and
    ``window`` to 1000.  Up to ``window`` lines are then read into
    ``self._sample``.
    """
    self.name = name

    # Settings, with their fallbacks for falsy input.
    self.delimiter = delimiter if delimiter else ','
    self.quotechar = quotechar if quotechar else '"'
    self.window = window if window else 1000

    recoder = UTF8Recoder(messytables.seekable_stream(fileobj), encoding)
    self.fileobj = recoder
    self.lines = ilines(recoder)

    # Pre-read the sample; running out of lines early is not an error.
    buffered = self._sample = []
    try:
        for _ in xrange(self.window):
            buffered.append(self.lines.next())
    except StopIteration:
        pass

    super(CSVRowSet, self).__init__()
def from_fileobj(cls, fileobj, mimetype=None, extension=None):
    """Open whatever sort of file is passed in and return its table set.

    The format is selected by MIME type (e.g. ``mimetype='text/csv'``) or
    by file extension (e.g. ``extension='tsv'``, matched
    case-insensitively).  If ``mimetype`` is ``None`` it is sniffed with
    the ``magic`` library from the first 1024 bytes of the stream, even
    when an extension was given.

    Consult the source for recognized MIME types and file extensions.

    Raises:
        ValueError: if neither the MIME type nor the extension is
            recognized.
    """
    if mimetype is None:
        import magic
        # Since we need to peek the start of the stream, make sure we can
        # seek back later. If not, slurp in the contents into a StringIO.
        fileobj = messytables.seekable_stream(fileobj)
        header = fileobj.read(1024)
        mimetype = magic.from_buffer(header, mime=True)
        fileobj.seek(0)

    # Lower-case the extension once instead of in every branch.
    ext = extension.lower() if extension else None

    # Do this first because the extension applies to the content type of
    # the inner files, so don't check them before we check for a ZIP file.
    if (mimetype in ('application/x-zip-compressed', 'application/zip')
            or ext == 'zip'):
        return ZIPTableSet.from_fileobj(fileobj)

    if mimetype in ('text/csv', 'text/comma-separated-values') or ext == 'csv':
        return CSVTableSet.from_fileobj(fileobj)  # guess delimiter

    if mimetype in ('text/tsv', 'text/tab-separated-values') or ext == 'tsv':
        return CSVTableSet.from_fileobj(fileobj, delimiter='\t')

    if (mimetype in ('application/ms-excel', 'application/vnd.ms-excel',
                     'application/xls')
            or ext == 'xls'):
        return XLSTableSet.from_fileobj(fileobj)

    if (mimetype in (
            'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet',)
            or ext == 'xlsx'):
        return XLSXTableSet.from_fileobj(fileobj)

    # Nothing matched: report the most specific hint available.
    if mimetype:
        raise ValueError("Unrecognized MIME type: " + mimetype)
    if extension:
        raise ValueError("Could not determine MIME type and " +
                         "unrecognized extension: " + extension)
    raise ValueError("Could not determine MIME type and no extension given.")
def from_fileobj(cls, fileobj, mimetype=None, extension=None):
    """Open whatever sort of file is passed in and return its table set.

    The format is selected by MIME type (e.g. ``mimetype='text/csv'``) or
    by file extension (e.g. ``extension='tsv'``, matched
    case-insensitively).  If ``mimetype`` is ``None`` it is sniffed with
    the ``magic`` library from the first 1024 bytes of the stream, even
    when an extension was given.  CSV input is opened with an explicit
    ``','`` delimiter and TSV input with a tab delimiter.

    Consult the source for recognized MIME types and file extensions.

    Raises:
        ValueError: if neither the MIME type nor the extension is
            recognized.
    """
    if mimetype is None:
        import magic
        # Since we need to peek the start of the stream, make sure we can
        # seek back later. If not, slurp in the contents into a StringIO.
        fileobj = messytables.seekable_stream(fileobj)
        header = fileobj.read(1024)
        mimetype = magic.from_buffer(header, mime=True)
        fileobj.seek(0)

    # Lower-case the extension once instead of in every branch.
    ext = extension.lower() if extension else None

    # Do this first because the extension applies to the content type of
    # the inner files, so don't check them before we check for a ZIP file.
    if (mimetype in ('application/x-zip-compressed', 'application/zip')
            or ext == 'zip'):
        return ZIPTableSet.from_fileobj(fileobj)

    if mimetype in ('text/csv', 'text/comma-separated-values') or ext == 'csv':
        return CSVTableSet.from_fileobj(fileobj, delimiter=',')

    if mimetype in ('text/tsv', 'text/tab-separated-values') or ext == 'tsv':
        return CSVTableSet.from_fileobj(fileobj, delimiter='\t')

    if (mimetype in ('application/ms-excel', 'application/vnd.ms-excel',
                     'application/xls', 'application/excel')
            or ext == 'xls'):
        return XLSTableSet.from_fileobj(fileobj)

    if (mimetype in (
            'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet',)
            or ext == 'xlsx'):
        return XLSXTableSet.from_fileobj(fileobj)

    # Nothing matched: report the most specific hint available.
    if mimetype:
        raise ValueError("Unrecognized MIME type: " + mimetype)
    if extension:
        raise ValueError("Could not determine MIME type and " +
                         "unrecognized extension: " + extension)
    raise ValueError("Could not determine MIME type and no extension given.")