def any_tableset(fileobj, mimetype=None, extension=None):
    """Reads any supported table type according to a specified
    MIME type or file extension, or by automatically detecting the type.

    The best-matching TableSet loaded with the fileobject is returned.

    Matching is done by looking at the type (e.g. mimetype='text/csv') or
    file extension (e.g. extension='tsv'), or otherwise by autodetecting
    the file format with the magic library, which looks at the first few
    bytes of the file BUT is often wrong.

    Consult the source for recognized MIME types and file extensions.

    On error it raises messytables.ReadError
    """
    # Auto-detect if the caller has offered no clue. (Because the
    # auto-detection routine is pretty poor.)
    if mimetype is None and extension is None:
        import magic
        # Since we need to peek the start of the stream, make sure we can
        # seek back later. If not, slurp in the contents into a StringIO.
        fileobj = messytables.seekable_stream(fileobj)
        header = fileobj.read(1024)
        mimetype = magic.from_buffer(header, mime=True)
        fileobj.seek(0)

    if (mimetype in ('application/x-zip-compressed', 'application/zip')
            or (extension and extension.lower() in ('zip',))):
        # Do this first because the extension applies to the content
        # type of the inner files, so don't check them before we check
        # for a ZIP file.
        return ZIPTableSet(fileobj)

    if (mimetype in ('text/csv', 'text/comma-separated-values')
            or (extension and extension.lower() in ('csv',))):
        return CSVTableSet(fileobj)  # guess delimiter

    if (mimetype in ('text/tsv', 'text/tab-separated-values')
            or (extension and extension.lower() in ('tsv',))):
        return CSVTableSet(fileobj, delimiter='\t')

    if (mimetype in ('application/ms-excel', 'application/vnd.ms-excel',
                     'application/xls')
            or (extension and extension.lower() in ('xls',))):
        return XLSTableSet(fileobj)

    if (mimetype in (
            'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet',)
            or (extension and extension.lower() in ('xlsx',))):
        return XLSXTableSet(fileobj)

    if (mimetype in ('text/html',)
            or (extension and extension.lower() in ('htm', 'html',))):
        return HTMLTableSet(fileobj)

    if mimetype:
        raise ValueError("Unrecognized MIME type: {mimetype}".format(
            mimetype=mimetype))
    if extension:
        raise ValueError("Could not determine MIME type and unrecognized "
                         "extension: {extension}".format(extension=extension))
    raise ValueError("Could not determine MIME type and no extension given.")

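# A minimal usage sketch for any_tableset (not from the original source): the
# file name and extension below are assumptions. Passing an explicit extension
# or mimetype lets the function dispatch directly instead of falling back to
# the unreliable magic-based sniffing.
import messytables

with open('report.xls', 'rb') as fh:  # hypothetical spreadsheet
    table_set = messytables.any_tableset(fh, extension='xls')
    row_set = table_set.tables[0]
    for row in row_set.sample:
        print([cell.value for cell in row])
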
def test_strict_type_guessing_with_large_file(self):
    fh = horror_fobj('211.csv')
    rows = CSVTableSet(fh).tables[0]
    offset, headers = headers_guess(rows.sample)
    rows.register_processor(offset_processor(offset + 1))
    types = [StringType, IntegerType, DecimalType, DateUtilType]
    guessed_types = type_guess(rows.sample, types, True)
    assert_equal(len(guessed_types), 96)
    assert_equal(guessed_types, [
        IntegerType(), StringType(), StringType(), StringType(),
        StringType(), StringType(), IntegerType(), StringType(),
        StringType(), StringType(), StringType(), StringType(),
        StringType(), StringType(), StringType(), StringType(),
        StringType(), StringType(), StringType(), StringType(),
        StringType(), StringType(), StringType(), StringType(),
        StringType(), StringType(), StringType(), IntegerType(),
        StringType(), DecimalType(), DecimalType(), StringType(),
        StringType(), StringType(), StringType(), StringType(),
        StringType(), StringType(), StringType(), StringType(),
        StringType(), StringType(), StringType(), StringType(),
        StringType(), StringType(), StringType(), StringType(),
        StringType(), StringType(), StringType(), StringType(),
        StringType(), StringType(), IntegerType(), StringType(),
        StringType(), StringType(), StringType(), StringType(),
        StringType(), StringType(), StringType(), StringType(),
        StringType(), StringType(), StringType(), StringType(),
        StringType(), StringType(), IntegerType(), StringType(),
        StringType(), StringType(), StringType(), StringType(),
        StringType(), StringType(), StringType(), StringType(),
        StringType(), StringType(), StringType(), StringType(),
        StringType(), StringType(), StringType(), StringType(),
        StringType(), DateUtilType(), DateUtilType(), DateUtilType(),
        DateUtilType(), StringType(), StringType(), StringType()])

def get_column_types(data: io.BytesIO) -> Tuple[List[str], List[str]]:
    """derive the column types

    Using messytables' CSV API, attempt to derive the column types based on a
    best-guess of a sample of the rows.

    This is still a WIP due to the parlous state of the DV360/CM CSV data
    formats in general.

    Arguments:
        data {io.BytesIO} -- sample of the CSV file

    Returns:
        (List[str], List[str]) -- tuple of list of header names and
                                  list of column types
    """
    table_set = CSVTableSet(data)
    row_set = table_set.tables[0]
    offset, headers = headers_guess(row_set.sample)
    logging.info(headers)

    row_set.register_processor(headers_processor(headers))
    row_set.register_processor(offset_processor(offset + 1))

    types = type_guess(row_set.sample, strict=True)
    logging.info(types)

    return (headers, types)

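# A hedged call sketch for get_column_types (not part of the original module):
# the inline CSV bytes are invented, and the guessed types in the trailing
# comment are only what messytables would typically return for such values.
import io

sample = io.BytesIO(b"campaign_id,campaign,spend\n"
                    b"1,Launch,100.50\n"
                    b"2,Retention,75.00\n")
headers, types = get_column_types(sample)
# headers -> ['campaign_id', 'campaign', 'spend']
# types   -> typically [IntegerType(), StringType(), DecimalType()]
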
def test_read_encoded_csv(self):
    fh = horror_fobj('utf-16le_encoded.csv')
    table_set = CSVTableSet(fh)
    row_set = table_set.tables[0]
    assert_equal(328, len(list(row_set)))
    row = list(row_set.sample)[0]
    assert_equal(row[1].value, 'Organisation_name')

def test_file_with_few_strings_among_integers(self):
    fh = horror_fobj('mixedGLB.csv')
    rows = CSVTableSet(fh).tables[0]
    offset, headers = headers_guess(rows.sample)
    rows.register_processor(offset_processor(offset + 1))
    types = [StringType, IntegerType, DecimalType, DateUtilType]
    guessed_types = type_guess(rows.sample, types, True)
    assert_equal(len(guessed_types), 19)
    print guessed_types
    assert_equal(guessed_types, [
        IntegerType(), IntegerType(), IntegerType(), IntegerType(),
        IntegerType(), IntegerType(), StringType(), StringType(),
        StringType(), StringType(), StringType(), StringType(),
        StringType(), StringType(), StringType(), StringType(),
        IntegerType(), StringType(), StringType()])

def test_null_process(self):
    fh = horror_fobj('null.csv')
    table_set = CSVTableSet(fh)
    row_set = table_set.tables[0]
    row_set.register_processor(null_processor(['null']))
    data = list(row_set)
    nones = [[x.value is None for x in row] for row in data]
    assert_equal(nones[0], [False, True, False, False])
    assert_equal(nones[1], [False, False, False, True])
    assert_equal(nones[2], [False, True, False, False])

    types = type_guess(row_set.sample, strict=True)
    expected_types = [IntegerType(), IntegerType(),
                      IntegerType(), IntegerType()]
    assert_equal(types, expected_types)

    row_set.register_processor(types_processor(types))

    # after applying the types, '' should become None for int columns
    data = list(row_set)
    nones = [[x.value is None for x in row] for row in data]
    assert_equal(nones[0], [False, True, False, False])
    assert_equal(nones[1], [False, False, False, True])
    assert_equal(nones[2], [False, True, True, True])

def test_apply_null_values(self):
    fh = horror_fobj('null.csv')
    table_set = CSVTableSet(fh)
    row_set = table_set.tables[0]
    types = type_guess(row_set.sample, strict=True)
    expected_types = [IntegerType(), StringType(),
                      IntegerType(), StringType()]
    assert_equal(types, expected_types)

    row_set.register_processor(types_processor(types))
    data = list(row_set)

    # treat null as non empty text and 0 as non empty integer
    assert [x.empty for x in data[0]] == [False, False, False, False]
    assert [x.empty for x in data[1]] == [False, False, False, False]
    assert [x.empty for x in data[2]] == [False, False, True, True]
    assert [x.empty for x in data[3]] == [False, False, False, False]
    assert [x.empty for x in data[4]] == [False, False, False, True]
    assert [x.empty for x in data[5]] == [False, False, False, True]

    # we expect None for Integers and "" for empty strings in CSV
    assert [x.value for x in data[2]] == [3, "null", None, ""], data[2]

def test_overriding_sniffed(self):
    # semicolon separated values
    fh = horror_fobj('simple.csv')
    table_set = CSVTableSet(fh, delimiter=";")
    row_set = table_set.tables[0]
    assert_equal(7, len(list(row_set)))
    row = list(row_set.sample)[0]
    assert_equal(len(row), 1)

def test_rowset_as_schema(self):
    from StringIO import StringIO as sio
    ts = CSVTableSet(sio('''name,dob\nmk,2012-01-02\n'''))
    rs = ts.tables[0]
    jts = rowset_as_jts(rs).as_dict()
    assert_equal(jts['fields'], [
        {'type': 'string', 'id': u'name', 'label': u'name'},
        {'type': 'date', 'id': u'dob', 'label': u'dob'}])

def test_guess_headers(self):
    fh = horror_fobj('weird_head_padding.csv')
    table_set = CSVTableSet(fh)
    row_set = table_set.tables[0]
    offset, headers = headers_guess(row_set.sample)
    row_set.register_processor(headers_processor(headers))
    row_set.register_processor(offset_processor(offset + 1))
    data = list(row_set)
    assert 'Frauenheilkunde' in data[9][0].value, data[9][0].value

    fh = horror_fobj('weird_head_padding.csv')
    table_set = CSVTableSet(fh)
    row_set = table_set.tables[0]
    row_set.register_processor(headers_processor(['foo', 'bar']))
    data = list(row_set)
    assert 'foo' in data[12][0].column, data[12][0]
    assert 'Chirurgie' in data[12][0].value, data[12][0].value

def test_non_strict_guessing_handles_padding(self):
    csv_file = StringIO.StringIO('''
        1,   , 2
        2,   , 1.1
        foo,   , 1500''')
    rows = CSVTableSet(csv_file).tables[0]
    guessed_types = type_guess(rows.sample, strict=False)
    assert_equal(len(guessed_types), 3)
    assert_equal(guessed_types,
                 [IntegerType(), StringType(), DecimalType()])

def test_read_head_offset_csv(self):
    fh = horror_fobj('simple.csv')
    table_set = CSVTableSet(fh)
    row_set = table_set.tables[0]
    offset, headers = headers_guess(row_set.sample)
    assert_equal(offset, 0)
    row_set.register_processor(offset_processor(offset + 1))
    data = list(row_set.sample)
    assert_equal(int(data[0][1].value), 1)
    data = list(row_set)
    assert_equal(int(data[0][1].value), 1)

def test_strict_guessing_handles_padding(self):
    csv_file = io.BytesIO(b'''
        1,   , 2
        2,   , 1.1
        foo,   , 1500''')
    rows = CSVTableSet(csv_file).tables[0]
    guessed_types = type_guess(rows.sample, strict=True)
    assert_equal(len(guessed_types), 3)
    assert_equal(guessed_types,
                 [StringType(), StringType(), DecimalType()])

def test_read_simple_tsv(self):
    fh = horror_fobj('example.tsv')
    table_set = CSVTableSet(fh)
    row_set = table_set.tables[0]
    assert_equal(141, len(list(row_set)))
    row = list(row_set.sample)[0]
    assert_equal(row[0].value, 'hour')
    assert_equal(row[1].value, 'expr1_0_imp')
    for row in list(row_set):
        assert_equal(17, len(row))
        assert_equal(row[0].type, StringType())

def test_http_csv_encoding(self):
    url = 'http://www.messytables.org/static/utf-16le_encoded.csv'
    httpretty.register_uri(httpretty.GET, url,
                           body=horror_fobj('utf-16le_encoded.csv').read(),
                           content_type="application/csv")
    fh = urlopen(url)
    table_set = CSVTableSet(fh)
    row_set = table_set.tables[0]
    data = list(row_set)
    assert_equal(328, len(data))

def test_read_encoded_characters_csv(self):
    fh = horror_fobj('characters.csv')
    table_set = CSVTableSet(fh)
    row_set = table_set.tables[0]
    offset, headers = headers_guess(row_set.sample)
    row_set.register_processor(headers_processor(headers))
    row_set.register_processor(offset_processor(offset + 1))
    data = list(row_set)
    assert_equal(382, len(data))
    assert_equal(data[0][2].value, u'雲嘉南濱海國家風景區管理處')
    assert_equal(data[-1][2].value, u'沈光文紀念廳')

def test_read_simple_csv(self):
    fh = horror_fobj('simple.csv')
    table_set = CSVTableSet(fh)
    row_set = table_set.tables[0]
    assert_equal(7, len(list(row_set)))
    row = list(row_set.sample)[0]
    assert_equal(row[0].value, 'date')
    assert_equal(row[1].value, 'temperature')
    for row in list(row_set):
        assert_equal(3, len(row))
        assert_equal(row[0].type, StringType())

def connect(self, host=None, port=None, database=None, username=None,
            password=None, file=None):
    # TODO: mysql, pymssql, csv, sqlite3, pymongo, cx_Oracle
    self.database = database
    conn_string = ''
    if self.engine == 'psycopg2':
        if database:
            conn_string += "dbname='%s' " % database
        if username:
            conn_string += "user='%s' " % username
        if host:
            conn_string += "host='%s' " % host
        if port:
            conn_string += "port='%s' " % port
        if password:
            conn_string += "password='%s' " % password
        self.conn = psycopg2.connect(conn_string)
    elif self.engine == 'pymssql':
        self.conn = pymssql.connect(host, username, password, database,
                                    port=port, as_dict=True,
                                    charset='LATIN1')
    elif self.engine == 'csv':
        # https://messytables.readthedocs.io/en/latest/
        fh = StringIO.StringIO(self.data)
        #dialect = csv.Sniffer().sniff(f.read(1024))
        #f.seek(0)
        #self.conn = csv.DictReader(f, dialect=dialect)
        #fh = open('messy.csv', 'rb')

        # Load a file object:
        table_set = CSVTableSet(fh)
        row_set = table_set.tables[0]
        offset, headers = headers_guess(row_set.sample)
        row_set.register_processor(headers_processor(headers))
        row_set.register_processor(offset_processor(offset + 1))
        types = type_guess(row_set.sample, strict=True)
        row_set.register_processor(types_processor(types))
        self.conn = row_set
    return self.conn

def test_guessing_uses_first_in_case_of_tie(self):
    csv_file = StringIO.StringIO('''
        2
        1.1
        1500''')
    rows = CSVTableSet(csv_file).tables[0]

    guessed_types = type_guess(
        rows.sample, types=[DecimalType, IntegerType], strict=False)
    assert_equal(guessed_types, [DecimalType()])

    guessed_types = type_guess(
        rows.sample, types=[IntegerType, DecimalType], strict=False)
    assert_equal(guessed_types, [IntegerType()])

def test_read_simple_psv(self):
    # pipe/vertical bar ("|") separated values
    fh = horror_fobj('simple.psv')
    table_set = CSVTableSet(fh)
    row_set = table_set.tables[0]
    assert_equal(29, len(list(row_set)))
    row = list(row_set.sample)[0]
    assert_equal(row[0].value, 'Year')
    assert_equal(row[1].value, 'New dwellings')
    for row in list(row_set):
        assert_equal(6, len(row))
        assert_equal(row[0].type, StringType())

def test_http_csv_requests(self):
    url = 'http://www.messytables.org/static/long.csv'
    httpretty.register_uri(httpretty.GET, url,
                           body=horror_fobj('long.csv').read(),
                           content_type="application/csv")
    r = requests.get(url, stream=True)
    # no full support for non blocking version yet, use urllib2
    fh = io.BytesIO(r.raw.read())
    table_set = CSVTableSet(fh, encoding='utf-8')
    row_set = table_set.tables[0]
    data = list(row_set)
    assert_equal(4000, len(data))

def test_type_guess(self):
    csv_file = StringIO.StringIO('''
        1, 2012/2/12, 2, 02 October 2011, yes, 1
        2, 2012/2/12, 2, 02 October 2011, true, 1
        2.4, 2012/2/12, 1, 1 May 2011, no, 0
        foo, bar, 1000, , false, 0
        4.3, , 42, 24 October 2012,,
        , 2012/2/12, 21, 24 December 2013, true, 1''')
    rows = CSVTableSet(csv_file).tables[0]
    guessed_types = type_guess(rows.sample)

    assert_equal(guessed_types, [
        DecimalType(), DateType('%Y/%m/%d'), IntegerType(),
        DateType('%d %B %Y'), BoolType(), BoolType()])

def csvParse(csv_file_path):
    fh = open(csv_file_path, 'rb')

    # Load a file object:
    table_set = CSVTableSet(fh)
    row_set = table_set.tables[0]

    # guess header names and the offset of the header:
    offset, headers = headers_guess(row_set.sample)
    row_set.register_processor(headers_processor(headers))

    # add one to begin with content, not the header:
    row_set.register_processor(offset_processor(offset + 1))

    # guess column types:
    types = type_guess(row_set.sample, strict=True)
    row_set.register_processor(types_processor(types))

    return row_set, headers, offset, types

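# Assumed call site for csvParse (the path is a placeholder): the returned
# row_set already has the headers, offset and types processors registered,
# so iterating it yields typed cells.
row_set, headers, offset, types = csvParse('/tmp/data.csv')
print(headers)
print(types)
for row in row_set:
    print([cell.value for cell in row])
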
def test_read_type_guess_simple(self):
    fh = horror_fobj('simple.csv')
    table_set = CSVTableSet(fh)
    row_set = table_set.tables[0]
    types = type_guess(row_set.sample)
    expected_types = [DateType("%Y-%m-%d"), IntegerType(), StringType()]
    assert_equal(types, expected_types)

    row_set.register_processor(types_processor(types))
    data = list(row_set)
    header_types = [c.type for c in data[0]]
    assert_equal(header_types, [StringType()] * 3)
    row_types = [c.type for c in data[2]]
    assert_equal(expected_types, row_types)

def test_read_head_padding_csv(self):
    fh = horror_fobj('weird_head_padding.csv')
    table_set = CSVTableSet(fh)
    row_set = table_set.tables[0]
    offset, headers = headers_guess(row_set.sample)
    assert 11 == len(headers), headers
    assert_equal('1985', headers[1].strip())
    row_set.register_processor(headers_processor(headers))
    row_set.register_processor(offset_processor(offset + 1))
    data = list(row_set.sample)
    for row in row_set:
        assert_equal(11, len(row))
    value = data[1][0].value.strip()
    assert value == u'Gefäßchirurgie', value

def test_read_complex_csv(self):
    fh = horror_fobj('complex.csv')
    table_set = CSVTableSet(fh)
    row_set = table_set.tables[0]
    assert_equal(4, len(list(row_set)))
    row = list(row_set.sample)[0]
    assert_equal(row[0].value, 'date')
    assert_equal(row[1].value, 'another date')
    assert_equal(row[2].value, 'temperature')
    assert_equal(row[3].value, 'place')
    for row in list(row_set):
        assert_equal(4, len(row))
        assert_equal(row[0].type, StringType())

def test_type_guess_strict(self):
    import locale
    locale.setlocale(locale.LC_ALL, 'en_GB.UTF-8')
    csv_file = StringIO.StringIO('''
        1, 2012/2/12, 2, 2,02 October 2011,"100.234354"
        2, 2012/2/12, 1.1, 0,1 May 2011,"100,000,000.12"
        foo, bar, 1500, 0,,"NaN"
        4, 2012/2/12, 42,"-2,000",24 October 2012,"42"
        ,,,,,''')
    rows = CSVTableSet(csv_file).tables[0]
    guessed_types = type_guess(rows.sample, strict=True)

    assert_equal(guessed_types, [
        StringType(), StringType(), DecimalType(), IntegerType(),
        DateType('%d %B %Y'), DecimalType()])

def csvimport_table(name):
    from messytables import CSVTableSet, type_guess
    from messytables import types_processor, headers_guess
    from messytables import headers_processor, offset_processor
    from spendb.etl.extract import parse_table

    row_set = CSVTableSet(data_fixture(name)).tables[0]
    offset, headers = headers_guess(row_set.sample)
    row_set.register_processor(headers_processor(headers))
    row_set.register_processor(offset_processor(offset + 1))
    types = type_guess(row_set.sample, strict=True)
    row_set.register_processor(types_processor(types))

    rows = []
    for num_rows, (fields, row, samples) in enumerate(parse_table(row_set)):
        rows.append(row)

    return fields, rows

def _get_table_columns(self, csv_file_path: str) -> zip:
    """
    Reads the csv file and tries to guess the type of each column using the
    messytables library. The type can be 'Integer', 'Decimal', 'String' or
    'Bool'.

    :param csv_file_path: path to the csv file with content in it
    :return: a zip object where each tuple has two elements: the first is the
             column name and the second is the type
    """
    with gzip.open(csv_file_path, 'rb') as f:
        table_set = CSVTableSet(f)
        row_set = table_set.tables[0]
        offset, headers = headers_guess(row_set.sample)
        row_set.register_processor(headers_processor(headers))
        row_set.register_processor(offset_processor(offset + 1))
        types = list(map(jts.celltype_as_string,
                         type_guess(row_set.sample, strict=True)))
        return zip(headers, types)

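# Assumed call site for _get_table_columns: 'loader' stands in for whatever
# object exposes the method, and the gzipped CSV path plus the mapping shown
# in the comment are hypothetical.
columns = dict(loader._get_table_columns('/tmp/export.csv.gz'))
# e.g. {'id': 'integer', 'name': 'string', 'price': 'number'}
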
def _guess_csv_datatype(fh):
    table_set = CSVTableSet(fh)
    row_set = table_set.tables[0]
    offset, headers = headers_guess(row_set.sample)
    logger.info("(offset, headers) = ({}, {})".format(offset, headers))

    row_set.register_processor(headers_processor(headers))
    row_set.register_processor(offset_processor(offset + 1))
    types = type_guess(row_set.sample, strict=True)
    row_set.register_processor(types_processor(types))

    counter = 0
    for row in row_set:
        logger.info(row)
        counter += 1
        if counter >= 32:
            break

    d = {h: t for h, t in zip(headers, types)}
    logger.info(d)
    return d