def test_guessing_uses_first_in_case_of_tie(self):
    csv_file = StringIO.StringIO('''
        2
        1.1
        1500''')
    rows = CSVTableSet(csv_file).tables[0]
    guessed_types = type_guess(
        rows.sample, types=[DecimalType, IntegerType], strict=False)
    assert_equal(guessed_types, [DecimalType()])
    guessed_types = type_guess(
        rows.sample, types=[IntegerType, DecimalType], strict=False)
    assert_equal(guessed_types, [IntegerType()])
def analyze_csv(url, sample=1000):
    try:
        fileobj = urlopen(url)
        row_set = CSVRowSet('data', fileobj, window=sample)
        sample = list(row_set.sample)
        headers, sample = sample[0], sample[1:]
        # values = frequent_values(sample)
        types = type_guess(sample[500:], types=LIMITED_TYPES)
        mapping = {}
        for header, type_ in zip(headers, types):
            type_ = repr(type_).lower()
            name = slugify(header.value).lower()
            meta = {
                'label': header.value,
                'column': header.value,
                'datatype': type_
            }
            if type_ in ['decimal', 'integer', 'float']:
                meta['type'] = 'measure'
                meta['datatype'] = 'float'
            elif type_.startswith('date'):
                meta['type'] = 'date'
                meta['datatype'] = 'date'
            else:
                meta['type'] = 'attribute'
            mapping[name] = meta
        return {'columns': [h.value for h in headers],
                'mapping': mapping}
    except Exception as e:
        return {'error': unicode(e)}
def test_null_process(self):
    fh = horror_fobj('null.csv')
    table_set = CSVTableSet(fh)
    row_set = table_set.tables[0]
    row_set.register_processor(null_processor(['null']))
    data = list(row_set)
    nones = [[x.value is None for x in row] for row in data]
    assert_equal(nones[0], [False, True, False, False])
    assert_equal(nones[1], [False, False, False, True])
    assert_equal(nones[2], [False, True, False, False])

    types = type_guess(row_set.sample, strict=True)
    expected_types = [IntegerType(), IntegerType(),
                      IntegerType(), IntegerType()]
    assert_equal(types, expected_types)

    row_set.register_processor(types_processor(types))

    # after applying the types, '' should become None for int columns
    data = list(row_set)
    nones = [[x.value is None for x in row] for row in data]
    assert_equal(nones[0], [False, True, False, False])
    assert_equal(nones[1], [False, False, False, True])
    assert_equal(nones[2], [False, True, True, True])
def test_apply_null_values(self):
    fh = horror_fobj('null.csv')
    table_set = CSVTableSet(fh)
    row_set = table_set.tables[0]
    types = type_guess(row_set.sample, strict=True)
    expected_types = [IntegerType(), StringType(),
                      IntegerType(), StringType()]
    assert_equal(types, expected_types)

    row_set.register_processor(types_processor(types))
    data = list(row_set)

    # treat null as non empty text and 0 as non empty integer
    assert [x.empty for x in data[0]] == [False, False, False, False]
    assert [x.empty for x in data[1]] == [False, False, False, False]
    assert [x.empty for x in data[2]] == [False, False, True, True]
    assert [x.empty for x in data[3]] == [False, False, False, False]
    assert [x.empty for x in data[4]] == [False, False, False, True]
    assert [x.empty for x in data[5]] == [False, False, False, True]

    # we expect None for Integers and "" for empty strings in CSV
    assert [x.value for x in data[2]] == [3, "null", None, ""], data[2]
def main(basic_config_file, batch_config_file):
    with open(basic_config_file, "r") as f:
        base_settings = yaml.load(f)

    if batch_config_file:
        # RUN MANY
        # parse csv into a list of settings-dicts
        import messytables
        with open(batch_config_file, "rb") as f:
            row_set = messytables.CSVRowSet("", f)
            offset, headers = messytables.headers_guess(row_set.sample)
            row_set.register_processor(messytables.headers_processor(headers))
            row_set.register_processor(messytables.offset_processor(offset + 1))
            types = messytables.type_guess(row_set.sample, strict=True)
            row_set.register_processor(messytables.types_processor(types))
            settings_list = row_set.dicts()
        name = batch_config_file.replace(".csv", "")
        run_many(settings_list, name, base_settings=base_settings)
    else:
        # RUN ONE
        # parse yaml into a settings-dict
        settings_file = os.path.join(base_settings["out_dir"], "settings.yml")
        with open(settings_file, "w") as f:
            yaml.dump(base_settings, f)
        training_log, exit_status = run_one(**base_settings)
        training_log_file = os.path.join(base_settings["out_dir"],
                                         "training_log.csv")
        training_log.to_csv(training_log_file)
        stats = compute_final_stats(training_log)
        stats["exit_status"] = exit_status
        training_stats_file = os.path.join(base_settings["out_dir"],
                                           "training_stats.yml")
        with open(training_stats_file, "w") as f:
            yaml.dump(stats, f)
def analyze_csv(url, sample=1000):
    try:
        fileobj = urlopen(url)
        row_set = CSVRowSet('data', fileobj, window=sample)
        sample = list(row_set.sample)
        headers, sample = sample[0], sample[1:]
        # values = frequent_values(sample)
        types = type_guess(sample[500:], types=LIMITED_TYPES)
        mapping = {}
        for header, type_ in zip(headers, types):
            type_ = repr(type_).lower()
            name = slugify(header.value).lower()
            meta = {
                'label': header.value,
                'column': header.value,
                'datatype': type_
            }
            if type_ in ['decimal', 'integer', 'float']:
                meta['type'] = 'measure'
                meta['datatype'] = 'float'
            elif type_.startswith('date'):
                meta['type'] = 'date'
                meta['datatype'] = 'date'
            else:
                meta['type'] = 'attribute'
            mapping[name] = meta
        return {'columns': [h.value for h in headers],
                'mapping': mapping}
    except Exception as e:
        log.exception(e)
        return {'error': unicode(e)}
def get_column_types(data: io.BytesIO) \
        -> Tuple[List[str], List[types.CellType]]:
    """derive the column types

    Using messytables' CSV API, attempt to derive the column types based on
    a best-guess of a sample of the rows.

    This is still a WIP due to the parlous state of the DV360/CM CSV data
    formats in general

    Arguments:
        data (io.BytesIO):  sample of the CSV file

    Returns:
        (List[str], List[types.CellType]): tuple of list of header names
                                           and list of column types
    """
    table_set = messytables.CSVTableSet(data)
    row_set = table_set.tables[0]
    offset, csv_headers = messytables.headers_guess(row_set.sample)
    row_set.register_processor(messytables.headers_processor(csv_headers))
    row_set.register_processor(messytables.offset_processor(offset + 1))
    csv_types = messytables.type_guess(row_set.sample, strict=True)

    return (csv_headers, csv_types)
def test_strict_type_guessing_with_large_file(self):
    fh = horror_fobj('211.csv')
    rows = CSVTableSet(fh).tables[0]
    offset, headers = headers_guess(rows.sample)
    rows.register_processor(offset_processor(offset + 1))
    types = [StringType, IntegerType, DecimalType, DateUtilType]
    guessed_types = type_guess(rows.sample, types, True)
    assert_equal(len(guessed_types), 96)
    assert_equal(guessed_types, [
        IntegerType(), StringType(), StringType(), StringType(),
        StringType(), StringType(), IntegerType(), StringType(),
        StringType(), StringType(), StringType(), StringType(),
        StringType(), StringType(), StringType(), StringType(),
        StringType(), StringType(), StringType(), StringType(),
        StringType(), StringType(), StringType(), StringType(),
        StringType(), StringType(), StringType(), IntegerType(),
        StringType(), DecimalType(), DecimalType(), StringType(),
        StringType(), StringType(), StringType(), StringType(),
        StringType(), StringType(), StringType(), StringType(),
        StringType(), StringType(), StringType(), StringType(),
        StringType(), StringType(), StringType(), StringType(),
        StringType(), StringType(), StringType(), StringType(),
        StringType(), StringType(), IntegerType(), StringType(),
        StringType(), StringType(), StringType(), StringType(),
        StringType(), StringType(), StringType(), StringType(),
        StringType(), StringType(), StringType(), StringType(),
        StringType(), StringType(), IntegerType(), StringType(),
        StringType(), StringType(), StringType(), StringType(),
        StringType(), StringType(), StringType(), StringType(),
        StringType(), StringType(), StringType(), StringType(),
        StringType(), StringType(), StringType(), StringType(),
        StringType(), DateUtilType(), DateUtilType(), DateUtilType(),
        DateUtilType(), StringType(), StringType(), StringType()])
def test_file_with_few_strings_among_integers(self):
    fh = horror_fobj('mixedGLB.csv')
    rows = CSVTableSet(fh).tables[0]
    offset, headers = headers_guess(rows.sample)
    rows.register_processor(offset_processor(offset + 1))
    types = [StringType, IntegerType, DecimalType, DateUtilType]
    guessed_types = type_guess(rows.sample, types, True)
    assert_equal(len(guessed_types), 19)
    print(guessed_types)
    assert_equal(guessed_types, [
        IntegerType(), IntegerType(), IntegerType(), IntegerType(),
        IntegerType(), IntegerType(), StringType(), StringType(),
        StringType(), StringType(), StringType(), StringType(),
        StringType(), StringType(), StringType(), StringType(),
        IntegerType(), StringType(), StringType()
    ])
def get_column_types(data: io.BytesIO) -> Tuple[List[str], List[str]]:
    """derive the column types

    Using messytables' CSV API, attempt to derive the column types based on
    a best-guess of a sample of the rows.

    This is still a WIP due to the parlous state of the DV360/CM CSV data
    formats in general

    Arguments:
        data {io.BytesIO} -- sample of the CSV file

    Returns:
        (List[str], List[str]) -- tuple of list of header names and list of
                                  column types
    """
    table_set = CSVTableSet(data)
    row_set = table_set.tables[0]
    offset, headers = headers_guess(row_set.sample)
    logging.info(headers)
    row_set.register_processor(headers_processor(headers))
    row_set.register_processor(offset_processor(offset + 1))
    types = type_guess(row_set.sample, strict=True)
    logging.info(types)

    return (headers, types)
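# Usage sketch (not from the original source): feed get_column_types an
# in-memory copy of a CSV file. 'sample.csv' is a hypothetical path.
import io

with open('sample.csv', 'rb') as fh:
    headers, cell_types = get_column_types(io.BytesIO(fh.read()))
for header, cell_type in zip(headers, cell_types):
    print(header, cell_type)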
def test_null_process(self):
    fh = horror_fobj('null.csv')
    table_set = CSVTableSet(fh)
    row_set = table_set.tables[0]
    row_set.register_processor(null_processor(['null']))
    data = list(row_set)
    nones = [[x.value is None for x in row] for row in data]
    assert_equal(nones[0], [False, True, False, False])
    assert_equal(nones[1], [False, False, False, True])
    assert_equal(nones[2], [False, True, False, False])

    types = type_guess(row_set.sample, strict=True)
    expected_types = [IntegerType(), BoolType(), BoolType(), BoolType()]
    assert_equal(types, expected_types)

    row_set.register_processor(types_processor(types))

    # after applying the types, '' should become None for int columns
    data = list(row_set)
    nones = [[x.value is None for x in row] for row in data]
    assert_equal(nones[0], [False, True, False, False])
    assert_equal(nones[1], [False, False, False, True])
    assert_equal(nones[2], [False, True, True, True])
def rowset_as_jts(rowset, headers=None, types=None):
    ''' Create a json table schema from a rowset '''
    _, headers = messytables.headers_guess(rowset.sample)
    types = map(celltype_as_string, messytables.type_guess(rowset.sample))
    return headers_and_typed_as_jts(headers, types)
def create_new_model(self, modelname, app_label):
    """ Use messytables to guess field types and build a new model """
    nocols = False
    cols = self.csvfile[0]
    for col in cols:
        if not col:
            nocols = True
    if nocols:
        cols = ["col_%s" % num for num in range(1, len(cols))]
        print("No column names for %s columns" % len(cols))
    else:
        # strip quotes at ends and replace internal spaces with underscores
        cols = [col.strip("\r") for col in cols]
        cols = [col.strip('"') for col in cols]
        cols = [col.strip("'") for col in cols]
        cols = [cleancol.sub("_", col).lower() for col in cols]
    try:
        from messytables import any_tableset, type_guess
    except ImportError:
        self.errors.append(
            "If you want to inspect CSV files to generate model code, "
            "you must install https://messytables.readthedocs.org"
        )
        self.modelname = ""
        return
    try:
        table_set = any_tableset(self.filehandle)
        row_set = table_set.tables[0]
        types = type_guess(row_set.sample)
        types = [str(typeobj) for typeobj in types]
        # If the header has more cols than the data has cols - ignore
        # the end ones
        if len(cols) > len(types):
            cols = cols[:len(types)]
    except Exception as err:
        self.errors.append("messytables could not run due to error")
        self.errors.append(str(err))
        self.modelname = ""
        return
    fieldset = []
    maximums = self.get_maxlengths(cols)
    for i, col in enumerate(cols):
        length = maximums[i]
        if types[i] == "String" and length > 255:
            types[i] = "Text"
        integer = length
        decimal = int(length / 2)
        if decimal > 10:
            decimal = 10
        blank = True
        default = True
        column = (col, types[i], length, length, integer, decimal,
                  blank, default)
        fieldset.append(column)
    # Import here so that messytables is not a dependency for just
    # using the csvimport cmd
    from csvimport.make_model import MakeModel
    maker = MakeModel()
    return maker.model_from_table("%s_%s" % (app_label, modelname), fieldset)
def generate_mapping(fileobj, sample=2000):
    row_set = CSVRowSet('data', fileobj, window=sample)
    sample = list(row_set.sample)
    headers, sample = sample[0], sample[1:]
    values = frequent_values(sample)
    types = type_guess(sample)
    mapping = {}
    for header, type_, value in zip(headers, types, values):
        type_ = repr(type_).lower()
        name = slugify(header.value).lower()
        meta = {
            'label': header.value,
            'column': header.value,
            'common_values': value,
            'datatype': type_
        }
        if type_ in ['decimal', 'integer', 'float']:
            meta['type'] = 'measure'
            meta['datatype'] = 'float'
        elif type_ in ['date']:
            meta['type'] = 'date'
            meta['datatype'] = 'date'
        else:
            meta['type'] = 'value'
        mapping[name] = meta
    return mapping
def get_schema(self, filename):
    """
    Guess schema using messytables
    """
    table_set = self.read_file(filename)

    # Have I been able to read the filename
    if table_set is None:
        return []

    # Get the first table as rowset
    row_set = table_set.tables[0]

    offset, headers = headers_guess(row_set.sample)
    row_set.register_processor(headers_processor(headers))
    row_set.register_processor(offset_processor(offset + 1))
    types = type_guess(row_set.sample, strict=True)

    # Get a sample as well..
    sample = next(row_set.sample)

    clean = lambda v: str(v) if not isinstance(v, str) else v
    schema = []
    for i, h in enumerate(headers):
        schema.append([h, str(types[i]), clean(sample[i].value)])
    return schema
def main(argv=None):
    args = parse_args(argv)

    if args.file is None:
        # slurp the whole input since there seems to be a bug in messytables
        # which should be able to handle streams but doesn't
        args.file = cStringIO.StringIO(sys.stdin.read())

    relation_key = args_to_relation_key(args)

    table_set = any_tableset(args.file)
    if len(table_set.tables) != 1:
        raise ValueError("Can only handle files with a single table, not %s"
                         % len(table_set.tables))

    row_set = table_set.tables[0]

    # guess header names and the offset of the header:
    offset, headers = headers_guess(row_set.sample)
    row_set.register_processor(strip_processor())
    row_set.register_processor(headers_processor(headers))

    # Temporarily, mark the offset of the header
    row_set.register_processor(offset_processor(offset + 1))

    # guess types and register them
    types = type_guess(replace_empty_string(row_set.sample), strict=True,
                       types=[StringType, DecimalType, IntegerType])
    row_set.register_processor(types_processor(types))

    # Messytables seems to not handle the case where there are no headers.
    # Work around this as follows:
    # 1) offset must be 0
    # 2) if the types of the data match the headers, assume there are
    #    actually no headers
    if offset == 0:
        try:
            [t.cast(v) for (t, v) in zip(types, headers)]
        except:
            pass
        else:
            # We don't need the headers_processor or the offset_processor
            row_set._processors = []
            row_set.register_processor(strip_processor())
            row_set.register_processor(types_processor(types))
            headers = None

    # Construct the Myria schema
    schema = messy_to_schema(types, headers)
    logging.info("Myria schema: {}".format(json.dumps(schema)))

    # Prepare data for writing to Myria
    data, kwargs = write_data(row_set, schema)

    if not args.dry:
        # Connect to Myria and send the data
        connection = myria.MyriaConnection(hostname=args.hostname,
                                           port=args.port, ssl=args.ssl)
        ret = connection.upload_file(relation_key, schema, data,
                                     args.overwrite, **kwargs)
        sys.stdout.write(pretty_json(ret))
    else:
        sys.stdout.write(data)
def test_non_strict_guessing_handles_padding(self):
    csv_file = StringIO.StringIO('''
        1, , 2
        2, , 1.1
        foo, , 1500''')
    rows = CSVTableSet(csv_file).tables[0]
    guessed_types = type_guess(rows.sample, strict=False)
    assert_equal(len(guessed_types), 3)
    assert_equal(guessed_types,
                 [IntegerType(), StringType(), DecimalType()])
def test_non_strict_guessing_handles_padding(self):
    csv_file = StringIO.StringIO('''
        1, , 2.1
        2, , 1.1
        foo, , 1500''')
    rows = CSVTableSet(csv_file).tables[0]
    guessed_types = type_guess(rows.sample, strict=False)
    assert_equal(len(guessed_types), 3)
    assert_equal(guessed_types,
                 [IntegerType(), StringType(), DecimalType()])
def test_strict_guessing_handles_padding(self):
    csv_file = io.BytesIO(b'''
        1, , 2
        2, , 1.1
        foo, , 1500''')
    rows = CSVTableSet(csv_file).tables[0]
    guessed_types = type_guess(rows.sample, strict=True)
    assert_equal(len(guessed_types), 3)
    assert_equal(guessed_types,
                 [StringType(), StringType(), DecimalType()])
def parse(stream, excel_type='xls', sheet=1, guess_types=True, **kwargs):
    '''Parse Excel (xls or xlsx) to structured objects.

    :param excel_type: xls | xlsx
    :param sheet: index of sheet in spreadsheet to convert (starting from
        index = 1)
    '''
    sheet_number = int(sheet) - 1

    xlsclass = XLSTableSet
    if excel_type == 'xlsx':
        xlsclass = XLSXTableSet
    table_set = xlsclass.from_fileobj(stream)
    try:
        row_set = table_set.tables[sheet_number]
    except IndexError:
        raise Exception('This file does not have sheet number %d' %
                        (sheet_number + 1))
    offset, headers = headers_guess(row_set.sample)

    fields = []
    dup_columns = {}
    noname_count = 1
    if guess_types:
        guess_types = [StringType, IntegerType, FloatType, DecimalType,
                       DateUtilType]
        row_types = type_guess(row_set.sample, guess_types)
    for index, field in enumerate(headers):
        field_dict = {}
        if "" == field:
            field = '_'.join(['column', str(noname_count)])
            headers[index] = field
            noname_count += 1
        if headers.count(field) == 1:
            field_dict['id'] = field
        else:
            dup_columns[field] = dup_columns.get(field, 0) + 1
            field_dict['id'] = u'_'.join([field, str(dup_columns[field])])
        if guess_types:
            if isinstance(row_types[index], DateUtilType):
                field_dict['type'] = 'DateTime'
            else:
                field_dict['type'] = str(row_types[index])
        fields.append(field_dict)
    row_set.register_processor(headers_processor([x['id'] for x in fields]))
    row_set.register_processor(offset_processor(offset + 1))

    def row_iterator():
        for row in row_set:
            data_row = {}
            for index, cell in enumerate(row):
                data_row[cell.column] = cell.value
            yield data_row
    return row_iterator(), {'fields': fields}
def test_json_type(self):
    csv_file = StringIO.StringIO('''
        "{""a"":""b"", ""c"":""d""}", "[1, 2, 3]", 12a
        "[""a"", [1, 2, {""a"":""b""}]]", "{""a"": 1, ""b"":[1, 2]}", abc
        ,, "abc"
    ''')
    rows = CSVTableSet(csv_file).tables[0]
    guessed_types = type_guess(rows.sample)
    assert_equal(guessed_types, [JsonType(), JsonType(), StringType()])
def test_wkt_type(self):
    csv_file = StringIO.StringIO('''
        "0102000020e6100000020000000000000000002640000000000000474000000000000024400000000000804640",
        "0102000020787f0000020000000000000000002640000000000000474000000000000024400000000000804640", "SRID=4326;LINESTRING(11 46,10 45)"
        "0101000020e610000000000000000026400000000000004740", "SRID=4326;LINESTRING(11 46,10 45)"
        , "SRID=4326;POINT(11 46)"
        ''')
    rows = CSVTableSet(csv_file).tables[0]
    guessed_types = type_guess(rows.sample, strict=True)
    assert_equal(guessed_types, [EWKB(), EWKT()])
def rowset_as_schema(rowset):
    _, headers = messytables.headers_guess(rowset.sample)
    types = map(celltype_as_string, messytables.type_guess(rowset.sample))
    j = jsontableschema.JSONTableSchema()
    for field_id, field_type in zip(headers, types):
        j.add_field(field_id=field_id,
                    label=field_id,
                    field_type=field_type)
    return j
def connect(self, host=None, port=None, database=None,
            username=None, password=None, file=None):
    # TODO: mysql, pymssql, csv, sqlite3, pymongo, cx_Oracle
    self.database = database
    conn_string = ''
    if self.engine == 'psycopg2':
        if database:
            conn_string += "dbname='%s' " % database
        if username:
            conn_string += "user='%s' " % username
        if host:
            conn_string += "host='%s' " % host
        if port:
            conn_string += "port='%s' " % port
        if password:
            conn_string += "password='%s' " % password
        self.conn = psycopg2.connect(conn_string)
    elif self.engine == 'pymssql':
        self.conn = pymssql.connect(host, username, password, database,
                                    port=port, as_dict=True,
                                    charset='LATIN1')
    elif self.engine == 'csv':
        # https://messytables.readthedocs.io/en/latest/
        fh = StringIO.StringIO(self.data)
        # dialect = csv.Sniffer().sniff(f.read(1024))
        # f.seek(0)
        # self.conn = csv.DictReader(f, dialect=dialect)
        # fh = open('messy.csv', 'rb')

        # Load a file object:
        table_set = CSVTableSet(fh)
        row_set = table_set.tables[0]
        offset, headers = headers_guess(row_set.sample)
        row_set.register_processor(headers_processor(headers))
        row_set.register_processor(offset_processor(offset + 1))
        types = type_guess(row_set.sample, strict=True)
        row_set.register_processor(types_processor(types))
        self.conn = row_set
    return self.conn
def create_new_model(self, modelname, app_label):
    """ Use messytables to guess field types and build a new model """
    nocols = False
    cols = self.csvfile[0]
    for col in cols:
        if not col:
            nocols = True
    if nocols:
        cols = ['col_%s' % num for num in range(1, len(cols))]
        print('No column names for %s columns' % len(cols))
    else:
        cols = [cleancol.sub('_', col).lower() for col in cols]
    try:
        from messytables import any_tableset, type_guess
    except ImportError:
        self.errors.append(
            'If you want to inspect CSV files to generate model code, '
            'you must install https://messytables.readthedocs.org'
        )
        self.modelname = ''
        return
    try:
        table_set = any_tableset(self.filehandle)
        row_set = table_set.tables[0]
        types = type_guess(row_set.sample)
        types = [str(typeobj) for typeobj in types]
    except Exception:
        self.errors.append('messytables could not guess your column types')
        self.modelname = ''
        return
    fieldset = []
    maximums = self.get_maxlengths(cols)
    for i, col in enumerate(cols):
        length = maximums[i]
        if types[i] == 'String' and length > 255:
            types[i] = 'Text'
        integer = length
        decimal = int(length / 2)
        if decimal > 10:
            decimal = 10
        blank = True
        default = True
        column = (col, types[i], length, length, integer, decimal,
                  blank, default)
        fieldset.append(column)
    # Import here so that messytables is not a dependency for just
    # using the csvimport cmd
    from csvimport.make_model import MakeModel
    maker = MakeModel()
    return maker.model_from_table('%s_%s' % (app_label, modelname), fieldset)
def proc(f, database_name, table_name):
    table_set = messytables.any_tableset(f)
    row_set = table_set.tables[0]

    # guess header names and the offset of the header:
    offset, headers = messytables.headers_guess(row_set.sample)
    row_set.register_processor(messytables.headers_processor(headers))
    row_set.register_processor(messytables.offset_processor(offset + 1))
    types = messytables.type_guess(row_set.sample, types=[
        messytables.types.StringType,
        messytables.types.DateType,
    ], strict=True)

    hive_data_file = tempfile.NamedTemporaryFile(mode='w')
    fields_ddl = ','.join([
        '  {0} {1}\n'.format(
            canonicalize_column_name(colName),
            hive_column_type(colType)
        )
        for colName, colType in zip(headers, types)
    ])
    hive_sql = '''
DROP TABLE IF EXISTS {0};
CREATE TABLE {0} (
{1}
)
STORED AS TEXTFILE
TBLPROPERTIES ("comment"="add_messytable on {3}");

LOAD DATA LOCAL INPATH '{2}' OVERWRITE INTO TABLE {0};
'''.format(table_name, fields_ddl, hive_data_file.name,
           datetime.datetime.now().isoformat())

    hive_cmd_file = tempfile.NamedTemporaryFile(mode='w')
    print(hive_sql, file=hive_cmd_file)
    hive_cmd_file.flush()

    row_set.register_processor(messytables.types_processor(types))

    for row in row_set:
        print('\001'.join(map(str, [c.value for c in row])),
              file=hive_data_file)
    hive_data_file.flush()

    subprocess.call([
        'hive',
        '--database', database_name,
        '-f', hive_cmd_file.name,
    ])
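# Hedged usage sketch for proc: 'input.csv', 'analytics' and 'raw_rows'
# are placeholder values for the source file, Hive database and table.
with open('input.csv', 'rb') as f:
    proc(f, 'analytics', 'raw_rows')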
def test_type_guess(self):
    csv_file = io.BytesIO(b'''
        1, 2012/2/12, 2, 02 October 2011, yes, 1
        2, 2012/2/12, 2, 02 October 2011, true, 1
        2.4, 2012/2/12, 1, 1 May 2011, no, 0
        foo, bar, 1000, , false, 0
        4.3, , 42, 24 October 2012,,
        , 2012/2/12, 21, 24 December 2013, true, 1''')
    rows = CSVTableSet(csv_file).tables[0]
    guessed_types = type_guess(rows.sample)
    assert_equal(guessed_types, [
        DecimalType(), DateType('%Y/%m/%d'), IntegerType(),
        DateType('%d %B %Y'), BoolType(), BoolType()])
def create_new_model(self, modelname, app_label):
    """ Use messytables to guess field types and build a new model """
    nocols = False
    cols = self.csvfile[0]
    for col in cols:
        if not col:
            nocols = True
    if nocols:
        cols = ['col_%s' % num for num in range(1, len(cols))]
        print('No column names for %s columns' % len(cols))
    else:
        cols = [cleancol.sub('_', col).lower() for col in cols]
    try:
        from messytables import any_tableset, type_guess
    except ImportError:
        self.errors.append(
            'If you want to inspect CSV files to generate model code, '
            'you must install https://messytables.readthedocs.org')
        self.modelname = ''
        return
    try:
        table_set = any_tableset(self.filehandle)
        row_set = table_set.tables[0]
        types = type_guess(row_set.sample)
        types = [str(typeobj) for typeobj in types]
    except Exception as err:
        self.errors.append('messytables could not run due to error')
        self.errors.append(str(err))
        self.modelname = ''
        return
    fieldset = []
    maximums = self.get_maxlengths(cols)
    for i, col in enumerate(cols):
        length = maximums[i]
        if types[i] == 'String' and length > 255:
            types[i] = 'Text'
        integer = length
        decimal = int(length / 2)
        if decimal > 10:
            decimal = 10
        blank = True
        default = True
        column = (col, types[i], length, length, integer, decimal,
                  blank, default)
        fieldset.append(column)
    # Import here so that messytables is not a dependency for just
    # using the csvimport cmd
    from csvimport.make_model import MakeModel
    maker = MakeModel()
    return maker.model_from_table('%s_%s' % (app_label, modelname), fieldset)
def test_type_guess(self):
    csv_file = StringIO.StringIO('''
        1, 2012/2/12, 2, 02 October 2011, yes, 1
        2, 2012/2/12, 2, 02 October 2011, true, 1
        2.4, 2012/2/12, 1, 1 May 2011, no, 0
        foo, bar, 1000, , false, 0
        4.3, , 42, 24 October 2012,,
        , 2012/2/12, 21, 24 December 2013, true, 1''')
    rows = CSVTableSet(csv_file).tables[0]
    guessed_types = type_guess(rows.sample)
    assert_equal(guessed_types, [
        DecimalType(), DateType('%Y/%m/%d'), IntegerType(),
        DateType('%d %B %Y'), BoolType(), BoolType()])
def test_type_guess_forced(self):
    csv_file = StringIO.StringIO('''
        1, aaa, true
        2, bbb, false
        3, ccc,
        4, , yes
        5, ddd, no
    ''')
    rows = CSVTableSet(csv_file).tables[0]
    guessed_types = type_guess(
        rows.sample,
        forced_types=[None, None, StringType()]
    )
    assert_equal(guessed_types,
                 [IntegerType(), StringType(), StringType()])
def test_type_guess(self):
    csv_file = StringIO.StringIO('''
        1, 2012/2/12, 2, 02 October 2011, yes, 11
        2, 2012/2/12, 2, 02 October 2011, true, 9 am
        2.4, 2012/2/12, 1, 1 May 2011, no, 23:00.123
        foo, bar, 1000, , false, 12:00
        4.3, , 42, 24 October 2012, , 7.12
        , 2012/2/12, 21, 24 December 2013, true, 11PM
    ''')
    rows = CSVTableSet(csv_file).tables[0]
    guessed_types = type_guess(rows.sample)
    assert_equal(guessed_types, [
        DecimalType(), DateType('%Y/%m/%d'), IntegerType(),
        DateType('%d %B %Y'), BoolType(), TimeType()])
def test_read_type_guess_simple(self):
    fh = horror_fobj('simple.csv')
    table_set = CSVTableSet(fh)
    row_set = table_set.tables[0]
    types = type_guess(row_set.sample)
    expected_types = [DateType("%Y-%m-%d"), IntegerType(), StringType()]
    assert_equal(types, expected_types)

    row_set.register_processor(types_processor(types))

    data = list(row_set)
    header_types = [c.type for c in data[0]]
    assert_equal(header_types, [StringType()] * 3)
    row_types = [c.type for c in data[2]]
    assert_equal(expected_types, row_types)
def test_type_guess(self):
    csv_file = StringIO.StringIO('''
        1, 2012/2/12, 2, 02 October 2011
        2, 2012/2/12, 2, 02 October 2011
        2.4, 2012/2/12, 1, 1 May 2011
        foo, bar, 1000,
        4.3, , 42, 24 October 2012
        , 2012/2/12, 21, 24 December 2013''')
    rows = CSVTableSet(csv_file).tables[0]
    guessed_types = type_guess(rows.sample)
    assert_equal(guessed_types, [
        DecimalType(), DateType('%Y/%m/%d'), IntegerType(),
        DateType('%d %B %Y')])
def csvParse(csv_file_path):
    fh = open(csv_file_path, 'rb')
    # Load a file object:
    table_set = CSVTableSet(fh)
    row_set = table_set.tables[0]
    # guess header names and the offset of the header:
    offset, headers = headers_guess(row_set.sample)
    row_set.register_processor(headers_processor(headers))
    # add one to begin with content, not the header:
    row_set.register_processor(offset_processor(offset + 1))
    # guess column types:
    types = type_guess(row_set.sample, strict=True)
    row_set.register_processor(types_processor(types))
    return row_set, headers, offset, types
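# Hedged usage example for csvParse, assuming a local 'data.csv': the
# returned row set applies the guessed types lazily while iterating.
row_set, headers, offset, types = csvParse('data.csv')
print(headers, types)
for row in row_set:
    print([cell.value for cell in row])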
def test_read_type_guess_simple(self):
    fh = horror_fobj("simple.csv")
    table_set = CSVTableSet(fh)
    row_set = table_set.tables[0]
    types = type_guess(row_set.sample)
    expected_types = [DateType("%Y-%m-%d"), IntegerType(), StringType()]
    assert_equal(types, expected_types)

    row_set.register_processor(types_processor(types))

    data = list(row_set)
    header_types = map(lambda c: c.type, data[0])
    assert_equal(header_types, [StringType()] * 3)
    row_types = map(lambda c: c.type, data[2])
    assert_equal(expected_types, row_types)
def test_type_guess_strict(self):
    import locale
    locale.setlocale(locale.LC_ALL, 'en_GB.UTF-8')
    csv_file = StringIO.StringIO('''
        1, 2012/2/12, 2, 2,02 October 2011,"100.234354"
        2, 2012/2/12, 1.1, 0,1 May 2011,"100,000,000.12"
        foo, bar, 1500, 0,,"NaN"
        4, 2012/2/12, 42,"-2,000",24 October 2012,"42"
        ,,,,,''')
    rows = CSVTableSet(csv_file).tables[0]
    guessed_types = type_guess(rows.sample, strict=True)
    assert_equal(guessed_types, [
        StringType(), StringType(), DecimalType(), IntegerType(),
        DateType('%d %B %Y'), DecimalType()])
def create_sql_table(self, rowset, sql_table_name=None,
                     headers=None, types=None):
    """ Create a SQL table schema from a MessyTables RowSet """
    # if a different name isn't specified, use the primary root name
    if not sql_table_name:
        sql_table_name = self.table_name
    # we don't care about the offset returned, so just throw it away,
    # get headers
    _, headers = messytables.headers_guess(rowset.sample)
    types = map(self.celltype_as_string,
                messytables.type_guess(rowset.sample, strict=False))
    self.headers = headers
    self.header_types = types
    return self.headers_and_typed_as_sql(sql_table_name, headers, types)
def csvimport_table(name):
    from messytables import CSVTableSet, type_guess
    from messytables import types_processor, headers_guess
    from messytables import headers_processor, offset_processor
    from spendb.etl.extract import parse_table

    row_set = CSVTableSet(data_fixture(name)).tables[0]
    offset, headers = headers_guess(row_set.sample)
    row_set.register_processor(headers_processor(headers))
    row_set.register_processor(offset_processor(offset + 1))
    types = type_guess(row_set.sample, strict=True)
    row_set.register_processor(types_processor(types))

    rows = []
    for num_rows, (fields, row, samples) in enumerate(parse_table(row_set)):
        rows.append(row)
    return fields, rows
def prepare_csv_rows(csv_file):
    row_set = CSVTableSet(csv_file).tables[0]

    offset, headers = headers_guess(row_set.sample)
    headers = [convert_header_to_column_name(header)
               for header in (h for h in headers if h)]

    row_set.register_processor(headers_processor_remove_blank(headers))
    row_set.register_processor(offset_processor(offset + 1))

    DateType.formats = create_date_formats(day_first=False)
    # We never want boolean types, so remove BoolType from the default list
    eligible_types = [StringType, DecimalType, IntegerType, DateType]
    types = type_guess(row_set.sample, types=eligible_types, strict=True)

    row_set.register_processor(types_processor(types))
    return row_set
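# Possible usage (an assumption, not from the original source): messytables'
# CSVTableSet accepts a binary file handle; 'upload.csv' is a placeholder.
with open('upload.csv', 'rb') as fh:
    typed_row_set = prepare_csv_rows(fh)
    for row in typed_row_set:
        print([cell.value for cell in row])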
def parse_table(source):
    # This is a work-around because messytables hangs on boto file
    # handles, so we're doing it via plain old HTTP.
    # We're also passing in an extended window size to give more
    # reliable type detection.
    # Because Python's CSV dialect sniffer isn't the best, this also
    # constrains the field quoting character to a double quote.
    table_set = mt.any_tableset(source.fh(),
                                extension=source.meta.get('extension'),
                                mimetype=source.meta.get('mime_type'),
                                quotechar='"', window=20000)
    tables = list(table_set.tables)
    if not len(tables):
        log.error("No tables were found in the source file.")
        return

    row_set = tables[0]
    headers = [c.value for c in next(row_set.sample)]
    row_set.register_processor(mt.headers_processor(headers))
    row_set.register_processor(mt.offset_processor(1))
    types = mt.type_guess(row_set.sample, strict=True)
    row_set.register_processor(mt.types_processor(types, strict=True))

    fields, i = {}, 0
    row_iter = iter(row_set)

    while True:
        i += 1
        try:
            row = next(row_iter)
            if not len(fields):
                fields = generate_field_spec(row)

            data = convert_row(row, fields, i)
            check_empty = set(data.values())
            if None in check_empty and len(check_empty) == 1:
                continue

            yield None, fields, data
        except StopIteration:
            return
        except Exception as e:
            # log.exception(e)
            yield e, fields, None
def parse_data(input):
    fh = open(input, 'rb')

    try:
        table_set = messytables.any_tableset(fh)
    except messytables.ReadError as e:
        print(e)

    get_row_set = lambda table_set: table_set.tables.pop()
    row_set = get_row_set(table_set)
    offset, headers = messytables.headers_guess(row_set.sample)

    # Some headers might have been converted from strings to floats and such.
    headers = [str(header) for header in headers]

    row_set.register_processor(messytables.headers_processor(headers))
    row_set.register_processor(messytables.offset_processor(offset + 1))
    types = messytables.type_guess(row_set.sample, types=TYPES, strict=True)

    row_set.register_processor(messytables.types_processor(types))

    headers = [header.strip() for header in headers if header.strip()]
    headers_set = set(headers)

    def row_iterator():
        for row in row_set:
            data_row = {}
            for index, cell in enumerate(row):
                column_name = cell.column.strip()
                if column_name not in headers_set:
                    continue
                data_row[column_name] = cell.value
            yield data_row
    result = row_iterator()

    headers_dicts = [
        dict(id=field[0], type=TYPE_MAPPING[str(field[1])])
        for field in zip(headers, types)
    ]

    print('Determined headers and types: {headers}'.format(
        headers=headers_dicts))

    return headers_dicts, result
def _get_table_columns(self, csv_file_path: str) -> zip:
    """
    Read the csv file and try to guess the type of each column using the
    messytables library. The type can be 'Integer', 'Decimal', 'String'
    or 'Bool'.

    :param csv_file_path: path to the csv file with content in it
    :return: a zip object where each tuple has two elements:
             the first is the column name and the second is the type
    """
    with gzip.open(csv_file_path, 'rb') as f:
        table_set = CSVTableSet(f)
        row_set = table_set.tables[0]
        offset, headers = headers_guess(row_set.sample)
        row_set.register_processor(headers_processor(headers))
        row_set.register_processor(offset_processor(offset + 1))
        types = list(map(jts.celltype_as_string,
                         type_guess(row_set.sample, strict=True)))
        return zip(headers, types)
def resource_row_set(package, resource):
    """ Generate an iterator over all the rows in this resource's
    source data. """
    # This is a work-around because messytables hangs on boto file
    # handles, so we're doing it via plain old HTTP.
    table_set = any_tableset(resource.fh(),
                             extension=resource.meta.get('extension'),
                             mimetype=resource.meta.get('mime_type'))
    tables = list(table_set.tables)
    if not len(tables):
        log.error("No tables were found in the source file.")
        return

    row_set = tables[0]
    offset, headers = headers_guess(row_set.sample)
    row_set.register_processor(headers_processor(headers))
    row_set.register_processor(offset_processor(offset + 1))
    types = type_guess(row_set.sample, strict=True)
    row_set.register_processor(types_processor(types))
    return row_set
def _guess_csv_datatype(fh):
    table_set = CSVTableSet(fh)
    row_set = table_set.tables[0]
    offset, headers = headers_guess(row_set.sample)
    logger.info("(offset, headers) = ({}, {})".format(offset, headers))

    row_set.register_processor(headers_processor(headers))
    row_set.register_processor(offset_processor(offset + 1))
    types = type_guess(row_set.sample, strict=True)
    row_set.register_processor(types_processor(types))

    counter = 0
    for row in row_set:
        logger.info(row)
        counter += 1
        if counter >= 32:
            break

    d = {h: t for h, t in zip(headers, types)}
    logger.info(d)
    return d
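# Minimal sketch (assumed input, not from the original source) of calling
# _guess_csv_datatype on an in-memory CSV; CSVTableSet accepts a bytes
# buffer under Python 3.
import io

mapping = _guess_csv_datatype(io.BytesIO(b"id,name\n1,alpha\n2,beta\n"))
# mapping would be e.g. {'id': IntegerType(), 'name': StringType()}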
def test_regex_type(self):
    csv_file = StringIO.StringIO('''
        aaa, bbb, ccc
        , ,
        aa, bb, cc
        a, b, c
    ''')

    class Type1(RegExType):
        regex = '^a+$'

    class Type2(RegExType):
        regex = '^b+$'

    rows = CSVTableSet(csv_file).tables[0]
    guessed_types = \
        type_guess(rows.sample, types=[Type1(), Type2(), StringType()])
    assert_equal(guessed_types, [
        Type1(), Type2(), StringType()
    ])
def update(self, config):
    # Update the mime, sha1 of the files
    for i in range(len(config['files'])):
        filename = config['files'][i]['filename']
        if os.path.exists(filename):
            u = {
                'mimetype': mimetypes.guess_type(filename)[0],
                'sha1': compute_sha1(filename)
            }
            if filename.lower().endswith('sv'):  # csv/tsv
                # open the file so messytables can sample it
                with open(filename, 'rb') as csv_file:
                    rows = CSVTableSet(csv_file).tables[0]
                    guessed_types = type_guess(rows.sample)
                    u['schema'] = guessed_types
            config['files'][i].update(u)

    return config
def generate_schema(samples: List[Dict], table_spec: Dict) -> Dict:
    """
    Guess column types from the given samples and build a json schema

    :param samples: List of dictionaries containing sample data from
        csv file(s)
    :param table_spec: table/stream specs given in the tap definition
    :return: dictionary where the keys are the headers and values are the
        guessed types - compatible with json schema
    """
    schema = {}

    table_set = CSVTableSet(_csv2bytesio(samples))
    row_set = table_set.tables[0]
    offset, headers = headers_guess(row_set.sample)
    row_set.register_processor(headers_processor(headers))
    row_set.register_processor(offset_processor(offset + 1))
    types = type_guess(row_set.sample, strict=True)

    for header, header_type in zip(headers, types):
        date_overrides = set(table_spec.get('date_overrides', []))

        if header in date_overrides:
            schema[header] = {
                'type': ['null', 'string'],
                'format': 'date-time'
            }
        else:
            if isinstance(header_type, IntegerType):
                schema[header] = {'type': ['null', 'integer']}
            elif isinstance(header_type, DecimalType):
                schema[header] = {'type': ['null', 'number']}
            else:
                schema[header] = {'type': ['null', 'string']}

    return schema
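# Illustrative call to generate_schema; the sample rows and table_spec
# below are made-up values, not taken from the original project.
samples = [
    {'id': '1', 'created': '2021-01-01', 'name': 'a'},
    {'id': '2', 'created': '2021-01-02', 'name': 'b'},
]
schema = generate_schema(samples, {'date_overrides': ['created']})
# schema['id'] -> {'type': ['null', 'integer']}
# schema['created'] -> {'type': ['null', 'string'], 'format': 'date-time'}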
def determine_messytables_types(file_handle, types=messytables.types.TYPES):
    """
    :param file_handle: file handle opened in binary mode
    :return: (headers, types, row_set)
    """
    # Load a file object:
    table_set = messytables.CSVTableSet(file_handle)

    # If you aren't sure what kind of file it is
    # table_set = messytables.any_tableset(file_handle)

    # A table set is a collection of tables:
    row_set = table_set.tables[0]

    # A row set is an iterator over the table, but it can only
    # be run once. To peek, a sample is provided:
    print(next(row_set.sample))

    # guess header names and the offset of the header:
    offset, headers = messytables.headers_guess(row_set.sample)
    row_set.register_processor(messytables.headers_processor(headers))

    # add one to begin with content, not the header:
    row_set.register_processor(messytables.offset_processor(offset + 1))

    # guess column types:
    types = messytables.type_guess(row_set.sample, types, strict=True)

    # and tell the row set to apply these types to
    # each row when traversing the iterator:
    row_set.register_processor(messytables.types_processor(types))

    # now run some operation on the data:
    return headers, types, row_set
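# Example invocation (assumption: a local 'data.csv' opened in binary
# mode, as determine_messytables_types requires):
with open('data.csv', 'rb') as fh:
    headers, types, row_set = determine_messytables_types(fh)
    for row in row_set:
        print([cell.value for cell in row])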