def test_usecols_csv():
    target = {
        'a': [1, 2, 3, 4, 5, 6],
        'b': [2, 3, 4, 5, 6, 7],
        'c': [3, 4, 5, 6, 7, 8],
        'd': [4, 5, 6, 7, 8, 9],
        'e': ['5', '6', '7', '8', '9', '0']
    }
    fp = io.BytesIO(
        b'a,b,c,d,e\n1,2,3,4,5\n2,3,4,5,6\n3,4,5,6,7\n4,5,6,7,8\n5,6,7,8,9\n6,7,8,9,0'
    )
    fp.seek(0)
    table = csv.read_csv(
        fp,
        convert_options=csv.ConvertOptions(column_types={
            'a': pa.int32(),
            'b': pa.int64(),
            'c': pa.int64(),
            'd': pa.int64(),
            'e': pa.string(),
        }))
    assert table
    engine.importArrowTable("usecols", table)
    assert not engine.closed
    cursor = engine.executeDML("select * from usecols")
    assert cursor
    batch = cursor.getArrowRecordBatch()
    assert batch
    assert batch.to_pydict() == target
def write_files(metadata: AlchemyMetadata) -> None:
    """
    Creates a Parquet file for each table in the schema.
    """
    tables: Iterator[AlchemyTable] = metadata.tables.values()
    for table in tables:
        name = table.name
        print(name)

        def get_path(prefix: Path, suffix: str):
            parent_dir = prefix.joinpath(metadata.schema)
            parent_dir.mkdir(exist_ok=True, parents=True)
            return parent_dir.joinpath(name).with_suffix(suffix)

        extract_file = get_path(EXTRACT_PATH_PREFIX, ".csv.zst")
        parquet_file = get_path(PARQUET_PREFIX, ".parquet")

        arrow_schema = pa.schema(get_fields(table))
        column_names = [name for name, dtype in get_fields(table)]

        read_options = pcsv.ReadOptions(column_names=column_names,
                                        block_size=1000000000)
        parse_options = pcsv.ParseOptions(newlines_in_values=True)
        convert_options = pcsv.ConvertOptions(
            column_types=arrow_schema,
            timestamp_parsers=["%Y%m%d", "%Y-%m-%d"],
            true_values=["1", "T"],
            false_values=["0", "F"],
            strings_can_be_null=True)

        parquet_writer = pq.ParquetWriter(parquet_file,
                                          schema=arrow_schema,
                                          compression='zstd',
                                          version="2.0",
                                          use_dictionary=True)
        stream_reader = pcsv.open_csv(extract_file,
                                      read_options=read_options,
                                      parse_options=parse_options,
                                      convert_options=convert_options)
        for batch in stream_reader:
            arrow_table = pa.Table.from_batches([batch])
            parquet_writer.write_table(arrow_table)
        parquet_writer.close()
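# --- Hedged sketch: get_fields() is referenced above but not defined in this
# snippet. A minimal stand-in, assuming it maps SQLAlchemy column types onto
# pyarrow types (the real project's mapping is likely more complete):
import pyarrow as pa
import sqlalchemy as sa

def get_fields(table):
    """Yield (column_name, pyarrow_type) pairs for an SQLAlchemy Table."""
    type_map = [
        (sa.Boolean, pa.bool_()),
        (sa.BigInteger, pa.int64()),  # checked before Integer (its superclass)
        (sa.Integer, pa.int32()),
        (sa.Float, pa.float64()),
        (sa.Date, pa.date32()),
        (sa.DateTime, pa.timestamp('ms')),
    ]
    for column in table.columns:
        pa_type = pa.string()  # fall back to string for unmapped types
        for sa_type, candidate in type_map:
            if isinstance(column.type, sa_type):
                pa_type = candidate
                break
        yield column.name, pa_type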
def scan_file(self, bucket, key, schema):
    logging.info(f"delim is {self.delimiter}")
    uri = f"{bucket}/{key}"
    s3fs = fs.S3FileSystem()

    # Run column order validation by opening and not reading anything.
    filestream = s3fs.open_input_stream(uri)
    parse_opts = csv.ParseOptions(delimiter=self.delimiter)
    reader = csv.open_csv(filestream, parse_options=parse_opts)
    for index, col in enumerate(reader.schema):
        if col.name != schema[index].name:
            msg = "column {} is out of order".format(col.name)
            raise ColumnOrderException(msg)

    # Run the rest of the validations.
    filestream = s3fs.open_input_stream(uri)
    opts = csv.ConvertOptions(column_types=schema)
    reader = csv.open_csv(filestream,
                          convert_options=opts,
                          parse_options=parse_opts)

    # Kind of a hack, but it works: if the delimiter is wrong, everything is
    # read as one column.
    if len(schema) > 1 and len(reader.schema) == 1:
        raise WrongDelimiterException()

    # Parse through the file; pyarrow will throw exceptions if there's
    # invalid data.
    for batch in reader:
        # If the primary key is a string, we need to check the column for
        # empty strings.
        if schema.field(self.primary_key).type == "string":
            table = pyarrow.Table.from_batches([batch])
            for val in table[self.primary_key]:
                if val.as_py() == "":
                    raise EmptyPrimaryKeyException()
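# --- Hedged sketch: the exception types raised in scan_file() above are
# assumed to be plain Exception subclasses defined elsewhere; minimal
# stand-ins so the snippet is self-contained:
class ColumnOrderException(Exception):
    """A CSV column appears in a different position than the schema expects."""

class WrongDelimiterException(Exception):
    """The configured delimiter did not split the file into multiple columns."""

class EmptyPrimaryKeyException(Exception):
    """A primary-key column contains an empty string."""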
def read_stores(self):
    s_cols = [
        'retailer_code', 'parent_code', 'fips_state_code',
        'fips_county_code', 'dma_code', 'store_zip3'
    ]
    # To reduce space -- update with dictionary arrays later
    store_convert = {
        'year': pa.uint16(),
        'dma_code': pa.uint16(),
        'retailer_code': pa.uint16(),
        'parent_code': pa.uint16(),
        'store_zip3': pa.uint16(),
        'fips_county_code': pa.uint16(),
        'fips_state_code': pa.uint8()
    }

    # Use pyarrow to read CSVs and parse using the dict -- we have to fix
    # some types again later.
    tmp = pa.concat_tables([
        csv.read_csv(
            x,
            parse_options=csv.ParseOptions(delimiter='\t'),
            convert_options=csv.ConvertOptions(column_types=store_convert))
        for x in self.stores_dict.values()
    ]).to_pandas().rename(columns={'year': 'panel_year'})

    # Some columns have blanks -- fill with zero to avoid converting to floats(!)
    tmp.loc[:, s_cols] = tmp.loc[:, s_cols].fillna(0)

    # Use the compressed types
    my_dict = {
        key: value
        for (key, value) in type_dict.items() if key in tmp.columns
    }
    self.stores_df = tmp.astype(my_dict)
    return
def _read_table_arrow(self, source: tp.BinaryIO, schema: pa.Schema) -> pa.Table:
    try:
        read_options = pa_csv.ReadOptions()
        read_options.encoding = 'utf-8'
        read_options.use_threads = False

        parse_options = pa_csv.ParseOptions()
        parse_options.newlines_in_values = True

        convert_options = pa_csv.ConvertOptions()
        convert_options.include_columns = schema.names
        convert_options.column_types = {
            n: t for (n, t) in zip(schema.names, schema.types)
        }
        convert_options.strings_can_be_null = True
        convert_options.quoted_strings_can_be_null = False

        return pa_csv.read_csv(source, read_options, parse_options, convert_options)

    except pa.ArrowInvalid as e:
        err = "CSV file decoding failed, content is garbled"
        self._log.exception(err)
        raise _ex.EDataCorruption(err) from e

    except pa.ArrowKeyError as e:
        err = "CSV file decoding failed, one or more columns are missing"
        self._log.error(err)
        self._log.exception(str(e))
        raise _ex.EDataCorruption(err) from e
def pa_convert_options(self):
    if self.convert_options is not None:
        convert_options = self.convert_options
    else:
        convert_options = pac.ConvertOptions(
            column_types=FEATURES.type,
        )
    return convert_options
def csv_to_table(self, csv_path, table_name, read=None, parse=None,
                 convert=None, con=None, auto_infer=False):
    ' Pyarrow CSV reader documentation: https://arrow.apache.org/docs/python/generated/pyarrow.csv.read_csv.html '

    if not ARROW:
        return "Optional pyarrow dependency not found. To install: pip3 install pyarrow"

    sqream_to_pa = {
        'ftBool': pa.bool_(),
        'ftUByte': pa.uint8(),
        'ftShort': pa.int16(),
        'ftInt': pa.int32(),
        'ftLong': pa.int64(),
        'ftFloat': pa.float32(),
        'ftDouble': pa.float64(),
        'ftDate': pa.timestamp('ns'),
        'ftDateTime': pa.timestamp('ns'),
        'ftVarchar': pa.string(),
        'ftBlob': pa.utf8()
    }

    start = time.time()

    # Get table metadata
    con = con or self
    con.execute(f'select * from {table_name} where 1=0')

    # Map column names to pyarrow types and set Arrow's CSV parameters.
    # ConvertOptions expects a dict (or Schema) for column_types, so
    # materialize the zip into a dict.
    sqream_col_types = [col_type[0] for col_type in con.col_type_tups]
    column_types = dict(
        zip(con.col_names,
            [sqream_to_pa[col_type[0]] for col_type in con.col_type_tups]))
    read = read or csv.ReadOptions(column_names=con.col_names)
    parse = parse or csv.ParseOptions(delimiter='|')
    convert = convert or csv.ConvertOptions(
        column_types=None if auto_infer else column_types)

    # Read CSV to in-memory arrow format
    csv_arrow = csv.read_csv(csv_path,
                             read_options=read,
                             parse_options=parse,
                             convert_options=convert).combine_chunks()
    num_chunks = len(csv_arrow[0].chunks)
    numpy_cols = []

    # For each column, get the numpy representation for quick packing
    for col_type, col in zip(sqream_col_types, csv_arrow):
        # Only one chunk after combine_chunks()
        col = col.chunks[0]
        if col_type in ('ftVarchar', 'ftBlob', 'ftDate', 'ftDateTime'):
            col = col.to_pandas()
        else:
            col = col.to_numpy()
        numpy_cols.append(col)

    print(f'total loading csv: {time.time() - start}')
    start = time.time()

    # Insert columns into SQream
    col_num = csv_arrow.shape[1]
    con.executemany(
        f'insert into {table_name} values ({"?," * (col_num - 1)}?)',
        numpy_cols)
    print(f'total inserting csv: {time.time() - start}')
def read_rms(self):
    self.rms_df = pa.concat_tables([
        csv.read_csv(
            fn,
            parse_options=csv.ParseOptions(delimiter='\t'),
            convert_options=csv.ConvertOptions(column_types={
                'upc': pa.int64(),
                'upc_ver_uc': pa.uint8()
            })) for fn in self.rms_dict.values()
    ]).to_pandas()
    return
def read_product(self, keep_groups=None, drop_groups=None,
                 keep_modules=None, drop_modules=None):
    prod_cols = [
        'upc', 'upc_ver_uc', 'upc_descr', 'product_module_code',
        'product_module_descr', 'product_group_code', 'product_group_descr',
        'brand_code_uc', 'brand_descr', 'multi', 'size1_code_uc',
        'size1_amount', 'size1_units', 'dataset_found_uc',
        'size1_change_flag_uc'
    ]
    prod_dict = {
        'upc': pa.int64(),
        'upc_ver_uc': pa.int8(),
        'product_module_code': pa.uint16(),
        'brand_code_uc': pa.uint32(),
        'multi': pa.uint16(),
        'size1_code_uc': pa.uint16()
    }
    prod_df = csv.read_csv(
        self.product_file,
        read_options=csv.ReadOptions(encoding='latin'),
        parse_options=csv.ParseOptions(delimiter='\t'),
        convert_options=csv.ConvertOptions(
            column_types=prod_dict,
            include_columns=prod_cols)).to_pandas()

    if keep_groups:
        prod_df = prod_df[prod_df['product_group_code'].isin(keep_groups)]
    if drop_groups:
        prod_df = prod_df[~prod_df['product_group_code'].isin(drop_groups)]
    if keep_modules:
        prod_df = prod_df[prod_df['product_module_code'].isin(keep_modules)]
    if drop_modules:
        prod_df = prod_df[~prod_df['product_module_code'].isin(drop_modules)]

    # dictionary encoding to save space
    prod_df['size1_units'] = prod_df['size1_units'].astype('category')
    prod_df['product_module_descr'] = prod_df['product_module_descr'].astype('category')
    prod_df['product_group_code'] = prod_df['product_group_code'].astype('category')

    # clean up product info
    prod_df['upc_descr'] = prod_df['upc_descr'].str.strip().str.replace('RTE', '')
    prod_df['brand_descr'] = prod_df['brand_descr'].str.strip().str.replace('CTL BR', 'Private Label')

    self.prod_df = prod_df.copy()
    return
def _read_csv(self) -> "Table":
    import pyarrow as pa
    from pyarrow import csv

    if not self.output_location:
        raise ProgrammingError("OutputLocation is none or empty.")
    if not self.output_location.endswith((".csv", ".txt")):
        return pa.Table.from_pydict(dict())

    length = self._get_content_length()
    if length and self.output_location.endswith(".txt"):
        description = self.description if self.description else []
        column_names = [d[0] for d in description]
        read_opts = csv.ReadOptions(
            skip_rows=0,
            column_names=column_names,
            block_size=self._block_size,
            use_threads=True,
        )
        parse_opts = csv.ParseOptions(
            delimiter="\t",
            quote_char=False,
            double_quote=False,
            escape_char=False,
        )
    elif length and self.output_location.endswith(".csv"):
        read_opts = csv.ReadOptions(skip_rows=0,
                                    block_size=self._block_size,
                                    use_threads=True)
        parse_opts = csv.ParseOptions(
            delimiter=",",
            quote_char='"',
            double_quote=True,
            escape_char=False,
        )
    else:
        return pa.Table.from_pydict(dict())

    bucket, key = parse_output_location(self.output_location)
    try:
        return csv.read_csv(
            self._fs.open_input_stream(f"{bucket}/{key}"),
            read_options=read_opts,
            parse_options=parse_opts,
            convert_options=csv.ConvertOptions(
                quoted_strings_can_be_null=False,
                timestamp_parsers=self.timestamp_parsers,
                column_types=self.column_types,
            ),
        )
    except Exception as e:
        _logger.exception(f"Failed to read {bucket}/{key}.")
        raise OperationalError(*e.args) from e
def read_delta_file_envelopes(
        annotated_delta_manifests: List[Dict[str, Any]],
        column_names: List[str],
        primary_keys: List[str],
        sort_keys: List[str],
        deltacat_storage=unimplemented_deltacat_storage) \
        -> Optional[List[Dict[str, Any]]]:

    tables_and_annotations = []
    columns_to_read = list(chain(primary_keys, sort_keys))
    for annotated_delta_manifest in annotated_delta_manifests:
        tables = deltacat_storage.download_delta_manifest(
            annotated_delta_manifest,
            # kwargs forwarded to each content type's file reader
            file_reader_kwargs={
                CONTENT_TYPE_TO_USER_KWARGS_KEY[ContentType.CSV.value]: {
                    "read_options": pacsv.ReadOptions(column_names=column_names),
                    "convert_options": pacsv.ConvertOptions(include_columns=columns_to_read),
                },
                CONTENT_TYPE_TO_USER_KWARGS_KEY[ContentType.PARQUET.value]: {
                    "columns": columns_to_read
                },
                CONTENT_TYPE_TO_USER_KWARGS_KEY[ContentType.FEATHER.value]: {
                    "columns": columns_to_read
                },
            },
        )
        annotations = dma.get_annotations(annotated_delta_manifest)
        assert len(tables) == len(annotations), (
            f"Unexpected Error: Length of downloaded delta manifest tables "
            f"({len(tables)}) doesn't match the length of delta manifest "
            f"annotations ({len(annotations)}).")
        tables_and_annotations.append((tables, annotations))

    if not tables_and_annotations:
        return None

    delta_file_envelopes = []
    for tables, annotations in tables_and_annotations:
        for i in range(len(tables)):
            delta_file = delta_file_envelope.of(
                dma.get_annotation_stream_position(annotations[i]),
                dma.get_annotation_file_index(annotations[i]),
                dma.get_annotation_delta_type(annotations[i]),
                tables[i],
            )
            delta_file_envelopes.append(delta_file)
    return delta_file_envelopes
def _convert_options_from_dict(**kwargs):
    """Returns the convert options for CSV.

    Returns:
        (object) A pyarrow ConvertOptions object.
    """
    return csv.ConvertOptions(
        check_utf8=kwargs.pop('check_utf8', True),
        column_types=kwargs.pop('column_types', None),
        null_values=kwargs.pop('null_values', [" "]),
        true_values=kwargs.pop('true_values', None),
        false_values=kwargs.pop('false_values', None),
        strings_can_be_null=kwargs.pop('strings_can_be_null', True),
        auto_dict_encode=kwargs.pop('auto_dict_encode', None),
        auto_dict_max_cardinality=kwargs.pop('auto_dict_max_cardinality', None),
        include_columns=kwargs.pop('include_columns', None),
        include_missing_columns=kwargs.pop('include_missing_columns', None))
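# --- Usage sketch (assumed caller): only the options you pass override the
# defaults baked into _convert_options_from_dict(); column names here are
# illustrative.
import pyarrow as pa

opts = _convert_options_from_dict(
    column_types={"id": pa.int64(), "name": pa.string()},
    null_values=["", "NA"],
)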
def as_parquet(csv_file, metadata_file):
    result = io.BytesIO()
    csv_parse_options = csv.ParseOptions(delimiter="\t")
    schema_parse_options = csv.ParseOptions(delimiter="|")

    schema = csv.read_csv(metadata_file, parse_options=schema_parse_options)
    pyarrow_schema = create_pyarrow_schema(schema["cleaned_name"],
                                           schema["data_type"])

    csv_convert_options = csv.ConvertOptions(column_types=pyarrow_schema)
    parquet_table = csv.read_csv(csv_file,
                                 parse_options=csv_parse_options,
                                 convert_options=csv_convert_options)

    w = parquet.ParquetWriter(where=result,
                              schema=pyarrow_schema,
                              compression="SNAPPY",
                              flavor="spark")
    w.write_table(parquet_table)
    return result
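# --- Hedged sketch: create_pyarrow_schema() is not shown in this snippet. A
# minimal version, assuming the metadata CSV pairs a cleaned column name with
# a type-name string (the real mapping is likely richer):
import pyarrow as pa

def create_pyarrow_schema(names_column, types_column):
    type_map = {
        "string": pa.string(),
        "int": pa.int64(),
        "double": pa.float64(),
        "bool": pa.bool_(),
        "date": pa.date32(),
        "timestamp": pa.timestamp("ms"),
    }
    # Iterating ChunkedArrays yields scalars; fall back to string for
    # unrecognized type names.
    return pa.schema([
        pa.field(name.as_py(), type_map.get(dtype.as_py(), pa.string()))
        for name, dtype in zip(names_column, types_column)
    ])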
def read_tables(
    names: Iterable[str],
    data_directory: Path,
) -> Iterator[tuple[str, pa.Table]]:
    import pyarrow.csv as pac

    import ibis.backends.pyarrow.datatypes as pa_dt

    for name in names:
        schema = TEST_TABLES[name]
        convert_options = pac.ConvertOptions(column_types={
            name: pa_dt.to_pyarrow_type(type)
            for name, type in schema.items()
        })
        yield name, pac.read_csv(
            data_directory / f'{name}.csv',
            convert_options=convert_options,
        )
def test_null_col():
    target = {'a': [1, 2, 3], 'b': [1, 2, 3], 'c': [None, None, None]}
    fp = io.BytesIO(b'a,b,c\n1,1,\n2,2,\n3,3,\n')
    fp.seek(0)
    table = csv.read_csv(fp,
                         convert_options=csv.ConvertOptions(column_types={
                             'a': pa.int32(),
                             'b': pa.int64(),
                             'c': pa.int64(),
                         }))
    assert table
    engine.importArrowTable("test_null_col", table)
    assert not engine.closed
    cursor = engine.executeDML("select * from test_null_col")
    assert cursor
    batch = cursor.getArrowRecordBatch()
    assert batch
    assert batch.to_pydict() == target
def read_csv(self, filenames, delimiter=','):
    parquet_writer = None
    for file in filenames:
        csv_reader = csv.open_csv(
            file,
            read_options=csv.ReadOptions(use_threads=True),
            parse_options=csv.ParseOptions(delimiter=delimiter),
            convert_options=csv.ConvertOptions(column_types=self.dtype))
        # Create the writer once, from the first file's schema, so batches
        # from every file are appended to the same Parquet file instead of
        # re-opening (and leaking) a writer per input file.
        if parquet_writer is None:
            parquet_writer = pq.ParquetWriter(self.parquet_file,
                                              csv_reader.schema)
        nrow = 0
        for batch in csv_reader:
            batch_df = batch.to_pandas()
            nrow += batch_df.shape[0]
            parquet_writer.write_table(pa.Table.from_pandas(df=batch_df))
    parquet_writer.close()
    return ds.dataset(self.parquet_file, format="parquet")
def test_csv_options(in_type, pd_old_type, pd_new_type):
    schema = pa.schema([("string_col", pa.string())])
    read_options = csv.ReadOptions(skip_rows=1)
    parse_options = csv.ParseOptions(quote_char="'",
                                     escape_char="\\",
                                     delimiter=";",
                                     newlines_in_values=True)
    convert_options = csv.ConvertOptions(
        include_columns=["i", "my_string", "nonexistent_column"],
        include_missing_columns=True,
        null_values=["NULL_STRING"],
        strings_can_be_null=True,
    )
    df = pa_read_csv_to_pandas(
        "tests/data/csv_options_test.csv",
        schema,
        False,
        pd_string=False,
        parse_options=parse_options,
        convert_options=convert_options,
        read_options=read_options,
    )
    expected = [
        "dsfasd;dsffadsf",
        "dsfasd;dsffadsf",
        None,
        "this text\nhas a line break",
        "this text, like so, has commas",
    ]
    assert df.columns.tolist() == ["i", "my_string", "nonexistent_column"]
    assert df["nonexistent_column"].isnull().all()
    assert_series_equal(df["my_string"], Series(expected, name="my_string"))
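# --- Hedged sketch: pa_read_csv_to_pandas() is assumed to be a thin wrapper
# that reads the CSV with pyarrow, optionally casts it to the expected schema,
# and converts to pandas; the real helper likely handles more edge cases.
import pandas as pd
import pyarrow as pa
from pyarrow import csv

def pa_read_csv_to_pandas(input_file, schema=None, expect_full_schema=True,
                          pd_string=True, read_options=None,
                          parse_options=None, convert_options=None):
    table = csv.read_csv(input_file,
                         read_options=read_options,
                         parse_options=parse_options,
                         convert_options=convert_options)
    if schema is not None and expect_full_schema:
        table = table.cast(schema)  # assumes every schema field is present
    # Map arrow strings to the pandas nullable string dtype when requested.
    types_mapper = ({pa.string(): pd.StringDtype()}.get if pd_string else None)
    return table.to_pandas(types_mapper=types_mapper)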
def read_one_sales(fn, stores_list=None, incl_promo=True):
    my_cols = [
        'store_code_uc', 'upc', 'week_end', 'units', 'prmult', 'price'
    ]
    if incl_promo:
        my_cols = my_cols + ['feature', 'display']

    convert_dict = {
        'feature': pa.int8(),
        'display': pa.int8(),
        'prmult': pa.int8(),
        'units': pa.uint16(),
        'store_code_uc': pa.uint32()
    }

    dataset = ds.dataset(
        csv.read_csv(fn,
                     parse_options=csv.ParseOptions(delimiter='\t'),
                     convert_options=csv.ConvertOptions(
                         column_types=convert_dict,
                         include_columns=my_cols)))

    if stores_list is None:
        return dataset.to_table()
    else:
        return dataset.to_table(
            filter=ds.field('store_code_uc').isin(stores_list))
import pyarrow as pa
import pyarrow.csv as pv
import pyarrow.parquet as pq
from pyarrow.lib import Table

csv = 'accumulated_data_300_million_rows_id_filter.csv'
target_file = '../data/accumulated_data_300_million_rows_id_filter_1mill.parquet'

csv_read_options = pv.ReadOptions(skip_rows=0,
                                  encoding="utf8",
                                  column_names=["unit_id"])

# Types: https://arrow.apache.org/docs/python/api/datatypes.html
data_schema = pa.schema([('unit_id', pa.uint64())])

# ConvertOptions: https://arrow.apache.org/docs/python/generated/pyarrow.csv.ConvertOptions.html#pyarrow.csv.ConvertOptions
csv_convert_options = pv.ConvertOptions(column_types=data_schema)

table: Table = pv.read_csv(input_file=csv,
                           read_options=csv_read_options,
                           convert_options=csv_convert_options)

pq.write_table(table, target_file)

print('Generated file with the following:')
print('Parquet metadata: ' + str(pq.read_metadata(target_file)))
print('Parquet schema: ' + pq.read_schema(target_file).to_string())
def csv_reader_get_pyarrow_convert_options(names, usecols, dtype, parse_dates):
    include_columns = None  # default value (include all CSV columns)

    # If names is not given, column names will be defined from the first row
    # of the CSV file; otherwise pyarrow autogenerated column names will be
    # used (see ReadOptions), so map pandas usecols to pyarrow
    # include_columns accordingly.
    if usecols:
        if type(usecols[0]) == str:
            if names:
                include_columns = [f'f{names.index(col)}' for col in usecols]
            else:
                include_columns = usecols  # no autogenerated names
        elif type(usecols[0]) == int:
            include_columns = [f'f{i}' for i in usecols]
        else:
            assert False, f"Failed building pyarrow ConvertOptions due to usecols param value: {usecols}"

    if dtype:
        # The dtype pandas read_csv argument maps to the pyarrow column_types
        # dict, but column names must match those that are read from the CSV
        # (if names is None) or pyarrow's generated names (otherwise).
        if isinstance(dtype, dict):
            if names:
                names_list = list(names)
                column_types = {}
                for k, v in dtype.items():
                    # TO-DO: check this is aligned with include_columns
                    column_name = "f{}".format(names_list.index(k))
                    if isinstance(v, pd.CategoricalDtype):
                        column_type = pa.string()
                    else:
                        column_type = pa.from_numpy_dtype(v)
                    column_types[column_name] = column_type
            else:
                column_types = {k: pa.from_numpy_dtype(v) for k, v in dtype.items()}
        else:
            # single dtype for all columns
            pa_dtype = pa.from_numpy_dtype(dtype)
            if names:
                names_list = list(names)
                column_types = {f"f{names_list.index(k)}": pa_dtype for k in names}
            elif usecols:
                column_types = dict.fromkeys(usecols, pa_dtype)
            else:
                column_types = pa_dtype
    else:
        column_types = None

    # TO-DO: support all possible parse_dates values (now only a list of
    # column positions is supported)
    try:
        for column in parse_dates:
            name = f"f{column}"
            # Starting from pyarrow=3.0.0, strings are parsed to DateType
            # (converted back to 'object' when using to_pandas), but not to
            # TimestampType (which is used to represent np.datetime64),
            # see: pyarrow.from_numpy_dtype(np.datetime64('NaT', 's')).
            # So make pyarrow infer the needed type manually.
            column_types[name] = pa.timestamp('s')
    except (KeyError, TypeError):
        pass

    convert_options = csv.ConvertOptions(
        column_types=column_types,
        strings_can_be_null=True,
        include_columns=include_columns,
    )
    return convert_options
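# --- Usage sketch (assumed): mirroring a pandas call such as
#   pd.read_csv(path, names=['a', 'b'], usecols=['b'], dtype={'b': np.int64})
# where pyarrow autogenerates column names f0, f1, ... (see ReadOptions).
import numpy as np

convert_options = csv_reader_get_pyarrow_convert_options(
    names=['a', 'b'],
    usecols=['b'],
    dtype={'b': np.int64},
    parse_dates=None,
)
# -> include_columns == ['f1'], column_types == {'f1': pa.int64()}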
import os
import sys

import pyarrow as pa
from pyarrow import csv
import pyarrow.plasma as plasma

csv_name = sys.argv[1]

client = plasma.connect("/tmp/fast_python")

convert_options = csv.ConvertOptions(column_types={"VendorID": pa.bool_()},
                                     true_values=["Y", "1"],
                                     false_values=["N", "2"])
table = csv.read_csv(csv_name, convert_options=convert_options)

pid = os.getpid()
plid = plasma.ObjectID(f"csv-{pid}".ljust(20, " ").encode("us-ascii"))
a = client.put(table, plid)
# eviction
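# --- Hedged companion sketch: a consumer process could fetch the same table
# from the plasma store by rebuilding the ObjectID; how the producer's pid is
# communicated is assumed (note pyarrow.plasma is deprecated in newer pyarrow).
reader_client = plasma.connect("/tmp/fast_python")
shared_table = reader_client.get(
    plasma.ObjectID(f"csv-{pid}".ljust(20, " ").encode("us-ascii")))
print(shared_table.num_rows)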
def pa_convert_options(self):
    convert_options = self.convert_options or pac.ConvertOptions(
        column_types=self.features.type if self.features is not None else None)
    return convert_options
# ver = df.__version__
ver = "7.0.0"
git = ""
task = "groupby"
solution = "datafusion"
fun = ".groupby"
cache = "TRUE"
on_disk = "FALSE"

data_name = os.environ["SRC_DATANAME"]
src_grp = os.path.join("data", data_name + ".csv")
print("loading dataset %s" % src_grp, flush=True)

data = pacsv.read_csv(
    src_grp, convert_options=pacsv.ConvertOptions(auto_dict_encode=True))
print("dataset loaded")

ctx = df.ExecutionContext()
ctx.register_record_batches("x", [data.to_batches()])
print("registered record batches")
# cols = ctx.sql("SHOW columns from x")
# ans.show()

in_rows = data.num_rows
# print(in_rows, flush=True)

task_init = timeit.default_timer()

question = "sum v1 by id1"  # q1
gc.collect()
import pyarrow as pa
from pyarrow import csv
import pyarrow.compute as pc
import pyarrow.parquet as pq

table = csv.read_csv(
    "../../07-pandas/sec1-intro/yellow_tripdata_2020-01.csv.gz")
pq.write_table(table, "202001.parquet")

convert_options = csv.ConvertOptions(column_types={
    "VendorID": pa.uint8(),
    "passenger_count": pa.uint8(),
})
table = csv.read_csv(
    "../../07-pandas/sec1-intro/yellow_tripdata_2020-01.csv.gz",
    convert_options=convert_options)

# vvv read_csv doesn't support this
table["VendorID"]
tp = table.to_pandas()

# pq.write_table(table, "202001.parquet", use_dictionary=False)
pq.write_table(
    table,
    "202001.parquet",
)

# memory size, disk size
# retype as int64 for both
# memory size, disk size
csv_parse_options = pv.ParseOptions(delimiter=';')

# Types: https://arrow.apache.org/docs/python/api/datatypes.html
# TODO nullable parameter does not work as expected!
data_schema = pa.schema([
    pa.field(name='start_year', type=pa.string(), nullable=True),
    pa.field(name='unit_id', type=pa.uint64(), nullable=False),
    pa.field(name='value', type=pa.string(), nullable=False),
    pa.field(name='start_epoch_days', type=pa.int16(), nullable=True),
    pa.field(name='stop_epoch_days', type=pa.int16(), nullable=True),
])

# ConvertOptions: https://arrow.apache.org/docs/python/generated/pyarrow.csv.ConvertOptions.html#pyarrow.csv.ConvertOptions
csv_convert_options = pv.ConvertOptions(column_types=data_schema,
                                        include_columns=[
                                            "unit_id", "value", "start_year",
                                            "start_unix_days", "stop_unix_days"
                                        ])

# read_csv: https://arrow.apache.org/docs/python/generated/pyarrow.csv.read_csv.html#pyarrow.csv.read_csv
table = pv.read_csv(input_file=csv_filename,
                    read_options=csv_read_options,
                    parse_options=csv_parse_options,
                    convert_options=csv_convert_options)

print(table.nbytes)
print(table.num_rows)
print(table.schema)
print(table.column_names)

pandas.set_option('max_columns', None)  # print all columns
print(table.to_pandas().head(10))
def pa_convert_options(self):
    convert_options = self.convert_options or pac.ConvertOptions()
    return convert_options
def read_year(self, year, hh_states_keep=None, hh_states_drop=None,
              hh_dma_keep=None, hh_dma_drop=None):
    (purch_fn, trip_fn, panelist_fn) = get_fns(self.annual_dict[year])

    hh_ds = ds.dataset(
        csv.read_csv(panelist_fn,
                     parse_options=csv.ParseOptions(delimiter='\t'),
                     convert_options=csv.ConvertOptions(
                         auto_dict_encode=True,
                         auto_dict_max_cardinality=1024)))

    # build an arrow dataset filter object one by one
    my_filter = ds.field('Projection_Factor') > 0
    if hh_states_keep:
        my_filter = my_filter & (ds.field('Fips_State_Desc').isin(hh_states_keep))
    if hh_states_drop:
        my_filter = my_filter & (~ds.field('Fips_State_Desc').isin(hh_states_drop))
    if hh_dma_keep:
        my_filter = my_filter & (ds.field('DMA_Cd').isin(hh_dma_keep))
    if hh_dma_drop:
        my_filter = my_filter & (~ds.field('DMA_Cd').isin(hh_dma_drop))

    # convert to pandas and get unique HH list
    hh_df = hh_ds.to_table(filter=my_filter).to_pandas().rename(
        columns=hh_dict_rename)
    hh_list = hh_df.household_code.unique()

    # use pyarrow filter to filter trips for just our households
    trip_df = ds.dataset(
        csv.read_csv(trip_fn, parse_options=csv.ParseOptions(delimiter='\t')))\
        .to_table(filter=ds.field('household_code').isin(hh_list)).to_pandas()
    trip_list = trip_df.trip_code_uc.unique()
    upc_list = self.prod_df.upc.unique()

    # use pyarrow to filter purchases using trips and UPCs only
    purch_ds = ds.dataset(
        csv.read_csv(purch_fn,
                     parse_options=csv.ParseOptions(delimiter='\t'),
                     convert_options=csv.ConvertOptions(
                         auto_dict_encode=True,
                         auto_dict_max_cardinality=1024)))
    purch_filter = ds.field('trip_code_uc').isin(trip_list) & \
        ds.field('upc').isin(upc_list)
    purch_df = purch_ds.to_table(filter=purch_filter).to_pandas()

    # Add the fields to the trips and purchases for convenience later
    trip_df2 = pd.merge(trip_df,
                        hh_df[self.hh_cols],
                        on=['household_code', 'panel_year'])
    purch_df2 = pd.merge(
        pd.merge(purch_df, self.prod_df[self.prod_cols], on=['upc', 'upc_ver_uc']),
        trip_df2[self.hh_cols + ['trip_code_uc', 'purchase_date', 'store_code_uc']],
        on=['trip_code_uc'])\
        .rename(columns={'fips_state_desc': 'hh_state_desc'})

    self.purch_df = self.purch_df.append(purch_df2, ignore_index=True)
    self.trip_df = self.trip_df.append(trip_df2, ignore_index=True)
    self.hh_df = self.hh_df.append(hh_df, ignore_index=True)
    return
import pyarrow as pa
from pyarrow import csv

if __name__ == "__main__":
    # read CSV
    CSV_NAME = "test.csv"

    # make sure bools are read correctly
    options = csv.ConvertOptions(column_types={
        "i64": pa.int64(),
        "bool": pa.bool_()
    })
    table = csv.read_csv(CSV_NAME, convert_options=options)

    # write table to arrow
    writer = pa.RecordBatchFileWriter("test2.arrow", table.schema)
    writer.write_table(table)
    writer.close()
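# --- Companion sketch: the IPC file written above can be read back with
# pyarrow's file reader ("test2.arrow" is the name used in the snippet):
    reader = pa.ipc.open_file("test2.arrow")
    round_tripped = reader.read_all()
    assert round_tripped.equals(table)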
tot_bytes = 0
for name in table.column_names:
    col_bytes = table[name].nbytes
    col_type = table[name].type
    print(name, col_bytes // (1024**2))
    tot_bytes += col_bytes
print("Total", tot_bytes // (1024**2))

table["store_and_fwd_flag"].unique()
table_df = table.to_pandas()

convert_options = csv.ConvertOptions(
    column_types={
        "VendorID": pa.bool_(),
        # "trip_distance": pa.float16()
    },
    true_values=["Y", "1"],
    false_values=["N", "2"])
table = csv.read_csv("../sec1-intro/yellow_tripdata_2020-01.csv.gz",
                     convert_options=convert_options)
print(table["store_and_fwd_flag"].unique(),
      table["store_and_fwd_flag"].nbytes // (1024**2),
      table["VendorID"].nbytes // 1024,
      table["store_and_fwd_flag"].nbytes // 1024)

x = pa.array([False, True]).cast(pa.string()).cast(pa.bool_())

table_df = table.to_pandas()
print(table_df.store_and_fwd_flag)

mission_impossible = table.to_pandas(self_destruct=True)
def _parse_data_to_pandas(filepath: str, table_params: dict, metadata: dict):
    """
    Reads in the data from the given filepath and returns a dataframe
    """
    meta_col_names = [
        c["name"] for c in metadata["columns"]
        if c["name"] not in metadata.get("partitions", [])
    ]

    # For string-based file types, make the arrow readers read date/time
    # columns in as strings; validators will still treat these as dates but
    # will run validation against strings for cols expecting values to match
    # a timestamp format.
    if "json" in metadata["file_format"] or "csv" in metadata["file_format"]:
        md_obj = Metadata.from_dict(metadata)
        cols = md_obj.columns

        cols_to_force_str_read_in = []
        for c in cols:
            if c["type"].startswith("time") or c["type"].startswith("date"):
                c["type"] = "string"
                c["type_category"] = "string"
                cols_to_force_str_read_in.append(c["name"])

        md_obj.columns = cols
        ac = ArrowConverter()
        arrow_schema = ac.generate_from_meta(md_obj)

        ts_as_str_schema = pa.schema([])
        for cname in cols_to_force_str_read_in:
            ts_as_str_schema = ts_as_str_schema.append(arrow_schema.field(cname))

    # Set the reader type
    if filepath.startswith("s3://"):
        reader_fs = fs.S3FileSystem(region="eu-west-1")
        fp_for_file_reader = filepath.replace("s3://", "", 1)
    else:
        reader_fs = fs.LocalFileSystem()
        fp_for_file_reader = filepath

    with reader_fs.open_input_stream(fp_for_file_reader) as f:
        if "csv" in metadata["file_format"]:
            # Safer CSV load for newlines_in_values set to True. Note that
            # column names for headerless files belong in ReadOptions, not
            # ParseOptions.
            po = csv.ParseOptions(newlines_in_values=True)
            if table_params.get("expect-header", True):
                ro = None
            else:
                ro = csv.ReadOptions(column_names=meta_col_names)

            if ts_as_str_schema:
                co = csv.ConvertOptions(column_types=ts_as_str_schema)
            else:
                co = None

            df = pa_read_csv_to_pandas(
                input_file=f,
                schema=arrow_schema,
                expect_full_schema=False,
                read_options=ro,
                parse_options=po,
                convert_options=co,
            )
            # dates/datetimes == string

        elif "json" in metadata["file_format"]:
            po = json.ParseOptions(
                newlines_in_values=True,
                explicit_schema=ts_as_str_schema if ts_as_str_schema else None,
            )
            df = pa_read_json_to_pandas(
                input_file=f,
                schema=arrow_schema,
                expect_full_schema=False,
                parse_options=po,
            )
            # dates/datetimes == string

        elif "parquet" in metadata["file_format"]:
            df = arrow_to_pandas(pq.read_table(f))
            # dates/datetimes == datetime / date

        else:
            raise ValueError(
                f"Unknown file_format in metadata: {metadata['file_format']}.")

    if table_params.get("row-limit"):
        df = df.sample(table_params.get("row-limit"))

    if table_params.get("headers-ignore-case"):
        df_cols = [c.lower() for c in df.columns]
        df.columns = df_cols

    if table_params.get("only-test-cols-in-metadata", False):
        keep_cols = [c for c in df.columns if c in meta_col_names]
        df = df[keep_cols]

    return df