def _read_csv(self) -> "Table":
    """Load the query output object into a pyarrow Table.

    ``.txt`` outputs are parsed as unquoted, tab-separated data whose column
    names are the first element of each ``self.description`` entry; ``.csv``
    outputs are parsed as comma-separated data with double-quote quoting.
    An empty table is returned when the output location has any other
    extension or when ``self._get_content_length()`` is falsy.

    Raises:
        ProgrammingError: If ``self.output_location`` is unset or empty.
        OperationalError: If opening or parsing the object fails.
    """
    import pyarrow as pa
    from pyarrow import csv

    location = self.output_location
    if not location:
        raise ProgrammingError("OutputLocation is none or empty.")

    is_text = location.endswith(".txt")
    if not (is_text or location.endswith(".csv")):
        return pa.Table.from_pydict(dict())
    if not self._get_content_length():
        return pa.Table.from_pydict(dict())

    if is_text:
        header = [column[0] for column in (self.description or [])]
        read_opts = csv.ReadOptions(
            skip_rows=0,
            column_names=header,
            block_size=self._block_size,
            use_threads=True,
        )
        parse_opts = csv.ParseOptions(
            delimiter="\t",
            quote_char=False,
            double_quote=False,
            escape_char=False,
        )
    else:
        read_opts = csv.ReadOptions(
            skip_rows=0,
            block_size=self._block_size,
            use_threads=True,
        )
        parse_opts = csv.ParseOptions(
            delimiter=",",
            quote_char='"',
            double_quote=True,
            escape_char=False,
        )

    bucket, key = parse_output_location(location)
    try:
        # Option construction and stream opening stay inside the try so that
        # their failures are wrapped as OperationalError as well.
        convert_opts = csv.ConvertOptions(
            quoted_strings_can_be_null=False,
            timestamp_parsers=self.timestamp_parsers,
            column_types=self.column_types,
        )
        stream = self._fs.open_input_stream(f"{bucket}/{key}")
        return csv.read_csv(
            stream,
            read_options=read_opts,
            parse_options=parse_opts,
            convert_options=convert_opts,
        )
    except Exception as e:
        _logger.exception(f"Failed to read {bucket}/{key}.")
        raise OperationalError(*e.args) from e
def _read_table_arrow(self, source: tp.BinaryIO, schema: pa.Schema) -> pa.Table:
    """Decode a CSV stream into an Arrow table that follows ``schema``.

    Only the columns named in ``schema`` are read, each converted to the
    type the schema declares for it. Unquoted strings may become null;
    quoted strings may not.

    Raises:
        _ex.EDataCorruption: If pyarrow raises ``ArrowInvalid`` or
            ``ArrowKeyError`` while reading.
    """
    try:
        read_options = pa_csv.ReadOptions(encoding='utf-8', use_threads=False)
        parse_options = pa_csv.ParseOptions(newlines_in_values=True)
        convert_options = pa_csv.ConvertOptions(
            include_columns=schema.names,
            column_types=dict(zip(schema.names, schema.types)),
            strings_can_be_null=True,
            quoted_strings_can_be_null=False,
        )
        return pa_csv.read_csv(
            source,
            read_options=read_options,
            parse_options=parse_options,
            convert_options=convert_options,
        )

    except pa.ArrowInvalid as e:
        err = "CSV file decoding failed, content is garbled"
        self._log.exception(err)
        raise _ex.EDataCorruption(err) from e

    except pa.ArrowKeyError as e:
        err = "CSV file decoding failed, one or more columns is missing"
        self._log.error(err)
        self._log.exception(str(e))
        raise _ex.EDataCorruption(err) from e
def write_files(metadata: AlchemyMetadata) -> None:
    """Create a Parquet file for each table in the schema.

    For every table in ``metadata.tables`` the matching ``.csv.zst`` extract
    under ``EXTRACT_PATH_PREFIX/<schema>/`` is streamed batch by batch into a
    zstd-compressed Parquet file under ``PARQUET_PREFIX/<schema>/``. Both
    directories are created if missing.
    """

    def get_path(prefix: Path, name: str, suffix: str) -> Path:
        # Files live in a per-schema sub-directory of the given prefix.
        parent_dir = prefix.joinpath(metadata.schema)
        parent_dir.mkdir(exist_ok=True, parents=True)
        return parent_dir.joinpath(name).with_suffix(suffix)

    for table in metadata.tables.values():
        name = table.name
        print(name)

        extract_file = get_path(EXTRACT_PATH_PREFIX, name, ".csv.zst")
        parquet_file = get_path(PARQUET_PREFIX, name, ".parquet")

        # Resolve the field list once; it feeds both the schema and the names.
        fields = list(get_fields(table))
        arrow_schema = pa.schema(fields)
        column_names = [field_name for field_name, _ in fields]

        read_options = pcsv.ReadOptions(column_names=column_names,
                                        block_size=1000000000)
        parse_options = pcsv.ParseOptions(newlines_in_values=True)
        convert_options = pcsv.ConvertOptions(
            column_types=arrow_schema,
            timestamp_parsers=["%Y%m%d", "%Y-%m-%d"],
            true_values=["1", "T"],
            false_values=["0", "F"],
            strings_can_be_null=True,
        )

        parquet_writer = pq.ParquetWriter(parquet_file,
                                          schema=arrow_schema,
                                          compression='zstd',
                                          version="2.0",
                                          use_dictionary=True)
        try:
            stream_reader = pcsv.open_csv(extract_file,
                                          read_options=read_options,
                                          parse_options=parse_options,
                                          convert_options=convert_options)
            for batch in stream_reader:
                parquet_writer.write_table(pa.Table.from_batches([batch]))
        finally:
            # Always close so the file handle is released even when reading
            # or converting a batch fails part-way through.
            parquet_writer.close()
def _read_csv_with_offset_pyarrow_on_ray(fname, num_splits, start, end, kwargs, header):  # pragma: no cover
    """Use a Ray task to read a chunk of a CSV into a pyarrow Table.

    Note: Ray functions are not detected by codecov (thus pragma: no cover)

    Args:
        fname: The filename of the file to open.
        num_splits: The number of splits (partitions) to separate the
            DataFrame into.
        start: The start byte offset.
        end: The end byte offset.
        kwargs: The kwargs for the pyarrow `read_csv` function. Not used by
            this function body.
        header: The header of the file (bytes prepended before the file's
            first line).

    Returns:
        A list holding ``num_splits`` column-wise slices of the parsed table,
        followed by the number of rows of the table and, as the last element,
        a pandas Series of per-column pandas dtypes indexed by column name.
    """
    # The context manager closes the file even if a read or seek raises.
    with open(fname, "rb") as bio:
        # The header line for the CSV file
        first_line = bio.readline()
        bio.seek(start)
        to_read = header + first_line + bio.read(end - start)

    # NOTE(review): `header_rows` is kept exactly as the original passed it;
    # confirm it is accepted by the pyarrow version in use.
    table = csv.read_csv(BytesIO(to_read),
                         parse_options=csv.ParseOptions(header_rows=1))
    chunksize = get_default_chunksize(table.num_columns, num_splits)
    chunks = [
        pa.Table.from_arrays(table.columns[chunksize * i:chunksize * (i + 1)])
        for i in range(num_splits)
    ]
    return chunks + [
        table.num_rows,
        pandas.Series([t.to_pandas_dtype() for t in table.schema.types],
                      index=table.schema.names),
    ]
def pa_parse_options(self):
    """Return the parse options with the configured overrides applied.

    Starts from ``self.parse_options`` (or a fresh ``pac.ParseOptions()``
    when that is falsy) and overwrites ``delimiter`` / ``quote_char`` only
    for the attributes that are not ``None`` on ``self``.
    """
    options = self.parse_options
    if not options:
        options = pac.ParseOptions()

    overrides = (
        ("delimiter", self.delimiter),
        ("quote_char", self.quote_char),
    )
    for attr, value in overrides:
        if value is not None:
            setattr(options, attr, value)
    return options
def parse(self, **kwargs):
    """Parse a byte range of a CSV file into column-wise pyarrow tables.

    Recognised keyword arguments (all popped from ``kwargs``): ``fname``,
    ``num_splits``, ``start``, ``end`` and ``header``. The bytes parsed are
    ``header`` + the file's first line + the ``start``..``end`` byte range.

    Returns:
        A list holding ``num_splits`` column-wise slices of the parsed table,
        followed by the table's row count and a pandas Series of per-column
        pandas dtypes indexed by column name.
    """
    import pyarrow as pa
    import pyarrow.csv as csv

    fname = kwargs.pop("fname", None)
    num_splits = kwargs.pop("num_splits", None)
    start = kwargs.pop("start", None)
    end = kwargs.pop("end", None)
    header = kwargs.pop("header", None)

    # The context manager closes the file even if a read or seek raises.
    with open(fname, "rb") as bio:
        # The header line for the CSV file
        first_line = bio.readline()
        bio.seek(start)
        to_read = header + first_line + bio.read(end - start)

    # NOTE(review): `header_rows` is kept exactly as the original passed it;
    # confirm it is accepted by the pyarrow version in use.
    table = csv.read_csv(BytesIO(to_read),
                         parse_options=csv.ParseOptions(header_rows=1))
    chunksize = get_default_chunksize(table.num_columns, num_splits)
    chunks = [
        pa.Table.from_arrays(table.columns[chunksize * i:chunksize * (i + 1)])
        for i in range(num_splits)
    ]
    return chunks + [
        table.num_rows,
        pandas.Series(
            [t.to_pandas_dtype() for t in table.schema.types],
            index=table.schema.names,
        ),
    ]
def scan_file(self, bucket, key, schema):
    """Validate an S3-hosted delimited file against ``schema``.

    The file is opened twice: first only to compare header names with the
    schema field names position by position, then with ``schema`` as the
    column types so that pyarrow raises on values that do not convert.

    Raises:
        ColumnOrderException: If a header name differs from the schema field
            at the same position.
        WrongDelimiterException: If the schema has several fields but the
            file parses as a single column.
        EmptyPrimaryKeyException: If the primary key column is a string
            column and contains an empty string.
    """
    logging.info("delim is %s", self.delimiter)
    uri = f"{bucket}/{key}"
    s3fs = fs.S3FileSystem()
    parse_opts = csv.ParseOptions(delimiter=self.delimiter)

    # Run column order validation by opening and not reading anything.
    # The stream is closed as soon as the header has been inspected.
    with s3fs.open_input_stream(uri) as filestream:
        reader = csv.open_csv(filestream, parse_options=parse_opts)
        for index, col in enumerate(reader.schema):
            if col.name != schema[index].name:
                msg = "column {} is out of order".format(col.name)
                raise ColumnOrderException(msg)

    # Run the rest of the validations.
    with s3fs.open_input_stream(uri) as filestream:
        opts = csv.ConvertOptions(column_types=schema)
        reader = csv.open_csv(filestream, convert_options=opts,
                              parse_options=parse_opts)

        # Kind of a hack, but it works...if delim wrong, everything is read
        # as one column.
        if len(schema) > 1 and len(reader.schema) == 1:
            raise WrongDelimiterException()

        # Parse through the file, pyarrow will throw exceptions
        # if there's invalid data.
        for batch in reader:
            # If primary key is a string, need to check the column
            # for empty strings.
            if schema.field(self.primary_key).type == "string":
                table = pyarrow.Table.from_batches([batch])
                if any(val.as_py() == "" for val in table[self.primary_key]):
                    raise EmptyPrimaryKeyException()
def read_stores(self):
    """Read every stores file into ``self.stores_df``.

    Each path in ``self.stores_dict`` is parsed as tab-separated text with
    narrow unsigned integer column types, the tables are concatenated and
    converted to pandas, ``year`` is renamed to ``panel_year``, blanks in
    the code columns are filled with zero, and the frame is finally cast
    with the entries of ``type_dict`` that match its columns.
    """
    # Columns whose blanks are zero-filled before the final cast.
    fill_cols = [
        'retailer_code', 'parent_code', 'fips_state_code',
        'fips_county_code', 'dma_code', 'store_zip3'
    ]

    # To reduce space -- update with dictionary arrays later
    arrow_types = {
        'year': pa.uint16(),
        'dma_code': pa.uint16(),
        'retailer_code': pa.uint16(),
        'parent_code': pa.uint16(),
        'store_zip3': pa.uint16(),
        'fips_county_code': pa.uint16(),
        'fips_state_code': pa.uint8()
    }

    # Use pyarrow to read CSVs and parse using the dict -- we have to fix
    # some types again later.
    tables = []
    for path in self.stores_dict.values():
        tables.append(
            csv.read_csv(
                path,
                parse_options=csv.ParseOptions(delimiter='\t'),
                convert_options=csv.ConvertOptions(column_types=arrow_types)))
    stores = pa.concat_tables(tables).to_pandas()
    stores = stores.rename(columns={'year': 'panel_year'})

    # some columns have blanks -- fill with zero to avoid converting to
    # floats(!)
    stores.loc[:, fill_cols] = stores.loc[:, fill_cols].fillna(0)

    # use the compressed types
    compact_types = {}
    for column, dtype in type_dict.items():
        if column in stores.columns:
            compact_types[column] = dtype
    self.stores_df = stores.astype(compact_types)
def to_pandas(id: int, columns: list = None, samples: list = None):
    """
    Load the stored bytes of a Dataset into a pandas DataFrame.

    - After unzipping `gzip.open()`, bytesio still needed to be read into
      PyArrow before being read into Pandas.
    - All methods return all columns by default if they receive None:
      `pc.read_csv(read_options.column_names)`, `pa.read_table()`,
      `pd.read_csv(usecols)`, `pd.read_parquet(columns)`

    - `id`: identifier passed to `Dataset.get_by_id`.
    - `columns`: column names to keep; a single string is treated as a
      one-element list.
    - `samples`: positional row indices applied with `df.iloc`.

    Raises `ValueError` when the stored file format is not csv, tsv or
    parquet.
    """
    d = Dataset.get_by_id(id)
    is_compressed = d.is_compressed
    ff = d.file_format

    # When user provides only 1 column and forgets to [] it (e.g. the label column).
    if isinstance(columns, str):
        columns = [columns]

    bytesio_data = io.BytesIO(d.data)
    if ff in ('csv', 'tsv'):
        # `pc.ReadOptions.column_names` verifies the existence of the names,
        # does not filter for them.
        if is_compressed:
            # Close the gzip wrapper once pyarrow has consumed it.
            with gzip.open(bytesio_data) as bytesio_csv:
                if ff == 'tsv':
                    parse_opt = pc.ParseOptions(delimiter='\t')
                    tbl = pc.read_csv(bytesio_csv, parse_options=parse_opt)
                else:
                    tbl = pc.read_csv(bytesio_csv)
            df = tbl.to_pandas()
            if columns is not None:
                df = df.filter(columns)
        elif ff == 'tsv':
            df = pd.read_csv(bytesio_data, sep='\t', usecols=columns)
        else:
            df = pd.read_csv(bytesio_data, usecols=columns)
    elif ff == 'parquet':
        if is_compressed:
            with gzip.open(bytesio_data) as bytesio_parquet:
                tbl = pq.read_table(bytesio_parquet, columns=columns)
            df = tbl.to_pandas()
        else:
            df = pd.read_parquet(bytesio_data, columns=columns)
    else:
        # Previously this fell through and failed later on an unbound `df`.
        raise ValueError(f"Unsupported file format: {ff!r}")

    if samples is not None:
        df = df.iloc[samples]

    d_dtype = d.dtype
    if d_dtype is not None:
        if isinstance(d_dtype, dict):
            if columns is None:
                columns = d.columns
            # Prune the excluded columns into a NEW dict so the mapping held
            # by the Dataset record is not modified.
            d_dtype = {
                col: dtype for col, dtype in d_dtype.items() if col in columns
            }
        df = df.astype(d_dtype)

    return df
def as_parquet(csv_file, metadata_file):
    """Convert a tab-separated file to Parquet bytes using a metadata schema.

    ``metadata_file`` is read as a pipe-delimited file; its ``cleaned_name``
    and ``data_type`` columns are handed to ``create_pyarrow_schema`` to
    build the Arrow schema used both to type the TSV columns and to write
    the Parquet output (SNAPPY compression, Spark flavor).

    Returns:
        The ``io.BytesIO`` the Parquet data was written to. Its position is
        left wherever the writer finished.
    """
    result = io.BytesIO()

    schema = csv.read_csv(metadata_file,
                          parse_options=csv.ParseOptions(delimiter="|"))
    pyarrow_schema = create_pyarrow_schema(schema["cleaned_name"],
                                           schema["data_type"])

    parquet_table = csv.read_csv(
        csv_file,
        parse_options=csv.ParseOptions(delimiter="\t"),
        convert_options=csv.ConvertOptions(column_types=pyarrow_schema))

    # Close the writer explicitly: the Parquet footer is only written on
    # close, and relying on garbage collection to do it is fragile.
    with parquet.ParquetWriter(where=result,
                               schema=pyarrow_schema,
                               compression="SNAPPY",
                               flavor="spark") as writer:
        writer.write_table(parquet_table)
    return result
def csv_to_table(self, csv_path, table_name, read = None, parse = None, convert = None, con = None, auto_infer = False):
    """Load a CSV file into an existing table through pyarrow.

    Pyarrow CSV reader documentation:
    https://arrow.apache.org/docs/python/generated/pyarrow.csv.read_csv.html

    The table's column names and types are discovered with an empty
    ``select``, mapped to Arrow types, and used to parse the file; the
    resulting columns are then inserted with a single ``executemany``.

    Args:
        csv_path: Path of the CSV file to read.
        table_name: Target table; interpolated into the SQL text as-is.
        read: Optional ``csv.ReadOptions``; defaults to the table's column
            names as header.
        parse: Optional ``csv.ParseOptions``; defaults to ``|`` delimiter.
        convert: Optional ``csv.ConvertOptions``; defaults to the mapped
            column types, or to inference when ``auto_infer`` is true.
        con: Connection to use; defaults to ``self``.
        auto_infer: Let pyarrow infer column types instead of mapping them.

    Returns:
        An installation hint string when pyarrow is unavailable, otherwise
        ``None``.
    """
    if not ARROW:
        return "Optional pyarrow dependency not found. To install: pip3 install pyarrow"

    sqream_to_pa = {
        'ftBool': pa.bool_(),
        'ftUByte': pa.uint8(),
        'ftShort': pa.int16(),
        'ftInt': pa.int32(),
        'ftLong': pa.int64(),
        'ftFloat': pa.float32(),
        'ftDouble': pa.float64(),
        'ftDate': pa.timestamp('ns'),
        'ftDateTime': pa.timestamp('ns'),
        'ftVarchar': pa.string(),
        'ftBlob': pa.utf8()
    }

    start = time.time()
    # Get table metadata
    # NOTE(review): table_name is formatted straight into the statement;
    # callers must not pass untrusted input here.
    con = con or self
    con.execute(f'select * from {table_name} where 1=0')

    # Map column names to pyarrow types and set Arrow's CSV parameters.
    # A dict (rather than a one-shot zip iterator) can be inspected or
    # reused safely.
    sqream_col_types = [col_type[0] for col_type in con.col_type_tups]
    column_types = {
        col_name: sqream_to_pa[col_type]
        for col_name, col_type in zip(con.col_names, sqream_col_types)
    }
    read = read or csv.ReadOptions(column_names=con.col_names)
    parse = parse or csv.ParseOptions(delimiter='|')
    convert = convert or csv.ConvertOptions(column_types = None if auto_infer else column_types)

    # Read CSV to in-memory arrow format
    csv_arrow = csv.read_csv(csv_path, read_options=read, parse_options=parse, convert_options=convert).combine_chunks()
    numpy_cols = []

    # For each column, get the numpy representation for quick packing
    for col_type, col in zip(sqream_col_types, csv_arrow.columns):
        # Only one chunk after combine_chunks()
        col = col.chunks[0]
        if col_type in ('ftVarchar', 'ftBlob', 'ftDate', 'ftDateTime'):
            col = col.to_pandas()
        else:
            col = col.to_numpy()
        numpy_cols.append(col)

    print (f'total loading csv: {time.time()-start}')
    start = time.time()

    # Insert columns into SQream
    col_num = csv_arrow.shape[1]
    con.executemany(f'insert into {table_name} values ({"?,"*(col_num-1)}?)', numpy_cols)
    print (f'total inserting csv: {time.time()-start}')
def read_rms(self):
    """Read every file in ``self.rms_dict`` into ``self.rms_df``.

    Each file is parsed as tab-separated text with ``upc`` as int64 and
    ``upc_ver_uc`` as uint8; the tables are concatenated and converted to a
    single pandas DataFrame.
    """
    tables = []
    for path in self.rms_dict.values():
        upc_types = csv.ConvertOptions(column_types={
            'upc': pa.int64(),
            'upc_ver_uc': pa.uint8()
        })
        tables.append(
            csv.read_csv(path,
                         parse_options=csv.ParseOptions(delimiter='\t'),
                         convert_options=upc_types))
    self.rms_df = pa.concat_tables(tables).to_pandas()
def _parse_options_from_dict(**kwargs):
    """Build the CSV parse options from keyword arguments.

    Recognised keys and their fallbacks: ``delimiter`` (','),
    ``quote_char`` ('"'), ``double_quote`` (True), ``escape_char`` (False),
    ``newlines_in_values`` (False) and ``ignore_empty_lines`` (True). Any
    other key is ignored.

    Returns:
        (object) A pyarrow ParseOptions object.
    """
    defaults = (
        ('delimiter', ','),
        ('quote_char', '"'),
        ('double_quote', True),
        ('escape_char', False),
        ('newlines_in_values', False),
        ('ignore_empty_lines', True),
    )
    # `kwargs` is private to this call, so popping only trims the local dict.
    settings = {key: kwargs.pop(key, fallback) for key, fallback in defaults}
    return csv.ParseOptions(**settings)
def read_product(self, keep_groups=None, drop_groups=None, keep_modules=None, drop_modules=None):
    """Read the product file into ``self.prod_df``.

    The file is parsed as tab-separated, latin-encoded text restricted to a
    fixed list of columns. Rows are then filtered by product group code and
    product module code, a few columns are converted to categoricals, and
    the two description columns are stripped and cleaned.

    Args:
        keep_groups: If truthy, keep only these ``product_group_code`` values.
        drop_groups: If truthy, drop these ``product_group_code`` values.
        keep_modules: If truthy, keep only these ``product_module_code`` values.
        drop_modules: If truthy, drop these ``product_module_code`` values.
    """
    wanted_columns = [
        'upc', 'upc_ver_uc', 'upc_descr', 'product_module_code',
        'product_module_descr', 'product_group_code',
        'product_group_descr', 'brand_code_uc', 'brand_descr', 'multi',
        'size1_code_uc', 'size1_amount', 'size1_units', 'dataset_found_uc',
        'size1_change_flag_uc'
    ]
    arrow_types = {
        'upc': pa.int64(),
        'upc_ver_uc': pa.int8(),
        'product_module_code': pa.uint16(),
        'brand_code_uc': pa.uint32(),
        'multi': pa.uint16(),
        'size1_code_uc': pa.uint16()
    }

    product_table = csv.read_csv(
        self.product_file,
        read_options=csv.ReadOptions(encoding='latin'),
        parse_options=csv.ParseOptions(delimiter='\t'),
        convert_options=csv.ConvertOptions(column_types=arrow_types,
                                           include_columns=wanted_columns))
    products = product_table.to_pandas()

    # Apply the optional keep/drop filters in a fixed order.
    if keep_groups:
        in_groups = products['product_group_code'].isin(keep_groups)
        products = products[in_groups]
    if drop_groups:
        in_groups = products['product_group_code'].isin(drop_groups)
        products = products[~in_groups]
    if keep_modules:
        in_modules = products['product_module_code'].isin(keep_modules)
        products = products[in_modules]
    if drop_modules:
        in_modules = products['product_module_code'].isin(drop_modules)
        products = products[~in_modules]

    # dictionary encoding to save space
    for column in ('size1_units', 'product_module_descr',
                   'product_group_code'):
        products[column] = products[column].astype('category')

    # clean up product info
    stripped_upc = products['upc_descr'].str.strip()
    products['upc_descr'] = stripped_upc.str.replace('RTE', '')
    stripped_brand = products['brand_descr'].str.strip()
    products['brand_descr'] = stripped_brand.str.replace('CTL BR',
                                                         'Private Label')

    self.prod_df = products.copy()
def parse(self, fname, num_splits, start, end, header, **kwargs):
    """
    Parse CSV file into PyArrow tables.

    Parameters
    ----------
    fname : str
        Name of the CSV file to parse.
    num_splits : int
        Number of partitions to split the resulted PyArrow table into.
    start : int
        Position in the specified file to start parsing from.
    end : int
        Position in the specified file to end parsing at.
    header : bytes
        Header bytes prepended before the file's first line; together they
        form the start of the parsed CSV content.
    **kwargs : kwargs
        Serves the compatibility purpose. Does not affect the result.

    Returns
    -------
    list
        List with split parse results and their metadata:

        - First `num_split` elements are PyArrow tables, representing the
          corresponding chunk.
        - Next element is the number of rows in the parsed table.
        - Last element is the pandas Series, containing the data-types for
          each column of the parsed table.
    """
    import pyarrow as pa
    import pyarrow.csv as csv

    # The context manager closes the file even if a read or seek raises.
    with open(fname, "rb") as bio:
        # The header line for the CSV file
        first_line = bio.readline()
        bio.seek(start)
        to_read = header + first_line + bio.read(end - start)

    # NOTE(review): `header_rows` is kept exactly as the original passed it;
    # confirm it is accepted by the pyarrow version in use.
    table = csv.read_csv(BytesIO(to_read),
                         parse_options=csv.ParseOptions(header_rows=1))
    chunksize = compute_chunksize(table.num_columns, num_splits)
    chunks = [
        pa.Table.from_arrays(table.columns[chunksize * i:chunksize * (i + 1)])
        for i in range(num_splits)
    ]
    return chunks + [
        table.num_rows,
        pandas.Series(
            [t.to_pandas_dtype() for t in table.schema.types],
            index=table.schema.names,
        ),
    ]
def pa_parse_options(self):
    """Return ``self.parse_options``, or the built-in default when unset.

    The default uses a carriage return as delimiter and disables quoting,
    double-quote handling and escaping; embedded newlines are not allowed
    and empty lines are kept.
    """
    configured = self.parse_options
    if configured is not None:
        return configured
    return pac.ParseOptions(
        delimiter="\r",
        quote_char=False,
        double_quote=False,
        escape_char=False,
        newlines_in_values=False,
        ignore_empty_lines=False,
    )
def read_product(self, upc_list=None):
    """Read the tab-separated product file into ``self.prod_df``.

    Args:
        upc_list: If truthy, only rows whose ``upc`` is in this collection
            are kept.
    """
    print("Reading product data...")
    tab_separated = csv.ParseOptions(delimiter='\t')
    products = csv.read_csv(self.product_file,
                            parse_options=tab_separated).to_pandas()
    if upc_list:
        keep_mask = products.upc.isin(upc_list)
        products = products[keep_mask]

    # Clean up product descriptions
    #prod_df['upc_descr'] = prod_df['upc_descr'].str.strip().str.replace('RTE', '')
    #prod_df['brand_descr'] = prod_df['brand_descr'].str.strip().str.replace('CTL BR', 'Private Label')
    self.prod_df = products.copy()
def pa_parse_options(self):
    """Return ``self.parse_options``, or a default that yields one column.

    The default disables quoting, double-quote handling and escaping, does
    not allow newlines inside values, and keeps empty lines.
    """
    configured = self.parse_options
    if configured is not None:
        return configured

    # To force the one-column setting, the delimiter is an arbitrary control
    # character that should not occur in text files. "\b" is the backspace
    # character (0x08); "\v" would be another candidate.
    return pac.ParseOptions(
        delimiter="\b",
        quote_char=False,
        double_quote=False,
        escape_char=False,
        newlines_in_values=False,
        ignore_empty_lines=False,
    )
def convert_tsv_to_apache_parquet(
    data_path: InputPath('TSV'),
    output_data_path: OutputPath('ApacheParquet'),
):
    '''Converts TSV table to Apache Parquet.

    [Apache Parquet](https://parquet.apache.org/)

    Annotations:
        author: Alexey Volkov <*****@*****.**>
    '''
    # Imports stay inside the function body, as in the original.
    from pyarrow import csv, parquet

    tab_separated = csv.ParseOptions(delimiter='\t')
    tsv_table = csv.read_csv(data_path, parse_options=tab_separated)
    parquet.write_table(tsv_table, output_data_path)
def read_csv(self, filenames, delimiter=','):
    """Stream one or more CSV files into ``self.parquet_file``.

    All files are written, batch by batch, through a single Parquet writer
    whose schema is taken from the first file, so the output contains the
    rows of every input file. Previously a new writer was opened on the
    same path for each file, which overwrote the earlier files' data and
    left their writers unclosed.

    Args:
        filenames: Iterable of CSV paths, all expected to share one schema.
        delimiter: Field delimiter used by every file.

    Returns:
        A pyarrow dataset over ``self.parquet_file``.
    """
    # Kept for backward compatibility: the original published the writer as
    # a module-level global.
    global parquet_writer

    writer = None
    try:
        for file in filenames:
            csv_reader = csv.open_csv(
                file,
                read_options=csv.ReadOptions(use_threads=True),
                parse_options=csv.ParseOptions(delimiter=delimiter),
                convert_options=csv.ConvertOptions(column_types=self.dtype))
            if writer is None:
                writer = pq.ParquetWriter(self.parquet_file,
                                          csv_reader.schema)
                parquet_writer = writer
            for batch in csv_reader:
                # Write the Arrow batch directly; a pandas round-trip can
                # change column types (e.g. nullable ints become floats) and
                # then no longer match the writer's schema.
                writer.write_table(pa.Table.from_batches([batch]))
    finally:
        if writer is not None:
            writer.close()
    return ds.dataset(self.parquet_file, format="parquet")
def main():
    """Infer a schema from a CSV file and append it to a schema JSON file.

    The input file is read with pyarrow; each inferred field is recorded
    with its name, type and nullability (the primary key is forced to be
    non-nullable). The entry is appended to the ``paths`` list of the file
    given with ``--output-file``, which is created when missing. When no
    output file is given, ``schema.json`` is used.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("-i", "--input-file", help="CSV file to scan")
    parser.add_argument("-o", "--output-file",
                        help="Schema file to create or modify")
    parser.add_argument("-d", "--delimiter", help="delimiter for the file")
    parser.add_argument("-k", "--primary-key",
                        help="Field name that is the primary key")
    parser.add_argument("-p", "--pattern",
                        help=("pattern to match the schema to file. "
                              "Example: 'reports/.*' will match 'reports/john.csv"))
    args = parser.parse_args()

    # Parse the input file.
    df = csv.read_csv(args.input_file,
                      parse_options=csv.ParseOptions(delimiter=args.delimiter))
    fields = []
    for field in df.schema:
        fields.append({
            "name": field.name,
            "type": str(field.type),
            # Force primary key to be not-null.
            "nullable": False if field.name == args.primary_key else field.nullable,
        })

    # Prepare the output. Read from and write to the SAME file; the original
    # always wrote to a hard-coded "schema.json" regardless of --output-file.
    output_path = args.output_file or "schema.json"
    root = {"paths": []}
    if os.path.exists(output_path):
        with open(output_path) as f:
            root = json.load(f)

    # Tolerate an existing file that has no "paths" list yet.
    root.setdefault("paths", []).append(
        {
            "delimiter": args.delimiter,
            "primary_key": args.primary_key,
            "pattern": args.pattern,
            "fields": fields
        }
    )

    with open(output_path, "w") as f:
        f.write(json.dumps(root, indent=4))
def csv_to_parquet(
    csv_file: Path,
    parquet_file: Path,
    *,
    delimiter: str,
    column_names: List[str],
    quiet: bool = False,
) -> None:
    """Stream a delimited text file into a zstd-compressed Parquet file.

    The file is read in 16 MB blocks; the Parquet writer is created lazily
    from the reader's schema when the first batch arrives, so no output file
    is produced for an input that yields no batches.

    Args:
        csv_file: Path of the delimited input file.
        parquet_file: Path of the Parquet file to write.
        delimiter: Field delimiter of the input.
        column_names: Column names to assign to the input columns.
        quiet: Disable the tqdm progress bar.
    """
    block_size = 1 << 24  # 16 MB
    read_options = csv.ReadOptions(column_names=column_names,
                                   block_size=block_size)
    parse_options = csv.ParseOptions(delimiter=delimiter)

    writer = None
    try:
        with csv.open_csv(
            csv_file, read_options=read_options, parse_options=parse_options
        ) as csv_reader:
            for batch in tqdm(csv_reader, disable=quiet):
                if writer is None:
                    writer = pq.ParquetWriter(parquet_file,
                                              csv_reader.schema,
                                              compression="zstd")
                writer.write_table(pa.Table.from_batches([batch]))
    finally:
        # Close even when a read or write fails so the handle is released.
        if writer is not None:
            writer.close()
def test_csv_options(in_type, pd_old_type, pd_new_type):
    """Check that custom read/parse/convert options reach the CSV reader.

    Reads ``tests/data/csv_options_test.csv`` with a skipped first row, a
    ``;`` delimiter, ``'`` quoting, ``\\`` escaping and a custom null
    marker, then checks the resulting columns, the all-null missing column
    and the parsed string values.
    """
    wanted_columns = ["i", "my_string", "nonexistent_column"]
    expected_strings = [
        "dsfasd;dsffadsf",
        "dsfasd;dsffadsf",
        None,
        "this text\nhas a line break",
        "this text, like so, has commas",
    ]

    schema = pa.schema([("string_col", pa.string())])
    read_options = csv.ReadOptions(skip_rows=1)
    parse_options = csv.ParseOptions(
        quote_char="'",
        escape_char="\\",
        delimiter=";",
        newlines_in_values=True,
    )
    convert_options = csv.ConvertOptions(
        include_columns=["i", "my_string", "nonexistent_column"],
        include_missing_columns=True,
        null_values=["NULL_STRING"],
        strings_can_be_null=True,
    )

    df = pa_read_csv_to_pandas(
        "tests/data/csv_options_test.csv",
        schema,
        False,
        pd_string=False,
        parse_options=parse_options,
        convert_options=convert_options,
        read_options=read_options,
    )

    assert df.columns.tolist() == wanted_columns
    assert df["nonexistent_column"].isnull().all()
    assert_series_equal(df["my_string"],
                        Series(expected_strings, name="my_string"))
def read_one_sales(fn, stores_list=None, incl_promo=True):
    """Read one tab-separated sales file into a pyarrow Table.

    Args:
        fn: Path of the sales file.
        stores_list: If not ``None``, only rows whose ``store_code_uc`` is
            in this collection are returned.
        incl_promo: Also read the ``feature`` and ``display`` columns.

    Returns:
        The (optionally filtered) table.
    """
    columns = ['store_code_uc', 'upc', 'week_end', 'units', 'prmult', 'price']
    if incl_promo:
        columns = columns + ['feature', 'display']

    arrow_types = {
        'feature': pa.int8(),
        'display': pa.int8(),
        'prmult': pa.int8(),
        'units': pa.uint16(),
        'store_code_uc': pa.uint32()
    }

    sales_table = csv.read_csv(
        fn,
        parse_options=csv.ParseOptions(delimiter='\t'),
        convert_options=csv.ConvertOptions(column_types=arrow_types,
                                           include_columns=columns))
    dataset = ds.dataset(sales_table)

    if stores_list is not None:
        in_stores = ds.field('store_code_uc').isin(stores_list)
        return dataset.to_table(filter=in_stores)
    return dataset.to_table()
def pa_parse_options(self):
    """Return the parse options with delimiter and quote char applied.

    Starts from ``self.parse_options`` (or a fresh ``pac.ParseOptions()``
    when that is falsy) and always overwrites ``delimiter`` and
    ``quote_char`` with the values held on ``self``, even when those are
    ``None``.
    """
    options = self.parse_options
    if not options:
        options = pac.ParseOptions()
    for attr in ("delimiter", "quote_char"):
        setattr(options, attr, getattr(self, attr))
    return options
def from_file(path: str, file_format: str = None, name: str = None, perform_gzip: bool = True, dtype: dict = None):
    r"""
    Create a Dataset record from a csv, tsv or parquet file.

    - File is read in with pyarrow (to obtain the column names), its raw
      bytes are compressed by default, and stored via `Dataset.create`.
    - Note: If you do not remove your file's index columns before importing
      them, then they will be included in your Dataset. The ordered nature
      of this column represents potential bias during analysis. You can
      drop these and other columns in memory when creating a Featureset
      from your Dataset.
    - `path`: Local or absolute path
    - `file_format`: Accepts uncompressed formats including parquet, csv,
      and tsv (a csv with `delimiter='\t'`). This tag is used to tell
      pyarrow how to handle the file. We do not infer the path because
      (a) we don't want to force file extensions, (b) we want to make sure
      users know what file formats we support.
    - `name`: if none specified, then `path` string will be used.
    - `perform_gzip`: Whether or not to perform gzip compression on the
      file; `None` is treated as `True`.
    - `dtype`: passed through to `Dataset.create` unchanged.

    Returns the created Dataset, or `None` (after printing an error) when
    `file_format` is not one of the accepted formats.
    """
    accepted_formats = ['csv', 'tsv', 'parquet']
    if file_format not in accepted_formats:
        print(
            "Error - Accepted file formats include uncompressed csv, tsv, and parquet."
        )
        return None

    # Defaults.
    if name is None:
        name = path
    if perform_gzip is None:
        perform_gzip = True

    # ToDo prevent ff combos like '.csv' with 'parquet' vice versa.
    # File formats. `file_format` is guaranteed to be one of the accepted
    # values here, so no `None` case needs handling.
    if file_format == 'tsv':
        tbl = pc.read_csv(path, parse_options=pc.ParseOptions(delimiter='\t'))
    elif file_format == 'csv':
        tbl = pc.read_csv(path)
    else:
        tbl = pq.read_table(path)

    # ToDo - handle columns with no name.
    columns = tbl.column_names

    # Store the file's raw bytes; no intermediate BytesIO copy is needed.
    with open(path, "rb") as f:
        data = f.read()

    is_compressed = bool(perform_gzip)
    if is_compressed:
        data = gzip.compress(data)

    d = Dataset.create(name=name,
                       data=data,
                       dtype=dtype,
                       file_format=file_format,
                       is_compressed=is_compressed,
                       columns=columns)
    return d
def read_year(self, year, hh_states_keep=None, hh_states_drop=None, hh_dma_keep=None, hh_dma_drop=None):
    """Read one year of panelist, trip and purchase files and accumulate them.

    Households are filtered to a positive ``Projection_Factor`` plus the
    optional state / DMA filters; trips are restricted to those households;
    purchases are restricted to those trips and to the UPCs present in
    ``self.prod_df``. The results are appended to ``self.purch_df``,
    ``self.trip_df`` and ``self.hh_df``.

    Args:
        year: Key into ``self.annual_dict``.
        hh_states_keep: If truthy, keep only these ``Fips_State_Desc`` values.
        hh_states_drop: If truthy, drop these ``Fips_State_Desc`` values.
        hh_dma_keep: If truthy, keep only these ``DMA_Cd`` values.
        hh_dma_drop: If truthy, drop these ``DMA_Cd`` values.
    """
    (purch_fn, trip_fn, panelist_fn) = get_fns(self.annual_dict[year])

    hh_ds = ds.dataset(
        csv.read_csv(panelist_fn,
                     parse_options=csv.ParseOptions(delimiter='\t'),
                     convert_options=csv.ConvertOptions(
                         auto_dict_encode=True,
                         auto_dict_max_cardinality=1024)))

    # build an arrow dataset filter object one by one
    my_filter = ds.field('Projection_Factor') > 0
    if hh_states_keep:
        my_filter = my_filter & (
            ds.field('Fips_State_Desc').isin(hh_states_keep))
    if hh_states_drop:
        my_filter = my_filter & (
            ~ds.field('Fips_State_Desc').isin(hh_states_drop))
    if hh_dma_keep:
        my_filter = my_filter & (ds.field('DMA_Cd').isin(hh_dma_keep))
    if hh_dma_drop:
        my_filter = my_filter & (~ds.field('DMA_Cd').isin(hh_dma_drop))

    # convert to pandas and get unique HH list
    hh_df = hh_ds.to_table(filter=my_filter).to_pandas().rename(
        columns=hh_dict_rename)
    hh_list = hh_df.household_code.unique()

    # use pyarrow filter to filter trips for just our households
    trip_df = ds.dataset(csv.read_csv(trip_fn, parse_options=csv.ParseOptions(delimiter='\t')))\
        .to_table(filter=ds.field('household_code').isin(hh_list)).to_pandas()
    trip_list = trip_df.trip_code_uc.unique()
    upc_list = self.prod_df.upc.unique()

    # use pyarrow to filter purchases using trips and UPCs only
    purch_ds = ds.dataset(
        csv.read_csv(purch_fn,
                     parse_options=csv.ParseOptions(delimiter='\t'),
                     convert_options=csv.ConvertOptions(
                         auto_dict_encode=True,
                         auto_dict_max_cardinality=1024)))
    purch_filter = ds.field('trip_code_uc').isin(trip_list) & ds.field(
        'upc').isin(upc_list)
    purch_df = purch_ds.to_table(filter=purch_filter).to_pandas()

    # Add the fields to the trips and purchases for convenience later
    trip_df2 = pd.merge(trip_df,
                        hh_df[self.hh_cols],
                        on=['household_code', 'panel_year'])
    purch_df2 = pd.merge(pd.merge(
        purch_df, self.prod_df[self.prod_cols], on=['upc', 'upc_ver_uc']),
        trip_df2[self.hh_cols+['trip_code_uc', 'purchase_date',
                               'store_code_uc']],
        on=['trip_code_uc'])\
        .rename(columns={'fips_state_desc': 'hh_state_desc'})

    # DataFrame.append was removed in pandas 2.0; pd.concat with
    # ignore_index=True is the equivalent and works on older versions too.
    self.purch_df = pd.concat([self.purch_df, purch_df2], ignore_index=True)
    self.trip_df = pd.concat([self.trip_df, trip_df2], ignore_index=True)
    self.hh_df = pd.concat([self.hh_df, hh_df], ignore_index=True)
    return
from pyarrow import csv as pacsv

# Tab-separated input file read when this module is run as a script.
filename = "test_input.txt"

if __name__ == "__main__":
    # The column names are supplied here rather than read from the file.
    table = pacsv.read_csv(
        filename,
        read_options=pacsv.ReadOptions(
            column_names=["group_id", "seq_number", "data"]),
        parse_options=pacsv.ParseOptions(delimiter="\t"),
    )
import pyarrow.parquet as pq
import pyarrow.csv as pv

# Read the comma-separated source file into an Arrow table and show it.
csvfile = 'pressureVacuum.csv'
comma_separated = pv.ParseOptions(delimiter=',')
tb = pv.read_csv(csvfile, parse_options=comma_separated)
print(tb)

# Write the table as Parquet.
# compression is one of:
# {'NONE', 'SNAPPY', 'GZIP', 'LZO', 'BROTLI', 'LZ4', 'ZSTD'}
parquetfile = 'pressureVacuum.parquet'
pq.write_table(tb, parquetfile, compression='BROTLI')

# Read the Parquet file back (all columns) and show the result.
df = pq.read_table(parquetfile, columns=None)
print(df)
target_path = Path("data.arrows")

# Copy the source into the temp file, keeping only lines that have exactly
# eight tabs; records with anomalous columns are filtered out.
# matches the hack in streaming-tsv-parser.js:27
with open(source_path) as source, open(temp_path, "w") as target:
    target.writelines(
        source_line for source_line in source
        if source_line.count("\t") == 8
    )

# Parse the filtered file with explicit types for the typed columns;
# "None" and the empty string are read as nulls.
tab_separated = csv.ParseOptions(
    delimiter="\t",
)
typed_columns = pa.csv.ConvertOptions(
    column_types={
        "date": pa.uint32(),
        "x": pa.float32(),
        "y": pa.float32(),
        "ix": pa.uint32(),
        "language": pa.dictionary(pa.int32(), pa.utf8())
    },
    null_values=["None", ""]
)
table = csv.read_csv(
    temp_path,
    parse_options=tab_separated,
    convert_options=typed_columns,
)

# remove unused columns
kept_columns = ["ix", "x", "y", "title", "first_author_name", "date",
                "language"]
table = table.select(kept_columns)