Example No. 1
    def _read_csv(self) -> "Table":
        import pyarrow as pa
        from pyarrow import csv

        if not self.output_location:
            raise ProgrammingError("OutputLocation is none or empty.")
        if not self.output_location.endswith((".csv", ".txt")):
            return pa.Table.from_pydict(dict())
        length = self._get_content_length()
        if length and self.output_location.endswith(".txt"):
            description = self.description if self.description else []
            column_names = [d[0] for d in description]
            read_opts = csv.ReadOptions(
                skip_rows=0,
                column_names=column_names,
                block_size=self._block_size,
                use_threads=True,
            )
            parse_opts = csv.ParseOptions(
                delimiter="\t",
                quote_char=False,
                double_quote=False,
                escape_char=False,
            )
        elif length and self.output_location.endswith(".csv"):
            read_opts = csv.ReadOptions(skip_rows=0,
                                        block_size=self._block_size,
                                        use_threads=True)
            parse_opts = csv.ParseOptions(
                delimiter=",",
                quote_char='"',
                double_quote=True,
                escape_char=False,
            )
        else:
            return pa.Table.from_pydict(dict())

        bucket, key = parse_output_location(self.output_location)
        try:
            return csv.read_csv(
                self._fs.open_input_stream(f"{bucket}/{key}"),
                read_options=read_opts,
                parse_options=parse_opts,
                convert_options=csv.ConvertOptions(
                    quoted_strings_can_be_null=False,
                    timestamp_parsers=self.timestamp_parsers,
                    column_types=self.column_types,
                ),
            )
        except Exception as e:
            _logger.exception(f"Failed to read {bucket}/{key}.")
            raise OperationalError(*e.args) from e
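For reference, a minimal standalone sketch of the two option sets used above, run against hypothetical local files and column names (the original reads the Athena output from S3 via `self._fs.open_input_stream`):

import pyarrow.csv as csv

# Hypothetical column names; the tab-separated .txt output carries no header row.
column_names = ["id", "name", "created_at"]

# Tab-separated output: explicit column names, quoting disabled.
txt_table = csv.read_csv(
    "results.txt",  # placeholder path
    read_options=csv.ReadOptions(skip_rows=0, column_names=column_names),
    parse_options=csv.ParseOptions(
        delimiter="\t", quote_char=False, double_quote=False, escape_char=False
    ),
)

# Comma-separated output: header row in the file, double-quoted fields.
csv_table = csv.read_csv(
    "results.csv",  # placeholder path
    read_options=csv.ReadOptions(skip_rows=0),
    parse_options=csv.ParseOptions(
        delimiter=",", quote_char='"', double_quote=True, escape_char=False
    ),
)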
Example No. 2
    def _read_table_arrow(self, source: tp.BinaryIO,
                          schema: pa.Schema) -> pa.Table:

        try:

            read_options = pa_csv.ReadOptions()
            read_options.encoding = 'utf-8'
            read_options.use_threads = False

            parse_options = pa_csv.ParseOptions()
            parse_options.newlines_in_values = True

            convert_options = pa_csv.ConvertOptions()
            convert_options.include_columns = schema.names
            convert_options.column_types = {
                n: t
                for (n, t) in zip(schema.names, schema.types)
            }
            convert_options.strings_can_be_null = True
            convert_options.quoted_strings_can_be_null = False

            return pa_csv.read_csv(source, read_options, parse_options,
                                   convert_options)

        except pa.ArrowInvalid as e:
            err = "CSV file decoding failed, content is garbled"
            self._log.exception(err)
            raise _ex.EDataCorruption(err) from e

        except pa.ArrowKeyError as e:
            err = "CSV file decoding failed, one or more columns are missing"
            self._log.error(err)
            self._log.exception(str(e))
            raise _ex.EDataCorruption(err) from e
Example No. 3
def write_files(metadata: AlchemyMetadata) -> None:
    """
    Creates a Parquet file for each table in the schema.
    """
    tables: Iterator[AlchemyTable] = metadata.tables.values()
    for table in tables:
        name = table.name
        print(name)

        def get_path(prefix: Path, suffix: str):
            parent_dir = prefix.joinpath(metadata.schema)
            parent_dir.mkdir(exist_ok=True, parents=True)
            return parent_dir.joinpath(name).with_suffix(suffix)

        extract_file = get_path(EXTRACT_PATH_PREFIX, ".csv.zst")
        parquet_file = get_path(PARQUET_PREFIX, ".parquet")

        arrow_schema = pa.schema(get_fields(table))
        column_names = [name for name, dtype in get_fields(table)]

        read_options = pcsv.ReadOptions(column_names=column_names, block_size=1000000000)
        parse_options = pcsv.ParseOptions(newlines_in_values=True)
        convert_options = pcsv.ConvertOptions(column_types=arrow_schema, timestamp_parsers=["%Y%m%d", "%Y-%m-%d"],
                                              true_values=["1", "T"], false_values=["0", "F"], strings_can_be_null=True)

        parquet_writer = pq.ParquetWriter(parquet_file, schema=arrow_schema, compression='zstd',
                                          version="2.0", use_dictionary=True)
        stream_reader = pcsv.open_csv(extract_file, read_options=read_options, parse_options=parse_options,
                                      convert_options=convert_options)
        for batch in stream_reader:
            batch_table = pa.Table.from_batches([batch])
            parquet_writer.write_table(batch_table)
        parquet_writer.close()
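`get_fields` is not shown in this example; judging from the call sites it yields (name, pyarrow type) pairs for each column of the SQLAlchemy table. A hedged sketch of how such pairs feed the schema and the CSV read options, with made-up columns:

import pyarrow as pa

# Hypothetical (name, type) pairs standing in for get_fields(table).
fields = [("patent_id", pa.string()), ("filing_date", pa.date32()), ("num_claims", pa.int32())]

arrow_schema = pa.schema(fields)             # drives both ConvertOptions and ParquetWriter
column_names = [name for name, _ in fields]  # passed to csv.ReadOptions(column_names=...)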
Example No. 4
def _read_csv_with_offset_pyarrow_on_ray(fname, num_splits, start, end, kwargs,
                                         header):  # pragma: no cover
    """Use a Ray task to read a chunk of a CSV into a pyarrow Table.
     Note: Ray functions are not detected by codecov (thus pragma: no cover)
     Args:
        fname: The filename of the file to open.
        num_splits: The number of splits (partitions) to separate the DataFrame into.
        start: The start byte offset.
        end: The end byte offset.
        kwargs: The kwargs for the pyarrow `read_csv` function.
        header: The header of the file.
     Returns:
         A list containing the split pyarrow Tables and the number of
         rows of the tables as the last element. This is used to determine
         the total length of the DataFrame to build a default Index.
    """
    bio = open(fname, "rb")
    # The header line for the CSV file
    first_line = bio.readline()
    bio.seek(start)
    to_read = header + first_line + bio.read(end - start)
    bio.close()
    table = csv.read_csv(BytesIO(to_read),
                         parse_options=csv.ParseOptions(header_rows=1))
    chunksize = get_default_chunksize(table.num_columns, num_splits)
    chunks = [
        pa.Table.from_arrays(table.columns[chunksize * i:chunksize * (i + 1)])
        for i in range(num_splits)
    ]
    return chunks + [
        table.num_rows,
        pandas.Series([t.to_pandas_dtype() for t in table.schema.types],
                      index=table.schema.names),
    ]
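`get_default_chunksize` is not shown here; from how it is used, it presumably spreads the table's columns evenly over `num_splits` partitions. A sketch under that assumption (the real helper may differ):

import math

def get_default_chunksize(num_columns: int, num_splits: int) -> int:
    # Assumed behaviour: ceiling division, so that num_splits slices cover every column.
    return max(1, math.ceil(num_columns / num_splits))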
Example No. 5
 def pa_parse_options(self):
     parse_options = self.parse_options or pac.ParseOptions()
     if self.delimiter is not None:
         parse_options.delimiter = self.delimiter
     if self.quote_char is not None:
         parse_options.quote_char = self.quote_char
     return parse_options
Example No. 6
    def parse(self, **kwargs):
        import pyarrow as pa
        import pyarrow.csv as csv

        fname = kwargs.pop("fname", None)
        num_splits = kwargs.pop("num_splits", None)
        start = kwargs.pop("start", None)
        end = kwargs.pop("end", None)
        header = kwargs.pop("header", None)
        bio = open(fname, "rb")
        # The header line for the CSV file
        first_line = bio.readline()
        bio.seek(start)
        to_read = header + first_line + bio.read(end - start)
        bio.close()
        table = csv.read_csv(BytesIO(to_read),
                             parse_options=csv.ParseOptions(header_rows=1))
        chunksize = get_default_chunksize(table.num_columns, num_splits)
        chunks = [
            pa.Table.from_arrays(table.columns[chunksize * i:chunksize *
                                               (i + 1)])
            for i in range(num_splits)
        ]
        return chunks + [
            table.num_rows,
            pandas.Series(
                [t.to_pandas_dtype() for t in table.schema.types],
                index=table.schema.names,
            ),
        ]
Example No. 7
 def scan_file(self, bucket, key, schema):
     logging.info(f"delim is {self.delimiter}")
     uri = f"{bucket}/{key}"
     s3fs = fs.S3FileSystem()
     # Run column order validation by opening and not reading anything.
     filestream = s3fs.open_input_stream(uri)
     parse_opts = csv.ParseOptions(delimiter=self.delimiter)
     reader = csv.open_csv(filestream, parse_options=parse_opts)
     for index, col in enumerate(reader.schema):
         if col.name != schema[index].name:
             msg = "column {} is out of order".format(col.name)
             raise ColumnOrderException(msg)
     # Run the rest of the validations.
     filestream = s3fs.open_input_stream(uri)
     opts = csv.ConvertOptions(column_types=schema)
     reader = csv.open_csv(filestream,
                           convert_options=opts,
                           parse_options=parse_opts)
     # Kind of a hack, but it works...if delim wrong, everything is read
     # as one column.
     if len(schema) > 1 and len(reader.schema) == 1:
         raise WrongDelimiterException()
      # Parse through the file; pyarrow will throw exceptions
      # if there's invalid data.
     for batch in reader:
         # If primary key is a string, need to check the column
         # for empty strings.
         if schema.field(self.primary_key).type == "string":
             table = pyarrow.Table.from_batches([batch])
             for val in table[self.primary_key]:
                 if val.as_py() == "":
                     raise EmptyPrimaryKeyException()
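A standalone sketch of the wrong-delimiter heuristic used above, run against a local file (the path and expected schema are placeholders):

import pyarrow as pa
from pyarrow import csv

expected_schema = pa.schema([("id", pa.int64()), ("name", pa.string())])  # placeholder

reader = csv.open_csv("data.psv", parse_options=csv.ParseOptions(delimiter="|"))
if len(expected_schema) > 1 and len(reader.schema) == 1:
    # Everything collapsed into a single column, so the delimiter is almost certainly wrong.
    raise ValueError("wrong delimiter for data.psv")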
Example No. 8
    def read_stores(self):
        s_cols = [
            'retailer_code', 'parent_code', 'fips_state_code',
            'fips_county_code', 'dma_code', 'store_zip3'
        ]
        # To reduce space -- update with dictionary arrays later
        store_convert = {
            'year': pa.uint16(),
            'dma_code': pa.uint16(),
            'retailer_code': pa.uint16(),
            'parent_code': pa.uint16(),
            'store_zip3': pa.uint16(),
            'fips_county_code': pa.uint16(),
            'fips_state_code': pa.uint8()
        }

        # Use pyarrow to read CSVs and parse using the dict -- we have to fix some types again later.
        tmp = pa.concat_tables([
            csv.read_csv(
                x,
                parse_options=csv.ParseOptions(delimiter='\t'),
                convert_options=csv.ConvertOptions(column_types=store_convert))
            for x in self.stores_dict.values()
        ]).to_pandas().rename(columns={'year': 'panel_year'})

        # some columns have blanks --fill with zero to avoid converting to floats(!)
        tmp.loc[:, s_cols] = tmp.loc[:, s_cols].fillna(0)

        # use the compressed types
        my_dict = {
            key: value
            for (key, value) in type_dict.items() if key in tmp.columns
        }
        self.stores_df = tmp.astype(my_dict)
        return
Example No. 9
    def to_pandas(id: int, columns: list = None, samples: list = None):
        """
		- After decompressing with `gzip.open()`, the BytesIO still needs to be read into PyArrow before being converted to Pandas.
		- All methods return all columns by default if they receive None:
		  `pc.read_csv(read_options.column_names)`, `pq.read_table(columns)`, `pd.read_csv(usecols)`, `pd.read_parquet(columns)`
		"""
        d = Dataset.get_by_id(id)
        is_compressed = d.is_compressed
        ff = d.file_format

        # When user provides only 1 column and forgets to [] it (e.g. the label column).
        if type(columns) == str:
            columns = [columns]

        data = d.data
        bytesio_data = io.BytesIO(data)
        if (ff == 'csv') or (ff == 'tsv'):
            # `pc.ReadOptions.column_names` verifies the existence of the names, does not filter for them.
            if is_compressed:
                bytesio_csv = gzip.open(bytesio_data)
                if ff == 'tsv':
                    parse_opt = pc.ParseOptions(delimiter='\t')
                    tbl = pc.read_csv(bytesio_csv, parse_options=parse_opt)
                else:
                    tbl = pc.read_csv(bytesio_csv)
                df = tbl.to_pandas()
                if columns is not None:
                    df = df.filter(columns)
            else:
                if ff == 'tsv':
                    df = pd.read_csv(bytesio_data, sep='\t', usecols=columns)
                else:
                    df = pd.read_csv(bytesio_data, usecols=columns)
        elif ff == 'parquet':
            if is_compressed:
                bytesio_parquet = gzip.open(bytesio_data)
                tbl = pq.read_table(bytesio_parquet, columns=columns)
                df = tbl.to_pandas()
            else:
                df = pd.read_parquet(bytesio_data, columns=columns)

        if samples is not None:
            df = df.iloc[samples]

        d_dtype = d.dtype
        if d_dtype is not None:
            if (type(d_dtype) == dict):
                if columns is None:
                    columns = d.columns
                # need to prune out the excluded columns from the dtype dict
                d_dtype_cols = list(d_dtype.keys())
                for col in d_dtype_cols:
                    if col not in columns:
                        del d_dtype[col]
            df = df.astype(d_dtype)

        return df
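A minimal sketch of the gzip path the docstring describes, using in-memory bytes instead of a database blob:

import gzip
import io

import pyarrow.csv as pc

raw = b"a\tb\n1\tx\n2\ty\n"
blob = gzip.compress(raw)                  # what the Dataset row would store

bytesio_csv = gzip.open(io.BytesIO(blob))  # decompress lazily, no temp file needed
tbl = pc.read_csv(bytesio_csv, parse_options=pc.ParseOptions(delimiter="\t"))
df = tbl.to_pandas()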
Example No. 10
def as_parquet(csv_file, metadata_file):

    result = io.BytesIO()
    csv_parse_options = csv.ParseOptions(delimiter="\t")
    schema_parse_options = csv.ParseOptions(delimiter="|")
    schema = csv.read_csv(metadata_file, parse_options=schema_parse_options)
    pyarrow_schema = create_pyarrow_schema(schema["cleaned_name"],
                                           schema["data_type"])
    csv_convert_options = csv.ConvertOptions(column_types=pyarrow_schema)
    parquet_table = csv.read_csv(csv_file,
                                 parse_options=csv_parse_options,
                                 convert_options=csv_convert_options)
    w = parquet.ParquetWriter(where=result,
                              schema=pyarrow_schema,
                              compression="SNAPPY",
                              flavor="spark")
    w.write_table(parquet_table)
    return result
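`create_pyarrow_schema` is project-specific; from the call site it receives the `cleaned_name` and `data_type` columns of the metadata table and returns a schema. A hedged sketch of one way it could be implemented (the type-string mapping is an assumption):

import pyarrow as pa

# Hypothetical mapping from the metadata file's type strings to Arrow types.
_TYPE_MAP = {"string": pa.string(), "int": pa.int64(), "double": pa.float64(), "date": pa.date32()}

def create_pyarrow_schema(cleaned_names, data_types):
    # Both arguments arrive as ChunkedArray columns of the metadata table.
    names = cleaned_names.to_pylist()
    types = [_TYPE_MAP[t] for t in data_types.to_pylist()]
    return pa.schema(list(zip(names, types)))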
Example No. 11
    def csv_to_table(self, csv_path, table_name, read = None, parse = None, convert = None, con = None, auto_infer = False):
        ' Pyarrow CSV reader documentation: https://arrow.apache.org/docs/python/generated/pyarrow.csv.read_csv.html '

        if not ARROW:
            return "Optional pyarrow dependency not found. To install: pip3 install pyarrow"
        
        sqream_to_pa = {
            'ftBool':     pa.bool_(),
            'ftUByte':    pa.uint8(),
            'ftShort':    pa.int16(),
            'ftInt':      pa.int32(),
            'ftLong':     pa.int64(),
            'ftFloat':    pa.float32(),
            'ftDouble':   pa.float64(),
            'ftDate':     pa.timestamp('ns'),
            'ftDateTime': pa.timestamp('ns'),
            'ftVarchar':  pa.string(),
            'ftBlob':     pa.utf8()
        }

        start = time.time()
        # Get table metadata
        con = con or self
        con.execute(f'select * from {table_name} where 1=0')
        
        # Map column names to pyarrow types and set Arrow's CSV parameters
        sqream_col_types = [col_type[0] for col_type in con.col_type_tups]
        column_types = dict(zip(con.col_names, [sqream_to_pa[col_type[0]] for col_type in con.col_type_tups]))
        read = read or csv.ReadOptions(column_names=con.col_names)
        parse = parse or csv.ParseOptions(delimiter='|')
        convert = convert or csv.ConvertOptions(column_types = None if auto_infer else column_types)
        
        # Read CSV to in-memory arrow format
        csv_arrow = csv.read_csv(csv_path, read_options=read, parse_options=parse, convert_options=convert).combine_chunks()
        num_chunks = len(csv_arrow[0].chunks)
        numpy_cols = []

        # For each column, get the numpy representation for quick packing 
        for col_type, col in zip(sqream_col_types, csv_arrow):
            # Only one chunk after combine_chunks()
            col = col.chunks[0]
            if col_type in  ('ftVarchar', 'ftBlob', 'ftDate', 'ftDateTime'):
                col = col.to_pandas()
            else:
                col = col.to_numpy()
            
            numpy_cols.append(col)
        
        print (f'total loading csv: {time.time()-start}')
        start = time.time()
        
        # Insert columns into SQream
        col_num = csv_arrow.shape[1]
        con.executemany(f'insert into {table_name} values ({"?,"*(col_num-1)}?)', numpy_cols)
        print (f'total inserting csv: {time.time()-start}')
Example No. 12
 def read_rms(self):
     self.rms_df = pa.concat_tables([
         csv.read_csv(
             fn,
             parse_options=csv.ParseOptions(delimiter='\t'),
             convert_options=csv.ConvertOptions(column_types={
                 'upc': pa.int64(),
                 'upc_ver_uc': pa.uint8()
             })) for fn in self.rms_dict.values()
     ]).to_pandas()
     return
Example No. 13
def _parse_options_from_dict(**kwargs):
    """Returns the parse options for CSV.
    Returns:
        (object) A pyarrow ParseOptions object.
    """
    return csv.ParseOptions(
        delimiter=kwargs.pop('delimiter', ','),
        quote_char=kwargs.pop('quote_char', '"'),
        double_quote=kwargs.pop('double_quote', True),
        escape_char=kwargs.pop('escape_char', False),
        newlines_in_values=kwargs.pop('newlines_in_values', False),
        ignore_empty_lines=kwargs.pop('ignore_empty_lines', True))
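Usage sketch, assuming the helper above is in scope; any option that is not passed keeps the default supplied to `pop`:

from pyarrow import csv

# Tab-separated input with backslash escapes; everything else stays at the defaults.
parse_options = _parse_options_from_dict(delimiter="\t", escape_char="\\")
table = csv.read_csv("data.tsv", parse_options=parse_options)  # placeholder path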
Example No. 14
    def read_product(self,
                     keep_groups=None,
                     drop_groups=None,
                     keep_modules=None,
                     drop_modules=None):
        prod_cols = [
            'upc', 'upc_ver_uc', 'upc_descr', 'product_module_code',
            'product_module_descr', 'product_group_code',
            'product_group_descr', 'brand_code_uc', 'brand_descr', 'multi',
            'size1_code_uc', 'size1_amount', 'size1_units', 'dataset_found_uc',
            'size1_change_flag_uc'
        ]

        prod_dict = {
            'upc': pa.int64(),
            'upc_ver_uc': pa.int8(),
            'product_module_code': pa.uint16(),
            'brand_code_uc': pa.uint32(),
            'multi': pa.uint16(),
            'size1_code_uc': pa.uint16()
        }

        prod_df = csv.read_csv(self.product_file,
                               read_options=csv.ReadOptions(encoding='latin'),
                               parse_options=csv.ParseOptions(delimiter='\t'),
                               convert_options=csv.ConvertOptions(
                                   column_types=prod_dict,
                                   include_columns=prod_cols)).to_pandas()
        if keep_groups:
            prod_df = prod_df[prod_df['product_group_code'].isin(keep_groups)]
        if drop_groups:
            prod_df = prod_df[~prod_df['product_group_code'].isin(drop_groups)]
        if keep_modules:
            prod_df = prod_df[prod_df['product_module_code'].isin(
                keep_modules)]
        if drop_modules:
            prod_df = prod_df[~prod_df['product_module_code'].isin(drop_modules
                                                                   )]

        # dictionary encoding to save space
        prod_df['size1_units'] = prod_df['size1_units'].astype('category')
        prod_df['product_module_descr'] = prod_df[
            'product_module_descr'].astype('category')
        prod_df['product_group_code'] = prod_df['product_group_code'].astype(
            'category')

        # clean up product info
        prod_df['upc_descr'] = prod_df['upc_descr'].str.strip().str.replace(
            'RTE', '')
        prod_df['brand_descr'] = prod_df['brand_descr'].str.strip(
        ).str.replace('CTL BR', 'Private Label')
        self.prod_df = prod_df.copy()
        return
Example No. 15
    def parse(self, fname, num_splits, start, end, header, **kwargs):
        """
        Parse CSV file into PyArrow tables.

        Parameters
        ----------
        fname : str
            Name of the CSV file to parse.
        num_splits : int
            Number of partitions to split the resulted PyArrow table into.
        start : int
            Position in the specified file to start parsing from.
        end : int
            Position in the specified file to end parsing at.
        header : str
            Header line that will be interpreted as the first line of the parsed CSV file.
        **kwargs : kwargs
            Serves the compatibility purpose. Does not affect the result.

        Returns
        -------
        list
            List with the split parse results and their metadata:

            - First `num_splits` elements are PyArrow tables, representing the corresponding chunks.
            - Next element is the number of rows in the parsed table.
            - Last element is the pandas Series, containing the data-types for each column of the parsed table.
        """
        import pyarrow as pa
        import pyarrow.csv as csv

        bio = open(fname, "rb")
        # The header line for the CSV file
        first_line = bio.readline()
        bio.seek(start)
        to_read = header + first_line + bio.read(end - start)
        bio.close()
        table = csv.read_csv(BytesIO(to_read),
                             parse_options=csv.ParseOptions(header_rows=1))
        chunksize = compute_chunksize(table.num_columns, num_splits)
        chunks = [
            pa.Table.from_arrays(table.columns[chunksize * i:chunksize *
                                               (i + 1)])
            for i in range(num_splits)
        ]
        return chunks + [
            table.num_rows,
            pandas.Series(
                [t.to_pandas_dtype() for t in table.schema.types],
                index=table.schema.names,
            ),
        ]
Example No. 16
 def pa_parse_options(self):
     if self.parse_options is not None:
         parse_options = self.parse_options
     else:
         parse_options = pac.ParseOptions(
             delimiter="\r",
             quote_char=False,
             double_quote=False,
             escape_char=False,
             newlines_in_values=False,
             ignore_empty_lines=False,
         )
     return parse_options
Example No. 17
    def read_product(self, upc_list=None):
        print("Reading product data...")
        prod_df = csv.read_csv(
            self.product_file,
            parse_options=csv.ParseOptions(delimiter='\t')).to_pandas()
        if upc_list:
            prod_df = prod_df[prod_df.upc.isin(upc_list)]

        # Clean up product descriptions
        #prod_df['upc_descr'] = prod_df['upc_descr'].str.strip().str.replace('RTE', '')
        #prod_df['brand_descr'] = prod_df['brand_descr'].str.strip().str.replace('CTL BR', 'Private Label')

        self.prod_df = prod_df.copy()
        return
Example No. 18
 def pa_parse_options(self):
     if self.parse_options is not None:
         parse_options = self.parse_options
     else:
         # To force the one-column setting, we set an arbitrary character
         # that does not appear in text files as the delimiter, such as \b or \v.
         # (\b is the backspace control character, so it never shows up in plain text.)
         parse_options = pac.ParseOptions(
             delimiter="\b",
             quote_char=False,
             double_quote=False,
             escape_char=False,
             newlines_in_values=False,
             ignore_empty_lines=False,
         )
     return parse_options
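A sketch of how such parse options read a plain-text file into a one-column table (the file and column names are placeholders):

import pyarrow.csv as pac

# Each physical line becomes one row of the single "text" column, because the
# backspace delimiter never appears in plain text.
table = pac.read_csv(
    "corpus.txt",  # placeholder path
    read_options=pac.ReadOptions(column_names=["text"]),
    parse_options=pac.ParseOptions(
        delimiter="\b",
        quote_char=False,
        double_quote=False,
        escape_char=False,
        newlines_in_values=False,
        ignore_empty_lines=False,
    ),
)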
Example No. 19
def convert_tsv_to_apache_parquet(
        data_path: InputPath('TSV'),
        output_data_path: OutputPath('ApacheParquet'),
):
    '''Converts TSV table to Apache Parquet.

    [Apache Parquet](https://parquet.apache.org/)

    Annotations:
        author: Alexey Volkov <*****@*****.**>
    '''
    from pyarrow import csv, parquet

    table = csv.read_csv(data_path,
                         parse_options=csv.ParseOptions(delimiter='\t'))
    parquet.write_table(table, output_data_path)
Example No. 20
    def read_csv(self, filenames, delimiter=','):
        parquet_writer = None
        for file in filenames:
            csv_reader = csv.open_csv(
                file,
                read_options=csv.ReadOptions(use_threads=True),
                parse_options=csv.ParseOptions(delimiter=delimiter),
                convert_options=csv.ConvertOptions(column_types=self.dtype))
            # Create the Parquet writer once, from the first file's schema.
            if parquet_writer is None:
                parquet_writer = pq.ParquetWriter(self.parquet_file,
                                                  csv_reader.schema)

            nrow = 0
            for batch in csv_reader:
                batch_df = batch.to_pandas()
                nrow += batch_df.shape[0]
                parquet_writer.write_table(pa.Table.from_pandas(df=batch_df))

        parquet_writer.close()
        return ds.dataset(self.parquet_file, format="parquet")
Example No. 21
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument("-i", "--input-file", help="CSV file to scan")
    parser.add_argument("-o", "--output-file", help="Schema file to create or modify")
    parser.add_argument("-d", "--delimiter", help="delimiter for the file")
    parser.add_argument("-k", "--primary-key", help="Field name that is the primary key")
    parser.add_argument("-p", "--pattern",
                        help=("pattern to match the schema to file. "
                              "Example: 'reports/.*' will match 'reports/john.csv'"))
#    parser.add_argument("-e", "--extension", help="File extension expected (e.g. .csv)")
    args = parser.parse_args()

    # Parse the input file.
    df = csv.read_csv(args.input_file,
                      parse_options=csv.ParseOptions(delimiter=args.delimiter))
    fields = []
    for field in df.schema:
        item = {
            "name": field.name,
            "type": str(field.type),
            "nullable": field.nullable
        }
        # Force primary key to be not-null.
        if field.name == args.primary_key:
            item["nullable"] = False
        fields.append(item)
    # Prepare the output.
    root = {"paths": []}
    if os.path.exists(args.output_file):
        with open(args.output_file) as f:
            root = json.loads(f.read())
    root["paths"].append(
        {
            "delimiter": args.delimiter,
            "primary_key": args.primary_key,
            "pattern": args.pattern,
#            "extension": args.extension,
            "fields": fields
        }
    )
    with open(args.output_file, "w") as f:
        f.write(json.dumps(root, indent=4))
Example No. 22
def csv_to_parquet(
    csv_file: Path,
    parquet_file: Path,
    *,
    delimiter: str,
    column_names: List[str],
    quiet: bool = False,
) -> None:
    block_size = 1 << 24  # 16 MB
    read_options = csv.ReadOptions(column_names=column_names, block_size=block_size)
    parse_options = csv.ParseOptions(delimiter=delimiter)
    writer = None
    with csv.open_csv(
        csv_file, read_options=read_options, parse_options=parse_options
    ) as csv_reader:
        for batch in tqdm(csv_reader, disable=quiet):
            if writer is None:
                writer = pq.ParquetWriter(parquet_file, csv_reader.schema, compression="zstd")
            table = pa.Table.from_batches([batch])
            writer.write_table(table)
    if writer is not None:
        writer.close()
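Usage sketch with placeholder paths and column names; because ReadOptions(column_names=...) is set, the input CSV is assumed to have no header row:

from pathlib import Path

csv_to_parquet(
    Path("events.csv"),      # placeholder input
    Path("events.parquet"),  # placeholder output
    delimiter=",",
    column_names=["ts", "user_id", "value"],
    quiet=True,
)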
Example No. 23
def test_csv_options(in_type, pd_old_type, pd_new_type):
    schema = pa.schema([("string_col", pa.string())])

    read_options = csv.ReadOptions(skip_rows=1)

    parse_options = csv.ParseOptions(quote_char="'",
                                     escape_char="\\",
                                     delimiter=";",
                                     newlines_in_values=True)

    convert_options = csv.ConvertOptions(
        include_columns=["i", "my_string", "nonexistent_column"],
        include_missing_columns=True,
        null_values=["NULL_STRING"],
        strings_can_be_null=True,
    )

    df = pa_read_csv_to_pandas(
        "tests/data/csv_options_test.csv",
        schema,
        False,
        pd_string=False,
        parse_options=parse_options,
        convert_options=convert_options,
        read_options=read_options,
    )

    expected = [
        "dsfasd;dsffadsf",
        "dsfasd;dsffadsf",
        None,
        "this text\nhas a line break",
        "this text, like so, has commas",
    ]
    assert df.columns.tolist() == ["i", "my_string", "nonexistent_column"]
    assert df["nonexistent_column"].isnull().all()
    assert_series_equal(df["my_string"], Series(expected, name="my_string"))
Example No. 24
 def read_one_sales(fn, stores_list=None, incl_promo=True):
     my_cols = [
         'store_code_uc', 'upc', 'week_end', 'units', 'prmult', 'price'
     ]
     if incl_promo:
         my_cols = my_cols + ['feature', 'display']
     convert_dict = {
         'feature': pa.int8(),
         'display': pa.int8(),
         'prmult': pa.int8(),
         'units': pa.uint16(),
         'store_code_uc': pa.uint32()
     }
     dataset = ds.dataset(
         csv.read_csv(fn,
                      parse_options=csv.ParseOptions(delimiter='\t'),
                      convert_options=csv.ConvertOptions(
                          column_types=convert_dict,
                          include_columns=my_cols)))
     if stores_list is None:
         return dataset.to_table()
     else:
         return dataset.to_table(
             filter=ds.field('store_code_uc').isin(stores_list))
Example No. 25
 def pa_parse_options(self):
     parse_options = self.parse_options or pac.ParseOptions()
     parse_options.delimiter = self.delimiter
     parse_options.quote_char = self.quote_char
     return parse_options
Example No. 26
    def from_file(path: str,
                  file_format: str = None,
                  name: str = None,
                  perform_gzip: bool = True,
                  dtype: dict = None):
        """
		- File is read in with pyarrow, converted to bytes, compressed by default, and stored as a SQLite blob field.
		- Note: If you do not remove your file's index columns before importing them, then they will be included in your Dataset. The ordered nature of this column represents potential bias during analysis. You can drop these and other columns in memory when creating a Featureset from your Dataset.
		- Note: If no column names are provided, then they will be inserted automatically.
		- `path`: Local or absolute path
		- `file_format`: Accepts uncompressed formats including parquet, csv, and tsv (a csv with `delimiter='\t'`). This tag tells pyarrow how to handle the file. We do not infer the format from the file extension because (a) we don't want to force file extensions, and (b) we want to make sure users know which file formats we support.
		- `name`: if none specified, then `path` string will be used.
		- `perform_gzip`: Whether or not to perform gzip compression on the file. We have observed up to 90% compression rates during testing.
		"""

        # create some files with no column names
        # do some testing with sparse null column names...
        # do some testing with all null column names...
        accepted_formats = ['csv', 'tsv', 'parquet']
        if file_format not in accepted_formats:
            print(
                "Error - Accepted file formats include uncompressed csv, tsv, and parquet."
            )
        else:
            # Defaults.
            if name is None:
                name = path
            if perform_gzip is None:
                perform_gzip = True

            #ToDo prevent ff combos like '.csv' with 'parquet' vice versa.

            # File formats.
            if (file_format == 'tsv') or (file_format is None):
                parse_opt = pc.ParseOptions(delimiter='\t')
                tbl = pc.read_csv(path, parse_options=parse_opt)
                file_format = 'tsv'
            elif (file_format == 'csv'):
                parse_opt = pc.ParseOptions(delimiter=',')
                tbl = pc.read_csv(path, parse_options=parse_opt)
            elif (file_format == 'parquet'):
                tbl = pq.read_table(path)

            #ToDo - handle columns with no name.
            columns = tbl.column_names

            with open(path, "rb") as f:
                bytesio = io.BytesIO(f.read())
                data = bytesio.getvalue()
                if perform_gzip:
                    data = gzip.compress(data)
                    is_compressed = True
                else:
                    is_compressed = False

            d = Dataset.create(name=name,
                               data=data,
                               dtype=dtype,
                               file_format=file_format,
                               is_compressed=is_compressed,
                               columns=columns)
            return d
Example No. 27
    def read_year(self,
                  year,
                  hh_states_keep=None,
                  hh_states_drop=None,
                  hh_dma_keep=None,
                  hh_dma_drop=None):

        (purch_fn, trip_fn, panelist_fn) = get_fns(self.annual_dict[year])

        hh_ds = ds.dataset(
            csv.read_csv(panelist_fn,
                         parse_options=csv.ParseOptions(delimiter='\t'),
                         convert_options=csv.ConvertOptions(
                             auto_dict_encode=True,
                             auto_dict_max_cardinality=1024)))

        # build an arrow dataset filter object one by one
        my_filter = ds.field('Projection_Factor') > 0
        if hh_states_keep:
            my_filter = my_filter & (
                ds.field('Fips_State_Desc').isin(hh_states_keep))
        if hh_states_drop:
            my_filter = my_filter & (
                ~ds.field('Fips_State_Desc').isin(hh_states_drop))
        if hh_dma_keep:
            my_filter = my_filter & (ds.field('DMA_Cd').isin(hh_dma_keep))
        if hh_dma_drop:
            my_filter = my_filter & (~ds.field('DMA_Cd').isin(hh_dma_drop))

        # convert to pandas and get unique HH list
        hh_df = hh_ds.to_table(filter=my_filter).to_pandas().rename(
            columns=hh_dict_rename)
        hh_list = hh_df.household_code.unique()

        # use pyarrow filter to filter trips for just our households
        trip_df = ds.dataset(csv.read_csv(trip_fn, parse_options=csv.ParseOptions(delimiter='\t')))\
                  .to_table(filter=ds.field('household_code').isin(hh_list)).to_pandas()

        trip_list = trip_df.trip_code_uc.unique()
        upc_list = self.prod_df.upc.unique()

        # use pyarrow to filter purchases using trips and UPCs only
        purch_ds = ds.dataset(
            csv.read_csv(purch_fn,
                         parse_options=csv.ParseOptions(delimiter='\t'),
                         convert_options=csv.ConvertOptions(
                             auto_dict_encode=True,
                             auto_dict_max_cardinality=1024)))
        purch_filter = ds.field('trip_code_uc').isin(trip_list) & ds.field(
            'upc').isin(upc_list)
        purch_df = purch_ds.to_table(filter=purch_filter).to_pandas()

        # Add the fields to the trips and purchases for convenience later
        trip_df2 = pd.merge(trip_df,
                            hh_df[self.hh_cols],
                            on=['household_code', 'panel_year'])
        purch_df2 = pd.merge(pd.merge(
            purch_df,
            self.prod_df[self.prod_cols], on=['upc', 'upc_ver_uc']),
            trip_df2[self.hh_cols+['trip_code_uc', 'purchase_date', 'store_code_uc']], on=['trip_code_uc'])\
            .rename(columns={'fips_state_desc': 'hh_state_desc'})

        self.purch_df = self.purch_df.append(purch_df2, ignore_index=True)
        self.trip_df = self.trip_df.append(trip_df2, ignore_index=True)
        self.hh_df = self.hh_df.append(hh_df, ignore_index=True)
        return
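The pattern used above (read the CSV into a table, wrap it with `ds.dataset`, then filter with `to_table(filter=...)`) reduced to a standalone sketch; file and column names are placeholders:

import pyarrow.dataset as ds
from pyarrow import csv

table = csv.read_csv("panelists.tsv", parse_options=csv.ParseOptions(delimiter="\t"))
dataset = ds.dataset(table)  # wrap the in-memory table so expression filters apply

keep = (ds.field("Projection_Factor") > 0) & ds.field("Fips_State_Desc").isin(["CA", "NY"])
filtered = dataset.to_table(filter=keep)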
Example No. 28
from pyarrow import csv as pacsv

filename = "test_input.txt"

if __name__ == "__main__":
    read_options = pacsv.ReadOptions(
        column_names=["group_id", "seq_number", "data"])

    parse_options = pacsv.ParseOptions(delimiter="\t")

    table = pacsv.read_csv(filename,
                           read_options=read_options,
                           parse_options=parse_options)
Example No. 29
import pyarrow.parquet as pq
import pyarrow.csv as pv

csvfile = 'pressureVacuum.csv'

tb = pv.read_csv(csvfile,parse_options=pv.ParseOptions(delimiter=','))

print(tb)

parquetfile = 'pressureVacuum.parquet'

pq.write_table(tb,parquetfile,compression='BROTLI')
#  {'NONE', 'SNAPPY', 'GZIP', 'LZO', 'BROTLI', 'LZ4', 'ZSTD'}

df = pq.read_table(parquetfile,columns=None)

print(df)
Example No. 30
from pathlib import Path

import pyarrow as pa
from pyarrow import csv

# source_path (the raw TSV) and temp_path (the filtered TSV) are assumed to be
# defined elsewhere in the original script; only target_path is shown here.
target_path = Path("data.arrows")


with open(source_path) as source:
    with open(temp_path, "w") as target:
        for source_line in source:
            if source_line.count("\t") != 8:
                # filter out records with anomalous columns
                # matches the hack in streaming-tsv-parser.js:27
                continue
            target.write(source_line)

table = csv.read_csv(
    temp_path,
    parse_options=csv.ParseOptions(
        delimiter="\t",
    ),
    convert_options=pa.csv.ConvertOptions(
        column_types={
            "date": pa.uint32(),
            "x": pa.float32(),
            "y": pa.float32(),
            "ix": pa.uint32(),
            "language": pa.dictionary(pa.int32(), pa.utf8())
        },
        null_values=["None", ""]
    ),
)

# remove unused columns
table = table.select(["ix", "x", "y", "title", "first_author_name", "date", "language"])
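The snippet ends after column selection; the name target_path = Path("data.arrows") suggests the pruned table is then written out as an Arrow IPC stream. A hedged sketch of that final step, under that assumption:

# Assumed final step: write the pruned table to target_path in Arrow IPC stream format.
with pa.OSFile(str(target_path), "wb") as sink:
    with pa.ipc.new_stream(sink, table.schema) as writer:
        writer.write_table(table)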