Example No. 1
    def _read_csv(self) -> "Table":
        import pyarrow as pa
        from pyarrow import csv

        if not self.output_location:
            raise ProgrammingError("OutputLocation is none or empty.")
        if not self.output_location.endswith((".csv", ".txt")):
            return pa.Table.from_pydict(dict())
        length = self._get_content_length()
        if length and self.output_location.endswith(".txt"):
            description = self.description if self.description else []
            column_names = [d[0] for d in description]
            read_opts = csv.ReadOptions(
                skip_rows=0,
                column_names=column_names,
                block_size=self._block_size,
                use_threads=True,
            )
            parse_opts = csv.ParseOptions(
                delimiter="\t",
                quote_char=False,
                double_quote=False,
                escape_char=False,
            )
        elif length and self.output_location.endswith(".csv"):
            read_opts = csv.ReadOptions(skip_rows=0,
                                        block_size=self._block_size,
                                        use_threads=True)
            parse_opts = csv.ParseOptions(
                delimiter=",",
                quote_char='"',
                double_quote=True,
                escape_char=False,
            )
        else:
            return pa.Table.from_pydict(dict())

        bucket, key = parse_output_location(self.output_location)
        try:
            return csv.read_csv(
                self._fs.open_input_stream(f"{bucket}/{key}"),
                read_options=read_opts,
                parse_options=parse_opts,
                convert_options=csv.ConvertOptions(
                    quoted_strings_can_be_null=False,
                    timestamp_parsers=self.timestamp_parsers,
                    column_types=self.column_types,
                ),
            )
        except Exception as e:
            _logger.exception(f"Failed to read {bucket}/{key}.")
            raise OperationalError(*e.args) from e
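A minimal, standalone sketch of the tab-separated branch above, reading from an in-memory buffer instead of an S3 stream (payload and column names are invented for illustration):

import io
from pyarrow import csv

# Two tab-separated rows, no header line, quoting fully disabled as above.
payload = io.BytesIO(b"1\talice\n2\tbob\n")
read_opts = csv.ReadOptions(column_names=["id", "name"])
parse_opts = csv.ParseOptions(
    delimiter="\t",
    quote_char=False,
    double_quote=False,
    escape_char=False,
)
table = csv.read_csv(payload, read_options=read_opts, parse_options=parse_opts)
print(table.to_pydict())  # {'id': [1, 2], 'name': ['alice', 'bob']}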
Example No. 2
    def _read_table_arrow(self, source: tp.BinaryIO,
                          schema: pa.Schema) -> pa.Table:

        try:

            read_options = pa_csv.ReadOptions()
            read_options.encoding = 'utf-8'
            read_options.use_threads = False

            parse_options = pa_csv.ParseOptions()
            parse_options.newlines_in_values = True

            convert_options = pa_csv.ConvertOptions()
            convert_options.include_columns = schema.names
            convert_options.column_types = {
                n: t
                for (n, t) in zip(schema.names, schema.types)
            }
            convert_options.strings_can_be_null = True
            convert_options.quoted_strings_can_be_null = False

            return pa_csv.read_csv(source, read_options, parse_options,
                                   convert_options)

        except pa.ArrowInvalid as e:
            err = f"CSV file decoding failed, content is garbled"
            self._log.exception(err)
            raise _ex.EDataCorruption(err) from e

        except pa.ArrowKeyError as e:
            err = f"CSV file decoding failed, one or more columns is missing"
            self._log.error(err)
            self._log.exception(str(e))
            raise _ex.EDataCorruption(err) from e
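The except clauses above assume PyArrow signals CSV conversion problems with ArrowInvalid; a small self-contained illustration of that behaviour (file contents invented for the example):

import io
import pyarrow as pa
from pyarrow import csv as pa_csv

schema = pa.schema([("id", pa.int32()), ("name", pa.string())])
convert_options = pa_csv.ConvertOptions(
    column_types={n: t for n, t in zip(schema.names, schema.types)})

# "abc" cannot be parsed as int32, so read_csv raises pa.ArrowInvalid.
bad_payload = io.BytesIO(b"id,name\nabc,alice\n")
try:
    pa_csv.read_csv(bad_payload, convert_options=convert_options)
except pa.ArrowInvalid as e:
    print(f"conversion failed: {e}")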
Example No. 3
def write_files(metadata: AlchemyMetadata) -> None:
    """
    Creates a Parquet file for each table in the schema.
    """
    tables: Iterator[AlchemyTable] = metadata.tables.values()
    for table in tables:
        name = table.name
        print(name)

        def get_path(prefix: Path, suffix: str):
            parent_dir = prefix.joinpath(metadata.schema)
            parent_dir.mkdir(exist_ok=True, parents=True)
            return parent_dir.joinpath(name).with_suffix(suffix)

        extract_file = get_path(EXTRACT_PATH_PREFIX, ".csv.zst")
        parquet_file = get_path(PARQUET_PREFIX, ".parquet")

        arrow_schema = pa.schema(get_fields(table))
        column_names = [name for name, dtype in get_fields(table)]

        read_options = pcsv.ReadOptions(column_names=column_names, block_size=1000000000)
        parse_options = pcsv.ParseOptions(newlines_in_values=True)
        convert_options = pcsv.ConvertOptions(column_types=arrow_schema, timestamp_parsers=["%Y%m%d", "%Y-%m-%d"],
                                              true_values=["1", "T"], false_values=["0", "F"], strings_can_be_null=True)

        parquet_writer = pq.ParquetWriter(parquet_file, schema=arrow_schema, compression='zstd',
                                          version="2.0", use_dictionary=True)
        stream_reader = pcsv.open_csv(extract_file, read_options=read_options, parse_options=parse_options,
                                      convert_options=convert_options)
        for batch in stream_reader:
            batch_table = pa.Table.from_batches([batch])
            parquet_writer.write_table(batch_table)
        parquet_writer.close()
Example No. 4
 def pa_read_options(self):
     read_options = self.read_options or pac.ReadOptions()
     if self.skip_rows is not None:
         read_options.skip_rows = self.skip_rows
     if self.column_names is not None:
         read_options.column_names = self.column_names
     if self.autogenerate_column_names is not None:
         read_options.autogenerate_column_names = self.autogenerate_column_names
     return read_options
Example No. 5
    def csv_to_table(self, csv_path, table_name, read = None, parse = None, convert = None, con = None, auto_infer = False):
        ' Pyarrow CSV reader documentation: https://arrow.apache.org/docs/python/generated/pyarrow.csv.read_csv.html '

        if not ARROW:
            return "Optional pyarrow dependency not found. To install: pip3 install pyarrow"
        
        sqream_to_pa = {
            'ftBool':     pa.bool_(),
            'ftUByte':    pa.uint8(),
            'ftShort':    pa.int16(),
            'ftInt':      pa.int32(),
            'ftLong':     pa.int64(),
            'ftFloat':    pa.float32(),
            'ftDouble':   pa.float64(),
            'ftDate':     pa.timestamp('ns'),
            'ftDateTime': pa.timestamp('ns'),
            'ftVarchar':  pa.string(),
            'ftBlob':     pa.utf8()
        }

        start = time.time()
        # Get table metadata
        con = con or self
        con.execute(f'select * from {table_name} where 1=0')
        
        # Map column names to pyarrow types and set Arrow's CSV parameters
        sqream_col_types = [col_type[0] for col_type in con.col_type_tups]
        column_types = dict(zip(con.col_names, [sqream_to_pa[col_type[0]] for col_type in con.col_type_tups]))
        read = read or csv.ReadOptions(column_names=con.col_names)
        parse = parse or csv.ParseOptions(delimiter='|')
        convert = convert or csv.ConvertOptions(column_types = None if auto_infer else column_types)
        
        # Read CSV to in-memory arrow format
        csv_arrow = csv.read_csv(csv_path, read_options=read, parse_options=parse, convert_options=convert).combine_chunks()
        num_chunks = len(csv_arrow[0].chunks)
        numpy_cols = []

        # For each column, get the numpy representation for quick packing 
        for col_type, col in zip(sqream_col_types, csv_arrow):
            # Only one chunk after combine_chunks()
            col = col.chunks[0]
            if col_type in  ('ftVarchar', 'ftBlob', 'ftDate', 'ftDateTime'):
                col = col.to_pandas()
            else:
                col = col.to_numpy()
            
            numpy_cols.append(col)
        
        print (f'total loading csv: {time.time()-start}')
        start = time.time()
        
        # Insert columns into SQream
        col_num = csv_arrow.shape[1]
        con.executemany(f'insert into {table_name} values ({"?,"*(col_num-1)}?)', numpy_cols)
        print (f'total inserting csv: {time.time()-start}')
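The combine_chunks() call above is what guarantees each column holds a single chunk, so col.chunks[0] is safe afterwards. A throwaway demonstration:

import pyarrow as pa

table = pa.concat_tables([
    pa.table({"x": [1, 2]}),
    pa.table({"x": [3, 4]}),
])
print(len(table["x"].chunks))              # 2 chunks before combining
combined = table.combine_chunks()
print(len(combined["x"].chunks))           # 1 chunk after combining
print(combined["x"].chunks[0].to_numpy())  # [1 2 3 4]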
Example No. 6
 def pa_read_options(self):
     if self.read_options is not None:
         read_options = self.read_options
     else:
         read_options = pac.ReadOptions(column_names=["text"])
     if self.encoding is not None:
         read_options.encoding = self.encoding
     if self.block_size is not None:
         read_options.block_size = self.block_size
     if self.use_threads is not None:
         read_options.use_threads = self.use_threads
     return read_options
Example No. 7
    def read_product(self,
                     keep_groups=None,
                     drop_groups=None,
                     keep_modules=None,
                     drop_modules=None):
        prod_cols = [
            'upc', 'upc_ver_uc', 'upc_descr', 'product_module_code',
            'product_module_descr', 'product_group_code',
            'product_group_descr', 'brand_code_uc', 'brand_descr', 'multi',
            'size1_code_uc', 'size1_amount', 'size1_units', 'dataset_found_uc',
            'size1_change_flag_uc'
        ]

        prod_dict = {
            'upc': pa.int64(),
            'upc_ver_uc': pa.int8(),
            'product_module_code': pa.uint16(),
            'brand_code_uc': pa.uint32(),
            'multi': pa.uint16(),
            'size1_code_uc': pa.uint16()
        }

        prod_df = csv.read_csv(self.product_file,
                               read_options=csv.ReadOptions(encoding='latin'),
                               parse_options=csv.ParseOptions(delimiter='\t'),
                               convert_options=csv.ConvertOptions(
                                   column_types=prod_dict,
                                   include_columns=prod_cols)).to_pandas()
        if keep_groups:
            prod_df = prod_df[prod_df['product_group_code'].isin(keep_groups)]
        if drop_groups:
            prod_df = prod_df[~prod_df['product_group_code'].isin(drop_groups)]
        if keep_modules:
            prod_df = prod_df[prod_df['product_module_code'].isin(
                keep_modules)]
        if drop_modules:
            prod_df = prod_df[~prod_df['product_module_code'].isin(drop_modules
                                                                   )]

        # dictionary encoding to save space
        prod_df['size1_units'] = prod_df['size1_units'].astype('category')
        prod_df['product_module_descr'] = prod_df[
            'product_module_descr'].astype('category')
        prod_df['product_group_code'] = prod_df['product_group_code'].astype(
            'category')

        # clean up product info
        prod_df['upc_descr'] = prod_df['upc_descr'].str.strip().str.replace(
            'RTE', '')
        prod_df['brand_descr'] = prod_df['brand_descr'].str.strip(
        ).str.replace('CTL BR', 'Private Label')
        self.prod_df = prod_df.copy()
        return
Example No. 8
 def csv_read(read_paths: List[str]):
     logger.debug(f"Reading {len(read_paths)} files.")
     tables = []
     for read_path in read_paths:
         with filesystem.open_input_file(read_path) as f:
             tables.append(
                 csv.read_csv(
                     f,
                     read_options=csv.ReadOptions(use_threads=False),
                     **arrow_csv_args))
     block = ArrowBlock(pa.concat_tables(tables))
     return block, block.get_metadata(input_files=read_paths)
Example No. 9
def _read_options_from_dict(**kwargs):
    """Returns the read options for CSV.
    Returns:
        (object) A pyarrow ReadOptions object.
    """
    return csv.ReadOptions(
        use_threads=kwargs.pop('use_threads', True),
        block_size=kwargs.pop('block_size', 1073741824),
        skip_rows=kwargs.pop('skip_rows', 0),
        column_names=kwargs.pop('column_names', None),
        autogenerate_column_names=kwargs.pop('autogenerate_column_names',
                                             False),
        encoding=kwargs.pop('encoding', 'utf8'),
    )
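A short usage sketch for the helper above (assuming it is in scope); keyword arguments it does not recognise are silently ignored, and the defaults from the code (1 GiB block size, utf8 encoding) apply to anything not passed in:

from pyarrow import csv

opts = _read_options_from_dict(block_size=1 << 20, skip_rows=1)
assert isinstance(opts, csv.ReadOptions)
assert opts.block_size == 1 << 20 and opts.skip_rows == 1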
Example No. 10
def read_delta_file_envelopes(
        annotated_delta_manifests: List[Dict[str, Any]],
        column_names: List[str],
        primary_keys: List[str],
        sort_keys: List[str],
        deltacat_storage=unimplemented_deltacat_storage) \
        -> Optional[List[Dict[str, Any]]]:

    tables_and_annotations = []
    columns_to_read = list(chain(primary_keys, sort_keys))
    for annotated_delta_manifest in annotated_delta_manifests:
        tables = deltacat_storage.download_delta_manifest(
            annotated_delta_manifest,
            file_reader_kwargs={
                CONTENT_TYPE_TO_USER_KWARGS_KEY[ContentType.CSV.value]: {
                    "read_options": pacsv.ReadOptions(
                        column_names=column_names),
                    "convert_options": pacsv.ConvertOptions(
                        include_columns=columns_to_read),
                },
                CONTENT_TYPE_TO_USER_KWARGS_KEY[ContentType.PARQUET.value]: {
                    "columns": columns_to_read
                },
                CONTENT_TYPE_TO_USER_KWARGS_KEY[ContentType.FEATHER.value]: {
                    "columns": columns_to_read
                },
            },
        )
        annotations = dma.get_annotations(annotated_delta_manifest)
        assert len(tables) == len(annotations), (
            f"Unexpected Error: Length of downloaded delta manifest tables "
            f"({len(tables)}) doesn't match the length of delta manifest "
            f"annotations ({len(annotations)}).")
        tables_and_annotations.append((tables, annotations))
    if not tables_and_annotations:
        return None

    delta_file_envelopes = []
    for tables, annotations in tables_and_annotations:
        for i in range(len(tables)):
            delta_file = delta_file_envelope.of(
                dma.get_annotation_stream_position(annotations[i]),
                dma.get_annotation_file_index(annotations[i]),
                dma.get_annotation_delta_type(annotations[i]),
                tables[i],
            )
            delta_file_envelopes.append(delta_file)
    return delta_file_envelopes
Example No. 11
def func(id: int, conn: str, query: str) -> Any:
    engine = create_engine(conn)
    conn = engine.connect()
    cur = conn.connection.cursor()
    store = io.BytesIO()

    with Timer() as timer:
        cur.copy_expert(f"COPY ({query}) TO STDOUT WITH CSV HEADER;", store)
    print(f"[Copy {id}] {timer.elapsed:.2f}s")

    store.seek(0)
    with Timer() as timer:
        df = csv.read_csv(store,
                          read_options=csv.ReadOptions(use_threads=False))
    print(f"[Read CSV {id}] {timer.elapsed:.2f}s")

    return df
Example No. 12
    def _read_stream(self, f: "pyarrow.NativeFile", path: str,
                     **reader_args) -> Iterator[Block]:
        import pyarrow
        from pyarrow import csv

        read_options = reader_args.pop("read_options",
                                       csv.ReadOptions(use_threads=False))
        reader = csv.open_csv(f, read_options=read_options, **reader_args)
        schema = None
        while True:
            try:
                batch = reader.read_next_batch()
                table = pyarrow.Table.from_batches([batch], schema=schema)
                if schema is None:
                    schema = table.schema
                yield table
            except StopIteration:
                return
Example No. 13
    def read_csv(self, filenames, delimiter=','):
        parquet_writer = None
        for file in filenames:
            csv_reader = csv.open_csv(
                file,
                read_options=csv.ReadOptions(use_threads=True),
                parse_options=csv.ParseOptions(delimiter=delimiter),
                convert_options=csv.ConvertOptions(column_types=self.dtype))
            if parquet_writer is None:
                # Create the Parquet writer once, from the first file's schema,
                # instead of re-opening (and truncating) the output for every file.
                parquet_writer = pq.ParquetWriter(self.parquet_file,
                                                  csv_reader.schema)

            nrow = 0
            for batch in csv_reader:
                batch_df = batch.to_pandas()
                nrow += batch_df.shape[0]
                parquet_writer.write_table(pa.Table.from_pandas(df=batch_df))

        if parquet_writer is not None:
            parquet_writer.close()
        return ds.dataset(self.parquet_file, format="parquet")
Example No. 14
def csv_to_parquet(
    csv_file: Path,
    parquet_file: Path,
    *,
    delimiter: str,
    column_names: List[str],
    quiet: bool = False,
) -> None:
    block_size = 1 << 24  # 16 MB
    read_options = csv.ReadOptions(column_names=column_names, block_size=block_size)
    parse_options = csv.ParseOptions(delimiter=delimiter)
    writer = None
    with csv.open_csv(
        csv_file, read_options=read_options, parse_options=parse_options
    ) as csv_reader:
        for batch in tqdm(csv_reader, disable=quiet):
            if writer is None:
                writer = pq.ParquetWriter(parquet_file, csv_reader.schema, compression="zstd")
            table = pa.Table.from_batches([batch])
            writer.write_table(table)
    if writer is not None:
        writer.close()
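A hedged usage example for the converter above; the file paths and column names are placeholders:

from pathlib import Path

csv_to_parquet(
    Path("events.tsv"),
    Path("events.parquet"),
    delimiter="\t",
    column_names=["user_id", "event", "ts"],
    quiet=True,
)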
Example No. 15
def s3_file_to_table(s3_url: str,
                     content_type: str,
                     content_encoding: str,
                     pa_read_func_kwargs: Optional[Dict[str, Any]] = None,
                     **s3_client_kwargs) -> pa.Table:

    logger.debug(f"Reading {s3_url} to PyArrow. Content type: {content_type}. "
                 f"Encoding: {content_encoding}")
    s3_obj = s3_utils.get_object_at_url(s3_url, **s3_client_kwargs)
    logger.debug(f"Read S3 object from {s3_url}: {s3_obj}")
    pa_read_func = CONTENT_TYPE_TO_PA_READ_FUNC[content_type]
    input_file_init = ENCODING_TO_FILE_INIT[content_encoding]
    input_file = input_file_init(fileobj=io.BytesIO(s3_obj['Body'].read()))

    args = [input_file]
    kwargs = CONTENT_TYPE_TO_READER_KWARGS[content_type]

    if pa_read_func_kwargs is None:
        pa_read_func_kwargs = {}
    if content_type in DELIMITED_TEXT_CONTENT_TYPES:
        # ReadOptions can't be included in CONTENT_TYPE_TO_KWARGS because it doesn't pickle:
        #   File "/home/ubuntu/anaconda3/lib/python3.7/site-packages/ray/cloudpickle/cloudpickle_fast.py", line 563, in dump
        #       return Pickler.dump(self, obj)
        #   File "stringsource", line 2, in pyarrow._csv.ReadOptions.__reduce_cython__
        #   TypeError: self.options cannot be converted to a Python object for pickling
        logger.debug(f"{content_type} is a delimited text content type")
        kwargs["read_options"] = pacsv.ReadOptions(
            autogenerate_column_names=True)
    if pa_read_func_kwargs:
        kwargs.update(
            pa_read_func_kwargs.get(
                CONTENT_TYPE_TO_USER_KWARGS_KEY[content_type]))
    table, latency = timed_invocation(pa_read_func, *args, **kwargs)
    # Pyarrow.orc is disabled in Pyarrow 0.15, 0.16:
    # https://issues.apache.org/jira/browse/ARROW-7811
    # if content_type == DatasetConstants.ContentType.ORC:
    #    result = result.read()
    logger.debug(f"Time to read {s3_url} into PyArrow table: {latency}s")
    return table
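The long comment above refers to ReadOptions not being picklable in the PyArrow version that code targeted; newer releases may behave differently, so a quick check is the safest way to find out:

import pickle
import pyarrow.csv as pacsv

try:
    pickle.dumps(pacsv.ReadOptions(autogenerate_column_names=True))
    print("ReadOptions pickles in this PyArrow version")
except TypeError as exc:
    print(f"ReadOptions does not pickle here: {exc}")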
Example No. 16
def test_csv_options(in_type, pd_old_type, pd_new_type):
    schema = pa.schema([("string_col", pa.string())])

    read_options = csv.ReadOptions(skip_rows=1)

    parse_options = csv.ParseOptions(quote_char="'",
                                     escape_char="\\",
                                     delimiter=";",
                                     newlines_in_values=True)

    convert_options = csv.ConvertOptions(
        include_columns=["i", "my_string", "nonexistent_column"],
        include_missing_columns=True,
        null_values=["NULL_STRING"],
        strings_can_be_null=True,
    )

    df = pa_read_csv_to_pandas(
        "tests/data/csv_options_test.csv",
        schema,
        False,
        pd_string=False,
        parse_options=parse_options,
        convert_options=convert_options,
        read_options=read_options,
    )

    expected = [
        "dsfasd;dsffadsf",
        "dsfasd;dsffadsf",
        None,
        "this text\nhas a line break",
        "this text, like so, has commas",
    ]
    assert df.columns.tolist() == ["i", "my_string", "nonexistent_column"]
    assert df["nonexistent_column"].isnull().all()
    assert_series_equal(df["my_string"], Series(expected, name="my_string"))
Example No. 17
    def _read_file(self, f: "pyarrow.NativeFile", path: str, **reader_args):
        from pyarrow import csv

        read_options = reader_args.pop("read_options",
                                       csv.ReadOptions(use_threads=False))
        return csv.read_csv(f, read_options=read_options, **reader_args)
Example No. 18
if __name__ == "__main__":
    args = docopt(__doc__, version="1.0")
    conn = os.environ["POSTGRES_URL"]
    table = os.environ["POSTGRES_TABLE"]

    engine = create_engine(conn)
    conn = engine.connect()

    cur = conn.connection.cursor()
    store = io.BytesIO()
    with Timer() as timer:
        cur.copy_expert(
            f"COPY (SELECT * FROM {table}) TO STDOUT WITH CSV HEADER;", store)
    print(f"[Copy] {timer.elapsed:.2f}s")

    store.seek(0)

    with Timer() as timer:
        df = csv.read_csv(store,
                          read_options=csv.ReadOptions(use_threads=False))
    print(f"[Read CSV] {timer.elapsed:.2f}s")

    with Timer() as timer:
        df = df.to_pandas()
        print(f"[To Pandas] {timer.elapsed:.2f}s")

    conn.close()
    print(df.head())
    # _, peak = tracemalloc.get_traced_memory()
    # print(f"memory peak: {peak/10**9:.2f}G")
Example No. 19
import pyarrow as pa
import pyarrow.csv as pv
import pyarrow.parquet as pq
from datetime import datetime

csv_filename = "accumulated_data_300_million_rows_converted.csv"
parquet_filename = '../data/' + csv_filename.replace('csv', 'parquet')
parquet_partition_name = '../data/' + csv_filename.replace('.csv', '')

print("Start ", datetime.now())

# ReadOptions: https://arrow.apache.org/docs/python/generated/pyarrow.csv.ReadOptions.html#pyarrow.csv.ReadOptions
csv_read_options = pv.ReadOptions(skip_rows=0,
                                  encoding="utf8",
                                  column_names=[
                                      "unit_id", "value", "start", "stop",
                                      "start_year", "start_unix_days",
                                      "stop_unix_days"
                                  ])

# ParseOptions: https://arrow.apache.org/docs/python/generated/pyarrow.csv.ParseOptions.html#pyarrow.csv.ParseOptions
csv_parse_options = pv.ParseOptions(delimiter=';')

# Types: https://arrow.apache.org/docs/python/api/datatypes.html
# TODO nullable parameter does not work as expected!
data_schema = pa.schema([
    pa.field(name='start_year', type=pa.string(), nullable=True),
    pa.field(name='unit_id', type=pa.uint64(), nullable=False),
    pa.field(name='value', type=pa.string(), nullable=False),
    pa.field(name='start_epoch_days', type=pa.int16(), nullable=True),
    pa.field(name='stop_epoch_days', type=pa.int16(), nullable=True),
])
Example No. 20
from pyarrow import csv as pacsv

filename = "test_input.txt"

if __name__ == "__main__":
    read_options = pacsv.ReadOptions(
        column_names=["group_id", "seq_number", "data"])

    parse_options = pacsv.ParseOptions(delimiter="\t")

    table = pacsv.read_csv(filename,
                           read_options=read_options,
                           parse_options=parse_options)
Example No. 21
from pyarrow import csv
from timeit import default_timer as timer
import sys

warmup_filename = sys.argv[0]
filename = sys.argv[1]

start = timer()
table = csv.read_csv(
    warmup_filename,
    read_options=csv.ReadOptions(use_threads=False)).to_pandas()
end = timer()
t1 = end - start

start = timer()
table = csv.read_csv(
    filename, read_options=csv.ReadOptions(use_threads=False)).to_pandas()
end = timer()
t2 = end - start

start = timer()
table = csv.read_csv(
    filename, read_options=csv.ReadOptions(use_threads=False)).to_pandas()
end = timer()
t3 = end - start

print(t1)
print(t2)
print(t3)
print('NaN')
print('NaN')
Example No. 22
import pyarrow as pa
import pyarrow.csv as pv
import pyarrow.parquet as pq
from pyarrow.lib import Table

csv = 'accumulated_data_300_million_rows_id_filter.csv'
target_file = '../data/accumulated_data_300_million_rows_id_filter_1mill.parquet'

csv_read_options = pv.ReadOptions(skip_rows=0,
                                  encoding="utf8",
                                  column_names=["unit_id"])

# Types: https://arrow.apache.org/docs/python/api/datatypes.html
data_schema = pa.schema([('unit_id', pa.uint64())])

# ConvertOptions: https://arrow.apache.org/docs/python/generated/pyarrow.csv.ConvertOptions.html#pyarrow.csv.ConvertOptions
csv_convert_options = pv.ConvertOptions(column_types=data_schema)

table: Table = pv.read_csv(input_file=csv,
                           read_options=csv_read_options,
                           convert_options=csv_convert_options)
pq.write_table(table, target_file)

print('Generated file with the following:')
print('Parquet metadata: ' + str(pq.read_metadata(target_file)))
print('Parquet schema: ' + pq.read_schema(target_file).to_string())
Example No. 23
 def pa_read_options(self):
     read_options = self.read_options or pac.ReadOptions()
     read_options.skip_rows = self.skip_rows
     read_options.autogenerate_column_names = not self.header_as_column_names
     return read_options
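With header_as_column_names disabled, the reader above autogenerates column names; PyArrow names them "f0", "f1", ... and any real header row has to be skipped explicitly. A minimal illustration:

import io
from pyarrow import csv as pac

payload = io.BytesIO(b"a,b\n1,2\n")
opts = pac.ReadOptions(skip_rows=1, autogenerate_column_names=True)
print(pac.read_csv(payload, read_options=opts).column_names)  # ['f0', 'f1']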
Example No. 24
def convert_csv_to_parquet(csv_file: str, parquet_dir: str, partitioned: bool):
    print("Start ", datetime.now())

    print(csv_file)
    print(parquet_dir)

    print("Abs path of csv file: " + os.path.abspath(csv_file))

    #Remove old partitions
    if partitioned:
        if Path(parquet_dir).is_dir():
            shutil.rmtree(parquet_dir)

    # ReadOptions: https://arrow.apache.org/docs/python/generated/pyarrow.csv.ReadOptions.html#pyarrow.csv.ReadOptions
    csv_read_options = pv.ReadOptions(skip_rows=0,
                                      encoding="utf8",
                                      column_names=[
                                          "unit_id", "value", "start", "stop",
                                          "start_year", "start_epoch_days",
                                          "stop_epoch_days"
                                      ])

    # ParseOptions: https://arrow.apache.org/docs/python/generated/pyarrow.csv.ParseOptions.html#pyarrow.csv.ParseOptions
    csv_parse_options = pv.ParseOptions(delimiter=';')

    # Types: https://arrow.apache.org/docs/python/api/datatypes.html
    # TODO nullable parameter does not work as expected!
    data_schema = pa.schema([
        pa.field(name='start_year', type=pa.string(), nullable=True),
        pa.field(name='unit_id', type=pa.uint64(), nullable=False),
        pa.field(name='value', type=pa.string(), nullable=False),
        pa.field(name='start_epoch_days', type=pa.int16(), nullable=True),
        pa.field(name='stop_epoch_days', type=pa.int16(), nullable=True),
    ])

    # ConvertOptions: https://arrow.apache.org/docs/python/generated/pyarrow.csv.ConvertOptions.html#pyarrow.csv.ConvertOptions
    csv_convert_options = pv.ConvertOptions(column_types=data_schema)
    #include_columns=["start_year", "unit_id", "value", "start_epoch_days", "stop_epoch_days"])

    # read_csv: https://arrow.apache.org/docs/python/generated/pyarrow.csv.read_csv.html#pyarrow.csv.read_csv
    table = pv.read_csv(input_file=csv_file,
                        read_options=csv_read_options,
                        parse_options=csv_parse_options,
                        convert_options=csv_convert_options)

    # print('Bytes: ' + str(table.nbytes))
    # print('Rows: ' + str(table.num_rows))
    # print('Schema: ' + str(table.schema))
    # print('Column names: ' + str(table.column_names))
    # pandas.set_option('max_columns', None)  # print all columns
    # print(table.to_pandas().head(10))

    # write with partitions

    if partitioned:
        pq.write_to_dataset(table,
                            root_path=parquet_dir,
                            partition_cols=['start_year'])
    else:
        pq.write_to_dataset(table, root_path=parquet_dir)

    print("End ", datetime.now())