Example #1
def test_encrypted_parquet_read_schema_no_decryption_config(
        tempdir, data_table):
    """Write an encrypted parquet, verify it's encrypted,
    but then try to read its schema without decryption properties."""
    test_encrypted_parquet_write_read(tempdir, data_table)
    with pytest.raises(IOError, match=r"no decryption"):
        pq.read_schema(tempdir / PARQUET_NAME)
Example #2
def get_filters(
    parquet: BytesIO, condition_groups: List[ConditionGroup]
) -> Optional[List[PyArrowConditionGroup]]:
    """ Given a dataframe and list of conditions, filter the dataframe using a boolean OR """
    schema = pq.read_schema(parquet)

    filters: List[PyArrowConditionGroup] = []

    for group in condition_groups:
        group_tuple = []

        for entry in group["conditions"]:
            try:
                field = schema.field(entry["column"])

                if field.type == pyarrow.null():
                    # Empty dataset; thus no inherent data type for entry["column"]
                    continue

                group_tuple.append((
                    entry["column"],
                    entry["operator"],
                    entry["values"],
                ))
            except KeyError:
                # Column does not exist within schema
                continue

        if group_tuple:
            filters.append(group_tuple)

    return filters or None
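A usage sketch follows (the file name, column names, and condition values are illustrative, not from the original project): the returned groups are already in the OR-of-ANDs form that pq.read_table accepts for its filters argument.

# Hypothetical condition groups; conditions inside a group are ANDed, groups are ORed.
groups = [
    {"conditions": [{"column": "country", "operator": "==", "values": "NO"}]},
    {"conditions": [{"column": "year", "operator": ">=", "values": 2020}]},
]
with open("data/example.parquet", "rb") as f:
    parquet_bytes = BytesIO(f.read())
filters = get_filters(parquet_bytes, groups)
if filters is not None:
    table = pq.read_table("data/example.parquet", filters=filters)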
Example #3
def file_to_schema(input_tuple):
    sname, fpath = input_tuple
    try:
        schema = pq.read_schema(fpath)
    except Exception:
        # Schema could not be read; report the path and skip this file.
        print("ERROR", fpath)
        return None
    return ';'.join([str(schema.names), str(schema.types)])
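A possible way to use this helper, assuming it was meant to be mapped over many (name, path) tuples; the pool size and paths below are made up:

from multiprocessing import Pool

inputs = [("events", "data/events.parquet"),
          ("users", "data/users.parquet")]
with Pool(processes=4) as pool:
    schema_strings = pool.map(file_to_schema, inputs)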
Example #4
def run_partition_test(input_file: str, output_dir: str, filters: Optional[list] = None):
    milliseconds_since_epoch = int(time() * 1000)

    print('Parquet metadata: ' + str(pq.read_metadata(input_file)))
    print('Parquet schema: ' + pq.read_schema(input_file).to_string())

    data = pq.read_table(source=input_file, filters=filters)

    # Write a dataset and collect metadata information of all written files
    metadata_collector = []
    root_path = output_dir + 'partitioned_' + str(milliseconds_since_epoch)
    pq.write_to_dataset(data,
                        root_path=root_path,
                        partition_cols=['start_year'],
                        metadata_collector=metadata_collector)

    # Write the ``_common_metadata`` parquet file without row groups statistics
    pq.write_metadata(data.schema, root_path + '/_common_metadata')

    # Write the ``_metadata`` parquet file with row groups statistics of all files
    # Gives following error:
    #       File "pyarrow/_parquet.pyx", line 616, in pyarrow._parquet.FileMetaData.append_row_groups
    #       RuntimeError: AppendRowGroups requires equal schemas.
    # data.schema has one more column than partitioned files when partitioning by one column
    # Related? https://github.com/dask/dask/issues/6243
    # pq.write_metadata(data.schema, root_path + '/_metadata', metadata_collector=metadata_collector)

    # Read from partitioned dataset
    # use the new generic Dataset API
    start_year = 2018
    value = 50000
    table = pq.read_table(root_path,
                          filters=[('start_year', '>=', start_year), ('value', '>', value)])
                          # filters=[('start_year', '>=', start_year)])
    print(table.to_pandas())
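One possible workaround for the AppendRowGroups error noted in the comments, assuming the collected file metadata differs from data.schema only by the partition column, is to drop that column before writing _metadata. This is a sketch (reusing data, root_path, and metadata_collector from the function above) and has not been verified against every pyarrow version:

partition_file_schema = data.schema.remove(
    data.schema.get_field_index('start_year'))
pq.write_metadata(partition_file_schema, root_path + '/_metadata',
                  metadata_collector=metadata_collector)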
Example #5
def test_read_schema(tempdir):
    N = 100
    df = pd.DataFrame({
        'index': np.arange(N),
        'values': np.random.randn(N)
    }, columns=['index', 'values'])

    data_path = tempdir / 'test.parquet'

    table = pa.Table.from_pandas(df)
    _write_table(table, data_path)

    read1 = pq.read_schema(data_path)
    read2 = pq.read_schema(data_path, memory_map=True)
    assert table.schema.equals(read1)
    assert table.schema.equals(read2)

    assert table.schema.metadata[b'pandas'] == read1.metadata[b'pandas']
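The b'pandas' entry checked above is JSON; a small follow-on sketch (reusing read1 from the test) can decode it to inspect the stored column metadata:

import json

pandas_meta = json.loads(read1.metadata[b'pandas'].decode('utf-8'))
print([col['name'] for col in pandas_meta['columns']])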
Example #6
def columns_from_parquet_file(input_parquet):
    schema = pa_parquet.read_schema(input_parquet)
    column_specifications = schema.pandas_metadata['columns']
    options = []
    for col_item in column_specifications:
        name = col_item['name']
        if name is not None:
            options.append({'label': name, 'value': name})
    return options
Example #7
def test_write_parquet_no_index():
    df = pd.DataFrame(np.random.randn(6, 4),
                      columns=list('abcd'),
                      index=np.arange(6))

    with tempfile.TemporaryDirectory() as tmpdir:
        fs = LocalFileSystem()
        filename = os.path.join(tmpdir, 'df.parquet')
        write_dataframe_as_parquet(df, fs, filename)
        schema = parquet.read_schema(filename)
        assert '__index_level_0__' not in schema.names
        assert df.columns.values.tolist() == schema.names
Example #8
    def askUserInput(self):
        filepath = self.userdata['filepath']
        s = pa.read_schema(filepath)
        colnames = s.names

        def cb(columns):
            self.userdata['columns'] = columns

        dlgchoice = DlgListChoice(colnames, 'Open Parquet',
                                  'Choose curves to load')
        dlgchoice.dlgdata.connect(cb)
        dlgchoice.exec_()
Example #9
def _bytes2schema(data):
    reader = pa.BufferReader(data)
    schema = pq.read_schema(reader)
    fields = []
    for idx in range(len(schema)):
        f = schema[idx]

        # Schema data recovered from parquet always carries timestamps in us
        # granularity, but pandas uses ns granularity, so re-align the two here.
        if f.type == pa.timestamp("us"):
            f = pa.field(f.name, pa.timestamp("ns"))

        fields.append(f)
    return pa.schema(fields, schema.metadata)
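A minimal usage sketch for _bytes2schema; the DataFrame and buffer are made up, and pandas is assumed to be available:

import pandas as pd

# Round-trip a timestamp column through parquet bytes and recover the schema.
df = pd.DataFrame({"ts": pd.to_datetime(["2021-01-01", "2021-01-02"])})
sink = pa.BufferOutputStream()
pq.write_table(pa.Table.from_pandas(df), sink)
schema = _bytes2schema(sink.getvalue().to_pybytes())
print(schema.field("ts").type)  # expected: timestamp[ns] (stored as ns, or re-aligned from us)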
Example #10
def run_test(input_file: str, output_dir: str, filters: list, use_pandas: bool):
    print('Using pyarrow')
    print('Parquet metadata: ' + str(pq.read_metadata(input_file)))
    print('Parquet schema: ' + pq.read_schema(input_file).to_string())

    pq_file = pq.ParquetFile(input_file)
    row_group_0_metadata = pq_file.metadata.row_group(0)
    print('Parquet min for column 0, row group 0: ' + str(row_group_0_metadata.column(0).statistics.min))
    print('Parquet max for column 0, row group 0: ' + str(row_group_0_metadata.column(0).statistics.max))

    if use_pandas:
        unfiltered_pandas_data = pq.read_table(source=input_file).to_pandas()
        size = sys.getsizeof(unfiltered_pandas_data)
        print('Size of UN-filtered pandas DataFrame in memory: ' + str(size) + ' bytes (' + str(size / 1000000) + ' MB)')

    with timeblock('pyarrow read and filter'):
        data = pq.read_table(source=input_file, filters=filters)
    size = sys.getsizeof(data)
    print('Size of filtered pyarrow table in memory: ' + str(size) + ' bytes (' + str(size / 1000000) + ' MB)')

    if use_pandas:
        unfiltered_pandas_data = data.to_pandas()
        size = sys.getsizeof(unfiltered_pandas_data)
        print('Size of filtered pandas DataFrame in memory: ' + str(size) + ' bytes (' + str(size / 1000000) + ' MB)')
        # print(pandas_data.head(10))

    milliseconds_since_epoch = int(time() * 1000)
    output_file = output_dir + str(milliseconds_since_epoch) + '.parquet'
    print('Output file name: ' + output_file)

    with timeblock('pyarrow write_table()'):
        pq.write_table(data, output_file)

    print('Parquet metadata of output: ' + str(pq.read_metadata(output_file)))
    print('Parquet schema of output: ' + pq.read_schema(output_file).to_string())
    print('Size of output file on disk: ' + str(os.path.getsize(output_file)) + ' bytes ('
          + str(os.path.getsize(output_file) / 1000000) + ' MB)')
Example #11
def read_encrypted_parquet(path, decryption_config, kms_connection_config,
                           crypto_factory):
    file_decryption_properties = crypto_factory.file_decryption_properties(
        kms_connection_config, decryption_config)
    assert (file_decryption_properties is not None)
    meta = pq.read_metadata(path,
                            decryption_properties=file_decryption_properties)
    assert (meta.num_columns == 3)
    schema = pq.read_schema(path,
                            decryption_properties=file_decryption_properties)
    assert (len(schema.names) == 3)

    result = pq.ParquetFile(path,
                            decryption_properties=file_decryption_properties)
    return result.read(use_threads=True)
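A hedged sketch of how the caller side might look; my_kms_client_factory is a stand-in for a project-specific pyarrow.parquet.encryption.KmsClient factory, and the file path is illustrative:

from datetime import timedelta
import pyarrow.parquet.encryption as pe

crypto_factory = pe.CryptoFactory(my_kms_client_factory)  # hypothetical factory
kms_connection_config = pe.KmsConnectionConfig()
decryption_config = pe.DecryptionConfiguration(
    cache_lifetime=timedelta(minutes=5))

table = read_encrypted_parquet("data.parquet.encrypted", decryption_config,
                               kms_connection_config, crypto_factory)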
Example #12
def test_read_schema(tmpdir):
    import pyarrow.parquet as pq

    N = 100
    df = pd.DataFrame({
        'index': np.arange(N),
        'values': np.random.randn(N)
    }, columns=['index', 'values'])

    data_path = pjoin(str(tmpdir), 'test.parquet')

    table = pa.Table.from_pandas(df)
    _write_table(table, data_path)

    assert table.schema.equals(pq.read_schema(data_path))
Example #13
def file_to_schema_bad(input_tuple):
    sname, fpath = input_tuple
    matched = False
    expected = proper_schema.get(sname, "UNKNOWN-SCHEMA")
    try:
        schema = pq.read_schema(fpath)
        string_match = ';'.join([str(schema.names), str(schema.types)])

        if expected == string_match:
            matched = True
        return (matched, sname, fpath, expected, string_match,
                "CORRECT-FILE")

    except Exception:
        # Schema could not be read; flag the file as corrupted.
        return (matched, sname, fpath, expected, "NULL",
                "CORRUPTED-FILE")
Example #14
    def transform_ts(start, end, file):
        train_columns = pq.read_schema(file).names  # list of all column names to test
        # print(train_columns)
        X = pd.DataFrame(data=None)

        for i in train_columns[start:end]:
            df_signal = pq.read_pandas(file, columns=[i]).to_pandas()
            # turn parquet to dataframe of one single signal
            # print("Shape of signal data {}".format(df_signal.shape))

            sig = np.ravel(df_signal.iloc[:, 0].to_numpy())  # turn to numpy
            t = df_signal.index.to_numpy()  # turn time to numpy

            x_dn = de_noising(high_pass_filter(sig))
            x_deleted = delete_repeat(x_dn)
            x_deleted_cond = (x_deleted < 99998)
            x_deleted = x_deleted[x_deleted_cond]
            print(x_deleted.shape)
            t_deleted = t[x_deleted_cond]

            # Generating New Time Series Features from signal
            master_train = pd.DataFrame({
                0: x_deleted,
                1: np.repeat(i, x_deleted.shape[0]),
                2: t_deleted
            })
            # print("Shape of master train data {}".format(master_train.shape))
            # master_train.to_csv('output/master_train.csv')

            extraction_settings = EfficientFCParameters()
            X_signal = extract_features(
                master_train,
                column_id=1,
                column_sort=2,
                impute_function=impute,
                default_fc_parameters=extraction_settings)

            print("Number of extracted features in {}: {}.".format(
                i, X_signal.shape[1]))
            # DataFrame.append was removed in pandas 2.0; use concat to accumulate results.
            X = pd.concat([X, X_signal])

        return X
Example #15
def run_id_filter_test(input_file: str, input_id_file: str):

    # converting ids to pandas will be a "zero copy conversion" as unit_id column is int64 when:
    # - ids are not nulls
    # - a single ChunkedArray
    # TODO check if that is the case
    # https://arrow.apache.org/docs/python/pandas.html#zero-copy-series-conversions
    filter_ids = pq.read_table(source=input_id_file)
    filter_ids_as_pandas: DataFrame = filter_ids.to_pandas()
    # filter_ids_as_list = filter_ids_as_pandas['unit_id'].tolist()
    filter_ids_as_set = set(filter_ids_as_pandas['unit_id'])

    print('Parquet metadata: ' + str(pq.read_metadata(input_id_file)))
    print('Parquet schema: ' + pq.read_schema(input_id_file).to_string())
    print('Using filter ids: ' + str(filter_ids.to_pandas()))

    table = pq.read_table(source=input_file, filters=[
        # ('unit_id', 'in', filter_ids_as_list)
        ('unit_id', 'in', filter_ids_as_set)
    ])
    print(table.to_pandas())
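A quick check of the two zero-copy conditions mentioned in the comments (a sketch reusing filter_ids from the function above):

unit_id_col = filter_ids.column('unit_id')
print('single chunk:', unit_id_col.num_chunks == 1)
print('no nulls:', unit_id_col.null_count == 0)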
Example #16
            writer = pq.ParquetWriter(outputFileName,schema) # ,coerce_timestamps='ms'

    if args.debug is True:
        totalWriteTime = totalWriteTime + (time.time() - startWrite)

if writer:
    writer.close()
    
# print final success message.

timeNow = time.time()
elapsed = timeNow - timeStart
rps = int(rowcount / elapsed)
print (f"{rowcount} rows exported to {outputFileName} at {rps} rows per second.")

if args.debug is True:
    print ()
    print ('Performance breakdown:')
    print (f'Read={totalReadTime:.2f} Transform={totalTransformTime:.2f} Write={totalWriteTime:.2f}')
    print ()
    print ('Test output display:')
    print ()
    print ('Schema:')
    print (pq.read_schema(outputFileName))
    print ()
    testTable = pq.read_table (outputFileName)
    print ('Data (10 rows):')
    print (testTable.to_pandas().head(10))

exit(0)
Example #17
    def _InferArrowSchema(self):
        match_result = FileSystems.match([self._file_pattern])[0]
        files_metadata = match_result.metadata_list[0]
        with FileSystems.open(files_metadata.path) as f:
            return pq.read_schema(f)
Example #18
File: lib.py  Project: ploomber/posts
def peek_metadata(path_to_table):
    """Read metadata without loading the data file
    """
    schema = pq.read_schema(path_to_table)
    return json.loads(schema.metadata[b'my_metadata'].decode('utf-8'))
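For context, a sketch of how such custom metadata might be attached on the write side; the key name comes from the function above, while the table, values, and output path are illustrative:

import json
import pyarrow as pa
import pyarrow.parquet as pq

table = pa.table({'a': [1, 2, 3]})
custom_meta = {'created_by': 'example', 'rows': 3}
merged_meta = {**(table.schema.metadata or {}),
               b'my_metadata': json.dumps(custom_meta).encode('utf-8')}
pq.write_table(table.replace_schema_metadata(merged_meta), 'path/to/table.parquet')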
Example #19
try:
    for stream in hdfs.ls(study):
        try:
            for version in hdfs.ls(stream):
                try:
                    for user in hdfs.ls(version):
                        try:
                            files = hdfs.ls(user)
                            #if len(files)>2: put this check back again if schema mismatch is required
                            old_schema = []
                            for fle in hdfs.ls(user):
                                try:
                                    if "_SUCCESS" not in fle:
                                        with hdfs.open(hdfs_url +
                                                       fle) as f:
                                            current_schema = pq.read_schema(
                                                f).names
                                            mismatched_metadata.append({
                                                "total_files":
                                                len(files),
                                                "file_name":
                                                fle,
                                                "schema":
                                                current_schema,
                                                "user_folder":
                                                user
                                            })
                                except Exception as e:
                                    print(str(e))
                        except Exception as e:
                            print(str(e))
                except Exception as e:
Example #20
def forced_extraction(sources_df: pd.DataFrame, cfg_err_ra: float,
                      cfg_err_dec: float, p_run: Run, extr_df: pd.DataFrame,
                      min_sigma: float, edge_buffer: float,
                      cluster_threshold: float, allow_nan: bool,
                      add_mode: bool, done_images_df: pd.DataFrame,
                      done_source_ids: List[int]) -> Tuple[pd.DataFrame, int]:
    """
    Check and extract expected measurements, and associate them with the
    related source(s).

    Args:
        sources_df:
            Dataframe containing all the extracted measurements and
            associations (product from association step).
        cfg_err_ra:
            The minimum RA error from the config file (in degrees).
        cfg_err_dec:
            The minimum declination error from the config file (in degrees).
        p_run:
            The pipeline run object.
        extr_df:
            The dataframe containing the information on what sources are
            missing from which images (output from
            get_src_skyregion_merged_df in main.py).
        min_sigma:
            Minimum sigma value to drop forced extracted measurements.
        edge_buffer:
            Flag to pass to ForcedPhot.measure method.
        cluster_threshold:
            Flag to pass to ForcedPhot.measure method.
        allow_nan:
            Flag to pass to ForcedPhot.measure method.
        add_mode:
            True when the pipeline is running in add image mode.
        done_images_df:
            Dataframe containing the images that have already been processed
            in a previous run (used in add image mode).
        done_source_ids:
            List of the source ids that were already present in the previous
            run (used in add image mode).

    Returns:
        The sources_df with the extracted sources added and n_forced is the
        total number of forced measurements present in the run.
    """
    logger.info('Starting force extraction step.')

    timer = StopWatch()

    # get all the skyregions and related images
    cols = [
        'id', 'name', 'measurements_path', 'path', 'noise_path', 'beam_bmaj',
        'beam_bmin', 'beam_bpa', 'background_path', 'rms_min', 'datetime',
        'skyreg__centre_ra', 'skyreg__centre_dec', 'skyreg__xtr_radius'
    ]

    images_df = pd.DataFrame(
        list(
            Image.objects.filter(run=p_run).select_related('skyreg').order_by(
                'datetime').values(*tuple(cols)))).set_index('name')
    # | name                          |   id | measurements_path   | path         | noise_path   |
    # |:------------------------------|-----:|:--------------------|:-------------|:-------------|
    # | VAST_2118-06A.EPOCH01.I.fits  |    1 | path/to/file        | path/to/file | path/to/file |
    # | VAST_2118-06A.EPOCH03x.I.fits |    3 | path/to/file        | path/to/file | path/to/file |
    # | VAST_2118-06A.EPOCH02.I.fits  |    2 | path/to/file        | path/to/file | path/to/file |

    # | name                          |   beam_bmaj |   beam_bmin |   beam_bpa | background_path   |
    # |:------------------------------|------------:|------------:|-----------:|:------------------|
    # | VAST_2118-06A.EPOCH01.I.fits  |  0.00589921 |  0.00326088 |   -70.4032 | path/to/file      |
    # | VAST_2118-06A.EPOCH03x.I.fits |  0.00470991 |  0.00300502 |   -83.1128 | path/to/file      |
    # | VAST_2118-06A.EPOCH02.I.fits  |  0.00351331 |  0.00308565 |    77.2395 | path/to/file      |

    # | name                          |   rms_min | datetime                         |   skyreg__centre_ra |   skyreg__centre_dec |   skyreg__xtr_radius |
    # |:------------------------------|----------:|:---------------------------------|--------------------:|---------------------:|---------------------:|
    # | VAST_2118-06A.EPOCH01.I.fits  |  0.173946 | 2019-08-27 18:12:16.700000+00:00 |             319.652 |              -6.2989 |               6.7401 |
    # | VAST_2118-06A.EPOCH03x.I.fits |  0.165395 | 2019-10-29 10:01:20.500000+00:00 |             319.652 |              -6.2989 |               6.7401 |
    # | VAST_2118-06A.EPOCH02.I.fits  |  0.16323  | 2019-10-30 08:31:20.200000+00:00 |             319.652 |              -6.2989 |               6.7401 |

    # Explode out the img_diff column.
    extr_df = extr_df.explode('img_diff').reset_index()
    total_to_extract = extr_df.shape[0]

    if add_mode:
        # If we are adding images to the run we assume that monitoring was
        # also performed before (enforced by the pre-run checks) so now we
        # only want to force extract in three situations:
        # 1. Any force extraction in a new image.
        # 2. The forced extraction is attached to a new source from the new
        # images.
        # 3. A new relation has been created and they need the forced
        # measurements filled in (actually covered by 2.)

        extr_df = (
            extr_df[~extr_df['img_diff'].isin(done_images_df['name'])].append(
                extr_df[(~extr_df['source'].isin(done_source_ids))
                        & (extr_df['img_diff'].isin(done_images_df.name))]).
            sort_index())

        logger.info(f"{extr_df.shape[0]} new measurements to force extract"
                    f" (from {total_to_extract} total)")

    timer.reset()
    extr_df = parallel_extraction(extr_df, images_df,
                                  sources_df[['source', 'image', 'flux_peak']],
                                  min_sigma, edge_buffer, cluster_threshold,
                                  allow_nan, add_mode, p_run.path)
    logger.info('Force extraction step time: %.2f seconds', timer.reset())

    # make measurement names unique for db constraint
    extr_df['name'] = extr_df['name'] + f'_f_run{p_run.id:06d}'

    # select sensible flux values and set the columns with fix values
    values = {'flux_int': 0, 'flux_int_err': 0}
    extr_df = extr_df.fillna(value=values)

    extr_df = extr_df[(extr_df['flux_int'] != 0)
                      & (extr_df['flux_int_err'] != 0)
                      & (extr_df['chi_squared_fit'] != np.inf)
                      & (extr_df['chi_squared_fit'] != np.nan)]

    default_pos_err = settings.POS_DEFAULT_MIN_ERROR / 3600.
    extr_df['ra_err'] = default_pos_err
    extr_df['dec_err'] = default_pos_err
    extr_df['err_bmaj'] = 0.
    extr_df['err_bmin'] = 0.
    extr_df['err_pa'] = 0.
    extr_df['ew_sys_err'] = cfg_err_ra
    extr_df['ns_sys_err'] = cfg_err_dec
    extr_df['error_radius'] = 0.

    extr_df['uncertainty_ew'] = np.hypot(cfg_err_ra, default_pos_err)
    extr_df['weight_ew'] = 1. / extr_df['uncertainty_ew'].values**2
    extr_df['uncertainty_ns'] = np.hypot(cfg_err_dec, default_pos_err)
    extr_df['weight_ns'] = 1. / extr_df['uncertainty_ns'].values**2

    extr_df['flux_peak'] = extr_df['flux_int']
    extr_df['flux_peak_err'] = extr_df['flux_int_err']
    extr_df['local_rms'] = extr_df['flux_int_err']
    extr_df['snr'] = (extr_df['flux_peak'].values /
                      extr_df['local_rms'].values)
    extr_df['spectral_index'] = 0.
    extr_df['dr'] = 0.
    extr_df['d2d'] = 0.
    extr_df['forced'] = True
    extr_df['compactness'] = 1.
    extr_df['psf_bmaj'] = extr_df['bmaj']
    extr_df['psf_bmin'] = extr_df['bmin']
    extr_df['psf_pa'] = extr_df['pa']
    extr_df['flag_c4'] = False
    extr_df['spectral_index_from_TT'] = False
    extr_df['has_siblings'] = False
    extr_df['flux_int_isl_ratio'] = 1.0
    extr_df['flux_peak_isl_ratio'] = 1.0

    col_order = read_schema(images_df.iloc[0]['measurements_path']).names
    col_order.remove('id')

    remaining = list(set(extr_df.columns) - set(col_order))

    extr_df = extr_df[col_order + remaining]

    # upload the measurements, a column 'id' is returned with the DB id
    extr_df = make_upload_measurements(extr_df)

    extr_df = extr_df.rename(columns={'source_tmp_id': 'source'})

    # write forced measurements to specific parquet
    logger.info('Saving forced measurements to specific parquet file...')
    parallel_write_parquet(extr_df, p_run.path, add_mode)

    # Required to rename this column for the image add mode.
    extr_df = extr_df.rename(columns={'time': 'datetime'})

    # append new meas into main df and proceed with source groupby etc
    sources_df = sources_df.append(
        extr_df.loc[:, extr_df.columns.isin(sources_df.columns)],
        ignore_index=True)

    # get the number of forced extractions for the run
    forced_parquets = glob(
        os.path.join(p_run.path, "forced_measurements*.parquet"))
    if forced_parquets:
        n_forced = (dd.read_parquet(forced_parquets, columns=['id'])
                    .count().compute().values[0])
    else:
        n_forced = 0

    logger.info('Total forced extraction time: %.2f seconds',
                timer.reset_init())
    return sources_df, n_forced
Example #21
    def load_metadata(self):
        schema = pq.read_schema(self.filename)
        metadata_json = schema.metadata[self.custom_meta_key.encode()]
        metadata = json.loads(metadata_json)
        return metadata
Example #22
import pyarrow as pa
import pyarrow.csv as pv
import pyarrow.parquet as pq
from pyarrow.lib import Table

csv = 'accumulated_data_300_million_rows_id_filter.csv'
target_file = '../data/accumulated_data_300_million_rows_id_filter_1mill.parquet'

csv_read_options = pv.ReadOptions(skip_rows=0,
                                  encoding="utf8",
                                  column_names=["unit_id"])

# Types: https://arrow.apache.org/docs/python/api/datatypes.html
data_schema = pa.schema([('unit_id', pa.uint64())])

# ConvertOptions: https://arrow.apache.org/docs/python/generated/pyarrow.csv.ConvertOptions.html#pyarrow.csv.ConvertOptions
csv_convert_options = pv.ConvertOptions(column_types=data_schema)

table: Table = pv.read_csv(input_file=csv,
                           read_options=csv_read_options,
                           convert_options=csv_convert_options)
pq.write_table(table, target_file)

print('Generated file with the following:')
print('Parquet metadata: ' + str(pq.read_metadata(target_file)))
print('Parquet schema: ' + pq.read_schema(target_file).to_string())
Example #23
import pandas as pd
from pyarrow.parquet import read_schema

ITEM_FEATURES = True
SHOP_FEATURES = True
ITEM_CATEGORY_FEATURES = True
CALENDAR_FEATURES = True
LAGGED_FEATURES = True
ROLLING_FEATURES = True
MISSINGNESS_FEATURES = True
EMBEDDING_FEATURES = True

MEDIAN_FEATURES = True
MEAN_FEATURES = True
BY_SHOP_ID = True
BY_ITEM_ID = True
BY_ITEM_CATEGORY_ID = True
BY_SHOP_ID_ITEM_ID = True
BY_SHOP_ID_ITEM_CATEGORY_ID = True

DATA_FILE = "../input/all_features.parquet"
FEATURES = pd.Index(read_schema(DATA_FILE).names).drop(["__index_level_0__", "item_cnt_month"], errors="ignore")
SELECTED_FEATURES = None
EARLY_STOPPING_ROUNDS = 20
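A possible follow-on sketch that uses the selected names to load only those columns (it assumes item_cnt_month is present in the file as the target):

df = pd.read_parquet(DATA_FILE, columns=FEATURES.tolist() + ["item_cnt_month"])
X, y = df[FEATURES], df["item_cnt_month"]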
Example #24
def get_cols(fs, filename):
    with fs.open(filename, 'rb') as f:
        schema = parquet.read_schema(f)
    return set(schema.names)
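Usage sketch, assuming an fsspec-compatible filesystem object; the file names are made up:

import fsspec

fs = fsspec.filesystem("file")
# Columns present in the new file but missing from the old one.
missing = get_cols(fs, "data/new.parquet") - get_cols(fs, "data/old.parquet")
print(sorted(missing))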
Example #25
def print_statistics(file):
    print('Parquet metadata: ' + str(pq.read_metadata(file)))
    print('Parquet schema: ' + pq.read_schema(file).to_string())
    print('Size of output file on disk: ' + str(get_file_size_in_mb(file)) +
          ' MB')