def test_encrypted_parquet_read_schema_no_decryption_config(
        tempdir, data_table):
    """Write an encrypted parquet, verify it's encrypted, and then try to
    read its schema without decryption properties."""
    test_encrypted_parquet_write_read(tempdir, data_table)
    with pytest.raises(IOError, match=r"no decryption"):
        pq.read_schema(tempdir / PARQUET_NAME)
def get_filters(
    parquet: BytesIO, condition_groups: List[ConditionGroup]
) -> Optional[List[PyArrowConditionGroup]]:
    """
    Given a parquet buffer and a list of condition groups, build a pyarrow
    filter list in which the groups are combined with a boolean OR.
    """
    schema = pq.read_schema(parquet)
    filters: List[PyArrowConditionGroup] = []
    for group in condition_groups:
        group_tuple = []
        for entry in group["conditions"]:
            try:
                field = schema.field(entry["column"])
                if field.type == pyarrow.null():
                    # Empty dataset; thus no inherent data type for entry["column"]
                    continue
                group_tuple.append((
                    entry["column"],
                    entry["operator"],
                    entry["values"],
                ))
            except KeyError:
                # Column does not exist within the schema
                continue
        if group_tuple:
            filters.append(group_tuple)
    return filters or None
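# A usage sketch (not from the original source): write a tiny parquet file,
# derive DNF filters from hypothetical condition groups, and read the table
# back with those filters. Conditions inside a group are AND-ed by pyarrow;
# the groups themselves are OR-ed. The column names below are made up.
import os
import tempfile
from io import BytesIO

import pyarrow
import pyarrow.parquet as pq

path = os.path.join(tempfile.mkdtemp(), "example.parquet")
pq.write_table(
    pyarrow.table({"year": [1999, 2005, 2021], "country": ["NO", "SE", "DK"]}),
    path)

condition_groups = [
    {"conditions": [{"column": "year", "operator": ">=", "values": 2020}]},
    {"conditions": [{"column": "year", "operator": "<", "values": 2000}]},
]

with open(path, "rb") as fh:
    filters = get_filters(BytesIO(fh.read()), condition_groups)

if filters is not None:
    table = pq.read_table(path, filters=filters)  # keeps the 1999 and 2021 rows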
def file_to_schema(input_tuple):
    sname, fpath = input_tuple
    try:
        df = pq.read_schema(fpath)
    except Exception:
        print("ERROR", fpath)
        return None
    return ';'.join([str(df.names), str(df.types)])
def run_partition_test(input_file: str, output_dir: str,
                       filters: Optional[list] = None):
    milliseconds_since_epoch = int(time() * 1000)

    print('Parquet metadata: ' + str(pq.read_metadata(input_file)))
    print('Parquet schema: ' + pq.read_schema(input_file).to_string())

    data = pq.read_table(source=input_file, filters=filters)

    # Write a dataset and collect metadata information of all written files
    metadata_collector = []
    root_path = output_dir + 'partitioned_' + str(milliseconds_since_epoch)
    pq.write_to_dataset(data,
                        root_path=root_path,
                        partition_cols=['start_year'],
                        metadata_collector=metadata_collector)

    # Write the ``_common_metadata`` parquet file without row group statistics
    pq.write_metadata(data.schema, root_path + '/_common_metadata')

    # Write the ``_metadata`` parquet file with row group statistics of all files.
    # Gives the following error:
    #   File "pyarrow/_parquet.pyx", line 616, in pyarrow._parquet.FileMetaData.append_row_groups
    #   RuntimeError: AppendRowGroups requires equal schemas.
    # data.schema has one more column than the partitioned files when
    # partitioning by one column.
    # Related? https://github.com/dask/dask/issues/6243
    # (A possible workaround is sketched after this function.)
    # pq.write_metadata(data.schema, root_path + '/_metadata', metadata_collector=metadata_collector)

    # Read from the partitioned dataset using the new generic Dataset API
    start_year = 2018
    value = 50000
    table = pq.read_table(root_path,
                          filters=[('start_year', '>=', start_year),
                                   ('value', '>', value)])
    # filters=[('start_year', '>=', start_year)])
    print(table.to_pandas())
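# Possible workaround for the ``_metadata`` error noted above (an untested
# sketch, not part of the original script): the partitioned data files do not
# contain the partition column, so drop it from the schema before appending
# the collected row-group metadata.
import pyarrow.parquet as pq

def write_dataset_metadata(schema, root_path: str, partition_col: str,
                           metadata_collector: list):
    schema_without_partition = schema.remove(
        schema.get_field_index(partition_col))
    pq.write_metadata(schema_without_partition, root_path + '/_metadata',
                      metadata_collector=metadata_collector)

# e.g. write_dataset_metadata(data.schema, root_path, 'start_year',
#                             metadata_collector)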
def test_read_schema(tempdir):
    N = 100
    df = pd.DataFrame({
        'index': np.arange(N),
        'values': np.random.randn(N)
    }, columns=['index', 'values'])

    data_path = tempdir / 'test.parquet'

    table = pa.Table.from_pandas(df)
    _write_table(table, data_path)

    read1 = pq.read_schema(data_path)
    read2 = pq.read_schema(data_path, memory_map=True)
    assert table.schema.equals(read1)
    assert table.schema.equals(read2)

    assert table.schema.metadata[b'pandas'] == read1.metadata[b'pandas']
def columns_from_parquet_file(input_parquet):
    schema = pa_parquet.read_schema(input_parquet)
    column_specifications = schema.pandas_metadata['columns']
    options = []
    for col_item in column_specifications:
        name = col_item['name']
        if name is not None:
            options.append({'label': name, 'value': name})
    return options
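# A possible variant (a sketch, not from the original source):
# ``schema.pandas_metadata`` is only populated when the file was written from
# a pandas DataFrame, so fall back to the plain schema names otherwise.
import pyarrow.parquet as pa_parquet

def columns_from_any_parquet_file(input_parquet):
    schema = pa_parquet.read_schema(input_parquet)
    if schema.pandas_metadata is not None:
        names = [col['name'] for col in schema.pandas_metadata['columns']
                 if col['name'] is not None]
    else:
        names = list(schema.names)
    return [{'label': name, 'value': name} for name in names]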
def test_write_parquet_no_index():
    df = pd.DataFrame(np.random.randn(6, 4),
                      columns=list('abcd'),
                      index=np.arange(6))
    with tempfile.TemporaryDirectory() as tmpdir:
        fs = LocalFileSystem()
        filename = os.path.join(tmpdir, 'df.parquet')
        write_dataframe_as_parquet(df, fs, filename)

        schema = parquet.read_schema(os.path.join(tmpdir, filename))
        assert '__index_level_0__' not in schema.names
        assert df.columns.values.tolist() == schema.names
def askUserInput(self):
    filepath = self.userdata['filepath']
    s = pa.read_schema(filepath)
    colnames = s.names

    def cb(columns):
        self.userdata['columns'] = columns

    dlgchoice = DlgListChoice(colnames, 'Open Parquet',
                              'Choose curves to load')
    dlgchoice.dlgdata.connect(cb)
    dlgchoice.exec_()
def _bytes2schema(data):
    reader = pa.BufferReader(data)
    schema = pq.read_schema(reader)
    fields = []
    for idx in range(len(schema)):
        f = schema[idx]
        # Schema data recovered from parquet always contains timestamp data in
        # us-granularity, but pandas will use ns-granularity, so we re-align
        # the two different worlds here.
        if f.type == pa.timestamp("us"):
            f = pa.field(f.name, pa.timestamp("ns"))
        fields.append(f)
    return pa.schema(fields, schema.metadata)
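# Hypothetical counterpart to the helper above (a sketch, not the original
# code): serialize a schema by writing an empty parquet file into an in-memory
# buffer, so that ``_bytes2schema`` can recover it via ``pq.read_schema``.
import pyarrow as pa
import pyarrow.parquet as pq

def _schema2bytes(schema: pa.Schema) -> bytes:
    buf = pa.BufferOutputStream()
    # An empty table is enough to persist the schema (including its metadata).
    pq.write_table(schema.empty_table(), buf)
    return buf.getvalue().to_pybytes()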
def run_test(input_file: str, output_dir: str, filters: list, use_pandas: bool):
    print('Using pyarrow')
    print('Parquet metadata: ' + str(pq.read_metadata(input_file)))
    print('Parquet schema: ' + pq.read_schema(input_file).to_string())

    pq_file = pq.ParquetFile(input_file)
    row_group_0_metadata = pq_file.metadata.row_group(0)
    print('Parquet min for column 0, row group 0: '
          + str(row_group_0_metadata.column(0).statistics.min))
    print('Parquet max for column 0, row group 0: '
          + str(row_group_0_metadata.column(0).statistics.max))

    if use_pandas:
        unfiltered_pandas_data = pq.read_table(source=input_file).to_pandas()
        size = sys.getsizeof(unfiltered_pandas_data)
        print('Size of UN-filtered pandas DataFrame in memory: ' + str(size)
              + ' bytes (' + str(size / 1000000) + ' MB)')

    with timeblock('pyarrow read and filter'):
        data = pq.read_table(source=input_file, filters=filters)

    size = sys.getsizeof(data)
    print('Size of filtered pyarrow table in memory: ' + str(size)
          + ' bytes (' + str(size / 1000000) + ' MB)')

    if use_pandas:
        filtered_pandas_data = data.to_pandas()
        size = sys.getsizeof(filtered_pandas_data)
        print('Size of filtered pandas DataFrame in memory: ' + str(size)
              + ' bytes (' + str(size / 1000000) + ' MB)')
        # print(filtered_pandas_data.head(10))

    milliseconds_since_epoch = int(time() * 1000)
    output_file = output_dir + str(milliseconds_since_epoch) + '.parquet'
    print('Output file name: ' + output_file)

    with timeblock('pyarrow write_table()'):
        pq.write_table(data, output_file)

    print('Parquet metadata of output: ' + str(pq.read_metadata(output_file)))
    print('Parquet schema of output: ' + pq.read_schema(output_file).to_string())
    print('Size of output file on disk: ' + str(os.path.getsize(output_file))
          + ' bytes (' + str(os.path.getsize(output_file) / 1000000) + ' MB)')
def read_encrypted_parquet(path, decryption_config, kms_connection_config,
                           crypto_factory):
    file_decryption_properties = crypto_factory.file_decryption_properties(
        kms_connection_config, decryption_config)
    assert file_decryption_properties is not None

    meta = pq.read_metadata(
        path, decryption_properties=file_decryption_properties)
    assert meta.num_columns == 3

    schema = pq.read_schema(
        path, decryption_properties=file_decryption_properties)
    assert len(schema.names) == 3

    result = pq.ParquetFile(
        path, decryption_properties=file_decryption_properties)
    return result.read(use_threads=True)
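# A minimal construction sketch for the arguments above (an assumption, not
# the original test fixtures), using ``pyarrow.parquet.encryption``. The toy
# ``InMemoryKmsClient`` just base64-"wraps" data keys, so decryption only
# works if the same toy KMS was used when the file was encrypted.
import base64
from datetime import timedelta

import pyarrow.parquet.encryption as pe

class InMemoryKmsClient(pe.KmsClient):
    def __init__(self, kms_connection_config):
        pe.KmsClient.__init__(self)
        self.master_keys = kms_connection_config.custom_kms_conf

    def wrap_key(self, key_bytes, master_key_identifier):
        return base64.b64encode(key_bytes)

    def unwrap_key(self, wrapped_key, master_key_identifier):
        return base64.b64decode(wrapped_key)

kms_connection_config = pe.KmsConnectionConfig(
    custom_kms_conf={"footer_key": "0123456789012345",
                     "col_key": "1234567890123450"})
crypto_factory = pe.CryptoFactory(lambda config: InMemoryKmsClient(config))
decryption_config = pe.DecryptionConfiguration(
    cache_lifetime=timedelta(minutes=5.0))

# table = read_encrypted_parquet('example.parquet.encrypted',
#                                decryption_config, kms_connection_config,
#                                crypto_factory)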
def test_read_schema(tmpdir):
    import pyarrow.parquet as pq

    N = 100
    df = pd.DataFrame({
        'index': np.arange(N),
        'values': np.random.randn(N)
    }, columns=['index', 'values'])

    data_path = pjoin(str(tmpdir), 'test.parquet')

    table = pa.Table.from_pandas(df)
    _write_table(table, data_path)

    assert table.schema.equals(pq.read_schema(data_path))
def file_to_schema_bad(input_tuple):
    sname, fpath = input_tuple
    matched = False
    # corrupted = False
    try:
        df = pq.read_schema(fpath)
        string_match = ';'.join([str(df.names), str(df.types)])
        if sname in proper_schema:
            if proper_schema[sname] == string_match:
                matched = True
            return (matched, sname, fpath, proper_schema[sname],
                    string_match, "CORRECT-FILE")
    except Exception:
        return (matched, sname, fpath, proper_schema.get(sname, "NULL"),
                "NULL", "NULL", "CORRUPTED-FILE")
def transform_ts(start, end, file):
    # List with all column names to test
    train_columns = pq.read_schema(file).names
    # print(train_columns)
    X = pd.DataFrame(data=None)
    for i in train_columns[start:end]:
        # Turn the parquet column into a dataframe of one single signal
        df_signal = pq.read_pandas(file, columns=[i]).to_pandas()
        # print("Shape of signal data {}".format(df_signal.shape))
        sig = np.ravel(df_signal.iloc[:, 0].to_numpy())  # turn signal to numpy
        t = df_signal.index.to_numpy()  # turn time index to numpy

        x_dn = de_noising(high_pass_filter(sig))
        x_deleted = delete_repeat(x_dn)
        x_deleted_cond = (x_deleted < 99998)
        x_deleted = x_deleted[x_deleted_cond]
        print(x_deleted.shape)
        t_deleted = t[x_deleted_cond]

        # Generate new time-series features from the signal
        master_train = pd.DataFrame({
            0: x_deleted,
            1: np.repeat(i, x_deleted.shape[0]),
            2: t_deleted
        })
        # print("Shape of master train data {}".format(master_train.shape))
        # master_train.to_csv('output/master_train.csv')

        extraction_settings = EfficientFCParameters()
        X_signal = extract_features(
            master_train,
            column_id=1,
            column_sort=2,
            impute_function=impute,
            default_fc_parameters=extraction_settings)
        print("Number of extracted features in {}: {}.".format(
            i, X_signal.shape[1]))
        X = X.append(X_signal)
    return X
def run_id_filter_test(input_file: str, input_id_file: str):
    # Converting the ids to pandas will be a "zero copy conversion" as the
    # unit_id column is int64, provided that:
    #  - the ids are not null
    #  - the column is a single ChunkedArray
    # TODO check if that is the case
    # https://arrow.apache.org/docs/python/pandas.html#zero-copy-series-conversions
    filter_ids = pq.read_table(source=input_id_file)
    filter_ids_as_pandas: DataFrame = filter_ids.to_pandas()
    # filter_ids_as_list = filter_ids_as_pandas['unit_id'].tolist()
    filter_ids_as_set = set(filter_ids_as_pandas['unit_id'])

    print('Parquet metadata: ' + str(pq.read_metadata(input_id_file)))
    print('Parquet schema: ' + pq.read_schema(input_id_file).to_string())
    print('Using filter ids: ' + str(filter_ids.to_pandas()))

    table = pq.read_table(source=input_file, filters=[
        # ('unit_id', 'in', filter_ids_as_list)
        ('unit_id', 'in', filter_ids_as_set)
    ])
    print(table.to_pandas())
writer = pq.ParquetWriter(outputFileName, schema)  # ,coerce_timestamps='ms'

if args.debug is True:
    totalWriteTime = totalWriteTime + (time.time() - startWrite)

if writer:
    writer.close()

# Print final success message.
timeNow = time.time()
elapsed = timeNow - timeStart
rps = int(rowcount / elapsed)
print(f"{rowcount} rows exported to {outputFileName} at {rps} rows per second.")

if args.debug is True:
    print()
    print('Performance breakdown:')
    print(f'Read={totalReadTime:.2f} Transform={totalTransformTime:.2f} Write={totalWriteTime:.2f}')
    print()
    print('Test output display:')
    print()
    print('Schema:')
    print(pq.read_schema(outputFileName))
    print()
    testTable = pq.read_table(outputFileName)
    print('Data (10 rows):')
    print(testTable.to_pandas().head(10))

exit(0)
def _InferArrowSchema(self):
    match_result = FileSystems.match([self._file_pattern])[0]
    files_metadata = match_result.metadata_list[0]
    with FileSystems.open(files_metadata.path) as f:
        return pq.read_schema(f)
def peek_metadata(path_to_table):
    """Read custom schema metadata without loading the data file."""
    schema = pq.read_schema(path_to_table)
    return json.loads(schema.metadata[b'my_metadata'].decode('utf-8'))
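# A possible writer-side counterpart (a sketch, not from the original source):
# attach custom metadata under the ``my_metadata`` key so that
# ``peek_metadata`` can recover it from the schema alone.
import json

import pyarrow as pa
import pyarrow.parquet as pq

def write_with_metadata(table: pa.Table, path_to_table: str, my_metadata: dict):
    # Merge the custom key into any existing schema-level metadata.
    existing = table.schema.metadata or {}
    merged = {**existing,
              b'my_metadata': json.dumps(my_metadata).encode('utf-8')}
    pq.write_table(table.replace_schema_metadata(merged), path_to_table)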
try:
    for stream in hdfs.ls(study):
        try:
            for version in hdfs.ls(stream):
                try:
                    for user in hdfs.ls(version):
                        try:
                            files = hdfs.ls(user)
                            # if len(files) > 2: put this check back again if
                            # schema mismatch is required
                            old_schema = []
                            for fle in hdfs.ls(user):
                                try:
                                    if "_SUCCESS" not in fle:
                                        with hdfs.open(hdfs_url + fle) as f:
                                            current_schema = pq.read_schema(f).names
                                        mismatched_metadata.append({
                                            "total_files": len(files),
                                            "file_name": fle,
                                            "schema": current_schema,
                                            "user_folder": user
                                        })
                                except Exception as e:
                                    print(str(e))
                        except Exception as e:
                            print(str(e))
                except Exception as e:
                    print(str(e))
        except Exception as e:
            print(str(e))
except Exception as e:
    print(str(e))
def forced_extraction(
    sources_df: pd.DataFrame, cfg_err_ra: float, cfg_err_dec: float,
    p_run: Run, extr_df: pd.DataFrame, min_sigma: float, edge_buffer: float,
    cluster_threshold: float, allow_nan: bool, add_mode: bool,
    done_images_df: pd.DataFrame, done_source_ids: List[int]
) -> Tuple[pd.DataFrame, int]:
    """
    Check and extract expected measurements, and associate them with the
    related source(s).

    Args:
        sources_df: Dataframe containing all the extracted measurements and
            associations (product from the association step).
        cfg_err_ra: The minimum RA error from the config file (in degrees).
        cfg_err_dec: The minimum declination error from the config file
            (in degrees).
        p_run: The pipeline run object.
        extr_df: The dataframe containing the information on what sources are
            missing from which images (output from get_src_skyregion_merged_df
            in main.py).
        min_sigma: Minimum sigma value below which forced extracted
            measurements are dropped.
        edge_buffer: Flag to pass to ForcedPhot.measure method.
        cluster_threshold: Flag to pass to ForcedPhot.measure method.
        allow_nan: Flag to pass to ForcedPhot.measure method.
        add_mode: True when the pipeline is running in add image mode.
        done_images_df: Dataframe containing the images that have already been
            processed in a previous run (used in add image mode).
        done_source_ids: List of the source ids that were already present in
            the previous run (used in add image mode).

    Returns:
        The sources_df with the extracted sources added, and n_forced, the
        total number of forced measurements present in the run.
    """
    logger.info('Starting force extraction step.')
    timer = StopWatch()

    # get all the skyregions and related images
    cols = [
        'id', 'name', 'measurements_path', 'path', 'noise_path',
        'beam_bmaj', 'beam_bmin', 'beam_bpa', 'background_path',
        'rms_min', 'datetime', 'skyreg__centre_ra',
        'skyreg__centre_dec', 'skyreg__xtr_radius'
    ]

    images_df = pd.DataFrame(
        list(
            Image.objects.filter(run=p_run).select_related('skyreg').order_by(
                'datetime').values(*tuple(cols))
        )
    ).set_index('name')
    # | name                          |   id | measurements_path | path         | noise_path   |
    # |:------------------------------|-----:|:------------------|:-------------|:-------------|
    # | VAST_2118-06A.EPOCH01.I.fits  |    1 | path/to/file      | path/to/file | path/to/file |
    # | VAST_2118-06A.EPOCH03x.I.fits |    3 | path/to/file      | path/to/file | path/to/file |
    # | VAST_2118-06A.EPOCH02.I.fits  |    2 | path/to/file      | path/to/file | path/to/file |
    #
    # | name                          |   beam_bmaj |   beam_bmin |   beam_bpa | background_path |
    # |:------------------------------|------------:|------------:|-----------:|:----------------|
    # | VAST_2118-06A.EPOCH01.I.fits  |  0.00589921 |  0.00326088 |   -70.4032 | path/to/file    |
    # | VAST_2118-06A.EPOCH03x.I.fits |  0.00470991 |  0.00300502 |   -83.1128 | path/to/file    |
    # | VAST_2118-06A.EPOCH02.I.fits  |  0.00351331 |  0.00308565 |    77.2395 | path/to/file    |
    #
    # | name                          |   rms_min | datetime                         |   skyreg__centre_ra |   skyreg__centre_dec |   skyreg__xtr_radius |
    # |:------------------------------|----------:|:---------------------------------|--------------------:|---------------------:|---------------------:|
    # | VAST_2118-06A.EPOCH01.I.fits  |  0.173946 | 2019-08-27 18:12:16.700000+00:00 |             319.652 |              -6.2989 |               6.7401 |
    # | VAST_2118-06A.EPOCH03x.I.fits |  0.165395 | 2019-10-29 10:01:20.500000+00:00 |             319.652 |              -6.2989 |               6.7401 |
    # | VAST_2118-06A.EPOCH02.I.fits  |  0.16323  | 2019-10-30 08:31:20.200000+00:00 |             319.652 |              -6.2989 |               6.7401 |

    # Explode out the img_diff column.
    extr_df = extr_df.explode('img_diff').reset_index()
    total_to_extract = extr_df.shape[0]

    if add_mode:
        # If we are adding images to the run we assume that monitoring was
        # also performed before (enforced by the pre-run checks) so now we
        # only want to force extract in three situations:
        # 1. Any force extraction in a new image.
        # 2. The forced extraction is attached to a new source from the new
        #    images.
        # 3. A new relation has been created and they need the forced
        #    measurements filled in (actually covered by 2.)
        extr_df = (
            extr_df[~extr_df['img_diff'].isin(done_images_df['name'])]
            .append(
                extr_df[
                    (~extr_df['source'].isin(done_source_ids))
                    & (extr_df['img_diff'].isin(done_images_df.name))
                ]
            )
            .sort_index()
        )
        logger.info(f"{extr_df.shape[0]} new measurements to force extract"
                    f" (from {total_to_extract} total)")

    timer.reset()
    extr_df = parallel_extraction(
        extr_df, images_df, sources_df[['source', 'image', 'flux_peak']],
        min_sigma, edge_buffer, cluster_threshold, allow_nan, add_mode,
        p_run.path
    )

    logger.info('Force extraction step time: %.2f seconds', timer.reset())

    # make measurement names unique for db constraint
    extr_df['name'] = extr_df['name'] + f'_f_run{p_run.id:06d}'

    # select sensible flux values and set the columns with fixed values
    values = {'flux_int': 0, 'flux_int_err': 0}
    extr_df = extr_df.fillna(value=values)

    extr_df = extr_df[
        (extr_df['flux_int'] != 0)
        & (extr_df['flux_int_err'] != 0)
        & (extr_df['chi_squared_fit'] != np.inf)
        & (extr_df['chi_squared_fit'] != np.nan)
    ]

    default_pos_err = settings.POS_DEFAULT_MIN_ERROR / 3600.
    extr_df['ra_err'] = default_pos_err
    extr_df['dec_err'] = default_pos_err
    extr_df['err_bmaj'] = 0.
    extr_df['err_bmin'] = 0.
    extr_df['err_pa'] = 0.
    extr_df['ew_sys_err'] = cfg_err_ra
    extr_df['ns_sys_err'] = cfg_err_dec
    extr_df['error_radius'] = 0.

    extr_df['uncertainty_ew'] = np.hypot(cfg_err_ra, default_pos_err)
    extr_df['weight_ew'] = 1. / extr_df['uncertainty_ew'].values**2
    extr_df['uncertainty_ns'] = np.hypot(cfg_err_dec, default_pos_err)
    extr_df['weight_ns'] = 1. / extr_df['uncertainty_ns'].values**2

    extr_df['flux_peak'] = extr_df['flux_int']
    extr_df['flux_peak_err'] = extr_df['flux_int_err']
    extr_df['local_rms'] = extr_df['flux_int_err']
    extr_df['snr'] = (
        extr_df['flux_peak'].values / extr_df['local_rms'].values
    )
    extr_df['spectral_index'] = 0.
    extr_df['dr'] = 0.
    extr_df['d2d'] = 0.
    extr_df['forced'] = True
    extr_df['compactness'] = 1.
    extr_df['psf_bmaj'] = extr_df['bmaj']
    extr_df['psf_bmin'] = extr_df['bmin']
    extr_df['psf_pa'] = extr_df['pa']
    extr_df['flag_c4'] = False
    extr_df['spectral_index_from_TT'] = False
    extr_df['has_siblings'] = False
    extr_df['flux_int_isl_ratio'] = 1.0
    extr_df['flux_peak_isl_ratio'] = 1.0

    col_order = read_schema(
        images_df.iloc[0]['measurements_path']
    ).names
    col_order.remove('id')
    remaining = list(set(extr_df.columns) - set(col_order))

    extr_df = extr_df[col_order + remaining]

    # upload the measurements, a column 'id' is returned with the DB id
    extr_df = make_upload_measurements(extr_df)

    extr_df = extr_df.rename(columns={'source_tmp_id': 'source'})

    # write forced measurements to their specific parquet
    logger.info('Saving forced measurements to specific parquet file...')
    parallel_write_parquet(extr_df, p_run.path, add_mode)

    # Required to rename this column for the image add mode.
    extr_df = extr_df.rename(columns={'time': 'datetime'})

    # append the new measurements into the main df and proceed with the
    # source groupby etc.
    sources_df = sources_df.append(
        extr_df.loc[:, extr_df.columns.isin(sources_df.columns)],
        ignore_index=True
    )

    # get the number of forced extractions for the run
    forced_parquets = glob(
        os.path.join(p_run.path, "forced_measurements*.parquet"))

    if forced_parquets:
        n_forced = (
            dd.read_parquet(forced_parquets, columns=['id'])
            .count()
            .compute()
            .values[0]
        )
    else:
        n_forced = 0

    logger.info(
        'Total forced extraction time: %.2f seconds', timer.reset_init())
    return sources_df, n_forced
def load_metadata(self):
    schema = pq.read_schema(self.filename)
    metadata_json = schema.metadata[self.custom_meta_key.encode()]
    metadata = json.loads(metadata_json)
    return metadata
import pyarrow as pa
import pyarrow.csv as pv
import pyarrow.parquet as pq
from pyarrow.lib import Table

csv = 'accumulated_data_300_million_rows_id_filter.csv'
target_file = '../data/accumulated_data_300_million_rows_id_filter_1mill.parquet'

csv_read_options = pv.ReadOptions(skip_rows=0,
                                  encoding="utf8",
                                  column_names=["unit_id"])

# Types: https://arrow.apache.org/docs/python/api/datatypes.html
data_schema = pa.schema([('unit_id', pa.uint64())])

# ConvertOptions: https://arrow.apache.org/docs/python/generated/pyarrow.csv.ConvertOptions.html#pyarrow.csv.ConvertOptions
csv_convert_options = pv.ConvertOptions(column_types=data_schema)

table: Table = pv.read_csv(input_file=csv,
                           read_options=csv_read_options,
                           convert_options=csv_convert_options)

pq.write_table(table, target_file)

print('Generated file with the following:')
print('Parquet metadata: ' + str(pq.read_metadata(target_file)))
print('Parquet schema: ' + pq.read_schema(target_file).to_string())
import pandas as pd
from pyarrow.parquet import read_schema

ITEM_FEATURES = True
SHOP_FEATURES = True
ITEM_CATEGORY_FEATURES = True
CALENDAR_FEATURES = True
LAGGED_FEATURES = True
ROLLING_FEATURES = True
MISSINGNESS_FEATURES = True
EMBEDDING_FEATURES = True
MEDIAN_FEATURES = True
MEAN_FEATURES = True
BY_SHOP_ID = True
BY_ITEM_ID = True
BY_ITEM_CATEGORY_ID = True
BY_SHOP_ID_ITEM_ID = True
BY_SHOP_ID_ITEM_CATEGORY_ID = True

DATA_FILE = "../input/all_features.parquet"
FEATURES = pd.Index(read_schema(DATA_FILE).names).drop(
    ["__index_level_0__", "item_cnt_month"], errors="ignore")
SELECTED_FEATURES = None
EARLY_STOPPING_ROUNDS = 20
def get_cols(fs, filename):
    with fs.open(filename, 'rb') as f:
        schema = parquet.read_schema(f)
    return set(schema.names)
def print_statistics(file):
    print('Parquet metadata: ' + str(pq.read_metadata(file)))
    print('Parquet schema: ' + pq.read_schema(file).to_string())
    print('Size of output file on disk: ' + str(get_file_size_in_mb(file))
          + ' MB')
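# ``get_file_size_in_mb`` is not defined in the snippet above; a plausible
# helper (an assumption, not the original code) would simply convert
# ``os.path.getsize`` to megabytes to match the print statement.
import os

def get_file_size_in_mb(file: str) -> float:
    return os.path.getsize(file) / 1000000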