def test_open_union_dataset_with_additional_kwargs(multisourcefs):
    child = ds.dataset('/plain', filesystem=multisourcefs, format='parquet')
    with pytest.raises(ValueError, match="cannot pass any additional"):
        ds.dataset([child], format="parquet")
def load_flowsheet_dataset(path):
    dat = ds.dataset(path, format='parquet').to_table()
    return dat
def read_year(self, year, hh_states_keep=None, hh_states_drop=None,
              hh_dma_keep=None, hh_dma_drop=None):
    (purch_fn, trip_fn, panelist_fn) = get_fns(self.annual_dict[year])

    hh_ds = ds.dataset(
        csv.read_csv(panelist_fn,
                     parse_options=csv.ParseOptions(delimiter='\t'),
                     convert_options=csv.ConvertOptions(
                         auto_dict_encode=True,
                         auto_dict_max_cardinality=1024)))

    # build an arrow dataset filter object one by one
    my_filter = ds.field('Projection_Factor') > 0
    if hh_states_keep:
        my_filter = my_filter & (
            ds.field('Fips_State_Desc').isin(hh_states_keep))
    if hh_states_drop:
        my_filter = my_filter & (
            ~ds.field('Fips_State_Desc').isin(hh_states_drop))
    if hh_dma_keep:
        my_filter = my_filter & (ds.field('DMA_Cd').isin(hh_dma_keep))
    if hh_dma_drop:
        my_filter = my_filter & (~ds.field('DMA_Cd').isin(hh_dma_drop))

    # convert to pandas and get unique HH list
    hh_df = hh_ds.to_table(filter=my_filter).to_pandas().rename(
        columns=hh_dict_rename)
    hh_list = hh_df.household_code.unique()

    # use pyarrow filter to filter trips for just our households
    trip_df = ds.dataset(
        csv.read_csv(trip_fn,
                     parse_options=csv.ParseOptions(delimiter='\t')))\
        .to_table(filter=ds.field('household_code').isin(hh_list))\
        .to_pandas()
    trip_list = trip_df.trip_code_uc.unique()
    upc_list = self.prod_df.upc.unique()

    # use pyarrow to filter purchases using trips and UPCs only
    purch_ds = ds.dataset(
        csv.read_csv(purch_fn,
                     parse_options=csv.ParseOptions(delimiter='\t'),
                     convert_options=csv.ConvertOptions(
                         auto_dict_encode=True,
                         auto_dict_max_cardinality=1024)))
    purch_filter = ds.field('trip_code_uc').isin(trip_list) & \
        ds.field('upc').isin(upc_list)
    purch_df = purch_ds.to_table(filter=purch_filter).to_pandas()

    # Add the fields to the trips and purchases for convenience later
    trip_df2 = pd.merge(trip_df, hh_df[self.hh_cols],
                        on=['household_code', 'panel_year'])
    purch_df2 = pd.merge(
        pd.merge(purch_df, self.prod_df[self.prod_cols],
                 on=['upc', 'upc_ver_uc']),
        trip_df2[self.hh_cols + ['trip_code_uc', 'purchase_date',
                                 'store_code_uc']],
        on=['trip_code_uc'])\
        .rename(columns={'fips_state_desc': 'hh_state_desc'})

    self.purch_df = self.purch_df.append(purch_df2, ignore_index=True)
    self.trip_df = self.trip_df.append(trip_df2, ignore_index=True)
    self.hh_df = self.hh_df.append(hh_df, ignore_index=True)
    return
def test_open_dataset_unsupported_format(tempdir):
    _, path = _create_single_file(tempdir)
    with pytest.raises(ValueError, match="format 'blabla' is not supported"):
        ds.dataset([path], format="blabla")
def test_open_dataset_validate_sources(tempdir):
    _, path = _create_single_file(tempdir)
    dataset = ds.dataset(path)
    with pytest.raises(ValueError,
                       match="Expected a path-like or Source, got"):
        ds.dataset([dataset])
def coalesce_resource_table(infolder: str, outfolder: str, archive_folder: str,
                            table: str, state: SqCoalesceState) -> None:
    """This routine coalesces all the parquet data in the folder provided

    This function MUST be called with sqPoller as the table the first time to
    build the polling period sample. Without this, it's not possible to
    compute the records to be written for a period accurately. The polling
    periods are computed when this function is called the first time with
    None as the state field. This function stuffs the sqPoller timeblocks as
    the polling period in the state block and returns it. The state object
    returned also has some statistics written such as number of files
    written, number of records written and so on.

    :param infolder: str, folder to read data in from
    :param outfolder: str, folder to write data to
    :param archive_folder: str, folder to store the archived files in
    :param table: str, name of table we're coalescing
    :param state: SqCoalesceState, state about this coalescing run
    :returns: Nothing
    """

    def compute_block_start(start):
        if state.period.total_seconds() < 24*3600:
            block_start = datetime(year=start.year, month=start.month,
                                   day=start.day, hour=start.hour,
                                   tzinfo=timezone.utc)
        elif 24*3600 <= state.period.total_seconds() < 24*3600*30:
            block_start = datetime(year=start.year, month=start.month,
                                   day=start.day, tzinfo=timezone.utc)
        elif 24*3600*30 <= state.period.total_seconds() < 24*3600*365:
            block_start = datetime(year=start.year, month=start.month, day=1,
                                   tzinfo=timezone.utc)
        else:
            block_start = datetime(year=start.year, month=1, day=1,
                                   tzinfo=timezone.utc)
        return block_start

    partition_cols = ['sqvers', 'namespace']
    dodel = True

    if table == "sqPoller":
        wr_polling_period = True
        state.poller_periods = set()
    else:
        wr_polling_period = False
    state.wrfile_count = 0
    state.wrrec_count = 0
    state.table_name = table
    schema = state.schema

    if state.schema.type == "record":
        state.keys = schema.key_fields()
        if state.current_df.empty:
            state.current_df = get_last_update_df(outfolder, state)

    # Ignore reading the compressed files
    dataset = ds.dataset(infolder, partitioning='hive', format='parquet',
                         ignore_prefixes=state.ign_pfx)

    state.logger.info(f'Examining {len(dataset.files)} {table} files '
                      f'for coalescing')

    fdf = get_file_timestamps(dataset.files)
    if fdf.empty:
        if (table == 'sqPoller') or (not state.poller_periods):
            return

    assert(len(dataset.files) == fdf.shape[0])
    polled_periods = sorted(state.poller_periods)
    if fdf.empty:
        state.logger.info(f'No updates for {table} to coalesce')
        start = polled_periods[0]
    else:
        start = fdf.timestamp.iloc[0]
    utcnow = datetime.now(timezone.utc)

    # We now need to determine if we're coalescing a lot of data, at the
    # start, or if we're only coalescing for the last interval.
    if (utcnow < start):
        logging.error(
            'ERROR: Something is off, now is earlier than dates on files')
        return

    # We write data in fixed size 1 hour time blocks. Data from 10-11 is
    # written out as one block, data from 11-12 as another and so on.
    # Specifically, we write out 11:00:00 to 11:59:59 in the block
    block_start = compute_block_start(start)
    block_end = block_start + state.period

    # NOTE: You need the parentheses around the date comparison for some
    # reason
    if (block_end > utcnow):
        return

    readblock = []
    wrfile_count = 0

    # We may start coalescing when nothing has changed for some initial
    # period. We have to write out records for that period.
    if schema.type == "record":
        for interval in polled_periods:
            if not fdf.empty and (block_end < interval):
                break
            pre_block_start = compute_block_start(interval)
            pre_block_end = pre_block_start + state.period
            write_files(readblock, infolder, outfolder, partition_cols,
                        state, pre_block_start, pre_block_end)

    for row in fdf.itertuples():
        if block_start <= row.timestamp < block_end:
            readblock.append(row.file)
            continue

        # Write data if either there's data to be written (readblock isn't
        # empty) OR this table is a record type and the poller was alive
        # during this period (state's poller period for this window isn't
        # blank)
        if readblock or ((schema.type == "record") and
                         block_start in state.poller_periods):
            write_files(readblock, infolder, outfolder, partition_cols,
                        state, block_start, block_end)
            wrfile_count += len(readblock)
            if wr_polling_period and readblock:
                state.poller_periods.add(block_start)

        # Archive the saved files
        if readblock:
            archive_coalesced_files(readblock, archive_folder, state, dodel)

        # We have to find the timeslot where this record fits
        block_start = block_end
        block_end = block_start + state.period
        readblock = []
        if schema.type != "record":
            # We can jump directly to the timestamp corresponding to this
            # row's timestamp
            block_start = compute_block_start(row.timestamp)
            block_end = block_start + state.period

            if (row.timestamp > block_end) or (block_end > utcnow):
                break

            readblock = [row.file]
            continue

        while row.timestamp > block_end:
            if block_start in state.poller_periods:
                write_files(readblock, infolder, outfolder, partition_cols,
                            state, block_start, block_end)
                # Nothing to archive here, and we're not counting coalesced
                # records since these are duplicates
            block_start = block_end
            block_end = block_start + state.period
            if block_end > utcnow:
                break
        readblock = [row.file]

    # The last batch that ended before the block end
    if readblock or (fdf.empty and (schema.type == "record") and
                     block_start in state.poller_periods):
        write_files(readblock, infolder, outfolder, partition_cols, state,
                    block_start, block_end)
        wrfile_count += len(readblock)
        if wr_polling_period:
            state.poller_periods.add(block_start)
        archive_coalesced_files(readblock, archive_folder, state, dodel)

    state.wrfile_count = wrfile_count
    return
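# The coalescer above buckets file timestamps into fixed-size windows (hour,
# day, month or year, depending on the configured period) and writes each
# bucket out as one block. The following is a minimal, standalone sketch of
# that bucketing idea, independent of the SqCoalesceState machinery above;
# the one-hour period value is illustrative.
from datetime import datetime, timedelta, timezone

period = timedelta(hours=1)   # illustrative coalescing period


def block_start_for(ts: datetime, period: timedelta) -> datetime:
    """Truncate a timestamp to the start of its coalescing block."""
    secs = period.total_seconds()
    if secs < 24 * 3600:                 # sub-day period: hour blocks
        return datetime(ts.year, ts.month, ts.day, ts.hour,
                        tzinfo=timezone.utc)
    elif secs < 24 * 3600 * 30:          # up to a month: day blocks
        return datetime(ts.year, ts.month, ts.day, tzinfo=timezone.utc)
    elif secs < 24 * 3600 * 365:         # up to a year: month blocks
        return datetime(ts.year, ts.month, 1, tzinfo=timezone.utc)
    return datetime(ts.year, 1, 1, tzinfo=timezone.utc)


ts = datetime(2021, 3, 14, 15, 9, 26, tzinfo=timezone.utc)
start = block_start_for(ts, period)
end = start + period
# 15:00:00 <= ts < 16:00:00, i.e. the "11:00:00 to 11:59:59"-style block
# described in the comments above
assert start <= ts < end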
def test_open_dataset_validate_sources(tempdir):
    _, path = _create_single_file(tempdir)
    dataset = ds.dataset(path)
    with pytest.raises(TypeError,
                       match="Dataset objects are currently not supported"):
        ds.dataset([dataset])
feather.write_feather(df, f'{name}.feather')
sparse.save_npz(name + '.npz', fingerprint_matrix)
print(f'Job number {job_count} complete.')
print(f'Job contained {len(smiles_list)} smiles strings')
print(f'Job generated sparse matrix with {len(row_idx)} row_idx')
print(f'Job generated sparse matrix with {len(col_idx)} col_idx')

columns = ['standard_smiles',
           'canonical_id',
           'docking_score']

# source = '/data/dockop_data/AmpC_screen_table_clean.feather'
# reader = pa.ipc.open_file(filename)
# enumerate_list = [(index, element.to_table())
#                   for index, element in enumerate(fragments)]

## Use the following for reading a larger partition of parquet files
dataset_path = pathlib.Path('/path/to/outfile.molchunk')
dataset = ds.dataset(dataset_path, format="feather")
fragments = [file for file in dataset.get_fragments()]

GB = 1024 ** 3
ray.init(num_cpus=20, _memory=32*GB, object_store_memory=32*GB)

futures = [fp_to_batch.remote(index, element.to_table())
           for index, element in enumerate(fragments)]
results = [ray.get(f) for f in futures]
row["enumerated_smiles"] = smiles_list # row["onbits_fp"] =list(fp.GetOnBits()) return row except ValueError: row["standard_smiles"] = 'dropped' row["selfies"] = 'dropped' row["inchi"] = 'dropped' row["inchikey"] = 'dropped' row["enumerated_smiles"] = list('dropped') return row # Load the dataset from parquet one by one dataset = ds.dataset(dataset_dir, format="parquet") # Create a list of fragments that are not memory loaded fragments = [file for file in dataset.get_fragments()] for count, element in enumerate(fragments): #cast the fragment as a pandas df df_docked = element.to_table().to_pandas() #reset the index df_docked = df_docked.reset_index(drop=True) #now write the nearest neighbor name and smiles to the df smiles_column = 'Smile' df_add_nn = dm.parallelized(_preprocess, list(df_docked.iterrows()), arg_type='args',
smi_path = '/cbica/home/grahamth/molchunktools/molchunk_tools/test/d3_chembl.smi'
smiles_column = 'f0'
canonical_id_column = 'f1'
activity_column = 'f2'

d3_df = ingest_chembl_smi(smi_path, smiles_column, canonical_id_column,
                          activity_column)
fingerprint_matrix_chembld3 = fingerprint_matrix_from_df(d3_df)

# define smiles and names for compounds in the matrix to be compared with
# this will be the key system for returning the nearest neighbor
smiles = list(d3_df['standard_smiles'])
name = list(d3_df['canonical_id'])

dataset_dir = '/cbica/home/grahamth/er_molchunk_dir'
dataset = ds.dataset(dataset_dir, format="feather")
output_dir = '/cbica/home/grahamth/d3fpsim'

# Create a list of fragments that are not memory loaded
fragments = [file for file in dataset.get_fragments()]

for count, element in enumerate(fragments):
    # cast the fragment as a pandas df
    df = element.to_table().to_pandas()
    # reset the index
    df = df.reset_index(drop=True)

    columns_to_keep = ['enumerated_smiles', 'CatalogID', 'ID_Index']
    df2 = df[columns_to_keep]
    df3 = df2.explode('enumerated_smiles')
from os import environ

import pandas as pd
import pyarrow as pa
from pyarrow import fs
import pyarrow.dataset as ds

s3 = fs.S3FileSystem(
    access_key=environ['B2_ACCESS_KEY_ID'],
    secret_key=environ['B2_SECRET_ACCESS_KEY'],
    endpoint_override=environ['B2_ENDPOINT_URL']
)

dataset = ds.dataset(
    source='polygon-equities/data/trades',
    format='feather',
    filesystem=s3,
    partitioning='hive',
    exclude_invalid_files=True
)

df = dataset.to_table(
    # columns=['symbol', 'sip_epoch', 'price', 'size'],
    filter=ds.field('date') == '2020-07-01'
).to_pandas()

# local
dataset = ds.dataset(
    source='/Users/bobcolner/QuantClarity/data/trades/feather/',
    format='feather',
    partitioning='hive',
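# A minimal sketch (not part of the snippet above) of producing a
# hive-partitioned Feather dataset that a reader like the one above could
# consume; the table contents, column names and output path are illustrative.
import pyarrow as pa
import pyarrow.dataset as ds

table = pa.table({
    'date': ['2020-07-01', '2020-07-01', '2020-07-02'],
    'symbol': ['AAPL', 'MSFT', 'AAPL'],
    'price': [364.1, 204.7, 381.4],
})

ds.write_dataset(
    table,
    base_dir='/tmp/trades',          # hypothetical output path
    format='feather',
    partitioning=ds.partitioning(
        pa.schema([('date', pa.string())]), flavor='hive'),
)

# Reading it back honours the hive directory layout (date=2020-07-01/...)
readback = ds.dataset('/tmp/trades', format='feather', partitioning='hive')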
def test_dataset_from_a_list_of_local_directories_raises(multisourcefs):
    msg = 'points to a directory, but only file paths are supported'
    with pytest.raises(IsADirectoryError, match=msg):
        ds.dataset(['/plain', '/schema', '/hive'], filesystem=multisourcefs)
def test_union_dataset_from_other_datasets(tempdir, multisourcefs):
    child1 = ds.dataset('/plain', filesystem=multisourcefs, format='parquet')
    child2 = ds.dataset('/schema', filesystem=multisourcefs, format='parquet',
                        partitioning=['week', 'color'])
    child3 = ds.dataset('/hive', filesystem=multisourcefs, format='parquet',
                        partitioning='hive')

    assert child1.schema != child2.schema != child3.schema

    assembled = ds.dataset([child1, child2, child3])
    assert isinstance(assembled, ds.UnionDataset)

    msg = 'cannot pass any additional arguments'
    with pytest.raises(ValueError, match=msg):
        ds.dataset([child1, child2], filesystem=multisourcefs)

    expected_schema = pa.schema([
        ('date', pa.date32()),
        ('index', pa.int64()),
        ('value', pa.float64()),
        ('color', pa.string()),
        ('week', pa.int32()),
        ('year', pa.int32()),
        ('month', pa.int32()),
    ])
    assert assembled.schema.equals(expected_schema)
    assert assembled.to_table().schema.equals(expected_schema)

    assembled = ds.dataset([child1, child3])
    expected_schema = pa.schema([
        ('date', pa.date32()),
        ('index', pa.int64()),
        ('value', pa.float64()),
        ('color', pa.string()),
        ('year', pa.int32()),
        ('month', pa.int32()),
    ])
    assert assembled.schema.equals(expected_schema)
    assert assembled.to_table().schema.equals(expected_schema)

    expected_schema = pa.schema([
        ('month', pa.int32()),
        ('color', pa.string()),
        ('date', pa.date32()),
    ])
    assembled = ds.dataset([child1, child3], schema=expected_schema)
    assert assembled.to_table().schema.equals(expected_schema)

    expected_schema = pa.schema([
        ('month', pa.int32()),
        ('color', pa.string()),
        ('unkown', pa.string())  # fill with nulls
    ])
    assembled = ds.dataset([child1, child3], schema=expected_schema)
    assert assembled.to_table().schema.equals(expected_schema)

    # incompatible schemas, date and index columns have conflicting types
    table = pa.table([range(9), [0.] * 4 + [1.] * 5, 'abcdefghj'],
                     names=['date', 'value', 'index'])
    _, path = _create_single_file(tempdir, table=table)
    child4 = ds.dataset(path)

    with pytest.raises(pa.ArrowInvalid, match='Unable to merge'):
        ds.dataset([child1, child4])
def test_open_dataset_from_s3_with_filesystem_uri(s3_connection, s3_server):
    from pyarrow.fs import FileSystem
    import pyarrow.parquet as pq

    host, port, access_key, secret_key = s3_connection
    bucket = 'theirbucket'
    path = 'nested/folder/data.parquet'
    uri = "s3://{}:{}@{}/{}?scheme=http&endpoint_override={}:{}".format(
        access_key, secret_key, bucket, path, host, port
    )

    fs, path = FileSystem.from_uri(uri)
    assert path == 'theirbucket/nested/folder/data.parquet'

    fs.create_dir(bucket)

    table = pa.table({'a': [1, 2, 3]})
    with fs.open_output_stream(path) as out:
        pq.write_table(table, out)

    # full string URI
    dataset = ds.dataset(uri, format="parquet")
    assert dataset.to_table().equals(table)

    # passing filesystem as an uri
    template = (
        "s3://{}:{}@{{}}?scheme=http&endpoint_override={}:{}".format(
            access_key, secret_key, host, port
        )
    )
    cases = [
        ('theirbucket/nested/folder/', '/data.parquet'),
        ('theirbucket/nested/folder', 'data.parquet'),
        ('theirbucket/nested/', 'folder/data.parquet'),
        ('theirbucket/nested', 'folder/data.parquet'),
        ('theirbucket', '/nested/folder/data.parquet'),
        ('theirbucket', 'nested/folder/data.parquet'),
    ]
    for prefix, path in cases:
        uri = template.format(prefix)
        dataset = ds.dataset(path, filesystem=uri, format="parquet")
        assert dataset.to_table().equals(table)

    with pytest.raises(pa.ArrowInvalid, match='Missing bucket name'):
        uri = template.format('/')
        ds.dataset('/theirbucket/nested/folder/data.parquet', filesystem=uri)

    error = (
        "The path component of the filesystem URI must point to a directory "
        "but it has a type: `{}`. The path component is `{}` and the given "
        "filesystem URI is `{}`"
    )

    path = 'theirbucket/doesnt/exist'
    uri = template.format(path)
    with pytest.raises(ValueError) as exc:
        ds.dataset('data.parquet', filesystem=uri)
    assert str(exc.value) == error.format('NotFound', path, uri)

    path = 'theirbucket/nested/folder/data.parquet'
    uri = template.format(path)
    with pytest.raises(ValueError) as exc:
        ds.dataset('data.parquet', filesystem=uri)
    assert str(exc.value) == error.format('File', path, uri)
def get_table_df(self, cfg, **kwargs) -> pd.DataFrame:
    """Use Pandas instead of Spark to retrieve the data"""

    self.cfg = cfg

    table = kwargs.pop("table")
    start = kwargs.pop("start_time")
    end = kwargs.pop("end_time")
    view = kwargs.pop("view")
    fields = kwargs.pop("columns")
    addnl_filter = kwargs.pop("add_filter", None)
    key_fields = kwargs.pop("key_fields")
    merge_fields = kwargs.pop('merge_fields', {})

    folder = self._get_table_directory(table)

    if addnl_filter:
        # This is for special cases that are specific to an object
        query_str = addnl_filter
    else:
        query_str = None

    if query_str is None:
        # Make up a dummy query string to avoid if/then/else
        query_str = "timestamp != 0"

    # If sqvers is in the requested data, we've to handle it separately
    if 'sqvers' in fields:
        fields.remove('sqvers')
        need_sqvers = True
        max_vers = 0
    else:
        need_sqvers = False

    # If requesting a specific version of the data, handle that diff too
    sqvers = kwargs.pop('sqvers', None)
    try:
        dirs = Path(folder)
        datasets = []
        for elem in dirs.iterdir():
            # Additional processing around sqvers filtering and data
            if 'sqvers=' not in str(elem):
                continue

            if sqvers and f'sqvers={sqvers}' != elem:
                continue
            elif need_sqvers:
                vers = float(str(elem).split('=')[-1])
                if vers > max_vers:
                    max_vers = vers

            datasets.append(ds.dataset(elem, format='parquet',
                                       partitioning='hive'))

        if not datasets:
            datasets = [ds.dataset(folder, format='parquet',
                                   partitioning='hive')]

        # Build the filters for predicate pushdown
        master_schema = self._build_master_schema(datasets)
        avail_fields = list(
            filter(lambda x: x in master_schema.names, fields))

        filters = self.build_ds_filters(start, end, master_schema,
                                        merge_fields=merge_fields, **kwargs)

        final_df = ds.dataset(datasets) \
            .to_table(filter=filters, columns=avail_fields) \
            .to_pandas(self_destruct=True) \
            .query(query_str)

        if merge_fields:
            # These are key fields that need to be set right before we do
            # the drop duplicates to avoid missing out all the data
            for field in merge_fields:
                newfld = merge_fields[field]
                if (field in final_df.columns and
                        newfld in final_df.columns):
                    final_df[newfld] = np.where(final_df[newfld],
                                                final_df[newfld],
                                                final_df[field])
                elif (field in final_df.columns and
                      newfld not in final_df.columns):
                    final_df.rename(columns={field: newfld}, inplace=True)

        if (not final_df.empty and (view == 'latest') and
                all(x in final_df.columns for x in key_fields)):
            final_df = final_df.set_index(key_fields) \
                .sort_values(by='timestamp') \
                .query('~index.duplicated(keep="last")') \
                .reset_index()
    except (pa.lib.ArrowInvalid, OSError):
        return pd.DataFrame(columns=fields)

    fields = [x for x in final_df.columns if x in fields]
    if need_sqvers:
        final_df['sqvers'] = max_vers
        fields.insert(0, 'sqvers')

    return final_df[fields]
def write_files(table: str, filelist: List[str], in_basedir: str,
                outfolder: str, partition_cols: List[str],
                state: SqCoalesceState, block_start, block_end) -> None:
    """Write the data from the list of files out as a single coalesced block

    We're fixing the compression in this function

    :param table: str, Name of the table for which we're writing the files
    :param filelist: List[str], list of files to write the data to
    :param in_basedir: str, base directory of the read files,
                       to get partition date
    :param outfolder: str, the outgoing folder to write the data to
    :param partition_cols: List[str], partition columns
    :param state: SqCoalesceState, coalescer state, for constructing filename
    :param block_start: datetime, starting time window of this coalescing
                        block
    :param block_end: datetime, ending time window of this coalescing block
    :returns: Nothing
    """

    if not filelist and not state.schema.type == "record":
        return

    state.block_start = int(block_start.timestamp())
    state.block_end = int(block_end.timestamp())
    if filelist:
        this_df = ds.dataset(source=filelist, partitioning='hive',
                             partition_base_dir=in_basedir) \
            .to_table() \
            .to_pandas()
        state.wrrec_count += this_df.shape[0]

        if not this_df.empty:
            this_df = migrate_df(table, this_df, state.schema)

        if state.schema.type == "record":
            if not state.current_df.empty:
                this_df = this_df.set_index(state.keys)
                sett = set(this_df.index)
                setc = set(state.current_df.index)
                missing_set = setc.difference(sett)
                if missing_set:
                    missing_df = state.current_df.loc[missing_set]
                    this_df = pd.concat([this_df.reset_index(),
                                         missing_df.reset_index()])
                else:
                    this_df = this_df.reset_index()
    elif not state.current_df.empty:
        assert(state.schema.type == "record")
        this_df = state.current_df.reset_index()
    else:
        return

    this_df.sqvers = state.schema.version  # Updating the schema version
    state.dbeng.write(state.table_name, "pandas", this_df, True,
                      state.schema.get_arrow_schema(),
                      state.pq_file_name)

    if state.schema.type == "record" and filelist:
        # Now replace the old dataframe with this new set for "record" types
        # Non-record types should never have current_df non-empty
        state.current_df = this_df.set_index(state.keys) \
            .sort_values(by='timestamp') \
            .query('~index.duplicated(keep="last")')
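# The set_index(...).sort_values(...).query('~index.duplicated(keep="last")')
# idiom used above (and in the readers later in this section) keeps only the
# newest record per key. A minimal, self-contained sketch on a toy frame;
# the column names are illustrative, not from the original code.
import pandas as pd

df = pd.DataFrame({
    'namespace': ['dc1', 'dc1', 'dc2'],
    'hostname':  ['leaf01', 'leaf01', 'leaf02'],
    'timestamp': [100, 200, 150],
    'state':     ['down', 'up', 'up'],
})

latest = df.set_index(['namespace', 'hostname']) \
    .sort_values(by='timestamp') \
    .query('~index.duplicated(keep="last")') \
    .reset_index()

# leaf01 keeps only its timestamp=200 row; leaf02 is untouched
print(latest)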
v.levels
v.expand(2)

x = np.random.choice(list("abdcde"), size=10000, replace=True)
v.to_index(pd.Series(x))
v.to_categorical(pd.Series(x))
v.to_sparse(pd.Series(x))

## cycler
import io

data = """
v1,v2,v3
1,2,3
4,5,6
7,8,9
"""

with open('test.csv', 'w') as fout:
    fout.write(data)

from pyarrow import csv

opts = csv.ConvertOptions
csv.read_csv('test.csv')

import pyarrow.dataset as ds

for chunk in csv.open_csv('test.csv'):
    ds.dataset(chunk)
filter_ = (ds.field("event") == 3749778) elif selectivity == "10": filter_ = (ds.field("total_amount") > 27) elif selectivity == "100": filter_ = None elif selectivity == "sm": filter_ = (ds.field("total_amount") > 300) elif selectivity == "smm": filter_ = (ds.field("total_amount") > 500) results = list() for i in range(iterations): e = os.system('./clean_cache.sh') if e != 0: print('failed to clean cache') dataset_ = ds.dataset(directory, format=format_) cols_ = dataset_.schema.names start = time.time() j = 0 futures_list = list() for file in dataset_.files: future = client.submit(do_scan, file, cols_) futures_list.append(future) wait(futures_list) end = time.time() results.append(end - start) print(f"{fmt}_{selectivity} = ", results)
def test_open_dataset_from_source_additional_kwargs(multisourcefs):
    child = ds.FileSystemDatasetFactory(
        multisourcefs, fs.FileSelector('/plain'),
        format=ds.ParquetFileFormat())
    with pytest.raises(ValueError, match="cannot pass any additional"):
        ds.dataset(child, format="parquet")
def read(self, table_name: str, data_format: str, **kwargs) -> pd.DataFrame:
    """Read the data specified from parquet files and return

    This function also implements predicate pushdown to filter the data as
    specified by the provided filters.

    :param table_name: str, the name of the table to be read
    :param data_format: str, format the data is to be returned in
                        (only pandas supported at this point)
    :param columns: List[str], list of columns requested to be read,
                    only those specified are returned, keyword arg
    :param key_fields: List[str], key fields for table, required to
                       deduplicate, keyword arg only
    :param view: str, one of ["latest", "all"], keyword arg only
    :param start: float, starting time window for data, timestamp,
                  can be 0 to indicate latest, keyword arg only
    :param end: float, ending time window for data, timestamp,
                can be 0 to indicate latest, keyword arg only
    :param kwargs: dict, the optional keyword arguments, addnl_filter,
                   and merge_fields, not needed typically
    :returns: pandas dataframe of the data specified, or None if
              unsupported format
    :rtype: pd.DataFrame
    """

    if data_format not in self.supported_data_formats():
        return None

    start = kwargs.pop("start_time")
    end = kwargs.pop("end_time")
    view = kwargs.pop("view")
    fields = kwargs.pop("columns")
    key_fields = kwargs.pop("key_fields")
    addnl_filter = kwargs.pop("add_filter", None)
    merge_fields = kwargs.pop('merge_fields', {})

    folder = self._get_table_directory(table_name, False)

    if addnl_filter:
        # This is for special cases that are specific to an object
        query_str = addnl_filter
    else:
        query_str = None

    if query_str is None:
        # Make up a dummy query string to avoid if/then/else
        query_str = "timestamp != 0"

    # If sqvers is in the requested data, we've to handle it separately
    if 'sqvers' in fields:
        fields.remove('sqvers')
        need_sqvers = True
        max_vers = 0
    else:
        need_sqvers = False

    # If requesting a specific version of the data, handle that diff too
    sqvers = kwargs.pop('sqvers', None)
    datasets = []
    try:
        dirs = Path(folder)
        try:
            for elem in dirs.iterdir():
                # Additional processing around sqvers filtering and data
                if 'sqvers=' not in str(elem):
                    continue

                if sqvers and f'sqvers={sqvers}' != elem:
                    continue
                elif need_sqvers:
                    vers = float(str(elem).split('=')[-1])
                    if vers > max_vers:
                        max_vers = vers

                datasets.append(ds.dataset(elem, format='parquet',
                                           partitioning='hive'))
        except FileNotFoundError:
            pass
        except Exception as e:
            raise e

        # Now find the exact set of files we need to go over
        cp_dataset = self._get_cp_dataset(table_name, need_sqvers, sqvers,
                                          view, start, end)
        if cp_dataset:
            datasets.append(cp_dataset)

        if not datasets:
            datasets = [ds.dataset(folder, format='parquet',
                                   partitioning='hive')]

        # Build the filters for predicate pushdown
        master_schema = self._build_master_schema(datasets)
        avail_fields = list(
            filter(lambda x: x in master_schema.names, fields))

        filters = self.build_ds_filters(start, end, master_schema,
                                        merge_fields=merge_fields, **kwargs)

        final_df = ds.dataset(datasets) \
            .to_table(filter=filters, columns=avail_fields) \
            .to_pandas(self_destruct=True) \
            .query(query_str) \
            .sort_values(by='timestamp')

        if merge_fields:
            # These are key fields that need to be set right before we do
            # the drop duplicates to avoid missing out all the data
            for field in merge_fields:
                newfld = merge_fields[field]
                if (field in final_df.columns and
                        newfld in final_df.columns):
                    final_df[newfld] = np.where(final_df[newfld],
                                                final_df[newfld],
                                                final_df[field])
                elif (field in final_df.columns and
                      newfld not in final_df.columns):
                    final_df = final_df.rename(columns={field: newfld})

        # Because of how coalescing works, we can have multiple duplicated
        # entries with same timestamp. Remove them
        dupts_keys = key_fields + ['timestamp']
        final_df = final_df.set_index(dupts_keys) \
            .query('~index.duplicated(keep="last")') \
            .reset_index()

        if (not final_df.empty and (view == 'latest') and
                all(x in final_df.columns for x in key_fields)):
            final_df = final_df.set_index(key_fields) \
                .query('~index.duplicated(keep="last")')
    except (pa.lib.ArrowInvalid, OSError):
        return pd.DataFrame(columns=fields)

    if need_sqvers:
        final_df['sqvers'] = max_vers
        fields.insert(0, 'sqvers')

    cols = set(final_df.columns.tolist() + final_df.index.names)
    fields = [x for x in fields if x in cols]

    return final_df.reset_index()[fields]
def read_parquet(
    filepath_or_buffer,
    engine="cudf",
    columns=None,
    filters=None,
    row_groups=None,
    skip_rows=None,
    num_rows=None,
    strings_to_categorical=False,
    use_pandas_metadata=True,
    *args,
    **kwargs,
):
    """{docstring}"""

    # Multiple sources are passed as a list. If a single source is passed,
    # wrap it in a list for unified processing downstream.
    if not is_list_like(filepath_or_buffer):
        filepath_or_buffer = [filepath_or_buffer]

    # a list of row groups per source should be passed. make the list of
    # lists that is expected for multiple sources
    if row_groups is not None:
        if not is_list_like(row_groups):
            row_groups = [[row_groups]]
        elif not is_list_like(row_groups[0]):
            row_groups = [row_groups]

    filepaths_or_buffers = []
    for source in filepath_or_buffer:
        tmp_source, compression = ioutils.get_filepath_or_buffer(
            path_or_data=source, compression=None, **kwargs
        )
        if compression is not None:
            raise ValueError(
                "URL content-encoding decompression is not supported"
            )
        filepaths_or_buffers.append(tmp_source)

    if filters is not None:
        # Convert filters to ds.Expression
        filters = pq._filters_to_expression(filters)

        # Initialize ds.FilesystemDataset
        dataset = ds.dataset(
            filepaths_or_buffers, format="parquet", partitioning="hive"
        )

        # Load IDs of filtered row groups for each file in dataset
        filtered_rg_ids = defaultdict(list)
        for fragment in dataset.get_fragments(filter=filters):
            for rg_fragment in fragment.get_row_group_fragments(filters):
                for rg_id in rg_fragment.row_groups:
                    filtered_rg_ids[rg_fragment.path].append(rg_id)

        # TODO: Use this with pyarrow 1.0.0
        # # Load IDs of filtered row groups for each file in dataset
        # filtered_row_group_ids = {}
        # for fragment in dataset.get_fragments(filters):
        #     for row_group_fragment in fragment.split_by_row_group(filters):
        #         for row_group_info in row_group_fragment.row_groups:
        #             path = row_group_fragment.path
        #             if path not in filtered_row_group_ids:
        #                 filtered_row_group_ids[path] = [row_group_info.id]
        #             else:
        #                 filtered_row_group_ids[path].append(
        #                     row_group_info.id
        #                 )

        # Initialize row_groups to be selected
        if row_groups is None:
            row_groups = [None for _ in dataset.files]

        # Store IDs of selected row groups for each file
        for i, file in enumerate(dataset.files):
            if row_groups[i] is None:
                row_groups[i] = filtered_rg_ids[file]
            else:
                row_groups[i] = filter(
                    lambda id: id in row_groups[i], filtered_rg_ids[file]
                )

    if engine == "cudf":
        return libparquet.read_parquet(
            filepaths_or_buffers,
            columns=columns,
            row_groups=row_groups,
            skip_rows=skip_rows,
            num_rows=num_rows,
            strings_to_categorical=strings_to_categorical,
            use_pandas_metadata=use_pandas_metadata,
        )
    else:
        warnings.warn("Using CPU via PyArrow to read Parquet dataset.")
        return cudf.DataFrame.from_arrow(
            pq.ParquetDataset(filepaths_or_buffers).read_pandas(
                columns=columns, *args, **kwargs
            )
        )
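# The pq._filters_to_expression call above converts ParquetDataset-style DNF
# filters into a dataset expression. A minimal sketch of the same conversion
# done by hand with the public ds.field API, covering only the two operators
# used here; the column names and filter values are illustrative.
import pyarrow.dataset as ds

# DNF filters: a list of AND-groups that are OR-ed together
filters = [
    [("year", "=", 2020), ("month", "in", [1, 2, 3])],
    [("year", "=", 2021)],
]


def filters_to_expression(dnf):
    """Build (year == 2020 and month in {1,2,3}) or (year == 2021)."""
    or_terms = []
    for group in dnf:
        and_terms = []
        for col, op, val in group:
            field = ds.field(col)
            if op in ("=", "=="):
                and_terms.append(field == val)
            elif op == "in":
                and_terms.append(field.isin(val))
            else:
                raise ValueError(f"unhandled operator: {op}")
        expr = and_terms[0]
        for term in and_terms[1:]:
            expr = expr & term
        or_terms.append(expr)
    expr = or_terms[0]
    for term in or_terms[1:]:
        expr = expr | term
    return expr


expr = filters_to_expression(filters)
# dataset.to_table(filter=expr) then applies the same predicate pushdown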
def _get_cp_dataset(self, table_name: str, need_sqvers: bool, sqvers: str,
                    view: str, start_time: float,
                    end_time: float) -> ds.dataset:
    """Get the list of files to read in coalesced dir

    This iterates over the coalesced files that need to be read and comes up
    with a list of files that corresponds to the timeslot the user has
    specified

    :param table_name: str, Table for which coalesced info is requested
    :param need_sqvers: bool, True if the user has requested that we return
                        the sqvers
    :param sqvers: str, if we're looking only for files of a specific vers
    :param view: str, whether to return the latest only OR all
    :param start_time: float, the starting time window of data needed
    :param end_time: float, the ending time window of data needed
    :returns: pyarrow dataset for the files to be read
    :rtype: pyarrow.dataset.dataset
    """

    filelist = []
    max_vers = 0

    folder = self._get_table_directory(table_name, True)

    if start_time and end_time or (view == "all"):
        # Enforcing the logic we have: if both start_time & end_time are
        # given, return all files since the model is that the user is
        # expecting to see all changes in the time window. Otherwise, the
        # user is expecting to see only the latest before an end_time OR
        # after a start_time.
        all_files = True
    else:
        all_files = False

    # We need to iterate otherwise the differing schema from different dirs
    # causes the read to abort.
    dirs = Path(folder)
    if not dirs.exists() or not dirs.is_dir():
        return

    for elem in dirs.iterdir():
        # Additional processing around sqvers filtering and data
        if 'sqvers=' not in str(elem):
            continue
        if sqvers and f'sqvers={sqvers}' != elem.name:
            continue
        elif need_sqvers:
            vers = float(str(elem).split('=')[-1])
            if vers > max_vers:
                max_vers = vers

        dataset = ds.dataset(elem, format='parquet', partitioning='hive')
        if not start_time and not end_time:
            files = dataset.files
        else:
            files = []
            latest_filedict = {}
            prev_time = 0
            prev_namespace = ''
            file_in_this_ns = False
            prev_file = None
            thistime = []
            for file in sorted(dataset.files):
                namespace = os.path.dirname(file).split('namespace=')[1] \
                    .split('/')[0]
                if (prev_namespace and (namespace != prev_namespace) and
                        thistime and not file_in_this_ns):
                    if ((start_time and thistime[1] >= start_time) or
                            (end_time and thistime[1] >= end_time)):
                        files.append(prev_file)
                        prev_namespace = ''
                thistime = os.path.basename(file).split('.')[0] \
                    .split('-')[-2:]
                thistime = [int(x) * 1000 for x in thistime]  # time in ms
                if not start_time or (thistime[0] >= start_time):
                    if not end_time:
                        files.append(file)
                        file_in_this_ns = True
                    elif thistime[0] < end_time:
                        files.append(file)
                        file_in_this_ns = True
                    elif prev_time < end_time < thistime[0]:
                        key = file.split('namespace=')[1].split('/')[0]
                        if key not in latest_filedict:
                            latest_filedict[key] = file
                            file_in_this_ns = True
                prev_time = thistime[0]
                prev_file = file
                prev_namespace = namespace

            if not file_in_this_ns:
                if (thistime and
                        ((start_time and thistime[1] >= start_time) or
                         (end_time and thistime[1] >= end_time))):
                    files.append(file)

            if latest_filedict:
                filelist.extend(list(latest_filedict.values()))

        if not all_files and files:
            latest_filedict = {
                x.split('namespace=')[1].split('/')[0]: x
                for x in sorted(files)
            }
            filelist.extend(list(latest_filedict.values()))
        elif files:
            filelist.extend(sorted(files))

    if filelist:
        return ds.dataset(filelist, format='parquet', partitioning='hive')
    else:
        return []
def test_open_dataset_from_source_additional_kwargs(tempdir):
    _, path = _create_single_file(tempdir)
    with pytest.raises(ValueError, match="cannot pass any additional"):
        ds.dataset(ds.source(path), format="parquet")
def __init__(self, path, process_func):
    super().__init__()
    self.dataset = ds.dataset(path)
    self.process_func = process_func
def _read_map_parquet(healsparse_class, filepath, pixels=None, header=False,
                      degrade_nside=None, weightfile=None, reduction='mean',
                      use_threads=False):
    """
    Internal function to read in a HealSparseMap from a parquet dataset.

    Parameters
    ----------
    healsparse_class : `type`
        Type value of the HealSparseMap class.
    filepath : `str`
        Name of the file path to read.  Must be a parquet dataset.
    pixels : `list`, optional
        List of coverage map pixels to read.
    header : `bool`, optional
        Return the parquet metadata as well as map?  Default is False.
    degrade_nside : `int`, optional
        Degrade map to this nside on read.  None means leave as-is.
        Not yet implemented for parquet.
    weightfile : `str`, optional
        Floating-point map to supply weights for degrade wmean.  Must be a
        HealSparseMap (weighted degrade not supported for healpix
        degrade-on-read).  Not yet implemented for parquet.
    reduction : `str`, optional
        Reduction method with degrade-on-read.
        (mean, median, std, max, min, and, or, sum, prod, wmean).
        Not yet implemented for parquet.
    use_threads : `bool`, optional
        Use multithreaded reading.

    Returns
    -------
    healSparseMap : `HealSparseMap`
        HealSparseMap from file, covered by pixels
    header : `astropy.io.fits.Header` (if header=True)
        Header metadata for the map file.
    """
    ds = dataset.dataset(filepath, format='parquet', partitioning='hive')
    schema = ds.schema
    # Convert from byte strings
    md = {key.decode(): schema.metadata[key].decode()
          for key in schema.metadata}

    if 'healsparse::filetype' not in md:
        raise RuntimeError("Filepath %s is not a healsparse parquet map."
                           % (filepath))
    if md['healsparse::filetype'] != 'healsparse':
        raise RuntimeError("Filepath %s is not a healsparse parquet map."
                           % (filepath))
    cov_fname = os.path.join(filepath, '_coverage.parquet')
    if not os.path.isfile(cov_fname):
        # Note that this could be reconstructed from the information in the
        # file inefficiently.  This feature could be added in the future.
        raise RuntimeError("Filepath %s is missing coverage map %s"
                           % (filepath, cov_fname))

    nside_sparse = int(md['healsparse::nside_sparse'])
    nside_coverage = int(md['healsparse::nside_coverage'])
    nside_io = int(md['healsparse::nside_io'])
    bitshift_io = _compute_bitshift(nside_io, nside_coverage)

    cov_tab = parquet.read_table(cov_fname, use_threads=use_threads)
    cov_pixels = cov_tab['cov_pix'].to_numpy()
    row_groups = cov_tab['row_group'].to_numpy()

    if pixels is not None:
        _pixels = np.atleast_1d(pixels)
        if len(np.unique(_pixels)) < len(_pixels):
            raise RuntimeError("Input list of pixels must be unique.")

        sub = np.clip(np.searchsorted(cov_pixels, _pixels), 0,
                      cov_pixels.size - 1)
        ok, = np.where(cov_pixels[sub] == _pixels)
        if ok.size == 0:
            raise RuntimeError(
                "None of the specified pixels are in the coverage map.")
        _pixels = np.sort(_pixels[ok])

        _pixels_io = np.right_shift(_pixels, bitshift_io)

        # Figure out row groups...
        matches = np.searchsorted(cov_pixels, _pixels)
        _row_groups_io = row_groups[matches]
    else:
        _pixels = cov_pixels
        _pixels_io = None
        _row_groups_io = None

    cov_map = HealSparseCoverage.make_from_pixels(nside_coverage,
                                                  nside_sparse, _pixels)

    if md['healsparse::widemask'] == 'True':
        is_wide_mask = True
        wmult = int(md['healsparse::wwidth'])
    else:
        is_wide_mask = False
        wmult = 1

    if md['healsparse::primary'] != '':
        # This is a multi-column table.
        is_rec_array = True
        primary = md['healsparse::primary']
        columns = [name for name in schema.names
                   if name not in ['iopix', 'cov_pix']]
        dtype = [(name, schema.field(name).type.to_pandas_dtype())
                 for name in columns]
        primary_dtype = schema.field(primary).type.to_pandas_dtype()
    else:
        is_rec_array = False
        primary = None
        dtype = schema.field('sparse').type.to_pandas_dtype()
        primary_dtype = dtype
        columns = ['sparse']

    if md['healsparse::sentinel'] == 'UNSEEN':
        sentinel = primary_dtype(hpg.UNSEEN)
    elif md['healsparse::sentinel'] == 'False':
        sentinel = False
    elif md['healsparse::sentinel'] == 'True':
        sentinel = True
    else:
        sentinel = primary_dtype(md['healsparse::sentinel'])

        if is_integer_value(sentinel):
            sentinel = int(sentinel)
        elif not isinstance(sentinel, np.bool_):
            sentinel = float(sentinel)

    if is_rec_array:
        sparse_map = np.zeros((_pixels.size + 1) * cov_map.nfine_per_cov,
                              dtype=dtype)
        # Fill in the overflow (primary)
        sparse_map[primary][: cov_map.nfine_per_cov] = sentinel
        # Fill in the overflow (not primary)
        for d in dtype:
            if d[0] == primary:
                continue
            sparse_map[d[0]][: cov_map.nfine_per_cov] = check_sentinel(
                d[1], None)
    else:
        sparse_map = np.zeros(
            (_pixels.size + 1) * cov_map.nfine_per_cov * wmult, dtype=dtype)
        sparse_map[: cov_map.nfine_per_cov * wmult] = sentinel

    if _pixels_io is None:
        # Read the full table
        tab = ds.to_table(columns=columns, use_threads=use_threads)
    else:
        _pixels_io_unique = list(np.unique(_pixels_io))

        fragments = list(ds.get_fragments(
            filter=dataset.field('iopix').isin(_pixels_io_unique)))
        group_fragments = []
        for pixel_io, fragment in zip(_pixels_io_unique, fragments):
            groups = fragment.split_by_row_group()
            # Only append groups that are relevant
            use, = np.where(_pixels_io == pixel_io)
            for ind in use:
                group_fragments.append(groups[_row_groups_io[ind]])

        ds2 = dataset.FileSystemDataset(group_fragments, schema, ds.format)
        tab = ds2.to_table(columns=columns, use_threads=use_threads)

    if is_rec_array:
        for name in columns:
            sparse_map[name][cov_map.nfine_per_cov:] = tab[name].to_numpy()
    else:
        sparse_map[cov_map.nfine_per_cov*wmult:] = tab['sparse'].to_numpy()

        if is_wide_mask:
            sparse_map = sparse_map.reshape(
                (sparse_map.size // wmult, wmult)).astype(WIDE_MASK)

    healsparse_map = healsparse_class(cov_map=cov_map, sparse_map=sparse_map,
                                      nside_sparse=nside_sparse,
                                      primary=primary, sentinel=sentinel)

    if header:
        if 'healsparse::header' in md:
            hdr_string = md['healsparse::header']
            hdr = fits.Header.fromstring(hdr_string)
        else:
            hdr = fits.Header()

        return (healsparse_map, hdr)
    else:
        return healsparse_map
def test_open_dataset_non_existing_file():
    # ARROW-8213: Opening a dataset with a local incorrect path gives
    # confusing error message
    with pytest.raises(FileNotFoundError):
        ds.dataset('i-am-not-existing.parquet', format='parquet')
def update_scdata(self, scdata):
    # TODO: Only accepts one file for now
    ds.write_dataset(scdata, self.data_interim, format="arrow")
    dataset = ds.dataset(self.data_interim, format="arrow")
    self.filetype = "arrow"
    self.memory_mapped_dataset = pa.memory_map(dataset.files[0], 'r')
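# Since format="arrow" writes Arrow IPC files, the memory map created above
# can be read back zero-copy with the IPC file reader. A minimal sketch,
# assuming a directory that was written with ds.write_dataset(..., format="arrow");
# the path is illustrative.
import pyarrow as pa
import pyarrow.dataset as ds

dataset = ds.dataset('/tmp/interim', format='arrow')   # hypothetical path

# Memory-map the first IPC file and open it without copying into memory
mmap = pa.memory_map(dataset.files[0], 'r')
reader = pa.ipc.open_file(mmap)

table = reader.read_all()          # whole file as a Table
first_batch = reader.get_batch(0)  # or a single record batch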
import pyarrow.feather as fe
import datamol as dm
import operator

dm.disable_rdkit_log()

dataset = [
    '/data/dockop_glide_d3/second50k_glide_molchunkout/second50k_glide_out.molchunk',
    '/data/dockop_glide_d3/first50k_glide_molchunkout',
    '/data/dockop_glide_d3/thirdd50k_glide_molchunkout/third50k_glide_out.molchunk',
    '/data/dockop_glide_d3/fourth50k_glide_molchunkout/fourth50k_glide_out.molchunk',
    '/data/dockop_glide_d3/fithround_glide_molchunkout/fifth50k_glide_out.molchunk'
]

dflist = []
for data in dataset:
    dataset = ds.dataset(data, format="feather")
    df = dataset.to_table().to_pandas()
    dflist.append(df)


def combine_unique_molchunks_with_identical_columns(molchunk_1, molchunk_2):
    outer_merged = pd.merge(molchunk_1, molchunk_2, how='outer')
    return outer_merged


docked_df = combine_unique_molchunks_with_identical_columns(
    dflist[0], dflist[1])
docked_df = combine_unique_molchunks_with_identical_columns(
    docked_df, dflist[2])
docked_df = combine_unique_molchunks_with_identical_columns(
    docked_df, dflist[3])
def from_parquet(
    cls,
    file,
    treepath="/Events",
    entry_start=None,
    entry_stop=None,
    runtime_cache=None,
    persistent_cache=None,
    schemaclass=NanoAODSchema,
    metadata=None,
    parquet_options={},
    rados_parquet_options={},
    access_log=None,
):
    """Quickly build NanoEvents from a parquet file

    Parameters
    ----------
        file : str, pathlib.Path, pyarrow.NativeFile, or python file-like
            The filename or already opened file using e.g. ``uproot.open()``
        treepath : str, optional
            Name of the tree to read in the file
        entry_start : int, optional
            Start at this entry offset in the tree (default 0)
        entry_stop : int, optional
            Stop at this entry offset in the tree (default end of tree)
        runtime_cache : dict, optional
            A dict-like interface to a cache object. This cache is expected
            to last the duration of the program only, and will be used to
            hold references to materialized awkward arrays, etc.
        persistent_cache : dict, optional
            A dict-like interface to a cache object. Only bare numpy arrays
            will be placed in this cache, using globally-unique keys.
        schemaclass : BaseSchema
            A schema class deriving from `BaseSchema` and implementing the
            desired view of the file
        metadata : dict, optional
            Arbitrary metadata to add to the `base.NanoEvents` object
        parquet_options : dict, optional
            Any options to pass to ``pyarrow.parquet.ParquetFile``
        access_log : list, optional
            Pass a list instance to record which branches were lazily
            accessed by this instance
    """
    import pyarrow
    import pyarrow.dataset as ds
    import pyarrow.parquet

    ftypes = (
        pathlib.Path,
        pyarrow.NativeFile,
        io.TextIOBase,
        io.BufferedIOBase,
        io.RawIOBase,
        io.IOBase,
    )

    if isinstance(file, ftypes):
        table_file = pyarrow.parquet.ParquetFile(file, **parquet_options)
    elif isinstance(file, str):
        fs_file = fsspec.open(file, "rb")
        table_file = pyarrow.parquet.ParquetFile(fs_file, **parquet_options)
    elif isinstance(file, pyarrow.parquet.ParquetFile):
        table_file = file
    else:
        raise TypeError("Invalid file type (%s)" % (str(type(file))))

    if entry_start is None or entry_start < 0:
        entry_start = 0
    if entry_stop is None or entry_stop > table_file.metadata.num_rows:
        entry_stop = table_file.metadata.num_rows

    pqmeta = table_file.schema_arrow.metadata
    pquuid = None if pqmeta is None else pqmeta.get(b"uuid", None)
    pqobj_path = None if pqmeta is None else pqmeta.get(b"object_path", None)

    partition_key = (
        str(None) if pquuid is None else pquuid.decode("ascii"),
        str(None) if pqobj_path is None else pqobj_path.decode("ascii"),
        "{0}-{1}".format(entry_start, entry_stop),
    )
    uuidpfn = {partition_key[0]: pqobj_path}
    mapping = ParquetSourceMapping(
        TrivialParquetOpener(uuidpfn, parquet_options),
        access_log=access_log,
    )

    format_ = "parquet"
    if "ceph_config_path" in rados_parquet_options:
        format_ = ds.RadosParquetFileFormat(
            rados_parquet_options["ceph_config_path"].encode()
        )

    dataset = ds.dataset(file, schema=table_file.schema_arrow,
                         format=format_)

    shim = TrivialParquetOpener.UprootLikeShim(file, dataset)
    mapping.preload_column_source(partition_key[0], partition_key[1], shim)

    base_form = mapping._extract_base_form(table_file.schema_arrow)

    return cls._from_mapping(
        mapping,
        partition_key,
        base_form,
        runtime_cache,
        persistent_cache,
        schemaclass,
        metadata,
    )
def test_open_union_dataset(tempdir):
    _, path = _create_single_file(tempdir)
    dataset = ds.dataset(path)

    union = ds.dataset([dataset, dataset])
    assert isinstance(union, ds.UnionDataset)