def test_null_count(store, column, expected_null_count):
    """Check the null counts stored in the row-group statistics.

    A six-row frame is written with ``chunk_size=2``; the test asserts that
    the file has three row groups and that, for ``column``, each row group's
    statistics report the null count given in ``expected_null_count``.
    """
    frame = pd.DataFrame(
        {
            "no_nulls_int": [1, 2, 3, 4, 5, 6],
            "partial_nulls_int": [1, 2, 3, None, None, None],
            "no_nulls_float": [1.1, 2.2, 3.3, 4.4, 5.5, 6.6],
            "partial_nulls_float": [1.0, 2.2, 3.3, np.nan, np.nan, np.nan],
            "partial_nulls_obj": [1.0, 2.2, 3.3, np.nan, np.nan, np.nan],
            "no_nulls_obj": ["1.1", "2", "3", "vier", "fuenfeinhalb", "6.6"],
            "partial_nulls_obj_mixed": [1.0, 2.2, None, np.nan, np.nan, 6.6],
            "nulls_reverse_rg": [3.3, np.nan, 1.0, 2.0, np.nan, -1.1],
        }
    )
    serialiser = ParquetSerializer(chunk_size=2)
    key = serialiser.store(store, "prefix", frame)

    # Re-open the stored blob from memory to inspect its metadata.
    parquet_file = ParquetFile(pa.BufferReader(store.get(key)))
    col_idx = parquet_file.reader.column_name_idx(column)

    assert parquet_file.num_row_groups == 3
    for rg_idx in range(3):
        statistics = parquet_file.metadata.row_group(rg_idx).column(col_idx).statistics
        assert statistics.null_count == expected_null_count[rg_idx]
def _read_parquet_pandas_on_ray(path, engine, columns, **kwargs):
    """Read a parquet file into a DataFrame, one remote task per column.

    Args:
        path: Location of the parquet file.
        engine: Not used by this function.
        columns: Column names to read. When falsy, every schema name that
            does not match ``PQ_INDEX_REGEX`` is read.
        kwargs: Forwarded unchanged to each remote read task.

    Returns:
        A DataFrame built from the remote partitions with a ``RangeIndex``.
    """
    from pyarrow.parquet import ParquetFile

    if not columns:
        schema_names = ParquetFile(path).metadata.schema.names
        columns = [
            name for name in schema_names if not PQ_INDEX_REGEX.match(name)
        ]

    num_splits = min(len(columns), RayBlockPartitions._compute_num_partitions())

    # One remote task per column; each task yields ``num_splits`` row pieces
    # plus one extra trailing value.
    per_column_ids = []
    for col in columns:
        per_column_ids.append(
            _read_parquet_column._submit(
                args=(path, col, num_splits, kwargs),
                num_return_vals=num_splits + 1,
            )
        )
    # Transpose so that the object ids fit the partition schema.
    blk_partitions = np.array(per_column_ids).T

    # The last row is not data: its first entry resolves to the index length.
    partition_rows = [
        [RayRemotePartition(obj) for obj in row] for row in blk_partitions[:-1]
    ]
    remote_partitions = np.array(partition_rows)
    index_len = ray.get(blk_partitions[-1][0])
    index = pandas.RangeIndex(index_len)

    new_manager = PandasDataManager(
        RayBlockPartitions(remote_partitions), index, columns
    )
    return DataFrame(data_manager=new_manager)
def load(self, rowgroup_spec):
    """Load the data of a single rowgroup from the dataset.

    Reads one rowgroup and returns its rows with the data still encoded.

    If a worker predicate was passed to the constructor, the predicate is
    applied first to the columns it names; the remaining columns are loaded
    only if at least one row matches. Without a predicate the rowgroup is
    served through the local cache, so a cached instance is reused.

    If an ngram (not None) was passed to the constructor, the result is
    restructured according to the NGram definition.

    :param rowgroup_spec: A dictionary containing the following fields:
        'row_group': ParquetDatasetPiece object describing the rowgroup to
        be loaded; 'shuffle_row_drop_partition': a tuple with
        (this_partition, num_of_partitions)
    :return: A dictionary indexed by field names, or a dictionary defined by
        NGram spec.
    :raises RuntimeError: if a non-null local cache is combined with a worker
        predicate or with more than one shuffle-row-drop partition.
    """
    piece = rowgroup_spec['row_group']
    shuffle_row_drop_partition = rowgroup_spec['shuffle_row_drop_partition']

    # Open the piece through the dataset's pyarrow file system.
    with self._dataset.fs.open(piece.path) as piece_file_handle:
        parquet_file = ParquetFile(piece_file_handle)

        cache_in_use = not isinstance(self._local_cache, NullCache)
        if cache_in_use and self._worker_predicate:
            raise RuntimeError(
                'Local cache is not supported together with predicates, '
                'unless the dataset is partitioned by the column the predicate operates on.'
            )
        if cache_in_use and shuffle_row_drop_partition[1] != 1:
            raise RuntimeError(
                'Local cache is not supported together with shuffle_row_drop_partitions > 1'
            )

        if self._worker_predicate:
            all_cols = self._load_rows_with_predicate(
                parquet_file, piece, self._worker_predicate,
                shuffle_row_drop_partition)
        else:
            # The cache key is built from a hash of the dataset url plus the
            # relative path in order to:
            #  1. Avoid conflicts when a common cache (e.g. redis) serves
            #     multiple processes
            #  2. Keep keys short (the url is hashed), since long keys may be
            #     incompatible with some cache implementations
            #  3. Keep the relative path and the row group in plain text so
            #     the key stays easy to debug
            url_digest = hashlib.md5(
                urlunparse(self._dataset_url_parsed).encode('utf-8')
            ).hexdigest()
            cache_key = '{}:{}:{}'.format(url_digest, piece.path,
                                          piece.row_group)

            def read_rows():
                return self._load_rows(parquet_file, piece,
                                       shuffle_row_drop_partition)

            all_cols = self._local_cache.get(cache_key, read_rows)

    if not self._ngram:
        return all_cols
    return self._ngram.form_ngram(data=all_cols, schema=self._schema)
def read_parquet(path, engine='auto', columns=None, **kwargs):
    """Load a parquet object from the file path, returning a DataFrame.

    Ray DataFrame only supports pyarrow engine for now.

    Args:
        path: The filepath of the parquet file.
              We only support local files for now.
        engine: Ray only support pyarrow reader.
                This argument doesn't do anything for now.
        columns: Names of the columns to read. When falsy, every column of
                 the file schema is read except the serialized pandas index
                 columns (``__index_level_<n>__``).
        kwargs: Pass into parquet's read_row_group function.

    Returns:
        A DataFrame whose row partitions are the remotely read row groups,
        each split by ``_split_df`` with ``n_rows // get_npartitions()`` as
        the chunk size.
    """
    pf = ParquetFile(path)

    n_rows = pf.metadata.num_rows
    # NOTE(review): this is 0 when the file has fewer rows than partitions;
    # confirm that _split_df copes with a zero chunk size.
    chunksize = n_rows // get_npartitions()
    n_row_groups = pf.metadata.num_row_groups

    if not columns:
        # Raw string: '\d' in a plain literal is an invalid escape sequence.
        idx_regex = re.compile(r'__index_level_\d+__')
        columns = [
            name for name in pf.metadata.schema.names
            if not idx_regex.match(name)
        ]

    # One remote read per row group, then split each result into chunks.
    df_from_row_groups = [
        _read_parquet_row_group.remote(path, columns, i, kwargs)
        for i in range(n_row_groups)
    ]
    splited_dfs = ray.get(
        [_split_df.remote(df, chunksize) for df in df_from_row_groups])
    df_remotes = list(chain.from_iterable(splited_dfs))

    return DataFrame(row_partitions=df_remotes, columns=columns)
def parse_parquet(infile):
    """Extract the data columns, index and arrow schema of a parquet file.

    Columns whose name contains ``"__index_level_"`` are treated as the
    stored index; at most one such column is allowed (asserted).

    Returns:
        A tuple ``(columns, index, index_used, schema)``: the column names
        without the index column, the index values (read from the index
        column, or ``0..num_rows-1`` when there is none), whether an index
        column was found, and the arrow schema of the file.
    """
    import pandas as pd
    from pyarrow.parquet import ParquetFile

    parquet = ParquetFile(infile)
    metadata = parquet.metadata
    schema = metadata.schema.to_arrow_schema()

    columns = []
    for col_i in range(metadata.num_columns):
        columns.append(metadata.schema.column(col_i).name)

    index_cols = [col for col in columns if "__index_level_" in col]
    assert len(index_cols) <= 1

    index_used = len(index_cols) == 1
    if index_used:
        index_col = index_cols[0]
        index = pd.read_parquet(infile, columns=[index_col]).index.values
    else:
        # Placeholder name for the filter below when no index is stored.
        index_col = "__non-existing-col__"
        index = list(range(parquet.metadata.num_rows))

    data_columns = [col for col in columns if col != index_col]
    return data_columns, index, index_used, schema
def process_single_parquet_partition(parquet_location, callback, columns=None):
    """Feed every row group of one parquet partition to ``callback``.

    Args:
        parquet_location: Source passed to ``ParquetFile``.
        callback: Callable invoked once per row group with that row group
            converted to a pandas DataFrame.
        columns: Column names to read from each row group. Defaults to
            ``["id", "img_binary"]``.
    """
    if columns is None:
        columns = ["id", "img_binary"]

    parquet_file = ParquetFile(source=parquet_location)
    num_row_groups = parquet_file.num_row_groups
    print(
        "----------------------------------------------------------------------------------"
    )
    print("%d row groups for partition: %s" %
          (num_row_groups, parquet_location))
    for index in range(num_row_groups):
        row_df = parquet_file.read_row_group(index, columns=columns).to_pandas()
        # DataFrame.info() prints its report itself and returns None, so it
        # must not be wrapped in print() (that emitted a stray "None" line).
        row_df.info(verbose=True)
        callback(row_df)
def process(self, piece_index, worker_predicate, shuffle_row_drop_partition):
    """Main worker function. Loads and publishes all rows matching the predicate from a rowgroup.

    Looks up the requested piece (a single row-group in a parquet file). If a
    predicate is specified, columns needed by the predicate are loaded first.
    If no rows in the rowgroup match the predicate criteria the rest of the
    columns are not loaded.

    The loaded rows (restructured by the ngram, if one is configured) are
    handed to ``self.publish_func`` when non-empty.

    :param piece_index: Index into ``self._split_pieces`` of the piece to load.
    :param worker_predicate: Predicate used to filter rows; falsy for none.
    :param shuffle_row_drop_partition: A tuple 2 of the current row drop
        partition and the total number of partitions.
    :return: None
    :raises RuntimeError: if a non-null local cache is combined with a
        predicate or with more than one shuffle-row-drop partition.
    """
    if not self._dataset:
        self._dataset = pq.ParquetDataset(self._dataset_url_parsed.path,
                                          filesystem=self._filesystem,
                                          validate_schema=False)

    piece = self._split_pieces[piece_index]

    # Open the piece through the dataset's pyarrow file system. The handle is
    # closed as soon as the rows are read, instead of being left open.
    with self._dataset.fs.open(piece.path) as piece_file_handle:
        parquet_file = ParquetFile(piece_file_handle)

        if not isinstance(self._local_cache, NullCache):
            if worker_predicate:
                raise RuntimeError(
                    'Local cache is not supported together with predicates, '
                    'unless the dataset is partitioned by the column the predicate operates on.'
                )
            if shuffle_row_drop_partition[1] != 1:
                raise RuntimeError(
                    'Local cache is not supported together with shuffle_row_drop_partitions > 1'
                )

        if worker_predicate:
            all_cols = self._load_rows_with_predicate(
                parquet_file, piece, worker_predicate,
                shuffle_row_drop_partition)
        else:
            # Using hash of the dataset url with the relative path in order to:
            #  1. Make sure if a common cache serves multiple processes (e.g. redis), we don't have conflicts
            #  2. Dataset url is hashed, to make sure we don't create too long keys, which maybe incompatible with
            #     some cache implementations
            #  3. Still leave relative path and the piece_index in plain text to make it easier to debug
            cache_key = '{}:{}:{}'.format(
                hashlib.md5(
                    urlunparse(
                        self._dataset_url_parsed).encode('utf-8')).hexdigest(),
                piece.path, piece_index)
            all_cols = self._local_cache.get(
                cache_key, lambda: self._load_rows(parquet_file, piece,
                                                   shuffle_row_drop_partition))

    if self._ngram:
        all_cols = self._ngram.form_ngram(data=all_cols, schema=self._schema)

    if all_cols:
        self.publish_func(all_cols)
def cache_generator(
        cls,
        glob_path,
        reads_per_file=3,
        resamples=1,
        shuffle=False,
        infinite=False,
):
    """Yield DataFrame slices ("caches") of the parquet files matching a glob.

    Each file is read ``reads_per_file`` times; the n-th read keeps the n-th
    slice of ``ceil(num_rows / reads_per_file)`` rows. Every slice is yielded
    ``resamples`` times and, when ``shuffle`` is set, reshuffled before each
    yield. With ``infinite`` the sorted file list is cycled forever.

    Raises:
        Exception: on first iteration, if ``glob_path`` matches no file.
    """
    filenames = sorted(glob2.glob(glob_path))
    if not filenames:
        raise Exception(
            f"{cls.__name__}.batch_generator() - invalid glob_path: {glob_path}"
        )

    gc.collect()
    # sleep(1)  # sleep(1) is required to allow measurement of the garbage collector

    keep_going = True
    while keep_going:
        for filename in filenames:
            num_rows = ParquetFile(filename).metadata.num_rows
            cache_size = math.ceil(num_rows / reads_per_file)
            for n_read in range(reads_per_file):
                gc.collect()
                # sleep(1)  # sleep(1) is required to allow measurement of the garbage collector

                first_row = cache_size * n_read
                last_row = cache_size * (n_read + 1)
                # .set_index('image_id', drop=True)  # WARN: Don't do this, it breaks other things
                whole_file = pd.read_parquet(filename)
                cache = whole_file.iloc[first_row:last_row].copy()

                for _ in range(resamples):
                    if shuffle:
                        cache = cache.sample(frac=1)
                    yield cache
        keep_going = infinite
def get_table_column_names_and_types(
        self, config: RepoConfig) -> Iterable[Tuple[str, str]]:
    """Return ``(column name, type string)`` pairs from the file's arrow schema.

    The file is opened through the filesystem resolved from ``self.path``
    and the S3 endpoint override; when no filesystem is resolved the path is
    handed to ``ParquetFile`` directly. ``config`` is not used here.
    """
    filesystem, path = FileSource.create_filesystem_and_path(
        self.path, self._file_options.s3_endpoint_override)

    if filesystem is None:
        source = path
    else:
        source = filesystem.open_input_file(path)

    schema = ParquetFile(source).schema_arrow
    type_names = map(str, schema.types)
    return zip(schema.names, type_names)
def read(cls, path, engine, columns, **kwargs):
    """Load a parquet object from the file path, returning a DataFrame.

    Ray DataFrame only supports pyarrow engine for now.

    Args:
        path: The filepath of the parquet file or a directory of them.
              We only support local files for now.
        engine: Ray only support pyarrow reader.
                This argument doesn't do anything for now.
        columns: Columns to read; when falsy, all non-index columns of the
                 schema are used.
        kwargs: Pass into parquet's read_pandas function.

    Notes:
        ParquetFile API is used. Please refer to the documentation here
        https://arrow.apache.org/docs/python/parquet.html

        For a directory, ``path`` is rebound to the first non-hidden file
        found by the walk. A directory with partition sub-directories is
        delegated to ``single_worker_read`` with the original path.
    """
    from pyarrow.parquet import ParquetFile, ParquetDataset
    from modin.pandas.io import PQ_INDEX_REGEX

    directory = os.path.isdir(path)
    if directory:
        original_path = path
        partitioned_columns = set()
        # Partitioned parquet directories have a unique column at each
        # directory level, so a tree walk (os.walk does a dfs search)
        # visits the columns the data is partitioned on.
        for root, dir_names, files in os.walk(path):
            if dir_names:
                partitioned_columns.add(dir_names[0].split("=")[0])
            if files:
                # Metadata files, git files, .DSStore
                if files[0][0] == ".":
                    continue
                path = os.path.join(root, files[0])
                break
        if partitioned_columns:
            ErrorMessage.default_to_pandas("Partitioned Columns in Parquet")
            return cls.single_worker_read(original_path,
                                          engine=engine,
                                          columns=columns,
                                          **kwargs)

    if not columns:
        if directory:
            # Path of the sample file that we will read to get the remaining columns
            dataset = ParquetDataset(path)
            column_names = dataset.schema.names
        else:
            column_names = ParquetFile(path).metadata.schema.names
        columns = [
            name for name in column_names if not PQ_INDEX_REGEX.match(name)
        ]
    return cls.build_query_compiler(path, columns, **kwargs)
def read_parquet(cls, path, engine, columns, **kwargs):
    """Load a parquet object from the file path, returning a query compiler.

    Ray DataFrame only supports pyarrow engine for now. When the class has
    no ``read_parquet_remote_task`` the parent implementation is used.

    Args:
        path: The filepath of the parquet file.
              We only support local files for now.
        engine: Ray only support pyarrow reader.
                This argument doesn't do anything for now.
        columns: Columns to read; when falsy, all non-index columns of the
                 schema are used.
        kwargs: Pass into parquet's read_pandas function.

    Notes:
        ParquetFile API is used. Please refer to the documentation here
        https://arrow.apache.org/docs/python/parquet.html
    """
    from pyarrow.parquet import ParquetFile

    if cls.read_parquet_remote_task is None:
        return super(RayIO, cls).read_parquet(path, engine, columns, **kwargs)

    if not columns:
        schema_names = ParquetFile(path).metadata.schema.names
        columns = [
            name for name in schema_names if not PQ_INDEX_REGEX.match(name)
        ]

    num_partitions = cls.frame_mgr_cls._compute_num_partitions()
    num_splits = min(len(columns), num_partitions)

    # Number of column names per group: ceil(len(columns) / num_partitions).
    whole, leftover = divmod(len(columns), num_partitions)
    column_splits = whole + 1 if leftover else whole
    # Each item in this list will be a list of column names of the original df
    col_partitions = []
    for start in range(0, len(columns), column_splits):
        col_partitions.append(columns[start:start + column_splits])

    # Each remote call covers one column group, partitioned into smaller
    # pieces along rows. The oids array is transposed to fit our schema.
    object_ids = [
        cls.read_parquet_remote_task._remote(
            args=(path, cols, num_splits, kwargs),
            num_return_vals=num_splits + 1,
        ) for cols in col_partitions
    ]
    blk_partitions = np.array(object_ids).T

    # The last row is not data: its first entry resolves to the index length.
    remote_partitions = np.array(
        [[cls.frame_partition_cls(obj) for obj in row]
         for row in blk_partitions[:-1]])
    index_len = ray.get(blk_partitions[-1][0])
    index = pandas.RangeIndex(index_len)

    return cls.query_compiler_cls(cls.frame_mgr_cls(remote_partitions),
                                  index, columns)
def parquet_reader(filename):
    """
    Reader interface for a single Parquet file

    Parameters:
        filename (str): The teacher parquet file name

    Returns:
        parquet (obj): ParquetFile object opened on ``filename``
    """
    parquet = ParquetFile(source=filename)
    return parquet
def __init__(self,
             parquets,
             img_root='',
             past=0,
             future=0,
             stride=1,
             cameras=None,
             transform=None,
             load_from_azure=False):
    """Build one ``SingleWayveDataset`` per usable row group and combine them.

    Args:
        parquets: Iterable of parquet sources, each opened with
            ``ParquetFile(..., memory_map=False)``.
        img_root: Passed through to every ``SingleWayveDataset``.
        past: Passed through; also used in the minimum-length filter.
        future: Passed through; also used in the minimum-length filter.
        stride: Passed through; also used in the minimum-length filter.
        cameras: Camera names; a ``<camera>_image_timestamp_rgb`` column is
            read for each. Defaults to ``['front-forward']``.
        transform: Passed through to every ``SingleWayveDataset``.
        load_from_azure: When true, an ``AzureImageLoader`` is created and
            shared by all datasets; otherwise ``None`` is passed.
    """
    # A fresh list per call: the previous mutable default was one shared
    # object handed to every dataset instance.
    if cameras is None:
        cameras = ['front-forward']

    columns = [
        'speed_state',
        'curvature_invm_state',
        'run_id_noseginfix',
    ] + [cam + '_image_timestamp_rgb' for cam in cameras]

    # for loading images from azure blob storage
    azure_loader = AzureImageLoader() if load_from_azure else None

    # open each parquet file and construct one dataset per row group
    datasets = []
    count = 0
    for i, parquet in enumerate(parquets):
        pqfile = ParquetFile(parquet, memory_map=False)
        num_row_groups = pqfile.metadata.num_row_groups
        for j in range(num_row_groups):
            # Progress report every 100 row groups.
            if count % 100 == 0:
                print('initializing parquet %d/%d run %d/%d' %
                      (i + 1, len(parquets), j + 1, num_row_groups))
            dataframe = pqfile.read_row_group(j, columns=columns)
            # Keep only row groups with more than (past + 1 + future) * stride rows.
            if len(dataframe) > (past + 1 + future) * stride:
                datasets.append(
                    SingleWayveDataset(dataframe, img_root, past, future,
                                       stride, cameras, transform,
                                       azure_loader))
            count += 1

    super().__init__(datasets)
def pyarrow_read(source):
    """Pretty-print the parquet metadata of ``source``.

    Args:
        source: Either the filename or an Arrow file handle (which could be
            on HDFS). It is passed to ``ParquetFile`` with ``pre_buffer=True``.
    """
    from pyarrow.parquet import ParquetFile
    import pprint

    # TODO: figure out how to read from s3 directly
    # The former `"s3://" in source` probe only filled a dict that was never
    # passed on, and it misbehaves on file handles (a substring test on a
    # non-string either raises or iterates the stream), so it was removed.
    m = ParquetFile(source, pre_buffer=True).metadata
    pp = pprint.PrettyPrinter(indent=4)
    pp.pprint(m)
def test_rowgroup_writing(store, use_categorical, chunk_size):
    """A four-row frame stored with ``chunk_size=2`` yields two row groups."""
    df = pd.DataFrame({"string": ["abc", "affe", "banane", "buchstabe"]})
    serialiser = ParquetSerializer(chunk_size=2)

    # Arrow 0.9.0 has a bug in writing categorical columns to more than a
    # single RowGroup: "ArrowIOError: Column 2 had 2 while previous column
    # had 4". The pandas-serialiser has special handling for that, which
    # should be removed once we switch to 0.10.0.
    df_write = df.astype({"string": "category"}) if use_categorical else df

    key = serialiser.store(store, "prefix", df_write)
    stored_file = ParquetFile(store.open(key))
    assert stored_file.num_row_groups == 2
def test_predicate_accept_in(store, predicate_value, expected):
    """``_predicate_accepts`` gives ``expected`` for an ``in`` predicate on column A."""
    # Column A spans min = 0, max = 29
    df = pd.DataFrame({"A": [0, 4, 13, 29]})
    serialiser = ParquetSerializer(chunk_size=None)
    key = serialiser.store(store, "prefix", df)

    parquet_file = ParquetFile(store.open(key))
    accepted = _predicate_accepts(
        ("A", "in", predicate_value),
        row_meta=parquet_file.metadata.row_group(0),
        arrow_schema=parquet_file.schema.to_arrow_schema(),
        parquet_reader=parquet_file.reader,
    )
    assert accepted == expected
def assert_num_row_groups(store, dataset, part_num_rows, part_chunk_size):
    """
    Assert that every partition file has the expected number of row groups.

    For each value of index "p", a chunk size of ``None`` means one row
    group; otherwise ``ceil(part_num_rows / part_chunk_size)`` row groups
    are expected.
    """
    # Iterate over the partitions of each index value
    for index, partitions in dataset.indices["p"].index_dct.items():
        for part_key in partitions:
            table_key = dataset.partitions[part_key].files["table"]
            parquet_file = ParquetFile(store.open(table_key))

            chunk_size = part_chunk_size[index]
            if chunk_size is None:
                expected = 1
            else:
                expected = math.ceil(part_num_rows[index] / chunk_size)
            assert parquet_file.num_row_groups == expected
def _read_parquet_pandas_on_ray(path, engine, columns, **kwargs):
    """Read a parquet file into a DataFrame, one remote task per column group.

    Args:
        path: Location of the parquet file.
        engine: Not used by this function.
        columns: Column names to read. When falsy, every schema name that
            does not match ``PQ_INDEX_REGEX`` is read.
        kwargs: Forwarded unchanged to each remote read task.

    Returns:
        A DataFrame backed by a ``PandasQueryCompiler`` over the remote
        partitions, with a ``RangeIndex``.
    """
    from pyarrow.parquet import ParquetFile

    if not columns:
        schema_names = ParquetFile(path).metadata.schema.names
        columns = [
            name for name in schema_names if not PQ_INDEX_REGEX.match(name)
        ]

    num_partitions = RayBlockPartitions._compute_num_partitions()
    num_splits = min(len(columns), num_partitions)

    # Number of column names per group: ceil(len(columns) / num_partitions).
    whole, leftover = divmod(len(columns), num_partitions)
    column_splits = whole + 1 if leftover else whole
    # Each item in this list will be a list of column names of the original df
    col_partitions = []
    for start in range(0, len(columns), column_splits):
        col_partitions.append(columns[start:start + column_splits])

    # Each remote call covers one column group, partitioned into smaller
    # pieces along rows. The oids array is transposed to fit our schema.
    object_ids = [
        _read_parquet_columns._remote(
            args=(path, cols, num_splits, kwargs),
            num_return_vals=num_splits + 1,
        )
        for cols in col_partitions
    ]
    blk_partitions = np.array(object_ids).T

    # The last row is not data: its first entry resolves to the index length.
    remote_partitions = np.array(
        [
            [PandasOnRayRemotePartition(obj) for obj in row]
            for row in blk_partitions[:-1]
        ]
    )
    index_len = ray.get(blk_partitions[-1][0])
    index = pandas.RangeIndex(index_len)

    new_manager = PandasQueryCompiler(
        RayBlockPartitions(remote_partitions), index, columns
    )
    return DataFrame(query_compiler=new_manager)
def test_persist_messages(input_messages_1, expected_df_1):
    """Persist the input messages to parquet and compare with the expected frame.

    The messages are written into a timestamped ``test_<timestamp>``
    directory, the first parquet file found there is read back, and the
    directory is removed before the comparison.
    """
    # content of test_persist.expected.pkl based on : [{"CAD":1.3171828596,"HKD":7.7500212134,"ISK":138.6508273229,"PHP":48.5625795503,"DKK":6.3139584217,"HUF":309.7581671616,"CZK":23.2040729741,"GBP":0.7686720407,"RON":4.1381417056,"SEK":8.7889690284,"IDR":14720.101824353,"INR":73.3088672041,"BRL":5.6121340687,"RUB":77.5902418328,"HRK":6.4340263046,"JPY":105.311837081,"THB":31.1803139584,"CHF":0.9099703012,"EUR":0.8485362749,"MYR":4.1424692406,"BGN":1.6595672465,"TRY":7.8962240136,"CNY":6.6836656767,"NOK":9.2889266016,"NZD":1.5062367416,"ZAR":16.4451421298,"USD":1.0,"MXN":21.0537123462,"SGD":1.3568095036,"AUD":1.4064488757,"ILS":3.3802291048,"KRW":1138.1671616462,"PLN":3.8797624098,"date":"2020-10-19T00:00:00Z"},{"CAD":1.3171828596,"HKD":7.7500212134,"ISK":138.6508273229,"PHP":48.5625795503,"DKK":6.3139584217,"HUF":309.7581671616,"CZK":23.2040729741,"GBP":0.7686720407,"RON":4.1381417056,"SEK":8.7889690284,"IDR":14720.101824353,"INR":73.3088672041,"BRL":5.6121340687,"RUB":77.5902418328,"HRK":6.4340263046,"JPY":105.311837081,"THB":31.1803139584,"CHF":0.9099703012,"EUR":0.8485362749,"MYR":4.1424692406,"BGN":1.6595672465,"TRY":7.8962240136,"CNY":6.6836656767,"NOK":9.2889266016,"NZD":1.5062367416,"ZAR":16.4451421298,"USD":1.0,"MXN":21.0537123462,"SGD":1.3568095036,"AUD":1.4064488757,"ILS":3.3802291048,"KRW":1138.1671616462,"PLN":3.8797624098,"date":"2020-10-19T00:00:00Z"}]
    timestamp = datetime.utcnow().strftime("%Y%m%d_%H%M%S")
    raw_bytes = io.BytesIO(input_messages_1.encode())
    input_messages = io.TextIOWrapper(raw_bytes, encoding="utf-8")

    persist_messages(input_messages, f"test_{timestamp}")

    written_files = list(glob.glob(f"test_{timestamp}/*.parquet"))
    df = ParquetFile(written_files[0]).read().to_pandas()

    # Remove the output before asserting so a failed comparison leaves
    # nothing behind.
    for written in written_files:
        os.remove(written)
    os.rmdir(f"test_{timestamp}")

    assert_frame_equal(df, expected_df_1)
def _get_partition_bounds_parquet(part, fs):
    """
    Get the partition bounds for a part gathered by dask, if available.

    Returns ``None`` when ``part`` has no "piece" entry, when the piece path
    is not a string, or when the file carries no metadata.
    """
    from pyarrow.parquet import ParquetFile

    if "piece" not in part:
        return None
    path = part["piece"][0]
    if not isinstance(path, str):
        return None

    # Read the metadata from the actual file (this is file IO again, but the
    # schema metadata cannot be relied on because it only describes the
    # first piece).
    with fs.open(path, "rb") as f:
        pq_metadata = ParquetFile(f).metadata

    if pq_metadata is None:
        return None
    return _get_partition_bounds(pq_metadata.metadata)
def image_data_generator_application(train_hparams, model_hparams, pipeline_name):
    """Build, optionally warm-start, and train a ``MultiOutputApplication`` model from parquet generators.

    The function prints its arguments, derives the model file and log
    directory from ``settings`` and the hparam keys (creating both
    directories), builds and compiles the model, tries to load existing
    weights, creates train/valid/test generators over parquet file globs and
    calls ``model.fit`` on the train and valid generators.

    Args:
        train_hparams: Training hyper-parameters; ``'batch_size'`` and
            ``'epochs'`` are read here, the whole dict is passed to
            ``callbacks``.
        model_hparams: Model hyper-parameters, expanded into the model
            constructor and passed to ``model_compile``.
        pipeline_name: Name used in the model file and log directory paths.

    Returns:
        Tuple ``(model, model_stats, output_shape)``.
    """
    print("pipeline_name", pipeline_name)
    print("train_hparams", train_hparams)
    print("model_hparams", model_hparams)

    model_hparams_key = hparam_key(model_hparams)
    train_hparams_key = hparam_key(train_hparams)

    # csv_data = pd.read_csv(f"{settings['dir']['data']}/train.csv")
    model_file = f"{settings['dir']['models']}/{pipeline_name}/{pipeline_name}-{model_hparams_key}.hdf5"
    log_dir = f"{settings['dir']['logs']}/{pipeline_name}/{model_hparams_key}/{train_hparams_key}"

    os.makedirs(os.path.dirname(model_file), exist_ok=True)
    os.makedirs(log_dir, exist_ok=True)

    # NOTE(review): dataset_rows is not read again in this function.
    dataset_rows = ParquetFile(f"{settings['dir']['data']}/train_image_data_0.parquet").metadata.num_rows
    # The dataset is only used for its input/output shapes and for callbacks().
    dataset = DatasetDF(size=1)
    input_shape = dataset.input_shape()
    output_shape = dataset.output_shape()
    model = MultiOutputApplication(
        input_shape=input_shape,
        output_shape=output_shape,
        **model_hparams,
    )
    model_compile(model_hparams, model, output_shape)

    # Load Pre-existing weights
    # Best effort: a failed load is printed and training continues.
    if os.path.exists( model_file ):
        try:
            model.load_weights( model_file )
            print('Loaded Weights: ', model_file)
        except Exception as exception: print('exception', exception)

    # On Kaggle, also try every matching weights file found under ../input.
    if os.environ.get('KAGGLE_KERNEL_RUN_TYPE'):
        load_models = (glob2.glob(f'../input/**/{os.path.basename(model_file)}')
                     + glob2.glob(f'../input/**/{os.path.basename(model_file)}'.replace('=','')))  # Kaggle Dataset Upload removes '='
        for load_model in load_models:
            try:
                model.load_weights( load_model )
                print('Loaded Weights: ', load_model)
                # break
            except Exception as exception: print('exception', exception)

    model.summary()

    # Source: https://www.kaggle.com/jamesmcguigan/bengali-ai-image-processing
    # Augmentation settings; only the train generator receives them.
    datagen_args = {
        # "rescale":          1./255,  # "normalize": True is default in Transforms
        "zoom_range":         0.2,
        "width_shift_range":  0.1,     # we already have centering
        "height_shift_range": 0.1,     # we already have centering
        "rotation_range":     45/2,
        "shear_range":        45/2,
        # "brightness_range":   0.5,   # Prebrightness normalized
        "fill_mode":         'constant',
        "cval": 0,
        # "featurewise_center": True,             # No visible effect in plt.imgshow()
        # "samplewise_center": True,              # No visible effect in plt.imgshow()
        # "featurewise_std_normalization": True,  # No visible effect in plt.imgshow() | requires .fit()
        # "samplewise_std_normalization": True,   # No visible effect in plt.imgshow() | requires .fit()
        # "zca_whitening": True,                  # Kaggle, insufficient memory
    }
    # Per-split keyword arguments for flow_from_parquet(); valid and test
    # start from the train settings and override individual keys.
    flow_args = {}
    flow_args['train'] = {
        "transform_X":      Transforms.transform_X,
        "transform_X_args": {},  # "normalize": True is default in Transforms
        "transform_Y":      Transforms.transform_Y,
        "batch_size":       train_hparams['batch_size'],
        "reads_per_file":   3,
        "resamples":        1,
        "shuffle":          True,
        "infinite":         True,
    }
    flow_args['valid'] = {
        **flow_args['train'],
        "resamples":  1,
    }
    flow_args['test'] = {
        **flow_args['train'],
        "resamples":  1,
        "shuffle":    False,
        "infinite":   False,
        "test":       True,
    }

    datagens = {
        "train": ParquetImageDataGenerator(**datagen_args),
        "valid": ParquetImageDataGenerator(),
        "test":  ParquetImageDataGenerator(),
    }
    # [ datagens[key].fit(train_batch) for key in datagens.keys() ]  # Not required

    fileglobs = {
        "train": f"{settings['dir']['data']}/train_image_data_[123].parquet",
        "valid": f"{settings['dir']['data']}/train_image_data_0.parquet",
        "test":  f"{settings['dir']['data']}/test_image_data_*.parquet",
    }
    if os.environ.get('KAGGLE_KERNEL_RUN_TYPE'):
        # For the Kaggle Submission, train on all available data and rely on Kaggle Timeout
        fileglobs["train"] = f"{settings['dir']['data']}/train_image_data_*.parquet"

    generators = {
        key: datagens[key].flow_from_parquet(value, **flow_args[key])
        for key,value in fileglobs.items()
    }
    # Mean number of rows per file for each split, read from parquet metadata.
    dataset_rows_per_file = {
        key: np.mean([ ParquetFile(filename).metadata.num_rows for filename in glob2.glob(fileglobs[key]) ])
        for key in fileglobs.keys()
    }
    # NOTE(review): dataset_rows_total is not read again in this function.
    dataset_rows_total = {
        key: sum([ ParquetFile(filename).metadata.num_rows for filename in glob2.glob(fileglobs[key]) ])
        for key in fileglobs.keys()
    }

    ### Epoch: train == one whole parquet files | valid = 1 filesystem read
    steps_per_epoch  = int(dataset_rows_per_file['train'] / flow_args['train']['batch_size'] * flow_args['train']['resamples'] )
    validation_steps = int(dataset_rows_per_file['valid'] / flow_args['valid']['batch_size'] / flow_args['train']['reads_per_file'] )
    callback         = callbacks(train_hparams, dataset, model_file, log_dir, best_only=True, verbose=1)

    # Wall-clock duration of fit() is passed on to model_stats_from_history().
    timer_start = time.time()
    history = model.fit(
        generators['train'],
        validation_data  = generators['valid'],
        epochs           = train_hparams['epochs'],
        steps_per_epoch  = steps_per_epoch,
        validation_steps = validation_steps,
        verbose          = 2,
        callbacks        = callback
    )
    timer_seconds = int(time.time() - timer_start)
    model_stats   = model_stats_from_history(history, timer_seconds, best_only=True)

    return model, model_stats, output_shape
def _read_parquet_row_group(path, columns, row_group_id, kwargs=None):
    """Read one parquet row group of ``path`` into a pandas DataFrame.

    Args:
        path: Source passed to ``ParquetFile``.
        columns: Column names to read from the row group.
        row_group_id: Index of the row group to read.
        kwargs: Optional dict of extra keyword arguments for
            ``read_row_group``; ``None`` means no extra arguments.

    Returns:
        The row group converted with ``to_pandas()``.
    """
    # None replaces the former mutable ``{}`` default.
    if kwargs is None:
        kwargs = {}
    pf = ParquetFile(path)
    df = pf.read_row_group(row_group_id, columns=columns, **kwargs).to_pandas()
    return df
def get_table_column_names_and_types(
        self, config: RepoConfig) -> Iterable[Tuple[str, str]]:
    """Return ``(column name, type string)`` pairs from the arrow schema of ``self.path``.

    ``config`` is not used here.
    """
    arrow_schema = ParquetFile(self.path).schema_arrow
    type_names = map(str, arrow_schema.types)
    return zip(arrow_schema.names, type_names)
def restore_dataframe(
    store,
    key,
    filter_query=None,
    columns=None,
    predicate_pushdown_to_io=True,
    categories=None,
    predicates=None,
    date_as_object=False,
):
    """Read the parquet blob stored under ``key`` and return it as a pandas DataFrame.

    Args:
        store: Store the blob is fetched from (``store.get`` / ``store.open``).
        key: Key of the blob inside ``store``.
        filter_query: Passed to ``filter_df`` when no predicates are given.
        columns: Columns to return; ``None`` returns all columns. The result
            is reordered to this list.
        predicate_pushdown_to_io: When false, the whole blob is always read
            in one go instead of reading row groups selectively.
        categories: Forwarded to ``Table.to_pandas``.
        predicates: Validated with ``check_predicates``; used for row-group
            selection and for filtering the resulting frame.
        date_as_object: Forwarded to ``to_pandas`` and, as
            ``strict_date_types``, to ``filter_df_from_predicates``.

    Raises:
        ValueError: if a requested column is missing from the stored table.
    """
    check_predicates(predicates)
    # If we want to do columnar access we can benefit from partial reads
    # otherwise full read en block is the better option.
    if (not predicate_pushdown_to_io) or (columns is None and predicates is None):
        with pa.BufferReader(store.get(key)) as reader:
            table = pq.read_pandas(reader, columns=columns)
    else:
        if HAVE_BOTO and isinstance(store, BotoStore):
            # Parquet and seeks on S3 currently leak connections thus
            # we omit column projection to the store.
            reader = pa.BufferReader(store.get(key))
        else:
            reader = store.open(key)
            # Buffer at least 4 MB in requests. This is chosen because the default block size of the Azure
            # storage client is 4MB.
            reader = BlockBuffer(reader, 4 * 1024 * 1024)
        try:
            parquet_file = ParquetFile(reader)
            if predicates and parquet_file.metadata.num_rows > 0:
                # We need to calculate different predicates for predicate
                # pushdown and the later DataFrame filtering. This is required
                # e.g. in the case where we have an `in` predicate as this has
                # different normalized values.
                columns_to_io = _columns_for_pushdown(columns, predicates)
                predicates_for_pushdown = _normalize_predicates(
                    parquet_file, predicates, True)
                predicates = _normalize_predicates(parquet_file, predicates,
                                                   False)
                tables = _read_row_groups_into_tables(
                    parquet_file, columns_to_io, predicates_for_pushdown)

                # No table came back: build an empty table from the file's
                # schema.
                if len(tables) == 0:
                    if ARROW_LARGER_EQ_0130:
                        table = parquet_file.schema.to_arrow_schema(
                        ).empty_table()
                    else:
                        table = _empty_table_from_schema(parquet_file)
                else:
                    table = pa.concat_tables(tables)
            else:
                # ARROW-5139 Column projection with empty columns returns a table w/out index
                if ARROW_LARGER_EQ_0130 and columns == []:
                    # Create an arrow table with expected index length.
                    df = (parquet_file.schema.to_arrow_schema().
                          empty_table().to_pandas(
                              date_as_object=date_as_object))
                    index = pd.Int64Index(
                        pd.RangeIndex(start=0,
                                      stop=parquet_file.metadata.num_rows))
                    df = pd.DataFrame(df, index=index)
                    # convert back to table to keep downstream code untouched by this patch
                    table = pa.Table.from_pandas(df)
                else:
                    table = pq.read_pandas(reader, columns=columns)
        finally:
            # The reader of this branch is closed on every exit path.
            reader.close()

    table = _fix_pyarrow_07992_table(table)
    table = _fix_pyarrow_0130_table(table)

    if columns is not None:
        missing_columns = set(columns) - set(table.schema.names)
        if missing_columns:
            raise ValueError(
                "Columns cannot be found in stored dataframe: {missing}".
                format(missing=", ".join(sorted(missing_columns))))

    df = table.to_pandas(categories=categories, date_as_object=date_as_object)
    df.columns = df.columns.map(ensure_unicode_string_type)
    # Predicates take precedence: filter_query is only applied without them.
    if predicates:
        df = filter_df_from_predicates(df,
                                       predicates,
                                       strict_date_types=date_as_object)
    else:
        df = filter_df(df, filter_query)
    if columns is not None:
        # Select and order the columns as requested by the caller.
        return df.loc[:, columns]
    else:
        return df
def read_schema(self) -> ParquetSchema:
    """Open the parquet file at ``self.path`` and return its schema."""
    parquet_file = ParquetFile(self.path)
    return parquet_file.schema
def __init__(self,
             path,
             key=None,
             secret=None,
             endpoint=None,
             proxy=None,
             proxy_port=None,
             filesystem=None):
    """Collect the carbon splits of ``path`` as dataset pieces and read its schema.

    :param path: Dataset location, or a ``.manifest`` file whose first source
        is used as the schema file.
    :param key: Access key; required when the url scheme is ``s3a``.
    :param secret: Secret key; required when the url scheme is ``s3a``.
    :param endpoint: Endpoint; required when the url scheme is ``s3a``.
    :param proxy: Proxy host; must be given together with ``proxy_port``.
    :param proxy_port: Proxy port; must be given together with ``proxy``.
    :param filesystem: Filesystem to use; when ``None`` it is derived from
        ``path`` (from its first element when ``path`` is a list).
    :raises ValueError: for ``s3a`` paths when key, secret or endpoint is
        missing, or when only one of proxy / proxy_port is given.
    :raises Exception: when the manifest has no sources or the schema cannot
        be read.
    """
    self.path = path
    self.url_path = urlparse(path)

    if str(path).endswith(".manifest"):
        self.manifest_path = path
        # Strip the local-file prefix from the manifest path.
        if str(path).startswith(LOCAL_FILE_PREFIX):
            self.manifest_path = str(path)[len(LOCAL_FILE_PREFIX):]

    if filesystem is None:
        a_path = self.path
        if isinstance(a_path, list):
            a_path = a_path[0]
        self.fs = _get_fs_from_path(a_path)
    else:
        self.fs = _ensure_filesystem(filesystem)

    self.pieces = list()

    if self.url_path.scheme == 's3a':
        if key is None or secret is None or endpoint is None:
            raise ValueError('key, secret, endpoint should not be None')

        # The same credentials are set twice: on the reader builder to list
        # the splits, and on self.configuration, which is used below to read
        # the schema.
        if proxy is None and proxy_port is None:
            carbon_splits = ArrowCarbonReader().builder(self.path) \
                .withHadoopConf("fs.s3a.access.key", key) \
                .withHadoopConf("fs.s3a.secret.key", secret) \
                .withHadoopConf("fs.s3a.endpoint", endpoint) \
                .getSplits(True)

            configuration = Configuration()
            configuration.set("fs.s3a.access.key", key)
            configuration.set("fs.s3a.secret.key", secret)
            configuration.set("fs.s3a.endpoint", endpoint)

            self.configuration = configuration

        elif proxy is not None and proxy_port is not None:
            carbon_splits = ArrowCarbonReader().builder(self.path) \
                .withHadoopConf("fs.s3a.access.key", key) \
                .withHadoopConf("fs.s3a.secret.key", secret) \
                .withHadoopConf("fs.s3a.endpoint", endpoint) \
                .withHadoopConf("fs.s3a.proxy.host", proxy) \
                .withHadoopConf("fs.s3a.proxy.port", proxy_port) \
                .getSplits(True)

            configuration = Configuration()
            configuration.set("fs.s3a.access.key", key)
            configuration.set("fs.s3a.secret.key", secret)
            configuration.set("fs.s3a.endpoint", endpoint)
            configuration.set("fs.s3a.proxy.host", proxy)
            configuration.set("fs.s3a.proxy.port", proxy_port)

            self.configuration = configuration
        else:
            raise ValueError('wrong proxy & proxy_port configuration')

        if str(path).endswith(".manifest"):
            from obs import ObsClient
            obsClient = ObsClient(access_key_id=key,
                                  secret_access_key=secret,
                                  server=str(endpoint).replace(
                                      'http://', ''),
                                  long_conn_mode=True)
            sources = manifest.getSources(self.manifest_path, CARBON,
                                          obsClient)
            # Only the first manifest source is used as the schema file.
            if sources:
                self.file_path = sources[0]
            else:
                raise Exception("Manifest source can't be None!")
            carbon_schema = CarbonSchemaReader().readSchema(
                self.file_path, self.configuration.conf)
        else:
            carbon_schema = CarbonSchemaReader().readSchema(
                self.path, self.configuration.conf)

        for split in carbon_splits:
            # split = self.url_path.scheme + "://" + self.url_path.netloc + split
            folder_path = path
            if str(path).endswith(".manifest"):
                # Folder of the schema file: everything before the last '/'.
                folder_path = str(
                    self.file_path)[0:(str(self.file_path).rindex('/'))]
            self.pieces.append(
                CarbonDatasetPiece(folder_path,
                                   carbon_schema,
                                   split,
                                   key=key,
                                   secret=secret,
                                   endpoint=endpoint,
                                   proxy=proxy,
                                   proxy_port=proxy_port))

    else:
        if str(path).endswith(".manifest"):
            sources = manifest.getSources(self.manifest_path, CARBON)
            if sources:
                self.file_path = sources[0]
            else:
                raise Exception("Manifest source can't be None!")

            # NOTE(review): the bare except turns every failure (including
            # KeyboardInterrupt) into a generic Exception without the cause.
            try:
                carbon_schema = CarbonSchemaReader().readSchema(
                    self.file_path)
            except:
                raise Exception("readSchema has some errors: " +
                                self.file_path)
        else:
            try:
                carbon_schema = CarbonSchemaReader().readSchema(self.path)
            except:
                raise Exception("readSchema has some errors")

        carbon_splits = ArrowCarbonReader().builder(self.path) \
            .getSplits(True)

        for split in carbon_splits:
            # split = self.url_path.scheme + "://" + self.url_path.netloc + split
            if str(path).endswith(".manifest"):
                # Pieces are rooted at the folder of the schema file.
                self.pieces.append(
                    CarbonDatasetPiece(
                        str(self.file_path)[0:(
                            str(self.file_path).rindex('/'))],
                        carbon_schema, split))
            else:
                self.pieces.append(
                    CarbonDatasetPiece(path, carbon_schema, split))

    self.number_of_splits = len(self.pieces)
    self.schema = self.getArrowSchema()
    # TODO add mechanism to get the file path based on file filter
    self.common_metadata_path = self.url_path.path + '/_common_metadata'
    self.common_metadata = None
    # Best effort: any failure while probing or reading _common_metadata
    # leaves common_metadata as None.
    try:
        if self.fs.exists(self.common_metadata_path):
            with self.fs.open(self.common_metadata_path) as f:
                self.common_metadata = ParquetFile(f).metadata
    except:
        self.common_metadata = None
def image_data_generator_cnn(train_hparams: Dict,
                             model_hparams: Dict,
                             transform_X_args: Dict,
                             transform_Y_args: Dict,
                             datagen_args: Dict,
                             pipeline_name='image_data_generator_cnn',
                             model_file=None,
                             log_dir=None,
                             verbose=2,
                             load_weights=True,
                             fileglobs=None):
    """Build a MultiOutputCNN, optionally load saved weights, and fit it on parquet data.

    :param train_hparams: Training hyperparameters; merged over
        ``settings['hparam_defaults']``. ``batch_size`` and ``epochs`` are read.
    :param model_hparams: Forwarded as keyword arguments to ``MultiOutputCNN`` and
        to ``model_compile``; also used to derive the default weights filename.
    :param transform_X_args: Passed to ``DatasetDF`` and to every generator flow.
    :param transform_Y_args: Passed to ``DatasetDF`` and to every generator flow.
    :param datagen_args: Keyword arguments for the *train* ``ParquetImageDataGenerator``.
    :param pipeline_name: Used in the default ``model_file`` and ``log_dir`` paths.
    :param model_file: Weights file path; derived from ``settings`` when falsy.
    :param log_dir: Log directory; derived from ``settings`` when falsy.
    :param verbose: When truthy, prints the configuration and the model summary;
        also forwarded to ``model.fit``.
    :param load_weights: When truthy, tries to load weights from ``model_file``
        (and, on Kaggle, from matching files under ``../input``).
    :param fileglobs: Optional mapping overriding the default ``train`` / ``valid`` /
        ``test`` parquet globs. ``None`` (the default) means no overrides.
    :return: Tuple ``(model, model_stats, output_shape)``.
    """
    # NOTE: combined_hparams deliberately uses the caller's train_hparams,
    # before the defaults are merged in on the next line.
    combined_hparams = {
        **model_hparams,
        **train_hparams,
        **transform_X_args,
        **transform_Y_args,
        **datagen_args,
    }
    train_hparams = {**settings['hparam_defaults'], **train_hparams}
    if verbose:
        print('-----')
        print("pipeline_name", pipeline_name)
        print("train_hparams", train_hparams)
        print("transform_X_args", transform_X_args)
        print("transform_Y_args", transform_Y_args)
        print("datagen_args", datagen_args)
        print("model_file", model_file)
        print("log_dir", log_dir)
        print("load_weights", load_weights)
        print('-----')

    model_hparams_key = hparam_key(model_hparams)
    transform_key = hparam_key(
        ChainMap(*[transform_X_args, transform_Y_args, datagen_args]))

    model_file = model_file or f"{settings['dir']['models']}/{pipeline_name}/{pipeline_name}-{model_hparams_key}.hdf5"
    log_dir = log_dir or f"{settings['dir']['logs']}/{pipeline_name}/{transform_key}/"

    os.makedirs(os.path.dirname(model_file), exist_ok=True)
    os.makedirs(log_dir, exist_ok=True)

    # A size=1 dataset is built only to discover the input/output shapes.
    dataset = DatasetDF(
        size=1,
        transform_X_args=transform_X_args,
        transform_Y_args=transform_Y_args,
    )
    input_shape = dataset.input_shape()
    output_shape = dataset.output_shape()
    model = MultiOutputCNN(
        input_shape=input_shape,
        output_shape=output_shape,
        **model_hparams,
    )
    model_compile(model_hparams, model, output_shape)

    # Load pre-existing weights. Failures are reported but never abort training.
    if load_weights:
        if os.path.exists(model_file):
            try:
                model.load_weights(model_file)
                print('Loaded Weights: ', model_file)
            except Exception as exception:
                print('exception', exception)

        if os.environ.get('KAGGLE_KERNEL_RUN_TYPE'):
            weights_pattern = f'../input/**/{os.path.basename(model_file)}'
            load_models = (
                glob2.glob(weights_pattern)
                + glob2.glob(weights_pattern.replace('=', ''))
            )  # Kaggle Dataset Upload removes '='
            for load_model in load_models:
                try:
                    model.load_weights(load_model)
                    print('Loaded Weights: ', load_model)
                    # No break: every match is tried, so the last successful load wins.
                except Exception as exception:
                    print('exception', exception)

    if verbose:
        model.summary()

    flow_args = {}
    flow_args['train'] = {
        "transform_X": Transforms.transform_X,
        "transform_Y": Transforms.transform_Y,
        "transform_X_args": transform_X_args,
        "transform_Y_args": transform_Y_args,
        "batch_size": train_hparams['batch_size'],
        "reads_per_file": 2,
        "resamples": 1,
        "shuffle": True,
        "infinite": True,
    }
    flow_args['valid'] = {
        **flow_args['train'],
        "resamples": 1,
    }
    flow_args['test'] = {
        **flow_args['train'],
        "resamples": 1,
        "shuffle": False,
        "infinite": False,
        "test": True,
    }

    datagens = {
        "train": ParquetImageDataGenerator(**datagen_args),
        "valid": ParquetImageDataGenerator(),
        "test": ParquetImageDataGenerator(),
    }
    # [ datagens[key].fit(train_batch) for key in datagens.keys() ]  # Not required

    # Caller-supplied globs override the defaults key by key.
    fileglobs = {
        "train": f"{settings['dir']['data']}/train_image_data_[123].parquet",
        "valid": f"{settings['dir']['data']}/train_image_data_0.parquet",
        "test": f"{settings['dir']['data']}/test_image_data_*.parquet",
        **(fileglobs or {}),
    }
    ### Preserve test/train split for Kaggle
    # if os.environ.get('KAGGLE_KERNEL_RUN_TYPE'):
    #     # For the Kaggle Submission, train on all available data and rely on Kaggle Timeout
    #     fileglobs["train"] = f"{settings['dir']['data']}/train_image_data_*.parquet"

    generators = {
        key: datagens[key].flow_from_parquet(value, **flow_args[key])
        for key, value in fileglobs.items()
    }
    # Mean row count per matched file, read from parquet metadata only.
    dataset_rows_per_file = {
        key: np.mean([
            ParquetFile(filename).metadata.num_rows
            for filename in glob2.glob(fileglob)
        ])
        for key, fileglob in fileglobs.items()
    }

    ### Epoch: train == one whole parquet files | valid = 1 filesystem read
    steps_per_epoch = int(dataset_rows_per_file['train']
                          / flow_args['train']['batch_size']
                          * flow_args['train']['resamples'])
    validation_steps = int(dataset_rows_per_file['valid']
                           / flow_args['valid']['batch_size']
                           / flow_args['train']['reads_per_file'])

    callback = callbacks(combined_hparams,
                         dataset,
                         model_file,
                         log_dir,
                         best_only=True,
                         verbose=1)

    timer_start = time.time()
    history = model.fit(generators['train'],
                        validation_data=generators['valid'],
                        epochs=train_hparams['epochs'],
                        steps_per_epoch=steps_per_epoch,
                        validation_steps=validation_steps,
                        verbose=verbose,
                        callbacks=callback)
    timer_seconds = int(time.time() - timer_start)

    model_stats = model_stats_from_history(history, timer_seconds, best_only=True)
    return model, model_stats, output_shape
def read_parquet(cls, path, engine, columns, **kwargs):
    """Load a parquet object from the file path, returning a query compiler.

    Ray DataFrame only supports pyarrow engine for now.

    Args:
        path: The filepath of the parquet file or partitioned parquet directory.
            We only support local files for now.
        engine: Ray only support pyarrow reader. Only forwarded to the
            superclass reader when no remote task is configured.
        columns: Column names to read. When falsy, the names are taken from
            the parquet schema, minus those matching PQ_INDEX_REGEX.
            The caller's list is never modified.
        kwargs: Pass into parquet's read_pandas function.

    Returns:
        A ``cls.query_compiler_cls`` instance (or whatever the superclass
        ``read_parquet`` returns when ``cls.read_parquet_remote_task`` is None).

    Notes:
        ParquetFile API is used. Please refer to the documentation here
        https://arrow.apache.org/docs/python/parquet.html
    """
    from pyarrow.parquet import ParquetFile, ParquetDataset

    if cls.read_parquet_remote_task is None:
        return super(RayIO, cls).read_parquet(path, engine, columns, **kwargs)

    file_path = path
    if os.path.isdir(path):
        directory = True
        partitioned_columns = set()
        # We do a tree walk of the path directory because partitioned
        # parquet directories have a unique column at each directory level.
        # Thus, we can use os.walk(), which does a dfs search, to walk
        # through the different columns that the data is partitioned on
        for (root, dir_names, files) in os.walk(path):
            if dir_names:
                partitioned_columns.add(dir_names[0].split("=")[0])
            if files:
                # Metadata files, git files, .DSStore
                if files[0][0] == ".":
                    continue
                file_path = os.path.join(root, files[0])
                break
        partitioned_columns = list(partitioned_columns)
    else:
        directory = False

    if not columns:
        if directory:
            # Path of the sample file that we will read to get the remaining
            # columns.
            from pyarrow import ArrowIOError

            try:
                dataset = ParquetDataset(file_path)
            except ArrowIOError:
                dataset = ParquetDataset(path)
            column_names = dataset.schema.names
        else:
            parquet_file = ParquetFile(path)
            column_names = parquet_file.metadata.schema.names
        columns = [
            name for name in column_names if not PQ_INDEX_REGEX.match(name)
        ]

    # Cannot read in parquet file by only reading in the partitioned column.
    # Thus, we have to remove the partition columns from the columns to
    # ensure that when we do the math for the blocks, the partition column
    # will be read in along with a non partition column.
    if columns and directory and any(col in partitioned_columns
                                     for col in columns):
        columns = [col for col in columns if col not in partitioned_columns]
        # If all of the columns wanted are partition columns, return an
        # empty dataframe with the desired columns.
        if len(columns) == 0:
            return cls.query_compiler_cls.from_pandas(
                pandas.DataFrame(columns=partitioned_columns),
                block_partitions_cls=cls.frame_mgr_cls,
            )

    num_partitions = cls.frame_mgr_cls._compute_num_partitions()
    num_splits = min(len(columns), num_partitions)
    # Each item in this list will be a list of column names of the original df
    column_splits = (len(columns) // num_partitions
                     if len(columns) % num_partitions == 0 else
                     len(columns) // num_partitions + 1)
    col_partitions = [
        columns[i:i + column_splits]
        for i in range(0, len(columns), column_splits)
    ]
    column_widths = [len(c) for c in col_partitions]

    # Each item in this list will be a list of columns of original df
    # partitioned to smaller pieces along rows.
    # We need to transpose the oids array to fit our schema.
    # TODO (williamma12): This part can be parallelized even more if we
    # separate the partitioned parquet file code path from the default one.
    # The workers return multiple objects for each part of the file read:
    # - The first n - 2 objects are partitions of data
    # - The n - 1 object is the length of the partition.
    # - The nth object is the dtypes of the partition. We combine these to
    #   form the final dtypes below.
    # For a directory, the partition columns ride along with the last
    # column group so they are read together with a non-partition column.
    blk_partitions = np.array([
        cls.read_parquet_remote_task._remote(
            args=(
                path,
                cols + partitioned_columns
                if directory and cols == col_partitions[-1] else cols,
                num_splits,
                kwargs,
            ),
            num_return_vals=num_splits + 2,
        ) for cols in col_partitions
    ]).T

    # Metadata
    index_len = ray.get(blk_partitions[-2][0])
    index = pandas.RangeIndex(index_len)
    index_chunksize = compute_chunksize(pandas.DataFrame(index=index),
                                        num_splits,
                                        axis=0)
    if index_chunksize > index_len:
        row_lengths = [index_len] + [0 for _ in range(num_splits - 1)]
    else:
        row_lengths = [
            index_chunksize if i != num_splits - 1 else index_len -
            (index_chunksize * (num_splits - 1)) for i in range(num_splits)
        ]

    # Compute dtypes concatenating the results from each of the columns splits
    # determined above. This creates a pandas Series that contains a dtype for every
    # column.
    dtypes_ids = list(blk_partitions[-1])
    dtypes = pandas.concat(ray.get(dtypes_ids), axis=0)

    blk_partitions = blk_partitions[:-2]
    remote_partitions = np.array([[
        cls.frame_partition_cls(
            blk_partitions[i][j],
            length=row_lengths[i],
            width=column_widths[j],
        ) for j in range(len(blk_partitions[i]))
    ] for i in range(len(blk_partitions))])

    if directory:
        # Build a new list instead of `+=`: `columns` may still be the very
        # list object the caller passed in, and `+=` would extend it in place.
        columns = list(columns) + partitioned_columns
    dtypes.index = columns
    new_query_compiler = cls.query_compiler_cls(
        cls.frame_mgr_cls(remote_partitions), index, columns, dtypes=dtypes)
    return new_query_compiler
def _patched_init(self, source, **kwargs):
    """Record ``source`` on the instance, then run the saved initializer.

    The constructor argument is kept as ``self.source`` before ``source`` and
    all keyword arguments are forwarded to ``ParquetFile.__old_init__``; the
    result of that call is returned unchanged.
    """
    self.source = source
    original_init = ParquetFile.__old_init__
    return original_init(self, source, **kwargs)
def _read(cls, path, engine, columns, **kwargs):
    """
    Read a parquet source and hand the resolved columns to the query compiler builder.

    Parameters
    ----------
    path : str, path object or file-like object
        The filepath of the parquet file in local filesystem or hdfs.
    engine : str
        Parquet library to use (only 'PyArrow' is supported for now).
    columns : list
        If not None, only these columns will be read from the file.
        When falsy, the column names are discovered from the parquet schema.
    **kwargs : dict
        Keyword arguments.

    Returns
    -------
    BaseQueryCompiler
        A new Query Compiler.

    Notes
    -----
    ParquetFile API is used. Please refer to the documentation here
    https://arrow.apache.org/docs/python/parquet.html
    """
    from pyarrow.parquet import ParquetFile, ParquetDataset
    from modin.pandas.io import PQ_INDEX_REGEX

    directory = isinstance(path, str) and os.path.isdir(path)
    if directory:
        # Partitioned parquet directories have a unique column at each
        # directory level, and os.walk() performs a depth-first traversal,
        # so the first sub-directory at each level names a partition column.
        partition_names = set()
        for _root, subdirs, filenames in os.walk(path):
            if subdirs:
                partition_names.add(subdirs[0].split("=")[0])
            # Stop at the first directory whose leading file is not a
            # dot-file (metadata files, git files, .DSStore).
            if filenames and filenames[0][0] != ".":
                break
        if partition_names:
            ErrorMessage.default_to_pandas(
                "Mixed Partitioning Columns in Parquet")
            return cls.single_worker_read(path,
                                          engine=engine,
                                          columns=columns,
                                          **kwargs)

    if not columns:
        import s3fs

        def _describe(dataset):
            # Footer metadata first, then the arrow-level column names.
            return dataset.metadata, dataset.schema.to_arrow_schema().names

        if directory:
            # The dataset itself is sampled to get the remaining columns.
            meta, column_names = _describe(ParquetDataset(path))
        elif isinstance(path, str) and path.startswith("hdfs://"):
            from fsspec.core import url_to_fs

            fs, path = url_to_fs(path)
            meta, column_names = _describe(
                ParquetDataset(path, filesystem=fs))
        elif isinstance(path, s3fs.S3File) or (isinstance(path, str)
                                               and path.startswith("s3://")):
            from botocore.exceptions import NoCredentialsError

            if isinstance(path, s3fs.S3File):
                # Rebuild an s3:// style path from the file object's URL.
                url_parts = path.url().split(".s3.amazonaws.com")
                path = "s3://" + url_parts[0].split("://")[1] + url_parts[1]
            try:
                fs = s3fs.S3FileSystem()
                s3_dataset = ParquetDataset(path, filesystem=fs)
            except NoCredentialsError:
                # Retry with anonymous access when no credentials are found.
                fs = s3fs.S3FileSystem(anon=True)
                s3_dataset = ParquetDataset(path, filesystem=fs)
            meta, column_names = _describe(s3_dataset)
        else:
            meta = ParquetFile(path).metadata
            column_names = meta.schema.to_arrow_schema().names

        pandas_metadata = None
        if meta is not None and meta.metadata is not None:
            pandas_metadata = meta.metadata.get(b"pandas", None)
        if pandas_metadata is not None:
            import json

            # The pandas metadata blob is decoded into a python dictionary to
            # find the index columns. Those are dropped from the column list
            # because the pyarrow storage has no concept of row labels/index;
            # this keeps the metadata aligned with the partitions without
            # extra communication steps after the remote computation.
            decoded = json.loads(pandas_metadata.decode("utf8"))
            index_columns = decoded.get("index_columns", [])
            column_names = [
                c for c in column_names if c not in index_columns
            ]
        columns = [
            name for name in column_names if not PQ_INDEX_REGEX.match(name)
        ]
    return cls.build_query_compiler(path, columns, **kwargs)