def append_to_parquet(df: pd.DataFrame, writer: pq.ParquetWriter, filepath: str) -> pq.ParquetWriter:
    # Convert the chunk to an Arrow table; create the writer lazily from the first chunk's schema.
    table = pa.Table.from_pandas(df)
    if writer is None:
        writer = pq.ParquetWriter(filepath, table.schema)
    writer.write_table(table=table)
    return writer
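# A minimal usage sketch for append_to_parquet above (the CSV path, column
# layout, and chunk size are assumptions, not from the original code): the
# writer is created lazily on the first chunk and must be closed once all
# chunks are written so the Parquet footer gets flushed.
import pandas as pd
import pyarrow as pa
import pyarrow.parquet as pq

writer = None
for chunk in pd.read_csv("events.csv", chunksize=100_000):
    writer = append_to_parquet(chunk, writer, "events.parquet")
if writer is not None:
    writer.close()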
def chunked_write(df_iterator: TextFileReader, parquet_writer: pq.ParquetWriter, date_cols: List[str]):
    """Writes a Parquet version of the chunked dataframe input.

    Arrow table creation and Parquet writes take around 25% of the time in
    this function; the CSV read takes around 75%.
    """
    rows_processed = 0
    for df in df_iterator:
        rows_processed += min(BUFFER_SIZE_ROWS, len(df))
        # Convert epoch-millisecond columns to datetimes before building the Arrow table.
        for col_name in date_cols:
            df[col_name] = pd.to_datetime(df[col_name], unit="ms")
        pa_table = pa.Table.from_pandas(df=df, schema=parquet_writer.schema)
        parquet_writer.write_table(pa_table)
        print("Rows processed: {}".format(rows_processed), end="\r", flush=True)
    print()
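# One plausible way to drive chunked_write (file names, the schema, and the
# BUFFER_SIZE_ROWS value are assumptions for illustration): the ParquetWriter
# supplies the target schema, with the date column typed as a timestamp, and
# pandas' chunked CSV reader provides the TextFileReader iterator.
import pandas as pd
import pyarrow as pa
import pyarrow.parquet as pq

BUFFER_SIZE_ROWS = 500_000
schema = pa.schema([("id", pa.int64()), ("created_at", pa.timestamp("ns"))])
with pq.ParquetWriter("events.parquet", schema) as writer:
    reader = pd.read_csv("events.csv", chunksize=BUFFER_SIZE_ROWS)
    chunked_write(reader, writer, date_cols=["created_at"])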
def to_parquet(self, output: str) -> None:
    """Export Butler datasets as ObsCore Data Model in parquet format.

    Parameters
    ----------
    output : `str`
        Location of the output file.
    """
    compression = self.config.parquet_compression
    with ParquetWriter(output, self.schema, compression=compression) as writer:
        for record_batch in self._make_record_batches(self.config.batch_size):
            writer.write_batch(record_batch)
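# The same ParquetWriter-as-context-manager pattern outside the Butler class,
# as a self-contained sketch (schema, batch contents, and file name are made
# up): write_batch() accepts RecordBatch objects directly, so a large export
# can be streamed without assembling one big Table in memory.
import pyarrow as pa
from pyarrow.parquet import ParquetWriter

schema = pa.schema([("obs_id", pa.int64()), ("ra", pa.float64())])
batches = [
    pa.record_batch([pa.array([1, 2]), pa.array([10.5, 11.25])], schema=schema),
    pa.record_batch([pa.array([3]), pa.array([12.0])], schema=schema),
]
with ParquetWriter("obscore.parquet", schema, compression="snappy") as writer:
    for batch in batches:
        writer.write_batch(batch)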
class OutputStreamWriter:
    def __init__(self, schema):
        self.valid_writer = ParquetWriter(
            self.parquet_file_path("valid_products"), schema)
        self.invalid_writer = ParquetWriter(
            self.parquet_file_path("invalid_products"), schema)

    def write(self, valid_batch, invalid_batch):
        self.valid_writer.write_table(Table.from_batches([valid_batch]))
        self.invalid_writer.write_table(Table.from_batches([invalid_batch]))

    def parquet_file_path(self, file_basename):
        return os.path.join(DATA_DIR, "{}.parquet".format(file_basename))

    def close(self):
        self.valid_writer.close()
        self.invalid_writer.close()
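# A usage sketch for OutputStreamWriter (DATA_DIR, the schema, and the batch
# contents are assumptions): both writers share one schema, each write() call
# appends a row group to its file, and close() finalizes the Parquet footers.
import os
import pyarrow as pa
from pyarrow import Table
from pyarrow.parquet import ParquetWriter

DATA_DIR = "/tmp/products"
os.makedirs(DATA_DIR, exist_ok=True)

schema = pa.schema([("sku", pa.string()), ("price", pa.float64())])
valid = pa.record_batch([pa.array(["a1"]), pa.array([9.99])], schema=schema)
invalid = pa.record_batch([pa.array(["??"]), pa.array([-1.0])], schema=schema)

writer = OutputStreamWriter(schema)
writer.write(valid, invalid)
writer.close()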
def write_file(current_stream_name, record):
    timestamp = datetime.utcnow().strftime("%Y%m%d_%H%M%S-%f")
    LOGGER.debug(f"Writing files from {current_stream_name} stream")
    dataframe = create_dataframe(record)
    if streams_in_separate_folder and not os.path.exists(
            os.path.join(destination_path, current_stream_name)):
        os.makedirs(os.path.join(destination_path, current_stream_name))
    filename = (current_stream_name + filename_separator + timestamp
                + compression_extension + ".parquet")
    filepath = os.path.expanduser(os.path.join(destination_path, filename))
    with open(filepath, 'wb') as f:
        ParquetWriter(
            f, dataframe.schema,
            compression=compression_method).write_table(dataframe)
    # Explicit memory management; this can be useful when working with very large data groups.
    del dataframe
    return filepath
def csv_stream_to_parquet_batch_writer(include_columns,
                                       input_file_to_stream,
                                       output_file_stream_directory,
                                       output_file_prefix):
    print('Initiating stream.')
    input_stream_reader = InputStreamReader(input_file_to_stream)
    outfiles_list = []
    for i, batch in input_stream_reader.batches():
        print(f'Ingesting batch number {i}')
        df = batch.to_pandas()
        table = pa.Table.from_pandas(df)
        schema = table.schema
        # smiles = list(df[smiles_column_title])
        # print(f'Writing a total of {len(smiles)} smiles per output file to disk.')
        outfile = f'{output_file_stream_directory}{output_file_prefix}_{i}.parquet'
        # Close the writer via the context manager so each part file's footer is flushed.
        with ParquetWriter(outfile, schema) as writer:
            writer.write_table(table)
        print(f'Wrote parquet to {outfile}')
        outfiles_list.append(outfile)
    return outfiles_list
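# A hedged example of calling csv_stream_to_parquet_batch_writer (the output
# directory and prefix are made up; the column list and input path mirror the
# module-level constants further below): each streamed CSV batch becomes one
# Parquet part file, and the list of part files written is returned.
part_files = csv_stream_to_parquet_batch_writer(
    include_columns=['zincid', 'smiles', 'dockscore'],
    input_file_to_stream='/data/dockop_data/AmpC_screen_table.csv',
    output_file_stream_directory='/data/parquet_parts/',
    output_file_prefix='AmpC_screen_table',
)
print(f'Wrote {len(part_files)} part files.')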
if __name__ == '__main__':
    # TODO: Request backup
    # TODO: Handle HTTP 429 if a backup was already created within the last 24 hours
    # TODO: Poller every 5 min
    # tmp_dir = '/home/condesa1931/personal/github/azure-methods/DevOpsAPI/data'
    # backup_path = get_account_backup(backup_id="6911382328887923514",
    #                                  out_directory=tmp_dir)
    backup_path = f"./data/aha-account-6240998105453674102-backup-2020-12-28-18-53.json"
    project_path = f"./data/aha-project-backup-2020-12-28-18-53.parquet"
    idea_path = f"./data/aha-idea-backup-2020-12-28-18-53.parquet"
    requirement_path = f"./data/aha-requirement-backup-2020-12-28-18-53.parquet"

    with open(backup_path, 'rt', encoding='utf-8') as f, \
            ParquetWriter(project_path, PROJECT_PQ, compression='SNAPPY') as p, \
            ParquetWriter(idea_path, IDEA_PQ, compression='SNAPPY') as i, \
            ParquetWriter(requirement_path, REQUIREMENT_PQ, compression='SNAPPY') as r:
        class_types = [
            'Account', 'AccountUser', 'AccountWorkflow', 'Annotation::Point',
            'Approval', 'BusinessModel', 'BusinessModelComponent', 'Comment',
            'Competitor', 'CustomFieldDefinitions::DateField',
            'CustomFieldDefinitions::LinkMasterDetail',
import pyarrow as pa
import pyarrow.csv
from pyarrow.parquet import ParquetWriter


class InputStreamReader:
    """Streams record batches from a CSV file using pyarrow's incremental CSV reader."""

    def __init__(self, file_stream):
        self.file_stream = file_stream
        self._stream = None

    def batches(self):
        # Yield numbered record batches until the CSV stream is exhausted.
        i = 0
        while True:
            try:
                i += 1
                batch = self.__next_batch()
                yield i, batch
            except StopIteration:
                break

    def __next_batch(self):
        return self.stream.read_next_batch()

    @property
    def stream(self):
        # Open the CSV stream lazily on first access.
        if not self._stream:
            read_options = pa.csv.ReadOptions(block_size=chunksize)
            parse_options = pa.csv.ParseOptions(delimiter=delimiter)
            convert_options = pa.csv.ConvertOptions(include_columns=include_columns)
            self._stream = pa.csv.open_csv(
                self.file_stream,
                read_options=read_options,
                parse_options=parse_options,
                convert_options=convert_options
            )
        return self._stream


include_columns = ['zincid', 'smiles', 'dockscore']
delimiter = ','
chunksize = 1048576 * 1000
file_stream = '/data/dockop_data/AmpC_screen_table.csv'

input_stream_reader = InputStreamReader(file_stream)
for i, batch in input_stream_reader.batches():
    df = batch.to_pandas()
    table = pa.Table.from_pandas(df)
    schema = table.schema
    print(f"Writing a total of {len(list(df['smiles']))} to disk.")
    # Close the writer via the context manager so each part file's footer is flushed.
    with ParquetWriter(
            f'/data/newdockop/dockop/code/mod_code_base/parquet/AmpC_screen_table_part_{i}.parquet',
            schema) as writer:
        writer.write_table(table)