Example #1
import pandas as pd
import pyarrow as pa
import pyarrow.parquet as pq


def append_to_parquet(df: pd.DataFrame, writer: pq.ParquetWriter,
                      filepath: str) -> pq.ParquetWriter:
    table = pa.Table.from_pandas(df)
    if writer is None:
        writer = pq.ParquetWriter(filepath, table.schema)
    writer.write_table(table=table)
    return writer
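A minimal driver sketch for append_to_parquet (not part of the original snippet): the CSV path and chunk size are placeholders, and the writer has to be closed after the last chunk so the Parquet footer is finalized.

writer = None
for chunk in pd.read_csv("measurements.csv", chunksize=50_000):  # placeholder input
    writer = append_to_parquet(chunk, writer, "measurements.parquet")
if writer is not None:
    writer.close()  # finalize the Parquet footer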
Example #2
def chunked_write(df_iterator: TextFileReader,
                  parquet_writer: pq.ParquetWriter, date_cols: List[str]):
    """
    Writes a Parquet version of the chunked DataFrame input.

    Arrow table creation and the Parquet writes take up around 25% of the
    time in this function; the CSV read takes around 75%.
    """
    rows_processed = 0
    for df in df_iterator:
        rows_processed += min(BUFFER_SIZE_ROWS, len(df))
        for col_name in date_cols:
            df[col_name] = pd.to_datetime(df[col_name], unit="ms")
        pa_table = pa.Table.from_pandas(df=df, schema=parquet_writer.schema)
        parquet_writer.write_table(pa_table)

        print("Rows processed: {}".format(rows_processed),
              end="\r",
              flush=True)
    print()
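A hypothetical driver for chunked_write, assuming the chunks come from pandas' chunked CSV reader; the file names, the schema, and BUFFER_SIZE_ROWS below are placeholders for the module-level values the function expects.

import pandas as pd
import pyarrow as pa
import pyarrow.parquet as pq

BUFFER_SIZE_ROWS = 100_000                                # assumed module-level constant
schema = pa.schema([("event_time", pa.timestamp("ns")),   # assumed target schema
                    ("value", pa.float64())])

df_iterator = pd.read_csv("events.csv", chunksize=BUFFER_SIZE_ROWS)  # TextFileReader
with pq.ParquetWriter("events.parquet", schema) as parquet_writer:
    chunked_write(df_iterator, parquet_writer, date_cols=["event_time"])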
Example #3
    def to_parquet(self, output: str) -> None:
        """Export Butler datasets as ObsCore Data Model in parquet format.

        Parameters
        ----------
        output : `str`
            Location of the output file.
        """

        compression = self.config.parquet_compression
        with ParquetWriter(output, self.schema,
                           compression=compression) as writer:
            for record_batch in self._make_record_batches(
                    self.config.batch_size):
                writer.write_batch(record_batch)
Example #4
class OutputStreamWriter:
    def __init__(self, schema):
        self.valid_writer = ParquetWriter(
            self.parquet_file_path("valid_products"), schema)
        self.invalid_writer = ParquetWriter(
            self.parquet_file_path("invalid_products"), schema)

    def write(self, valid_batch, invalid_batch):
        self.valid_writer.write_table(Table.from_batches([valid_batch]))
        self.invalid_writer.write_table(Table.from_batches([invalid_batch]))

    def parquet_file_path(self, file_basename):
        return os.path.join(DATA_DIR, "{}.parquet".format(file_basename))

    def close(self):
        self.valid_writer.close()
        self.invalid_writer.close()
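A hypothetical usage of OutputStreamWriter; the schema and record batches are illustrative, and the class additionally expects DATA_DIR, os, Table, and ParquetWriter to be available at module level.

import pyarrow as pa

schema = pa.schema([("sku", pa.string()), ("price", pa.float64())])
valid_batch = pa.RecordBatch.from_pydict(
    {"sku": ["A-1"], "price": [9.99]}, schema=schema)
invalid_batch = pa.RecordBatch.from_pydict(
    {"sku": ["B-2"], "price": [None]}, schema=schema)

writer = OutputStreamWriter(schema)
try:
    writer.write(valid_batch, invalid_batch)
finally:
    writer.close()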
Example #5
def write_file(current_stream_name, record):
    timestamp = datetime.utcnow().strftime("%Y%m%d_%H%M%S-%f")
    LOGGER.debug(f"Writing files from {current_stream_name} stream")
    dataframe = create_dataframe(record)
    if streams_in_separate_folder and not os.path.exists(
            os.path.join(destination_path, current_stream_name)):
        os.makedirs(os.path.join(destination_path, current_stream_name))
    filename = (current_stream_name + filename_separator + timestamp +
                compression_extension + ".parquet")
    filepath = os.path.expanduser(os.path.join(destination_path, filename))
    with open(filepath, 'wb') as f:
        # Use the writer as a context manager so the Parquet footer is
        # written before the file handle is released.
        with ParquetWriter(f, dataframe.schema,
                           compression=compression_method) as writer:
            writer.write_table(dataframe)
    # Explicit memory management; this can be useful when working with
    # very large data groups.
    del dataframe
    return filepath
Example #6
def csv_stream_to_parquet_batch_writer(include_columns,
                                       input_file_to_stream,
                                       output_file_stream_directory,
                                       output_file_prefix):

    print('Initiating stream.')

    input_stream_reader = InputStreamReader(input_file_to_stream)

    outfiles_list = []

    for i, batch in input_stream_reader.batches():
        print(f'Ingesting batch number {i}')
        df = batch.to_pandas()
        table = pa.Table.from_pandas(df)
        schema = table.schema
        outfile = f'{output_file_stream_directory}{output_file_prefix}_{i}.parquet'
        with ParquetWriter(outfile, schema) as writer:
            writer.write_table(table)
        print(f'Wrote parquet to {outfile}')
        outfiles_list.append(outfile)

    return outfiles_list
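A hypothetical call to the batch writer above; the paths, prefix, and column list are placeholders, and the output directory is concatenated as-is, so it needs a trailing slash.

parts = csv_stream_to_parquet_batch_writer(
    include_columns=['zincid', 'smiles', 'dockscore'],    # placeholder column list
    input_file_to_stream='/data/input/screen_table.csv',  # placeholder input path
    output_file_stream_directory='/data/output/',         # trailing slash required
    output_file_prefix='screen_table')
print(parts)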
Example #8
if __name__ == '__main__':

    # TODO: Request backup
    # TODO: Handle HTTP 429 if a backup was already created within the last 24 hours
    # TODO: Poller every 5 min
    # tmp_dir = '/home/condesa1931/personal/github/azure-methods/DevOpsAPI/data'
    # backup_path = get_account_backup(backup_id="6911382328887923514",
    #                                  out_directory=tmp_dir)
    backup_path = "./data/aha-account-6240998105453674102-backup-2020-12-28-18-53.json"
    project_path = "./data/aha-project-backup-2020-12-28-18-53.parquet"
    idea_path = "./data/aha-idea-backup-2020-12-28-18-53.parquet"
    requirement_path = "./data/aha-requirement-backup-2020-12-28-18-53.parquet"

    with open(backup_path, 'rt', encoding='utf-8') as f, \
            ParquetWriter(project_path, PROJECT_PQ, compression='SNAPPY') as p, \
            ParquetWriter(idea_path, IDEA_PQ, compression='SNAPPY') as i, \
            ParquetWriter(requirement_path, REQUIREMENT_PQ, compression='SNAPPY') as r:

        class_types = [
            'Account',
            'AccountUser',
            'AccountWorkflow',
            'Annotation::Point',
            'Approval',
            'BusinessModel',
            'BusinessModelComponent',
            'Comment',
            'Competitor',
            'CustomFieldDefinitions::DateField',
            'CustomFieldDefinitions::LinkMasterDetail',
Example #9
import pyarrow as pa
import pyarrow.csv
from pyarrow.parquet import ParquetWriter


class InputStreamReader:
    # The class header, __init__ and the top of batches() were missing
    # from this snippet; the surrounding lines are a minimal reconstruction.
    def __init__(self, file_stream):
        self.file_stream = file_stream
        self._stream = None

    def batches(self):
        i = 0
        while True:
            try:
                batch = self.__next_batch()
                i += 1
                yield i, batch
            except StopIteration:
                break

    def __next_batch(self):
        return self.stream.read_next_batch()

    @property
    def stream(self):
        if not self._stream:
            read_options = pa.csv.ReadOptions(block_size=chunksize)
            parse_options = pa.csv.ParseOptions(delimiter=delimiter)
            convert_options = pa.csv.ConvertOptions(include_columns=include_columns)
            self._stream = pa.csv.open_csv(
                self.file_stream, read_options=read_options,
                parse_options=parse_options, convert_options=convert_options)
        return self._stream

include_columns = ['zincid', 'smiles', 'dockscore']
delimiter = ','
chunksize = 1048576 * 1000  # ~1 GB of CSV per record batch
file_stream = '/data/dockop_data/AmpC_screen_table.csv'
input_stream_reader = InputStreamReader(file_stream)

for i, batch in input_stream_reader.batches():
    df = batch.to_pandas()
    table = pa.Table.from_pandas(df)
    schema = table.schema
    print(f'Writing a total of {len(df["smiles"])} smiles to disk.')
    outfile = ('/data/newdockop/dockop/code/mod_code_base/parquet/'
               f'AmpC_screen_table_part_{i}.parquet')
    with ParquetWriter(outfile, schema) as writer:
        writer.write_table(table)
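A leaner variant of the same job, not part of the original code: if the pandas round-trip is not needed, the CSV record batches can be written to Parquet directly, in the style of Examples #3 and #4. The sketch reuses the input path and reader settings defined above.

import pyarrow as pa
import pyarrow.csv
from pyarrow.parquet import ParquetWriter

reader = pa.csv.open_csv(
    file_stream,
    read_options=pa.csv.ReadOptions(block_size=chunksize),
    parse_options=pa.csv.ParseOptions(delimiter=delimiter),
    convert_options=pa.csv.ConvertOptions(include_columns=include_columns))

for i, batch in enumerate(reader):
    # Each record batch becomes its own part file; no pandas DataFrame in between.
    outfile = ('/data/newdockop/dockop/code/mod_code_base/parquet/'
               f'AmpC_screen_table_part_{i}.parquet')
    with ParquetWriter(outfile, batch.schema) as writer:
        writer.write_table(pa.Table.from_batches([batch]))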