def convert_dir(input_dir: str, output_dir: str, svcschema: SchemaForTable): """Convert the data into a single file and write it out""" defaults = { pa.string(): "", pa.int32(): 0, pa.int64(): 0, pa.float32(): 0.0, pa.float64(): 0.0, pa.date64(): 0.0, pa.bool_(): False, pa.list_(pa.string()): ['-'], pa.list_(pa.int64()): [], } df = pd.read_parquet(input_dir, use_legacy_dataset=True) sqschema = svcschema.get_raw_schema() arrow_schema = svc_schema.get_arrow_schema() for column in filter(lambda x: x['name'] not in df.columns, sqschema): df[column['name']] = column.get('default', defaults[column['type']]) # convert all dtypes to whatever is desired for column in df.columns: if column in arrow_schema: df[column] = df[column].astype( arrow_schema.field(column).type.to_pandas_dtype()) # If there's the original ifname saved up, then eliminate this unnecessary # field as this model is no longer necessary if 'origIfname' in df.columns: if 'ifname' in df.columns: df = df.drop(columns=['ifname']) \ .rename(columns={'origIfname': 'ifname'}) elif 'oif' in df.columns: df = df.drop(columns=['oif']) \ .rename(columns={'origIfname': 'oif'}) table = pa.Table.from_pandas(df, schema=arrow_schema, preserve_index=False) partition_cols = svcschema.get_partition_columns() if 'norifcnReason' in df.columns: df.rename({'notifcnReason': 'notificnReason'}, inplace=True) pq.write_to_dataset( table, root_path=output_dir, partition_cols=partition_cols, version="2.0", compression='ZSTD', row_group_size=100000, ) logger.info(f'Wrote converted {input_dir}')
def describe(self, **kwargs): """Describes the fields for a given table""" table = kwargs.get('table', '') try: sch = SchemaForTable(table, self.schemas) except ValueError: sch = None if not sch: df = pd.DataFrame( {'error': [f'ERROR: incorrect table name {table}']}) return df entries = [{ 'name': x['name'], 'type': x['type'], 'key': x.get('key', ''), 'display': x.get('display', '') } for x in sch.get_raw_schema()] df = pd.DataFrame.from_dict(entries).sort_values('name') return df