Example #1
def _numpy_and_codec_from_arrow_type(field_type):
    from pyarrow import types

    if types.is_int8(field_type):
        np_type = np.int8
    elif types.is_int16(field_type):
        np_type = np.int16
    elif types.is_int32(field_type):
        np_type = np.int32
    elif types.is_int64(field_type):
        np_type = np.int64
    elif types.is_string(field_type):
        np_type = np.unicode_
    elif types.is_boolean(field_type):
        np_type = np.bool_
    elif types.is_float32(field_type):
        np_type = np.float32
    elif types.is_float64(field_type):
        np_type = np.float64
    elif types.is_decimal(field_type):
        np_type = Decimal
    elif types.is_binary(field_type):
        np_type = np.string_
    elif types.is_fixed_size_binary(field_type):
        np_type = np.string_
    elif types.is_date(field_type):
        np_type = np.datetime64
    elif types.is_timestamp(field_type):
        np_type = np.datetime64
    elif types.is_list(field_type):
        np_type = _numpy_and_codec_from_arrow_type(field_type.value_type)
    else:
        raise ValueError('Cannot auto-create unischema due to unsupported column type {}'.format(field_type))
    return np_type
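A minimal usage sketch for this helper, applied to a whole Arrow schema (the schema below is made up for illustration; numpy and pyarrow imports are assumed):

import numpy as np
import pyarrow as pa

schema = pa.schema([
    ("id", pa.int64()),
    ("score", pa.float32()),
    ("tags", pa.list_(pa.string())),
])

# Resolve every Arrow field to the numpy type the helper picks for it.
np_types = {field.name: _numpy_and_codec_from_arrow_type(field.type) for field in schema}
# e.g. {'id': np.int64, 'score': np.float32, 'tags': np.unicode_}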
Example #2
def convertPyArrowTypeToGlueType(pyarrowType: pa.DataType) -> str:
    if (types.is_string(pyarrowType) or types.is_unicode(pyarrowType)
            or types.is_large_string(pyarrowType)
            or types.is_large_unicode(pyarrowType)):
        return 'string'
    if (types.is_int64(pyarrowType) or types.is_uint64(pyarrowType)):
        return 'bigint'
    if (types.is_binary(pyarrowType)):
        return 'binary'
    if (types.is_boolean(pyarrowType)):
        return 'boolean'
    if (types.is_date(pyarrowType) or types.is_date32(pyarrowType)
            or types.is_date64(pyarrowType)):
        return 'date'
    if (types.is_decimal(pyarrowType)):
        return 'decimal(16,2)'
    if (types.is_float64(pyarrowType)):
        return 'double'
    if (types.is_float16(pyarrowType) or types.is_float32(pyarrowType)):
        return 'float'
    if (types.is_int16(pyarrowType) or types.is_int32(pyarrowType)
            or types.is_uint16(pyarrowType) or types.is_uint32(pyarrowType)):
        return 'int'
    if (types.is_map(pyarrowType)):
        return 'map'
    if (types.is_struct(pyarrowType)):
        return 'struct'
    if (types.is_timestamp(pyarrowType)):
        return 'timestamp'
    if (types.is_union(pyarrowType)):
        return 'union'
    return str(pyarrowType)
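A hedged usage sketch: building a Glue-style column list from an Arrow schema. This assumes the module has `import pyarrow as pa` and `from pyarrow import types` at the top; the dict layout is illustrative of what a Glue table definition expects, not taken from this snippet's source.

import pyarrow as pa

schema = pa.schema([
    ("user_id", pa.int64()),
    ("name", pa.string()),
    ("balance", pa.decimal128(16, 2)),
    ("created_at", pa.timestamp("ms")),
])

# Hypothetical column spec assembled from the Arrow schema.
glue_columns = [
    {"Name": field.name, "Type": convertPyArrowTypeToGlueType(field.type)}
    for field in schema
]
# [{'Name': 'user_id', 'Type': 'bigint'}, {'Name': 'name', 'Type': 'string'},
#  {'Name': 'balance', 'Type': 'decimal(16,2)'}, {'Name': 'created_at', 'Type': 'timestamp'}]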
Example #3
def _cast_float(val: Any,
                dtype: pa.DataType) -> Union[np.float32, np.float64]:
    if is_float32(dtype):
        casted = np.float32(val)
    elif is_float64(dtype):
        casted = np.float64(val)
    else:
        raise NotImplementedError
    return casted
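Assuming the surrounding module does `from pyarrow.types import is_float32, is_float64` (plus the usual numpy/pyarrow/typing imports), usage looks like:

import numpy as np
import pyarrow as pa

x = _cast_float(3.14, pa.float32())   # np.float32(3.14)
y = _cast_float(2, pa.float64())      # np.float64(2.0)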
Example #4
def from_arrow_type(at):
    """ Convert pyarrow type to Spark data type.
    """
    from distutils.version import LooseVersion
    import pyarrow as pa
    import pyarrow.types as types
    if types.is_boolean(at):
        spark_type = BooleanType()
    elif types.is_int8(at):
        spark_type = ByteType()
    elif types.is_int16(at):
        spark_type = ShortType()
    elif types.is_int32(at):
        spark_type = IntegerType()
    elif types.is_int64(at):
        spark_type = LongType()
    elif types.is_float32(at):
        spark_type = FloatType()
    elif types.is_float64(at):
        spark_type = DoubleType()
    elif types.is_decimal(at):
        spark_type = DecimalType(precision=at.precision, scale=at.scale)
    elif types.is_string(at):
        spark_type = StringType()
    elif types.is_binary(at):
        spark_type = BinaryType()
    elif types.is_date32(at):
        spark_type = DateType()
    elif types.is_timestamp(at):
        spark_type = TimestampType()
    elif types.is_list(at):
        if types.is_timestamp(at.value_type):
            raise TypeError("Unsupported type in conversion from Arrow: " + str(at))
        spark_type = ArrayType(from_arrow_type(at.value_type))
    elif types.is_map(at):
        if LooseVersion(pa.__version__) < LooseVersion("2.0.0"):
            raise TypeError("MapType is only supported with pyarrow 2.0.0 and above")
        if types.is_timestamp(at.key_type) or types.is_timestamp(at.item_type):
            raise TypeError("Unsupported type in conversion from Arrow: " + str(at))
        spark_type = MapType(from_arrow_type(at.key_type), from_arrow_type(at.item_type))
    elif types.is_struct(at):
        if any(types.is_struct(field.type) for field in at):
            raise TypeError("Nested StructType not supported in conversion from Arrow: " + str(at))
        return StructType(
            [StructField(field.name, from_arrow_type(field.type), nullable=field.nullable)
             for field in at])
    elif types.is_dictionary(at):
        spark_type = from_arrow_type(at.value_type)
    elif types.is_null(at):
        spark_type = NullType()
    else:
        raise TypeError("Unsupported type in conversion from Arrow: " + str(at))
    return spark_type
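A sketch of how this per-field conversion is typically lifted to a whole Arrow schema (the helper name from_arrow_schema here is illustrative, and the pyspark.sql.types imports used above are assumed):

import pyarrow as pa

def from_arrow_schema(arrow_schema):
    """Convert a pyarrow Schema to a Spark StructType, field by field."""
    return StructType(
        [StructField(field.name, from_arrow_type(field.type), nullable=field.nullable)
         for field in arrow_schema])

spark_schema = from_arrow_schema(
    pa.schema([("id", pa.int64()), ("ts", pa.timestamp("us"))]))
# StructType with a LongType 'id' field and a TimestampType 'ts' field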
Example #5
def _numpy_and_codec_from_arrow_type(field_type):
    from pyarrow import types

    if types.is_int8(field_type):
        np_type = np.int8
        codec = ScalarCodec(ByteType())
    elif types.is_int16(field_type):
        np_type = np.int16
        codec = ScalarCodec(ShortType())
    elif types.is_int32(field_type):
        np_type = np.int32
        codec = ScalarCodec(IntegerType())
    elif types.is_int64(field_type):
        np_type = np.int64
        codec = ScalarCodec(LongType())
    elif types.is_string(field_type):
        np_type = np.unicode_
        codec = ScalarCodec(StringType())
    elif types.is_boolean(field_type):
        np_type = np.bool_
        codec = ScalarCodec(BooleanType())
    elif types.is_float32(field_type):
        np_type = np.float32
        codec = ScalarCodec(FloatType())
    elif types.is_float64(field_type):
        np_type = np.float64
        codec = ScalarCodec(DoubleType())
    elif types.is_decimal(field_type):
        np_type = Decimal
        codec = ScalarCodec(DecimalType(field_type.precision,
                                        field_type.scale))
    elif types.is_binary(field_type):
        codec = ScalarCodec(StringType())
        np_type = np.string_
    elif types.is_fixed_size_binary(field_type):
        codec = ScalarCodec(StringType())
        np_type = np.string_
    elif types.is_date(field_type):
        np_type = np.datetime64
        codec = ScalarCodec(DateType())
    elif types.is_timestamp(field_type):
        np_type = np.datetime64
        codec = ScalarCodec(TimestampType())
    elif types.is_list(field_type):
        _, np_type = _numpy_and_codec_from_arrow_type(field_type.value_type)
        codec = None
    else:
        raise ValueError(
            'Cannot auto-create unischema due to unsupported column type {}'.
            format(field_type))
    return codec, np_type
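A quick check of the (codec, numpy type) pairs this variant returns, assuming the Spark type and ScalarCodec imports used above; note that list columns get the element's numpy type and no scalar codec:

import pyarrow as pa

codec, np_type = _numpy_and_codec_from_arrow_type(pa.float64())
# -> (ScalarCodec(DoubleType()), np.float64)

codec, np_type = _numpy_and_codec_from_arrow_type(pa.list_(pa.int32()))
# -> (None, np.int32)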
Example #6
def from_arrow_type(at):
    """ Convert pyarrow type to Spark data type.
    """
    import pyarrow.types as types
    if types.is_boolean(at):
        spark_type = BooleanType()
    elif types.is_int8(at):
        spark_type = ByteType()
    elif types.is_int16(at):
        spark_type = ShortType()
    elif types.is_int32(at):
        spark_type = IntegerType()
    elif types.is_int64(at):
        spark_type = LongType()
    elif types.is_float32(at):
        spark_type = FloatType()
    elif types.is_float64(at):
        spark_type = DoubleType()
    elif types.is_decimal(at):
        spark_type = DecimalType(precision=at.precision, scale=at.scale)
    elif types.is_string(at):
        spark_type = StringType()
    elif types.is_binary(at):
        spark_type = BinaryType()
    elif types.is_date32(at):
        spark_type = DateType()
    elif types.is_timestamp(at):
        spark_type = TimestampType()
    elif types.is_list(at):
        if types.is_timestamp(at.value_type):
            raise TypeError("Unsupported type in conversion from Arrow: " +
                            str(at))
        spark_type = ArrayType(from_arrow_type(at.value_type))
    elif types.is_struct(at):
        if any(types.is_struct(field.type) for field in at):
            raise TypeError(
                "Nested StructType not supported in conversion from Arrow: " +
                str(at))
        return StructType([
            StructField(field.name,
                        from_arrow_type(field.type),
                        nullable=field.nullable) for field in at
        ])
    elif types.is_dictionary(at):
        spark_type = from_arrow_type(at.value_type)
    else:
        raise TypeError("Unsupported type in conversion from Arrow: " +
                        str(at))
    return spark_type
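Two behaviors worth calling out in this older variant, shown with illustrative calls (same pyspark imports assumed):

import pyarrow as pa

# Dictionary-encoded columns fall through to their value type.
from_arrow_type(pa.dictionary(pa.int32(), pa.string()))   # -> StringType()

# Nested structs are rejected outright.
nested = pa.struct([("inner", pa.struct([("x", pa.int64())]))])
# from_arrow_type(nested)  # raises TypeError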
Example #7
def main(args=None):
    """Main method for postprocessing the raw outputs from an MC run."""
    if args is None:
        args = sys.argv[1:]
    args = parser.parse_args(args)

    # Start parsing args
    quantiles = args.quantiles
    verbose = args.verbose
    prefix = args.prefix
    use_gpu = args.gpu

    if verbose:
        logging.info(args)

    # File Management
    top_output_dir = args.output

    # Check if it exists, make if not
    if not os.path.exists(top_output_dir):
        os.makedirs(top_output_dir)

    # Use lookup, add prefix
    # TODO need to handle lookup weights
    if args.lookup is not None:
        lookup_df = read_lookup(args.lookup)
        if prefix is None:
            prefix = Path(args.lookup).stem
    # TODO if args.lookup we need to check it for weights

    # Create subfolder for this run using UUID of run
    uuid = args.file.split("/")[-2]

    if prefix is not None:
        uuid = prefix + "_" + uuid

    # Create directory if it doesn't exist
    output_dir = os.path.join(top_output_dir, uuid)
    if not os.path.exists(output_dir):
        os.makedirs(output_dir)

    data_dir = os.path.join(args.file, "data/")
    metadata_dir = os.path.join(args.file, "metadata/")

    adm_mapping = pd.read_csv(os.path.join(metadata_dir, "adm_mapping.csv"))
    dates = pd.read_csv(os.path.join(metadata_dir, "dates.csv"))
    dates = dates["date"].to_numpy()

    n_adm2 = len(adm_mapping)
    adm2_sorted_ind = xp.argsort(xp.array(adm_mapping["adm2"].to_numpy()))

    if use_gpu:
        enable_cupy(optimize=True)
        reimport_numerical_libs("postprocess")

    per_capita_cols = [
        "cumulative_reported_cases",
        "cumulative_deaths",
        "current_hospitalizations",
        "daily_reported_cases",
        "daily_deaths",
        "vacc_dose1",
        "vacc_dose2",
        "immune",
    ]
    pop_weighted_cols = [
        "case_reporting_rate",
        "R_eff",
        "frac_vacc_dose1",
        "frac_vacc_dose2",
        "frac_vacc_dose1_65",
        "frac_vacc_dose2_65",
        "frac_immune",
        "frac_immune_65",
        "state_phase",
    ]

    adm_mapping["adm0"] = 1
    adm_map = adm_mapping.to_dict(orient="list")
    adm_map = {k: xp.array(v)[adm2_sorted_ind] for k, v in adm_map.items()}
    adm_array_map = {
        k: xp.unique(v, return_inverse=True)[1]
        for k, v in adm_map.items()
    }
    adm_sizes = {
        k: xp.to_cpu(xp.max(v) + 1).item()
        for k, v in adm_array_map.items()
    }
    adm_level_values = {k: xp.to_cpu(xp.unique(v)) for k, v in adm_map.items()}
    adm_level_values["adm0"] = np.array(["US"])

    if args.lookup is not None and "weight" in lookup_df.columns:
        weight_series = lookup_df.set_index("adm2")["weight"].reindex(
            adm_mapping["adm2"], fill_value=0.0)
        weights = np.array(weight_series.to_numpy(), dtype=np.float32)
        # TODO we should ignore all the adm2 not in weights rather than just 0ing them (it'll go a lot faster)
    else:
        weights = np.ones_like(adm2_sorted_ind, dtype=np.float32)

    write_queue = queue.Queue()

    def _writer():
        """Write thread that will pull from a queue."""
        # Call to_write.get() until it returns None
        file_tables = {}
        for fname, q_dict in iter(write_queue.get, None):
            df = pd.DataFrame(q_dict)
            id_col = df.columns[df.columns.str.contains("adm.")].values[0]
            df = df.set_index([id_col, "date", "quantile"])
            df = df.reindex(sorted(df.columns), axis=1)
            if fname in file_tables:
                tmp = pa.table(q_dict)
                file_tables[fname] = pa.concat_tables(
                    [file_tables[fname], tmp])
            else:
                file_tables[fname] = pa.table(q_dict)
            write_queue.task_done()

        # dump tables to disk
        for fname in tqdm.tqdm(file_tables):
            df = file_tables[fname].to_pandas()
            id_col = df.columns[df.columns.str.contains("adm.")].values[0]
            df = df.set_index([id_col, "date", "quantile"])
            df = df.reindex(sorted(df.columns), axis=1)
            df.to_csv(fname, header=True, mode="w")
        write_queue.task_done()

    write_thread = threading.Thread(target=_writer)
    write_thread.start()

    # TODO this depends on out of scope vars, need to clean that up
    def pa_array_quantiles(array, level):
        """Calculate the quantiles of a pyarrow array after shipping it to the GPU."""
        data = array.to_numpy().reshape(-1, n_adm2)
        data = data[:, adm2_sorted_ind]

        data_gpu = xp.array(data.T)

        if adm_sizes[level] == 1:
            # TODO need switching here b/c cupy handles xp.percentile weird with a size 1 dim :(
            if use_gpu:
                level_data_gpu = xp.sum(data_gpu, axis=0)  # need this if cupy
            else:
                level_data_gpu = xp.sum(data_gpu, axis=0,
                                        keepdims=True).T  # for numpy
            q_data_gpu = xp.empty((len(percentiles), adm_sizes[level]),
                                  dtype=level_data_gpu.dtype)
            # It appears there's a cupy bug when the 1st axis of the array passed to percentile has size 1
            xp.percentile(level_data_gpu,
                          q=percentiles,
                          axis=0,
                          out=q_data_gpu)
        else:
            level_data_gpu = xp.zeros((adm_sizes[level], data_gpu.shape[1]),
                                      dtype=data_gpu.dtype)
            xp.scatter_add(level_data_gpu, adm_array_map[level], data_gpu)
            q_data_gpu = xp.empty((len(percentiles), adm_sizes[level]),
                                  dtype=level_data_gpu.dtype)
            xp.percentile(level_data_gpu,
                          q=percentiles,
                          axis=1,
                          out=q_data_gpu)
        return q_data_gpu

    try:
        percentiles = xp.array(quantiles, dtype=np.float64) * 100.0
        quantiles = np.array(quantiles)
        for date_i, date in enumerate(tqdm.tqdm(dates)):
            dataset = ds.dataset(data_dir,
                                 format="parquet",
                                 partitioning=["date"])
            table = dataset.to_table(filter=ds.field("date") == "date=" +
                                     str(date_i))
            table = table.drop(
                ("date", "rid", "adm2_id"))  # we don't need these b/c metadata
            pop_weight_table = table.select(pop_weighted_cols)
            table = table.drop(pop_weighted_cols)

            w = np.ravel(
                np.broadcast_to(
                    weights,
                    (table.shape[0] // weights.shape[0], weights.shape[0])))
            for i, col in enumerate(table.column_names):
                if pat.is_float64(table.column(i).type):
                    typed_w = w.astype(np.float64)
                else:
                    typed_w = w.astype(np.float32)

                tmp = pac.multiply_checked(table.column(i), typed_w)
                table = table.set_column(i, col, tmp)

            for col in pop_weighted_cols:
                if pat.is_float64(pop_weight_table[col].type):
                    typed_w = table["total_population"].to_numpy().astype(
                        np.float64)
                else:
                    typed_w = table["total_population"].to_numpy().astype(
                        np.float32)
                tmp = pac.multiply_checked(pop_weight_table[col], typed_w)
                table = table.append_column(col, tmp)

            for level in args.levels:
                all_q_data = {}
                for col in table.column_names:  # TODO can we do all at once since we dropped date?
                    all_q_data[col] = pa_array_quantiles(table[col], level)

                # all_q_data = {col: pa_array_quantiles(table[col]) for col in table.column_names}

                # we could do this outside the date loop and cache for each adm level...
                out_shape = (
                    len(percentiles), ) + adm_level_values[level].shape
                all_q_data[level] = np.broadcast_to(adm_level_values[level],
                                                    out_shape)
                all_q_data["date"] = np.broadcast_to(date, out_shape)
                all_q_data["quantile"] = np.broadcast_to(
                    quantiles[..., None], out_shape)

                for col in per_capita_cols:
                    all_q_data[col + "_per_100k"] = 100000.0 * all_q_data[
                        col] / all_q_data["total_population"]

                for col in pop_weighted_cols:
                    all_q_data[
                        col] = all_q_data[col] / all_q_data["total_population"]

                for col in all_q_data:
                    all_q_data[col] = xp.to_cpu(all_q_data[col].T.ravel())

                write_queue.put(
                    (os.path.join(output_dir,
                                  level + "_quantiles.csv"), all_q_data))

            del dataset
            gc.collect()

    except (KeyboardInterrupt, SystemExit):
        logging.warning("Caught SIGINT, cleaning up")
        write_queue.put(None)  # send signal to term loop
        write_thread.join()  # join the write_thread
    finally:
        write_queue.put(None)  # send signal to term loop
        write_thread.join()  # join the write_thread
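The heart of pa_array_quantiles is a scatter-add of adm2 columns up to the requested admin level, followed by percentiles across Monte Carlo runs. A minimal numpy-only sketch of that step (np.add.at standing in for xp.scatter_add; shapes and the adm2-to-adm1 mapping are made up):

import numpy as np

n_runs, n_adm2, n_adm1 = 100, 10, 3
percentiles = np.array([25.0, 50.0, 75.0])

data = np.random.rand(n_runs, n_adm2)                # one value per run per adm2
adm1_index = np.random.randint(0, n_adm1, n_adm2)    # adm2 -> adm1 mapping

# Aggregate adm2 columns up to adm1 (scatter-add), keeping the run axis.
level_data = np.zeros((n_adm1, n_runs))
np.add.at(level_data, adm1_index, data.T)

# Quantiles across runs for each adm1 region: shape (len(percentiles), n_adm1).
q_data = np.percentile(level_data, q=percentiles, axis=1)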