Example #1
from typing import List

# Path_T, SparkSession_T, DataDictionary, SyntheticData, and flat_file are
# names from the surrounding project module, not shown in this excerpt.


def main(argv: List[str],
         cwd: Path_T,
         builder: SparkSession_T.Builder,
         driver_memory: str = '16g') -> None:
    # Positional CLI args: stats CSV, data-dictionary dir, output flat file.
    [stats_fn, ddict_dir, out_fn] = argv[1:4]

    # 'spark.driver.memory' is the Spark property name; the original's
    # 'driver-memory' is only the spark-submit flag and would be ignored here.
    spark = (builder.appName(__file__)
             .config('spark.driver.memory', driver_memory)
             .getOrCreate())

    # Load the NAACCR data dictionary, then synthesize records: the input
    # statistics drive the values; the record layout supplies the fields.
    ndd = DataDictionary.make_in(spark, cwd / ddict_dir)
    job = SyntheticData(spark)
    stats = spark.read.csv(stats_fn, header=True, inferSchema=True)
    records = job.synthesize_data(stats, ndd.record_layout)
    # to_pickle() implies `records` is a pandas DataFrame; the leading comma
    # in the filename (kept as in the original) suggests a scratch file.
    records.to_pickle(',syn_records_TMP.pkl')
    # Render the synthesized records as a NAACCR fixed-width flat file.
    with (cwd / out_fn).open('w') as out:
        for line in flat_file.naaccr_make_fwf(records,
                                              ndd.record_layout.toPandas()):
            out.write(line)
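
For context, here is a minimal entry-point sketch (hypothetical; the module's actual wiring is not shown in these excerpts) that hands main() its arguments, working directory, and a real SparkSession builder:

if __name__ == '__main__':
    import sys
    from pathlib import Path

    from pyspark.sql import SparkSession

    # Hypothetical invocation: python synthesize.py stats.csv ddict/ out.txt
    main(sys.argv, Path.cwd(), SparkSession.builder)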
Example #2
# Same imports and project-level names as in Example #1.
def main(argv: List[str],
         cwd: Path_T,
         builder: SparkSession_T.Builder,
         driver_memory: str = '16g') -> None:
    # Positional CLI args: NAACCR flat file, sample percentage,
    # data-dictionary dir, and stats output CSV.
    [naaccr_file, sample_, ddict_dir, stats_out] = argv[1:5]
    sample = int(sample_)
    spark = (builder.appName(__file__)
             .config('spark.driver.memory', driver_memory)  # see Example #1
             .getOrCreate())

    ndd = DataDictionary.make_in(spark, cwd / ddict_dir)
    # Parse a `sample`% sample of the flat file per the record layout.
    data_raw = naaccr_read_fwf(
        spark.read.text(str(cwd / naaccr_file)).sample(False, sample / 100),
        ndd.record_layout,
    )
    # Summarize the nominal (coded) items; the to_csv() and head() calls
    # below imply the result is a pandas DataFrame.
    stats = DataSummary.nominal_stats(data_raw, spark, ndd)

    stats.to_csv(cwd / stats_out)
    print(stats.head(10))
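
naaccr_read_fwf belongs to the surrounding project and is not shown here. As a sketch of the underlying technique only (assuming the layout table carries name, start, and length columns with a 1-based start, which is an assumption about its schema), a fixed-width parse in PySpark can slice each text line with substring():

import pyspark.sql.functions as F
from pyspark.sql import DataFrame


def read_fwf_sketch(lines: DataFrame, layout: DataFrame) -> DataFrame:
    """Slice fixed-width lines into named columns.

    Sketch only: assumes `lines` has the single `value` column that
    spark.read.text() produces, and that each `layout` row carries
    `name`, `start` (1-based), and `length`.
    """
    fields = [
        F.substring(lines.value, int(row['start']), int(row['length']))
         .alias(row['name'])
        for row in layout.collect()  # the layout table is small
    ]
    return lines.select(fields)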