Example #1
def test_unpersist():
    """Make sure that cached RDDs are unpersisted
    """
    sm.create("test")

    sql = SQLContext.getOrCreate(sm.sc)

    rdd1 = sm.parallelize(range(10000)).cache()
    rdd1.count()
    df1 = sql.createDataFrame([('Foo', 1)]).cache()
    df1.count()

    before = set(r.id() for r in sm.sc._jsc.getPersistentRDDs().values())

    with sm.clean_cache():
        rdd2 = sm.parallelize(range(0, 10000, 2))
        rdd2.cache()
        df2 = sql.createDataFrame([('Bar', 2)])
        df2.cache()

    assert before == set(r.id()
                         for r in sm.sc._jsc.getPersistentRDDs().values())

    assert rdd1.getStorageLevel().useMemory is True
    assert rdd2.getStorageLevel().useMemory is False

    # FIXME Does not currently work!
    # assert df1.rdd.getStorageLevel().useMemory is True
    assert df2.rdd.getStorageLevel().useMemory is False
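Note: the test excerpts in this section use an `sm` helper module and `SQLContext` without showing the imports. A minimal setup they appear to assume might look like the sketch below; the `sparkmanager` import name is a guess based on the `sm` alias and is not confirmed by the excerpts.

import sparkmanager as sm              # assumed module exposing create(), sc, parallelize(), clean_cache(), ...
from pyspark.sql import SQLContext     # the tests call SQLContext.getOrCreate(sm.sc)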
Example #2
def test_bench():
    """Test the benchmarking
    """
    sm.create("test")

    with sm.benchmark():
        rdd = sm.parallelize(range(10000))
        rdd.count()
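Here `sm.benchmark()` is used as a context manager around the Spark work, presumably to time the enclosed block. A hedged sketch of wrapping a slightly larger job with it; what exactly `benchmark()` records is an assumption based on its name:

with sm.benchmark():
    # Only the action (count) triggers execution, so the timing covers the whole job.
    rdd = sm.parallelize(range(100000)).map(lambda x: x * x)
    rdd.count()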
Example #3
def test_deco():
    """Test the decorator
    """
    sm.create("test")

    @sm.assign_to_jobgroup
    def some_function():
        rdd = sm.parallelize(range(10000))
        rdd.count()

    some_function()
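`sm.assign_to_jobgroup` is applied as a decorator, presumably so that the Spark jobs launched inside the wrapped function are grouped under a named job group (e.g. in the Spark UI); that behaviour is inferred from the decorator's name. A hedged sketch combining it with the benchmark context manager from the previous example:

@sm.assign_to_jobgroup
def count_even_numbers():
    # Jobs triggered here should be attributed to this function's job group.
    return sm.parallelize(range(10000)).filter(lambda x: x % 2 == 0).count()

with sm.benchmark():
    count_even_numbers()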
Example #4
def test_report(tmpdir):
    """Test the decorator
    """
    filename = tmpdir.join("report")
    sm.create("test", report=str(filename), reset=True)

    @sm.assign_to_jobgroup
    def some_function():
        rdd = sm.parallelize(range(10000))
        rdd.count()
    some_function()

    with open(str(filename), 'r') as fd:
        data = json.load(fd)
    assert len(data['timing']) == 1
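`test_report` relies on pytest's built-in `tmpdir` fixture and on `json` (imported elsewhere in the test module). Passing `report=` to `sm.create` apparently makes the manager write a JSON report containing a `timing` section, since that is what the assertion reads back. A hedged sketch of inspecting such a report outside the test; the report's structure beyond the `timing` key is an assumption:

import json

with open("report") as fd:
    data = json.load(fd)

# One timing entry per benchmarked/decorated call, judging by the assertion above.
for entry in data["timing"]:
    print(entry)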
Example #5
def test_reset():
    """Make sure that all caches are reset
    """
    sm.create("test")
    sm.reset_cache()

    sql = SQLContext.getOrCreate(sm.sc)

    assert len(sm.sc._jsc.getPersistentRDDs()) == 0

    rdd1 = sm.parallelize(range(10000))
    rdd1.count()
    rdd1.persist()
    df1 = sql.createDataFrame([('Foo', 1)])
    df1.count()
    df1.persist()

    assert len(sm.sc._jsc.getPersistentRDDs()) > 0

    sm.reset_cache()

    assert len(sm.sc._jsc.getPersistentRDDs()) == 0
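`sm.reset_cache()` unpersists every RDD and DataFrame tracked by the SparkContext, which is what the assertions on `getPersistentRDDs()` verify. A hedged sketch of using it between two phases of a pipeline to free cached data; apart from `sm.reset_cache()` and `sm.spark`, these are plain PySpark calls:

df = sm.spark.range(10000).persist()
df.count()                                          # materialise the cache
assert len(sm.sc._jsc.getPersistentRDDs()) > 0

sm.reset_cache()                                    # drop everything that was persisted
assert len(sm.sc._jsc.getPersistentRDDs()) == 0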
Example #6
def run(file1, file2, output=True, spark_options=None, **opts):
    # type: (str, str, object, object, **object) -> dict

    # ====== Init Spark and dataframes ======
    sm.create("fsumcheck", spark_config, spark_options)

    options = _DEFAULTS.copy()
    options.update(opts)
    df1 = sm.spark.read.schema(SCHEMA).csv(file1, sep=options["delimiter"])
    df2 = sm.spark.read.schema(SCHEMA).csv(file2, sep=options["delimiter"])

    # ======  Optimization ======
    n_partitions = df1.rdd.getNumPartitions()
    shuffle_partitions = ((n_partitions - 1) // 50 + 1) * 50
    if options["verbosity"]:
        print("Processing {} partitions (shuffle counts: {})".format(
            n_partitions, shuffle_partitions))
    sm.conf.set("spark.sql.shuffle.partitions", shuffle_partitions)

    df1 = df1.repartition("filename").persist(StorageLevel.MEMORY_AND_DISK)
    df2 = df2.repartition("filename").persist(StorageLevel.MEMORY_AND_DISK)

    # ======  Checks ======
    # 1 Only left and right
    only_left = (df1.join(df2, "filename", how="left_anti").select(
        df1.filename).where(df1.filename.isNotNull()))
    only_right = (df2.join(df1, "filename", how="left_anti").select(
        df2.filename).where(df2.filename.isNotNull()))

    # 2 Different checksum
    different_checksum = (df1.join(
        df2,
        "filename").where(df1.checksum != df2.checksum).select(df1.filename))

    # 3 Missing field
    problematic_left = (
        df1.where("filename is NULL OR checksum is NULL").select(
            F.when(df1.filename.isNull(),
                   df1.checksum).otherwise(df1.filename).alias("entry")))
    problematic_right = (
        df2.where("filename is NULL OR checksum is NULL").select(
            F.when(df2.filename.isNull(),
                   df2.checksum).otherwise(df2.filename).alias("entry")))

    # ====== Results gathering ======

    all_dfs = OrderedDict([("only_left", only_left),
                           ("only_right", only_right),
                           ("different_checksum", different_checksum),
                           ("problematic_left", problematic_left),
                           ("problematic_right", problematic_right)])

    if output:
        if output is True:
            output = "fscheck_output"
        os.path.exists(output) or os.makedirs(output)

        for name, df in all_dfs.items():
            df = df.cache()
            out_filepath = os.path.join(output, name + ".csv.dir")
            if options["verbosity"]:
                print(" - Creating " + out_filepath)
            df.write.csv(out_filepath, mode="overwrite")

            # Quick merge
            os.system("cat {}/*.csv > {} ".format(
                out_filepath, os.path.join(output, name + ".csv")))
            os.system("rm -rf {}".format(out_filepath))

            print("   Total entries: {}".format(df.count()))
            df.unpersist()

    return all_dfs
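`run()` references several module-level names that the excerpt does not define: `spark_config`, `SCHEMA`, `_DEFAULTS`, `F`, `StorageLevel`, `OrderedDict` and `os`. A hedged sketch of what those definitions could look like; the schema field names follow the `filename`/`checksum` columns used above, but the concrete types, defaults and configuration values are illustrative guesses, not the project's actual settings.

import os
from collections import OrderedDict

from pyspark import StorageLevel
from pyspark.sql import functions as F
from pyspark.sql.types import StructType, StructField, StringType

spark_config = {}                                   # placeholder Spark configuration
SCHEMA = StructType([
    StructField("filename", StringType(), True),
    StructField("checksum", StringType(), True),
])
_DEFAULTS = {"delimiter": ",", "verbosity": 1}

# Example invocation (paths are illustrative):
# results = run("left.csv", "right.csv", output="fscheck_output")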