Example #1: an inline word-count task over a typed Spark DataFrame
from pyspark.sql import DataFrame


# In dbnd this function would typically be decorated as a task;
# the decorator is omitted here as in the source.
def word_count_inline(text: DataFrame) -> DataFrame:
    from operator import add

    from dbnd_spark.spark import get_spark_session

    # Take the first column of each row as a line of text, split it into
    # words, and count occurrences per word.
    lines = text.rdd.map(lambda r: r[0])
    counts = (
        lines.flatMap(lambda x: x.split(" "))
        .map(lambda x: (x, 1))
        .reduceByKey(add)
    )
    output = counts.collect()
    for word, count in output:
        print("%s: %i" % (word, count))

    # Turn the (word, count) pairs back into a DataFrame for the caller.
    return get_spark_session().createDataFrame(counts)
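
A minimal way to exercise the function locally, assuming a plain local SparkSession and made-up sample lines (no dbnd orchestration involved):

from pyspark.sql import SparkSession

spark_session = (
    SparkSession.builder.master("local[*]").appName("word-count-demo").getOrCreate()
)
# A single-column DataFrame of text lines, mirroring what the task receives.
text_df = spark_session.createDataFrame([("hello world",), ("hello dbnd",)], ["line"])
word_count_inline(text_df).show()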
Example #2: the same logic as the run() method of a class-based task
    # `run` method of a class-based dbnd Spark task; the surrounding class
    # (not shown in the source) declares `text` as an input parameter and
    # `counters` / `counters_auto_save` as outputs.
    def run(self):
        from operator import add

        from dbnd_spark.spark import get_spark_session

        lines = self.text.rdd.map(lambda r: r[0])
        counts = (
            lines.flatMap(lambda x: x.split(" "))
            .map(lambda x: (x, 1))
            .reduceByKey(add)
        )
        # `self.counters` is a target object; saveAsTextFile expects a plain
        # string path, hence the str(...) conversion.
        counts.saveAsTextFile(str(self.counters))
        output = counts.collect()
        for word, count in output:
            print("%s: %i" % (word, count))

        # Assigning a DataFrame to an output attribute lets dbnd save it
        # automatically when the task finishes.
        self.counters_auto_save = get_spark_session().createDataFrame(counts)
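
For orientation, a minimal sketch of how such a class might be declared; the base class and the exact output declarations here are assumptions, not taken from the source:

import pyspark.sql as spark
from dbnd import PythonTask, output, parameter


class WordCountTask(PythonTask):  # hypothetical name; base class is an assumption
    text = parameter.csv[spark.DataFrame]  # input loaded as a DataFrame
    counters = output.txt.data             # written explicitly in run()
    counters_auto_save = output.csv.data   # persisted by assignment in run()

    def run(self):
        ...  # body as shown above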
Example #3: reading a folder of CSV files as the input DataFrame
import pyspark.sql as spark
from dbnd import parameter


def word_count_inline_folder(text=parameter.folder.csv[spark.DataFrame]):
    # type: (spark.DataFrame) -> spark.DataFrame
    from operator import add

    from dbnd_spark.spark import get_spark_session

    lines = text.rdd.map(lambda r: r[0])
    counts = (
        lines.flatMap(lambda x: x.split(" "))
        .map(lambda x: (x, 1))
        .reduceByKey(add)
    )
    output = counts.collect()
    for word, count in output:
        print("%s: %i" % (word, count))

    return get_spark_session().createDataFrame(counts)
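
The only change from Example #1 is `parameter.folder.csv`, which maps a whole directory of CSV part files to a single input DataFrame. Plain Spark behaves the same way when its CSV reader is given a directory path; a sketch with a hypothetical path:

from pyspark.sql import SparkSession

spark_session = SparkSession.builder.getOrCreate()
# Spark's CSV reader accepts a directory and loads every part file inside it.
text_df = spark_session.read.csv("data/words_csv_folder/")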
Example #4: declaring an output target and logging metrics
import pyspark.sql as spark
from dbnd import log_dataframe, log_metric, output, parameter


def word_count_inline(text=parameter.csv[spark.DataFrame],
                      counters=output.txt.data):
    # type: (spark.DataFrame, Target) -> spark.DataFrame
    from operator import add

    from dbnd_spark.spark import get_spark_session

    lines = text.rdd.map(lambda r: r[0])
    counts = (
        lines.flatMap(lambda x: x.split(" "))
        .map(lambda x: (x, 1))
        .reduceByKey(add)
    )
    # Write the raw counts to the declared `counters` output target.
    counts.saveAsTextFile(str(counters))
    collected = counts.collect()  # local name avoids shadowing dbnd's `output` import
    for word, count in collected:
        print("%s: %i" % (word, count))

    counts_df = get_spark_session().createDataFrame(counts)
    # Report the result DataFrame and a custom metric to dbnd's tracker.
    log_dataframe("counts_df", counts_df)
    log_metric("test", 1)

    return counts_df
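
The same logging API can carry computed values as well; as a sketch, an extra metric one might log inside the function after counts_df is built (the metric name here is made up):

log_metric("distinct_words", counts_df.count())  # one row per unique word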