def word_count_inline(text: DataFrame) -> DataFrame:
    """Count word occurrences in the first column of *text*.

    Prints every (word, count) pair to stdout and returns the counts as a
    fresh Spark DataFrame built from the reduced RDD.
    """
    from operator import add

    from dbnd_spark.spark import get_spark_session

    # Each row's first column is treated as one line of text.
    raw_lines = text.rdd.map(lambda row: row[0])
    word_counts = (
        raw_lines.flatMap(lambda line: line.split(" "))
        .map(lambda word: (word, 1))
        .reduceByKey(add)
    )
    # collect() pulls all pairs to the driver for printing.
    for word, count in word_counts.collect():
        print("%s: %i" % (word, count))
    return get_spark_session().createDataFrame(word_counts)
def run(self):
    """Count words in ``self.text``, persist and report the result.

    Saves the raw (word, count) pairs as text under ``self.counters``,
    prints each pair to stdout, and assigns a DataFrame built from the
    counts to ``self.counters_auto_save``.
    """
    from operator import add

    from dbnd_spark.spark import get_spark_session

    # First column of every row holds the text line to split.
    raw_lines = self.text.rdd.map(lambda row: row[0])
    word_counts = (
        raw_lines.flatMap(lambda line: line.split(" "))
        .map(lambda word: (word, 1))
        .reduceByKey(add)
    )
    word_counts.saveAsTextFile(str(self.counters))
    # collect() pulls all pairs to the driver for printing.
    for word, count in word_counts.collect():
        print("%s: %i" % (word, count))
    self.counters_auto_save = get_spark_session().createDataFrame(word_counts)
def word_count_inline_folder(text=parameter.folder.csv[spark.DataFrame]):
    # type: (spark.DataFrame) -> spark.DataFrame
    """Count word occurrences in the first column of a folder-backed CSV input.

    Prints every (word, count) pair to stdout and returns the counts as a
    fresh Spark DataFrame built from the reduced RDD.
    """
    from operator import add

    from dbnd_spark.spark import get_spark_session

    # Each row's first column is treated as one line of text.
    raw_lines = text.rdd.map(lambda row: row[0])
    word_counts = (
        raw_lines.flatMap(lambda line: line.split(" "))
        .map(lambda word: (word, 1))
        .reduceByKey(add)
    )
    # collect() pulls all pairs to the driver for printing.
    for word, count in word_counts.collect():
        print("%s: %i" % (word, count))
    return get_spark_session().createDataFrame(word_counts)
def word_count_inline(text=parameter.csv[spark.DataFrame], counters=output.txt.data):
    # type: (spark.DataFrame, Target) -> spark.DataFrame
    """Count word occurrences in the first column of *text*.

    Saves the raw (word, count) pairs as text to the *counters* target,
    prints each pair to stdout, logs the resulting DataFrame and a metric,
    and returns the counts as a Spark DataFrame.
    """
    from operator import add

    from dbnd_spark.spark import get_spark_session

    # Each row's first column is treated as one line of text.
    raw_lines = text.rdd.map(lambda row: row[0])
    word_counts = (
        raw_lines.flatMap(lambda line: line.split(" "))
        .map(lambda word: (word, 1))
        .reduceByKey(add)
    )
    word_counts.saveAsTextFile(str(counters))
    # collect() pulls all pairs to the driver for printing.
    for word, count in word_counts.collect():
        print("%s: %i" % (word, count))
    result_df = get_spark_session().createDataFrame(word_counts)
    log_dataframe("counts_df", result_df)
    log_metric("test", 1)
    return result_df