def get_gender_count(gender_counts: RDD, sc: SparkContext) -> RDD:
    """
    Format the gender counts of the current batch, largest count first

    Arguments:
        gender_counts (RDD): stream of records read as x[0] (label) and
            x[1] (numeric count). ``transform`` is called on it, so it is
            presumably a DStream rather than a plain RDD (TODO confirm
            against the caller)
        sc (SparkContext): SparkContext (not used by this function)

    Returns:
        RDD: result of ``transform(...).map(...)``; one formatted string
            per input record
    """
    def _by_count_desc(batch):
        # Negated count as sort key puts the largest counts first.
        return batch.sortBy(lambda record: -record[1])

    def _describe(record):
        return f"{record[0]}'s this batch:\t{record[1]}"

    ordered_counts = gender_counts.transform(_by_count_desc)
    return ordered_counts.map(_describe)
def get_hostname_counts(hostname_counts: RDD, sc: SparkContext) -> RDD:
    """
    Get the three hostnames with the highest counts in the current batch

    Each batch is sorted by count in descending order *before* the first
    three records are taken, so the result really is the top 3. (Taking
    three records first and sorting afterwards would only order three
    arbitrary records.)

    Arguments:
        hostname_counts (RDD): stream of records read as x[0] (hostname)
            and x[1] (numeric count). ``transform`` is called on it, so it
            is presumably a DStream rather than a plain RDD (TODO confirm
            against the caller)
        sc (SparkContext): SparkContext used to turn the list returned by
            ``take`` back into an RDD

    Returns:
        RDD: result of ``transform(...).map(...)``; at most three formatted
            strings per batch, highest count first
    """
    def _top3_by_count(rdd):
        # Negated count as sort key puts the largest counts first.
        ranked = rdd.sortBy(lambda x: -x[1])
        # take() returns a plain list; transform() must return an RDD,
        # so wrap the leading three records again.
        return sc.parallelize(ranked.take(3))

    hostname_top3 = hostname_counts.transform(_top3_by_count)
    hostname_distribution = hostname_top3.map(
        lambda x: f"Top3 hostnames this batch:\t{x[0]}\t(Count: {x[1]})")
    return hostname_distribution
def get_most_represented_country(country_counts: RDD,
                                 sc: SparkContext) -> RDD:
    """
    Report the country with the highest count in the current batch

    Arguments:
        country_counts (RDD): stream of records read as x[0] (country) and
            x[1] (numeric count). ``transform`` is called on it, so it is
            presumably a DStream rather than a plain RDD (TODO confirm
            against the caller)
        sc (SparkContext): SparkContext used to turn the list returned by
            ``take`` back into an RDD

    Returns:
        RDD: result of the ``transform``/``map`` chain; at most one
            formatted string per batch
    """
    def _by_count_desc(batch):
        # Negated count as sort key puts the largest counts first.
        return batch.sortBy(lambda record: -record[1])

    def _leading_record(batch):
        # take() returns a plain list; wrap it back into an RDD.
        return sc.parallelize(batch.take(1))

    def _describe(record):
        return f"Most represented country in batch: {record[0]} ({record[1]})"

    ranked = country_counts.transform(_by_count_desc)
    winner = ranked.transform(_leading_record)
    return winner.map(_describe)