Exemplo n.º 1
0
def get_gender_count(gender_counts: RDD, sc: SparkContext) -> RDD:
    """ Format the gender counts of each batch, highest count first

        Arguments:
            gender_counts (RDD): stream whose elements are indexable pairs;
                element [0] is used as the label and element [1] as the
                numeric count (presumably (gender, count) -- verify against
                the caller). NOTE(review): annotated as RDD, but
                ``transform`` is called on it, which suggests a DStream.
            sc (SparkContext): SparkContext (not used by this function)

        Returns:
            RDD: result of ``transform`` + ``map``; one formatted string
                per input pair, ordered by descending count within a batch
    """
    def _negated_count(pair):
        # Negating the count turns the ascending sort into a descending one.
        return -pair[1]

    def _order_batch(batch):
        return batch.sortBy(_negated_count)

    def _describe(pair):
        return f"{pair[0]}'s this batch:\t{pair[1]}"

    ordered_batches = gender_counts.transform(_order_batch)
    return ordered_batches.map(_describe)
Exemplo n.º 2
0
def get_hostname_counts(hostname_counts: RDD, sc: SparkContext) -> RDD:
    """ Get the three hostnames with the highest counts in the current batch

        Arguments:
            hostname_counts (RDD): stream whose elements are indexable pairs;
                element [0] is used as the hostname label and element [1] as
                the numeric count. NOTE(review): annotated as RDD, but
                ``transform`` is called on it, which suggests a DStream --
                confirm against the caller.
            sc (SparkContext): SparkContext used to turn the selected rows
                back into an RDD inside ``transform``

        Returns:
            RDD: result of ``transform`` + ``map``; at most three formatted
                strings per batch, ordered by descending count
    """

    def _top3_by_count(rdd):
        # Rank before truncating: a plain take(3) on an unsorted batch would
        # return whichever three rows come first, not the three largest.
        # takeOrdered returns the rows with the smallest keys in ascending
        # key order, so negating the count yields the top 3, largest first.
        top_rows = rdd.takeOrdered(3, key=lambda pair: -pair[1])
        return sc.parallelize(top_rows)

    hostname_top3 = hostname_counts.transform(_top3_by_count)
    hostname_distribution = hostname_top3.map(
        lambda x: f"Top3 hostnames this batch:\t{x[0]}\t(Count: {x[1]})")

    return hostname_distribution
Exemplo n.º 3
0
def get_most_represented_country(country_counts: RDD, sc: SparkContext) -> RDD:
    """ Report the country with the highest count in each batch

        Arguments:
            country_counts (RDD): stream whose elements are indexable pairs;
                element [0] is used as the country label and element [1] as
                the numeric count. NOTE(review): annotated as RDD, but
                ``transform`` is called on it, which suggests a DStream --
                confirm against the caller.
            sc (SparkContext): SparkContext used to turn the selected row
                back into an RDD inside ``transform``

        Returns:
            RDD: result of ``transform`` + ``map``; at most one formatted
                string per batch
    """

    def _rank_by_count(batch):
        # Negated count => descending order, so the largest comes first.
        return batch.sortBy(lambda entry: -entry[1])

    def _keep_leader(batch):
        # take() returns a plain list; re-wrap it so transform gets an RDD.
        return sc.parallelize(batch.take(1))

    def _describe(entry):
        return f"Most represented country in batch: {entry[0]} ({entry[1]})"

    ranked = country_counts.transform(_rank_by_count)
    leader = ranked.transform(_keep_leader)
    return leader.map(_describe)