from typing import Any, Dict, List

import pyspark

def build_vocabulary(df: pyspark.sql.DataFrame) -> Dict[str, List[Any]]:
    """Collect the sorted distinct values of every categorical column."""
    vocab = {}
    for col in CATEGORICAL_COLS:
        values = [r[0] for r in df.select(col).distinct().collect()]
        # Use the first non-null value's type to build a neutral default so
        # that None entries can still be compared during sorting.
        col_type = type([x for x in values if x is not None][0])
        default_value = col_type()
        vocab[col] = sorted(values, key=lambda x: x if x is not None else default_value)
    return vocab
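A minimal usage sketch (the CATEGORICAL_COLS value, the local SparkSession and the toy data are assumptions made for illustration):

from pyspark.sql import SparkSession

CATEGORICAL_COLS = ['color', 'size']  # assumed for this sketch

spark = SparkSession.builder.master('local[1]').getOrCreate()
df = spark.createDataFrame([('red', 'S'), ('blue', None), ('red', 'M')],
                           ['color', 'size'])
print(build_vocabulary(df))
# e.g. {'color': ['blue', 'red'], 'size': [None, 'M', 'S']}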
Example #2
from pyspark.sql import functions as f

def normalise_fields_names(df: pyspark.sql.DataFrame,
                           fieldname_normaliser=__normalise_fieldname__):
    # Rename every top-level field (and, via __rename_nested_field__, every
    # nested field) with the supplied normaliser, keeping the data unchanged.
    return df.select([
        f.col("`{}`".format(field.name)).cast(
            __rename_nested_field__(field.dataType,
                                    fieldname_normaliser)).alias(
                                        fieldname_normaliser(field.name))
        for field in df.schema.fields
    ])
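A usage sketch, assuming the module-private helpers __normalise_fieldname__ and __rename_nested_field__ from the same utility module are in scope (a custom normaliser is passed explicitly here):

df = spark.createDataFrame([(1, 'a')], ['User Id', 'Some Value'])
clean = normalise_fields_names(df,
                               lambda name: name.strip().lower().replace(' ', '_'))
print(clean.columns)  # ['user_id', 'some_value']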
Example #3
import pyspark as sk

def perc_weather_cancellations_per_week(spark: sk.sql.SparkSession,
                                        data: sk.sql.DataFrame) -> sk.RDD:
    # Cancelled flights (filter rather than select; not used further below).
    onlycancelled = data.filter(data['Cancelled'] == 1)
    # For every flight emit (week, (1, 1 if the cancellation code is 'B', i.e.
    # weather, else 0)).
    codeperweek = data.rdd.map(lambda row: (week_from_row(row), (1, 1 if (str(
        row['CancellationCode']).strip() == 'B') else 0)))
    # Per week: total flights and weather cancellations.
    fractioncancelled = codeperweek.reduceByKey(lambda l, r:
                                                (l[0] + r[0], l[1] + r[1]))
    # Weather cancellations as a percentage of all flights, ordered by week.
    return fractioncancelled.mapValues(
        lambda v: v[1] / v[0] * 100.0).sortByKey()
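An invocation sketch; week_from_row is not shown above, so a hypothetical implementation based on Year/Month/DayofMonth columns (as in the airline on-time dataset) is assumed, along with the input file name:

from datetime import date

def week_from_row(row):
    # hypothetical helper: ISO week number of the flight date
    return date(int(row['Year']), int(row['Month']),
                int(row['DayofMonth'])).isocalendar()[1]

flights = spark.read.csv('flights.csv', header=True)  # assumed input file
weekly_pct = perc_weather_cancellations_per_week(spark, flights)
print(weekly_pct.take(5))  # [(week, percent cancelled for weather), ...]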
Example #4
import logging

import pyspark

log = logging.getLogger(__name__)

def show_df(df: pyspark.sql.DataFrame,
            columns: list,
            rows: int = 10,
            sample=False,
            truncate=True):
    """
    Prints the first `rows` rows of the selected columns of a pyspark df,
    optionally drawing them from a random sample.

    :param df:  pyspark dataframe
    :param columns: list of columns to print
    :param rows: how many rows to print - default 10
    :param sample: should we sample - default False
    :param truncate: truncate output - default True
    :return: None
    """
    if sample:
        # Sample just enough of the dataframe to expect `rows` rows back.
        sample_percent = min(rows / df.count(), 1.0)
        log.info(f'sampling percentage: {sample_percent}')
        df.select(columns).sample(False, sample_percent,
                                  seed=1).show(rows, truncate=truncate)
    else:
        df.select(columns).show(rows, truncate=truncate)
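An example call (the parquet path and column names are assumptions for this sketch):

users = spark.read.parquet('users.parquet')
show_df(users, ['id', 'country'], rows=5)                 # plain head
show_df(users, ['id', 'country'], rows=5, sample=True)    # sampled rows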
Example #5
    def process_inquiries(self, review: pyspark.sql.DataFrame,
                          metadata: pyspark.sql.DataFrame) -> None:
        logging.info("Start pipeline")

        logging.info("Processing")
        # Convert the unix timestamp to a date and split it into month/year.
        review_transform_date = review.select(
            'asin', 'overall',
            'unixReviewTime').withColumn("unixReviewTime",
                                         from_unixtime("unixReviewTime"))
        review_date_decompose = review_transform_date.withColumn(
            "month",
            month("unixReviewTime")).withColumn("year", year("unixReviewTime"))
        # Explode the nested category lists to one category ('col') per row.
        metadata_flatten_categories = metadata.select(
            'asin', explode('categories')).select('asin', explode('col'))
        join_review_metadata = review_date_decompose.join(
            metadata_flatten_categories, on=['asin'], how='inner')
        # Count reviews per (year, month, category).
        groupby_review_metadata = join_review_metadata.groupBy(
            "year", "month", "col").count().orderBy('year',
                                                    'month',
                                                    'count',
                                                    ascending=False).cache()
        # Rank categories within each (year, month) window and keep the top 5.
        patrions = groupby_review_metadata.withColumn(
            "rank",
            row_number().over(self.get_partitions())).cache()
        filter_patrions = patrions.filter(patrions.rank <= 5).cache()
        groupby_review_metadata.unpersist()
        result_inner = join_review_metadata.join(filter_patrions,
                                                 on=['year', 'month', 'col'],
                                                 how='inner')
        patrions.unpersist()
        filter_patrions.unpersist()
        # Average rating per (year, month, category), renamed to 'rating'.
        result_groupby = result_inner.groupBy(
            'year', 'month',
            'col').avg('overall').withColumnRenamed(
                'avg(overall)', 'rating').orderBy('year',
                                                  'month',
                                                  ascending=True)
        result_groupby.show()
        logging.info("Finished")
        self.upsert_database(result_groupby, 'mydb', 'myset')
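The method above relies on module-level imports and two class members (get_partitions, upsert_database) that are not shown. A sketch of the imports it needs, plus a Window that get_partitions() could plausibly return (an assumption: categories ranked within each year/month partition by descending review count):

import logging

import pyspark
from pyspark.sql.functions import desc, explode, from_unixtime, month, row_number, year
from pyspark.sql.window import Window

rank_window = Window.partitionBy('year', 'month').orderBy(desc('count'))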
Example #6
from pyspark.sql import functions as f

def flatten(df: pyspark.sql.DataFrame,
            fieldname_normaliser=__normalise_fieldname__):
    # Build one column expression per leaf field, flattening nested structs
    # and applying Spark's transform() to fields nested inside arrays.
    cols = []
    for child in __get_fields_info__(df.schema):
        if len(child) > 2:
            ex = "x.{}".format(child[-1])
            for seg in child[-2:0:-1]:
                if seg != '``':
                    ex = "transform(x.{outer}, x -> {inner})".format(outer=seg,
                                                                     inner=ex)
            ex = "transform({outer}, x -> {inner})".format(outer=child[0],
                                                           inner=ex)
        else:
            ex = ".".join(child)
        cols.append(
            f.expr(ex).alias(
                fieldname_normaliser("_".join(child).replace('`', ''))))
    return df.select(cols)
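A usage sketch, again assuming the companion helpers (__get_fields_info__, __normalise_fieldname__) from the same utility module are in scope; the flattened column names depend on those helpers, so the printed result is only indicative:

nested = spark.createDataFrame(
    [(('Oslo', '0150'), [1, 2])],
    'address struct<city:string,zip:string>, scores array<int>')
flat = flatten(nested)
print(flat.columns)  # e.g. ['address_city', 'address_zip', 'scores']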