Example No. 1
    def test_percentile_approx(self):
        actual = list(
            chain.from_iterable(
                [
                    re.findall("(percentile_approx\\(.*\\))", str(x))
                    for x in [
                        percentile_approx(col("foo"), lit(0.5)),
                        percentile_approx(col("bar"), 0.25, 42),
                        percentile_approx(col("bar"), [0.25, 0.5, 0.75]),
                        percentile_approx(col("foo"), (0.05, 0.95), 100),
                        percentile_approx("foo", 0.5),
                        percentile_approx("bar", [0.1, 0.9], lit(10)),
                    ]
                ]
            )
        )

        expected = [
            "percentile_approx(foo, 0.5, 10000)",
            "percentile_approx(bar, 0.25, 42)",
            "percentile_approx(bar, array(0.25, 0.5, 0.75), 10000)",
            "percentile_approx(foo, array(0.05, 0.95), 100)",
            "percentile_approx(foo, 0.5, 10000)",
            "percentile_approx(bar, array(0.1, 0.9), 10)",
        ]

        self.assertListEqual(actual, expected)
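For context, a minimal usage sketch (hypothetical data, assuming an active SparkSession named spark): the test above only checks string representations, while on a real DataFrame the same calls return approximate percentile values, with the default accuracy of 10000 filled in whenever the third argument is omitted.

from pyspark.sql import SparkSession
from pyspark.sql.functions import col, percentile_approx

spark = SparkSession.builder.getOrCreate()

# Hypothetical toy data; column names mirror the test above.
df = spark.createDataFrame([(float(i), 2.0 * i) for i in range(1, 101)], ["foo", "bar"])

df.select(
    percentile_approx(col("foo"), 0.5).alias("p50_default_accuracy"),  # accuracy defaults to 10000
    percentile_approx("bar", [0.25, 0.75], 100).alias("p25_p75"),      # explicit accuracy of 100
).show()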
Example No. 2
    def median(self, sparkDataFrame, columnNames):
        '''
        https://spark.apache.org/docs/3.1.1/api/python/reference/api/pyspark.sql.functions.percentile_approx.html
        with an even number of rows it does not interpolate between the two middle values
        '''
        self.logger.warn("median BEGIN")
        retDF = sparkDataFrame.select(
            *(percentile_approx(c, 0.5, accuracy=1000000)
              for c in columnNames))

        self.logger.warn("median retDF numRows:{} numCols:{}"
                         .format(retDF.count(), len(retDF.columns)))

        self.logger.warn("median END\n")
        return retDF
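A small sketch of the behavior the docstring warns about (hypothetical data, assuming an existing SparkSession spark): with an even number of rows, percentile_approx returns one of the actual column values instead of interpolating between the two middle values.

from pyspark.sql.functions import percentile_approx

even_df = spark.createDataFrame([(1.0,), (2.0,), (3.0,), (4.0,)], ["value"])

# The exact median would be (2.0 + 3.0) / 2 = 2.5, but percentile_approx
# returns an existing value (2.0 here) rather than the interpolated midpoint.
even_df.select(percentile_approx("value", 0.5).alias("approx_median")).show()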
Example No. 3
def calcular_metrica_percentil(viajes_didier_df, percentil):

    # exclude records where kilometros or precio_kilometro is 0, negative, or null
    viajes_didier_df = viajes_didier_df.filter(viajes_didier_df.kilometros > 0)
    viajes_didier_df = viajes_didier_df.filter(
        viajes_didier_df.precio_kilometro > 0)

    viajes_didier_df = viajes_didier_df.withColumn(
        "Ingreso_por_Viaje",
        col("kilometros") * col("precio_kilometro"))

    # compute the total income per person, ordered from lowest to highest income
    personas_ingresos_df = viajes_didier_df.groupBy("identificador").sum(
        "Ingreso_por_Viaje")
    personas_ingresos_df = personas_ingresos_df.orderBy(
        col("sum(Ingreso_por_Viaje)").asc(),
        col("identificador").asc())

    # if the supplied percentile is below 0, clamp it to 0 (the minimum)
    if (percentil < 0):
        percentil = 0

    # if the supplied percentile is above 100, clamp it to 100 (the maximum)
    if (percentil > 100):
        percentil = 100

    metrica = "percentil_" + str(percentil)
    # compute the requested percentile
    valor_percentil_df = personas_ingresos_df.select(
        percentile_approx("sum(Ingreso_por_Viaje)",
                          [percentil / 100])[0].alias("Valor"))
    valor_percentil_df = valor_percentil_df.withColumn("Tipo_de_Metrica",
                                                       lit(metrica))
    valor_percentil_df = valor_percentil_df.select(col("Tipo_de_Metrica"),
                                                   col("Valor"))

    return valor_percentil_df
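A hypothetical invocation sketch for the function above (toy data, assuming a SparkSession spark and the same imports used by the function are in scope):

viajes_prueba_df = spark.createDataFrame(
    [("p1", 10.0, 2.0), ("p1", 5.0, 2.0), ("p2", 8.0, 3.0), ("p3", 0.0, 4.0)],
    ["identificador", "kilometros", "precio_kilometro"],
)

# Returns a single row with Tipo_de_Metrica = "percentil_50" and Valor set to
# the approximate 50th percentile of total income per person.
calcular_metrica_percentil(viajes_prueba_df, 50).show()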
Example No. 4
    def compute_multicol_stats(data, colnames, whis, precision):
        # Computes mean, median, Q1 and Q3 with approx_percentile and precision
        scol = []
        for colname in colnames:
            scol.append(
                F.percentile_approx(
                    "`%s`" % colname, [0.25, 0.50, 0.75], int(1.0 / precision)
                ).alias("{}_percentiles".format(colname))
            )
            scol.append(F.mean("`%s`" % colname).alias("{}_mean".format(colname)))

        #      a_percentiles  a_mean    b_percentiles  b_mean
        # 0  [3.0, 3.2, 3.2]    3.18  [5.1, 5.9, 6.4]    5.86
        pdf = data._internal.resolved_copy.spark_frame.select(*scol).toPandas()

        i = 0
        multicol_stats = {}
        for colname in colnames:
            q1, med, q3 = pdf.iloc[0, i]
            iqr = q3 - q1
            lfence = q1 - whis * iqr
            ufence = q3 + whis * iqr
            i += 1

            mean = pdf.iloc[0, i]
            i += 1

            multicol_stats[colname] = {
                "mean": mean,
                "med": med,
                "q1": q1,
                "q3": q3,
                "lfence": lfence,
                "ufence": ufence,
            }

        return multicol_stats
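One detail worth noting: the accuracy passed to percentile_approx is derived as int(1.0 / precision), so a precision of 0.01 maps to an accuracy of 100. A standalone sketch of the same single-pass pattern (hypothetical column names and data, assuming a SparkSession spark), where each list of percentiles comes back as a single array cell:

import pyspark.sql.functions as F

precision = 0.01  # hypothetical value; accuracy becomes int(1.0 / 0.01) == 100

df = spark.createDataFrame([(float(i), float(i) ** 2) for i in range(1, 11)], ["a", "b"])

stats_row = df.select(
    F.percentile_approx("a", [0.25, 0.50, 0.75], int(1.0 / precision)).alias("a_percentiles"),
    F.mean("a").alias("a_mean"),
    F.percentile_approx("b", [0.25, 0.50, 0.75], int(1.0 / precision)).alias("b_percentiles"),
    F.mean("b").alias("b_mean"),
).first()

q1, med, q3 = stats_row["a_percentiles"]  # the three percentiles arrive in one array column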
Example No. 5
def calculate_time_horizon(df: DataFrame, ts_col: str, freq: str,
                           partition_cols: List[str]):
    # Convert Frequency using resample dictionary
    parsed_freq = checkAllowableFreq(freq)
    freq = f"{parsed_freq[0]} {freq_dict[parsed_freq[1]]}"

    # Get max and min timestamp per partition
    partitioned_df: DataFrame = df.groupBy(*partition_cols).agg(
        max(ts_col).alias("max_ts"),
        min(ts_col).alias("min_ts"),
    )

    # Generate upscale metrics
    normalized_time_df: DataFrame = (
        partitioned_df
        .withColumn("min_epoch_ms", expr("unix_millis(min_ts)"))
        .withColumn("max_epoch_ms", expr("unix_millis(max_ts)"))
        .withColumn(
            "interval_ms",
            expr(
                f"unix_millis(cast('1970-01-01 00:00:00.000+0000' as TIMESTAMP) + INTERVAL {freq})"
            ),
        )
        .withColumn("rounded_min_epoch", expr("min_epoch_ms - (min_epoch_ms % interval_ms)"))
        .withColumn("rounded_max_epoch", expr("max_epoch_ms - (max_epoch_ms % interval_ms)"))
        .withColumn("diff_ms", expr("rounded_max_epoch - rounded_min_epoch"))
        .withColumn("num_values", expr("(diff_ms / interval_ms) + 1"))
    )

    (
        min_ts,
        max_ts,
        min_value_partition,
        max_value_partition,
        p25_value_partition,
        p50_value_partition,
        p75_value_partition,
        total_values,
    ) = normalized_time_df.select(
        min("min_ts"),
        max("max_ts"),
        min("num_values"),
        max("num_values"),
        percentile_approx("num_values", 0.25),
        percentile_approx("num_values", 0.5),
        percentile_approx("num_values", 0.75),
        sum("num_values"),
    ).first()

    warnings.simplefilter("always", ResampleWarning)
    warnings.warn(
        f"""
            Resample Metrics Warning: 
                Earliest Timestamp: {min_ts}
                Latest Timestamp: {max_ts}
                No. of Unique Partitions: {normalized_time_df.count()}
                Resampled Min No. Values in a Single Partition: {min_value_partition}
                Resampled Max No. Values in a Single Partition: {max_value_partition}
                Resampled P25 No. Values in a Single Partition: {p25_value_partition}
                Resampled P50 No. Values in a Single Partition: {p50_value_partition}
                Resampled P75 No. Values in a Single Partition: {p75_value_partition}
                Resampled Total No. Values Across All Partitions: {total_values}
        """,
        ResampleWarning,
    )
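The interval_ms column above converts a textual frequency into milliseconds by adding the interval to the epoch timestamp and calling unix_millis. A minimal sketch of that trick in isolation (assuming a SparkSession spark; "5 minutes" is just an illustrative frequency):

from pyspark.sql.functions import expr

freq = "5 minutes"  # hypothetical frequency string
spark.range(1).select(
    expr(
        f"unix_millis(cast('1970-01-01 00:00:00.000+0000' as TIMESTAMP) + INTERVAL {freq}) AS interval_ms"
    )
).show()
# interval_ms is 300000 for a 5 minute interval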
Example No. 6
udfExpand = F.udf(exlodeVisits, T.MapType(T.DateType(), T.IntegerType()))
df = spark.read.csv('nyc_restaurant_pattern.csv', header=True, escape='"') \
       .select("placekey", "safegraph_place_id",
               F.explode(udfExpand('date_range_start', 'visits_by_day'))
                .alias('date', 'visits'))
       # .where(f"date=='{date}'")
# Credit to the professor; I leveraged this piece of code from class


categories = [
    "big_box_grocers",
    "convenience_stores",
    "drinking_places",
    "full_service_restaurants",
    "limited_service_restaurants",
    "pharmacies_and_drug_stores",
    "snack_and_bakeries",
    "specialty_food_stores",
    "supermarkets_except_convenience_stores",
]

for c in categories:
  df.join(filteredCorePlaces, ["placekey"], "inner").groupBy("date", "file_name")\
    .agg(F.percentile_approx("visits", 0.5).alias('median'),
         F.round(F.stddev("visits")).cast("integer").alias('std'))\
    .withColumn("low", F.when(F.col("std") > F.col("median"), 0).otherwise(F.col("median") - F.col("std")))\
    .withColumn("high", F.col("median") + F.col("std"))\
    .withColumn("year", F.year("date"))\
    .withColumn("project_date", F.add_months(F.col("date"), 12))\
    .sort(F.col("year"), F.col("project_date"))\
    .where((F.col("year").isin(2019, 2020)) & (F.col("file_name") == c))\
    .select(F.col("year"), F.col("project_date").alias('date'), F.col("median"), F.col("low"), F.col("high"))\
    .coalesce(1).write.mode("overwrite").option("header", True).format("csv")\
    .save("/{}/{}/{}.csv".format(OUTPUT_PREFIX, c, c))
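For context, a small sketch of the median/standard-deviation band computed inside the loop above (toy data, assuming a SparkSession spark): the lower bound is clamped to 0 whenever the standard deviation exceeds the median.

import pyspark.sql.functions as F

visits_df = spark.createDataFrame(
    [("2019-03-01", 3), ("2019-03-01", 5), ("2019-03-01", 40)], ["date", "visits"]
)

visits_df.groupBy("date").agg(
    F.percentile_approx("visits", 0.5).alias("median"),
    F.round(F.stddev("visits")).cast("integer").alias("std"),
).withColumn(
    "low", F.when(F.col("std") > F.col("median"), 0).otherwise(F.col("median") - F.col("std"))
).withColumn("high", F.col("median") + F.col("std")).show()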