def test_mixed_sql_and_udf(self):
        df = self.data
        w = self.unbounded_window
        ow = self.ordered_window
        max_udf = self.pandas_agg_max_udf
        min_udf = self.pandas_agg_min_udf

        result1 = df.withColumn('v_diff', max_udf(df['v']).over(w) - min_udf(df['v']).over(w))
        expected1 = df.withColumn('v_diff', max(df['v']).over(w) - min(df['v']).over(w))

        # Test mixing sql window function and window udf in the same expression
        result2 = df.withColumn('v_diff', max_udf(df['v']).over(w) - min(df['v']).over(w))
        expected2 = expected1

        # Test chaining sql aggregate function and udf
        result3 = df.withColumn('max_v', max_udf(df['v']).over(w)) \
                    .withColumn('min_v', min(df['v']).over(w)) \
                    .withColumn('v_diff', col('max_v') - col('min_v')) \
                    .drop('max_v', 'min_v')
        expected3 = expected1

        # Test mixing sql window function and udf
        result4 = df.withColumn('max_v', max_udf(df['v']).over(w)) \
                    .withColumn('rank', rank().over(ow))
        expected4 = df.withColumn('max_v', max(df['v']).over(w)) \
                      .withColumn('rank', rank().over(ow))

        self.assertPandasEqual(expected1.toPandas(), result1.toPandas())
        self.assertPandasEqual(expected2.toPandas(), result2.toPandas())
        self.assertPandasEqual(expected3.toPandas(), result3.toPandas())
        self.assertPandasEqual(expected4.toPandas(), result4.toPandas())
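# For context, the pandas_agg_max_udf / pandas_agg_min_udf fixtures used above
# are grouped-aggregate pandas UDFs. A minimal sketch of how such fixtures
# might be defined (Spark 2.4-style decorator; the bodies here are an
# assumption, not the test suite's actual fixture code):
from pyspark.sql.functions import pandas_udf, PandasUDFType

@pandas_udf('double', PandasUDFType.GROUPED_AGG)
def pandas_agg_max_udf(v):
    # receives a pandas Series for the window frame, returns a scalar
    return v.max()

@pandas_udf('double', PandasUDFType.GROUPED_AGG)
def pandas_agg_min_udf(v):
    return v.min()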
Example #2
    def test_window_functions(self):
        df = self.sqlCtx.createDataFrame([(1, "1"), (2, "2"), (1, "2"), (1, "2")], ["key", "value"])
        w = Window.partitionBy("value").orderBy("key")
        from pyspark.sql import functions as F

        sel = df.select(
            df.value,
            df.key,
            F.max("key").over(w.rowsBetween(0, 1)),
            F.min("key").over(w.rowsBetween(0, 1)),
            F.count("key").over(w.rowsBetween(float("-inf"), float("inf"))),
            F.rowNumber().over(w),
            F.rank().over(w),
            F.denseRank().over(w),
            F.ntile(2).over(w),
        )
        rs = sorted(sel.collect())
        expected = [
            ("1", 1, 1, 1, 1, 1, 1, 1, 1),
            ("2", 1, 1, 1, 3, 1, 1, 1, 1),
            ("2", 1, 2, 1, 3, 2, 1, 1, 1),
            ("2", 2, 2, 2, 3, 3, 3, 2, 2),
        ]
        for r, ex in zip(rs, expected):
            self.assertEqual(tuple(r), ex[: len(r)])
def process_file(date_update):
    """Process downloaded MEDLINE folder to parquet file"""
    print("Process MEDLINE file to parquet")
    # remove if the parquet folders still exist
    existing_parquet = glob(os.path.join(save_dir, 'medline_*.parquet'))
    if existing_parquet:
        subprocess.call(['rm', '-rf'] + existing_parquet)

    date_update_str = date_update.strftime("%Y_%m_%d")
    path_rdd = sc.parallelize(glob(os.path.join(download_dir, 'medline*.xml.gz')), numSlices=1000)
    parse_results_rdd = path_rdd.\
        flatMap(lambda x: [Row(file_name=os.path.basename(x), **publication_dict)
                           for publication_dict in pp.parse_medline_xml(x)])
    medline_df = parse_results_rdd.toDF()
    medline_df.write.parquet(os.path.join(save_dir, 'medline_raw_%s.parquet' % date_update_str),
                             mode='overwrite')

    window = Window.partitionBy(['pmid']).orderBy(desc('file_name'))
    windowed_df = medline_df.select(
        max('delete').over(window).alias('is_deleted'),
        rank().over(window).alias('pos'),
        '*')
    windowed_df.\
        where('is_deleted = False and pos = 1').\
        write.parquet(os.path.join(save_dir, 'medline_lastview_%s.parquet' % date_update_str),
                      mode='overwrite')

    # parse grant database
    parse_grant_rdd = path_rdd.flatMap(lambda x: pp.parse_medline_grant_id(x))\
        .filter(lambda x: x is not None)\
        .map(lambda x: Row(**x))
    grant_df = parse_grant_rdd.toDF()
    grant_df.write.parquet(os.path.join(save_dir, 'medline_grant_%s.parquet' % date_update_str),
                           mode='overwrite')
def extractor(df, min_count, output_path):

    n_gram_df = make_ngrams(df)

    n_gram_score = chi_square_procedur(n_gram_df, min_count)

    window = Window.partitionBy(n_gram_score['category'])\
        .orderBy(n_gram_score['aprx_chi_scr'].desc())

    n_gram_score = n_gram_score.dropDuplicates(['n_gram', 'category'])
    top_word_df = n_gram_score.select('*', F.rank().over(window).alias('rank'))\
        .filter(F.col('rank') <= 1000)

    top_word_df = top_word_df.join(categories, on=['category'], how='left')

    top_words = top_word_df.orderBy(F.col('category'), F.col('count').desc())\
        .select('n_gram', 'category', 'count', 'distinct_user_count', 'aprx_chi_scr')\
        .toPandas()

    top_words.to_csv(output_path)

    return top_words
Example #5
File: tests.py  Project: kai-zeng/iolap
    def test_window_functions_without_partitionBy(self):
        df = self.sqlCtx.createDataFrame([(1, "1"), (2, "2"), (1, "2"),
                                          (1, "2")], ["key", "value"])
        w = Window.orderBy("key", df.value)
        from pyspark.sql import functions as F
        sel = df.select(
            df.value, df.key,
            F.max("key").over(w.rowsBetween(0, 1)),
            F.min("key").over(w.rowsBetween(0, 1)),
            F.count("key").over(w.rowsBetween(float('-inf'), float('inf'))),
            F.rowNumber().over(w),
            F.rank().over(w),
            F.denseRank().over(w),
            F.ntile(2).over(w))
        rs = sorted(sel.collect())
        expected = [
            ("1", 1, 1, 1, 4, 1, 1, 1, 1), ("2", 1, 1, 1, 4, 2, 2, 2, 1),
            ("2", 1, 2, 1, 4, 3, 2, 2, 2), ("2", 2, 2, 2, 4, 4, 4, 3, 2)
        ]
        for r, ex in zip(rs, expected):
            self.assertEqual(tuple(r), ex[:len(r)])
Example #6
    def retrieve_next_layer(self, f, e, topx, direction='out'):
        if direction == 'out':
            orig_node = 'src'
            dest_node = 'dst'
        else:
            orig_node = 'dst'
            dest_node = 'src'

        df = f.select("id").join(e.drop('in_scope'), f.id == e[orig_node],
                                 'inner').drop(orig_node)
        window = Window.partitionBy(df['id']).orderBy(df['amount'].desc())
        df = df.select(
            '*',
            F.rank().over(window).alias('rank')).filter(F.col('rank') <= topx)

        dummy_tmp = self.create_dummy_edges(df, f, topx, direction)
        df = dummy_tmp.union(df.select(dummy_tmp.columns))
        df = df.withColumn("direction", F.lit(direction))
        df = df.withColumnRenamed(dest_node, "adj")

        return df
Example #7
def spark_mysql(title_basics, title_ratings):

    titles = title_basics.select('tconst', 'startYear', 'originalTitle',
                                 'titleType')
    titles = titles.withColumn('startYear', titles['startYear'].cast(IntegerType()))\
                    .where(titles['startYear'].isNotNull())

    ratings = title_ratings.select('tconst', 'averageRating')

    result = ratings.join(titles, on=['tconst'])

    window = Window.partitionBy(['titleType',
                                 'startYear']).orderBy(desc('averageRating'))

    result = result.select(
        '*',
        rank().over(window).alias('rank')).filter(col('rank') <= 10)

    result = result.orderBy('titleType', 'startYear', desc('averageRating'))

    return result
Example #8
    def find_most_trees_address(self, df):
        """Find the address with the most trees planted

        :param df: Input DataFrame containing all details of trees
        :return: DataFrame of the address with the most trees
        """

        # group by address (ignoring null addresses), count the trees at each
        # address, and sort by descending count
        max_trees = df.select('address', 'tree_id').filter(
            df.address.isNotNull()).groupBy(col('address')).count().sort(
                desc("count"))

        # rank addresses by descending tree count and keep the top-ranked
        # (rank 1) address
        max_trees_place = max_trees.withColumn(
            "rank",
            rank().over(Window.orderBy(col("count").desc()))).filter(
                col("rank") == 1).select('address')

        self.log.warn('Found the address with the most trees planted')

        return max_trees_place
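# A small self-contained sketch of the same "rank over a global window" idea on
# toy data (the addresses below are made up). Note that rank() keeps every
# address tied for first place, and that Window.orderBy without partitionBy
# pulls all rows into a single partition, which is acceptable here because the
# ranked frame is already aggregated counts.
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, rank
from pyspark.sql.window import Window

spark = SparkSession.builder.master("local[1]").getOrCreate()
counts = spark.createDataFrame(
    [("12 Oak St", 5), ("9 Elm St", 5), ("3 Pine Rd", 2)],
    ["address", "count"])
top = (counts
       .withColumn("rank", rank().over(Window.orderBy(col("count").desc())))
       .filter(col("rank") == 1)
       .select("address"))
top.show()  # both tied addresses are returned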
Example #9
def main(spark, model_file, data_file, K):
    '''Main routine for Collaborative Filtering Model testing

        Parameters
        ----------
        spark: SparkSession object

        model_file: string, path to the stored model to load

        data_file: string, path to the parquet file to load

        K: int, evaluations are based on predictions of the top K items for each user
        '''
    testIdx = spark.read.parquet(data_file)
    model = ALSModel.load(model_file)

    users_val = testIdx.select("user_idx").distinct()

    perUserPredictedItemsDF = model.recommendForUserSubset(users_val, K)
    perUserPredictedItemsDF = perUserPredictedItemsDF.select(
        "user_idx", "recommendations.track_idx").withColumnRenamed(
            'user_idx', 'user').withColumnRenamed('recommendations.track_idx',
                                                  'items')

    w2 = Window.partitionBy('user_idx').orderBy(col('count').desc())
    perUserActualItemsDF = testIdx.select(
        'user_idx', 'track_idx', 'count',
        F.rank().over(w2).alias('rank')).where(
            'rank <= {0}'.format(K)).groupBy('user_idx').agg(
                expr('collect_list(track_idx) as items')).withColumnRenamed(
                    'user_idx', 'user')

    perUserItemsRDD = perUserPredictedItemsDF.join(
        perUserActualItemsDF, 'user').rdd.map(lambda row: (row[1], row[2]))
    rankingMetrics = RankingMetrics(perUserItemsRDD)

    print("============================================")
    print("meanAveragePrecision = %.8f" % rankingMetrics.meanAveragePrecision)
    print("precisionAt(K) = %.8f" % rankingMetrics.precisionAt(K))
    print("ndcgAt(K) = %.8f" % rankingMetrics.ndcgAt(K))
Example #10
def runOtherFunctions(spark, personDf):
    df = spark.createDataFrame([("v1", "v2", "v3")], ["c1", "c2", "c3"])

    # array
    df.select(df.c1, df.c2, df.c3,
              array("c1", "c2", "c3").alias("newCol")).show(truncate=False)

    # desc, asc
    personDf.show()
    personDf.sort(functions.desc("age"), functions.asc("name")).show()

    # pyspark 2.1.0 does not support desc_nulls_first, desc_nulls_last, asc_nulls_first, asc_nulls_last

    # split, length (in pyspark a column can be referenced as df["col"] or df.col)
    df2 = spark.createDataFrame([("Splits str around pattern", )], ['value'])
    df2.select(df2.value, split(df2.value, " "),
               length(df2.value)).show(truncate=False)

    # rownum, rank
    f1 = StructField("date", StringType(), True)
    f2 = StructField("product", StringType(), True)
    f3 = StructField("amount", IntegerType(), True)
    schema = StructType([f1, f2, f3])

    p1 = ("2017-12-25 12:01:00", "note", 1000)
    p2 = ("2017-12-25 12:01:10", "pencil", 3500)
    p3 = ("2017-12-25 12:03:20", "pencil", 23000)
    p4 = ("2017-12-25 12:05:00", "note", 1500)
    p5 = ("2017-12-25 12:05:07", "note", 2000)
    p6 = ("2017-12-25 12:06:25", "note", 1000)
    p7 = ("2017-12-25 12:08:00", "pencil", 500)
    p8 = ("2017-12-25 12:09:45", "note", 30000)

    dd = spark.createDataFrame([p1, p2, p3, p4, p5, p6, p7, p8], schema)
    w1 = Window.partitionBy("product").orderBy("amount")
    w2 = Window.orderBy("amount")
    dd.select(dd.product, dd.amount,
              functions.row_number().over(w1).alias("rownum"),
              functions.rank().over(w2).alias("rank")).show()
Example #11
def get_users_dataframe(complete_listens_df, metadata):
    """ Prepare users dataframe

        Args:
            complete_listens_df (dataframe): Dataframe with all the columns/fields that a typical listen has.

        Returns:
            users_df (dataframe): Columns can be depicted as:
                [
                    'user_name', 'user_id'
                ]
    """
    # We use a window function to assign a rank to the distinct user_names.
    # Note that if the user_names were not distinct, rank() would repeat values and give unexpected results.
    user_window = Window.orderBy('user_name')
    users_df = complete_listens_df.select('user_name').distinct().withColumn(
        'user_id',
        rank().over(user_window))

    metadata['users_count'] = users_df.count()
    save_dataframe(users_df, path.USERS_DATAFRAME_PATH)
    return users_df
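# A quick self-contained illustration of the caveat noted above: rank() repeats
# (and then skips) values on ties, while row_number() always yields consecutive
# ids. The toy names and the local SparkSession are assumptions.
from pyspark.sql import SparkSession
from pyspark.sql.functions import rank, row_number
from pyspark.sql.window import Window

spark = SparkSession.builder.master("local[1]").getOrCreate()
names = spark.createDataFrame([("alice",), ("alice",), ("bob",)], ["user_name"])
w = Window.orderBy("user_name")
names.select(
    "user_name",
    rank().over(w).alias("rank"),              # 1, 1, 3
    row_number().over(w).alias("row_number")   # 1, 2, 3
).show()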
Example #12
def main(sc, out_file_name):
    """
    Read GDELT data from S3, clean themes from taxonomy words, and 
    perform frequency count of cleaned themes. Pick top 1000 most
    popular themes and write to out_file_name
    """

    #Obtain list of taxonomy words for theme cleaning
    tax_file = os.environ['TAX_LIST_FILE']
    tax_list = f.read_tax_file(tax_file)
    rdd_tax_list = sc.broadcast(tax_list)


    #Read "GKG" table from GDELT S3 bucket. Transform into RDD and clean taxonomy words
    gkgRDD = sc.textFile('s3a://gdelt-open-data/v2/gkg/201[5-9]*000000.gkg.csv')
    gkgRDD = gkgRDD.map(lambda x: x.encode("utf", "ignore"))
    gkgRDD.cache()
    gkgRDD = gkgRDD.map(lambda x: x.split('\t'))
    gkgRDD = gkgRDD.filter(lambda x: len(x)==27)   
    gkgRDD = gkgRDD.filter(lambda x: f.is_not_empty(x[7]))
    gkgRowRDD = gkgRDD.map(lambda x : Row(themes = f.clean_taxonomy(x[7].split(';')[:-1], rdd_tax_list)))


    sqlContext = SQLContext(sc)

    #Transform RDDs to dataframes
    gkgDF     = sqlContext.createDataFrame(gkgRowRDD)

    #Each document could contain multiple themes. Explode on the themes and make a new column
    explodedDF = gkgDF.select(explode(gkgDF.themes).alias("theme"))

    #Count the frequency of each theme
    testDF = explodedDF.groupBy('theme').agg(count('*').alias('num_mentions'))

    #Find top 1000 most popular themes, use Pandas to write to output file
    window = Window.orderBy(testDF['num_mentions'].desc())
    rankDF = testDF.select('*', rank().over(window).alias('rank')).filter(col('rank') <= 1000).where(col('theme') != '')
    pandasDF = rankDF.toPandas()
    pandasDF.to_csv(out_file_name, columns = ["theme", "num_mentions", "rank"])
    def transform_as4_invoice(self, sources: dict) -> DataFrame:
        """
        Dim Location records and attributes from dataB Invoice Plant data
        """

        spark = self.get_spark()

        inv_loc_df = spark.read.orc(sources['dataB_invoice_extract']['path'])

        df = (inv_loc_df.select(
            col('plant').alias('location_id'), 'invoice_date', 'invoice',
            col('plant_name').alias('description')))

        window = Window.partitionBy('location_id').orderBy(
            df['invoice_date'].desc(), df['invoice'].desc())
        df = df.withColumn("rank", F.rank().over(window))
        df_rank = df.filter("rank = 1").distinct()

        df_final = (df_rank.select('location_id', 'invoice_date', 'invoice',
                                   'description'))

        return df_final
Example #14
def top_payments_monthly(data_df):
    payment_otc = spark.read.jdbc(mysql_url,
                                  "source.payment_otc",
                                  properties={
                                      "user": mysql_un,
                                      "password": mysql_password
                                  })

    payment_df = data_df.groupBy("payment_type", "month", "year").count()

    window = Window.partitionBy("month", "year").orderBy(fn.desc("count"))

    payment_df2 = payment_df.withColumn("rank", fn.rank().over(window))

    payment_df3 = payment_df2.where("rank <= 3").select(
        "month", "year", "payment_type", "count")

    payment_df4 = payment_df3.join(payment_otc, ["payment_type"]).select(
        "month", "year", "payment_name", "count")

    payment_df4.orderBy("year", "month").write.orc(hdfs_output +
                                                   "top_payments_monthly",
                                                   mode="overwrite")
Example #15
def obtener_topN_ciclistas_por_provincia_en_total_de_kilometros(
        ciclistas_kilometros_df, N):

    # get the total kilometers per cyclist, grouped by provincia, cedula and nombre_Completo
    provincia_ciclistas_kilometros_total_df = ciclistas_kilometros_df.groupBy(
        "provincia", "cedula", "nombre_Completo").sum("TotalKilometros")
    #provincia_ciclistas_kilometros_total_df.show()

    provincia_ciclistas_kilometros_total_df = \
    provincia_ciclistas_kilometros_total_df.select(
        col('provincia'),
        col('cedula'),
        col('nombre_Completo'),
        col('sum(TotalKilometros)').alias('TotalKilometros'))
    #provincia_ciclistas_kilometros_total_df.show()

    # partition the data by provincia, ordered by TotalKilometros descending and cedula ascending, then add a column assigning each cyclist a position
    window = Window.partitionBy('provincia').orderBy(
        col('TotalKilometros').desc(),
        col('cedula').asc())
    provincia_ciclistas_kilometros_total_df = provincia_ciclistas_kilometros_total_df.withColumn(
        "Posicion_Por_Provincia",
        rank().over(window))

    provincia_ciclistas_kilometros_total_df = provincia_ciclistas_kilometros_total_df.withColumn(
        "Tipo_Top_N_Ciclistas_Por_Provincia", lit("Total de Km"))

    # keep only the top N
    provincia_ciclistas_kilometros_total_df = provincia_ciclistas_kilometros_total_df.filter(
        provincia_ciclistas_kilometros_total_df.Posicion_Por_Provincia <= N)

    provincia_ciclistas_kilometros_total_df = provincia_ciclistas_kilometros_total_df.select(
        col('Tipo_Top_N_Ciclistas_Por_Provincia'), col('provincia'),
        col('cedula'), col('nombre_Completo'),
        col('TotalKilometros').alias('Valor'), col('Posicion_Por_Provincia'))

    return provincia_ciclistas_kilometros_total_df
Example #16
def transform(retail_df):
    """

    :param retail_df:
    :return:
    """
    from pyspark.sql.window import Window
    from pyspark.sql.functions import col, date_format, desc, dense_rank, rank, max

    # convert date format on retail_df
    transform_step1 = (retail_df.withColumn('InvoiceDate',
                                            date_format(col("InvoiceDate"), "MM/dd/yyyy H:mm")))

    # window function
    window_function = (Window.partitionBy("CustomerId")
                       .orderBy(desc("Quantity"))
                       .rowsBetween(Window.unboundedPreceding, Window.currentRow))

    # aggregate functions
    max_purchase_quantity = max(col("Quantity")).over(window_function)

    # rank functions
    purchase_dense_rank = dense_rank().over(window_function)
    purchase_rank = rank().over(window_function)

    transformed_df = (transform_step1
                      .where("CustomerId IS NOT NULL")
                      .orderBy("CustomerId")
                      .select(col("CustomerId"),
                              col("InvoiceDate"),
                              col("Quantity"),
                              purchase_rank.alias("quantityRank"),
                              purchase_dense_rank.alias("quantityDenseRank"),
                              max_purchase_quantity.alias("maxPurchaseQuantity")))

    return transformed_df
    def run_job(self, sc, sqlc):
        input_data = sc.textFile(self.text_file, minPartitions=4)

        output = input_data.mapPartitionsWithIndex(
            self.process_warcs).reduce(add)

        output_json = sc.parallelize(output)

        self.create_db_connection()

        self.reference_to_instagram_df = output_json.toDF() \
                                            .orderBy("reference_link", "warc_date")

        window = Window.partitionBy("instagram_link",
                                    "reference_link").orderBy(
                                        "warc_date", 'tiebreak')
        self.reference_to_instagram_df = (
            self.reference_to_instagram_df.withColumn(
                'tiebreak', monotonically_increasing_id()).withColumn(
                    'rank',
                    rank().over(window)).filter(col('rank') == 1).drop(
                        'rank', 'tiebreak'))

        self.log_aggregators(sc)

        self.prepare_csv(sc, sqlc)

        try:
            self.drop_outdated_references()
            self.perform_aggregations()

            self.conn.commit()
            self.conn.close()

        finally:
            pass
Example #18
def runOtherFunctions(spark, personDf):
    df = spark.createDataFrame([("v1", "v2", "v3")], ["c1", "c2", "c3"]);

    # array
    df.select(df.c1, df.c2, df.c3, array("c1", "c2", "c3").alias("newCol")).show(truncate=False)

    # desc, asc
    personDf.show()
    personDf.sort(functions.desc("age"), functions.asc("name")).show()

    # pyspark 2.1.0 does not support desc_nulls_first, desc_nulls_last, asc_nulls_first, asc_nulls_last

    # split, length (in pyspark a column can be referenced as df["col"] or df.col)
    df2 = spark.createDataFrame([("Splits str around pattern",)], ['value'])
    df2.select(df2.value, split(df2.value, " "), length(df2.value)).show(truncate=False)

    # rownum, rank
    f1 = StructField("date", StringType(), True)
    f2 = StructField("product", StringType(), True)
    f3 = StructField("amount", IntegerType(), True)
    schema = StructType([f1, f2, f3])

    p1 = ("2017-12-25 12:01:00", "note", 1000)
    p2 = ("2017-12-25 12:01:10", "pencil", 3500)
    p3 = ("2017-12-25 12:03:20", "pencil", 23000)
    p4 = ("2017-12-25 12:05:00", "note", 1500)
    p5 = ("2017-12-25 12:05:07", "note", 2000)
    p6 = ("2017-12-25 12:06:25", "note", 1000)
    p7 = ("2017-12-25 12:08:00", "pencil", 500)
    p8 = ("2017-12-25 12:09:45", "note", 30000)

    dd = spark.createDataFrame([p1, p2, p3, p4, p5, p6, p7, p8], schema)
    w1 = Window.partitionBy("product").orderBy("amount")
    w2 = Window.orderBy("amount")
    dd.select(dd.product, dd.amount, functions.row_number().over(w1).alias("rownum"),
              functions.rank().over(w2).alias("rank")).show()
Example #19
def process_file(date_update):
    """Process downloaded MEDLINE folder to parquet file"""
    print("Process MEDLINE file to parquet")
    # remove if the parquet folders still exist
    existing_parquet = glob(os.path.join(save_dir, 'medline_*.parquet'))
    if existing_parquet:
        subprocess.call(['rm', '-rf'] + existing_parquet)

    date_update_str = date_update.strftime("%Y_%m_%d")
    path_rdd = sc.parallelize(glob(
        os.path.join(download_dir, 'medline*.xml.gz')),
                              numSlices=1000)
    parse_results_rdd = path_rdd.\
        flatMap(lambda x: [Row(file_name=os.path.basename(x), **publication_dict)
                           for publication_dict in pp.parse_medline_xml(x)])
    medline_df = parse_results_rdd.toDF()
    medline_df.write.parquet(os.path.join(
        save_dir, 'medline_raw_%s.parquet' % date_update_str),
                             compression='gzip')

    window = Window.partitionBy(['pmid']).orderBy(desc('file_name'))
    windowed_df = medline_df.select(
        max('delete').over(window).alias('is_deleted'),
        rank().over(window).alias('pos'), '*')
    windowed_df.\
        where('is_deleted = False and pos = 1').\
        write.parquet(os.path.join(save_dir, 'medline_lastview_%s.parquet' % date_update_str),
                      compression='gzip')

    # parse grant database
    parse_grant_rdd = path_rdd.flatMap(lambda x: pp.parse_medline_grant_id(x))\
        .filter(lambda x: x is not None)\
        .map(lambda x: Row(**x))
    grant_df = parse_grant_rdd.toDF()
    grant_df.write.parquet(os.path.join(
        save_dir, 'medline_grant_%s.parquet' % date_update_str),
                           compression='gzip')
def __get_recent_items():
    window = Window.partitionBy(date_format(
        df.dateAdded, 'yyyy-MM-dd')).orderBy(df['dateAdded'].desc())

    recent_items = (
        df.select('*',
                  rank().over(window).alias('rank')).filter(col('rank') <= 1).
        dropDuplicates(
            ['dateAdded']
        )  # Several products may share the most recent timestamp for a day; keep a single product per day by dropping duplicates
        .orderBy(df.dateAdded, ascending=False))

    # removing unnecessary fields
    recent_items = recent_items.select(
        date_format(df.dateAdded, 'yyyy-MM-dd').alias('dateAdded'), 'id',
        'brand', 'colors')

    # writing the results to redis (uses HASH data structure)
    recent_items.write \
            .format("org.apache.spark.sql.redis") \
            .option("table", "recent") \
            .option("key.column", "dateAdded") \
            .mode("overwrite") \
            .save()
def get_recordings_df(mapped_listens_df, metadata):
    """ Prepare recordings dataframe.

        Args:
            mapped_listens_df (dataframe): listens mapped with msid_mbid_mapping.

        Returns:
            recordings_df: Dataframe containing distinct recordings and corresponding
                mbids and names.
    """
    recording_window = Window.orderBy('mb_recording_mbid')

    recordings_df = mapped_listens_df.select('mb_artist_credit_id',
                                             'mb_artist_credit_mbids',
                                             'mb_recording_mbid',
                                             'mb_release_mbid',
                                             'msb_artist_credit_name_matchable',
                                             'msb_recording_name_matchable') \
                                     .distinct() \
                                     .withColumn('recording_id', rank().over(recording_window))

    metadata['recordings_count'] = recordings_df.count()
    save_dataframe(recordings_df, path.RECORDINGS_DATAFRAME_PATH)
    return recordings_df
Example #22
def get_recordings_df(mapped_listens_df, metadata, save_path):
    """ Prepare recordings dataframe.

        Args:
            mapped_listens_df (dataframe): listens mapped with msid_mbid_mapping.
            save_path (str): path where recordings_df should be saved

        Returns:
            recordings_df: Dataframe containing distinct recordings and corresponding
                mbids and names.
    """
    recording_window = Window.orderBy('recording_mbid')

    recordings_df = mapped_listens_df \
        .select(
            'artist_credit_id',
            'recording_mbid',
        ) \
        .distinct() \
        .withColumn('recording_id', rank().over(recording_window))

    metadata['recordings_count'] = recordings_df.count()
    save_dataframe(recordings_df, save_path)
    return recordings_df
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('--posts', type=str, required=True)
    parser.add_argument('--num_top', type=int, default=1)
    parser.add_argument('--output_dir', type=str, required=True)
    args = parser.parse_args()

    conf = SparkConf().set("spark.driver.maxResultSize", "10G"). \
        set("spark.hadoop.validateOutputSpecs", "false"). \
        set('spark.default.parallelism', '400')

    spark = SparkSession.builder.\
        appName("SO Tag first usage date").\
        config(conf=conf).\
        getOrCreate()

    sc = spark.sparkContext

    in_rdd = sc.textFile(args.posts).filter(lambda x: get_field(x, 'Id') is not None).\
                                map(lambda x: (int(get_field(x, 'Id')), x))

    in_rdd = in_rdd.filter(lambda x: get_field(x[1], 'Tags') is not None and get_field(x[1], 'CreationDate') is not None).\
                    map(lambda x: (datetime.strptime(get_field(x[1], 'CreationDate').decode('utf-8'), DT_FORMAT),
                                   get_tags(get_field(x[1], 'Tags').decode('utf-8')))).\
                    flatMap(lambda x: [(x[0], y) for y in x[1]])

    tag_date_df = in_rdd.toDF(['CreationDate', 'Tag'])
    window = Window.partitionBy(tag_date_df['Tag']).orderBy(tag_date_df['CreationDate'].asc())
    #tag_first_appearances = tag_date_df.groupBy('Tag').agg({'CreationDate': 'min'})
    tag_first_appearances = tag_date_df.select('*', rank().over(window).alias('rank')).\
                        filter(col('rank') <= args.num_top)
    tag_first_appearances_pd = tag_first_appearances.toPandas().drop(columns=['rank'])

    make_sure_path_exists(args.output_dir)
    with open(os.path.join(args.output_dir, 'tag_'+str(args.num_top)+'_earliest_appearance.csv'), 'w') as f:
        tag_first_appearances_pd.to_csv(f)
Example #24
def get_topN(df, group_by_columns, order_by_column, n=10):
    window = Window.partitionBy(group_by_columns).orderBy(
        order_by_column.desc())
    return df.select('*',
                     f.rank().over(window).alias('rank')).filter(
                         f.col('rank') <= n).drop("rank")
Example #25
from pyspark.sql import Window


spark = SparkSession.builder.enableHiveSupport().getOrCreate()

df1 = spark.sql(''' select * from app.app_saas_sfs_model_input where dt='2018-07-31' ''').select(['sku_code', 'sale_date', 'sale'])
df1.show()

day_len = 90
day_end = '2018-07-31'
day_start = (parse(day_end) - datetime.timedelta(day_len)).strftime('%Y-%m-%d')

df1_sum = df1.where(''' sale_date >= '{day_start}' and sale_date <= '{day_end}' '''.format(day_start=day_start, day_end=day_end)).groupBy('sku_code').agg(F.sum('sale').alias('sale_sum'))
# Temporary rank by total sales, used only to split the SKUs into 5 bands
windowspec_r = Window.orderBy(F.col('sale_sum').desc())
df1_rank = df1_sum.withColumn('rank', F.rank().over(windowspec_r))
# 16483
df1_cnt = df1_sum.select(F.countDistinct('sku_code').alias('sku_count'))

df1_rcnt = df1_rank.crossJoin(F.broadcast(df1_cnt))
df1_rcnt = df1_rcnt.withColumn('rank_rate', F.col('rank')/ F.col('sku_count'))

band_sql = '''
Case
    When rank_rate < 0.2 Then 1
    When rank_rate < 0.4 Then 2
    When rank_rate < 0.6 Then 3
    When rank_rate < 0.8 Then 4
    else 5
end
as

Example #26
                 fontsize=30)
graph2.set_xlabel("Players Age", fontsize=20)
graph2.set_ylabel("Player Overall ratings", fontsize=20)
graph2.set_xlim(15, 45)
graph2.set_ylim(50, 100)
graph2.xaxis.set_major_locator(tck.MultipleLocator(5))
graph2.yaxis.set_major_locator(tck.MultipleLocator(5))
plt.show()

# Finding top 10 best players with respect to each position considering their overall
# Adding a rank column to rank the best players in each position
window_for_highest_overall = Window.partitionBy(
    cleaned_set['Position']).orderBy(cleaned_set['Overall'].desc())
top_worthy_players = cleaned_set.select(
    '*',
    rank().over(window_for_highest_overall).alias('rank')).filter(
        col('rank') <= 10)
print(
    "\nThe top 10 best players who are worthy based on their Overall for each position are\n"
)
print("The column rank indicates the rank of players")
top_worthy_players.show(1000)

# Plotting average market value of top 10 worthy players for each position with required plot properties
# Adding avg(Value) column containing average of market values for each position
positionvsavgvalue_dataset = top_worthy_players.groupBy('Position').agg(
    {'Value': 'avg'})
g = positionvsavgvalue_dataset.toPandas()
graph3 = g.plot(x='Position',
                y='avg(Value)',
                kind='bar',
Example #27
File: data.py  Project: wcbeard/dscontrib
def createDataFile(start_date, end_date, spark_instance, jackknife_buckets,
                   sample_percent, output_path):
    feature_data_phase1 = spark_instance.table(_TABLE_SOURCE).select([
        _COL_ID.alias("id"),
        _DATE_PARSED.alias("date"),
        # TODO: Use MD5 instead of CRC32
        (F.floor(F.crc32(_COL_ID) / 100) % jackknife_buckets).alias("bucket"),
        lit(1).alias("is_active"),
        F.when(_COL_URI_COUNT >= _NUM_ADAU_THRESHOLD, 1).otherwise(0).alias(
            "is_active_active"),
        F.to_date(_COL_PC_DATE).alias("profile_creation_date")
    ] + list(_MAP_NATURAL_DIMENSIONS.keys())).filter(
        (_DATE_PARSED.between(start_date, end_date))
        & (_COL_SAMPLE < sample_percent)).withColumn(
            "young_profile",
            F.when(
                col("date") < F.date_add(col("profile_creation_date"), 14),
                "TRUE").otherwise("FALSE"))

    new_profile_window = Window.partitionBy(col("id")).orderBy(col("date"))
    new_profile_data = feature_data_phase1.filter(
        (col("date") >= col("profile_creation_date"))
        & (col("date") <= F.date_add(col("profile_creation_date"), 6))).select(
            "*",
            F.rank().over(new_profile_window).alias('rank')).filter(
                col('rank') == 1).withColumn("new_profile",
                                             lit(1)).drop("date").withColumn(
                                                 "date",
                                                 col("profile_creation_date"))

    feature_data = feature_data_phase1.alias("fd").join(
        new_profile_data.alias("np"),
        (col("fd.id") == col("np.id")) & (col("fd.date") == col("np.date")),
        how='full',
    ).select(
        [F.coalesce(col("np.new_profile"), lit(0)).alias("new_profile")] +
        [F.coalesce(col("fd.is_active"), lit(0)).alias("is_active")] + [
            F.coalesce(col("fd.is_active_active"), lit(0)).alias(
                "is_active_active")
        ] + [
            F.coalesce(col("fd.{}".format(c)), col("np.{}".format(c))).alias(c)
            for c in feature_data_phase1.columns
            if c not in ["is_active", "is_active_active"]
        ])

    once_ever_profiles = feature_data.filter(
        col("is_active") == 1).groupBy("id").count().filter(
            col("count") == 1).select("id").withColumn("single_day_profile",
                                                       lit("1"))

    feature_data = feature_data.alias("fd").join(
        once_ever_profiles.alias("oep"), "id",
        "outer").fillna({"single_day_profile": "0"})

    ppi_profiles = spark_instance.table("main_summary").select(
        col("client_id").alias("id"),
        lit(1).alias("ppi")).filter('''submission_date_s3 >= '20190121'
        AND scalar_parent_startup_profile_selection_reason IN (
            'firstrun-skipped-default', 'restart-skipped-default'
        )''')

    feature_data = feature_data.alias("fd").join(ppi_profiles.alias("ppip"),
                                                 "id",
                                                 "left").fillna({"ppi": 0})

    feature_data.write.partitionBy("date").mode('overwrite').parquet(
        output_path)
Example #28
	# 1. human interfere
	df_cleanning = human_interfere(spark, df_cleanning, df_interfere)
	df_cleanning.persist()

	# 2. Join on MOLE_NAME as the primary key
	df_result = similarity(spark, df_cleanning, df_standard)
	df_result.persist()
	df_result.show()
	print(df_result.count())

	# 3. Rank the candidate matches for each value that needs matching
	# windowSpec  = Window.partitionBy("id").orderBy(desc("SIMILARITY"))
	windowSpec  = Window.partitionBy("id").orderBy("SIMILARITY")

	df_match = df_result.withColumn("RANK", rank().over(windowSpec))
	df_match = df_match.where(df_match.RANK <= 5)

	df_match.persist()
	
	# df_match.printSchema()

	df_match = df_match.withColumn("check", check_similarity(df_match.PACK_ID_CHECK, df_match.PACK_ID_STANDARD, df_match.SIMILARITY))
	# df_match.show(5)
	df_match = df_match.orderBy("id").drop("ORIGIN", "STANDARD")
	df_match.persist()
	df_match.repartition(1).write.format("parquet").mode("overwrite").save("s3a://ph-max-auto/2020-08-11/BPBatchDAG/refactor/azsanofi/0.0.4/all")
	# df_match.repartition(1).write.format("parquet").mode("overwrite").save("s3a://ph-max-auto/2020-08-11/BPBatchDAG/refactor/0.0.15/all")

	df_replace = df_match.filter(df_match.check == 1)
Example #29
                                                          table + '/*.' +
                                                          fileFormat)
            #load today's changes
            logger.info("Load the latest changes")
            latestChanges = spark.read.format(fileFormat).load(landingDir +
                                                               '/' + table +
                                                               '/today/*.' +
                                                               fileFormat)
            #Combine the datasets
            logger.info(
                "Merge two datasets which will have duplicates for the records modified"
            )
            dataMerge = baseData.union(latestChanges)

            #Filter the old data and save the files with latest changes
            logger.info(
                "Filter the old data and have the latest changes for the records modified"
            )
            dataMerge.select("*", rank().over(Window.partitionBy(partitionByColumn).orderBy(desc(lmtColumn))).alias("latestRecord"))\
                .where("latestRecord == 1").drop("latestRecord").repartition(1).write.option('path',targetDir+'/'+table).mode(fileMode)\
                .bucketBy(noBuckets,bucketByColumn).saveAsTable(destDB+'.'+table)
            logger.info(
                f"Latest changes merged with the base data and it is available in {targetDir}/{table}"
            )
    except Exception as error:
        logger.exception(f"Failed with error- {error}")
    else:
        logger.info("Latest changes are merged successfully")
    finally:
        spark.stop()
Example #30
# 7. Saving the end results
# you cannot save a DataFrame directly as a text file; convert it to an RDD first
# dff = dff.withColumn('year', dff['_c1'].substr(8, 4))  # substr(position, length)
# Note: dff = sc.textFile(trainingdata+"part-00000").map(lambda x: x.replace('[','').replace(']','').split('|')).toDF()
# best combination of python and spark
#########################################################################################

# lines=Source.fromFile("E:\scala\spark-data\movie-description") scala

from pyspark.sql import Window
from pyspark.sql.functions import rank

dff = spark.read.csv("E:\scala\spark-data\movie-description", sep="|")
movie_year_df = dff.withColumn('year', dff['_c2'].substr(8, 4)).select(
    "_c0", "_c1", "_c2", "year")

movie_number_df = spark.read.csv("E:\scala\spark-data\movie-data.data",
                                 inferSchema=True,
                                 header=False,
                                 sep="\t").select("_c0", "_c3")
cond = movie_number_df._c0 == movie_year_df._c0
mv_fi_df = movie_year_df.join(
    movie_number_df, cond,
    'right').withColumnRenamed("_c1", "movie").withColumnRenamed(
        "_c3", "gross").select("movie", "year", "gross")

windowSpec = Window.partitionBy("year").orderBy("gross")

mv_fi_df = mv_fi_df.withColumn("rank", rank().over(windowSpec))
mv_fi_df.filter(mv_fi_df.rank == 1).show()
Example #31
        "PROVINCE", regexp_replace(df_cleanning.PROVINCE, "市", ""))
    df_cleanning = df_cleanning.withColumn(
        "PROVINCE", regexp_replace(df_cleanning.PROVINCE, "自治区", ""))
    df_cleanning = df_cleanning.withColumn(
        "CITY", regexp_replace(df_cleanning.CITY, "市", ""))

    # 2. Do the join
    df_cleanning = df_cleanning.join(broadcast(df_standard),
                                     on=["PROVINCE", "CITY"],
                                     how="left")
    df_cleanning.persist()
    df_not_match = df_cleanning.where(isnull(df_cleanning.STANDARD_NAME))
    df_cleanning = df_cleanning.where(~isnull(df_cleanning.STANDARD_NAME))
    df_cleanning = df_cleanning.repartition(800).withColumn(
        "SIMILARITY",
        efftiveness_with_jaro_winkler_similarity_in_hc_mapping(
            df_cleanning.NAME, df_cleanning.STANDARD_NAME))
    windowSpec = Window.partitionBy("ID").orderBy(desc("SIMILARITY"))
    df_cleanning = df_cleanning.withColumn("RANK", rank().over(windowSpec))
    df_cleanning = df_cleanning.where(df_cleanning.RANK == 1)
    df_cleanning.repartition(1).write.mode("overwrite").option(
        "header", "true"
    ).csv(
        "s3a://ph-max-auto/2020-08-11/BPBatchDAG/refactor/alfred/tmp/chc_hc_cleanning/hc_result_2"
    )
    df_not_match.repartition(1).write.mode("overwrite").option(
        "header", "true"
    ).csv(
        "s3a://ph-max-auto/2020-08-11/BPBatchDAG/refactor/alfred/tmp/chc_hc_cleanning/hc_result_not_match"
    )
Example #32
def compile_rank(t, expr, scope, *, window, **kwargs):
    return F.rank().over(window).astype('long') - F.lit(1)
Example #33
train.select(['user_id']).distinct().count()
#5298
val_new.select(['user_id']).distinct().count()
#1751
test_new.select(['user_id']).distinct().count()
#1636
## there are some users in val_add and test_add but not in val_new and test_new, therefore 8774 > 5298 + 1751 + 1636 = 8685

## example of writing x% data
train_add_test.write.parquet("train01.parquet")
val_new.write.parquet("val01.parquet")
test_new.write.parquet("test01.parquet")

# create the true rank list (example of 1% data)
from pyspark.sql.window import Window
from pyspark.sql.functions import rank, col

window = Window.partitionBy(val['user_id']).orderBy(val['rating'].desc())
val_true_order = val.select('*', rank().over(window).alias('rank'))
val_true_list = val_true_order.select(
    'user_id',
    'book_id').groupBy('user_id').agg(expr('collect_list(book_id) as books'))
val_true_list.write.parquet("val01_true_list.parquet")

window = Window.partitionBy(test['user_id']).orderBy(test['rating'].desc())
test_true_order = test.select('*', rank().over(window).alias('rank'))
test_true_list = test_true_order.select(
    'user_id',
    'book_id').groupBy('user_id').agg(expr('collect_list(book_id) as books'))
test_true_list.write.parquet("test01_true_list.parquet")
Example #34
def match_accidents_with_roads(spark, road_df, accident_df, use_cache=True):
    cache_path = workdir + "data/matches_accident-road.parquet"
    if isdir(cache_path) and use_cache:
        print("Reading accident-road matches from cache...")
        return spark.read.parquet(cache_path)

    nb_top_road_center_preselected = 5
    max_distance_accepted = 10  # in meters

    # Compute distance between accident and road centers to identify the
    # top nb_top_road_center_preselected closest roads
    road_centers = road_df.select(
        ["street_id", "center_long", "center_lat"]
    ).drop_duplicates()

    acc_window = Window.partitionBy("accident_id").orderBy("distance_measure")
    accidents_top_k_roads = (
        accident_df.select("loc_lat", "loc_long", "accident_id")
        .crossJoin(road_centers)
        .withColumn(
            "distance_inter",
            distance_intermediate_formula(
                "loc_lat", "loc_long", "center_lat", "center_long"
            ),
        )
        .withColumn("distance_measure", distance_measure())
        .select(
            "accident_id",
            "street_id",
            "distance_measure",
            "loc_lat",
            "loc_long",
            rank().over(acc_window).alias("distance_rank"),
        )
        .filter(col("distance_rank") <= nb_top_road_center_preselected)
        .drop("distance_measure", "distance_rank")
        .persist()
    )

    # For each accident, identify the closest road point
    accidents_roads_first_match = (
        accidents_top_k_roads.join(road_df, "street_id")
        .withColumn(
            "distance_inter",
            distance_intermediate_formula(
                "loc_lat", "loc_long", "coord_lat", "coord_long"
            ),
        )
        .withColumn("distance_measure", distance_measure())
        .select(
            "accident_id",
            "loc_lat",
            "loc_long",
            "coord_lat",
            "coord_long",
            "street_id",
            "street_name",
            row_number().over(acc_window).alias("distance_rank"),
            "distance_measure",
        )
        .filter(col("distance_rank") == 1)
        .withColumn("distance", col("distance_measure") * (6371 * 2 * 1000))
        .drop("distance_rank", "distance_measure", "coord_lat", "coord_long")
        .persist()
    )

    # If the distance is lower than max_distance_accepted we keep the
    # accident/street matches
    accidents_road_correct_match = accidents_roads_first_match.filter(
        col("distance") < max_distance_accepted
    ).select("accident_id", "street_id")

    # If not, we try to get a better match by adding intermediate points on
    # the preselected streets
    # For unsatisfying matches, recompute the k closests roads
    # Recomputing is probably faster than reading from disk
    # cache + joining on accident_ids
    accidents_close_streets_coords = (
        accidents_roads_first_match.filter(col("distance") >= max_distance_accepted)
        .select("accident_id", "loc_lat", "loc_long")
        .crossJoin(road_centers)
        .withColumn(
            "distance_inter",
            distance_intermediate_formula(
                "loc_lat", "loc_long", "center_lat", "center_long"
            ),
        )
        .withColumn("distance_measure", distance_measure())
        .select(
            "accident_id",
            "street_id",
            "distance_measure",
            "loc_lat",
            "loc_long",
            rank().over(acc_window).alias("distance_rank"),
        )
        .filter(col("distance_rank") <= nb_top_road_center_preselected)
        .drop("distance_measure", "distance_rank")
        .join(road_df.select("street_id", "coord_lat", "coord_long"), "street_id")
    )

    # Add the intermediate points
    street_rolling_window = (
        Window.partitionBy("street_id").orderBy("coord_long").rowsBetween(0, +1)
    )
    accidents_close_streets_with_additional_coords = (
        accidents_close_streets_coords.select(
            "accident_id",
            "street_id",
            "loc_lat",
            "loc_long",
            avg("coord_long").over(street_rolling_window).alias("coord_long"),
            avg("coord_lat").over(street_rolling_window).alias("coord_lat"),
        )
        .union(accidents_close_streets_coords)
        .dropDuplicates()
    )
    accidents_close_streets_coords.unpersist()

    # Recompute distances between accident and new set of points
    # and use closest point to identify street
    accidents_roads_first_match_with_additional_coords = (
        accidents_close_streets_with_additional_coords.withColumn(
            "distance_inter",
            distance_intermediate_formula(
                "loc_lat", "loc_long", "coord_lat", "coord_long"
            ),
        )
        .withColumn("distance_measure", distance_measure())
        .select(
            "accident_id",
            "street_id",
            "loc_lat",
            "loc_long",
            "coord_lat",
            "coord_long",
            row_number().over(acc_window).alias("distance_rank"),
        )
        .filter(col("distance_rank") == 1)
        .drop("distance_rank", "loc_lat", "loc_long", "coord_lat", "coord_long")
    )

    # Union accidents matched correctly with first method with the accidents
    # for which we used more street points
    final_match = accidents_road_correct_match.union(
        accidents_roads_first_match_with_additional_coords
    )

    # Make sure there is only one road per accident
    final_match = (
        final_match.join(road_centers, "street_id")
        .join(accident_df.select("loc_lat", "loc_long", "accident_id"), "accident_id")
        .withColumn(
            "distance_inter",
            distance_intermediate_formula(
                "loc_lat", "loc_long", "center_lat", "center_long"
            ),
        )
        .withColumn("distance_measure", distance_measure())
        .withColumn("dist_rank", row_number().over(acc_window))
        .filter(col("dist_rank") == 1)
        .select("accident_id", "street_id")
    )

    return final_match
  .partitionBy("CustomerId", "date")\
  .orderBy(desc("Quantity"))\
  .rowsBetween(Window.unboundedPreceding, Window.currentRow)


# COMMAND ----------

from pyspark.sql.functions import max
maxPurchaseQuantity = max(col("Quantity")).over(windowSpec)


# COMMAND ----------

from pyspark.sql.functions import dense_rank, rank
purchaseDenseRank = dense_rank().over(windowSpec)
purchaseRank = rank().over(windowSpec)


# COMMAND ----------

from pyspark.sql.functions import col

dfWithDate.where("CustomerId IS NOT NULL").orderBy("CustomerId")\
  .select(
    col("CustomerId"),
    col("date"),
    col("Quantity"),
    purchaseRank.alias("quantityRank"),
    purchaseDenseRank.alias("quantityDenseRank"),
    maxPurchaseQuantity.alias("maxPurchaseQuantity")).show()