Example #1
 def test_first_last_ignorenulls(self):
     from pyspark.sql import functions
     df = self.spark.range(0, 100)
     df2 = df.select(functions.when(df.id % 3 == 0, None).otherwise(df.id).alias("id"))
     df3 = df2.select(functions.first(df2.id, False).alias('a'),
                      functions.first(df2.id, True).alias('b'),
                      functions.last(df2.id, False).alias('c'),
                      functions.last(df2.id, True).alias('d'))
     self.assertEqual([Row(a=None, b=1, c=None, d=98)], df3.collect())
Example #2
def reduce_to_ohlc(time, rdd):
    row_rdd = rdd.map(lambda row: row.split(',')) \
                 .filter(lambda row: len(row) == 3) \
                 .map(lambda row: Row(
                       symbol=row[0],
                       tx_time=datetime.strptime(row[2], '%Y-%m-%d %H:%M:%S.%f'),
                       price=float(row[1])
                 ))
    sql_context = get_sql_context_instance(rdd.context)
    data = sql_context.createDataFrame(row_rdd)
    data.cache()
    data.write.format('org.apache.spark.sql.cassandra') \
            .options(table='transactions2', keyspace='stock', cluster='Test Cluster') \
            .mode('append') \
            .save()

    ohlc = data.select('symbol', truncate_min(data.tx_time).alias('batch_time'), 'price', 'tx_time') \
                .orderBy('tx_time') \
                .groupBy('symbol', 'batch_time') \
                .agg(
                   F.first(data.price).alias('open'),
                   F.max(data.price).alias('high'),
                   F.min(data.price).alias('low'),
                   F.last(data.price).alias('close'),
                   F.first(data.tx_time).alias('open_time'),
                   F.last(data.tx_time).alias('close_time')
                )

    existing_ohlc = sql_context.read.format('org.apache.spark.sql.cassandra') \
            .options(table='ohlc_1_min2', keyspace='stock', cluster='Test Cluster') \
            .load() \
            .select('symbol', 'batch_time', 'open', 'open_time', 'high', 'low', 'close', 'close_time')

    merged_ohlc = ohlc.join(existing_ohlc,
                             (ohlc.symbol == existing_ohlc.symbol) &
                             (ohlc.batch_time == existing_ohlc.batch_time),
                             'left'
                           )

    merged_ohlc = merged_ohlc.select(
        ohlc.symbol.alias('symbol'),
        ohlc.batch_time.alias('batch_time'),
        F.when(existing_ohlc.open_time < ohlc.open_time, existing_ohlc.open).otherwise(ohlc.open).alias('open'),
        F.when(existing_ohlc.open_time < ohlc.open_time, existing_ohlc.open_time).otherwise(ohlc.open_time).alias('open_time'),
        F.when(existing_ohlc.close_time > ohlc.close_time, existing_ohlc.close).otherwise(ohlc.close).alias('close'),
        F.when(existing_ohlc.close_time > ohlc.close_time, existing_ohlc.close_time).otherwise(ohlc.close_time).alias('close_time'),
        F.when(existing_ohlc.low < ohlc.low, existing_ohlc.low).otherwise(ohlc.low).alias('low'),
        F.when(existing_ohlc.high > ohlc.high, existing_ohlc.high).otherwise(ohlc.high).alias('high')
    )
    merged_ohlc.write.format('org.apache.spark.sql.cassandra') \
                .options(table='ohlc_1_min2', keyspace='stock', cluster='Test Cluster') \
                .mode('append') \
                .save()
Example #3
    def test_aggregator(self):
        df = self.df
        g = df.groupBy()
        self.assertEqual([99, 100], sorted(g.agg({'key': 'max', 'value': 'count'}).collect()[0]))
        self.assertEqual([Row(**{"AVG(key#0)": 49.5})], g.mean().collect())

        from pyspark.sql import functions
        self.assertEqual((0, u'99'),
                         tuple(g.agg(functions.first(df.key), functions.last(df.value)).first()))
        self.assertTrue(95 < g.agg(functions.approxCountDistinct(df.key)).first()[0])
        self.assertEqual(100, g.agg(functions.countDistinct(df.value)).first()[0])
Example #4
    def levenshtein_cluster(df, col_name):
        # Prepare a group so we don't need to apply the fingerprint to the whole data set
        df = df.select(col_name).groupby(col_name).agg(F.count(col_name).alias("count"))

        df = KeyCollision.fingerprint(df, col_name)

        df_t = df.groupby(col_name + "_FINGERPRINT").agg(F.collect_list(col_name).alias("cluster"),
                                                         F.size(F.collect_list(col_name)).alias("cluster_size"),
                                                         F.first(col_name).alias("recommended"),
                                                         F.sum("count").alias("count"))

        # Filter min distance
        df_l = DistanceCluster.levenshtein_filter(df, col_name)

        # Cluster
        df_l = df_l.join(df_t, (df_l[col_name + "_FROM"] == df_t[col_name + "_FINGERPRINT"]), how="left") \
            .cols.drop(col_name + "_FINGERPRINT") \
            .cols.drop([col_name + "_FROM", col_name + "_TO", col_name + "_LEVENSHTEIN_DISTANCE"]).table()

        return df_l
Example #5
def saveCompletedJobRunScheduleData(microBatchDF):
    scheduleExplodeDF = microBatchDF.select(microBatchDF.job_id,
                                            microBatchDF.run_id,
                                            explode(microBatchDF.schedule))
    scheduleDF = scheduleExplodeDF.groupBy("job_id",
                                           "run_id").pivot("key").agg(
                                               first("value"))

    if DeltaTable.isDeltaTable(spark, completed_job_run_schedule_path):
        # merge data
        deltaTable = DeltaTable.forPath(spark, completed_job_run_schedule_path)

        (deltaTable.alias("target").merge(
            scheduleDF.alias("source"),
            "source.job_id=target.job_id and source.run_id=target.run_id").
         whenMatchedUpdateAll().whenNotMatchedInsertAll().execute())

    else:
        (scheduleDF.write.format("delta").mode("overwrite").option(
            "mergeSchema", "true").save(completed_job_run_schedule_path))
Example #6
def statistic_school_address(df):
    """
    Analysis of the correspondence between school and work address
    :param df:
    :return:
    """
    df = df.filter(df.address.isNotNull())
    groups = ("school_name", "degree", "address")
    df = add_median_salary(df, groups)
    sda_df = df.groupby(*groups).agg(
        F.count("*").alias("person_num"),
        F.first("avg_salary").alias("avg_salary"))
    sda_df = sda_df.filter(sda_df.person_num > MIN_NUM)
    # Analysis without restricting degree
    sa_df = sda_df.groupby("school_name", "address").agg(
        F.sum("person_num").alias("person_num"),
        F.avg("avg_salary").alias("avg_salary"))
    sa_df = sa_df.withColumn("degree", F.lit(NA))
    sda_df = sda_df.unionByName(sa_df)
    return sda_df
Example #7
    def deduplication(logger, df_dict: Dict[str, DataFrame], rules: Dict[str, List[str]]):
        """
        Deduplicate lines considering few columns and merge data from those duplicate
        Args:
            logger: Logger instance used to log events
            df_dict: Dictionary of the datasets with the structure {Name: Dataframe}
            rules: {Dataset Name: [column1, column2]}

        Returns: Dict updated in place

        """
        try:
            for df_name, columns in rules.items():
                df_dict[df_name] = df_dict.get(df_name).groupBy(*columns) \
                    .agg(
                    *[first(x, ignorenulls=True).alias(x) for x in df_dict.get(df_name).columns if x not in columns])
            logger.info("Dataframes cleaning deduplication applied")
        except Exception as e:
            logger.error("Cleaning duplicate rows couldn't be performed: {}".format(e), traceback.format_exc())
            raise e
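A minimal usage sketch (not from the source), assuming the method above is exposed as a standalone function and that a SparkSession named spark and a standard logging.Logger are available; the dataset name and rule below are hypothetical:

import logging

logger = logging.getLogger("cleaning")
df_dict = {"orders": spark.createDataFrame(
    [(1, "A", None), (1, None, "x"), (2, "B", "y")],
    ["order_id", "status", "note"])}
rules = {"orders": ["order_id"]}
deduplication(logger, df_dict, rules)
# order_id 1 collapses to a single row with status="A" and note="x",
# since first(..., ignorenulls=True) keeps the first non-null value per group
df_dict["orders"].show()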
Example #8
def main():
    spark = SparkSession.builder.master("local").appName("Word Count").config(
        "spark.some.config.option", "some-value").getOrCreate()
    # sc = SparkContext()
    l = [(None, 1), ('Aliceaa', 3), ('Alices', None), ('Alicesssss', 1),
         ('Alices', 3)]
    x = spark.createDataFrame(l, ['name', 'age'])

    def myFunc(data_list):
        for val in data_list:
            if val is not None and val != '':
                return val
        return None

    myUdf = udf(myFunc, StringType())

    x=x.groupBy('age')\
        .agg(first('name').alias('name'))
    # dropping duplicates from the dataframe
    x.dropDuplicates().show()
Example #9
File: tests.py  Project: listwebit/data
    def test_aggregator(self):
        df = self.df
        g = df.groupBy()
        self.assertEqual([99, 100],
                         sorted(
                             g.agg({
                                 'key': 'max',
                                 'value': 'count'
                             }).collect()[0]))
        self.assertEqual([Row(**{"AVG(key#0)": 49.5})], g.mean().collect())

        from pyspark.sql import functions
        self.assertEqual((0, u'99'),
                         tuple(
                             g.agg(functions.first(df.key),
                                   functions.last(df.value)).first()))
        self.assertTrue(
            95 < g.agg(functions.approxCountDistinct(df.key)).first()[0])
        self.assertEqual(100,
                         g.agg(functions.countDistinct(df.value)).first()[0])
Example #10
    def group_batched_logs(df_logs):
        # group logs from did + interval_time + keyword.
        # group 1: group by did + interval_starting_time + keyword
        df = df_logs.groupBy(
            'aid', 'interval_starting_time', 'keyword_index').agg(
                first('keyword').alias('keyword'),
                first('age').alias('age'),
                first('gender_index').alias('gender_index'),
                first('aid_bucket').alias('aid_bucket'),
                fn.sum(col('is_click')).alias('kw_clicks_count'),
                fn.sum(fn.when(col('is_click') == 0,
                               1).otherwise(0)).alias('kw_shows_count'),
            )
        # df = df.orderBy('keyword_index')
        df = df.withColumn(
            'kwi_clicks_count',
            concat_ws(":", col('keyword_index'), col('kw_clicks_count')))
        df = df.withColumn(
            'kwi_shows_count',
            concat_ws(":", col('keyword_index'), col('kw_shows_count')))
        df = df.withColumn(
            'kw_clicks_count',
            concat_ws(":", col('keyword'), col('kw_clicks_count')))
        df = df.withColumn(
            'kw_shows_count',
            concat_ws(":", col('keyword'), col('kw_shows_count')))
        # group 2: group by did + interval_starting_time
        df = df.groupBy('aid', 'interval_starting_time').agg(
            concat_ws(",", collect_list('keyword_index')).alias('kwi'),
            concat_ws(
                ",",
                collect_list('kwi_clicks_count')).alias('kwi_click_counts'),
            concat_ws(
                ",", collect_list('kwi_shows_count')).alias('kwi_show_counts'),
            concat_ws(",", collect_list('keyword')).alias('interval_keywords'),
            concat_ws(
                ",", collect_list('kw_clicks_count')).alias('kw_click_counts'),
            concat_ws(",",
                      collect_list('kw_shows_count')).alias('kw_show_counts'),
            first('age').alias('age'),
            first('gender_index').alias('gender_index'),
            first('aid_bucket').alias('aid_bucket'))

        return df
Example #11
def search_clients_daily(main_summary):
    return agg_search_data(
        main_summary,
        [
            'client_id',
            'submission_date',
            'engine',
            'source',
        ],
        map(agg_first, [
            'country',
            'app_version',
            'distribution_id',
            'locale',
            'search_cohort',
            'addon_version',
            'os',
            'channel',
            'profile_creation_date',
            'default_search_engine',
            'default_search_engine_data_load_path',
            'default_search_engine_data_submission_url',
            'sample_id',
        ]) + [
            # Count of 'first' subsessions seen for this client_day
            (count(when(col('subsession_counter') == 1,
                        1)).alias('sessions_started_on_this_day')),
            first(
                datediff(
                    'subsession_start_date',
                    from_unixtime(col('profile_creation_date') * 24 * 60 *
                                  60))).alias('profile_age_in_days'),
            sum(col('subsession_length') /
                3600.0).alias('subsession_hours_sum'),
            mean(size('active_addons')).alias('active_addons_count_mean'),
            (max('scalar_parent_browser_engagement_max_concurrent_tab_count').
             alias('max_concurrent_tab_count_max')),
            (sum('scalar_parent_browser_engagement_tab_open_event_count').
             alias('tab_open_event_count_sum')),
            (sum(col('active_ticks') * 5 / 3600.0).alias('active_hours_sum')),
        ])
Example #12
def statistic_school_rank(df):
    """
    Ranking by major
    :param df:
    :return:
    """
    groups = ("school_name", "degree")
    df = add_median_salary(df, groups)
    sd_df = df.groupby(*groups).agg(
        F.count("*").alias("person_num"),
        F.first("avg_salary").alias("avg_salary"))
    sd_df = sd_df.filter(sd_df.person_num > MIN_NUM)
    # Analysis without restricting degree
    s_df = sd_df.groupby("school_name").agg(
        F.sum("person_num").alias("person_num"),
        F.avg("avg_salary").alias("avg_salary"))
    s_df = s_df.withColumn("degree", F.lit(NA))
    sd_df = sd_df.unionByName(s_df)
    sd_df = sd_df.filter(sd_df.person_num > MIN_NUM)
    sd_df = add_rank(sd_df, "degree")
    return sd_df
Example #13
def _event_prop(event_type: str, expr: Column) -> Column:
    """Get property from the event of a certain type within the ad session.

    Parameters
    ----------
    event_type
        Event type.
    expr
        Value expression.

    Returns
    -------
    Column
        Column expression that evaluates to the provided `expr` if the event
        type matches the specified one, or None otherwise.

    """
    return first(
        when(col('type') == event_type, expr),
        ignorenulls=True
    )
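A minimal sketch of how _event_prop might be used inside an aggregation (the events DataFrame and its session_id, type, ts and price columns are assumptions, not from the source):

ad_sessions = events.groupBy('session_id').agg(
    _event_prop('click', col('ts')).alias('click_ts'),
    _event_prop('purchase', col('price')).alias('purchase_price'),
)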
Example #14
def pose():
    # SparkSession_2 = SparkSession.newSession()
    spark = SparkSession.builder.appName('csql_demo1').master(
        'local[*]').getOrCreate()
    # spark = SparkSession.builder.appName('csql_demo').master('local[*]').config('spark.jars', 'file:///home/boopathi/Downloads/spark-cassandra-connector-2.4.0-s_2.11.jar').getOrCreate()
    # spark.conf.set('spark.jars', 'file:///home/boopathi/Downloads/postgresql-42.2.7.jar')postgresql-42.2.7.jar
    # spark.newSession() ,.config('spark.jars','file:///home/boopathi/Downloads/*')
    #--------------------
    SparkSession_2 = spark.newSession()
    # query = "(SELECT * FROM attribute_kv) as r"
    query = "(SELECT * FROM attribute_kv WHERE  entity_type = 'DEVICE' ) as r"
    get_data = spark.read.format('jdbc').option(
        'driver', 'org.postgresql.Driver').option(
            'url', 'jdbc:postgresql://192.168.1.36:5432/thingsboard').option(
                "user",
                "postgres").option("password",
                                   "postgres").option('dbtable', query).load()
    dx = get_data.withColumn(
        "value",
        concat_ws("", get_data.bool_v, get_data.long_v, get_data.dbl_v,
                  get_data.json_v, get_data.str_v))
    dx = dx.filter(dx.attribute_type == 'SERVER_SCOPE')
    nl = dx.groupBy('entity_id', 'attribute_type').pivot('attribute_key').agg(
        first('value'))
    ld = nl.withColumnRenamed('entity_id', 'device_id')
    query = "(SELECT name, type, id FROM device) as r"
    sk = spark.read.format('jdbc').option(
        'driver', 'org.postgresql.Driver').option(
            'url', 'jdbc:postgresql://192.168.1.36:5432/thingsboard').option(
                "user",
                "postgres").option("password",
                                   "postgres").option('dbtable', query).load()
    joined_data = ld.join(sk, ld.device_id == sk.id)
    req_det = joined_data.rdd.map(lambda x: [
        x.name, x.device_id, x.attribute_type, x.scNo, x.simNo, x.imeiNumber, x
        .boardNumber, x.zoneName, x.wardName, x.location, x.phase, x.ccmsType,
        x.kva, x.baseWatts, x.baseLine, x.connectedWatts, x.roadType, x.
        latitude, x.longitude
    ]).collect()
    return req_det
Example #15
def filter_df_on_start_activities_nocc(df, nocc, sa_count0=None, timestamp_key=DEFAULT_TIMESTAMP_KEY,
                                       case_id_glue=CASE_CONCEPT_NAME, activity_key=DEFAULT_NAME_KEY, grouped_df=None):
    """Filters the Spark dataframe on start activities number of occurrences
    """

    if grouped_df is None:
        grouped_df = df.groupby(case_id_glue)
    if sa_count0 is None:
        parameters = {
            PARAMETER_CONSTANT_TIMESTAMP_KEY: timestamp_key,
            PARAMETER_CONSTANT_CASEID_KEY: case_id_glue,
            PARAMETER_CONSTANT_ACTIVITY_KEY: activity_key,
            GROUPED_DATAFRAME: grouped_df
        }
        sa_count0 = get_start_activities(df, parameters=parameters)
    sa_count = [k for k, v in sa_count0.items() if v >= nocc]

    if len(sa_count) < len(sa_count0):
        grouped_df = grouped_df.agg(F.first(activity_key).alias(activity_key+"_1"))
        df_start = grouped_df.filter(grouped_df[activity_key+"_1"].isin(sa_count))
        return df.join(F.broadcast(df_start), grouped_df.columns[0]).drop(activity_key+"_1")
    return df
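A short, hypothetical call of the filter above, assuming df is an event-log DataFrame using the default column names; it keeps only the cases whose start activity occurs at least 5 times:

df_filtered = filter_df_on_start_activities_nocc(df, nocc=5)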
Example #16
    def test_aggregator(self):
        df = self.df
        g = df.groupBy()
        self.assertEqual([99, 100],
                         sorted(
                             g.agg({
                                 "key": "max",
                                 "value": "count"
                             }).collect()[0]))
        self.assertEqual([Row(**{"AVG(key#0)": 49.5})], g.mean().collect())

        from pyspark.sql import functions

        self.assertEqual((0, "99"),
                         tuple(
                             g.agg(functions.first(df.key),
                                   functions.last(df.value)).first()))
        self.assertTrue(
            95 < g.agg(functions.approx_count_distinct(df.key)).first()[0])
        # test deprecated countDistinct
        self.assertEqual(100,
                         g.agg(functions.countDistinct(df.value)).first()[0])
Example #17
def saveCompletedJobRunTaskData(microBatchDF):

    taskExpode1 = microBatchDF.select(microBatchDF.job_id, microBatchDF.run_id,
                                      explode(microBatchDF.cluster_spec))
    taskExpode2 = taskExpode1.select(taskExpode1.job_id, taskExpode1.run_id,
                                     taskExpode1.key.alias("task_type"),
                                     explode(taskExpode1.value))
    taskDF = taskExpode2.groupBy("job_id", "run_id",
                                 "task_type").pivot("key").agg(first("value"))

    if DeltaTable.isDeltaTable(spark, completed_job_run_task_path):
        # merge data
        deltaTable = DeltaTable.forPath(spark, completed_job_run_task_path)

        (deltaTable.alias("target").merge(
            taskDF.alias("source"),
            "source.job_id=target.job_id and source.run_id=target.run_id").
         whenMatchedUpdateAll().whenNotMatchedInsertAll().execute())

    else:
        (taskDF.write.format("delta").mode("overwrite").option(
            "mergeSchema", "true").save(completed_job_run_task_path))
Example #18
def levenshtein_cluster(df, input_col):
    """
    Return a dataframe with the cluster of strings related to each string
    :param df: Spark Dataframe
    :param input_col:
    :return:
    """
    # Prepare a group so we don't need to apply the fingerprint to the whole data set
    df = df.select(input_col).groupby(input_col).agg(
        F.count(input_col).alias("count"))
    df = keycollision.fingerprint(df, input_col)

    count_col = name_col(input_col, COUNT_COL)
    cluster_col = name_col(input_col, CLUSTER_COL)
    recommended_col = name_col(input_col, RECOMMENDED_COL)
    cluster_size_col = name_col(input_col, CLUSTER_SIZE_COL)
    fingerprint_col = name_col(input_col, FINGERPRINT_COL)

    df_t = df.groupby(fingerprint_col).agg(
        F.collect_list(input_col).alias(cluster_col),
        F.size(F.collect_list(input_col)).alias(cluster_size_col),
        F.first(input_col).alias(recommended_col),
        F.sum("count").alias(count_col)).repartition(1)
    # if Optimus.cache:
    #     df_t = df_t.cache()

    # Filter nearest string
    df_l = levenshtein_filter(df, input_col).repartition(1)

    if Optimus.cache:
        df_l = df_l.cache()

    # Create Cluster
    df_l = df_l.join(df_t, (df_l[input_col + "_FROM"] == df_t[fingerprint_col]), how="left") \
        .cols.drop(fingerprint_col) \
        .cols.drop([input_col + "_FROM", input_col + "_TO", name_col(input_col, "LEVENSHTEIN_DISTANCE")])

    return df_l
Example #19
def gen_freq_distr_user_data(userDF, attrs):
    userWt = weightCol
    # get weighted frequencies of each category of a user's categorical attributes
    print("[getCategoryFreqs] Grouping records by users")
    categoryFreqInfo = {}
    udf = userDF
    for attr in attrs:
        print("Processing attribute %s" % attr)
        # get wt for each individual user
        _tbl = udf.filter(udf[attr].isNotNull()) \
                  .groupby(userIdCol, attr) \
                  .agg(F.first(userWt).alias(userWt))
        # sum up weight for each values of the attribute
        _tbl = _tbl.groupby(attr).agg(F.sum(userWt).alias('wt'))
        attrInfo = _tbl.collect()
        # build a dict of {attrValue: freq}
        vals = {x[attr]: x['wt'] for x in attrInfo}
        # sum of all occurrences of the attribute
        tot = sum(vals.values())
        # compute relative freq w.r.t. total occurrences of the attribute
        info = {val: float(wt) / tot for val, wt in vals.items()}
        categoryFreqInfo[attr] = info
    return categoryFreqInfo
Example #20
def statistic_major_position(df):
    """
    Analysis of positions corresponding to each major
    :param df:
    :return:
    """
    groups = ("major", "degree", "position_name")
    df = df.filter(df.position_name.isNotNull())
    # Position aliases
    df = df.withColumn("position_title", F.lower(F.trim(df.position_title)))
    pdf = df.groupby("position_name", "position_title").agg(F.count("*").alias("total"))
    pdf = pdf.groupby("position_name").apply(filter_position)
    pdf = pdf.groupby("position_name").agg(F.collect_set("position_title").alias("position_set"))
    pdf = pdf.withColumn("position_alias", F.udf(lambda x: "/".join(x))(pdf.position_set))
    pdf = pdf.select("position_name", "position_alias")
    # Industries corresponding to each position
    idf = df.groupby("position_name", "industry").agg(F.count("*").alias("total"))
    idf = idf.groupby("position_name").apply(filter_industry)
    idf = idf.groupby("position_name").agg(F.collect_set("industry").alias("industry_set"))
    idf = idf.withColumn("industry_alias", F.udf(lambda x: "/".join(x))(idf.industry_set))
    idf = idf.select("position_name", "industry_alias")
    # Analysis restricted by degree
    df = add_median_salary(df, groups)
    mdp_df = df.groupby(*groups).agg(F.count("*").alias("person_num"),
                                     F.first(df.avg_salary).alias("avg_salary"))
    mdp_df = mdp_df.filter(mdp_df.person_num > MIN_NUM)
    # Analysis without restricting degree
    mp_df = mdp_df.groupby("major", "position_name").agg(F.sum("person_num").alias("person_num"),
                                                         F.avg("avg_salary").alias("avg_salary"))
    mp_df = mp_df.withColumn("degree", F.lit(NA))
    mdp_df = mdp_df.unionByName(mp_df)

    # Merge in position aliases
    mdp_df = mdp_df.join(pdf, "position_name")
    # Merge in industries corresponding to each position
    mdp_df = mdp_df.join(idf, "position_name")
    return mdp_df
Example #21
def user_cluster_model(spark, ratings, movies, k, genres):
    """ Returns a clustering model for users' genre preferences """

    # Get all user ids
    all_user_ids = ratings.select("userId").distinct().rdd.flatMap(
        lambda x: x).collect()

    # Calculate scores for each user
    scores = user_genre_scores(spark, ratings, movies, all_user_ids)\
        .sort(col("userId"), col("genre"))

    # Convert genres in rows to columns
    scores = scores.groupBy("userId").pivot("genre").agg(
        first("score")).na.fill(0)

    # Ignore movies without genres
    if "(no genres listed)" in scores.columns:
        scores = scores.drop("(no genres listed)")
    scores.cache()

    # Find genres in dataset used
    genres_in_scores = scores.drop("userId").columns

    # Train a k-means model
    scores = VectorAssembler(inputCols=genres_in_scores,
                             outputCol="features").transform(scores)
    kmeans_model = KMeans().setK(k).setSeed(5052).fit(scores)

    # Save genres used in model to model object
    kmeans_model.genres = genres_in_scores

    # Calculate silhouette score & save to model
    train_predictions = kmeans_model.transform(scores)
    kmeans_model.sihlouette_score = ClusteringEvaluator().evaluate(
        train_predictions)

    return kmeans_model
Example #22
def create_yearly_weather(spark) -> DataFrame:
    """
    Reads in 3 years of daily weather reports from around the world. After filtering on US stations only and keeping
    only the most prevalent key weather metrics, the dataframe is pivoted so it can be easily joined with
    the review and distances dataframes.
    """
    yearly_weather_path = f"s3://{s3_bucket}/ghcn/year_*"
    elements_to_keep = ['PRCP', 'SNOW', 'SNWD', 'TMAX', 'TMIN']

    yearly_weather = (spark.read.csv(
        yearly_weather_path, header=False,
        schema=yearly_weather_schema).filter(
            col('element').isin(elements_to_keep)).filter(
                col('station_id').startswith('US')).withColumn(
                    'year', substring(col('date'), 1, 4)).withColumn(
                        'month', substring(col('date'), 5, 2)).withColumn(
                            'day', substring(col('date'), 7, 2)).withColumn(
                                'weather_date',
                                to_date(
                                    concat_ws(
                                        '-', col('year'), col('month'),
                                        col('day')))).select(
                                            col('station_id'),
                                            col('weather_date'),
                                            col('element'),
                                            col('value').cast(
                                                IntegerType())).repartition(
                                                    200, 'station_id',
                                                    'weather_date'))

    yearly_weather_pivot = (yearly_weather.groupby(
        'station_id',
        'weather_date').pivot('element').agg(first('value')).dropna(
            subset=['PRCP', 'TMAX', 'TMIN']).repartition(
                200, 'station_id', 'weather_date'))

    return yearly_weather_pivot
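As the docstring notes, the pivoted weather frame is meant to be joined with other dataframes; a hypothetical join with a reviews dataframe (its station_id and review_date columns are assumptions, not from the source) could look like this:

weather = create_yearly_weather(spark)
reviews_with_weather = reviews.join(
    weather,
    (reviews.station_id == weather.station_id) &
    (reviews.review_date == weather.weather_date),
    'left')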
Example #23
def assoc_fn(df: DataFrame, group_by_cols):
    gbc = [col(x) for x in group_by_cols]
    h_fn = partial(harmonic_fn,
                   partition_cols=group_by_cols,
                   over_col="evs_score",
                   output_col=harmonic_col)
    assoc_df = (df.withColumn(
        "evs_score", array_min(array(col("evidence_score") / 10.0, lit(1.0)))
    ).transform(h_fn).groupBy(*gbc).agg(
        countDistinct(col("pmid")).alias("f"),
        mean(col("evidence_score")).alias("mean"),
        stddev(col("evidence_score")).alias("std"),
        max(col("evidence_score")).alias("max"),
        min(col("evidence_score")).alias("min"),
        expr("approx_percentile(evidence_score, array(0.25, 0.5, 0.75))").
        alias("q"),
        count(col("pmid")).alias("N"),
        first(col(harmonic_col)).alias(harmonic_col)).withColumn(
            "median", element_at(col("q"), 2)).withColumn(
                "q1", element_at(col("q"),
                                 1)).withColumn("q3", element_at(col("q"),
                                                                 3)).drop("q"))

    return assoc_df
Example #24
def run_job(trades: DataFrame):
    """
    Generates daily summaries of provided trade data grouped by security.
    Returns a DataFrame with the following columns:
     - Security
     - Date
     - TradedVolume
     - NumberOfTrades
     - StartPrice
     - EndPrice
     - HighPrice
     - LowPrice
     - Volatility
    DataFrame is ordered alphabetically by security, then in reverse chronological order.
    :param trades: DataFrame
    :return: DataFrame
    """

    with_roc = trades.withColumn(
        "ROC",
        udf(calculate_roc, FloatType())(trades.StartPrice, trades.EndPrice))

    grouped = with_roc.groupBy('Mnemonic', 'Date')\
        .agg(
            sum('TradedVolume').alias('TradedVolume'),
            sum('NumberOfTrades').alias('NumberOfTrades'),
            first('StartPrice').alias('StartPrice'),
            last('EndPrice').alias('EndPrice'),
            max('MaxPrice').alias('HighPrice'),
            min('MinPrice').alias('LowPrice'),
            sum('ROC').alias('Volatility'),
        )\
        .withColumnRenamed('Mnemonic', 'Security') \
        .orderBy(asc('Security'), desc('Date'))

    return grouped
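calculate_roc is not shown in this snippet; one plausible stand-in (an assumption, not the author's implementation) returning the per-row rate of change that feeds the Volatility sum might be:

def calculate_roc(start_price, end_price):
    # Hypothetical helper: percentage rate of change from StartPrice to EndPrice,
    # assuming both inputs are numeric and non-null
    if not start_price:
        return 0.0
    return float(end_price - start_price) / float(start_price) * 100.0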
Example #25
    def process_df(self, df):
        def detect_anomaly(ts):
            """
            Args ts: pandas.series
            rtype: int
            """
            outliers_indices = seasonal_esd(
                ts, hybrid=True, max_anomalies=10)
            return len(outliers_indices)

        grouped_df = df.groupBy(["id"]).agg(F.collect_list("downsample_avg").alias(
            "downsampled_ts"), first("start_ts").alias("start_ts"), last("end_ts").alias("end_ts"))
        anomaly_udf = udf(detect_anomaly, IntegerType())
        processed_df = grouped_df.withColumn("num_anomaly", anomaly_udf(
            "downsampled_ts")).sort(desc("num_anomaly"))
        final_df = processed_df.select(
            "id", "start_ts", "end_ts", "num_anomaly")
        try:
            connector = pgConnector.PostgresConnector(
                "ec2-3-94-71-208.compute-1.amazonaws.com", "datanodedb", "datanode", "password")
            connector.write(final_df, "global_anomalies_table", "append")
        except Exception as e:
            print(e)
            pass
Example #26
    def group_batched_logs(df_logs_batched):
        # group the logs to generate the train ready data from the basic unit of uckey + interval_time + keyword.
        # group 1: group by uckey + interval_starting_time + keyword
        df = df_logs_batched.groupBy('uckey', 'interval_starting_time', 'keyword_index').agg(
            first('keyword').alias('keyword'),
            fn.sum(col('is_click')).alias('keyword_click_count'),
            fn.count(fn.when(col('is_click') == 0, 1).otherwise(
                0)).alias('keyword_show_count')
        )
        df = df.withColumn('keyword_index_click_count', concat_ws(":", col('keyword_index'), col('keyword_click_count')))
        df = df.withColumn('keyword_index_show_count', concat_ws(":", col('keyword_index'), col('keyword_show_count')))
        df = df.withColumn('keyword_click_count', concat_ws(":", col('keyword'), col('keyword_click_count')))
        df = df.withColumn('keyword_show_count', concat_ws(":", col('keyword'), col('keyword_show_count')))

        # group 2: group by uckey + interval_starting_time
        df = df.groupBy('uckey', 'interval_starting_time').agg(
            concat_ws(",", collect_list('keyword_index')).alias('interval_keyword_indexes'),
            concat_ws(",", collect_list('keyword_index_click_count')).alias('interval_keyword_indexes_click_counts'),
            concat_ws(",", collect_list('keyword_index_show_count')).alias('interval_keyword_indexes_show_counts'),
            concat_ws(",", collect_list('keyword')).alias('interval_keywords'),
            concat_ws(",", collect_list('keyword_click_count')).alias('interval_keywords_click_counts'),
            concat_ws(",", collect_list('keyword_show_count')).alias('interval_keywords_show_counts')
        )
        return df
Example #27
def pivotSummary(df: DataFrame) -> DataFrame:
    '''
    Combined with the melt function above, this function takes the summary of a dataframe
    produced by the `.describe()` function and outputs it in a long, more readable format,
    which is especially useful for dataframes with many variables.
    '''

    schema = df.schema

    slist = []
    for i in schema:
        slist.append(i.name)

    id1 = slist[0]

    slist.remove('summary')

    longFormat = melt(df, id_vars=[id1], value_vars=slist)

    wideDF = longFormat.groupBy('variable').pivot(
        'summary',
        ['count', 'mean', 'stddev', 'min', 'max']).agg(first('value'))

    return wideDF
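The melt helper referenced in the docstring is not included in this snippet; a minimal sketch of a typical Spark implementation (an assumption, matching the variable/value column names that pivotSummary expects) is:

from pyspark.sql.functions import array, col, explode, lit, struct

def melt(df, id_vars, value_vars, var_name='variable', value_name='value'):
    # Build one (variable, value) struct per value column, then explode into long format
    vars_and_vals = array(*(
        struct(lit(c).alias(var_name), col(c).alias(value_name))
        for c in value_vars))
    tmp = df.withColumn('_vars_and_vals', explode(vars_and_vals))
    cols = id_vars + [
        col('_vars_and_vals')[x].alias(x) for x in (var_name, value_name)]
    return tmp.select(*cols)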
Example #28
    idf_model.write().overwrite().save(
        "s3a://ph-max-auto/2020-08-11/BPBatchDAG/refactor/alfred/idf_model")

    # df_standard = idf_model.transform(df_standard)
    # df_standard.show()

    # --------- A new job should be restarted from here; I was lazy and didn't write it ------------

    # 5. Build the company label using the original crossjoin data
    df_result = load_training_data(spark)  # data to be cleaned

    # Build a new label for the manufacturer
    df_mnf_label = df_result.where(df_result.label == 1.0).select(
        "id", "MANUFACTURER_NAME", "MANUFACTURER_NAME_STANDARD",
        "MANUFACTURER_NAME_EN_STANDARD")
    df_mnf_label = df_mnf_label.groupBy("id").agg(first(df_mnf_label.MANUFACTURER_NAME).alias("MANUFACTURER_NAME_ANSWER"), \
                first(df_mnf_label.MANUFACTURER_NAME_STANDARD).alias("MANUFACTURER_NAME_STANDARD_ANSWER"), \
                first(df_mnf_label.MANUFACTURER_NAME_EN_STANDARD).alias("MANUFACTURER_NAME_EN_STANDARD_ANSWER"))
    df_result = df_result.join(df_mnf_label, how="left", on="id")
    df_result = df_result.withColumn(
        "mnf_label",
        when((df_result.MANUFACTURER_NAME_STANDARD
              == df_result.MANUFACTURER_NAME_STANDARD_ANSWER) |
             (df_result.MANUFACTURER_NAME_EN_STANDARD
              == df_result.MANUFACTURER_NAME_EN_STANDARD_ANSWER),
             1.0).otherwise(0.0))
    # df_result.select("id", "MANUFACTURER_NAME", "MANUFACTURER_NAME_STANDARD","MANUFACTURER_NAME_STANDARD_ANSWER", "mnf_label").show()
    df_result = df_result.drop("MANUFACTURER_NAME_ANSWER",
                               "MANUFACTURER_NAME_STANDARD_ANSWER",
                               "MANUFACTURER_NAME_EN_STANDARD_ANSWER")
df = spark.createDataFrame(data=simpleData, schema = schema)
df.printSchema()
df.show(truncate=False)

print("approx_count_distinct: " + \
      str(df.select(approx_count_distinct("salary")).collect()[0][0]))

print("avg: " + str(df.select(avg("salary")).collect()[0][0]))

df.select(collect_list("salary")).show(truncate=False)

df.select(collect_set("salary")).show(truncate=False)

df2 = df.select(countDistinct("department", "salary"))
df2.show(truncate=False)
print("Distinct Count of Department &amp; Salary: "+str(df2.collect()[0][0]))

print("count: "+str(df.select(count("salary")).collect()[0]))
df.select(first("salary")).show(truncate=False)
df.select(last("salary")).show(truncate=False)
df.select(kurtosis("salary")).show(truncate=False)
df.select(max("salary")).show(truncate=False)
df.select(min("salary")).show(truncate=False)
df.select(mean("salary")).show(truncate=False)
df.select(skewness("salary")).show(truncate=False)
df.select(stddev("salary"), stddev_samp("salary"), \
    stddev_pop("salary")).show(truncate=False)
df.select(sum("salary")).show(truncate=False)
df.select(sumDistinct("salary")).show(truncate=False)
df.select(variance("salary"),var_samp("salary"),var_pop("salary")) \
  .show(truncate=False)
Example #30
def agg_first(col):
    return first(col).alias(col)
Example #31
 def hash(self, df_trajectory_processed, df_type='pandas'):
     # Assert Implemented Methods
     assert df_type in {
         'pandas', 'spark'
     }, 'hash@<TrajectoryHasherJacardEstimation>: df_type = "{}" is not implemented!'.format(
         df_type)
     # Hash
     if (df_type == 'pandas'):
         # Bounds
         id_timestamp_min = df_trajectory_processed['id_timestamp'].min()
         id_timestamp_max = df_trajectory_processed['id_timestamp'].max()
         # Select ID TimeStamp for Hashes
         id_timestamps_selected = np.random.choice(
             np.arange(id_timestamp_min, id_timestamp_max + 1),
             self.n_hashes,
             replace=False) if self.n_hashes < (
                 id_timestamp_max - id_timestamp_min + 1) else list(
                     range(id_timestamp_min, id_timestamp_max + 1))
         id_timestamps_selected_set = set(id_timestamps_selected)
         # Filter
         df_result = df_trajectory_processed[
             df_trajectory_processed['id_timestamp'].map(
                 lambda x: x in id_timestamps_selected_set)].copy(
                 ).sort_values(['id_user', 'id_timestamp'])
         # Index Locations
         location_indices = []
         lat_lng_to_idx = dict()
         for lat, lng in zip(df_result['lat'], df_result['lng']):
             key = (lat, lng)
             if (not key in lat_lng_to_idx):
                 lat_lng_to_idx[key] = len(lat_lng_to_idx)
             # Add New Index
             location_indices.append(lat_lng_to_idx[key])
         df_result['location_indices'] = location_indices
         # Calculate Hashes
         df_hashes = None
         for i, id_timestamp in enumerate(id_timestamps_selected):
             if (df_hashes is None):
                 df_hashes = df_result[df_result['id_timestamp'] ==
                                       id_timestamp][[
                                           'id_user', 'location_indices'
                                       ]].copy()
             else:
                 df_hashes['location_indices'] = df_result[
                     df_result['id_timestamp'] ==
                     id_timestamp]['location_indices'].values
             # Rename New Column
             colname = 'hash_{}'.format(i)
             df_hashes.rename(columns={'location_indices': colname},
                              inplace=True)
     elif (df_type == 'spark'):
         # ID TimeStamp Bounds
         row = df_trajectory_processed.agg(
             sql_functions.min(sql_functions.col("id_timestamp")).alias(
                 "id_timestamp_min"),
             sql_functions.max(sql_functions.col("id_timestamp")).alias(
                 "id_timestamp_max")).head()
         id_timestamp_min, id_timestamp_max = row['id_timestamp_min'], row[
             'id_timestamp_max']
         # Chosen TimeStamps
         id_timestamps_selected = np.random.choice(
             np.arange(id_timestamp_min, id_timestamp_max + 1),
             self.n_hashes,
             replace=False).tolist() if self.n_hashes < (
                 id_timestamp_max - id_timestamp_min + 1) else list(
                     range(id_timestamp_min, id_timestamp_max + 1))
         # Create SparkDataFrame
         df_id_timestamps = self.params['spark'].createDataFrame(
             [[id_timestamp, 'hash_{}'.format(i)]
              for i, id_timestamp in enumerate(id_timestamps_selected)],
             schema=sql_types.StructType([
                 sql_types.StructField('id_timestamp',
                                       sql_types.IntegerType(), False),
                 sql_types.StructField('hash_name', sql_types.StringType(),
                                       False),
             ]))
         # ID Locations
         df_location_ids = df_trajectory_processed.select("lat", "lng").\
             distinct().withColumn(
                 "id_location",
                 sql_functions.row_number().over(
                     Window.orderBy("lat", "lng")
                 )
             )
         # Join Hashes
         df_result = df_trajectory_processed.join(df_id_timestamps,
                                                  on=['id_timestamp'],
                                                  how='inner').join(
                                                      df_location_ids,
                                                      on=['lat', 'lng'],
                                                      how='inner')
         # Turn Into Table
         df_hashes = df_result.groupby("id_user").\
             pivot( "hash_name" ).\
             agg( sql_functions.first( "id_location" ) )
     # Return
     return (df_hashes)
Example #32
File: house.py  Project: poojk/BnB-Pay
    key = my_bucket_object.key
    tablename=key.split("_")
    table = tablename[0]
    val = tablename[2].split(".")[0]
    path = f's3a://{bucket}/{key}'
    df = spark.read.option("header",True).csv(path)
    s = [s for s in df.columns if '2019' in s]
    selected = ['State','Metro'] + s
    df2=df.select(*selected)
    newdf = df2.withColumn('house_average', sum(df2[col] for col in s)/len(s))
    df1 = newdf.withColumn("house_average", F.round(newdf["house_average"], 1))
    d = df1.drop(*s)
    d = d.withColumn('bedrooms',F.lit(val))
    d = d.withColumnRenamed('Metro','city')
    d = d.dropna(subset=["city"])
    d = d.groupBy('state').agg(F.avg('house_average').alias('house_average'), F.first('bedrooms'))
    #d = d.withColumnRenamed('first(state)','state')
    d = d.withColumnRenamed('first(bedrooms)','bedrooms')
    d = d.withColumn("house_average", F.round(d["house_average"], 1))
    d = d.sort('state')
    d.show()
    d.write \
        .format("jdbc") \
        .option("url","jdbc:postgresql://10.0.0.8:5432/my_db") \
        .option("dbtable","house_prices") \
        .option("user","test") \
        .option("password","test") \
        .option("driver","org.postgresql.Driver") \
        .mode("Append") \
        .save()
Example #33
    sc = SparkContext(appName='generateDOIBoost')
    spark = SparkSession(sc)

    #Loading CrossRef Dataframe
    crossref = spark.read.load('/data/df/crossref.parquet', format="parquet")

    #Loading MAG Dataframe
    microsoft = spark.read.load("/data/df/mag.parquet", format="parquet")

    #Alias each column with _mag
    microsoft = microsoft.select(*(col(x).alias(x + '_mag')
                                   for x in microsoft.columns))

    #Group by DOI since we have repeated DOIs with multiple abstracts; for the moment we take the first one
    mag = microsoft.groupBy('doi_mag').agg(
        first('authors_mag').alias('author_mag'),
        first('abstract_mag').alias('abstract_mag'),
        first('collectedFom_mag').alias('collectedFrom_mag'))

    #Load ORCID DataFrame
    orcid = spark.read.load("/data/df/ORCID.parquet", format="parquet")

    #Fix missing value in collectedFrom
    orcid = orcid.withColumn('collectedFrom', array(lit('ORCID')))

    #Alias each column with _orchid
    orcid = orcid.select(*(col(x).alias(x + '_orcid') for x in orcid.columns))

    #Load UnpayWall DataFrame
    uw = spark.read.load("/data/df/unpaywall.parquet", format="parquet")