示例#1
0
def query_classification_data(**params):
    """
    Sample labelled per-user rows for the survival classifier.

    Every keyword argument is forwarded unchanged to ``query_s1`` as a named
    SQL parameter. The query binds all of the following, so each one must be
    supplied:

    parameters:
        horizon_month: month whose ``staging.leila_edits`` rows provide the
            per-month features; the ``num_edits_*_to_date_*`` sums cover every
            month up to and including it
        prediction_month_start: first month (inclusive) of the window that
            defines the label
        prediction_month_end: last month (inclusive) of that window
        n: maximum number of rows to return (``ORDER BY RAND() LIMIT n``)

    returns:
        whatever ``query_s1`` returns for the query. Each horizon-month row
        yields one result row holding the feature columns plus ``survive``,
        which is 1 when the same user_id has at least one row inside the
        prediction window and 0 otherwise.

    raises:
        KeyError: when one or more of the required parameters is missing.
    """

    # Fail before touching the database: without this check a missing name
    # only surfaces when the SQL placeholders are substituted.
    required = (
        "horizon_month",
        "prediction_month_start",
        "prediction_month_end",
        "n",
    )
    missing = [name for name in required if name not in params]
    if missing:
        raise KeyError(
            "query_classification_data() missing required parameter(s): "
            + ", ".join(missing)
        )

    # x: feature rows for the horizon month
    # y: distinct users seen inside the prediction window (drives `survive`)
    # z: per-user cumulative sums up to and including the horizon month
    query = """

    SELECT
    CAST(x.cla_0 AS INT) as num_edits_0,
    CAST(x.cla_1 AS INT) as num_edits_1,
    CAST(x.cla_2 AS INT) as num_edits_2,
    CAST(x.cla_3 AS INT) as num_edits_3,
    CAST(x.cla_tot AS INT) as num_edits_total,
    CAST(x.cla_tot - x.cla_0 - x.cla_1 - x.cla_2 - x.cla_3 AS INT) as num_edits_rest,
    CAST(x.v_no_months AS INT) as months_since_registration,
    CAST(x.aeb_no_months AS INT) as months_since_5_edits,
    CAST(x.talk_counts AS INT) as talk_counts,
    CAST(x.friend_count AS INT) as friend_count,
    z.num_edits_to_date_0,
    z.num_edits_to_date_1,
    z.num_edits_to_date_2,
    z.num_edits_to_date_3,
    z.num_edits_to_date_tot,
    z.num_edits_got_archived_to_date_0,
    z.num_edits_got_reverted_to_date_0,
    CASE
        WHEN y.user_id IS NOT NULL THEN 1
        ELSE 0
    END as survive
    FROM
    (SELECT * from staging.leila_edits
    WHERE month = %(horizon_month)s ) x
    LEFT JOIN
    (SELECT distinct(user_id)
    FROM staging.leila_edits
    WHERE month <= %(prediction_month_end)s
    AND month >= %(prediction_month_start)s
    ) y
    ON (x.user_id = y.user_id)
    LEFT JOIN
    (SELECT user_id,
     SUM(cla_0) as num_edits_to_date_0,
     SUM(cla_1) as num_edits_to_date_1,
     SUM(cla_2) as num_edits_to_date_2,
     SUM(cla_3) as num_edits_to_date_3,
     SUM(cla_tot) as num_edits_to_date_tot,
     SUM(cala_0) as num_edits_got_archived_to_date_0,
     SUM(crla_0) as num_edits_got_reverted_to_date_0
    FROM staging.leila_edits
    WHERE month <= %(horizon_month)s
    GROUP BY user_id) z
    ON (x.user_id = z.user_id)
    ORDER BY RAND()
    LIMIT %(n)s
    """

    return query_s1(query, params)
示例#2
0
文件: cluster.py 项目: ataylor-cs/wmf
def query_career_cluster_data(min_active_months=5, limit=50000):
    """
    Fetch per-user edit statistics used as career-clustering features.

    Rows of ``staging.leila_edits`` are grouped by ``user_id``; every group
    with strictly more than ``min_active_months`` rows contributes the mean
    and standard deviation of ``cla_0`` and ``cla_1``.

    parameters:
        min_active_months: a user is kept only when their row count exceeds
            this value (default 5, the previously hard-coded threshold).
            Presumably each row is one active month of one user -- confirm
            against the table definition.
        limit: maximum number of users returned (default 50000, the
            previously hard-coded limit). The query has no ORDER BY, so
            which users are kept is left to the database.

    returns:
        the result of ``query_s1`` with missing values replaced by 0.
        Note that the ``var_monthly_edits_*`` columns are computed with
        ``STD()``, i.e. they hold standard deviations; the column names are
        kept as they are so existing consumers keep working.
    """

    # Aggregates that were drafted for this query but are not selected:
    #   AVG(cla_2) as avg_monthly_edits_2,
    #   STD(cla_2) as var_monthly_edits_2,
    #   AVG(cla_3) as avg_monthly_edits_3,
    #   STD(cla_3) as var_monthly_edits_3,
    #   AVG(friend_count) as avg_friend_count,
    #   STD(friend_count) as var_friend_count,
    #   AVG(cla_tot) as avg_monthly_edits_tot,
    #   STD(cla_tot) as var_monthly_edits_tot,
    #   COUNT(*) as num_active_months,
    #   SUM(cala_0)/SUM(cla_0) as percent_edits_got_archived_0,
    #   SUM(crla_0)/SUM(cla_0) as percent_edits_got_reverted_0

    query = """
    SELECT
    AVG(cla_0) as avg_monthly_edits_0,
    STD(cla_0) as var_monthly_edits_0,
    AVG(cla_1) as avg_monthly_edits_1,
    STD(cla_1) as var_monthly_edits_1
    FROM staging.leila_edits
    GROUP BY user_id
    HAVING COUNT(*) > %(min_active_months)s
    LIMIT %(limit)s
    """

    # Coerce to int so the placeholders are rendered as bare numbers.
    params = {
        "min_active_months": int(min_active_months),
        "limit": int(limit),
    }
    return query_s1(query, params).fillna(0)
示例#3
0
def query_classification_data(**params):
    """
    Sample labelled per-user rows for the survival classifier.

    Every keyword argument is forwarded unchanged to ``query_s1`` as a named
    SQL parameter. The query binds all of the following, so each one must be
    supplied:

    parameters:
        horizon_month: month whose ``staging.leila_edits`` rows provide the
            per-month features; the ``num_edits_*_to_date_*`` sums cover every
            month up to and including it
        prediction_month_start: first month (inclusive) of the window that
            defines the label
        prediction_month_end: last month (inclusive) of that window
        n: maximum number of rows to return (``ORDER BY RAND() LIMIT n``)

    returns:
        whatever ``query_s1`` returns for the query. Each horizon-month row
        yields one result row holding the feature columns plus ``survive``,
        which is 1 when the same user_id has at least one row inside the
        prediction window and 0 otherwise.

    raises:
        KeyError: when one or more of the required parameters is missing.
    """

    # Fail before touching the database: without this check a missing name
    # only surfaces when the SQL placeholders are substituted.
    required = (
        "horizon_month",
        "prediction_month_start",
        "prediction_month_end",
        "n",
    )
    missing = [name for name in required if name not in params]
    if missing:
        raise KeyError(
            "query_classification_data() missing required parameter(s): "
            + ", ".join(missing)
        )

    # x: feature rows for the horizon month
    # y: distinct users seen inside the prediction window (drives `survive`)
    # z: per-user cumulative sums up to and including the horizon month
    query = """

    SELECT
    CAST(x.cla_0 AS INT) as num_edits_0,
    CAST(x.cla_1 AS INT) as num_edits_1,
    CAST(x.cla_2 AS INT) as num_edits_2,
    CAST(x.cla_3 AS INT) as num_edits_3,
    CAST(x.cla_tot AS INT) as num_edits_total,
    CAST(x.cla_tot - x.cla_0 - x.cla_1 - x.cla_2 - x.cla_3 AS INT) as num_edits_rest,
    CAST(x.v_no_months AS INT) as months_since_registration,
    CAST(x.aeb_no_months AS INT) as months_since_5_edits,
    CAST(x.talk_counts AS INT) as talk_counts,
    CAST(x.friend_count AS INT) as friend_count,
    z.num_edits_to_date_0,
    z.num_edits_to_date_1,
    z.num_edits_to_date_2,
    z.num_edits_to_date_3,
    z.num_edits_to_date_tot,
    z.num_edits_got_archived_to_date_0,
    z.num_edits_got_reverted_to_date_0,
    CASE
        WHEN y.user_id IS NOT NULL THEN 1
        ELSE 0
    END as survive
    FROM
    (SELECT * from staging.leila_edits
    WHERE month = %(horizon_month)s ) x
    LEFT JOIN
    (SELECT distinct(user_id)
    FROM staging.leila_edits
    WHERE month <= %(prediction_month_end)s
    AND month >= %(prediction_month_start)s
    ) y
    ON (x.user_id = y.user_id)
    LEFT JOIN
    (SELECT user_id,
     SUM(cla_0) as num_edits_to_date_0,
     SUM(cla_1) as num_edits_to_date_1,
     SUM(cla_2) as num_edits_to_date_2,
     SUM(cla_3) as num_edits_to_date_3,
     SUM(cla_tot) as num_edits_to_date_tot,
     SUM(cala_0) as num_edits_got_archived_to_date_0,
     SUM(crla_0) as num_edits_got_reverted_to_date_0
    FROM staging.leila_edits
    WHERE month <= %(horizon_month)s
    GROUP BY user_id) z
    ON (x.user_id = z.user_id)
    ORDER BY RAND()
    LIMIT %(n)s
    """

    return query_s1(query, params)
示例#4
0
def query_career_cluster_data(min_active_months=5, limit=50000):
    """
    Fetch per-user edit statistics used as career-clustering features.

    Rows of ``staging.leila_edits`` are grouped by ``user_id``; every group
    with strictly more than ``min_active_months`` rows contributes the mean
    and standard deviation of ``cla_0`` and ``cla_1``.

    parameters:
        min_active_months: a user is kept only when their row count exceeds
            this value (default 5, the previously hard-coded threshold).
            Presumably each row is one active month of one user -- confirm
            against the table definition.
        limit: maximum number of users returned (default 50000, the
            previously hard-coded limit). The query has no ORDER BY, so
            which users are kept is left to the database.

    returns:
        the result of ``query_s1`` with missing values replaced by 0.
        Note that the ``var_monthly_edits_*`` columns are computed with
        ``STD()``, i.e. they hold standard deviations; the column names are
        kept as they are so existing consumers keep working.
    """

    # Aggregates that were drafted for this query but are not selected:
    #   AVG(cla_2) as avg_monthly_edits_2,
    #   STD(cla_2) as var_monthly_edits_2,
    #   AVG(cla_3) as avg_monthly_edits_3,
    #   STD(cla_3) as var_monthly_edits_3,
    #   AVG(friend_count) as avg_friend_count,
    #   STD(friend_count) as var_friend_count,
    #   AVG(cla_tot) as avg_monthly_edits_tot,
    #   STD(cla_tot) as var_monthly_edits_tot,
    #   COUNT(*) as num_active_months,
    #   SUM(cala_0)/SUM(cla_0) as percent_edits_got_archived_0,
    #   SUM(crla_0)/SUM(cla_0) as percent_edits_got_reverted_0

    query = """
    SELECT
    AVG(cla_0) as avg_monthly_edits_0,
    STD(cla_0) as var_monthly_edits_0,
    AVG(cla_1) as avg_monthly_edits_1,
    STD(cla_1) as var_monthly_edits_1
    FROM staging.leila_edits
    GROUP BY user_id
    HAVING COUNT(*) > %(min_active_months)s
    LIMIT %(limit)s
    """

    # Coerce to int so the placeholders are rendered as bare numbers.
    params = {
        "min_active_months": int(min_active_months),
        "limit": int(limit),
    }
    return query_s1(query, params).fillna(0)
示例#5
0
文件: cluster.py 项目: ataylor-cs/wmf
def query_monthly_behaviour_data(month='2013-05-31'):
    """
    Fetch, for one month, how each row's edits split across four counters.

    For every ``staging.leila_edits`` row of the requested month the query
    divides ``cla_0`` .. ``cla_3`` by ``cla_tot``. The resulting columns are
    named ``percent_main``, ``percent_talk``, ``percent_user`` and
    ``percent_usertalk``, but they hold plain ratios (not scaled by 100).

    parameters:
        month: value compared against the ``month`` column; defaults to
            '2013-05-31', the month that used to be hard-coded.

    returns:
        the result of ``query_s1`` with missing values replaced by 0
        (this covers ratios the database reports as NULL).
    """
    query = """
    SELECT
    cla_0/cla_tot AS percent_main,
    cla_1/cla_tot AS percent_talk,
    cla_2/cla_tot AS percent_user,
    cla_3/cla_tot AS percent_usertalk
    FROM staging.leila_edits
    WHERE month = %(month)s
    """
    # Bind the month as a named parameter instead of splicing it into the SQL.
    return query_s1(query, {'month': month}).fillna(0)
示例#6
0
def query_monthly_behaviour_data(month='2013-05-31'):
    """
    Fetch, for one month, how each row's edits split across four counters.

    For every ``staging.leila_edits`` row of the requested month the query
    divides ``cla_0`` .. ``cla_3`` by ``cla_tot``. The resulting columns are
    named ``percent_main``, ``percent_talk``, ``percent_user`` and
    ``percent_usertalk``, but they hold plain ratios (not scaled by 100).

    parameters:
        month: value compared against the ``month`` column; defaults to
            '2013-05-31', the month that used to be hard-coded.

    returns:
        the result of ``query_s1`` with missing values replaced by 0
        (this covers ratios the database reports as NULL).
    """
    query = """
    SELECT
    cla_0/cla_tot AS percent_main,
    cla_1/cla_tot AS percent_talk,
    cla_2/cla_tot AS percent_user,
    cla_3/cla_tot AS percent_usertalk
    FROM staging.leila_edits
    WHERE month = %(month)s
    """
    # Bind the month as a named parameter instead of splicing it into the SQL.
    return query_s1(query, {'month': month}).fillna(0)