def main(): parser = argparse.ArgumentParser() parser.add_argument("--output_tsv", default=config.edit_el_tsv, help="TSV filename for output EventLogging data") parser.add_argument( "--hive_requests_table", default=config.hive_el_requests_table, help= "Hive table with all potential QuickSurvey webrequests and surveySessionTokens" ) args = parser.parse_args() # make sure dates WHERE clause matches config logic query = ( "SELECT event.session_token AS session_token, " "event.action AS action, " "event.init_mechanism AS init_mechanism, " "event.editor_interface AS editor_interface, " "event.page_title AS edit_page_title, " "event.user_editcount AS user_edit, " "event.user_id = 0 AS anon, " "REFLECT('org.apache.commons.codec.digest.DigestUtils', 'sha512Hex', CONCAT(s.client_ip, s.user_agent, '{0}')) AS userhash " "FROM event.editattemptstep e " "INNER JOIN {1} s " "ON (e.event.session_token = SUBSTR(s.survey_session_token, 0, 20)) " "WHERE e.year = 2019 AND e.month = 3 AND (e.day = 4 OR e.day = 5)". format(config.hash_key, args.hive_requests_table)) exec_hive_stat2(query, args.output_tsv)
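# The REFLECT(...) call above fingerprints a client as
# sha512Hex(client_ip + user_agent + hash_key). A minimal Python sketch of
# the same digest (hashlib.sha512's hexdigest matches Commons DigestUtils'
# lowercase hex output for UTF-8 input); the IP/UA values in the example
# call are hypothetical.
import hashlib

def salted_userhash(client_ip: str, user_agent: str, hash_key: str) -> str:
    # Equivalent of DigestUtils.sha512Hex(CONCAT(client_ip, user_agent, key))
    payload = (client_ip + user_agent + hash_key).encode("utf-8")
    return hashlib.sha512(payload).hexdigest()

# e.g. salted_userhash("203.0.113.7", "Mozilla/5.0 ...", config.hash_key)
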
def ungroup(db_name, table_name, lang, priority, nice,
            year=config.survey_start_date.year):
    query = """
    CREATE TABLE {0}.{1}_{2}
    ROW FORMAT DELIMITED FIELDS TERMINATED BY '\t'
    STORED AS PARQUET AS
    SELECT userhash,
           geocoded_data,
           MAX(logged_in) AS has_account,
           MAX(attempted_edit) AS attempted_edit,
           CONCAT_WS('REQUEST_DELIM', COLLECT_LIST(requests)) AS requests,
           SUM(r_count) AS request_count,
           RAND() AS rand_sample
    FROM {0}.{1}_{2}_by_day
    WHERE year = {3}
    GROUP BY userhash, geocoded_data
    """.format(db_name, table_name, lang, year)
    exec_hive_stat2(query, priority=priority, nice=nice)

def add_day_to_hive_trace_table(req_table, db_name, table_name, day, lang,
                                priority, nice, sampling_rate=1.0):
    year = day.year
    month = day.month
    day = day.day
    # The CONV(...) predicate downsamples users deterministically from the
    # hashed id (see the sketch after this function).
    query = """
    INSERT OVERWRITE TABLE {0}.{1}_{2}_by_day
    PARTITION(year={3}, month={4}, day={5}, host='{2}')
    SELECT userhash,
           geocoded_data,
           MAX(logged_in) AS logged_in,
           MAX(edit_attempt) AS attempted_edit,
           CONCAT_WS('REQUEST_DELIM', COLLECT_LIST(request)) AS requests,
           COUNT(*) AS r_count
    FROM (SELECT userhash,
                 geocoded_data,
                 logged_in,
                 CAST(page_title = '{6}' AS int) AS edit_attempt,
                 CAST(normalized_host.project = '{2}' AS int) AS correct_wiki,
                 CONCAT('ts|', ts,
                        '|referer|', referer,
                        '|page_id|', page_id,
                        '|title|', page_title,
                        '|uri_path|', reflect('java.net.URLDecoder', 'decode', uri_path),
                        '|uri_query|', reflect('java.net.URLDecoder', 'decode', uri_query),
                        '|access_method|', access_method,
                        '|referer_class|', referer_class,
                        '|project|', normalized_host.project_class,
                        '|lang|', normalized_host.project,
                        '|uri_host|', uri_host) AS request
          FROM {7}
          WHERE day = {5}
          AND CONV(SUBSTR(userhash, 113), 16, 10) / 18446744073709551615 < {8}
         ) a
    GROUP BY userhash, geocoded_data
    HAVING COUNT(*) < 500
       AND SUM(correct_wiki) > 0;
    """.format(db_name, table_name, lang, year, month, day,
               config.edit_attempt_str, req_table, sampling_rate)
    exec_hive_stat2(query, priority=priority, nice=nice)

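# A minimal Python sketch of the deterministic sampling the WHERE clause
# above performs, assuming userhash is the 128-character sha512 hex digest
# produced upstream: the last 16 hex characters (64 bits) map to a uniform
# value in [0, 1), so a given user is consistently kept or dropped across
# all days rather than re-sampled per request.
def in_sample(userhash: str, sampling_rate: float) -> bool:
    # SUBSTR(userhash, 113) is 1-indexed in Hive: characters 113..128,
    # i.e. userhash[112:] in Python; 18446744073709551615 == 2**64 - 1.
    return int(userhash[112:], 16) / 18446744073709551615 < sampling_rate

# e.g. in_sample(salted_userhash("203.0.113.7", "Mozilla/5.0 ...", "SALT"), 0.1)
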
def get_pageview_data(lang, output_dir):
    query = (
        "SELECT page_id, SUM(view_count) AS weekly_pageviews "
        "FROM wmf.pageview_hourly "
        "WHERE project = '{0}.wikipedia' "
        "AND agent_type = 'user' "
        "AND {1} "
        "AND namespace_id = 0 "
        "GROUP BY page_id;".format(lang, config.hive_days_clause))
    filename = os.path.join(output_dir, "{0}_pageviews.csv".format(lang))
    exec_hive_stat2(query, filename)

def main():
    parser = argparse.ArgumentParser()
    parser.add_argument("--all_ids_csv",
                        default=config.all_ids_csv,
                        help="CSV with userIDs from all languages")
    parser.add_argument("--ids_table_name",
                        default=config.hive_ids_table,
                        help="Hive table with hashed userIDs.")
    parser.add_argument("--srvy_req_table",
                        default=config.hive_survey_requests_table,
                        help="Hive table w/ all survey requests")
    parser.add_argument("--all_req_table",
                        default=config.hive_all_requests_table,
                        help="Hive table w/ all webrequests.")
    args = parser.parse_args()

    exec_hive_stat2("DROP TABLE IF EXISTS {0};".format(args.ids_table_name))
    exec_hive_stat2(
        "CREATE TABLE {0} (userhash string) "
        "ROW FORMAT SERDE 'org.apache.hadoop.hive.serde2.OpenCSVSerde' WITH SERDEPROPERTIES "
        "('separatorChar' = ',', 'quoteChar' = '\\\"');".format(args.ids_table_name))
    exec_hive_stat2(
        "LOAD DATA LOCAL INPATH '{0}' OVERWRITE INTO TABLE {1};".format(
            args.all_ids_csv, args.ids_table_name))

    # Table names are fully qualified as "<db>.<table>"; split(".")[1] strips
    # the database prefix so the column references in the join are unambiguous.
    query = ("CREATE TABLE {0} STORED AS PARQUET AS "
             "SELECT * FROM {1} "
             "WHERE {2}.userhash IN (SELECT {3}.userhash FROM {4});".format(
                 args.srvy_req_table,
                 args.all_req_table,
                 args.all_req_table.split(".")[1],
                 args.ids_table_name.split(".")[1],
                 args.ids_table_name))
    exec_hive_stat2(query)

def traces_to_csv(db, table, lang, smpl_req_folder, max_num=200000):
    full_tablename = db + "." + table + "_" + lang
    query = (
        "SET mapreduce.map.memory.mb=9000; "
        "SET mapreduce.map.java.opts=-Xmx7200m; "
        "SET mapreduce.reduce.memory.mb=9000; "
        "SET mapreduce.reduce.java.opts=-Xmx7200m; "
        "SELECT userhash, geocoded_data, has_account, attempted_edit, requests "
        "FROM ("
        "SELECT * "
        "FROM {0} "
        "WHERE request_count < 500 "
        "ORDER BY rand_sample "
        "LIMIT {1}) w;".format(full_tablename, max_num))
    exec_hive_stat2(
        query, os.path.join(smpl_req_folder, "sample_{0}.csv".format(lang)))

def add_day_to_hive_trace_table(req_table, db_name, table_name, day, lang,
                                priority, nice):
    year = day.year
    month = day.month
    day = day.day
    query = """
    INSERT OVERWRITE TABLE {0}.{1}_{2}_by_day
    PARTITION(year={3}, month={4}, day={5}, host='{2}')
    SELECT userhash,
           geocoded_data,
           MAX(logged_in) AS logged_in,
           MAX(edit_attempt) AS attempted_edit,
           CONCAT_WS('REQUEST_DELIM', COLLECT_LIST(request)) AS requests,
           COUNT(*) AS r_count
    FROM (SELECT userhash,
                 geocoded_data,
                 logged_in,
                 CAST(page_title = '{6}' AS int) AS edit_attempt,
                 CONCAT('ts|', ts,
                        '|referer|', referer,
                        '|page_id|', page_id,
                        '|title|', page_title,
                        '|uri_path|', reflect('java.net.URLDecoder', 'decode', uri_path),
                        '|uri_query|', reflect('java.net.URLDecoder', 'decode', uri_query),
                        '|access_method|', access_method,
                        '|referer_class|', referer_class,
                        '|project|', normalized_host.project_class,
                        '|lang|', normalized_host.project,
                        '|uri_host|', uri_host) AS request
          FROM {7} w
          WHERE day = {5}
         ) a
    GROUP BY userhash, geocoded_data;""".format(db_name, table_name, lang,
                                                year, month, day,
                                                config.edit_attempt_str,
                                                req_table)
    # HAVING
    #     COUNT(*) < 500;
    exec_hive_stat2(query, priority=priority, nice=nice)

def create_hive_trace_table(db_name, table_name, lang, priority, nice):
    """Create a trace table partitioned by day and host."""
    query = """
    CREATE TABLE IF NOT EXISTS {0}.{1}_{2}_by_day (
        userhash STRING,
        geocoded_data MAP<STRING,STRING>,
        logged_in INT,
        attempted_edit INT,
        requests STRING,
        r_count INT
    )
    PARTITIONED BY (year INT, month INT, day INT, host STRING)
    ROW FORMAT DELIMITED FIELDS TERMINATED BY '\t'
    STORED AS PARQUET
    """.format(db_name, table_name, lang)
    exec_hive_stat2(query, priority=priority, nice=nice)

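# A minimal driver sketch showing how the trace-building pieces compose:
# create the partitioned table, fill it one day at a time, collapse the
# per-day rows into one row per user, and export the result. Everything
# here except the four functions defined above is an assumption:
# config.survey_end_date and the argument values are placeholders for this
# repo's real settings.
from datetime import timedelta

def build_traces(req_table, db_name, table_name, lang, out_dir,
                 priority=False, nice=True):
    create_hive_trace_table(db_name, table_name, lang, priority, nice)
    day = config.survey_start_date
    while day <= config.survey_end_date:  # assumed end-date constant
        add_day_to_hive_trace_table(req_table, db_name, table_name, day,
                                    lang, priority, nice)
        day += timedelta(days=1)
    ungroup(db_name, table_name, lang, priority, nice)
    traces_to_csv(db_name, table_name, lang, out_dir)
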
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument("--output_csv",
                        default=config.quicksurvey_requests_tsv,
                        help="CSV filename for output survey-related webrequests")
    parser.add_argument("--quicksurvey_requests_table",
                        default=config.hive_el_requests_table,
                        help="Hive table with all potential QuickSurvey "
                             "webrequests and surveySessionTokens")
    args = parser.parse_args()

    # All Hive webrequests that include the QuickSurvey beacon (i.e. the
    # survey may have run) on the days the survey was live
    get_qs_query = (
        "CREATE TABLE {0} AS "
        "SELECT *, reflect('java.net.URLDecoder', 'decode', substr(uri_query, 2)) AS json_event "
        "FROM wmf.webrequest "
        "WHERE uri_path LIKE '%beacon/event' AND uri_query LIKE '%QuickSurvey%' AND uri_query LIKE '%{1}%' "
        "AND {2}".format(args.quicksurvey_requests_table,
                         config.survey_name_start,
                         config.hive_days_clause))
    # exec_hive_stat2(get_qs_query)

    # NOTE: empirically, the client_ip and user_agent checks have filtered
    # out zero webrequests
    anonymized_to_csv_query = (
        "SELECT dt as dt_QSinitialization, "
        "reflect('org.apache.commons.codec.digest.DigestUtils', 'sha512Hex', "
        "concat(client_ip, user_agent, '{0}')) as userhash, "
        "get_json_object(json_event, '$.event.surveySessionToken') AS survey_session_token, "
        "get_json_object(json_event, '$.event.pageviewToken') as pageview_token, "
        "get_json_object(json_event, '$.event.surveyResponseValue') as response_type, "
        "get_json_object(json_event, '$.event.pageTitle') as page_title, "
        "get_json_object(json_event, '$.event.pageId') as page_id, "
        "get_json_object(json_event, '$.event.isLoggedIn') as logged_in, "
        "geocoded_data['country'] as country, "
        "geocoded_data['country_code'] as country_code, "
        "geocoded_data['timezone'] as timezone, "
        "geocoded_data['city'] as city, "
        "geocoded_data['subdivision'] as subdivision, "
        "geocoded_data['latitude'] as lat, "
        "geocoded_data['longitude'] as lon "
        "FROM {1} "
        "WHERE client_ip <> '-' AND "
        "user_agent <> '-'".format(config.hash_key,
                                   args.quicksurvey_requests_table))
    exec_hive_stat2(anonymized_to_csv_query, args.output_csv)

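# A sketch of what the two-step extraction above does: the EventLogging
# beacon carries a URL-encoded JSON payload in uri_query (after the leading
# '?'), which is URL-decoded into json_event and then queried with
# JSONPath-style accessors. The payload below is a hypothetical example,
# not real survey data.
import json
from urllib.parse import unquote

uri_query = "?%7B%22event%22%3A%7B%22surveySessionToken%22%3A%22abc123%22%7D%7D"
json_event = unquote(uri_query[1:])  # substr(uri_query, 2) in Hive (1-indexed)
token = json.loads(json_event)["event"]["surveySessionToken"]
# equivalent of get_json_object(json_event, '$.event.surveySessionToken')
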
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument("--hash_key",
                        default=config.hash_key,
                        help="Hash key for salting user-agent + client-IP")
    parser.add_argument("--all_req_table",
                        default=config.hive_all_requests_table,
                        help="Hive table w/ all webrequests.")
    args = parser.parse_args()

    query = (
        "CREATE TABLE {0} STORED AS PARQUET AS "
        "SELECT reflect('org.apache.commons.codec.digest.DigestUtils', 'sha512Hex', "
        "concat(client_ip, user_agent, '{1}')) as userhash, "
        "map('country', geocoded_data['country'], 'timezone', geocoded_data['timezone']) as geocoded_data, "
        "ts, "
        "referer, "
        "uri_path, "
        "uri_host, "
        "uri_query, "
        "access_method, "
        "referer_class, "
        "normalized_host, "
        "COALESCE(pageview_info['page_title'], '{2}') as page_title, "
        "COALESCE(x_analytics_map['loggedIn'], 0) as logged_in, "
        "page_id, "
        "day, "
        "hour "
        "FROM wmf.webrequest "
        "WHERE {3} "
        "AND webrequest_source = 'text' AND access_method != 'mobile app' AND agent_type = 'user' "
        "AND normalized_host.project_class = 'wikipedia' "
        "AND ((namespace_id = 0 AND is_pageview = TRUE) OR ({4}));".format(
            args.all_req_table, args.hash_key, config.edit_attempt_str,
            config.hive_days_clause, config.hive_edit_clause))
    exec_hive_stat2(query)

def traces_to_csv(db, table, lang, srv_dir):
    full_tablename = db + "." + table + "_" + lang
    query = ("SELECT userhash, geocoded_data, has_account, attempted_edit, requests "
             "FROM {0};".format(full_tablename))
    exec_hive_stat2(query, os.path.join(srv_dir, "sample_{0}.csv".format(lang)))