def get_pageviews(start, stop, country, project):
    """Fetch hourly user pageview counts for a country and project, indexed by timestamp."""
    query = """
    SELECT
        year, month, day, hour,
        SUM(view_count) as pageviews,
        access_method
    FROM wmf.projectview_hourly
    WHERE agent_type = 'user'
        AND %(time)s
        AND project = '%(project)s'
        AND country_code = '%(country)s'
    GROUP BY year, month, day, hour, access_method
    """

    params = {
        'country': country,
        'project': project,
        'time': get_hive_timespan(start, stop),
    }

    d = query_hive_ssh(query % params, 'pvquery' + country + project,
                       priority=True, delete=True)

    # Build a datetime index from the year/month/day/hour partition columns,
    # then drop the raw columns.
    dt = (d["year"].map(str) + '-' + d["month"].map(str) + '-' +
          d["day"].map(str) + ' ' + d["hour"].map(str) + ':00')
    d.index = pd.to_datetime(dt)
    del d['year']
    del d['month']
    del d['day']
    del d['hour']
    return d
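# The helpers in this module rely on a get_hive_timespan(start, stop, hour=True) utility
# that is not defined here. As a point of reference only, the sketch below is an assumed,
# simplified day-level stand-in that expands a date range into a Hive partition predicate
# such as "((year = 2016 AND month = 3 AND day = 1) OR (year = 2016 AND month = 3 AND day = 2))".
# The real helper may differ (e.g. it also supports hourly granularity).
import datetime


def get_hive_timespan_sketch(start, stop):
    """Assumed day-level stand-in for get_hive_timespan; builds an OR of partition clauses."""
    start_dt = datetime.datetime.strptime(start, '%Y-%m-%d')
    stop_dt = datetime.datetime.strptime(stop, '%Y-%m-%d')
    clauses = []
    d = start_dt
    while d <= stop_dt:
        clauses.append("(year = %d AND month = %d AND day = %d)" % (d.year, d.month, d.day))
        d += datetime.timedelta(days=1)
    return '(' + ' OR '.join(clauses) + ')'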
def __init__(self, start, stop, db, dry=False):
    basename = start.replace('-', '') + '_' + stop.replace('-', '')
    self.params = {
        'basename': basename,
        'start': start,
        'stop': stop,
        'tpc_table': basename + '_tpc',
        'wdc_table': basename + '_wdc',
        'tp_table': basename + '_tp',
        'wd_table': basename + '_wd',
        'c_table': basename + '_c',
        'db': db,
        'time_conditon': get_hive_timespan(start, stop),
    }

    if not dry:
        self.create_tpc_table()
        self.create_wdc_table()
        self.create_tp_table()
        self.create_wd_table()
        self.create_c_table()
        self.join_and_clean()
def get_clickstream(table, lang, start, stop, priority=False, min_count=10):
    params = {
        'time_conditions': get_hive_timespan(start, stop, hour=False),
        'table': table,
        'lang': lang,
        'min_count': min_count,
    }

    query = """
    -- ############################################
    -- create helper tables

    -- create copy of page table and insert rows for our special prev pages
    -- this will let us work with ids instead of titles later, which is much less error prone
    DROP TABLE IF EXISTS clickstream.%(table)s_page_helper;
    CREATE TABLE clickstream.%(table)s_page_helper AS
    SELECT
        *
    FROM clickstream.%(lang)s_page
    ;

    INSERT INTO TABLE clickstream.%(table)s_page_helper
    SELECT -1 AS page_id, 0 AS page_namespace, false AS page_is_redirect, 'other-empty' AS page_title
    FROM clickstream.%(table)s_page_helper LIMIT 1;

    INSERT INTO TABLE clickstream.%(table)s_page_helper
    SELECT -2 AS page_id, 0 AS page_namespace, false AS page_is_redirect, 'other-internal' AS page_title
    FROM clickstream.%(table)s_page_helper LIMIT 1;

    INSERT INTO TABLE clickstream.%(table)s_page_helper
    SELECT -3 AS page_id, 0 AS page_namespace, false AS page_is_redirect, 'other-external' AS page_title
    FROM clickstream.%(table)s_page_helper LIMIT 1;

    INSERT INTO TABLE clickstream.%(table)s_page_helper
    SELECT -4 AS page_id, 0 AS page_namespace, false AS page_is_redirect, 'other-search' AS page_title
    FROM clickstream.%(table)s_page_helper LIMIT 1;

    INSERT INTO TABLE clickstream.%(table)s_page_helper
    SELECT -5 AS page_id, 0 AS page_namespace, false AS page_is_redirect, 'other-other' AS page_title
    FROM clickstream.%(table)s_page_helper LIMIT 1;

    -- create pagelinks table that resolves links that end in a redirect
    -- this means that if A links to B, and B redirects to C, we replace the link (A,B) with (A,C)
    -- this lets us properly annotate link types after resolving redirects in the clickstream, since
    -- a user will experience following A as if it linked to C
    -- the group by ensures that each link only occurs once
    DROP TABLE IF EXISTS clickstream.%(table)s_pagelinks_helper;
    CREATE TABLE clickstream.%(table)s_pagelinks_helper AS
    SELECT
        pl_from_page_id, pl_to_page_id
    FROM
        (SELECT
            pl_from_page_id,
            CASE
                WHEN r.rd_to_page_id IS NULL THEN pl_to_page_id
                ELSE rd_to_page_id
            END AS pl_to_page_id
        FROM
            clickstream.%(lang)s_pagelinks l
        LEFT JOIN
            clickstream.%(lang)s_redirect r ON (r.rd_from_page_id = l.pl_to_page_id)
        ) a
    GROUP BY
        pl_from_page_id, pl_to_page_id
    ;

    -- ############################################
    -- extract raw prev, curr pairs
    DROP VIEW IF EXISTS clickstream.%(table)s_temp1;
    CREATE VIEW clickstream.%(table)s_temp1 AS
    SELECT
        CASE
            -- empty or malformed referer
            WHEN referer IS NULL THEN 'other-empty'
            WHEN referer == '' THEN 'other-empty'
            WHEN referer == '-' THEN 'other-empty'
            WHEN parse_url(referer, 'HOST') is NULL THEN 'other-empty'
            -- internal referer from the same wikipedia
            WHEN
                parse_url(referer, 'HOST') in ('%(lang)s.wikipedia.org', '%(lang)s.m.wikipedia.org')
                AND LENGTH(REGEXP_EXTRACT(parse_url(referer, 'PATH'), '/wiki/(.*)', 1)) > 1
            THEN REGEXP_EXTRACT(parse_url(referer, 'PATH'), '/wiki/(.*)', 1)
            -- other referers
            WHEN referer_class = 'internal' THEN 'other-internal'
            WHEN referer_class = 'external' THEN 'other-external'
            WHEN referer_class = 'external (search engine)' THEN 'other-search'
            ELSE 'other-other'
        END as prev,
        pageview_info['page_title'] as curr
    FROM
        wmf.webrequest
    WHERE
        %(time_conditions)s
        AND webrequest_source = 'text'
        AND normalized_host.project_class = 'wikipedia'
        AND normalized_host.project = '%(lang)s'
        AND is_pageview
        AND agent_type = 'user'
    ;

    -- count raw prev, curr pairs, this speeds up later queries
    DROP TABLE IF EXISTS clickstream.%(table)s_temp2;
    CREATE TABLE clickstream.%(table)s_temp2 AS
    SELECT
        prev, curr, COUNT(*) as n
    FROM
        clickstream.%(table)s_temp1
    GROUP BY
        prev, curr
    ;

    -- we enforce that curr and prev are main namespace pages
    -- the joins accomplish this because, in the logs, the non main namespace pages have the namespace prepended
    -- at this point curr and prev are ids
    DROP TABLE IF EXISTS clickstream.%(table)s_temp3;
    CREATE TABLE clickstream.%(table)s_temp3 AS
    SELECT
        pp.page_id as prev, pc.page_id as curr, n
    FROM
        clickstream.%(table)s_temp2
    JOIN
        clickstream.%(table)s_page_helper pp ON (prev = pp.page_title)
    JOIN
        clickstream.%(table)s_page_helper pc ON (curr = pc.page_title)
    WHERE
        pp.page_namespace = 0
        AND pc.page_namespace = 0
    ;

    -- resolve curr redirects, one step
    -- note that prev should not be a redirect, so we do not bother resolving it
    -- and prev redirects will be filtered out at the end
    DROP TABLE IF EXISTS clickstream.%(table)s_temp4;
    CREATE TABLE clickstream.%(table)s_temp4 AS
    SELECT
        prev,
        CASE
            WHEN rd_to_page_id IS NULL THEN curr
            ELSE rd_to_page_id
        END AS curr,
        n
    FROM
        clickstream.%(table)s_temp3
    LEFT JOIN
        clickstream.%(lang)s_redirect ON (curr = rd_from_page_id)
    ;

    -- re-aggregate after resolving redirects and filter out pairs that occur infrequently
    DROP TABLE IF EXISTS clickstream.%(table)s_temp5;
    CREATE TABLE clickstream.%(table)s_temp5 AS
    SELECT
        prev, curr, SUM(n) as n
    FROM
        clickstream.%(table)s_temp4
    GROUP BY
        prev, curr
    HAVING
        SUM(n) > %(min_count)s
    ;

    -- annotate link types
    DROP TABLE IF EXISTS clickstream.%(table)s_temp6;
    CREATE TABLE clickstream.%(table)s_temp6 AS
    SELECT
        prev, curr,
        CASE
            WHEN prev < 0 THEN 'external'
            WHEN (pl_from_page_id IS NOT NULL AND pl_to_page_id IS NOT NULL) THEN 'link'
            ELSE 'other'
        END AS type,
        n
    FROM
        clickstream.%(table)s_temp5
    LEFT JOIN
        clickstream.%(table)s_pagelinks_helper ON (prev = pl_from_page_id AND curr = pl_to_page_id)
    ;

    -- create final table
    -- remove self loops
    -- restrict prev and curr to main namespace, no redirects
    -- get page titles
    DROP TABLE IF EXISTS clickstream.%(table)s;
    CREATE TABLE clickstream.%(table)s
    ROW FORMAT DELIMITED FIELDS TERMINATED BY '\t'
    STORED AS TEXTFILE AS
    SELECT
        pp.page_title as prev, pc.page_title as curr, a.type, a.n
    FROM
        clickstream.%(table)s_temp6 a
    JOIN
        clickstream.%(table)s_page_helper pp ON (prev = pp.page_id)
    JOIN
        clickstream.%(table)s_page_helper pc ON (curr = pc.page_id)
    WHERE
        pp.page_is_redirect = false
        AND pp.page_namespace = 0
        AND pc.page_is_redirect = false
        AND pc.page_namespace = 0
        AND a.curr != a.prev
    ;

    DROP VIEW clickstream.%(table)s_temp1;
    DROP TABLE clickstream.%(table)s_temp2;
    DROP TABLE clickstream.%(table)s_temp3;
    DROP TABLE clickstream.%(table)s_temp4;
    DROP TABLE clickstream.%(table)s_temp5;
    DROP TABLE clickstream.%(table)s_temp6;
    DROP TABLE clickstream.%(table)s_page_helper;
    DROP TABLE clickstream.%(table)s_pagelinks_helper;
    """

    exec_hive_stat2(query % params, priority=priority)
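# Hypothetical usage sketch: the final clickstream.%(table)s table above is stored as
# tab-delimited text with columns (prev, curr, type, n). Assuming it has been exported
# to a local TSV file (the path below is made up), it could be inspected like this:
import pandas as pd


def load_clickstream(path):
    """Load an exported clickstream TSV; column order follows the final SELECT above."""
    df = pd.read_csv(path, sep='\t', names=['prev', 'curr', 'type', 'n'])
    # sort so the heaviest (prev, curr) transitions come first
    return df.sort_values('n', ascending=False)

# clicks = load_clickstream('clickstream_enwiki_2016_02.tsv')  # hypothetical filename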
def get_requests(start, stop, table, trace_db='a2v', prod_db='prod', priority=True, min_count=50):
    query = """
    SET mapreduce.input.fileinputformat.split.maxsize=200000000;
    SET hive.mapred.mode=nonstrict;

    -- get pageviews, resolve redirects, add wikidata ids
    DROP TABLE IF EXISTS %(trace_db)s.%(trace_table)s_pageviews;
    CREATE TABLE %(trace_db)s.%(trace_table)s_pageviews AS
    SELECT
        year, month, day, client_ip, user_agent, x_forwarded_for, ts, pv2.lang, pv2.title, id
    FROM
        (SELECT
            year, month, day, client_ip, user_agent, x_forwarded_for, ts, pv1.lang,
            CASE
                WHEN rd_to_page_title IS NULL THEN raw_title
                ELSE rd_to_page_title
            END AS title
        FROM
            (SELECT
                year, month, day, client_ip, user_agent, x_forwarded_for, ts,
                normalized_host.project AS lang,
                REGEXP_EXTRACT(reflect('java.net.URLDecoder', 'decode', uri_path), '/wiki/(.*)', 1) as raw_title
            FROM
                wmf.webrequest
            WHERE
                is_pageview
                AND webrequest_source = 'text'
                AND normalized_host.project_class = 'wikipedia'
                AND agent_type = 'user'
                AND %(time_conditions)s
                AND LENGTH(REGEXP_EXTRACT(reflect('java.net.URLDecoder', 'decode', uri_path), '/wiki/(.*)', 1)) > 0
            ) pv1
        LEFT JOIN
            (SELECT
                *
            FROM
                prod.redirect
            WHERE
                rd_from_page_namespace = 0
                AND rd_to_page_namespace = 0
                AND lang RLIKE '.*'
            ) r
        ON pv1.raw_title = r.rd_from_page_title
            AND pv1.lang = r.lang
        ) pv2
    INNER JOIN
        %(prod_db)s.wikidata_will w
    ON pv2.title = w.title
        AND pv2.lang = w.lang;

    -- collect clients who have edited, so they can be excluded below
    DROP TABLE IF EXISTS %(trace_db)s.%(trace_table)s_editors;
    CREATE TABLE %(trace_db)s.%(trace_table)s_editors AS
    SELECT
        client_ip, user_agent, x_forwarded_for
    FROM
        wmf.webrequest
    WHERE
        uri_query RLIKE 'action=edit'
        AND %(time_conditions)s
    GROUP BY
        client_ip, user_agent, x_forwarded_for;

    -- keep only pageviews from clients who did not edit
    DROP TABLE IF EXISTS %(trace_db)s.%(trace_table)s_reader_pageviews;
    CREATE TABLE %(trace_db)s.%(trace_table)s_reader_pageviews AS
    SELECT
        p.*
    FROM
        %(trace_db)s.%(trace_table)s_pageviews p
    LEFT JOIN
        %(trace_db)s.%(trace_table)s_editors e
    ON (p.client_ip = e.client_ip
        AND p.user_agent = e.user_agent
        AND p.x_forwarded_for = e.x_forwarded_for)
    WHERE
        e.client_ip is NULL
        AND e.user_agent is NULL
        AND e.x_forwarded_for is NULL;

    -- count distinct clients per wikidata item
    DROP TABLE IF EXISTS %(trace_db)s.%(trace_table)s_clients_per_item;
    CREATE TABLE %(trace_db)s.%(trace_table)s_clients_per_item AS
    SELECT
        id, COUNT(*) as n
    FROM
        (SELECT
            client_ip, user_agent, x_forwarded_for, id
        FROM
            %(trace_db)s.%(trace_table)s_reader_pageviews
        GROUP BY
            client_ip, user_agent, x_forwarded_for, id
        ) a
    GROUP BY
        id;

    -- remove disambiguation pages and pages with colon in title
    DROP TABLE IF EXISTS %(trace_db)s.%(trace_table)s_eligible_reader_pageviews;
    CREATE TABLE %(trace_db)s.%(trace_table)s_eligible_reader_pageviews AS
    SELECT
        pv.*
    FROM
        (SELECT
            p.*
        FROM
            %(trace_db)s.%(trace_table)s_reader_pageviews p
        JOIN
            %(trace_db)s.%(trace_table)s_clients_per_item c ON (p.id = c.id)
        WHERE
            c.n >= %(min_count)s
        ) pv
    LEFT JOIN
        (SELECT
            lang, page_title
        FROM
            %(prod_db)s.page_props
        WHERE
            propname = 'disambiguation'
            AND lang RLIKE '.*'
            AND page_namespace = 0
        GROUP BY
            lang, page_title
        ) d
    ON (pv.lang = d.lang and pv.title = d.page_title)
    WHERE
        d.page_title IS NULL
        AND pv.title NOT RLIKE 'disambig'
        AND pv.title NOT RLIKE ':';

    -- assemble one request string per client
    DROP TABLE IF EXISTS %(trace_db)s.%(trace_table)s_requests;
    CREATE TABLE %(trace_db)s.%(trace_table)s_requests AS
    SELECT
        CONCAT_WS('||', COLLECT_LIST(request)) AS requests
    FROM
        (SELECT
            client_ip, user_agent, x_forwarded_for,
            CONCAT('ts|', ts, '|id|', id, '|title|', title, '|lang|', lang) AS request
        FROM
            %(trace_db)s.%(trace_table)s_eligible_reader_pageviews
        ) a
    GROUP BY
        client_ip, user_agent, x_forwarded_for
    HAVING
        COUNT(*) <= 1000
        AND COUNT(*) > 1;

    DROP TABLE IF EXISTS %(trace_db)s.%(trace_table)s_eligible_reader_pageviews;
    DROP TABLE IF EXISTS %(trace_db)s.%(trace_table)s_clients_per_item;
    DROP TABLE IF EXISTS %(trace_db)s.%(trace_table)s_reader_pageviews;
    DROP TABLE IF EXISTS %(trace_db)s.%(trace_table)s_editors;
    DROP TABLE IF EXISTS %(trace_db)s.%(trace_table)s_pageviews;
    """

    params = {
        'time_conditions': get_hive_timespan(start, stop, hour=False),
        'trace_db': trace_db,
        'prod_db': prod_db,
        'trace_table': table,
        'min_count': min_count,
    }

    exec_hive_stat2(query % params, priority=priority)
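# Sketch for consuming the %(trace_table)s_requests output above: each row is a single
# string of pageview records joined by '||', and each record has the form
# 'ts|<ts>|id|<id>|title|<title>|lang|<lang>' (per the CONCAT in the query). Fetching the
# rows from Hive is left out; only the parsing of one such string is shown.
def parse_requests(requests_str):
    """Split a concatenated request string into a list of dicts with ts/id/title/lang keys."""
    records = []
    for record in requests_str.split('||'):
        fields = record.split('|')
        # fields alternate key, value: ['ts', <ts>, 'id', <id>, 'title', <title>, 'lang', <lang>]
        records.append(dict(zip(fields[::2], fields[1::2])))
    return records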
def create_hive_ts(cp_dict, start, stop):
    """Build censorship.daily_ts2: daily per-country, per-project page view counts,
    their proportion of that day's project-wide views, and the English page title.
    cp_dict is passed to get_country_project_condition to build the country/project filter."""
    query = """
    DROP TABLE IF EXISTS censorship.daily_ts2;
    CREATE TABLE censorship.daily_ts2 AS
    SELECT
        CONCAT(ts.year, '-', LPAD(ts.month, 2, '0'), '-', LPAD(ts.day, 2, '0')) as day,
        ts.country,
        ts.project,
        ts.page_title,
        ts.n,
        ts.n / agg.n_agg as proportion,
        wd.en_page_title
    FROM
        (SELECT
            year, month, day, country, project, page_title, SUM(view_count) as n
        FROM
            wmf.pageview_hourly
        WHERE
            agent_type = 'user'
            AND page_title not RLIKE ':'
            AND %(cp_conditions)s
            AND %(time_conditions)s
        GROUP BY
            year, month, day, country, project, page_title
        ) ts
    LEFT JOIN
        (SELECT
            year, month, day, project, page_title, SUM(view_count) as n_agg
        FROM
            wmf.pageview_hourly
        WHERE
            agent_type = 'user'
            AND page_title not RLIKE ':'
            AND %(time_conditions)s
        GROUP BY
            year, month, day, project, page_title
        ) agg
    ON (ts.year = agg.year
        AND ts.month = agg.month
        AND ts.day = agg.day
        AND ts.project = agg.project
        AND ts.page_title = agg.page_title)
    LEFT JOIN
        censorship.wikidata wd
    ON (ts.page_title = wd.page_title AND ts.project = wd.project);
    """

    params = {
        'cp_conditions': get_country_project_condition(cp_dict),
        'time_conditions': get_hive_timespan(start, stop),
    }

    query %= params
    query_hive_ssh(query, 'ts', priority=True)
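# get_country_project_condition is not defined in this file. A minimal sketch, assuming
# cp_dict maps a country value to a list of projects and that the filter is applied to
# pageview_hourly's country and project fields; both assumptions, the real helper may differ.
def get_country_project_condition_sketch(cp_dict):
    """Build an OR-of-ANDs Hive condition from {country: [project, ...]}."""
    clauses = []
    for country, projects in cp_dict.items():
        project_list = ', '.join("'%s'" % p for p in projects)
        clauses.append("(country = '%s' AND project IN (%s))" % (country, project_list))
    return '(' + ' OR '.join(clauses) + ')'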
def add_day_to_hive_trace_table(db_name, table_name, day, priority=True):
    query = """
    INSERT OVERWRITE TABLE %(db_name)s.%(table_name)s_by_day
    PARTITION(year=%(year)d, month=%(month)d, day=%(day)d, host)
    SELECT
        client_ip,
        user_agent,
        geocoded_data,
        user_agent_map,
        CONCAT_WS('REQUEST_DELIM', COLLECT_LIST(request)) AS requests,
        uri_host AS host
    FROM
        (SELECT
            client_ip,
            user_agent,
            geocoded_data,
            user_agent_map,
            CONCAT('ts|', ts,
                   '|referer|', referer,
                   '|title|', title,
                   '|uri_path|', reflect('java.net.URLDecoder', 'decode', uri_path),
                   '|uri_query|', reflect('java.net.URLDecoder', 'decode', uri_query),
                   '|is_pageview|', is_pageview,
                   '|access_method|', access_method,
                   '|referer_class|', referer_class,
                   '|project|', normalized_host.project_class,
                   '|lang|', normalized_host.project
            ) AS request,
            uri_host
        FROM
            (SELECT
                c.*,
                CASE
                    WHEN NOT is_pageview THEN NULL
                    WHEN rd_to IS NULL THEN raw_title
                    ELSE rd_to
                END AS title
            FROM
                (SELECT
                    w.*,
                    CASE
                        WHEN is_pageview THEN pageview_info['page_title']
                        ELSE round(RAND(), 5)
                    END AS raw_title
                FROM
                    wmf.webrequest w
                WHERE
                    webrequest_source = 'text'
                    AND agent_type = 'user'
                    AND %(time_conditions)s
                    AND hour = 1
                    AND access_method != 'mobile app'
                    AND uri_host in ('en.wikipedia.org', 'en.m.wikipedia.org')
                ) c
            LEFT JOIN
                traces.en_redirect r ON c.raw_title = r.rd_from
            ) b
        ) a
    GROUP BY
        client_ip, user_agent, geocoded_data, user_agent_map, uri_host
    HAVING
        COUNT(*) < 500;
    """

    day_dt = dateutil.parser.parse(day)

    params = {
        'time_conditions': get_hive_timespan(day, day, hour=False),
        'db_name': db_name,
        'table_name': table_name,
        'year': day_dt.year,
        'month': day_dt.month,
        'day': day_dt.day,
    }

    exec_hive_stat2(query % params, priority=priority)
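# Hypothetical driver: add_day_to_hive_trace_table above inserts a single day's partition,
# so a multi-day backfill would call it once per day. The wrapper below is an assumed
# convenience, not part of the original module; the database and table names in the
# commented call are made up.
import pandas as pd


def add_days_to_hive_trace_table(db_name, table_name, start, stop, priority=True):
    """Call add_day_to_hive_trace_table for every day in [start, stop]."""
    for day in pd.date_range(start, stop):
        add_day_to_hive_trace_table(db_name, table_name, day.strftime('%Y-%m-%d'),
                                    priority=priority)

# add_days_to_hive_trace_table('traces', 'readers', '2016-03-01', '2016-03-07')  # hypothetical names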
def get_clickstream(table, lang, start, stop, priority=False, min_count=10):
    params = {
        'time_conditions': get_hive_timespan(start, stop, hour=False),
        'table': table,
        'lang': lang,
        'min_count': min_count,
    }

    query = """
    -- extract raw prev, curr pairs
    DROP VIEW IF EXISTS west1.clickstream_%(table)s_temp1;
    CREATE VIEW west1.clickstream_%(table)s_temp1 AS
    SELECT
        CASE
            -- empty or malformed referer
            WHEN referer IS NULL THEN 'other-empty'
            WHEN referer == '' THEN 'other-empty'
            WHEN referer == '-' THEN 'other-empty'
            WHEN parse_url(referer, 'HOST') is NULL THEN 'other-empty'
            -- internal referer from the same wikipedia
            WHEN
                parse_url(referer, 'HOST') in ('%(lang)s.wikipedia.org', '%(lang)s.m.wikipedia.org')
                AND LENGTH(REGEXP_EXTRACT(parse_url(referer, 'PATH'), '/wiki/(.*)', 1)) > 1
            THEN REGEXP_EXTRACT(parse_url(referer, 'PATH'), '/wiki/(.*)', 1)
            -- other referers
            WHEN referer_class = 'internal' THEN 'other-internal'
            WHEN referer_class = 'external' THEN 'other-external'
            WHEN referer_class = 'external (search engine)' THEN 'other-search'
            ELSE 'other-other'
        END as prev,
        pageview_info['page_title'] as curr
    FROM
        wmf.webrequest
    WHERE
        %(time_conditions)s
        AND webrequest_source = 'text'
        AND normalized_host.project_class = 'wikipedia'
        AND normalized_host.project = '%(lang)s'
        AND is_pageview
        AND agent_type = 'user';

    -- count raw prev, curr pairs
    DROP VIEW IF EXISTS west1.clickstream_%(table)s_temp2;
    CREATE VIEW west1.clickstream_%(table)s_temp2 AS
    SELECT
        curr, prev, COUNT(*) as n
    FROM
        west1.clickstream_%(table)s_temp1
    GROUP BY
        curr, prev;

    -- resolve redirects
    DROP VIEW IF EXISTS west1.clickstream_%(table)s_temp3;
    CREATE VIEW west1.clickstream_%(table)s_temp3 AS
    SELECT
        CASE
            WHEN prev in ('other-empty', 'other-internal', 'other-external', 'other-search', 'other-other') THEN prev
            WHEN pr.rd_to IS NULL THEN prev
            ELSE pr.rd_to
        END AS prev,
        CASE
            WHEN cr.rd_to IS NULL THEN curr
            ELSE cr.rd_to
        END AS curr,
        curr AS curr_unresolved,
        n
    FROM
        west1.clickstream_%(table)s_temp2
    LEFT JOIN
        west1.%(lang)s_redirect pr ON (prev = pr.rd_from)
    LEFT JOIN
        west1.%(lang)s_redirect cr ON (curr = cr.rd_from);

    -- re-aggregate after resolving redirects and filter out pairs that occur infrequently
    DROP VIEW IF EXISTS west1.clickstream_%(table)s_temp4;
    CREATE VIEW west1.clickstream_%(table)s_temp4 AS
    SELECT
        curr, curr_unresolved, prev, SUM(n) as n
    FROM
        west1.clickstream_%(table)s_temp3
    GROUP BY
        curr, curr_unresolved, prev
    HAVING
        SUM(n) > %(min_count)s;

    -- only include main namespace articles
    DROP VIEW IF EXISTS west1.clickstream_%(table)s_temp5;
    CREATE VIEW west1.clickstream_%(table)s_temp5 AS
    SELECT
        curr, curr_unresolved, prev, n
    FROM
        west1.clickstream_%(table)s_temp4
    LEFT JOIN
        west1.%(lang)s_page_raw pp ON (prev = pp.page_title)
    LEFT JOIN
        west1.%(lang)s_page_raw cp ON (curr = cp.page_title)
    WHERE
        cp.page_title is not NULL
        AND (pp.page_title is NOT NULL
             OR prev in ('other-empty', 'other-internal', 'other-external', 'other-search', 'other-other'));

    -- annotate link types
    DROP VIEW IF EXISTS west1.clickstream_%(table)s_temp6;
    CREATE VIEW west1.clickstream_%(table)s_temp6 AS
    SELECT
        prev, curr, curr_unresolved,
        CASE
            WHEN prev in ('other-empty', 'other-internal', 'other-external', 'other-search', 'other-other') THEN 'external'
            WHEN l.pl_from IS NOT NULL AND l.pl_to IS NOT NULL THEN 'link'
            ELSE 'other'
        END AS type,
        n
    FROM
        west1.clickstream_%(table)s_temp5
    LEFT JOIN
        west1.%(lang)s_pagelinks l ON (prev = l.pl_from AND curr = l.pl_to);

    -- create table
    DROP TABLE IF EXISTS west1.clickstream_%(table)s;
    CREATE TABLE west1.clickstream_%(table)s
    ROW FORMAT DELIMITED FIELDS TERMINATED BY '\t'
    STORED AS TEXTFILE AS
    SELECT
        *
    FROM
        west1.clickstream_%(table)s_temp6
    WHERE
        curr != prev;

    DROP VIEW west1.clickstream_%(table)s_temp1;
    DROP VIEW west1.clickstream_%(table)s_temp2;
    DROP VIEW west1.clickstream_%(table)s_temp3;
    DROP VIEW west1.clickstream_%(table)s_temp4;
    DROP VIEW west1.clickstream_%(table)s_temp5;
    DROP VIEW west1.clickstream_%(table)s_temp6;
    """

    print(query % params)
    exec_hive_stat2(query % params, priority=priority)