import sys
from datetime import datetime

from elasticsearch import Elasticsearch
from keen.client import KeenClient

# Repo-local imports assumed available alongside this script:
# utils, settings, script_settings, and the load_batch_for() helper.


def main(dry_run=True, batch_count=None, force=False):
    """Upload the pageviews to Keen."""
    history_run_id = utils.get_history_run_id_for('transform02')
    complaints_run_id = utils.get_complaints_run_id_for('transform02')
    if history_run_id != complaints_run_id:
        print("You need to validate your second-phase transformed data! Bailing...")
        sys.exit()

    extract_complaints = utils.get_complaints_for('transform02', 'r')
    extract_complaints.readline()  # toss header
    if extract_complaints.readline():
        print("You have unaddressed complaints in your second-phase transform!")
        if not force:
            print("  ...pass --force to ignore")
            sys.exit()

    history_file = utils.get_history_for('load', 'a')
    history_file.write(script_settings.RUN_HEADER + '{}\n'.format(complaints_run_id))
    history_file.write('Beginning upload at: {}Z\n'.format(datetime.utcnow()))

    keen_clients = {'public': None, 'private': None}
    es_client = None

    if dry_run:
        print("Doing dry-run upload to Elasticsearch. Pass --for-reals to upload to Keen")
        es_client = Elasticsearch()
        try:
            es_client.indices.delete(script_settings.ES_INDEX)
        except Exception as exc:
            print(exc)
    else:
        keen_clients = {
            'public': KeenClient(
                project_id=settings.KEEN['public']['project_id'],
                write_key=settings.KEEN['public']['write_key'],
            ),
            'private': KeenClient(
                project_id=settings.KEEN['private']['project_id'],
                write_key=settings.KEEN['private']['write_key'],
            ),
        }

    tally = {}

    # Batches already uploaded by a previous, interrupted run are recorded in
    # resume.log; skip them on this pass.
    seen = {}
    try:
        with open(utils.get_dir_for('load') + '/resume.log', 'r') as resume_log:
            for seen_file in resume_log.readlines():
                seen[seen_file.strip('\n')] = 1
    except IOError:
        pass

    batch_count = utils.get_batch_count() if batch_count is None else batch_count
    print("Beginning Upload")
    with open(utils.get_dir_for('load') + '/resume.log', 'a', 0) as resume_log:  # 0 => unbuffered
        for batch_id in range(1, batch_count + 1):
            print("  Batch {}".format(batch_id))
            for domain in ('private', 'public'):
                print("    Domain: {}".format(domain))
                file_id = '{}-{}'.format(domain, batch_id)
                if file_id in seen:
                    print("    ...seen, skipping.\n")
                    continue
                history_file.write('Uploading for {} project, batch {}'.format(domain, batch_id))
                load_batch_for(batch_id, domain, tally, dry_run, es_client, keen_clients[domain])
                resume_log.write('{}\n'.format(file_id))
                history_file.write('  ...finished\n')

    print("Finished Upload")
    history_file.write('Finished upload at: {}Z\n'.format(datetime.utcnow()))
    history_file.write('Tally was:\n')
    for k, v in sorted(tally.items()):
        history_file.write('  {}: {}\n'.format(k, v))
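# A minimal sketch of the load_batch_for() helper invoked above; the real
# implementation lives elsewhere in this repo. The batch-file location (the
# transform02 output dir), the 'pageviews' collection name, and the
# Elasticsearch doc type are assumptions, not confirmed by this script.
import json


def load_batch_for(batch_id, domain, tally, dry_run, es_client, keen_client):
    """Read one transformed batch file and upload its events."""
    batch_path = '{}/{}-{:04d}.data'.format(
        utils.get_dir_for('transform02'), domain, batch_id)
    with open(batch_path, 'r') as batch_file:
        events = [json.loads(line) for line in batch_file]

    tally['{}-{}'.format(domain, batch_id)] = len(events)

    if dry_run:
        # Dry runs index each event into local Elasticsearch instead of Keen.
        for event in events:
            es_client.index(index=script_settings.ES_INDEX,
                            doc_type='pageview', body=event)
    else:
        # keen.client.KeenClient.add_events() takes {collection: [events]}.
        keen_client.add_events({'pageviews': events})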
def main(): """This script extracts pageview data from the OSF Piwik db and outputs the results to a dumpfile. """ try: mysql_db = MySQLdb.connect(host=settings.PIWIK_DB_HOST, port=settings.PIWIK_DB_PORT, user=settings.PIWIK_DB_USER, passwd=settings.PIWIK_DB_PASSWORD, db=settings.PIWIK_DB_NAME) except MySQLdb.Error as err: print "MySQL Error [%d]: %s" % (err.args[0], err.args[1]) raise err my_cursor = mysql_db.cursor(MySQLdb.cursors.DictCursor) my_cursor.execute("SET NAMES 'utf8'") my_cursor.execute( "SET sql_mode = 'ERROR_FOR_DIVISION_BY_ZERO,NO_AUTO_CREATE_USER,NO_AUTO_VALUE_ON_ZERO," "NO_ZERO_DATE,NO_ZERO_IN_DATE';") history_file = utils.get_history_for('extract', 'w') output_file = open( utils.get_dir_for('extract') + '/' + settings.EXTRACT_FILE, 'w') count = 0 last_count = 0 history_file.write(settings.RUN_HEADER + '{}\n'.format(uuid.uuid4())) history_file.write('Beginning extraction at: {}Z\n'.format( datetime.utcnow())) visit_cursor = get_visits(mysql_db) visit = visit_cursor.fetchone() while visit is not None: visit['tz_offset'] = calculate_tz_offset( str(visit['visitor_localtime']), str(visit['first_contact_server_time'])) action_cursor = get_actions_for_visit(mysql_db, visit['idvisit']) action = action_cursor.fetchone() action_count = 0 while action is not None: action_count += 1 referrer_url = None if action_count == 1 and visit['referer_type'] in ( 2, 3, ): referrer_url = visit['referer_url'] elif action['previous_url']: url_scheme = '' if action[ 'previous_url_prefix'] is None else URL_PREFIX[ action['previous_url_prefix']] referrer_url = url_scheme + action['previous_url'] # piwik stores searches weird. if action['page_title_type'] and action['page_title_type'] == 8: action['page_url'] = 'staging.osf.io/search/?q=' + action[ 'page_title'] action['page_url_prefix'] = 2 action['page_title'] = 'OSF | Search' pageview = { 'visit': { 'id': visit['idvisit'], 'visitor_id': b2a_hex(visit['idvisitor']), 'visitor_returning': visit['visitor_returning'], 'ip_addr': None if visit['location_ip'] == NULL_IP else inet_ntoa( visit['location_ip']), 'user_id': visit['user_id'], 'tz_offset': visit['tz_offset'], 'ua': { 'os': visit['config_os'], 'os_version': None, 'browser': { 'version': visit['config_browser_version'], 'name': visit['config_browser_name'], 'cookies': visit['config_cookie'], 'locale': visit['location_browser_lang'], }, 'screen': visit['config_resolution'], 'device': None, }, }, 'action': { 'id': action['visit_action_id'], 'parent_node_id': action['parent_node_id'], 'node_id': action['node_id'], 'node_tags': action['node_tags'], 'page': { 'url': action['page_url'], 'url_prefix': None if action['page_url_prefix'] is None else URL_PREFIX[action['page_url_prefix']], 'url_id': action['page_url_id'], 'url_type': action['page_url_type'], 'title': action['page_title'], 'title_id': action['page_title_id'], 'title_type': action['page_title_type'], 'is_search': True if action['page_title_type'] and action['page_title_type'] == 8 else False, }, 'referrer': referrer_url, 'timestamp': str(action['server_time']), }, } output_file.write(json.dumps(pageview) + '\n') history_file.write('\tLast action written timestamp: ' + str(action['server_time']) + '\n') count += 1 action = action_cursor.fetchone() visit = visit_cursor.fetchone() history_file.write('Finished extraction at: {}Z\n'.format( datetime.utcnow())) history_file.write('Final count was: {}\n'.format(count)) print("Final count is: {}".format(count)) history_file.close() output_file.close() mysql_db.close()
import copy
import json
import sys
from datetime import datetime

# Repo-local imports assumed available alongside this script:
# utils, settings, and the write_batch() helper.


def main(force=False):
    history_run_id = utils.get_history_run_id_for('transform01')
    complaints_run_id = utils.get_complaints_run_id_for('transform01')
    if history_run_id != complaints_run_id:
        print("You need to validate your first-phase transformed data! Bailing...")
        sys.exit()

    extract_complaints = utils.get_complaints_for('transform01', 'r')
    extract_complaints.readline()  # toss header
    if extract_complaints.readline():
        print("You have unaddressed complaints in your first-phase transform!")
        if not force:
            print("  ...pass --force to ignore")
            sys.exit()

    history_file = utils.get_history_for('transform02', 'w')
    history_file.write('Run ID: {}\n'.format(complaints_run_id))
    history_file.write('Beginning transformation at: {}Z\n'.format(datetime.utcnow()))

    transform_dir = utils.get_dir_for('transform02')
    public_template = transform_dir + '/public-{0:04d}.data'
    private_template = transform_dir + '/private-{0:04d}.data'

    # If a previous run was interrupted, resume.log holds the number of the
    # last line batched; seek near EOF, since one entry fits in 32 bytes.
    lastline = 0
    try:
        with open(transform_dir + '/resume.log', 'r') as fp:
            fp.seek(-32, 2)
            lastline = int(fp.readlines()[-1].strip('\n'))
    except IOError:
        pass

    linenum = 0
    batchnum = 0
    public_pageviews = []
    private_pageviews = []

    with open(transform_dir + '/resume.log', 'a', 0) as resume_file:  # pass 0 for unbuffered writing
        with open(utils.get_dir_for('transform01') + '/' + settings.TRANSFORM01_FILE, 'r') as input_file:
            print('Lastline is: {}\n'.format(lastline))
            for i, pageview_json in enumerate(input_file):
                linenum = i + 1
                if linenum <= lastline:
                    if not linenum % 1000:
                        print('Skipping line {} of ***{}***'.format(linenum, lastline))
                    continue

                if not linenum % 1000:
                    print('Batching line {}'.format(linenum))

                pageview = json.loads(pageview_json)
                made_public_date = pageview['node']['made_public_date']
                del pageview['node']['made_public_date']

                private_pageviews.append(pageview)

                # Only pageviews logged after the most recent made-public date
                # are copied to the public collection.
                if made_public_date is not None and made_public_date < pageview['keen']['timestamp']:
                    public_pageview = copy.deepcopy(pageview)

                    for private_property in ('tech', 'user', 'visitor', 'geo'):
                        del public_pageview[private_property]

                    # Drop the Keen enrichment addons; filtering avoids
                    # mutating the list while iterating over it.
                    public_pageview['keen']['addons'] = [
                        addon for addon in public_pageview['keen']['addons']
                        if addon['name'] not in ('keen:ip_to_geo', 'keen:ua_parser')
                    ]

                    public_pageviews.append(public_pageview)

                if linenum % settings.BATCH_SIZE == 0:
                    batchnum += 1
                    write_batch(batchnum, complaints_run_id, 'public', public_pageviews, transform_dir)
                    write_batch(batchnum, complaints_run_id, 'private', private_pageviews, transform_dir)
                    resume_file.write('{}\n'.format(linenum))  # record progress for the resume logic above

    # Flush any remainder that didn't fill a whole batch.
    if linenum % settings.BATCH_SIZE != 0:
        batchnum += 1
        write_batch(batchnum, complaints_run_id, 'public', public_pageviews, transform_dir)
        write_batch(batchnum, complaints_run_id, 'private', private_pageviews, transform_dir)

    history_file.write(settings.BATCH_HEADER + '{}\n'.format(batchnum))
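# A minimal sketch of the write_batch() helper called above. Because the
# public/private accumulator lists are reused from batch to batch, this sketch
# assumes write_batch() drains the list it is given after writing; the file
# naming follows the public-/private- templates defined in main().
import json


def write_batch(batchnum, run_id, domain, pageviews, transform_dir):
    """Write one batch of pageview events as newline-delimited JSON."""
    # run_id is presumably recorded elsewhere (e.g. the history file); it is
    # unused in this sketch.
    batch_path = '{}/{}-{:04d}.data'.format(transform_dir, domain, batchnum)
    with open(batch_path, 'w') as batch_file:
        for pageview in pageviews:
            batch_file.write(json.dumps(pageview) + '\n')
    # Drain in place so the caller's accumulator starts empty for the next batch.
    del pageviews[:]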
def main(): """This script extracts pageview data from the OSF Piwik db and outputs the results to a dumpfile. """ try: mysql_db = MySQLdb.connect( host=settings.PIWIK_DB_HOST, port=settings.PIWIK_DB_PORT, user=settings.PIWIK_DB_USER, passwd=settings.PIWIK_DB_PASSWORD, db=settings.PIWIK_DB_NAME, ) except MySQLdb.Error as err: print "MySQL Error [%d]: %s" % (err.args[0], err.args[1]) raise err my_cursor = mysql_db.cursor(MySQLdb.cursors.DictCursor) my_cursor.execute("SET NAMES 'utf8'") my_cursor.execute( "SET sql_mode = 'ERROR_FOR_DIVISION_BY_ZERO,NO_AUTO_CREATE_USER,NO_AUTO_VALUE_ON_ZERO," "NO_ZERO_DATE,NO_ZERO_IN_DATE';" ) history_file = utils.get_history_for("extract", "w") output_file = open(utils.get_dir_for("extract") + "/" + settings.EXTRACT_FILE, "w") count = 0 last_count = 0 history_file.write(settings.RUN_HEADER + "{}\n".format(uuid.uuid4())) history_file.write("Beginning extraction at: {}Z\n".format(datetime.utcnow())) visit_cursor = get_visits(mysql_db) visit = visit_cursor.fetchone() while visit is not None: visit["tz_offset"] = calculate_tz_offset( str(visit["visitor_localtime"]), str(visit["first_contact_server_time"]) ) action_cursor = get_actions_for_visit(mysql_db, visit["idvisit"]) action = action_cursor.fetchone() action_count = 0 while action is not None: action_count += 1 referrer_url = None if action_count == 1 and visit["referer_type"] in (2, 3): referrer_url = visit["referer_url"] elif action["previous_url"]: url_scheme = "" if action["previous_url_prefix"] is None else URL_PREFIX[action["previous_url_prefix"]] referrer_url = url_scheme + action["previous_url"] # piwik stores searches weird. if action["page_title_type"] and action["page_title_type"] == 8: action["page_url"] = "staging.osf.io/search/?q=" + action["page_title"] action["page_url_prefix"] = 2 action["page_title"] = "OSF | Search" pageview = { "visit": { "id": visit["idvisit"], "visitor_id": b2a_hex(visit["idvisitor"]), "visitor_returning": visit["visitor_returning"], "ip_addr": None if visit["location_ip"] == NULL_IP else inet_ntoa(visit["location_ip"]), "user_id": visit["user_id"], "tz_offset": visit["tz_offset"], "ua": { "os": visit["config_os"], "os_version": None, "browser": { "version": visit["config_browser_version"], "name": visit["config_browser_name"], "cookies": visit["config_cookie"], "locale": visit["location_browser_lang"], }, "screen": visit["config_resolution"], "device": None, }, }, "action": { "id": action["visit_action_id"], "parent_node_id": action["parent_node_id"], "node_id": action["node_id"], "node_tags": action["node_tags"], "page": { "url": action["page_url"], "url_prefix": None if action["page_url_prefix"] is None else URL_PREFIX[action["page_url_prefix"]], "url_id": action["page_url_id"], "url_type": action["page_url_type"], "title": action["page_title"], "title_id": action["page_title_id"], "title_type": action["page_title_type"], "is_search": True if action["page_title_type"] and action["page_title_type"] == 8 else False, }, "referrer": referrer_url, "timestamp": str(action["server_time"]), }, } output_file.write(json.dumps(pageview) + "\n") history_file.write("\tLast action written timestamp: " + str(action["server_time"]) + "\n") count += 1 action = action_cursor.fetchone() visit = visit_cursor.fetchone() history_file.write("Finished extraction at: {}Z\n".format(datetime.utcnow())) history_file.write("Final count was: {}\n".format(count)) print ("Final count is: {}".format(count)) history_file.close() output_file.close() mysql_db.close()