def verify_files(domain, batch_count, run_id, complaints_file):
    complaints = 0
    work_dir = utils.get_dir_for('transform02')
    files = glob.glob(work_dir + '/' + domain + '-*.data')
    if batch_count > len(files):
        complaints += 1
        complaints_file.write('Too few {} files found! got {}, expected {}\n'.format(
            domain, len(files), batch_count,
        ))
    elif batch_count < len(files):
        complaints += 1
        complaints_file.write('Too many {} files found! got {}, expected {}\n'.format(
            domain, len(files), batch_count,
        ))

    # the final batch may legitimately be short; match e.g. 'public-0012.data'
    # when batch_count == 12 so it is exempted from the size check below
    lastfile_re = domain + r'-0*' + str(batch_count) + r'\.data'
    for filename in files:
        data_file = open(filename, 'r')
        file_run_id = data_file.readline().replace(settings.RUN_HEADER, '').rstrip()
        if file_run_id != run_id:
            complaints += 1
            complaints_file.write('Invalid Run ID for {}! got {}, expected {}\n'.format(
                filename, file_run_id, run_id,
            ))
            break

        events = json.loads(data_file.readline())
        if len(events) != settings.BATCH_SIZE and not re.search(lastfile_re, filename):
            complaints += 1
            complaints_file.write('Wrong number of events for {}! got {}, expected {}\n'.format(
                filename, len(events), settings.BATCH_SIZE,
            ))

        if domain == 'public':
            eventnum = 0
            for event in events:
                eventnum += 1
                # events are plain dicts parsed from JSON, so use membership
                # tests; hasattr() on a dict would always be False here
                if 'tech' in event:
                    complaints += 1
                    complaints_file.write(
                        'Event {} in {} has private data! "tech" shouldn\'t be included\n'.format(
                            eventnum, filename,
                        )
                    )
                if 'user' in event:
                    complaints += 1
                    complaints_file.write(
                        'Event {} in {} has private data! "user" shouldn\'t be included\n'.format(
                            eventnum, filename,
                        )
                    )

    return complaints
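# A minimal sketch (not part of the pipeline) of the batch-file layout that
# verify_files() expects: line one is settings.RUN_HEADER plus the run id,
# line two is one JSON array of events. The helper name and sample path are
# illustrative assumptions, not repo code.
def _write_sample_batch(path, run_id, events):
    with open(path, 'w') as f:
        f.write(settings.RUN_HEADER + run_id + '\n')  # header line consumed by readline()
        f.write(json.dumps(events) + '\n')            # the whole batch on a single line

# Usage sketch: every batch except the last must hold exactly
# settings.BATCH_SIZE events, or verify_files() files a complaint.
# _write_sample_batch(work_dir + '/public-0001.data', run_id, events)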
def main():
    input_filename = '/'.join([
        utils.get_dir_for('extract'),
        settings.EXTRACT_FILE,
    ])
    input_file = open(input_filename, 'r')

    run_id = utils.get_history_run_id_for('extract')
    complaints_file = utils.get_complaints_for('extract', 'w')
    complaints_file.write('Run ID: {}\n'.format(run_id))

    linenum = 0
    complaints = 0
    for pageview_json in input_file.readlines():
        linenum += 1
        pageview = json.loads(pageview_json)
        visit = pageview['visit']
        action = pageview['action']

        # ip addresses should all be scrubbed: piwik zeroes the last two
        # octets, and fully-null addresses arrive as None
        if visit['ip_addr'] is not None and not re.search(r'0\.0$', visit['ip_addr']):
            complaints += 1
            complaints_file.write(
                'Line {}, ID {}: unscrubbed ip address! ({})\n'.format(
                    linenum, action['id'], visit['ip_addr']))

        if not action['page']['url']:
            complaints += 1
            complaints_file.write(
                'Line {}, ID {}: page url is missing!\n'.format(
                    linenum, action['id']))
        elif re.match(r'https?://', action['page']['url']):
            complaints += 1
            complaints_file.write(
                'Line {}, ID {}: page url includes domain! ({})\n'.format(
                    linenum, action['id'], action['page']['url'].encode('utf-8')))

    if complaints > 0:
        print("You've got {} problems, but a ready-to-go migration ain't one!".format(complaints))
    else:
        print("Looks good. How'd you manage that?")
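# For reference, a minimal record that passes the checks above (illustrative
# sample values, not real data): the ip ends in '.0.0' because piwik zeroes
# the last two octets, and the page url carries no scheme or domain.
_sample_ok_line = json.dumps({
    'visit': {'ip_addr': '128.84.0.0'},
    'action': {'id': 1, 'page': {'url': 'project/abcde/'}},
})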
def main():
    run_id = utils.get_history_run_id_for('transform01')
    complaints_file = utils.get_complaints_for('transform01', 'w')
    complaints_file.write('Run ID: {}\n'.format(run_id))

    linenum = 0
    complaints = 0
    with open(utils.get_dir_for('transform01') + '/' + settings.TRANSFORM01_FILE, 'r') as input_file:
        for i, pageview_json in enumerate(input_file):
            linenum = i + 1
            if not linenum % 100:
                print('Validating line {}'.format(linenum))

            pageview = json.loads(pageview_json)

            if pageview['page']['url'] is None:
                complaints += 1
                complaints_file.write('Line {}: empty url!\n'.format(linenum))

            if pageview['time']['utc'] is None:
                complaints += 1
                complaints_file.write('Line {}: missing timestamp!\n'.format(linenum))

            if pageview['tech']['ip'] is not None:
                if pageview['anon']['continent'] is None or pageview['anon']['country'] is None:
                    complaints += 1
                    complaints_file.write(
                        'Line {}: Have IP addr ({}), but missing continent and/or country: ({} / {})\n'.format(
                            linenum,
                            pageview['tech']['ip'],
                            pageview['anon']['continent'] or 'None',
                            pageview['anon']['country'] or 'None',
                        )
                    )

    if complaints > 0:
        print("I got {} reasons to be mad at you. ".format(complaints))
    else:
        print("You've done your homework, have a cookie!")
def load_batch_for(batch_id, domain, tally, dry_run, es_client, keen_client):
    data_dir = utils.get_dir_for('transform02')
    batch_filename = script_settings.EVENT_DATA_FILE_TEMPLATE.format(
        domain=domain, batch_id=batch_id)

    events = []
    with open(data_dir + '/' + batch_filename, 'r') as data_file:
        run_id = data_file.readline().rstrip()  # consume the run-id header line
        events = json.loads(data_file.readline())

    if dry_run:
        actions = [{
            '_index': script_settings.ES_INDEX,
            '_type': domain + '-pageviews',
            '_source': event,
        } for event in events]
        stats = es_bulk(
            client=es_client,
            stats_only=True,
            actions=actions,
        )
        tally[domain + '-' + str(batch_id)] = stats
    else:
        keen_client.add_events({'pageviews': events})
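# With stats_only=True, elasticsearch.helpers.bulk() (imported here as
# es_bulk) returns a (successes, errors) tuple rather than per-document
# results, so that pair is what each tally entry holds. A toy sketch against
# a reachable local cluster; the index name is an illustrative assumption:
def _bulk_demo(es_client):
    actions = [{
        '_index': 'sketch-index',
        '_type': 'doc',
        '_source': {'n': i},
    } for i in range(3)]
    successes, errors = es_bulk(client=es_client, stats_only=True, actions=actions)
    return successes, errors  # e.g. (3, 0) on a clean run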
def main(dry_run=True, batch_count=None, force=False):
    """Upload the pageviews to Keen."""
    history_run_id = utils.get_history_run_id_for('transform02')
    complaints_run_id = utils.get_complaints_run_id_for('transform02')
    if history_run_id != complaints_run_id:
        print("You need to validate your second-phase transformed data! Bailing...")
        sys.exit()

    extract_complaints = utils.get_complaints_for('transform02', 'r')
    extract_complaints.readline()  # toss header
    if extract_complaints.readline():
        print("You have unaddressed complaints in your second-phase transform!")
        if not force:
            print(" ...pass --force to ignore")
            sys.exit()

    history_file = utils.get_history_for('load', 'a')
    history_file.write(script_settings.RUN_HEADER + '{}\n'.format(complaints_run_id))
    history_file.write('Beginning upload at: {}Z\n'.format(datetime.utcnow()))

    keen_clients = {'public': None, 'private': None}
    es_client = None
    if dry_run:
        print("Doing dry-run upload to Elasticsearch. Pass --for-reals to upload to Keen")
        es_client = Elasticsearch()
        try:
            es_client.indices.delete(script_settings.ES_INDEX)
        except Exception as exc:
            print(exc)
    else:
        keen_clients = {
            'public': KeenClient(
                project_id=settings.KEEN['public']['project_id'],
                write_key=settings.KEEN['public']['write_key'],
            ),
            'private': KeenClient(
                project_id=settings.KEEN['private']['project_id'],
                write_key=settings.KEEN['private']['write_key'],
            ),
        }

    tally = {}
    seen = {}
    try:
        with open(utils.get_dir_for('load') + '/resume.log', 'r') as resume_log:
            for seen_file in resume_log.readlines():
                seen[seen_file.strip('\n')] = 1
    except IOError:  # no resume.log yet; nothing has been uploaded
        pass

    batch_count = utils.get_batch_count() if batch_count is None else batch_count
    print("Beginning Upload")
    with open(utils.get_dir_for('load') + '/resume.log', 'a', 0) as resume_log:  # 0 => unbuffered
        for batch_id in range(1, batch_count + 1):
            print(" Batch {}".format(batch_id))
            for domain in ('private', 'public'):
                print("   Domain: {}".format(domain))
                file_id = '{}-{}'.format(domain, batch_id)
                if file_id in seen:
                    print("   ...seen, skipping.\n")
                    continue

                history_file.write('Uploading for {} project, batch {}'.format(domain, batch_id))
                load_batch_for(batch_id, domain, tally, dry_run, es_client, keen_clients[domain])
                resume_log.write('{}\n'.format(file_id))
                history_file.write(' ...finished\n')

    print("Finished Upload")
    history_file.write('Finished upload at: {}Z\n'.format(datetime.utcnow()))
    history_file.write('Tally was:\n')
    for k, v in sorted(tally.items()):
        history_file.write(' {}: {}\n'.format(k, v))
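# The entry point isn't shown in this section, so this argparse wrapper is a
# hypothetical sketch matching the flags the messages above mention
# (--force, --for-reals); treat names and defaults as assumptions.
if __name__ == '__main__':
    import argparse
    parser = argparse.ArgumentParser(description='Upload pageview batches to Keen (or dry-run to ES)')
    parser.add_argument('--for-reals', action='store_true', help='upload to Keen instead of dry-running against ES')
    parser.add_argument('--force', action='store_true', help='ignore unaddressed complaints')
    parser.add_argument('--batch-count', type=int, default=None, help='override the recorded batch count')
    args = parser.parse_args()
    main(dry_run=not args.for_reals, batch_count=args.batch_count, force=args.force)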
def main(): """This script extracts pageview data from the OSF Piwik db and outputs the results to a dumpfile. """ try: mysql_db = MySQLdb.connect(host=settings.PIWIK_DB_HOST, port=settings.PIWIK_DB_PORT, user=settings.PIWIK_DB_USER, passwd=settings.PIWIK_DB_PASSWORD, db=settings.PIWIK_DB_NAME) except MySQLdb.Error as err: print "MySQL Error [%d]: %s" % (err.args[0], err.args[1]) raise err my_cursor = mysql_db.cursor(MySQLdb.cursors.DictCursor) my_cursor.execute("SET NAMES 'utf8'") my_cursor.execute( "SET sql_mode = 'ERROR_FOR_DIVISION_BY_ZERO,NO_AUTO_CREATE_USER,NO_AUTO_VALUE_ON_ZERO," "NO_ZERO_DATE,NO_ZERO_IN_DATE';") history_file = utils.get_history_for('extract', 'w') output_file = open( utils.get_dir_for('extract') + '/' + settings.EXTRACT_FILE, 'w') count = 0 last_count = 0 history_file.write(settings.RUN_HEADER + '{}\n'.format(uuid.uuid4())) history_file.write('Beginning extraction at: {}Z\n'.format( datetime.utcnow())) visit_cursor = get_visits(mysql_db) visit = visit_cursor.fetchone() while visit is not None: visit['tz_offset'] = calculate_tz_offset( str(visit['visitor_localtime']), str(visit['first_contact_server_time'])) action_cursor = get_actions_for_visit(mysql_db, visit['idvisit']) action = action_cursor.fetchone() action_count = 0 while action is not None: action_count += 1 referrer_url = None if action_count == 1 and visit['referer_type'] in ( 2, 3, ): referrer_url = visit['referer_url'] elif action['previous_url']: url_scheme = '' if action[ 'previous_url_prefix'] is None else URL_PREFIX[ action['previous_url_prefix']] referrer_url = url_scheme + action['previous_url'] # piwik stores searches weird. if action['page_title_type'] and action['page_title_type'] == 8: action['page_url'] = 'staging.osf.io/search/?q=' + action[ 'page_title'] action['page_url_prefix'] = 2 action['page_title'] = 'OSF | Search' pageview = { 'visit': { 'id': visit['idvisit'], 'visitor_id': b2a_hex(visit['idvisitor']), 'visitor_returning': visit['visitor_returning'], 'ip_addr': None if visit['location_ip'] == NULL_IP else inet_ntoa( visit['location_ip']), 'user_id': visit['user_id'], 'tz_offset': visit['tz_offset'], 'ua': { 'os': visit['config_os'], 'os_version': None, 'browser': { 'version': visit['config_browser_version'], 'name': visit['config_browser_name'], 'cookies': visit['config_cookie'], 'locale': visit['location_browser_lang'], }, 'screen': visit['config_resolution'], 'device': None, }, }, 'action': { 'id': action['visit_action_id'], 'parent_node_id': action['parent_node_id'], 'node_id': action['node_id'], 'node_tags': action['node_tags'], 'page': { 'url': action['page_url'], 'url_prefix': None if action['page_url_prefix'] is None else URL_PREFIX[action['page_url_prefix']], 'url_id': action['page_url_id'], 'url_type': action['page_url_type'], 'title': action['page_title'], 'title_id': action['page_title_id'], 'title_type': action['page_title_type'], 'is_search': True if action['page_title_type'] and action['page_title_type'] == 8 else False, }, 'referrer': referrer_url, 'timestamp': str(action['server_time']), }, } output_file.write(json.dumps(pageview) + '\n') history_file.write('\tLast action written timestamp: ' + str(action['server_time']) + '\n') count += 1 action = action_cursor.fetchone() visit = visit_cursor.fetchone() history_file.write('Finished extraction at: {}Z\n'.format( datetime.utcnow())) history_file.write('Final count was: {}\n'.format(count)) print("Final count is: {}".format(count)) history_file.close() output_file.close() mysql_db.close()
def main(force=False):
    history_run_id = utils.get_history_run_id_for('extract')
    complaints_run_id = utils.get_complaints_run_id_for('extract')
    if history_run_id != complaints_run_id:
        print('You need to validate your exported data! Bailing...')
        sys.exit()

    extract_complaints = utils.get_complaints_for('extract', 'r')
    extract_complaints.readline()  # toss header
    if extract_complaints.readline():
        print('You have unaddressed complaints!')
        if not force:
            print(' ...pass --force to ignore')
            sys.exit()
    extract_complaints.close()

    sqlite_db = sqlite3.connect(settings.SQLITE_PATH)
    sqlite_db.row_factory = sqlite3.Row
    sqlite_setup(sqlite_db)

    transform_dir = utils.get_dir_for('transform01')

    logger.info('Run ID: {}\n'.format(complaints_run_id))
    logger.info('Beginning transformation at: {}Z\n'.format(datetime.utcnow()))

    tally = {'missing_user': 0, 'missing_node': 0}
    lastline = 0
    try:
        with open(utils.get_dir_for('transform01') + '/resume.log', 'r') as fp:
            fp.seek(-32, 2)  # only the tail of the log is needed
            lastline = int(fp.readlines()[-1].strip('\n'))
    except IOError:
        pass

    linenum = 0
    with open(utils.get_dir_for('transform01') + '/resume.log', 'a', 0) as resume_file:  # Pass 0 for unbuffered writing
        with open(transform_dir + '/' + settings.TRANSFORM01_FILE, 'a') as output_file:
            with open(utils.get_dir_for('extract') + '/' + settings.EXTRACT_FILE, 'r') as input_file:
                print('Lastline is: {}\n'.format(lastline))
                for i, pageview_json in enumerate(input_file):
                    linenum = i + 1
                    if linenum <= lastline:
                        if not linenum % 1000:
                            print('Skipping line {} of ***{}***'.format(linenum, lastline))
                        continue

                    if not linenum % 1000:
                        print('Transforming line {}'.format(linenum))

                    raw_pageview = json.loads(pageview_json)
                    visit = raw_pageview['visit']
                    action = raw_pageview['action']

                    # look up location by ip address. piwik strips the last 16 bits,
                    # so this may not be completely accurate, but should be close enough.
                    ip_addr = visit['ip_addr']
                    location = get_location_for_ip_addr(ip_addr, sqlite_db)

                    # a user has many visitor ids; a visitor id has many session ids.
                    # in keen, a visitor id refreshes once per year, a session once per 30min.
                    visitor_id = get_or_create_visitor_id(visit['visitor_id'], sqlite_db)
                    session_id = get_or_create_session_id(visit['id'], sqlite_db)

                    user_id = visit['user_id']
                    user = get_or_create_user(user_id, sqlite_db)

                    node_id = action['node_id']
                    node = get_or_create_node(node_id, sqlite_db)

                    browser_version = [None, None]
                    if visit['ua']['browser']['version']:
                        browser_version = visit['ua']['browser']['version'].split('.')

                    os_version = [None, None]
                    if visit['ua']['os_version']:
                        os_version = visit['ua']['os_version'].split('.')
                        if len(os_version) == 1:
                            os_version.append(None)

                    os_family = parse_os_family(visit['ua']['os'])
                    if visit['ua']['os'] == 'WIN' and visit['ua']['os_version']:
                        os_family = os_family.replace('<Unknown Version>', visit['ua']['os_version'])

                    browser_info = {
                        'device': {
                            'family': visit['ua']['device'],
                        },
                        'os': {
                            'major': os_version[0],
                            'patch_minor': None,
                            'minor': os_version[1],
                            'family': os_family,
                            'patch': None,
                        },
                        'browser': {
                            'major': browser_version[0],
                            'minor': browser_version[1],
                            'family': parse_browser_family(visit['ua']['browser']['name']),
                            'patch': None,
                        },
                    }

                    # normalize e.g. 'en-us' to 'en-US'; default to None when the
                    # locale is absent or has no region part (the original left
                    # this unset, which raised a NameError downstream)
                    browser_language = None
                    browser_locale = visit['ua']['browser']['locale']
                    if browser_locale and '-' in browser_locale:
                        locale_parts = browser_locale.split('-')
                        browser_language = '-'.join([locale_parts[0], locale_parts[1].upper()])

                    node_tags = None if action['node_tags'] is None else action['node_tags'].split(',')

                    # piwik stores resolution as '1900x600' mostly, but sometimes as a float?
                    # For the sake of my sanity and yours, let's ignore floats.
                    screen_resolution = (None, None)
                    if visit['ua']['screen'] and 'x' in visit['ua']['screen']:
                        screen_resolution = visit['ua']['screen'].split('x')

                    # piwik fmt: '2016-05-11 20:30:00'; keen fmt: '2016-06-30T17:12:50.070Z'.
                    # piwik is always utc.
                    utc_timestamp = datetime.strptime(action['timestamp'], '%Y-%m-%d %H:%M:%S')
                    utc_ts_formatted = utc_timestamp.isoformat() + '.000Z'

                    # naive, but correct
                    local_timedelta = timedelta(minutes=visit['tz_offset'])
                    local_timestamp = utc_timestamp + local_timedelta

                    pageview = {
                        'meta': {
                            'epoch': 0,  # migrated from piwik
                        },
                        'page': {
                            'title': action['page']['title'],
                            # treat a missing prefix as '' so a None prefix can't raise a TypeError
                            'url': (action['page']['url_prefix'] or '') + action['page']['url']
                                   if action['page']['url'] is not None else None,
                            'info': {},  # (add-on)
                        },
                        'referrer': {
                            'url': action['referrer'] or None,
                            'info': {},  # (add-on)
                        },
                        'tech': {
                            'browser': {  # JS-side would be filled in by Keen.helpers.getBrowserProfile()
                                'cookies': True if visit['ua']['browser']['cookies'] else False,
                                'language': browser_language,
                                'screen': {
                                    'height': screen_resolution[1],
                                    'width': screen_resolution[0],
                                },
                            },
                            'ip': ip_addr,  # private
                            'ua': None,
                            'info': browser_info,
                        },
                        'time': {
                            'utc': timestamp_components(utc_timestamp),
                            'local': timestamp_components(local_timestamp),
                        },
                        'visitor': {
                            'id': visitor_id,
                            'session': session_id,
                            'returning': True if visit['visitor_returning'] else False,  # visit
                        },
                        'user': {
                            'id': user_id,
                            'entry_point': '' if user is None else user['entry_point'],  # empty string if no user
                            'locale': '' if user is None else user['locale'],  # empty string if no user
                            'timezone': '' if user is None else user['timezone'],  # empty string if no user
                            'institutions': None if user is None else user['institutions'],  # null if no user, else []
                        },
                        'node': {
                            'id': node_id,
                            'title': None if node is None else node['title'],
                            'type': None if node is None else node['category'],
                            'tags': node_tags,
                            'made_public_date': None if node is None else node['made_public_date'],
                        },
                        'geo': {},
                        'anon': {
                            'id': md5(session_id).hexdigest(),
                            'continent': None if location is None else location['continent'],
                            'country': None if location is None else location['country'],
                        },
                        'keen': {
                            'timestamp': utc_ts_formatted,
                            'addons': [
                                {
                                    'name': 'keen:referrer_parser',
                                    'input': {
                                        'referrer_url': 'referrer.url',
                                        'page_url': 'page.url',
                                    },
                                    'output': 'referrer.info',
                                },
                                {
                                    'name': 'keen:url_parser',
                                    'input': {
                                        'url': 'page.url',
                                    },
                                    'output': 'page.info',
                                },
                                {
                                    'name': 'keen:url_parser',
                                    'input': {
                                        'url': 'referrer.url',
                                    },
                                    'output': 'referrer.info',
                                },
                                {   # private
                                    'name': 'keen:ip_to_geo',
                                    'input': {
                                        'ip': 'tech.ip',
                                    },
                                    'output': 'geo',
                                },
                            ],
                        },
                    }

                    if node_id is None:
                        tally['missing_node'] += 1

                    if user_id is None:
                        tally['missing_user'] += 1

                    output_file.write(json.dumps(pageview) + '\n')
                    resume_file.write(str(linenum) + '\n')

    logger.info('Finished transformation at: {}Z\n'.format(datetime.utcnow()))
    logger.info('Final count was: {}\n'.format(linenum))
    logger.info('{} pageviews lacked a user id.\n'.format(tally['missing_user']))
    logger.info('{} pageviews lacked a node id.\n'.format(tally['missing_node']))

    sqlite_db.close()
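# The get_or_create_* helpers above share one pattern: look up a stable
# replacement id in sqlite, minting one on first sight so raw piwik ids never
# reach Keen. A minimal sketch; the table name, columns, and use of uuid4 are
# assumptions, not necessarily the repo's schema (assumes `import uuid`):
#   CREATE TABLE visitor_ids (piwik_id TEXT PRIMARY KEY, keen_id TEXT)
def _get_or_create_visitor_id_sketch(piwik_id, sqlite_db):
    row = sqlite_db.execute(
        'SELECT keen_id FROM visitor_ids WHERE piwik_id = ?', (piwik_id,)
    ).fetchone()
    if row is not None:
        return row['keen_id']  # row_factory is sqlite3.Row, so name access works
    keen_id = str(uuid.uuid4())
    sqlite_db.execute(
        'INSERT INTO visitor_ids (piwik_id, keen_id) VALUES (?, ?)',
        (piwik_id, keen_id),
    )
    sqlite_db.commit()
    return keen_id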
def main(force=False):
    history_run_id = utils.get_history_run_id_for('transform01')
    complaints_run_id = utils.get_complaints_run_id_for('transform01')
    if history_run_id != complaints_run_id:
        print("You need to validate your first-phase transformed data! Bailing...")
        sys.exit()

    extract_complaints = utils.get_complaints_for('transform01', 'r')
    extract_complaints.readline()  # toss header
    if extract_complaints.readline():
        print("You have unaddressed complaints in your first-phase transform!")
        if not force:
            print(" ...pass --force to ignore")
            sys.exit()

    history_file = utils.get_history_for('transform02', 'w')
    history_file.write('Run ID: {}\n'.format(complaints_run_id))
    history_file.write('Beginning transformation at: {}Z\n'.format(datetime.utcnow()))

    transform_dir = utils.get_dir_for('transform02')
    public_template = transform_dir + '/public-{0:04d}.data'
    private_template = transform_dir + '/private-{0:04d}.data'

    lastline = 0
    try:
        with open(utils.get_dir_for('transform02') + '/resume.log', 'r') as fp:
            fp.seek(-32, 2)  # only the tail of the log is needed
            lastline = int(fp.readlines()[-1].strip('\n'))
    except IOError:
        pass

    linenum = 0
    batchnum = 0
    public_pageviews = []
    private_pageviews = []

    with open(transform_dir + '/resume.log', 'a', 0) as resume_file:  # Pass 0 for unbuffered writing
        with open(utils.get_dir_for('transform01') + '/' + settings.TRANSFORM01_FILE, 'r') as input_file:
            print('Lastline is: {}\n'.format(lastline))
            for i, pageview_json in enumerate(input_file):
                linenum = i + 1
                if linenum <= lastline:
                    if not linenum % 1000:
                        print('Skipping line {} of ***{}***'.format(linenum, lastline))
                    continue

                if not linenum % 1000:
                    print('Batching line {}'.format(linenum))

                pageview = json.loads(pageview_json)

                made_public_date = pageview['node']['made_public_date']
                del pageview['node']['made_public_date']

                private_pageviews.append(pageview)

                # only pageviews logged after the most recent made-public date are
                # copied to the public collection
                if made_public_date is not None and made_public_date < pageview['keen']['timestamp']:
                    public_pageview = copy.deepcopy(pageview)

                    for private_property in ('tech', 'user', 'visitor', 'geo'):
                        del public_pageview[private_property]

                    # rebuild the list instead of removing while iterating, which
                    # would skip the element after each removal
                    public_pageview['keen']['addons'] = [
                        addon for addon in public_pageview['keen']['addons']
                        if addon['name'] not in ('keen:ip_to_geo', 'keen:ua_parser')
                    ]

                    public_pageviews.append(public_pageview)

                if linenum % settings.BATCH_SIZE == 0:
                    batchnum += 1
                    # write_batch() is expected to empty the list it is handed
                    write_batch(batchnum, complaints_run_id, 'public', public_pageviews, transform_dir)
                    write_batch(batchnum, complaints_run_id, 'private', private_pageviews, transform_dir)
                    resume_file.write(str(linenum) + '\n')  # mark this batch as safely flushed

    if linenum % settings.BATCH_SIZE != 0:
        batchnum += 1
        write_batch(batchnum, complaints_run_id, 'public', public_pageviews, transform_dir)
        write_batch(batchnum, complaints_run_id, 'private', private_pageviews, transform_dir)

    history_file.write(settings.BATCH_HEADER + '{}\n'.format(batchnum))
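# write_batch() isn't shown in this section; this sketch reflects what the
# rest of the pipeline implies it must do. The header line and one-array-per-
# file layout are inferred from verify_files() and load_batch_for(), and the
# list-emptying is inferred from the caller reusing its lists, so treat the
# details as assumptions:
def _write_batch_sketch(batchnum, run_id, domain, pageviews, transform_dir):
    filename = transform_dir + '/{}-{:04d}.data'.format(domain, batchnum)
    with open(filename, 'w') as f:
        f.write(settings.RUN_HEADER + '{}\n'.format(run_id))  # checked by verify_files()
        f.write(json.dumps(pageviews) + '\n')                 # one JSON array per file
    del pageviews[:]  # the caller keeps appending to the same list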