def main():
    """Verify the output of the batching/anonymization script.

    Asserts:

    * Expected number of batch files exist for both public and private
      collections.
    * No extra batch files exist for both public and private collections.
    * All of the batch files are part of the current run.
    * Number of events is consistent between public and private, and matches
      up with upstream counts.
    * No sensitive fields exist in public collections.

    The checks themselves are delegated to ``verify_files``, which is run once
    for the private domain and once for the public domain. Its return values
    are summed as the complaint count and a summary is printed to stdout.
    """
    run_id = utils.get_history_run_id_for('transform02')
    complaints_file = utils.get_complaints_for('transform02', 'w')
    try:
        # The first line of the complaints file identifies the run under test.
        complaints_file.write(settings.RUN_HEADER + '{}\n'.format(run_id))

        batch_count = utils.get_batch_count()

        complaints = 0

        print('Validating private data')
        complaints += verify_files('private', batch_count, run_id, complaints_file)

        print('Validating public data')
        complaints += verify_files('public', batch_count, run_id, complaints_file)
    finally:
        # Close explicitly so everything written is flushed to disk even when
        # a verification step raises.
        complaints_file.close()

    if complaints > 0:
        print("This is {}.\n\nThat's {} {}!".format(
            ', '.join(['gross'] * complaints),
            complaints,
            'whole "gross"' if complaints == 1 else '"grosses"'
        ))
    else:
        print("You've passed the final challenge! Huzzah, brave warrior!")
def main(dry_run=True, batch_count=None, force=False):
    """Upload the pageviews to Keen.

    Exits early (via ``sys.exit()``) when the transform02 history run id and
    complaints run id differ, or when the transform02 complaints file has any
    line after its header and ``force`` is false.

    Progress is appended to ``resume.log`` in the load directory, one
    ``<domain>-<batch_id>`` entry per completed batch. Entries already present
    in that file are skipped, so an interrupted upload can be re-run.

    :param dry_run: when true, an Elasticsearch client is created (and the
        ``script_settings.ES_INDEX`` index is deleted, best-effort) instead of
        creating Keen clients.
    :param batch_count: number of batches to upload; defaults to
        ``utils.get_batch_count()`` when ``None``.
    :param force: continue even if the complaints file lists complaints.
    """
    history_run_id = utils.get_history_run_id_for('transform02')
    complaints_run_id = utils.get_complaints_run_id_for('transform02')
    if history_run_id != complaints_run_id:
        print("You need to validate your first-phase transformed data! Bailing...")
        sys.exit()

    extract_complaints = utils.get_complaints_for('transform02', 'r')
    try:
        extract_complaints.readline()  # toss header
        # Any line after the header is treated as an outstanding complaint.
        has_complaints = bool(extract_complaints.readline())
    finally:
        extract_complaints.close()

    if has_complaints:
        print("You have unaddressed complaints in your second-phase transform!")
        if not force:
            print(" ...pass --force to ignore")
            sys.exit()

    history_file = utils.get_history_for('load', 'a')
    try:
        history_file.write(script_settings.RUN_HEADER + '{}\n'.format(complaints_run_id))
        history_file.write('Beginning upload at: {}Z\n'.format(datetime.utcnow()))

        keen_clients = {'public': None, 'private': None}
        es_client = None
        if dry_run:
            print("Doing dry-run upload to Elastic search. Pass --for-reals to upload to Keen")
            es_client = Elasticsearch()
            try:
                es_client.indices.delete(script_settings.ES_INDEX)
            except Exception as exc:
                # Best-effort cleanup: report the failure and carry on.
                print(exc)
        else:
            keen_clients = {
                'public': KeenClient(
                    project_id=settings.KEEN['public']['project_id'],
                    write_key=settings.KEEN['public']['write_key'],
                ),
                'private': KeenClient(
                    project_id=settings.KEEN['private']['project_id'],
                    write_key=settings.KEEN['private']['write_key'],
                ),
            }

        # Passed to every load_batch_for call and written out at the end;
        # presumably load_batch_for fills it in -- confirm against its definition.
        tally = {}

        resume_path = utils.get_dir_for('load') + '/resume.log'

        # File ids recorded by a previous (interrupted) run.
        seen = set()
        try:
            with open(resume_path, 'r') as resume_log:
                for seen_file in resume_log:
                    seen.add(seen_file.strip('\n'))
        except IOError:
            # An unreadable or missing resume log means nothing to skip.
            pass

        batch_count = utils.get_batch_count() if batch_count is None else batch_count
        print("Beginning Upload")
        # Line-buffered so each completed file id is flushed as soon as it is
        # written; a buffering of 0 is rejected for text files on Python 3.
        with open(resume_path, 'a', 1) as resume_log:
            for batch_id in range(1, batch_count + 1):
                print(" Batch {}".format(batch_id))
                for domain in ('private', 'public'):
                    print(" Domain: {}".format(domain))
                    file_id = '{}-{}'.format(domain, batch_id)
                    if file_id in seen:
                        print(" ...seen, skipping.\n")
                        continue
                    history_file.write('Uploading for {} project, batch {}'.format(domain, batch_id))
                    load_batch_for(batch_id, domain, tally, dry_run, es_client, keen_clients[domain])
                    # Record the file only after its upload call has returned.
                    resume_log.write('{}\n'.format(file_id))
                    history_file.write(' ...finished\n')

        print("Finished Upload")
        history_file.write('Finished upload at: {}Z\n'.format(datetime.utcnow()))
        history_file.write('Tally was:\n')
        for k, v in sorted(tally.items()):
            history_file.write(' {}: {}\n'.format(k, v))
    finally:
        history_file.close()
def main(dry_run=True, batch_count=None, force=False):
    """Upload the pageviews to Keen.

    Exits early (via ``sys.exit()``) when the transform02 history run id and
    complaints run id differ, or when the transform02 complaints file has any
    line after its header and ``force`` is false.

    Progress is appended to ``resume.log`` in the load directory, one
    ``<domain>-<batch_id>`` entry per completed batch. Entries already present
    in that file are skipped, so an interrupted upload can be re-run.

    :param dry_run: when true, an Elasticsearch client is created (and the
        ``script_settings.ES_INDEX`` index is deleted, best-effort) instead of
        creating Keen clients.
    :param batch_count: number of batches to upload; defaults to
        ``utils.get_batch_count()`` when ``None``.
    :param force: continue even if the complaints file lists complaints.
    """
    history_run_id = utils.get_history_run_id_for('transform02')
    complaints_run_id = utils.get_complaints_run_id_for('transform02')
    if history_run_id != complaints_run_id:
        print(
            "You need to validate your first-phase transformed data! Bailing..."
        )
        sys.exit()

    extract_complaints = utils.get_complaints_for('transform02', 'r')
    try:
        extract_complaints.readline()  # toss header
        # Any line after the header is treated as an outstanding complaint.
        has_complaints = bool(extract_complaints.readline())
    finally:
        extract_complaints.close()

    if has_complaints:
        print(
            "You have unaddressed complaints in your second-phase transform!")
        if not force:
            print(" ...pass --force to ignore")
            sys.exit()

    history_file = utils.get_history_for('load', 'a')
    try:
        history_file.write(script_settings.RUN_HEADER +
                           '{}\n'.format(complaints_run_id))
        history_file.write('Beginning upload at: {}Z\n'.format(
            datetime.utcnow()))

        keen_clients = {'public': None, 'private': None}
        es_client = None
        if dry_run:
            print(
                "Doing dry-run upload to Elastic search. Pass --for-reals to upload to Keen"
            )
            es_client = Elasticsearch()
            try:
                es_client.indices.delete(script_settings.ES_INDEX)
            except Exception as exc:
                # Best-effort cleanup: report the failure and carry on.
                print(exc)
        else:
            keen_clients = {
                'public': KeenClient(
                    project_id=settings.KEEN['public']['project_id'],
                    write_key=settings.KEEN['public']['write_key'],
                ),
                'private': KeenClient(
                    project_id=settings.KEEN['private']['project_id'],
                    write_key=settings.KEEN['private']['write_key'],
                ),
            }

        # Passed to every load_batch_for call and written out at the end;
        # presumably load_batch_for fills it in -- confirm against its definition.
        tally = {}

        resume_path = utils.get_dir_for('load') + '/resume.log'

        # File ids recorded by a previous (interrupted) run.
        seen = set()
        try:
            with open(resume_path, 'r') as resume_log:
                for seen_file in resume_log:
                    seen.add(seen_file.strip('\n'))
        except IOError:
            # An unreadable or missing resume log means nothing to skip.
            pass

        batch_count = (utils.get_batch_count()
                       if batch_count is None else batch_count)
        print("Beginning Upload")
        # Line-buffered so each completed file id is flushed as soon as it is
        # written; a buffering of 0 is rejected for text files on Python 3.
        with open(resume_path, 'a', 1) as resume_log:
            for batch_id in range(1, batch_count + 1):
                print(" Batch {}".format(batch_id))
                for domain in ('private', 'public'):
                    print(" Domain: {}".format(domain))
                    file_id = '{}-{}'.format(domain, batch_id)
                    if file_id in seen:
                        print(" ...seen, skipping.\n")
                        continue
                    history_file.write(
                        'Uploading for {} project, batch {}'.format(
                            domain, batch_id))
                    load_batch_for(batch_id, domain, tally, dry_run,
                                   es_client, keen_clients[domain])
                    # Record the file only after its upload call has returned.
                    resume_log.write('{}\n'.format(file_id))
                    history_file.write(' ...finished\n')

        print("Finished Upload")
        history_file.write('Finished upload at: {}Z\n'.format(
            datetime.utcnow()))
        history_file.write('Tally was:\n')
        for k, v in sorted(tally.items()):
            history_file.write(' {}: {}\n'.format(k, v))
    finally:
        history_file.close()