예제 #1
0
def main():
    """Verify the output of the batching/anonymization script.

    Asserts:

    * Expected number of batch files exist for both public and private collections.

    * No extra batch files exist for both public and private collections.

    * All of the batch files are part of the current run.

    * Number of events is consistent between public and private, and matches up with upstream counts.

    * No sensitive fields exist in public collections.

    The checks themselves are delegated to ``verify_files``.  This function
    opens the 'transform02' complaints file in 'w' mode, writes the run
    header to it, adds up the values ``verify_files`` returns for the
    'private' and 'public' domains, and prints a summary.
    """

    run_id = utils.get_history_run_id_for('transform02')
    complaints_file = utils.get_complaints_for('transform02', 'w')
    try:
        complaints_file.write(settings.RUN_HEADER + '{}\n'.format(run_id))

        batch_count = utils.get_batch_count()

        complaints = 0
        print('Validating private data')
        complaints += verify_files('private', batch_count, run_id, complaints_file)
        print('Validating public data')
        complaints += verify_files('public', batch_count, run_id, complaints_file)
    finally:
        # Always release the handle so whatever was written reaches disk,
        # even when a verification step raises.
        complaints_file.close()

    if complaints > 0:
        print("This is {}.\n\nThat's {} {}!".format(
            ', '.join(['gross'] * complaints), complaints, 'whole "gross"' if complaints == 1 else '"grosses"'
        ))
    else:
        print("You've passed the final challenge! Huzzah, brave warrior!")
예제 #2
0
파일: load.py 프로젝트: atelic/osf.io
def main(dry_run=True, batch_count=None, force=False):
    """Upload the pageviews to Keen.

    Bails out unless the 'transform02' history run id matches the
    'transform02' complaints run id and, unless ``force`` is set, the
    complaints file holds nothing beyond its header line.  Every uploaded
    batch/domain pair is appended to ``resume.log`` in the load directory;
    pairs already listed there are skipped, so an interrupted upload can be
    re-run.

    :param dry_run: when true, create an Elasticsearch client (after a
        best-effort delete of ``script_settings.ES_INDEX``) and hand it to
        ``load_batch_for``; otherwise create the public/private Keen clients.
    :param batch_count: number of batches to upload; ``None`` means use
        ``utils.get_batch_count()``.
    :param force: keep going even if the complaints file lists complaints.
    """

    history_run_id = utils.get_history_run_id_for('transform02')
    complaints_run_id = utils.get_complaints_run_id_for('transform02')
    if history_run_id != complaints_run_id:
        # NOTE(review): message says "first-phase" although the check is on
        # 'transform02' -- confirm the intended wording.
        print("You need to validate your first-phase transformed data! Bailing...")
        # NOTE(review): sys.exit() without an argument reports success (status 0)
        # to the calling shell -- confirm that is intended for an abort.
        sys.exit()

    extract_complaints = utils.get_complaints_for('transform02', 'r')
    try:
        extract_complaints.readline()  # toss header
        has_complaints = bool(extract_complaints.readline())
    finally:
        extract_complaints.close()

    if has_complaints:
        print("You have unaddressed complaints in your second-phase transform!")
        if not force:
            print("  ...pass --force to ignore")
            sys.exit()

    resume_log_path = utils.get_dir_for('load') + '/resume.log'

    history_file = utils.get_history_for('load', 'a')
    try:
        history_file.write(script_settings.RUN_HEADER + '{}\n'.format(complaints_run_id))
        history_file.write('Beginning upload at: {}Z\n'.format(datetime.utcnow()))

        keen_clients = {'public': None, 'private': None}
        es_client = None
        if dry_run:
            print("Doing dry-run upload to Elastic search.  Pass --for-reals to upload to Keen")
            es_client = Elasticsearch()
            try:
                es_client.indices.delete(script_settings.ES_INDEX)
            except Exception as exc:
                # Best effort: report the failure and carry on with the upload.
                print(exc)
        else:
            keen_clients = {
                'public': KeenClient(
                    project_id=settings.KEEN['public']['project_id'],
                    write_key=settings.KEEN['public']['write_key'],
                ),
                'private': KeenClient(
                    project_id=settings.KEEN['private']['project_id'],
                    write_key=settings.KEEN['private']['write_key'],
                ),
            }

        # Passed to load_batch_for on every call; presumably it accumulates
        # per-run counters here (verify against load_batch_for).
        tally = {}

        # File ids ("<domain>-<batch_id>") recorded by a previous run.
        seen = set()
        try:
            with open(resume_log_path, 'r') as resume_log:
                for seen_file in resume_log:
                    seen.add(seen_file.strip('\n'))
        except IOError:
            # The resume log could not be opened or read: nothing to skip.
            pass

        batch_count = utils.get_batch_count() if batch_count is None else batch_count
        print("Beginning Upload")
        # Line-buffered rather than unbuffered: buffering=0 is rejected for
        # text-mode files on Python 3.  The explicit flush below keeps each
        # finished pair on disk before the next upload starts.
        with open(resume_log_path, 'a', 1) as resume_log:
            for batch_id in range(1, batch_count + 1):
                print("  Batch {}".format(batch_id))
                for domain in ('private', 'public'):
                    print("    Domain: {}".format(domain))
                    file_id = '{}-{}'.format(domain, batch_id)
                    if file_id in seen:
                        print("  ...seen, skipping.\n")
                        continue

                    # No trailing newline: '  ...finished' completes this line.
                    history_file.write('Uploading for {} project, batch {}'.format(domain, batch_id))
                    load_batch_for(batch_id, domain, tally, dry_run, es_client, keen_clients[domain])
                    resume_log.write('{}\n'.format(file_id))
                    resume_log.flush()
                    history_file.write('  ...finished\n')

        print("Finished Upload")
        history_file.write('Finished upload at: {}Z\n'.format(datetime.utcnow()))
        history_file.write('Tally was:\n')
        for k, v in sorted(tally.items()):
            history_file.write('  {}: {}\n'.format(k, v))
    finally:
        # Close the history file even if an upload step raises.
        history_file.close()
예제 #3
0
파일: load.py 프로젝트: j-p-courneya/osf.io
def main(dry_run=True, batch_count=None, force=False):
    """Upload the pageviews to Keen.

    Bails out unless the 'transform02' history run id matches the
    'transform02' complaints run id and, unless ``force`` is set, the
    complaints file holds nothing beyond its header line.  Every uploaded
    batch/domain pair is appended to ``resume.log`` in the load directory;
    pairs already listed there are skipped, so an interrupted upload can be
    re-run.

    :param dry_run: when true, create an Elasticsearch client (after a
        best-effort delete of ``script_settings.ES_INDEX``) and hand it to
        ``load_batch_for``; otherwise create the public/private Keen clients.
    :param batch_count: number of batches to upload; ``None`` means use
        ``utils.get_batch_count()``.
    :param force: keep going even if the complaints file lists complaints.
    """

    history_run_id = utils.get_history_run_id_for('transform02')
    complaints_run_id = utils.get_complaints_run_id_for('transform02')
    if history_run_id != complaints_run_id:
        # NOTE(review): message says "first-phase" although the check is on
        # 'transform02' -- confirm the intended wording.
        print(
            "You need to validate your first-phase transformed data! Bailing..."
        )
        # NOTE(review): sys.exit() without an argument reports success (status 0)
        # to the calling shell -- confirm that is intended for an abort.
        sys.exit()

    extract_complaints = utils.get_complaints_for('transform02', 'r')
    try:
        extract_complaints.readline()  # toss header
        has_complaints = bool(extract_complaints.readline())
    finally:
        extract_complaints.close()

    if has_complaints:
        print(
            "You have unaddressed complaints in your second-phase transform!")
        if not force:
            print("  ...pass --force to ignore")
            sys.exit()

    resume_log_path = utils.get_dir_for('load') + '/resume.log'

    history_file = utils.get_history_for('load', 'a')
    try:
        history_file.write(script_settings.RUN_HEADER +
                           '{}\n'.format(complaints_run_id))
        history_file.write(
            'Beginning upload at: {}Z\n'.format(datetime.utcnow()))

        keen_clients = {'public': None, 'private': None}
        es_client = None
        if dry_run:
            print(
                "Doing dry-run upload to Elastic search.  Pass --for-reals to upload to Keen"
            )
            es_client = Elasticsearch()
            try:
                es_client.indices.delete(script_settings.ES_INDEX)
            except Exception as exc:
                # Best effort: report the failure and carry on with the upload.
                print(exc)
        else:
            keen_clients = {
                'public':
                KeenClient(
                    project_id=settings.KEEN['public']['project_id'],
                    write_key=settings.KEEN['public']['write_key'],
                ),
                'private':
                KeenClient(
                    project_id=settings.KEEN['private']['project_id'],
                    write_key=settings.KEEN['private']['write_key'],
                ),
            }

        # Passed to load_batch_for on every call; presumably it accumulates
        # per-run counters here (verify against load_batch_for).
        tally = {}

        # File ids ("<domain>-<batch_id>") recorded by a previous run.
        seen = set()
        try:
            with open(resume_log_path, 'r') as resume_log:
                for seen_file in resume_log:
                    seen.add(seen_file.strip('\n'))
        except IOError:
            # The resume log could not be opened or read: nothing to skip.
            pass

        if batch_count is None:
            batch_count = utils.get_batch_count()
        print("Beginning Upload")
        # Line-buffered rather than unbuffered: buffering=0 is rejected for
        # text-mode files on Python 3.  The explicit flush below keeps each
        # finished pair on disk before the next upload starts.
        with open(resume_log_path, 'a', 1) as resume_log:
            for batch_id in range(1, batch_count + 1):
                print("  Batch {}".format(batch_id))
                for domain in ('private', 'public'):
                    print("    Domain: {}".format(domain))
                    file_id = '{}-{}'.format(domain, batch_id)
                    if file_id in seen:
                        print("  ...seen, skipping.\n")
                        continue

                    # No trailing newline: '  ...finished' completes this line.
                    history_file.write(
                        'Uploading for {} project, batch {}'.format(
                            domain, batch_id))
                    load_batch_for(batch_id, domain, tally, dry_run,
                                   es_client, keen_clients[domain])
                    resume_log.write('{}\n'.format(file_id))
                    resume_log.flush()
                    history_file.write('  ...finished\n')

        print("Finished Upload")
        history_file.write(
            'Finished upload at: {}Z\n'.format(datetime.utcnow()))
        history_file.write('Tally was:\n')
        for k, v in sorted(tally.items()):
            history_file.write('  {}: {}\n'.format(k, v))
    finally:
        # Close the history file even if an upload step raises.
        history_file.close()