Example #1
def _handle_hash_validation(upload_config, hash_function, hash_value, path):
    if upload_config.validate_hashes:
        calculated_hash_value = memory_bound_hexdigest(
            hash_func_name=hash_function, path=path)
        if calculated_hash_value != hash_value:
            raise Exception(
                f"Failed to validate upload with [{hash_function}] - expected [{hash_value}] got [{calculated_hash_value}]"
            )
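
A minimal, hypothetical usage sketch for the helper above. It assumes the same Galaxy `memory_bound_hexdigest` helper referenced by the snippet is in scope, and it stands in for the real upload config with a `SimpleNamespace` carrying only the `validate_hashes` flag; the file name is illustrative.

from types import SimpleNamespace

# compute the expected digest with the same helper the validator uses
expected = memory_bound_hexdigest(hash_func_name="sha256", path="upload.dat")

# validation enabled and the digests match, so this returns without raising
_handle_hash_validation(SimpleNamespace(validate_hashes=True), "sha256", expected, "upload.dat")

# validation disabled: the hash is never checked
_handle_hash_validation(SimpleNamespace(validate_hashes=False), "sha256", "ignored", "upload.dat")
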
Example #2
def main(argv):
    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument('-r',
                        '--report-directory',
                        help='Directory to store reports in',
                        default=os.path.abspath(os.path.join('.', 'reports')))
    parser.add_argument('-g',
                        '--grt-config',
                        help='Path to GRT config file',
                        default=default_config)
    parser.add_argument(
        "-l",
        "--loglevel",
        choices=['debug', 'info', 'warning', 'error', 'critical'],
        help="Set the logging level",
        default='warning')
    parser.add_argument("-b",
                        "--batch-size",
                        type=int,
                        default=1000,
                        help="Batch size for sql queries")
    parser.add_argument(
        "-m",
        "--max-records",
        type=int,
        default=5000000,
        help="Maximum number of records to include in a single report. This option should ONLY be used when reporting historical data. Setting this may require running GRT multiple times to capture all historical logs."
    )
    populate_config_args(parser)

    args = parser.parse_args()
    logging.getLogger().setLevel(getattr(logging, args.loglevel.upper()))

    _times = []
    _start_time = time.time()

    def annotate(label, human_label=None):
        if human_label:
            logging.info(human_label)
        _times.append((label, time.time() - _start_time))

    annotate('init_start', 'Loading GRT configuration...')
    try:
        with open(args.grt_config) as handle:
            config = yaml.safe_load(handle)
    except Exception:
        logging.info('Using default GRT configuration')
        with open(sample_config) as handle:
            config = yaml.safe_load(handle)
    annotate('init_end')

    REPORT_DIR = args.report_directory
    CHECK_POINT_FILE = os.path.join(REPORT_DIR, '.checkpoint')
    REPORT_IDENTIFIER = str(time.time())
    REPORT_BASE = os.path.join(REPORT_DIR, REPORT_IDENTIFIER)

    if os.path.exists(CHECK_POINT_FILE):
        with open(CHECK_POINT_FILE, 'r') as handle:
            last_job_sent = int(handle.read())
    else:
        last_job_sent = -1

    annotate('galaxy_init', 'Loading Galaxy...')
    model, object_store, gxconfig = _init(args)

    # Galaxy overrides our logging level.
    logging.getLogger().setLevel(getattr(logging, args.loglevel.upper()))
    sa_session = model.context.current
    annotate('galaxy_end')

    # Fetch jobs COMPLETED with status OK that have not yet been sent.

    # Set up our arrays
    active_users = defaultdict(int)
    job_state_data = defaultdict(int)

    if not os.path.exists(REPORT_DIR):
        os.makedirs(REPORT_DIR)

    # Pick an end point so our queries can return uniform data.
    annotate('endpoint_start', 'Identifying a safe endpoint for SQL queries')
    end_job_id = sa_session.query(model.Job.id) \
        .order_by(model.Job.id.desc()) \
        .first()[0]

    # Allow users to only report N records at once.
    if args.max_records > 0:
        if end_job_id - last_job_sent > args.max_records:
            end_job_id = last_job_sent + args.max_records

    annotate('endpoint_end',
             'Processing jobs (%s, %s]' % (last_job_sent, end_job_id))

    # Remember the last job sent.
    if end_job_id == last_job_sent:
        logging.info("No new jobs to report")
        # So we can just quit now.
        sys.exit(0)

    # Unfortunately we have to keep this mapping for the sanitizer to work properly.
    job_tool_map = {}
    blacklisted_tools = config['sanitization']['tools']

    annotate('export_jobs_start', 'Exporting Jobs')
    with io.open(REPORT_BASE + '.jobs.tsv', 'w',
                 encoding='utf-8') as handle_job:
        handle_job.write(u'\t'.join(('id', 'tool_id', 'tool_version', 'state',
                                     'create_time')) + '\n')
        for offset_start in range(last_job_sent, end_job_id, args.batch_size):
            logging.debug("Processing %s:%s", offset_start,
                          min(end_job_id, offset_start + args.batch_size))
            for job in sa_session.query(model.Job.id, model.Job.user_id, model.Job.tool_id, model.Job.tool_version, model.Job.state, model.Job.create_time) \
                    .filter(model.Job.id > offset_start) \
                    .filter(model.Job.id <= min(end_job_id, offset_start + args.batch_size)) \
                    .all():
                # If the tool is blacklisted, exclude everywhere
                if job[2] in blacklisted_tools:
                    continue

                try:
                    line = [
                        str(job[0]),  # id
                        job[2],  # tool_id
                        job[3],  # tool_version
                        job[4],  # state
                        str(job[5])  # create_time
                    ]
                    cline = unicodify('\t'.join(line) + '\n')
                    handle_job.write(cline)
                except Exception:
                    logging.warning(
                        "Unable to write out a 'handle_job' row. Ignoring the row.",
                        exc_info=True)
                    continue
                # meta counts
                job_state_data[job[4]] += 1
                active_users[job[1]] += 1
                job_tool_map[job[0]] = job[2]
    annotate('export_jobs_end')

    annotate('export_datasets_start', 'Exporting Datasets')
    with io.open(REPORT_BASE + '.datasets.tsv', 'w',
                 encoding='utf-8') as handle_datasets:
        handle_datasets.write(u'\t'.join(('job_id', 'dataset_id', 'extension',
                                          'file_size', 'param_name', 'type')) +
                              '\n')
        for offset_start in range(last_job_sent, end_job_id, args.batch_size):
            logging.debug("Processing %s:%s", offset_start,
                          min(end_job_id, offset_start + args.batch_size))

            # four queries: JobToInputDatasetAssociation, JobToOutputDatasetAssociation, HistoryDatasetAssociation, Dataset

            job_to_input_hda_ids = sa_session.query(model.JobToInputDatasetAssociation.job_id, model.JobToInputDatasetAssociation.dataset_id,
                model.JobToInputDatasetAssociation.name) \
                .filter(model.JobToInputDatasetAssociation.job_id > offset_start) \
                .filter(model.JobToInputDatasetAssociation.job_id <= min(end_job_id, offset_start + args.batch_size)) \
                .all()

            job_to_output_hda_ids = sa_session.query(model.JobToOutputDatasetAssociation.job_id, model.JobToOutputDatasetAssociation.dataset_id,
                model.JobToOutputDatasetAssociation.name) \
                .filter(model.JobToOutputDatasetAssociation.job_id > offset_start) \
                .filter(model.JobToOutputDatasetAssociation.job_id <= min(end_job_id, offset_start + args.batch_size)) \
                .all()

            # add type and concat
            job_to_hda_ids = ([[list(i), "input"] for i in job_to_input_hda_ids]
                              + [[list(i), "output"] for i in job_to_output_hda_ids])

            # put all of the hda_ids into a list
            hda_ids = [i[0][1] for i in job_to_hda_ids]

            hdas = sa_session.query(model.HistoryDatasetAssociation.id, model.HistoryDatasetAssociation.dataset_id,
                model.HistoryDatasetAssociation.extension) \
                .filter(model.HistoryDatasetAssociation.id.in_(hda_ids)) \
                .all()

            # put all the dataset ids into a list
            dataset_ids = [i[1] for i in hdas]

            # get the sizes of the datasets
            datasets = sa_session.query(model.Dataset.id, model.Dataset.total_size) \
                .filter(model.Dataset.id.in_(dataset_ids)) \
                .all()

            # convert hdas and datasets to dictionaries for easy lookup
            hdas = {i[0]: i[1:] for i in hdas}
            datasets = {i[0]: i[1:] for i in datasets}

            for job_to_hda in job_to_hda_ids:

                job = job_to_hda[0]  # job_id, hda_id, name
                filetype = job_to_hda[1]  # input|output

                # No associated job
                if job[0] not in job_tool_map:
                    continue

                # If the tool is blacklisted, exclude everywhere
                if job_tool_map[job[0]] in blacklisted_tools:
                    continue

                hda_id = job[1]
                if hda_id is None:
                    continue

                dataset_id = hdas[hda_id][0]
                if dataset_id is None:
                    continue

                try:
                    line = [
                        str(job[0]),  # Job ID
                        str(hda_id),  # HDA ID
                        str(hdas[hda_id][1]),  # Extension
                        round_to_2sd(datasets[dataset_id][0]),  # File size
                        job[2],  # Parameter name
                        str(filetype)  # input/output
                    ]
                    cline = unicodify('\t'.join(line) + '\n')
                    handle_datasets.write(cline)
                except Exception:
                    logging.warning(
                        "Unable to write out a 'handle_datasets' row. Ignoring the row.",
                        exc_info=True)
                    continue
    annotate('export_datasets_end')

    annotate('export_metric_num_start', 'Exporting Metrics (Numeric)')
    with io.open(REPORT_BASE + '.metric_num.tsv', 'w',
                 encoding='utf-8') as handle_metric_num:
        handle_metric_num.write(u'\t'.join(('job_id', 'plugin', 'name',
                                            'value')) + '\n')
        for offset_start in range(last_job_sent, end_job_id, args.batch_size):
            logging.debug("Processing %s:%s", offset_start,
                          min(end_job_id, offset_start + args.batch_size))
            for metric in sa_session.query(model.JobMetricNumeric.job_id, model.JobMetricNumeric.plugin, model.JobMetricNumeric.metric_name, model.JobMetricNumeric.metric_value) \
                    .filter(model.JobMetricNumeric.job_id > offset_start) \
                    .filter(model.JobMetricNumeric.job_id <= min(end_job_id, offset_start + args.batch_size)) \
                    .all():
                # No associated job
                if metric[0] not in job_tool_map:
                    continue
                # If the tool is blacklisted, exclude everywhere
                if job_tool_map[metric[0]] in blacklisted_tools:
                    continue

                try:
                    line = [
                        str(metric[0]),  # job id
                        metric[1],  # plugin
                        metric[2],  # name
                        str(metric[3])  # value
                    ]

                    cline = unicodify('\t'.join(line) + '\n')
                    handle_metric_num.write(cline)
                except Exception:
                    logging.warning(
                        "Unable to write out a 'handle_metric_num' row. Ignoring the row.",
                        exc_info=True)
                    continue
    annotate('export_metric_num_end')

    # Now on to outputs.
    with tarfile.open(REPORT_BASE + '.tar.gz', 'w:gz') as handle:
        for name in ('jobs', 'metric_num', 'datasets'):
            path = REPORT_BASE + '.' + name + '.tsv'
            if os.path.exists(path):
                handle.add(path)

    for name in ('jobs', 'metric_num', 'datasets'):
        path = REPORT_BASE + '.' + name + '.tsv'
        if os.path.exists(path):
            os.unlink(path)

    _times.append(('job_finish', time.time() - _start_time))
    sha = hash_util.memory_bound_hexdigest(hash_func=hash_util.sha256,
                                           path=REPORT_BASE + ".tar.gz")
    _times.append(('hash_finish', time.time() - _start_time))

    # Now serialize the individual report data.
    with open(REPORT_BASE + '.json', 'w') as handle:
        json.dump(
            {
                "version": 3,
                "galaxy_version": gxconfig.version_major,
                "generated": REPORT_IDENTIFIER,
                "report_hash": "sha256:" + sha,
                "metrics": {
                    "_times": _times,
                },
                "users": {
                    "active": len(active_users.keys()),
                    "total": sa_session.query(model.User.id).count(),
                },
                "jobs": job_state_data,
            }, handle)

    # Write our checkpoint file so we know where to start next time.
    with open(CHECK_POINT_FILE, 'w') as handle:
        handle.write(str(end_job_id))
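
The example above is an excerpt, so its imports are not shown. Roughly the following standard-library and PyYAML imports, plus a conventional entry point, are assumed; the Galaxy-internal names it also uses (`_init`, `populate_config_args`, `round_to_2sd`, `unicodify`, `hash_util`, `default_config`, `sample_config`) come from the surrounding GRT script and are only sketched as comments here.

import argparse
import io
import json
import logging
import os
import sys
import tarfile
import time
from collections import defaultdict

import yaml

# Galaxy-internal helpers used above (import paths assumed):
# from galaxy.util import unicodify
# from galaxy.util import hash_util

if __name__ == '__main__':
    main(sys.argv[1:])
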
Example #3
def main(argv):
    parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument('-r', '--report-directory', help='Directory to store reports in',
                        default=os.path.abspath(os.path.join('.', 'reports')))
    parser.add_argument('-g', '--grt-config', help='Path to GRT config file',
                        default=default_config)
    parser.add_argument("-l", "--loglevel", choices=['debug', 'info', 'warning', 'error', 'critical'],
                        help="Set the logging level", default='warning')
    parser.add_argument("-b", "--batch-size", type=int, default=1000,
                        help="Batch size for sql queries")
    parser.add_argument("-m", "--max-records", type=int, default=0,
                        help="Maximum number of records to include in a single report. This option should ONLY be used when reporting historical data. Setting this may require running GRT multiple times to capture all historical logs.")
    populate_config_args(parser)

    args = parser.parse_args()
    logging.getLogger().setLevel(getattr(logging, args.loglevel.upper()))

    _times = []
    _start_time = time.time()

    def annotate(label, human_label=None):
        if human_label:
            logging.info(human_label)
        _times.append((label, time.time() - _start_time))

    annotate('init_start', 'Loading GRT configuration...')
    try:
        with open(args.grt_config) as handle:
            config = yaml.safe_load(handle)
    except Exception:
        logging.info('Using default GRT configuration')
        with open(sample_config) as handle:
            config = yaml.safe_load(handle)
    annotate('init_end')

    REPORT_DIR = args.report_directory
    CHECK_POINT_FILE = os.path.join(REPORT_DIR, '.checkpoint')
    REPORT_IDENTIFIER = str(time.time())
    REPORT_BASE = os.path.join(REPORT_DIR, REPORT_IDENTIFIER)

    if os.path.exists(CHECK_POINT_FILE):
        with open(CHECK_POINT_FILE, 'r') as handle:
            last_job_sent = int(handle.read())
    else:
        last_job_sent = -1

    annotate('galaxy_init', 'Loading Galaxy...')
    model, object_store, gxconfig, app = _init(args, need_app=config['grt']['share_toolbox'])

    # Galaxy overrides our logging level.
    logging.getLogger().setLevel(getattr(logging, args.loglevel.upper()))
    sa_session = model.context.current
    annotate('galaxy_end')

    # Fetch jobs COMPLETED with status OK that have not yet been sent.

    # Set up our arrays
    active_users = defaultdict(int)
    job_state_data = defaultdict(int)

    annotate('san_init', 'Building Sanitizer')
    san = Sanitization(config['sanitization'], model, sa_session)
    annotate('san_end')

    if not os.path.exists(REPORT_DIR):
        os.makedirs(REPORT_DIR)

    # Pick an end point so our queries can return uniform data.
    annotate('endpoint_start', 'Identifying a safe endpoint for SQL queries')
    end_job_id = sa_session.query(model.Job.id) \
        .order_by(model.Job.id.desc()) \
        .first()[0]

    # Allow users to only report N records at once.
    if args.max_records > 0:
        if end_job_id - last_job_sent > args.max_records:
            end_job_id = last_job_sent + args.max_records

    annotate('endpoint_end', 'Processing jobs (%s, %s]' % (last_job_sent, end_job_id))

    # Remember the last job sent.
    if end_job_id == last_job_sent:
        logging.info("No new jobs to report")
        # So we can just quit now.
        sys.exit(0)

    # Unfortunately we have to keep this mapping for the sanitizer to work properly.
    job_tool_map = {}
    blacklisted_tools = config['sanitization']['tools']

    annotate('export_jobs_start', 'Exporting Jobs')
    handle_job = open(REPORT_BASE + '.jobs.tsv', 'w')
    handle_job.write('\t'.join(('id', 'tool_id', 'tool_version', 'state', 'create_time')) + '\n')
    for offset_start in range(last_job_sent, end_job_id, args.batch_size):
        logging.debug("Processing %s:%s", offset_start, min(end_job_id, offset_start + args.batch_size))
        for job in sa_session.query(model.Job.id, model.Job.user_id, model.Job.tool_id, model.Job.tool_version, model.Job.state, model.Job.create_time) \
                .filter(model.Job.id > offset_start) \
                .filter(model.Job.id <= min(end_job_id, offset_start + args.batch_size)) \
                .all():
            # If the tool is blacklisted, exclude everywhere
            if job[2] in blacklisted_tools:
                continue

            try:
                handle_job.write(str(job[0]))  # id
                handle_job.write('\t')
                handle_job.write(job[2])  # tool_id
                handle_job.write('\t')
                handle_job.write(job[3])  # tool_version
                handle_job.write('\t')
                handle_job.write(job[4])  # state
                handle_job.write('\t')
                handle_job.write(str(job[5]))  # create_time
                handle_job.write('\n')
            except Exception:
                logging.warning("Unable to write out a 'handle_job' row. Ignoring the row.", exc_info=True)
                continue
            # meta counts
            job_state_data[job[4]] += 1
            active_users[job[1]] += 1
            job_tool_map[job[0]] = job[2]

    handle_job.close()
    annotate('export_jobs_end')

    annotate('export_datasets_start', 'Exporting Datasets')
    handle_datasets = open(REPORT_BASE + '.datasets.tsv', 'w')
    handle_datasets.write('\t'.join(('job_id', 'dataset_id', 'extension', 'file_size', 'param_name', 'type')) + '\n')
    for offset_start in range(last_job_sent, end_job_id, args.batch_size):
        logging.debug("Processing %s:%s", offset_start, min(end_job_id, offset_start + args.batch_size))

        # four queries: JobToInputDatasetAssociation, JobToOutputDatasetAssociation, HistoryDatasetAssociation, Dataset

        job_to_input_hda_ids = sa_session.query(model.JobToInputDatasetAssociation.job_id, model.JobToInputDatasetAssociation.dataset_id,
            model.JobToInputDatasetAssociation.name) \
            .filter(model.JobToInputDatasetAssociation.job_id > offset_start) \
            .filter(model.JobToInputDatasetAssociation.job_id <= min(end_job_id, offset_start + args.batch_size)) \
            .all()

        job_to_output_hda_ids = sa_session.query(model.JobToOutputDatasetAssociation.job_id, model.JobToOutputDatasetAssociation.dataset_id,
            model.JobToOutputDatasetAssociation.name) \
            .filter(model.JobToOutputDatasetAssociation.job_id > offset_start) \
            .filter(model.JobToOutputDatasetAssociation.job_id <= min(end_job_id, offset_start + args.batch_size)) \
            .all()

        # add type and concat
        job_to_hda_ids = [[list(i), "input"] for i in job_to_input_hda_ids] + [[list(i), "output"] for i in job_to_output_hda_ids]

        # put all of the hda_ids into a list
        hda_ids = [i[0][1] for i in job_to_hda_ids]

        hdas = sa_session.query(model.HistoryDatasetAssociation.id, model.HistoryDatasetAssociation.dataset_id,
            model.HistoryDatasetAssociation.extension) \
            .filter(model.HistoryDatasetAssociation.id.in_(hda_ids)) \
            .all()

        # put all the dataset ids into a list
        dataset_ids = [i[1] for i in hdas]

        # get the sizes of the datasets
        datasets = sa_session.query(model.Dataset.id, model.Dataset.total_size) \
            .filter(model.Dataset.id.in_(dataset_ids)) \
            .all()

        # convert hdas and datasets to dictionaries for easy lookup
        hdas = {i[0]: i[1:] for i in hdas}
        datasets = {i[0]: i[1:] for i in datasets}

        for job_to_hda in job_to_hda_ids:

            job = job_to_hda[0]  # job_id, hda_id, name
            filetype = job_to_hda[1]  # input|output

            # No associated job
            if job[0] not in job_tool_map:
                continue

            # If the tool is blacklisted, exclude everywhere
            if job_tool_map[job[0]] in blacklisted_tools:
                continue

            hda_id = job[1]
            if hda_id is None:
                continue

            dataset_id = hdas[hda_id][0]
            if dataset_id is None:
                continue

            try:
                handle_datasets.write(str(job[0]))
                handle_datasets.write('\t')
                handle_datasets.write(str(hda_id))
                handle_datasets.write('\t')
                handle_datasets.write(str(hdas[hda_id][1]))
                handle_datasets.write('\t')
                handle_datasets.write(str(datasets[dataset_id][0]))
                handle_datasets.write('\t')
                handle_datasets.write(str(job[2]))
                handle_datasets.write('\t')
                handle_datasets.write(str(filetype))
                handle_datasets.write('\n')
            except Exception:
                logging.warning("Unable to write out a 'handle_datasets' row. Ignoring the row.", exc_info=True)
                continue
    handle_datasets.close()
    annotate('export_datasets_end')

    annotate('export_metric_num_start', 'Exporting Metrics (Numeric)')
    handle_metric_num = open(REPORT_BASE + '.metric_num.tsv', 'w')
    handle_metric_num.write('\t'.join(('job_id', 'plugin', 'name', 'value')) + '\n')
    for offset_start in range(last_job_sent, end_job_id, args.batch_size):
        logging.debug("Processing %s:%s", offset_start, min(end_job_id, offset_start + args.batch_size))
        for metric in sa_session.query(model.JobMetricNumeric.job_id, model.JobMetricNumeric.plugin, model.JobMetricNumeric.metric_name, model.JobMetricNumeric.metric_value) \
                .filter(model.JobMetricNumeric.job_id > offset_start) \
                .filter(model.JobMetricNumeric.job_id <= min(end_job_id, offset_start + args.batch_size)) \
                .all():
            # No associated job
            if metric[0] not in job_tool_map:
                continue
            # If the tool is blacklisted, exclude everywhere
            if job_tool_map[metric[0]] in blacklisted_tools:
                continue

            try:
                handle_metric_num.write(str(metric[0]))
                handle_metric_num.write('\t')
                handle_metric_num.write(metric[1])
                handle_metric_num.write('\t')
                handle_metric_num.write(metric[2])
                handle_metric_num.write('\t')
                handle_metric_num.write(str(metric[3]))
                handle_metric_num.write('\n')
            except Exception:
                logging.warning("Unable to write out a 'handle_metric_num' row. Ignoring the row.", exc_info=True)
                continue
    handle_metric_num.close()
    annotate('export_metric_num_end')

    annotate('export_params_start', 'Export Job Parameters')
    handle_params = open(REPORT_BASE + '.params.tsv', 'w')
    handle_params.write('\t'.join(('job_id', 'name', 'value')) + '\n')
    for offset_start in range(last_job_sent, end_job_id, args.batch_size):
        logging.debug("Processing %s:%s", offset_start, min(end_job_id, offset_start + args.batch_size))
        for param in sa_session.query(model.JobParameter.job_id, model.JobParameter.name, model.JobParameter.value) \
                .filter(model.JobParameter.job_id > offset_start) \
                .filter(model.JobParameter.job_id <= min(end_job_id, offset_start + args.batch_size)) \
                .all():
            # No associated job
            if param[0] not in job_tool_map:
                continue
            # If the tool is blacklisted, exclude everywhere
            if job_tool_map[param[0]] in blacklisted_tools:
                continue

            try:
                sanitized = san.sanitize_data(job_tool_map[param[0]], param[1], param[2])

                handle_params.write(str(param[0]))
                handle_params.write('\t')
                handle_params.write(param[1])
                handle_params.write('\t')
                handle_params.write(json.dumps(sanitized))
                handle_params.write('\n')
            except Exception:
                logging.warning("Unable to write out a 'handle_params' row. Ignoring the row.", exc_info=True)
                continue
    handle_params.close()
    annotate('export_params_end')

    # Now on to outputs.
    with tarfile.open(REPORT_BASE + '.tar.gz', 'w:gz') as handle:
        for name in ('jobs', 'metric_num', 'params', 'datasets'):
            handle.add(REPORT_BASE + '.' + name + '.tsv')

    for name in ('jobs', 'metric_num', 'params', 'datasets'):
        os.unlink(REPORT_BASE + '.' + name + '.tsv')

    _times.append(('job_finish', time.time() - _start_time))
    sha = hash_util.memory_bound_hexdigest(hash_util.sha256, REPORT_BASE + ".tar.gz")
    _times.append(('hash_finish', time.time() - _start_time))

    # Now serialize the individual report data.
    with open(REPORT_BASE + '.json', 'w') as handle:
        if config['grt']['share_toolbox']:
            toolbox = [
                (tool.id, tool.name, tool.version, tool.tool_shed, tool.repository_id, tool.repository_name)
                for tool_id, tool in app.toolbox._tools_by_id.items()
            ]
        else:
            toolbox = None

        json.dump({
            "version": 1,
            "galaxy_version": gxconfig.version_major,
            "generated": REPORT_IDENTIFIER,
            "report_hash": "sha256:" + sha,
            "metrics": {
                "_times": _times,
            },
            "users": {
                "active": len(active_users.keys()),
                "total": sa_session.query(model.User.id).count(),
            },
            "jobs": job_state_data,
            "tools": toolbox
        }, handle)

    # Write our checkpoint file so we know where to start next time.
    with open(CHECK_POINT_FILE, 'w') as handle:
        handle.write(str(end_job_id))
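
The examples above only read a handful of keys from the GRT configuration (`grt.share_toolbox` and `sanitization.tools`, plus whatever the `Sanitization` helper consumes). A hypothetical minimal configuration, shown here as the Python structure `yaml.safe_load` would return for those keys, looks roughly like this; real GRT config files carry more settings.

config = {
    'grt': {
        # when True, _init() is asked for the app so the toolbox can be exported
        'share_toolbox': True,
    },
    'sanitization': {
        # tool ids listed here are excluded from every exported table
        'tools': ['some_blacklisted_tool_id'],
    },
}
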
Example #4
def main(argv):
    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument('-r',
                        '--report-directory',
                        help='Directory to store reports in',
                        default=os.path.abspath(os.path.join('.', 'reports')))
    parser.add_argument('-g',
                        '--grt-config',
                        help='Path to GRT config file',
                        default=default_config)
    parser.add_argument(
        "-l",
        "--loglevel",
        choices=['debug', 'info', 'warning', 'error', 'critical'],
        help="Set the logging level",
        default='warning')
    parser.add_argument("-b",
                        "--batch-size",
                        type=int,
                        default=1000,
                        help="Batch size for sql queries")
    parser.add_argument(
        "-m",
        "--max-records",
        type=int,
        default=0,
        help="Maximum number of records to include in a single report. This option should ONLY be used when reporting historical data. Setting this may require running GRT multiple times to capture all historical logs."
    )
    populate_config_args(parser)

    args = parser.parse_args()
    logging.getLogger().setLevel(getattr(logging, args.loglevel.upper()))

    _times = []
    _start_time = time.time()

    def annotate(label, human_label=None):
        if human_label:
            logging.info(human_label)
        _times.append((label, time.time() - _start_time))

    annotate('init_start', 'Loading GRT configuration...')
    try:
        with open(args.grt_config) as handle:
            config = yaml.safe_load(handle)
    except Exception:
        logging.info('Using default GRT configuration')
        with open(sample_config) as handle:
            config = yaml.safe_load(handle)
    annotate('init_end')

    REPORT_DIR = args.report_directory
    CHECK_POINT_FILE = os.path.join(REPORT_DIR, '.checkpoint')
    REPORT_IDENTIFIER = str(time.time())
    REPORT_BASE = os.path.join(REPORT_DIR, REPORT_IDENTIFIER)

    if os.path.exists(CHECK_POINT_FILE):
        with open(CHECK_POINT_FILE, 'r') as handle:
            last_job_sent = int(handle.read())
    else:
        last_job_sent = -1

    annotate('galaxy_init', 'Loading Galaxy...')
    model, object_store, gxconfig, app = _init(
        args, need_app=config['grt']['share_toolbox'])
    # Galaxy overrides our logging level.
    logging.getLogger().setLevel(getattr(logging, args.loglevel.upper()))
    sa_session = model.context.current
    annotate('galaxy_end')

    # Fetch jobs COMPLETED with status OK that have not yet been sent.

    # Set up our arrays
    active_users = defaultdict(int)
    job_state_data = defaultdict(int)

    annotate('san_init', 'Building Sanitizer')
    san = Sanitization(config['sanitization'], model, sa_session)
    annotate('san_end')

    if not os.path.exists(REPORT_DIR):
        os.makedirs(REPORT_DIR)

    # Pick an end point so our queries can return uniform data.
    annotate('endpoint_start', 'Identifying a safe endpoint for SQL queries')
    end_job_id = sa_session.query(model.Job.id) \
        .order_by(model.Job.id.desc()) \
        .first()[0]

    # Allow users to only report N records at once.
    if args.max_records > 0:
        if end_job_id - last_job_sent > args.max_records:
            end_job_id = last_job_sent + args.max_records

    annotate('endpoint_end',
             'Processing jobs (%s, %s]' % (last_job_sent, end_job_id))

    # Remember the last job sent.
    if end_job_id == last_job_sent:
        logging.info("No new jobs to report")
        # So we can just quit now.
        sys.exit(0)

    # Unfortunately we have to keep this mapping for the sanitizer to work properly.
    job_tool_map = {}
    blacklisted_tools = config['sanitization']['tools']

    annotate('export_jobs_start', 'Exporting Jobs')
    handle_job = open(REPORT_BASE + '.jobs.tsv', 'w')
    handle_job.write('\t'.join(('id', 'tool_id', 'tool_version', 'state',
                                'create_time')) + '\n')
    for offset_start in range(last_job_sent, end_job_id, args.batch_size):
        logging.debug("Processing %s:%s", offset_start,
                      min(end_job_id, offset_start + args.batch_size))
        for job in sa_session.query(model.Job.id, model.Job.user_id, model.Job.tool_id, model.Job.tool_version, model.Job.state, model.Job.create_time) \
                .filter(model.Job.id > offset_start) \
                .filter(model.Job.id <= min(end_job_id, offset_start + args.batch_size)) \
                .all():
            # If the tool is blacklisted, exclude everywhere
            if job[2] in blacklisted_tools:
                continue

            handle_job.write(str(job[0]))  # id
            handle_job.write('\t')
            handle_job.write(job[2])  # tool_id
            handle_job.write('\t')
            handle_job.write(job[3])  # tool_version
            handle_job.write('\t')
            handle_job.write(job[4])  # state
            handle_job.write('\t')
            handle_job.write(str(job[5]))  # create_time
            handle_job.write('\n')
            # meta counts
            job_state_data[job[4]] += 1
            active_users[job[1]] += 1
            job_tool_map[job[0]] = job[2]

    handle_job.close()
    annotate('export_jobs_end')

    annotate('export_metric_num_start', 'Exporting Metrics (Numeric)')
    handle_metric_num = open(REPORT_BASE + '.metric_num.tsv', 'w')
    handle_metric_num.write('\t'.join(('job_id', 'plugin', 'name', 'value')) +
                            '\n')
    for offset_start in range(last_job_sent, end_job_id, args.batch_size):
        logging.debug("Processing %s:%s", offset_start,
                      min(end_job_id, offset_start + args.batch_size))
        for metric in sa_session.query(model.JobMetricNumeric.job_id, model.JobMetricNumeric.plugin, model.JobMetricNumeric.metric_name, model.JobMetricNumeric.metric_value) \
                .filter(model.JobMetricNumeric.job_id > offset_start) \
                .filter(model.JobMetricNumeric.job_id <= min(end_job_id, offset_start + args.batch_size)) \
                .all():
            # No associated job
            if metric[0] not in job_tool_map:
                continue
            # If the tool is blacklisted, exclude everywhere
            if job_tool_map[metric[0]] in blacklisted_tools:
                continue

            handle_metric_num.write(str(metric[0]))
            handle_metric_num.write('\t')
            handle_metric_num.write(metric[1])
            handle_metric_num.write('\t')
            handle_metric_num.write(metric[2])
            handle_metric_num.write('\t')
            handle_metric_num.write(str(metric[3]))
            handle_metric_num.write('\n')
    handle_metric_num.close()
    annotate('export_metric_num_end')

    annotate('export_params_start', 'Export Job Parameters')
    handle_params = open(REPORT_BASE + '.params.tsv', 'w')
    handle_params.write('\t'.join(('job_id', 'name', 'value')) + '\n')
    for offset_start in range(last_job_sent, end_job_id, args.batch_size):
        logging.debug("Processing %s:%s", offset_start,
                      min(end_job_id, offset_start + args.batch_size))
        for param in sa_session.query(model.JobParameter.job_id, model.JobParameter.name, model.JobParameter.value) \
                .filter(model.JobParameter.job_id > offset_start) \
                .filter(model.JobParameter.job_id <= min(end_job_id, offset_start + args.batch_size)) \
                .all():
            # No associated job
            if param[0] not in job_tool_map:
                continue
            # If the tool is blacklisted, exclude everywhere
            if job_tool_map[param[0]] in blacklisted_tools:
                continue

            sanitized = san.sanitize_data(job_tool_map[param[0]], param[1],
                                          param[2])

            handle_params.write(str(param[0]))
            handle_params.write('\t')
            handle_params.write(param[1])
            handle_params.write('\t')
            handle_params.write(json.dumps(sanitized))
            handle_params.write('\n')
    handle_params.close()
    annotate('export_params_end')

    # Now on to outputs.
    with tarfile.open(REPORT_BASE + '.tar.gz', 'w:gz') as handle:
        for name in ('jobs', 'metric_num', 'params'):
            handle.add(REPORT_BASE + '.' + name + '.tsv')

    for name in ('jobs', 'metric_num', 'params'):
        os.unlink(REPORT_BASE + '.' + name + '.tsv')

    _times.append(('job_finish', time.time() - _start_time))
    sha = hash_util.memory_bound_hexdigest(hash_util.sha256,
                                           REPORT_BASE + ".tar.gz")
    _times.append(('hash_finish', time.time() - _start_time))

    # Now serialize the individual report data.
    with open(REPORT_BASE + '.json', 'w') as handle:
        if config['grt']['share_toolbox']:
            toolbox = [(tool.id, tool.name, tool.version, tool.tool_shed,
                        tool.repository_id, tool.repository_name)
                       for tool_id, tool in app.toolbox._tools_by_id.items()]
        else:
            toolbox = None

        json.dump(
            {
                "version": 1,
                "galaxy_version": gxconfig.version_major,
                "generated": REPORT_IDENTIFIER,
                "report_hash": "sha256:" + sha,
                "metrics": {
                    "_times": _times,
                },
                "users": {
                    "active": len(active_users.keys()),
                    "total": sa_session.query(model.User.id).count(),
                },
                "jobs": job_state_data,
                "tools": toolbox
            }, handle)

    # Write our checkpoint file so we know where to start next time.
    with open(CHECK_POINT_FILE, 'w') as handle:
        handle.write(str(end_job_id))
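
As a closing sketch, a hypothetical consumer-side check of the artifacts the script writes: it recomputes the SHA-256 of the tarball with the standard library and compares it against the `report_hash` recorded in the companion `.json` file. The report path in the usage comment is illustrative.

import hashlib
import json


def verify_report(report_base):
    """Return True if <report_base>.tar.gz matches the hash stored in <report_base>.json."""
    with open(report_base + '.json') as handle:
        report = json.load(handle)

    sha256 = hashlib.sha256()
    with open(report_base + '.tar.gz', 'rb') as handle:
        # hash in chunks so large archives do not have to fit in memory
        for chunk in iter(lambda: handle.read(1024 * 1024), b''):
            sha256.update(chunk)

    return report['report_hash'] == 'sha256:' + sha256.hexdigest()


# e.g. verify_report('./reports/1514764800.0')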