def _read_model_arguments(argv, use_argparse=False):
    if use_argparse:
        parser = argparse.ArgumentParser()
        parser.add_argument('database', metavar='DATABASE', type=str,
                            default="galaxy", nargs='?',
                            help='database to target (galaxy, tool_shed, install)')
        populate_config_args(parser)
        args = parser.parse_args(argv[1:] if argv else [])
        return args.config_file, args.config_section, args.database
    else:
        config_file = None
        for arg in ["-c", "--config", "--config-file"]:
            if arg in argv:
                pos = argv.index(arg)
                argv.pop(pos)
                config_file = argv.pop(pos)
        config_section = None
        if "--config-section" in argv:
            pos = argv.index("--config-section")
            argv.pop(pos)
            config_section = argv.pop(pos)
        if argv and (argv[-1] in DATABASE):
            database = argv.pop()  # database name: tool_shed, galaxy, or install
        else:
            database = 'galaxy'
        return config_file, config_section, database
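# Illustrative usage only (not part of the original script): a quick trace of the
# manual-parsing branch above with a made-up argv. It assumes the module-level
# DATABASE mapping contains an 'install' key, as the help text suggests.
example_argv = ["manage_db.py", "-c", "config/galaxy.yml", "--config-section", "galaxy", "install"]
print(_read_model_arguments(example_argv))
# -> ('config/galaxy.yml', 'galaxy', 'install')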
def main(argv):
    parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument('-k', '--secret-key', help='Key to convert pages with', default='')
    parser.add_argument('-d', '--dry-run', help='No changes, just test it.', action='store_true')
    populate_config_args(parser)
    args = parser.parse_args()

    properties = app_properties_from_args(args)
    config = galaxy.config.Configuration(**properties)
    secret = args.secret_key or config.id_secret
    security_helper = SecurityHelper(id_secret=secret)
    object_store = build_object_store_from_config(config)
    if not config.database_connection:
        print("The database connection is empty. If you are using the default value, please uncomment that in your galaxy.yml")

    model = galaxy.config.init_models_from_config(config, object_store=object_store)
    session = model.context.current
    pagerevs = session.query(model.PageRevision).all()
    mock_trans = Bunch(app=Bunch(security=security_helper), model=model, user_is_admin=lambda: True, sa_session=session)
    for p in pagerevs:
        try:
            processor = _PageContentProcessor(mock_trans, _placeholderRenderForSave)
            processor.feed(p.content)
            newcontent = unicodify(processor.output(), 'utf-8')
            if p.content != newcontent:
                if not args.dry_run:
                    p.content = unicodify(processor.output(), 'utf-8')
                    session.add(p)
                    session.flush()
                else:
                    print("Modifying revision %s." % p.id)
                    print(difflib.unified_diff(p.content, newcontent))
        except Exception:
            logging.exception("Error parsing page, rolling changes back and skipping revision %s. Please report this error." % p.id)
            session.rollback()
def parse_arguments():
    parser = argparse.ArgumentParser(description='Generate walltime statistics')
    parser.add_argument('tool_id', help='Tool (by ID) to collect stats about')
    parser.add_argument('--like', action='store_true', default=False,
                        help='Use SQL `LIKE` operator to find '
                             'a shed-installed tool using the tool\'s '
                             '"short" id')
    populate_config_args(parser)
    parser.add_argument('-d', '--debug', action='store_true', default=False,
                        help='Print extra info')
    parser.add_argument('-m', '--min', type=int, default=-1,
                        help='Ignore runtimes less than MIN seconds')
    parser.add_argument('-M', '--max', type=int, default=-1,
                        help='Ignore runtimes greater than MAX seconds')
    parser.add_argument('-u', '--user',
                        help='Return stats for only this user (id, email, or username)')
    parser.add_argument('-s', '--source', default='metrics',
                        help='Runtime data source (SOURCES: %s)' % ', '.join(DATA_SOURCES))
    args = parser.parse_args()

    if args.like and '/' in args.tool_id:
        print('ERROR: Do not use --like with a tool shed tool id (the tool '
              'id should not contain `/` characters)')
        sys.exit(2)

    args.source = args.source.lower()
    if args.source not in ('metrics', 'history'):
        print('ERROR: Data source `%s` unknown, valid sources are: %s'
              % (args.source, ', '.join(DATA_SOURCES)))
        sys.exit(2)

    app_properties = app_properties_from_args(args)
    config = galaxy.config.Configuration(**app_properties)
    uri = args.config.get_database_url(config)
    names = {'database': 'dbname', 'username': 'user'}
    args.connect_args = url.make_url(uri).translate_connect_args(**names)

    if args.debug:
        print('Got options:')
        for i in vars(args).items():
            print('%s: %s' % i)

    return args
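# Illustrative only (not part of the script above): how SQLAlchemy's
# translate_connect_args() remaps URL fields into DBAPI-style connect arguments
# (psycopg2 expects 'dbname' and 'user'), which appears to be what the `names`
# mapping above is for. The URL here is made up.
from sqlalchemy.engine import url as sa_url

u = sa_url.make_url("postgresql://alice:secret@dbhost:5432/galaxy")
print(u.translate_connect_args(database='dbname', username='user'))
# roughly: {'dbname': 'galaxy', 'user': 'alice', 'password': 'secret', 'host': 'dbhost', 'port': 5432}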
def main(argv):
    parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument('-k', '--secret-key', help='Key to convert pages with', default='')
    parser.add_argument('-d', '--dry-run', help='No changes, just test it.', action='store_true')
    populate_config_args(parser)
    args = parser.parse_args()

    properties = app_properties_from_args(args)
    config = galaxy.config.Configuration(**properties)
    secret = args.secret_key or config.id_secret
    security_helper = IdEncodingHelper(id_secret=secret)
    object_store = build_object_store_from_config(config)
    if not config.database_connection:
        print("The database connection is empty. If you are using the default value, please uncomment that in your galaxy.yml")

    model = galaxy.config.init_models_from_config(config, object_store=object_store)
    session = model.context.current
    pagerevs = session.query(model.PageRevision).all()
    mock_trans = Bunch(app=Bunch(security=security_helper), model=model, user_is_admin=lambda: True, sa_session=session)
    for p in pagerevs:
        try:
            processor = _PageContentProcessor(mock_trans, _placeholderRenderForSave)
            processor.feed(p.content)
            newcontent = unicodify(processor.output(), 'utf-8')
            if p.content != newcontent:
                if not args.dry_run:
                    p.content = unicodify(processor.output(), 'utf-8')
                    session.add(p)
                    session.flush()
                else:
                    print("Modifying revision %s." % p.id)
                    print(difflib.unified_diff(p.content, newcontent))
        except Exception:
            logging.exception("Error parsing page, rolling changes back and skipping revision %s. Please report this error." % p.id)
            session.rollback()
def __parse_args(self):
    parser = argparse.ArgumentParser()
    populate_config_args(parser)
    parser.add_argument('-d', '--debug', action='store_true', default=False,
                        help='Enable debug logging (SQL queries)')
    parser.add_argument('--dry-run', action='store_true', default=False,
                        help="Dry run (rollback all transactions)")
    parser.add_argument('--force-retry', action='store_true', default=False,
                        help="Retry file removals (on applicable actions)")
    parser.add_argument('-o', '--older-than', dest='days', type=int, default=14,
                        help='Only perform action(s) on objects that have not been updated since the specified number of days')
    parser.add_argument('-U', '--no-update-time', action='store_false', dest='update_time', default=True,
                        help="Don't set update_time on updated objects")
    parser.add_argument('-s', '--sequence', dest='sequence', default='',
                        help='DEPRECATED: Comma-separated sequence of actions')
    parser.add_argument('-w', '--work-mem', dest='work_mem', default=None,
                        help='Set PostgreSQL work_mem for this connection')
    parser.add_argument('-l', '--log-dir', default=DEFAULT_LOG_DIR,
                        help='Log file directory')
    parser.add_argument('actions', nargs='*', metavar='ACTION', default=[],
                        help='Action(s) to perform, chosen from: %s' % ', '.join(sorted(self.actions.keys())))
    self.args = parser.parse_args()

    # add deprecated sequence arg to actions
    self.args.sequence = [x.strip() for x in self.args.sequence.split(',')]
    if self.args.sequence != ['']:
        self.args.actions.extend(self.args.sequence)
    if not self.args.actions:
        parser.error("Please specify one or more actions")
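# Illustrative only (not part of the class above): how a deprecated --sequence
# value is folded into the positional actions list by the last few lines of
# __parse_args(). The action names here are made up for the example.
actions = ['update_hda_purged_flag']                          # positional ACTION args
sequence = ' purge_deleted_histories , purge_deleted_hdas '   # deprecated -s value
parts = [x.strip() for x in sequence.split(',')]
if parts != ['']:
    actions.extend(parts)
print(actions)
# -> ['update_hda_purged_flag', 'purge_deleted_histories', 'purge_deleted_hdas']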
def __parse_args(self):
    parser = argparse.ArgumentParser()
    populate_config_args(parser)
    parser.add_argument('-d', '--debug', action='store_true', dest='debug',
                        help='Enable debug logging', default=False)
    parser.add_argument('--dry-run', action='store_true', dest='dry_run',
                        help="Dry run (rollback all transactions)", default=False)
    parser.add_argument('--force-retry', action='store_true', dest='force_retry',
                        help="Retry file removals (on applicable actions)", default=False)
    parser.add_argument('-o', '--older-than', type=int, dest='days',
                        help='Only perform action(s) on objects that have not been updated since the specified number of days',
                        default=14)
    parser.add_argument('-U', '--no-update-time', action='store_false', dest='update_time',
                        help="Don't set update_time on updated objects", default=True)
    parser.add_argument('-s', '--sequence', dest='sequence',
                        help='Comma-separated sequence of actions, chosen from: %s' % self.action_names,
                        default='')
    parser.add_argument('-w', '--work-mem', dest='work_mem',
                        help='Set PostgreSQL work_mem for this connection', default=None)
    parser.add_argument('-l', '--log-dir', dest='log_dir',
                        help='Log file directory',
                        default=os.path.join(galaxy_root, 'scripts', 'cleanup_datasets'))
    self.args = parser.parse_args()

    self.args.sequence = [x.strip() for x in self.args.sequence.split(',')]
    if self.args.sequence == ['']:
        print("Error: At least one action must be specified in the action sequence\n")
        parser.print_help()
        sys.exit(0)
def main():
    parser = argparse.ArgumentParser(DESCRIPTION)
    populate_config_args(parser)
    args = parser.parse_args()

    app_properties = app_properties_from_args(args)
    config = galaxy.config.Configuration(**app_properties)
    model = galaxy.config.init_models_from_config(config)

    for row in model.context.query(model.Dataset):
        if row.uuid is None:
            row.uuid = uuid.uuid4()
            print("Setting dataset:", row.id, " UUID to ", row.uuid)
    model.context.flush()

    for row in model.context.query(model.Workflow):
        if row.uuid is None:
            row.uuid = uuid.uuid4()
            print("Setting Workflow:", row.id, " UUID to ", row.uuid)
    model.context.flush()
    print("Complete")
def parse_arguments():
    parser = argparse.ArgumentParser(
        description='Build a disk-backed Toolshed repository index and tool index for searching.')
    populate_config_args(parser)
    parser.add_argument('-d', '--debug', action='store_true', default=False, help='Print extra info')
    args = parser.parse_args()

    app_properties = app_properties_from_args(args)
    config = ts_config.ToolShedAppConfiguration(**app_properties)
    args.dburi = config.database_connection
    args.hgweb_config_dir = config.hgweb_config_dir
    args.whoosh_index_dir = config.whoosh_index_dir
    args.file_path = config.file_path

    if args.debug:
        log.setLevel(logging.DEBUG)
        log.debug('Full options:')
        for i in vars(args).items():
            log.debug('%s: %s' % i)
    return args
def main(): """ Datasets that are older than the specified cutoff and for which the tool_id contains the specified text will be marked as deleted in user's history and the user will be notified by email using the specified template file. """ parser = argparse.ArgumentParser() parser.add_argument('legacy_config', metavar='CONFIG', type=str, default=None, nargs='?', help='config file (legacy, use --config instead)') parser.add_argument("-d", "--days", dest="days", action="store", type=int, help="number of days (60)", default=60) parser.add_argument("--tool_id", default=None, help="Text to match against tool_id" "Default: match all") parser.add_argument("--template", default=None, help="Mako Template file to use as email " "Variables are 'cutoff' for the cutoff in days, " "'email' for users email and " "'datasets' which is a list of tuples " "containing 'dataset' and 'history' names. " "Default: admin_cleanup_deletion_template.txt") parser.add_argument("-i", "--info_only", action="store_true", dest="info_only", help="info about the requested action", default=False) parser.add_argument("-e", "--email_only", action="store_true", dest="email_only", help="Send emails only, don't delete", default=False) parser.add_argument("--smtp", default=None, help="SMTP Server to use to send email. " "Default: [read from galaxy ini file]") parser.add_argument("--fromaddr", default=None, help="From address to use to send email. " "Default: [read from galaxy ini file]") populate_config_args(parser) args = parser.parse_args() config_override = None if args.legacy_config: config_override = args.legacy_config app_properties = app_properties_from_args(args, legacy_config_override=config_override) if args.smtp is not None: app_properties['smtp_server'] = args.smtp if app_properties.get('smtp_server') is None: parser.error("SMTP Server must be specified as an option (--smtp) " "or in the config file (smtp_server)") if args.fromaddr is not None: app_properties['email_from'] = args.fromaddr if app_properties.get('email_from') is None: parser.error("From address must be specified as an option " "(--fromaddr) or in the config file " "(email_from)") scriptdir = os.path.dirname(os.path.abspath(__file__)) template_file = args.template if template_file is None: default_template = os.path.join(scriptdir, 'admin_cleanup_deletion_template.txt') sample_template_file = "%s.sample" % default_template if os.path.exists(default_template): template_file = default_template elif os.path.exists(sample_template_file): print("Copying %s to %s" % (sample_template_file, default_template)) shutil.copyfile(sample_template_file, default_template) template_file = default_template else: parser.error("Default template (%s) or sample template (%s) not " "found, please specify template as an option " "(--template)." % default_template, sample_template_file) elif not os.path.exists(template_file): parser.error("Specified template file (%s) not found." 
% template_file) config = galaxy.config.Configuration(**app_properties) app = CleanupDatasetsApplication(config) cutoff_time = datetime.utcnow() - timedelta(days=args.days) now = strftime("%Y-%m-%d %H:%M:%S") print("##########################################") print("\n# %s - Handling stuff older than %i days" % (now, args.days)) if args.info_only: print("# Displaying info only ( --info_only )\n") elif args.email_only: print("# Sending emails only, not deleting ( --email_only )\n") administrative_delete_datasets( app, cutoff_time, args.days, tool_id=args.tool_id, template_file=template_file, config=config, email_only=args.email_only, info_only=args.info_only) app.shutdown() sys.exit(0)
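# Illustrative only: rendering a minimal notification template with the variables
# documented for --template ('cutoff', 'email', 'datasets'). This is a sketch, not
# the shipped admin_cleanup_deletion_template.txt.
from mako.template import Template

sample = Template(
    "Hello ${email},\n"
    "The following datasets have not been updated in ${cutoff} days "
    "and have been marked deleted:\n"
    "% for dataset, history in datasets:\n"
    "  - ${dataset} (history: ${history})\n"
    "% endfor\n"
)
print(sample.render(cutoff=60,
                    email="user@example.org",
                    datasets=[("sample.fastq", "RNA-seq run 1")]))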
def main(argv): parser = argparse.ArgumentParser( formatter_class=argparse.ArgumentDefaultsHelpFormatter) parser.add_argument('-r', '--report-directory', help='Directory to store reports in', default=os.path.abspath(os.path.join('.', 'reports'))) parser.add_argument('-g', '--grt-config', help='Path to GRT config file', default=default_config) parser.add_argument( "-l", "--loglevel", choices=['debug', 'info', 'warning', 'error', 'critical'], help="Set the logging level", default='warning') parser.add_argument("-b", "--batch-size", type=int, default=1000, help="Batch size for sql queries") parser.add_argument( "-m", "--max-records", type=int, default=0, help= "Maximum number of records to include in a single report. This option should ONLY be used when reporting historical data. Setting this may require running GRT multiple times to capture all historical logs." ) populate_config_args(parser) args = parser.parse_args() logging.getLogger().setLevel(getattr(logging, args.loglevel.upper())) _times = [] _start_time = time.time() def annotate(label, human_label=None): if human_label: logging.info(human_label) _times.append((label, time.time() - _start_time)) annotate('init_start', 'Loading GRT configuration...') try: with open(args.grt_config) as handle: config = yaml.safe_load(handle) except Exception: logging.info('Using default GRT configuration') with open(sample_config) as handle: config = yaml.safe_load(handle) annotate('init_end') REPORT_DIR = args.report_directory CHECK_POINT_FILE = os.path.join(REPORT_DIR, '.checkpoint') REPORT_IDENTIFIER = str(time.time()) REPORT_BASE = os.path.join(REPORT_DIR, REPORT_IDENTIFIER) if os.path.exists(CHECK_POINT_FILE): with open(CHECK_POINT_FILE, 'r') as handle: last_job_sent = int(handle.read()) else: last_job_sent = -1 annotate('galaxy_init', 'Loading Galaxy...') model, object_store, gxconfig, app = _init( args, need_app=config['grt']['share_toolbox']) # Galaxy overrides our logging level. logging.getLogger().setLevel(getattr(logging, args.loglevel.upper())) sa_session = model.context.current annotate('galaxy_end') # Fetch jobs COMPLETED with status OK that have not yet been sent. # Set up our arrays active_users = defaultdict(int) job_state_data = defaultdict(int) annotate('san_init', 'Building Sanitizer') san = Sanitization(config['sanitization'], model, sa_session) annotate('san_end') if not os.path.exists(REPORT_DIR): os.makedirs(REPORT_DIR) # Pick an end point so our queries can return uniform data. annotate('endpoint_start', 'Identifying a safe endpoint for SQL queries') end_job_id = sa_session.query(model.Job.id) \ .order_by(model.Job.id.desc()) \ .first()[0] # Allow users to only report N records at once. if args.max_records > 0: if end_job_id - last_job_sent > args.max_records: end_job_id = last_job_sent + args.max_records annotate('endpoint_end', 'Processing jobs (%s, %s]' % (last_job_sent, end_job_id)) # Remember the last job sent. if end_job_id == last_job_sent: logging.info("No new jobs to report") # So we can just quit now. sys.exit(0) # Unfortunately we have to keep this mapping for the sanitizer to work properly. 
job_tool_map = {} blacklisted_tools = config['sanitization']['tools'] annotate('export_jobs_start', 'Exporting Jobs') handle_job = open(REPORT_BASE + '.jobs.tsv', 'w') handle_job.write('\t'.join(('id', 'tool_id', 'tool_version', 'state', 'create_time')) + '\n') for offset_start in range(last_job_sent, end_job_id, args.batch_size): logging.debug("Processing %s:%s", offset_start, min(end_job_id, offset_start + args.batch_size)) for job in sa_session.query(model.Job.id, model.Job.user_id, model.Job.tool_id, model.Job.tool_version, model.Job.state, model.Job.create_time) \ .filter(model.Job.id > offset_start) \ .filter(model.Job.id <= min(end_job_id, offset_start + args.batch_size)) \ .all(): # If the tool is blacklisted, exclude everywhere if job[2] in blacklisted_tools: continue handle_job.write(str(job[0])) # id handle_job.write('\t') handle_job.write(job[2]) # tool_id handle_job.write('\t') handle_job.write(job[3]) # tool_version handle_job.write('\t') handle_job.write(job[4]) # state handle_job.write('\t') handle_job.write(str(job[5])) # create_time handle_job.write('\n') # meta counts job_state_data[job[4]] += 1 active_users[job[1]] += 1 job_tool_map[job[0]] = job[2] handle_job.close() annotate('export_jobs_end') annotate('export_metric_num_start', 'Exporting Metrics (Numeric)') handle_metric_num = open(REPORT_BASE + '.metric_num.tsv', 'w') handle_metric_num.write('\t'.join(('job_id', 'plugin', 'name', 'value')) + '\n') for offset_start in range(last_job_sent, end_job_id, args.batch_size): logging.debug("Processing %s:%s", offset_start, min(end_job_id, offset_start + args.batch_size)) for metric in sa_session.query(model.JobMetricNumeric.job_id, model.JobMetricNumeric.plugin, model.JobMetricNumeric.metric_name, model.JobMetricNumeric.metric_value) \ .filter(model.JobMetricNumeric.job_id > offset_start) \ .filter(model.JobMetricNumeric.job_id <= min(end_job_id, offset_start + args.batch_size)) \ .all(): # No associated job if metric[0] not in job_tool_map: continue # If the tool is blacklisted, exclude everywhere if job_tool_map[metric[0]] in blacklisted_tools: continue handle_metric_num.write(str(metric[0])) handle_metric_num.write('\t') handle_metric_num.write(metric[1]) handle_metric_num.write('\t') handle_metric_num.write(metric[2]) handle_metric_num.write('\t') handle_metric_num.write(str(metric[3])) handle_metric_num.write('\n') handle_metric_num.close() annotate('export_metric_num_end') annotate('export_params_start', 'Export Job Parameters') handle_params = open(REPORT_BASE + '.params.tsv', 'w') handle_params.write('\t'.join(('job_id', 'name', 'value')) + '\n') for offset_start in range(last_job_sent, end_job_id, args.batch_size): logging.debug("Processing %s:%s", offset_start, min(end_job_id, offset_start + args.batch_size)) for param in sa_session.query(model.JobParameter.job_id, model.JobParameter.name, model.JobParameter.value) \ .filter(model.JobParameter.job_id > offset_start) \ .filter(model.JobParameter.job_id <= min(end_job_id, offset_start + args.batch_size)) \ .all(): # No associated job if param[0] not in job_tool_map: continue # If the tool is blacklisted, exclude everywhere if job_tool_map[param[0]] in blacklisted_tools: continue sanitized = san.sanitize_data(job_tool_map[param[0]], param[1], param[2]) handle_params.write(str(param[0])) handle_params.write('\t') handle_params.write(param[1]) handle_params.write('\t') handle_params.write(json.dumps(sanitized)) handle_params.write('\n') handle_params.close() annotate('export_params_end') # Now on to outputs. 
with tarfile.open(REPORT_BASE + '.tar.gz', 'w:gz') as handle: for name in ('jobs', 'metric_num', 'params'): handle.add(REPORT_BASE + '.' + name + '.tsv') for name in ('jobs', 'metric_num', 'params'): os.unlink(REPORT_BASE + '.' + name + '.tsv') _times.append(('job_finish', time.time() - _start_time)) sha = hash_util.memory_bound_hexdigest(hash_util.sha256, REPORT_BASE + ".tar.gz") _times.append(('hash_finish', time.time() - _start_time)) # Now serialize the individual report data. with open(REPORT_BASE + '.json', 'w') as handle: if config['grt']['share_toolbox']: toolbox = [(tool.id, tool.name, tool.version, tool.tool_shed, tool.repository_id, tool.repository_name) for tool_id, tool in app.toolbox._tools_by_id.items()] else: toolbox = None json.dump( { "version": 1, "galaxy_version": gxconfig.version_major, "generated": REPORT_IDENTIFIER, "report_hash": "sha256:" + sha, "metrics": { "_times": _times, }, "users": { "active": len(active_users.keys()), "total": sa_session.query(model.User.id).count(), }, "jobs": job_state_data, "tools": toolbox }, handle) # Write our checkpoint file so we know where to start next time. with open(CHECK_POINT_FILE, 'w') as handle: handle.write(str(end_job_id))
Encodes and decodes IDs, returns Dataset IDs if provided an HDA or LDDA id,
returns the disk path of a dataset.
"""
import argparse
import os
import sys

sys.path.insert(1, os.path.join(os.path.dirname(__file__), os.pardir, 'lib'))

import galaxy.config
from galaxy.security import idencoding
from galaxy.util.script import app_properties_from_args, populate_config_args

parser = argparse.ArgumentParser()
populate_config_args(parser)
parser.add_argument('-e', '--encode-id', dest='encode_id', help='Encode an ID')
parser.add_argument('-d', '--decode-id', dest='decode_id', help='Decode an ID')
parser.add_argument('--hda', dest='hda_id', help='Display HistoryDatasetAssociation info')
parser.add_argument('--ldda', dest='ldda_id', help='Display LibraryDatasetDatasetAssociation info')
args = parser.parse_args()

app_properties = app_properties_from_args(args)
config = galaxy.config.Configuration(**app_properties)
helper = idencoding.IdEncodingHelper(id_secret=app_properties.get('id_secret'))
model = galaxy.config.init_models_from_config(config)

if args.encode_id:
    print('Encoded "{}": {}'.format(args.encode_id, helper.encode_id(args.encode_id)))

if args.decode_id:
    print('Decoded "{}": {}'.format(args.decode_id, helper.decode_id(args.decode_id)))
def __parse_args(self):
    parser = argparse.ArgumentParser()
    populate_config_args(parser)
    parser.add_argument('-d', '--debug', action='store_true', dest='debug',
                        help='Enable debug logging', default=False)
    parser.add_argument('--dry-run', action='store_true', dest='dry_run',
                        help="Dry run (rollback all transactions)", default=False)
    parser.add_argument('--force-retry', action='store_true', dest='force_retry',
                        help="Retry file removals (on applicable actions)", default=False)
    parser.add_argument('-o', '--older-than', type=int, dest='days',
                        help='Only perform action(s) on objects that have not been updated since the specified number of days',
                        default=14)
    parser.add_argument('-U', '--no-update-time', action='store_false', dest='update_time',
                        help="Don't set update_time on updated objects", default=True)
    parser.add_argument('-s', '--sequence', dest='sequence',
                        help='Comma-separated sequence of actions, chosen from: %s' % self.action_names,
                        default='')
    parser.add_argument('-w', '--work-mem', dest='work_mem',
                        help='Set PostgreSQL work_mem for this connection', default=None)
    parser.add_argument('-l', '--log-dir', dest='log_dir',
                        help='Log file directory',
                        default=os.path.join(galaxy_root, 'scripts', 'cleanup_datasets'))
    self.args = parser.parse_args()

    self.args.sequence = [x.strip() for x in self.args.sequence.split(',')]
    if self.args.sequence == ['']:
        print("Error: At least one action must be specified in the action sequence\n")
        parser.print_help()
        sys.exit(0)
def main(): """ Datasets that are older than the specified cutoff and for which the tool_id contains the specified text will be marked as deleted in user's history and the user will be notified by email using the specified template file. """ parser = argparse.ArgumentParser() parser.add_argument('legacy_config', metavar='CONFIG', type=str, default=None, nargs='?', help='config file (legacy, use --config instead)') parser.add_argument("-d", "--days", dest="days", action="store", type=int, help="number of days (60)", default=60) parser.add_argument("--tool_id", default=None, help="Text to match against tool_id" "Default: match all") parser.add_argument("--template", default=None, help="Mako Template file to use as email " "Variables are 'cutoff' for the cutoff in days, " "'email' for users email and " "'datasets' which is a list of tuples " "containing 'dataset' and 'history' names. " "Default: admin_cleanup_deletion_template.txt") parser.add_argument("-i", "--info_only", action="store_true", dest="info_only", help="info about the requested action", default=False) parser.add_argument("-e", "--email_only", action="store_true", dest="email_only", help="Send emails only, don't delete", default=False) parser.add_argument("--smtp", default=None, help="SMTP Server to use to send email. " "Default: [read from galaxy ini file]") parser.add_argument("--fromaddr", default=None, help="From address to use to send email. " "Default: [read from galaxy ini file]") populate_config_args(parser) args = parser.parse_args() config_override = None if args.legacy_config: config_override = args.legacy_config app_properties = app_properties_from_args( args, legacy_config_override=config_override) if args.smtp is not None: app_properties['smtp_server'] = args.smtp if app_properties.get('smtp_server') is None: parser.error("SMTP Server must be specified as an option (--smtp) " "or in the config file (smtp_server)") if args.fromaddr is not None: app_properties['email_from'] = args.fromaddr if app_properties.get('email_from') is None: parser.error("From address must be specified as an option " "(--fromaddr) or in the config file " "(email_from)") scriptdir = os.path.dirname(os.path.abspath(__file__)) template_file = args.template if template_file is None: default_template = os.path.join(scriptdir, 'admin_cleanup_deletion_template.txt') sample_template_file = "%s.sample" % default_template if os.path.exists(default_template): template_file = default_template elif os.path.exists(sample_template_file): print("Copying %s to %s" % (sample_template_file, default_template)) shutil.copyfile(sample_template_file, default_template) template_file = default_template else: parser.error( "Default template (%s) or sample template (%s) not " "found, please specify template as an option " "(--template)." % default_template, sample_template_file) elif not os.path.exists(template_file): parser.error("Specified template file (%s) not found." 
% template_file) config = galaxy.config.Configuration(**app_properties) app = CleanupDatasetsApplication(config) cutoff_time = datetime.utcnow() - timedelta(days=args.days) now = strftime("%Y-%m-%d %H:%M:%S") print("##########################################") print("\n# %s - Handling stuff older than %i days" % (now, args.days)) if args.info_only: print("# Displaying info only ( --info_only )\n") elif args.email_only: print("# Sending emails only, not deleting ( --email_only )\n") administrative_delete_datasets(app, cutoff_time, args.days, tool_id=args.tool_id, template_file=template_file, config=config, email_only=args.email_only, info_only=args.info_only) app.shutdown() sys.exit(0)
sys.path.insert(1, os.path.abspath(os.path.join(os.path.dirname(__file__), os.pardir, 'lib')))

from galaxy.util.script import app_properties_from_args, populate_config_args
from galaxy.web.security import SecurityHelper

logging.basicConfig()
log = logging.getLogger(__name__)

parser = argparse.ArgumentParser()
parser.add_argument('action', metavar='ACTION', type=str, default=None, help='decode|encode')
parser.add_argument('value', metavar='VALUE', type=str, default=None, help='value to encode or decode')
populate_config_args(parser)
args = parser.parse_args()

app_properties = app_properties_from_args(args)
helper = SecurityHelper(id_secret=app_properties.get('id_secret'))

# We need the ID secret for configuring the security helper to decrypt
# galaxysession cookies.
if "id_secret" not in app_properties:
    log.warning('No ID_SECRET specified. Please set the "id_secret" in your galaxy.yml.')

id_secret = app_properties.get('id_secret', 'dangerous_default')
security_helper = SecurityHelper(id_secret=id_secret)

# And get access to the models
# Login manager to manage current_user functionality
def main(argv): parser = argparse.ArgumentParser( formatter_class=argparse.ArgumentDefaultsHelpFormatter) parser.add_argument('-r', '--report-directory', help='Directory to store reports in', default=os.path.abspath(os.path.join('.', 'reports'))) parser.add_argument('-g', '--grt-config', help='Path to GRT config file', default=default_config) parser.add_argument( "-l", "--loglevel", choices=['debug', 'info', 'warning', 'error', 'critical'], help="Set the logging level", default='warning') parser.add_argument("-b", "--batch-size", type=int, default=1000, help="Batch size for sql queries") parser.add_argument( "-m", "--max-records", type=int, default=5000000, help= "Maximum number of records to include in a single report. This option should ONLY be used when reporting historical data. Setting this may require running GRT multiple times to capture all historical logs." ) populate_config_args(parser) args = parser.parse_args() logging.getLogger().setLevel(getattr(logging, args.loglevel.upper())) _times = [] _start_time = time.time() def annotate(label, human_label=None): if human_label: logging.info(human_label) _times.append((label, time.time() - _start_time)) annotate('init_start', 'Loading GRT configuration...') try: with open(args.grt_config) as handle: config = yaml.safe_load(handle) except Exception: logging.info('Using default GRT configuration') with open(sample_config) as handle: config = yaml.safe_load(handle) annotate('init_end') REPORT_DIR = args.report_directory CHECK_POINT_FILE = os.path.join(REPORT_DIR, '.checkpoint') REPORT_IDENTIFIER = str(time.time()) REPORT_BASE = os.path.join(REPORT_DIR, REPORT_IDENTIFIER) if os.path.exists(CHECK_POINT_FILE): with open(CHECK_POINT_FILE, 'r') as handle: last_job_sent = int(handle.read()) else: last_job_sent = -1 annotate('galaxy_init', 'Loading Galaxy...') model, object_store, gxconfig = _init(args) # Galaxy overrides our logging level. logging.getLogger().setLevel(getattr(logging, args.loglevel.upper())) sa_session = model.context.current annotate('galaxy_end') # Fetch jobs COMPLETED with status OK that have not yet been sent. # Set up our arrays active_users = defaultdict(int) job_state_data = defaultdict(int) if not os.path.exists(REPORT_DIR): os.makedirs(REPORT_DIR) # Pick an end point so our queries can return uniform data. annotate('endpoint_start', 'Identifying a safe endpoint for SQL queries') end_job_id = sa_session.query(model.Job.id) \ .order_by(model.Job.id.desc()) \ .first()[0] # Allow users to only report N records at once. if args.max_records > 0: if end_job_id - last_job_sent > args.max_records: end_job_id = last_job_sent + args.max_records annotate('endpoint_end', 'Processing jobs (%s, %s]' % (last_job_sent, end_job_id)) # Remember the last job sent. if end_job_id == last_job_sent: logging.info("No new jobs to report") # So we can just quit now. sys.exit(0) # Unfortunately we have to keep this mapping for the sanitizer to work properly. 
job_tool_map = {} blacklisted_tools = config['sanitization']['tools'] annotate('export_jobs_start', 'Exporting Jobs') with io.open(REPORT_BASE + '.jobs.tsv', 'w', encoding='utf-8') as handle_job: handle_job.write(u'\t'.join(('id', 'tool_id', 'tool_version', 'state', 'create_time')) + '\n') for offset_start in range(last_job_sent, end_job_id, args.batch_size): logging.debug("Processing %s:%s", offset_start, min(end_job_id, offset_start + args.batch_size)) for job in sa_session.query(model.Job.id, model.Job.user_id, model.Job.tool_id, model.Job.tool_version, model.Job.state, model.Job.create_time) \ .filter(model.Job.id > offset_start) \ .filter(model.Job.id <= min(end_job_id, offset_start + args.batch_size)) \ .all(): # If the tool is blacklisted, exclude everywhere if job[2] in blacklisted_tools: continue try: line = [ str(job[0]), # id job[2], # tool_id job[3], # tool_version job[4], # state str(job[5]) # create_time ] cline = unicodify('\t'.join(line) + '\n') handle_job.write(cline) except Exception: logging.warning( "Unable to write out a 'handle_job' row. Ignoring the row.", exc_info=True) continue # meta counts job_state_data[job[4]] += 1 active_users[job[1]] += 1 job_tool_map[job[0]] = job[2] annotate('export_jobs_end') annotate('export_datasets_start', 'Exporting Datasets') with io.open(REPORT_BASE + '.datasets.tsv', 'w', encoding='utf-8') as handle_datasets: handle_datasets.write(u'\t'.join(('job_id', 'dataset_id', 'extension', 'file_size', 'param_name', 'type')) + '\n') for offset_start in range(last_job_sent, end_job_id, args.batch_size): logging.debug("Processing %s:%s", offset_start, min(end_job_id, offset_start + args.batch_size)) # four queries: JobToInputDatasetAssociation, JobToOutputDatasetAssociation, HistoryDatasetAssociation, Dataset job_to_input_hda_ids = sa_session.query(model.JobToInputDatasetAssociation.job_id, model.JobToInputDatasetAssociation.dataset_id, model.JobToInputDatasetAssociation.name) \ .filter(model.JobToInputDatasetAssociation.job_id > offset_start) \ .filter(model.JobToInputDatasetAssociation.job_id <= min(end_job_id, offset_start + args.batch_size)) \ .all() job_to_output_hda_ids = sa_session.query(model.JobToOutputDatasetAssociation.job_id, model.JobToOutputDatasetAssociation.dataset_id, model.JobToOutputDatasetAssociation.name) \ .filter(model.JobToOutputDatasetAssociation.job_id > offset_start) \ .filter(model.JobToOutputDatasetAssociation.job_id <= min(end_job_id, offset_start + args.batch_size)) \ .all() # add type and concat job_to_hda_ids = [[list(i), "input"] for i in job_to_input_hda_ids ] + [[list(i), "output"] for i in job_to_output_hda_ids] # put all of the hda_ids into a list hda_ids = [i[0][1] for i in job_to_hda_ids] hdas = sa_session.query(model.HistoryDatasetAssociation.id, model.HistoryDatasetAssociation.dataset_id, model.HistoryDatasetAssociation.extension) \ .filter(model.HistoryDatasetAssociation.id.in_(hda_ids)) \ .all() # put all the dataset ids into a list dataset_ids = [i[1] for i in hdas] # get the sizes of the datasets datasets = sa_session.query(model.Dataset.id, model.Dataset.total_size) \ .filter(model.Dataset.id.in_(dataset_ids)) \ .all() # datasets to dictionay for easy search hdas = {i[0]: i[1:] for i in hdas} datasets = {i[0]: i[1:] for i in datasets} for job_to_hda in job_to_hda_ids: job = job_to_hda[0] # job_id, hda_id, name filetype = job_to_hda[1] # input|output # No associated job if job[0] not in job_tool_map: continue # If the tool is blacklisted, exclude everywhere if job_tool_map[job[0]] in 
blacklisted_tools: continue hda_id = job[1] if hda_id is None: continue dataset_id = hdas[hda_id][0] if dataset_id is None: continue try: line = [ str(job[0]), # Job ID str(hda_id), # HDA ID str(hdas[hda_id][1]), # Extension round_to_2sd(datasets[dataset_id][0]), # File size job[2], # Parameter name str(filetype) # input/output ] cline = unicodify('\t'.join(line) + '\n') handle_datasets.write(cline) except Exception: logging.warning( "Unable to write out a 'handle_datasets' row. Ignoring the row.", exc_info=True) continue annotate('export_datasets_end') annotate('export_metric_num_start', 'Exporting Metrics (Numeric)') with io.open(REPORT_BASE + '.metric_num.tsv', 'w', encoding='utf-8') as handle_metric_num: handle_metric_num.write(u'\t'.join(('job_id', 'plugin', 'name', 'value')) + '\n') for offset_start in range(last_job_sent, end_job_id, args.batch_size): logging.debug("Processing %s:%s", offset_start, min(end_job_id, offset_start + args.batch_size)) for metric in sa_session.query(model.JobMetricNumeric.job_id, model.JobMetricNumeric.plugin, model.JobMetricNumeric.metric_name, model.JobMetricNumeric.metric_value) \ .filter(model.JobMetricNumeric.job_id > offset_start) \ .filter(model.JobMetricNumeric.job_id <= min(end_job_id, offset_start + args.batch_size)) \ .all(): # No associated job if metric[0] not in job_tool_map: continue # If the tool is blacklisted, exclude everywhere if job_tool_map[metric[0]] in blacklisted_tools: continue try: line = [ str(metric[0]), # job id metric[1], # plugin metric[2], # name str(metric[3]) # value ] cline = unicodify('\t'.join(line) + '\n') handle_metric_num.write(cline) except Exception: logging.warning( "Unable to write out a 'handle_metric_num' row. Ignoring the row.", exc_info=True) continue annotate('export_metric_num_end') # Now on to outputs. with tarfile.open(REPORT_BASE + '.tar.gz', 'w:gz') as handle: for name in ('jobs', 'metric_num', 'datasets'): path = REPORT_BASE + '.' + name + '.tsv' if os.path.exists(path): handle.add(path) for name in ('jobs', 'metric_num', 'datasets'): path = REPORT_BASE + '.' + name + '.tsv' if os.path.exists(path): os.unlink(REPORT_BASE + '.' + name + '.tsv') _times.append(('job_finish', time.time() - _start_time)) sha = hash_util.memory_bound_hexdigest(hash_func=hash_util.sha256, path=REPORT_BASE + ".tar.gz") _times.append(('hash_finish', time.time() - _start_time)) # Now serialize the individual report data. with open(REPORT_BASE + '.json', 'w') as handle: json.dump( { "version": 3, "galaxy_version": gxconfig.version_major, "generated": REPORT_IDENTIFIER, "report_hash": "sha256:" + sha, "metrics": { "_times": _times, }, "users": { "active": len(active_users.keys()), "total": sa_session.query(model.User.id).count(), }, "jobs": job_state_data, }, handle) # Write our checkpoint file so we know where to start next time. with open(CHECK_POINT_FILE, 'w') as handle: handle.write(str(end_job_id))
def main(): """ Managing library datasets is a bit complex, so here is a scenario that hopefully provides clarification. The complexities of handling library datasets is mostly contained in the delete_datasets() method in this script. Assume we have 1 library dataset with: LibraryDatasetDatasetAssociation -> LibraryDataset and Dataset At this point, we have the following database column values: LibraryDatasetDatasetAssociation deleted: False LibraryDataset deleted: False, purged: False Dataset deleted: False purged: False 1. A user deletes the assumed dataset above from a data library via a UI menu option. This action results in the following database column values (changes from previous step marked with *): LibraryDatasetDatasetAssociation deleted: False LibraryDataset deleted: True*, purged: False Dataset deleted: False, purged: False 2. After the number of days configured for the delete_datasets() method (option -6 below) have passed, execution of the delete_datasets() method results in the following database column values (changes from previous step marked with *): LibraryDatasetDatasetAssociation deleted: True* LibraryDataset deleted: True, purged: True* Dataset deleted: True*, purged: False 3. After the number of days configured for the purge_datasets() method (option -3 below) have passed, execution of the purge_datasets() method results in the following database column values (changes from previous step marked with *): LibraryDatasetDatasetAssociation deleted: True LibraryDataset deleted: True, purged: True Dataset deleted: True, purged: True* (dataset file removed from disk if -r flag is used) This scenario is about as simple as it gets. Keep in mind that a Dataset object can have many HistoryDatasetAssociations and many LibraryDatasetDatasetAssociations, and a LibraryDataset can have many LibraryDatasetDatasetAssociations. Another way of stating it is: LibraryDatasetDatasetAssociation objects map LibraryDataset objects to Dataset objects, and Dataset objects may be mapped to History objects via HistoryDatasetAssociation objects. """ parser = argparse.ArgumentParser() parser.add_argument('legacy_config', metavar='CONFIG', type=str, default=None, nargs='?', help='config file (legacy, use --config instead)') parser.add_argument("-d", "--days", dest="days", action="store", type=int, help="number of days (60)", default=60) parser.add_argument("-r", "--remove_from_disk", action="store_true", dest="remove_from_disk", help="remove datasets from disk when purged", default=False) parser.add_argument("-i", "--info_only", action="store_true", dest="info_only", help="info about the requested action", default=False) parser.add_argument( "-f", "--force_retry", action="store_true", dest="force_retry", help= "performs the requested actions, but ignores whether it might have been done before. 
Useful when -r wasn't used, but should have been", default=False) parser.add_argument("-1", "--delete_userless_histories", action="store_true", dest="delete_userless_histories", default=False, help="delete userless histories and datasets") parser.add_argument("-2", "--purge_histories", action="store_true", dest="purge_histories", default=False, help="purge deleted histories") parser.add_argument("-3", "--purge_datasets", action="store_true", dest="purge_datasets", default=False, help="purge deleted datasets") parser.add_argument("-4", "--purge_libraries", action="store_true", dest="purge_libraries", default=False, help="purge deleted libraries") parser.add_argument("-5", "--purge_folders", action="store_true", dest="purge_folders", default=False, help="purge deleted library folders") parser.add_argument( "-6", "--delete_datasets", action="store_true", dest="delete_datasets", default=False, help= "mark deletable datasets as deleted and purge associated dataset instances" ) populate_config_args(parser) args = parser.parse_args() config_override = None if args.legacy_config: config_override = args.legacy_config if not (args.purge_folders ^ args.delete_userless_histories ^ args.purge_libraries ^ args.purge_histories ^ args.purge_datasets ^ args.delete_datasets): parser.print_help() sys.exit(0) if args.remove_from_disk and args.info_only: parser.error("remove_from_disk and info_only are mutually exclusive") app_properties = app_properties_from_args( args, legacy_config_override=config_override) config = galaxy.config.Configuration(**app_properties) app = CleanupDatasetsApplication(config) cutoff_time = datetime.utcnow() - timedelta(days=args.days) now = strftime("%Y-%m-%d %H:%M:%S") log.info("##########################################") log.info("\n# %s - Handling stuff older than %i days" % (now, args.days)) if args.info_only: log.info("# Displaying info only ( --info_only )\n") elif args.remove_from_disk: log.info("Datasets will be removed from disk.\n") else: log.info("Datasets will NOT be removed from disk.\n") if args.delete_userless_histories: delete_userless_histories(app, cutoff_time, info_only=args.info_only, force_retry=args.force_retry) elif args.purge_histories: purge_histories(app, cutoff_time, args.remove_from_disk, info_only=args.info_only, force_retry=args.force_retry) elif args.purge_datasets: purge_datasets(app, cutoff_time, args.remove_from_disk, info_only=args.info_only, force_retry=args.force_retry) elif args.purge_libraries: purge_libraries(app, cutoff_time, args.remove_from_disk, info_only=args.info_only, force_retry=args.force_retry) elif args.purge_folders: purge_folders(app, cutoff_time, args.remove_from_disk, info_only=args.info_only, force_retry=args.force_retry) elif args.delete_datasets: delete_datasets(app, cutoff_time, args.remove_from_disk, info_only=args.info_only, force_retry=args.force_retry) app.shutdown() sys.exit(0)
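# Illustrative only -- not Galaxy model code. A toy walk-through of the flag
# transitions described in the docstring above, with plain dicts standing in
# for the LibraryDatasetDatasetAssociation / LibraryDataset / Dataset rows.
ldda = {"deleted": False}
library_dataset = {"deleted": False, "purged": False}
dataset = {"deleted": False, "purged": False}

# 1. A user deletes the library dataset via the UI.
library_dataset["deleted"] = True

# 2. delete_datasets() runs once its cutoff (-d days, action -6) has passed.
ldda["deleted"] = True
library_dataset["purged"] = True
dataset["deleted"] = True

# 3. purge_datasets() runs once its cutoff (action -3) has passed;
#    with -r the dataset file is also removed from disk.
dataset["purged"] = True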
def main(argv): parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter) parser.add_argument('-r', '--report-directory', help='Directory to store reports in', default=os.path.abspath(os.path.join('.', 'reports'))) parser.add_argument('-g', '--grt-config', help='Path to GRT config file', default=default_config) parser.add_argument("-l", "--loglevel", choices=['debug', 'info', 'warning', 'error', 'critical'], help="Set the logging level", default='warning') parser.add_argument("-b", "--batch-size", type=int, default=1000, help="Batch size for sql queries") parser.add_argument("-m", "--max-records", type=int, default=0, help="Maximum number of records to include in a single report. This option should ONLY be used when reporting historical data. Setting this may require running GRT multiple times to capture all historical logs.") populate_config_args(parser) args = parser.parse_args() logging.getLogger().setLevel(getattr(logging, args.loglevel.upper())) _times = [] _start_time = time.time() def annotate(label, human_label=None): if human_label: logging.info(human_label) _times.append((label, time.time() - _start_time)) annotate('init_start', 'Loading GRT configuration...') try: with open(args.grt_config) as handle: config = yaml.safe_load(handle) except Exception: logging.info('Using default GRT configuration') with open(sample_config) as handle: config = yaml.safe_load(handle) annotate('init_end') REPORT_DIR = args.report_directory CHECK_POINT_FILE = os.path.join(REPORT_DIR, '.checkpoint') REPORT_IDENTIFIER = str(time.time()) REPORT_BASE = os.path.join(REPORT_DIR, REPORT_IDENTIFIER) if os.path.exists(CHECK_POINT_FILE): with open(CHECK_POINT_FILE, 'r') as handle: last_job_sent = int(handle.read()) else: last_job_sent = -1 annotate('galaxy_init', 'Loading Galaxy...') model, object_store, gxconfig, app = _init(args, need_app=config['grt']['share_toolbox']) # Galaxy overrides our logging level. logging.getLogger().setLevel(getattr(logging, args.loglevel.upper())) sa_session = model.context.current annotate('galaxy_end') # Fetch jobs COMPLETED with status OK that have not yet been sent. # Set up our arrays active_users = defaultdict(int) job_state_data = defaultdict(int) annotate('san_init', 'Building Sanitizer') san = Sanitization(config['sanitization'], model, sa_session) annotate('san_end') if not os.path.exists(REPORT_DIR): os.makedirs(REPORT_DIR) # Pick an end point so our queries can return uniform data. annotate('endpoint_start', 'Identifying a safe endpoint for SQL queries') end_job_id = sa_session.query(model.Job.id) \ .order_by(model.Job.id.desc()) \ .first()[0] # Allow users to only report N records at once. if args.max_records > 0: if end_job_id - last_job_sent > args.max_records: end_job_id = last_job_sent + args.max_records annotate('endpoint_end', 'Processing jobs (%s, %s]' % (last_job_sent, end_job_id)) # Remember the last job sent. if end_job_id == last_job_sent: logging.info("No new jobs to report") # So we can just quit now. sys.exit(0) # Unfortunately we have to keep this mapping for the sanitizer to work properly. 
job_tool_map = {} blacklisted_tools = config['sanitization']['tools'] annotate('export_jobs_start', 'Exporting Jobs') handle_job = open(REPORT_BASE + '.jobs.tsv', 'w') handle_job.write('\t'.join(('id', 'tool_id', 'tool_version', 'state', 'create_time')) + '\n') for offset_start in range(last_job_sent, end_job_id, args.batch_size): logging.debug("Processing %s:%s", offset_start, min(end_job_id, offset_start + args.batch_size)) for job in sa_session.query(model.Job.id, model.Job.user_id, model.Job.tool_id, model.Job.tool_version, model.Job.state, model.Job.create_time) \ .filter(model.Job.id > offset_start) \ .filter(model.Job.id <= min(end_job_id, offset_start + args.batch_size)) \ .all(): # If the tool is blacklisted, exclude everywhere if job[2] in blacklisted_tools: continue try: handle_job.write(str(job[0])) # id handle_job.write('\t') handle_job.write(job[2]) # tool_id handle_job.write('\t') handle_job.write(job[3]) # tool_version handle_job.write('\t') handle_job.write(job[4]) # state handle_job.write('\t') handle_job.write(str(job[5])) # create_time handle_job.write('\n') except Exception: logging.warning("Unable to write out a 'handle_job' row. Ignoring the row.", exc_info=True) continue # meta counts job_state_data[job[4]] += 1 active_users[job[1]] += 1 job_tool_map[job[0]] = job[2] handle_job.close() annotate('export_jobs_end') annotate('export_datasets_start', 'Exporting Datasets') handle_datasets = open(REPORT_BASE + '.datasets.tsv', 'w') handle_datasets.write('\t'.join(('job_id', 'dataset_id', 'extension', 'file_size', 'param_name', 'type')) + '\n') for offset_start in range(last_job_sent, end_job_id, args.batch_size): logging.debug("Processing %s:%s", offset_start, min(end_job_id, offset_start + args.batch_size)) # four queries: JobToInputDatasetAssociation, JobToOutputDatasetAssociation, HistoryDatasetAssociation, Dataset job_to_input_hda_ids = sa_session.query(model.JobToInputDatasetAssociation.job_id, model.JobToInputDatasetAssociation.dataset_id, model.JobToInputDatasetAssociation.name) \ .filter(model.JobToInputDatasetAssociation.job_id > offset_start) \ .filter(model.JobToInputDatasetAssociation.job_id <= min(end_job_id, offset_start + args.batch_size)) \ .all() job_to_output_hda_ids = sa_session.query(model.JobToOutputDatasetAssociation.job_id, model.JobToOutputDatasetAssociation.dataset_id, model.JobToOutputDatasetAssociation.name) \ .filter(model.JobToOutputDatasetAssociation.job_id > offset_start) \ .filter(model.JobToOutputDatasetAssociation.job_id <= min(end_job_id, offset_start + args.batch_size)) \ .all() # add type and concat job_to_hda_ids = [[list(i), "input"] for i in job_to_input_hda_ids] + [[list(i), "output"] for i in job_to_output_hda_ids] # put all of the hda_ids into a list hda_ids = [i[0][1] for i in job_to_hda_ids] hdas = sa_session.query(model.HistoryDatasetAssociation.id, model.HistoryDatasetAssociation.dataset_id, model.HistoryDatasetAssociation.extension) \ .filter(model.HistoryDatasetAssociation.id.in_(hda_ids)) \ .all() # put all the dataset ids into a list dataset_ids = [i[1] for i in hdas] # get the sizes of the datasets datasets = sa_session.query(model.Dataset.id, model.Dataset.total_size) \ .filter(model.Dataset.id.in_(dataset_ids)) \ .all() # datasets to dictionay for easy search hdas = {i[0]: i[1:] for i in hdas} datasets = {i[0]: i[1:] for i in datasets} for job_to_hda in job_to_hda_ids: job = job_to_hda[0] # job_id, hda_id, name filetype = job_to_hda[1] # input|output # No associated job if job[0] not in job_tool_map: continue # If the 
tool is blacklisted, exclude everywhere if job_tool_map[job[0]] in blacklisted_tools: continue hda_id = job[1] if hda_id is None: continue dataset_id = hdas[hda_id][0] if dataset_id is None: continue try: handle_datasets.write(str(job[0])) handle_datasets.write('\t') handle_datasets.write(str(hda_id)) handle_datasets.write('\t') handle_datasets.write(str(hdas[hda_id][1])) handle_datasets.write('\t') handle_datasets.write(str(datasets[dataset_id][0])) handle_datasets.write('\t') handle_datasets.write(str(job[2])) handle_datasets.write('\t') handle_datasets.write(str(filetype)) handle_datasets.write('\n') except Exception: logging.warning("Unable to write out a 'handle_datasets' row. Ignoring the row.", exc_info=True) continue handle_datasets.close() annotate('export_datasets_end') annotate('export_metric_num_start', 'Exporting Metrics (Numeric)') handle_metric_num = open(REPORT_BASE + '.metric_num.tsv', 'w') handle_metric_num.write('\t'.join(('job_id', 'plugin', 'name', 'value')) + '\n') for offset_start in range(last_job_sent, end_job_id, args.batch_size): logging.debug("Processing %s:%s", offset_start, min(end_job_id, offset_start + args.batch_size)) for metric in sa_session.query(model.JobMetricNumeric.job_id, model.JobMetricNumeric.plugin, model.JobMetricNumeric.metric_name, model.JobMetricNumeric.metric_value) \ .filter(model.JobMetricNumeric.job_id > offset_start) \ .filter(model.JobMetricNumeric.job_id <= min(end_job_id, offset_start + args.batch_size)) \ .all(): # No associated job if metric[0] not in job_tool_map: continue # If the tool is blacklisted, exclude everywhere if job_tool_map[metric[0]] in blacklisted_tools: continue try: handle_metric_num.write(str(metric[0])) handle_metric_num.write('\t') handle_metric_num.write(metric[1]) handle_metric_num.write('\t') handle_metric_num.write(metric[2]) handle_metric_num.write('\t') handle_metric_num.write(str(metric[3])) handle_metric_num.write('\n') except Exception: logging.warning("Unable to write out a 'handle_metric_num' row. Ignoring the row.", exc_info=True) continue handle_metric_num.close() annotate('export_metric_num_end') annotate('export_params_start', 'Export Job Parameters') handle_params = open(REPORT_BASE + '.params.tsv', 'w') handle_params.write('\t'.join(('job_id', 'name', 'value')) + '\n') for offset_start in range(last_job_sent, end_job_id, args.batch_size): logging.debug("Processing %s:%s", offset_start, min(end_job_id, offset_start + args.batch_size)) for param in sa_session.query(model.JobParameter.job_id, model.JobParameter.name, model.JobParameter.value) \ .filter(model.JobParameter.job_id > offset_start) \ .filter(model.JobParameter.job_id <= min(end_job_id, offset_start + args.batch_size)) \ .all(): # No associated job if param[0] not in job_tool_map: continue # If the tool is blacklisted, exclude everywhere if job_tool_map[param[0]] in blacklisted_tools: continue try: sanitized = san.sanitize_data(job_tool_map[param[0]], param[1], param[2]) handle_params.write(str(param[0])) handle_params.write('\t') handle_params.write(param[1]) handle_params.write('\t') handle_params.write(json.dumps(sanitized)) handle_params.write('\n') except Exception: logging.warning("Unable to write out a 'handle_params' row. Ignoring the row.", exc_info=True) continue handle_params.close() annotate('export_params_end') # Now on to outputs. with tarfile.open(REPORT_BASE + '.tar.gz', 'w:gz') as handle: for name in ('jobs', 'metric_num', 'params', 'datasets'): handle.add(REPORT_BASE + '.' 
+ name + '.tsv') for name in ('jobs', 'metric_num', 'params', 'datasets'): os.unlink(REPORT_BASE + '.' + name + '.tsv') _times.append(('job_finish', time.time() - _start_time)) sha = hash_util.memory_bound_hexdigest(hash_util.sha256, REPORT_BASE + ".tar.gz") _times.append(('hash_finish', time.time() - _start_time)) # Now serialize the individual report data. with open(REPORT_BASE + '.json', 'w') as handle: if config['grt']['share_toolbox']: toolbox = [ (tool.id, tool.name, tool.version, tool.tool_shed, tool.repository_id, tool.repository_name) for tool_id, tool in app.toolbox._tools_by_id.items() ] else: toolbox = None json.dump({ "version": 1, "galaxy_version": gxconfig.version_major, "generated": REPORT_IDENTIFIER, "report_hash": "sha256:" + sha, "metrics": { "_times": _times, }, "users": { "active": len(active_users.keys()), "total": sa_session.query(model.User.id).count(), }, "jobs": job_state_data, "tools": toolbox }, handle) # Write our checkpoint file so we know where to start next time. with open(CHECK_POINT_FILE, 'w') as handle: handle.write(str(end_job_id))
def main(): """ Managing library datasets is a bit complex, so here is a scenario that hopefully provides clarification. The complexities of handling library datasets is mostly contained in the delete_datasets() method in this script. Assume we have 1 library dataset with: LibraryDatasetDatasetAssociation -> LibraryDataset and Dataset At this point, we have the following database column values: LibraryDatasetDatasetAssociation deleted: False LibraryDataset deleted: False, purged: False Dataset deleted: False purged: False 1. A user deletes the assumed dataset above from a data library via a UI menu option. This action results in the following database column values (changes from previous step marked with *): LibraryDatasetDatasetAssociation deleted: False LibraryDataset deleted: True*, purged: False Dataset deleted: False, purged: False 2. After the number of days configured for the delete_datasets() method (option -6 below) have passed, execution of the delete_datasets() method results in the following database column values (changes from previous step marked with *): LibraryDatasetDatasetAssociation deleted: True* LibraryDataset deleted: True, purged: True* Dataset deleted: True*, purged: False 3. After the number of days configured for the purge_datasets() method (option -3 below) have passed, execution of the purge_datasets() method results in the following database column values (changes from previous step marked with *): LibraryDatasetDatasetAssociation deleted: True LibraryDataset deleted: True, purged: True Dataset deleted: True, purged: True* (dataset file removed from disk if -r flag is used) This scenario is about as simple as it gets. Keep in mind that a Dataset object can have many HistoryDatasetAssociations and many LibraryDatasetDatasetAssociations, and a LibraryDataset can have many LibraryDatasetDatasetAssociations. Another way of stating it is: LibraryDatasetDatasetAssociation objects map LibraryDataset objects to Dataset objects, and Dataset objects may be mapped to History objects via HistoryDatasetAssociation objects. """ parser = argparse.ArgumentParser() parser.add_argument('legacy_config', metavar='CONFIG', type=str, default=None, nargs='?', help='config file (legacy, use --config instead)') parser.add_argument("-d", "--days", dest="days", action="store", type=int, help="number of days (60)", default=60) parser.add_argument("-r", "--remove_from_disk", action="store_true", dest="remove_from_disk", help="remove datasets from disk when purged", default=False) parser.add_argument("-i", "--info_only", action="store_true", dest="info_only", help="info about the requested action", default=False) parser.add_argument("-f", "--force_retry", action="store_true", dest="force_retry", help="performs the requested actions, but ignores whether it might have been done before. 
Useful when -r wasn't used, but should have been", default=False) parser.add_argument("-1", "--delete_userless_histories", action="store_true", dest="delete_userless_histories", default=False, help="delete userless histories and datasets") parser.add_argument("-2", "--purge_histories", action="store_true", dest="purge_histories", default=False, help="purge deleted histories") parser.add_argument("-3", "--purge_datasets", action="store_true", dest="purge_datasets", default=False, help="purge deleted datasets") parser.add_argument("-4", "--purge_libraries", action="store_true", dest="purge_libraries", default=False, help="purge deleted libraries") parser.add_argument("-5", "--purge_folders", action="store_true", dest="purge_folders", default=False, help="purge deleted library folders") parser.add_argument("-6", "--delete_datasets", action="store_true", dest="delete_datasets", default=False, help="mark deletable datasets as deleted and purge associated dataset instances") populate_config_args(parser) args = parser.parse_args() config_override = None if args.legacy_config: config_override = args.legacy_config if not (args.purge_folders ^ args.delete_userless_histories ^ args.purge_libraries ^ args.purge_histories ^ args.purge_datasets ^ args.delete_datasets): parser.print_help() sys.exit(0) if args.remove_from_disk and args.info_only: parser.error("remove_from_disk and info_only are mutually exclusive") app_properties = app_properties_from_args(args, legacy_config_override=config_override) config = galaxy.config.Configuration(**app_properties) app = CleanupDatasetsApplication(config) cutoff_time = datetime.utcnow() - timedelta(days=args.days) now = strftime("%Y-%m-%d %H:%M:%S") print("##########################################") print("\n# %s - Handling stuff older than %i days" % (now, args.days)) if args.info_only: print("# Displaying info only ( --info_only )\n") elif args.remove_from_disk: print("Datasets will be removed from disk.\n") else: print("Datasets will NOT be removed from disk.\n") if args.delete_userless_histories: delete_userless_histories(app, cutoff_time, info_only=args.info_only, force_retry=args.force_retry) elif args.purge_histories: purge_histories(app, cutoff_time, args.remove_from_disk, info_only=args.info_only, force_retry=args.force_retry) elif args.purge_datasets: purge_datasets(app, cutoff_time, args.remove_from_disk, info_only=args.info_only, force_retry=args.force_retry) elif args.purge_libraries: purge_libraries(app, cutoff_time, args.remove_from_disk, info_only=args.info_only, force_retry=args.force_retry) elif args.purge_folders: purge_folders(app, cutoff_time, args.remove_from_disk, info_only=args.info_only, force_retry=args.force_retry) elif args.delete_datasets: delete_datasets(app, cutoff_time, args.remove_from_disk, info_only=args.info_only, force_retry=args.force_retry) app.shutdown() sys.exit(0)