def add_workflows(args, other_args, subparser=None):
    """
    Add GeneFlow workflows to database.

    Args:
        args.workflow_yaml: GeneFlow definition with workflows.
        args.config: GeneFlow config file path.
        args.environment: Config environment.

    Returns:
        On success: True.
        On failure: False.

    """
    workflow_yaml = args.workflow_yaml
    config = args.config
    environment = args.environment

    # load config file
    cfg = Config()
    if not cfg.load(config):
        Log.an().error('cannot load config file: %s', config)
        return False

    config_dict = cfg.config(environment)
    if not config_dict:
        Log.an().error('invalid config environment: %s', environment)
        return False

    # connect to data source
    try:
        data_source = DataSource(config_dict['database'])
    except DataSourceException as err:
        Log.an().error('data source initialization error [%s]', str(err))
        return False

    # import workflow
    defs = data_source.import_workflows_from_def(workflow_yaml)
    if not defs:
        Log.an().error('workflow definition load failed: %s', workflow_yaml)
        return False

    data_source.commit()

    # display new IDs
    for workflow in defs:
        Log.some().info('workflow loaded: %s -> %s', workflow, defs[workflow])

    return True
def add_apps(args):
    """
    Add GeneFlow apps to database.

    Args:
        args.app_yaml: GeneFlow definition with apps.
        args.config_file: GeneFlow config file path.
        args.environment: Config environment.

    Returns:
        On success: True.
        On failure: False.

    """
    app_yaml = args.app_yaml
    config_file = args.config_file
    environment = args.environment

    # load config file
    cfg = Config()
    if not cfg.load(config_file):
        Log.an().error('cannot load config file: %s', config_file)
        return False

    config_dict = cfg.config(environment)
    if not config_dict:
        Log.an().error('invalid config environment: %s', environment)
        return False

    # connect to data source
    try:
        data_source = DataSource(config_dict['database'])
    except DataSourceException as err:
        Log.an().error('data source initialization error [%s]', str(err))
        return False

    # import apps
    defs = data_source.import_apps_from_def(app_yaml)
    if not defs:
        Log.an().error('app definition load failed: %s', app_yaml)
        return False

    data_source.commit()

    # display new IDs
    for app in defs:
        Log.some().info('app loaded: %s -> %s', app, defs[app])

    return True
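# Illustrative sketch (assumption, not part of the GeneFlow CLI): add_workflows()
# and add_apps() read their options as attributes of an argparse namespace, so a
# caller could also drive them programmatically with a hand-built Namespace. The
# file names below are placeholders.
def _example_add_workflows_call():
    """Hypothetical programmatic call to add_workflows()."""
    from argparse import Namespace

    args = Namespace(
        workflow_yaml='workflow.yaml',  # placeholder workflow definition path
        config='geneflow.cfg',          # placeholder config file path
        environment='local'             # config environment to load
    )
    # other_args and subparser are not used by add_workflows, so pass empty values
    return add_workflows(args, other_args=[], subparser=None)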
def _load_apps(self):
    """
    Load and validate app definitions from the database.

    Args:
        self: class instance

    Returns:
        On success: True.
        On failure: False.

    """
    try:
        data_source = DataSource(self._config['database'])
    except DataSourceException as err:
        msg = 'data source initialization error [{}]'.format(str(err))
        Log.an().error(msg)
        return self._fatal(msg)

    self._apps = data_source.get_app_defs_by_workflow_id(
        self._job['workflow_id']
    )
    if self._apps is False:
        msg = 'cannot load apps from data source: workflow_id={}'.format(
            self._job['workflow_id']
        )
        Log.an().error(msg)
        return self._fatal(msg)

    if not self._apps:
        msg = 'no apps found for workflow: workflow_id={}'.format(
            self._job['workflow_id']
        )
        Log.an().error(msg)
        return self._fatal(msg)

    # validate the app definitions
    for app in self._apps:
        valid_def = Definition.validate_app(self._apps[app])
        if valid_def is False:
            msg = 'invalid app definition:\n{}'.format(
                yaml.dump(self._apps[app])
            )
            Log.an().error(msg)
            return self._fatal(msg)

        self._apps[app] = valid_def

    return True
def _update_status_db(self, status, msg):
    """
    Update the status of the step, and the status record in the database.

    Args:
        status: new step status.
        msg: message associated with step status.

    Returns:
        On success: True.
        On failure: False.

    """
    try:
        data_source = DataSource(self._config['database'])
    except DataSourceException as err:
        msg = 'data source initialization error [{}]'.format(str(err))
        Log.an().error(msg)
        return False

    self._status = status

    detail = self._serialize_detail()

    if not data_source.update_job_step_status(
            self._step['step_id'],
            self._job['job_id'],
            self._status,
            json.dumps(detail),
            msg
    ):
        Log.an().warning('cannot update job status in data source')
        data_source.rollback()

    data_source.commit()

    return True
def _load_workflow(self):
    """
    Load and validate workflow definition from the database.

    Args:
        self: class instance

    Returns:
        On success: True.
        On failure: False.

    """
    try:
        data_source = DataSource(self._config['database'])
    except DataSourceException as err:
        msg = 'data source initialization error [{}]'.format(str(err))
        Log.an().error(msg)
        return self._fatal(msg)

    self._workflow = data_source.get_workflow_def_by_id(
        self._job['workflow_id']
    )
    if self._workflow is False:
        msg = 'cannot load workflow from data source: workflow_id={}'.format(
            self._job['workflow_id']
        )
        Log.an().error(msg)
        return self._fatal(msg)

    if not self._workflow:
        msg = 'workflow not found: workflow_id={}'.format(
            self._job['workflow_id']
        )
        Log.an().error(msg)
        return self._fatal(msg)

    # validate the workflow definition
    valid_def = Definition.validate_workflow(self._workflow)
    if valid_def is False:
        msg = 'invalid workflow definition:\n{}'.format(
            yaml.dump(self._workflow)
        )
        Log.an().error(msg)
        return self._fatal(msg)

    self._workflow = valid_def

    return True
def _load_job(self):
    """
    Load and validate job definition from the database.

    Args:
        self: class instance

    Returns:
        On success: True.
        On failure: False.

    """
    try:
        data_source = DataSource(self._config['database'])
    except DataSourceException as err:
        msg = 'data source initialization error [{}]'.format(str(err))
        Log.an().error(msg)
        return self._fatal(msg)

    self._job = data_source.get_job_def_by_id(self._job_id)
    if self._job is False:
        msg = 'cannot load job from data source: job_id={}'.format(
            self._job_id
        )
        Log.an().error(msg)
        return self._fatal(msg)

    if not self._job:
        msg = 'job not found: job_id={}'.format(self._job_id)
        Log.an().error(msg)
        return self._fatal(msg)

    # validate the job definition
    valid_def = Definition.validate_job(self._job)
    if valid_def is False:
        msg = 'invalid job definition:\n{}'.format(yaml.dump(self._job))
        Log.an().error(msg)
        return self._fatal(msg)

    self._job = valid_def

    return True
def clear_database(context):
    """Remove all workflows and apps from the test database."""
    try:
        gfdb = DataSource(context.geneflow_config['database'])
    except Exception as err:
        assert False, 'data source initialization error [{}]'.format(str(err))

    # delete all workflows from database
    workflows = gfdb.get_workflows()
    assert workflows is not False
    for workflow in workflows:
        assert gfdb.delete_workflow_by_id(workflow['id'])
    gfdb.commit()

    # delete all apps from database
    apps = gfdb.get_apps()
    assert apps is not False
    for app in apps:
        assert gfdb.delete_app_by_id(app['id'])
    gfdb.commit()
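# Illustrative sketch (assumption): clear_database() takes a test context object
# exposing a geneflow_config dict, matching the behave-style fixtures suggested by
# the assert-based test code above. A minimal stand-in context for calling it
# directly might look like this; the database settings are placeholders.
def _example_clear_database_call():
    """Hypothetical direct call to clear_database() with a stub context."""
    from types import SimpleNamespace

    context = SimpleNamespace(
        geneflow_config={
            # placeholder settings; real keys depend on the GeneFlow
            # database configuration (e.g. a SQLite path for local runs)
            'database': {'type': 'sqlite', 'path': 'test_geneflow.db'}
        }
    )
    clear_database(context)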
def _update_status_db(self, status, msg):
    """
    Update workflow status in DB.

    Args:
        self: class instance
        status: Workflow status
        msg: Success, error or warning message

    Returns:
        On success: True.
        On failure: False.

    """
    try:
        data_source = DataSource(self._config['database'])
    except DataSourceException as err:
        msg = 'data source initialization error [{}]'.format(str(err))
        Log.an().error(msg)
        return False

    # set start time (if started, or errored immediately)
    if (
            status in ['RUNNING', 'ERROR']
            and self._status == 'PENDING'
    ):
        if not data_source.set_job_started(self._job_id):
            Log.a().warning('cannot set job start time in data source')
            data_source.rollback()

    # set finished time (even on error)
    if status in ['FINISHED', 'ERROR']:
        if not data_source.set_job_finished(self._job_id):
            Log.a().warning('cannot set job finish time in data source')
            data_source.rollback()

    # if state change, contact notification endpoint
    if status != self._status:
        if self._job['notifications']:
            self._send_notifications(status)

    # update database
    self._status = status
    if not data_source.update_job_status(self._job_id, status, msg):
        Log.a().warning('cannot update job status in data source')
        data_source.rollback()

    data_source.commit()

    return True
def run(args, other_args, subparser):
    """
    Run GeneFlow workflow engine.

    Args:
        args.workflow_path: workflow definition or package directory.
        args.job: path to job definition

    Returns:
        On success: True.
        On failure: False.

    """
    # get absolute path to workflow
    workflow_path = resolve_workflow_path(args.workflow_path)
    if workflow_path:
        Log.some().info('workflow definition found: %s', workflow_path)
    else:
        Log.an().error(
            'cannot find workflow definition: %s', args.workflow_path
        )
        return False

    # setup environment
    env = Environment(workflow_path=workflow_path)
    if not env.initialize():
        Log.an().error('cannot initialize geneflow environment')
        return False

    # create default config file and SQLite db
    cfg = Config()
    cfg.default(env.get_sqlite_db_path())
    cfg.write(env.get_config_path())
    config_dict = cfg.config('local')

    # load workflow into db
    try:
        data_source = DataSource(config_dict['database'])
    except DataSourceException as err:
        Log.an().error('data source initialization error [%s]', str(err))
        return False

    defs = data_source.import_definition(workflow_path)
    if not defs:
        Log.an().error('workflow definition load failed: %s', workflow_path)
        return False

    if not defs['workflows']:
        Log.an().error('workflow definition load failed: %s', workflow_path)
        return False

    data_source.commit()

    for workflow in defs['workflows']:
        Log.some().info(
            'workflow loaded: %s -> %s',
            workflow, defs['workflows'][workflow]
        )

    # get workflow definition back from database to ensure
    # that it's a valid definition
    workflow_id = next(iter(defs['workflows'].values()))
    workflow_dict = data_source.get_workflow_def_by_id(workflow_id)
    if not workflow_dict:
        Log.an().error(
            'cannot get workflow definition from data source: workflow_id=%s',
            workflow_id
        )
        return False

    ### define arg parsing methods
    def parse_dynamic_args(workflow_dict):
        """
        Parse dynamic args based on workflow dictionary as well as
        some static args.

        Args:
            other_args: List of remaining args from initial parse of
                workflow path.
            workflow_dict: Workflow dictionary

        Returns:
            On success: List of parsed arguments.
            On failure: False.

        """
        # parse dynamic args. these are determined from workflow definition
        dynamic_parser = argparse.ArgumentParser()
        dynamic_parser.add_argument(
            '-j', '--job',
            type=str,
            default=None,
            dest='job_path',
            help='Job Definition(s)'
        )
        for input_key in workflow_dict['inputs']:
            dynamic_parser.add_argument(
                '--in.{}'.format(input_key),
                dest='inputs.{}'.format(input_key),
                required=False,
                default=workflow_dict['inputs'][input_key]['default'],
                help=workflow_dict['inputs'][input_key]['label']
            )
        for param_key in workflow_dict['parameters']:
            dynamic_parser.add_argument(
                '--param.{}'.format(param_key),
                dest='parameters.{}'.format(param_key),
                required=False,
                default=workflow_dict['parameters'][param_key]['default'],
                help=workflow_dict['parameters'][param_key]['label']
            )
        dynamic_parser.add_argument(
            '-o', '--output',
            type=str,
            default='~/geneflow-output',
            help='Output Folder'
        )
        dynamic_parser.add_argument(
            '-n', '--name',
            type=str,
            default='geneflow-job',
            help='Name of Job'
        )
        dynamic_parser.add_argument(
            '-w', '--work',
            nargs='+',
            type=str,
            default=[],
            help='Work Directory'
        )
        dynamic_parser.add_argument(
            '--exec-context', '--ec',
            nargs='+',
            type=str,
            dest='exec_context',
            default=[],
            help='Execution Contexts'
        )
        dynamic_parser.add_argument(
            '--exec-method', '--em',
            nargs='+',
            type=str,
            dest='exec_method',
            default=[],
            help='Execution Methods'
        )
        dynamic_parser.add_argument(
            '--exec-param', '--ep',
            nargs='+',
            type=str,
            dest='exec_param',
            default=[],
            help='Execution Parameters'
        )

        dynamic_args = dynamic_parser.parse_known_args(other_args)

        return dynamic_args[0]

    if 'gooey' in sys.modules:
        @Gooey(
            program_name='GeneFlow: {}'.format(workflow_dict['name']),
            program_description=workflow_dict['description'],
            target='gf --log-level={} run {}'.format(
                args.log_level, args.workflow_path
            ),
            monospace_display=True
        )
        def parse_dynamic_args_gui(workflow_dict):
            """
            Parse dynamic args based on workflow dictionary as well as
            some static args. Display a GUI interface.

            Args:
                other_args: List of remaining args from initial parse of
                    workflow path.
                workflow_dict: Workflow dictionary

            Returns:
                On success: List of parsed arguments.
                On failure: False.

            """
            # parse dynamic args. these are determined from workflow definition
            dynamic_parser = GooeyParser()
            input_group = dynamic_parser.add_argument_group(
                "Workflow Inputs",
                "Files or folders to be passed to the workflow"
            )
            for input_key in workflow_dict['inputs']:
                widget = 'FileChooser'
                if workflow_dict['inputs'][input_key]['type'] == 'Directory':
                    widget = 'DirChooser'
                input_group.add_argument(
                    '--in.{}'.format(input_key),
                    dest='inputs.{}'.format(input_key),
                    required=False,
                    default=workflow_dict['inputs'][input_key]['default'],
                    help=workflow_dict['inputs'][input_key]['label'],
                    widget=widget
                )
            param_group = dynamic_parser.add_argument_group(
                "Workflow Parameters",
                "Number or string parameters to be passed to the workflow"
            )
            for param_key in workflow_dict['parameters']:
                param_group.add_argument(
                    '--param.{}'.format(param_key),
                    dest='parameters.{}'.format(param_key),
                    required=False,
                    default=workflow_dict['parameters'][param_key]['default'],
                    help=workflow_dict['parameters'][param_key]['label']
                )
            job_group = dynamic_parser.add_argument_group(
                "Job Options",
                "Output/intermediate folders and job name"
            )
            job_group.add_argument(
                '-o', '--output',
                type=str,
                default='~/geneflow-output',
                help='Output Folder',
                widget='DirChooser'
            )
            job_group.add_argument(
                '-n', '--name',
                type=str,
                default='geneflow-job',
                help='Name of Job'
            )
            job_group.add_argument(
                '-w', '--work',
                nargs='+',
                type=str,
                default=[],
                help='Work Directory'
            )
            exec_group = dynamic_parser.add_argument_group(
                "Execution Options",
                "Customize workflow execution"
            )
            exec_group.add_argument(
                '--exec-context', '--ec',
                nargs='+',
                type=str,
                dest='exec_context',
                default=[],
                help='Execution Contexts'
            )
            exec_group.add_argument(
                '--exec-method', '--em',
                nargs='+',
                type=str,
                dest='exec_method',
                default=[],
                help='Execution Methods'
            )
            exec_group.add_argument(
                '--exec-param', '--ep',
                nargs='+',
                type=str,
                dest='exec_param',
                default=[],
                help='Execution Parameters'
            )

            dynamic_args = dynamic_parser.parse_args(other_args)

            return dynamic_args

    # get dynamic args
    if args.gui and 'gooey' in sys.modules:
        dynamic_args = parse_dynamic_args_gui(workflow_dict)
    else:
        dynamic_args = parse_dynamic_args(workflow_dict)

    # get absolute path to job file if provided
    job_path = None
    if dynamic_args.job_path:
        job_path = Path(dynamic_args.job_path).absolute()

    # load job definition if provided
    jobs_dict = {}
    gf_def = Definition()
    if job_path:
        if not gf_def.load(job_path):
            Log.an().error('Job definition load failed')
            return False
        jobs_dict = gf_def.jobs()
    else:
        # create default definition
        jobs_dict = {
            'job': {
                'name': 'GeneFlow job',
                'output_uri': 'geneflow_output',
                'work_uri': {
                    'local': '~/.geneflow/work'
                }
            }
        }

    # override with known cli parameters
    apply_job_modifiers(
        jobs_dict,
        [
            'name={}'.format(dynamic_args.name),
            'output_uri={}'.format(dynamic_args.output)
        ]
    )

    # insert workflow name into job, if not provided
    workflow_name = next(iter(defs['workflows']))
    for job in jobs_dict.values():
        if 'workflow_name' not in job:
            job['workflow_name'] = workflow_name

    # add inputs and parameters to job definition
    apply_job_modifiers(
        jobs_dict,
        [
            '{}={}'.format(dynamic_arg, getattr(dynamic_args, dynamic_arg))
            for dynamic_arg in vars(dynamic_args)
            if dynamic_arg.startswith('inputs.')
            or dynamic_arg.startswith('parameters.')
        ]
    )

    # add work URIs to job definition
    work_uris = {}
    for work_arg in dynamic_args.work:
        parsed_work_uri = URIParser.parse(work_arg)
        if not parsed_work_uri:
            # skip if invalid URI
            Log.a().warning('invalid work uri: %s', work_arg)
        else:
            work_uris[parsed_work_uri['scheme']] \
                = parsed_work_uri['chopped_uri']

    apply_job_modifiers(
        jobs_dict,
        [
            'work_uri.{}={}'.format(context, work_uris[context])
            for context in work_uris
        ]
    )

    # add execution options to job definition
    apply_job_modifiers(
        jobs_dict,
        [
            'execution.context.{}={}'.format(*exec_arg.split(':', 1)[0:2])
            for exec_arg in dynamic_args.exec_context
        ] + [
            'execution.method.{}={}'.format(*exec_arg.split(':', 1)[0:2])
            for exec_arg in dynamic_args.exec_method
        ] + [
            'execution.parameters.{}={}'.format(*exec_arg.split(':', 1)[0:2])
            for exec_arg in dynamic_args.exec_param
        ]
    )

    # get default values from workflow definition
    for job in jobs_dict.values():
        if 'inputs' not in job:
            job['inputs'] = {}
        if 'parameters' not in job:
            job['parameters'] = {}
        for input_key in workflow_dict['inputs']:
            if input_key not in job['inputs']:
                job['inputs'][input_key] \
                    = workflow_dict['inputs'][input_key]['default']
        for param_key in workflow_dict['parameters']:
            if param_key not in job['parameters']:
                job['parameters'][param_key] \
                    = workflow_dict['parameters'][param_key]['default']

    # expand URIs
    for job in jobs_dict.values():
        # output URI
        parsed_uri = URIParser.parse(job['output_uri'])
        if not parsed_uri:
            Log.an().error('invalid output uri: %s', job['output_uri'])
            return False
        # expand relative path if local
        if parsed_uri['scheme'] == 'local':
            job['output_uri'] = str(
                Path(parsed_uri['chopped_path']).expanduser().resolve()
            )
        # work URIs
        for context in job['work_uri']:
            parsed_uri = URIParser.parse(job['work_uri'][context])
            if not parsed_uri:
                Log.an().error('invalid work uri: %s', job['work_uri'])
                return False
            # expand relative path if local
            if parsed_uri['scheme'] == 'local':
                job['work_uri'][context] = str(
                    Path(parsed_uri['chopped_path']).expanduser().resolve()
                )
        # input URIs
        for input_key in job['inputs']:
            parsed_uri = URIParser.parse(job['inputs'][input_key])
            if not parsed_uri:
                Log.an().error(
                    'invalid input uri: %s', job['inputs'][input_key]
                )
                return False
            # expand relative path if local
            if parsed_uri['scheme'] == 'local':
                job['inputs'][input_key] = str(
                    Path(parsed_uri['chopped_path']).expanduser().resolve()
                )

    # import jobs into database
    job_ids = data_source.import_jobs_from_dict(jobs_dict)
    if job_ids is False:
        Log.an().error('cannot import jobs')
        return False

    data_source.commit()

    # create process pool to run workflows in parallel
    pool = Pool(min(5, len(job_ids)))
    jobs = [
        {
            'name': job,
            'id': job_ids[job],
            'log': None
        } for job in job_ids
    ]

    result = pool.map(
        partial(
            geneflow.cli.common.run_workflow,
            config=config_dict,
            log_level=args.log_level
        ),
        jobs
    )

    pool.close()
    pool.join()

    if not all(result):
        Log.an().error('some jobs failed')

    return result
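# Illustrative sketch (assumption, for clarity only): run() above translates CLI
# options into flat 'dotted.key=value' strings before handing them to
# apply_job_modifiers(). For a hypothetical workflow with an input named 'reads'
# and a parameter named 'threads', the generated modifier list would look like
# the one below; all values are placeholders.
def _example_job_modifiers():
    """Return a hypothetical modifier list in the format run() builds."""
    return [
        'name=my-job',                      # from -n/--name
        'output_uri=~/geneflow-output',     # from -o/--output
        'inputs.reads=/data/reads.fastq',   # from --in.reads
        'parameters.threads=4',             # from --param.threads
        'work_uri.local=/scratch/gf-work',  # from -w/--work
        'execution.context.align=local'     # from --exec-context align:local
    ]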
def run(args):
    """
    Run GeneFlow workflow engine.

    Args:
        args.workflow: workflow definition or package directory.
        args.job_yaml: job definition.

    Returns:
        On success: True.
        On failure: False.

    """
    # get absolute path to workflow
    workflow_yaml = resolve_workflow_path(args.workflow)
    if workflow_yaml:
        Log.some().info('workflow definition found: %s', workflow_yaml)
    else:
        Log.an().error('cannot find workflow definition: %s', args.workflow)
        return False

    # get absolute path to job file if provided
    job_yaml = None
    if args.job_yaml:
        job_yaml = Path(args.job_yaml).absolute()

    # setup environment
    env = Environment(workflow_path=workflow_yaml)
    if not env.initialize():
        Log.an().error('cannot initialize geneflow environment')
        return False

    # create default config file and SQLite db
    cfg = Config()
    cfg.default(env.get_sqlite_db_path())
    cfg.write(env.get_config_path())
    config_dict = cfg.config('local')

    # load workflow into db
    try:
        data_source = DataSource(config_dict['database'])
    except DataSourceException as err:
        Log.an().error('data source initialization error [%s]', str(err))
        return False

    defs = data_source.import_definition(workflow_yaml)
    if not defs:
        Log.an().error('workflow definition load failed: %s', workflow_yaml)
        return False

    if not defs['workflows']:
        Log.an().error('workflow definition load failed: %s', workflow_yaml)
        return False

    data_source.commit()

    for workflow in defs['workflows']:
        Log.some().info(
            'workflow loaded: %s -> %s',
            workflow, defs['workflows'][workflow]
        )

    # load job definition if provided
    jobs_dict = {}
    gf_def = Definition()
    if job_yaml:
        if not gf_def.load(job_yaml):
            Log.an().error('Job definition load failed')
            return False
        jobs_dict = gf_def.jobs()
    else:
        # create default definition
        jobs_dict = {
            'job': {
                'name': 'GeneFlow job',
                'output_uri': 'geneflow_output',
                'work_uri': {
                    'local': '~/.geneflow/work'
                }
            }
        }

    # override with cli parameters
    if args.data:
        apply_job_modifiers(jobs_dict, args.data)

    # insert workflow name, if not provided
    workflow_name = next(iter(defs['workflows']))
    for job in jobs_dict.values():
        if 'workflow_name' not in job:
            job['workflow_name'] = workflow_name

    # extract workflow defaults for inputs and parameters if not provided
    # in job definition
    workflow_id = next(iter(defs['workflows'].values()))
    workflow_dict = data_source.get_workflow_def_by_id(workflow_id)
    if not workflow_dict:
        Log.an().error(
            'cannot get workflow definition from data source: workflow_id=%s',
            workflow_id
        )
        return False

    for job in jobs_dict.values():
        if 'inputs' not in job:
            job['inputs'] = {}
        if 'parameters' not in job:
            job['parameters'] = {}
        for input_key in workflow_dict['inputs']:
            if input_key not in job['inputs']:
                job['inputs'][input_key] \
                    = workflow_dict['inputs'][input_key]['default']
        for param_key in workflow_dict['parameters']:
            if param_key not in job['parameters']:
                job['parameters'][param_key] \
                    = workflow_dict['parameters'][param_key]['default']

    # expand URIs
    for job in jobs_dict.values():
        # output URI
        parsed_uri = URIParser.parse(job['output_uri'])
        if not parsed_uri:
            Log.an().error('invalid output uri: %s', job['output_uri'])
            return False
        # expand relative path if local
        if parsed_uri['scheme'] == 'local':
            job['output_uri'] = str(
                Path(parsed_uri['chopped_path']).expanduser().resolve()
            )
        # work URIs
        for context in job['work_uri']:
            parsed_uri = URIParser.parse(job['work_uri'][context])
            if not parsed_uri:
                Log.an().error('invalid work uri: %s', job['work_uri'])
                return False
            # expand relative path if local
            if parsed_uri['scheme'] == 'local':
                job['work_uri'][context] = str(
                    Path(parsed_uri['chopped_path']).expanduser().resolve()
                )
        # input URIs
        for input_key in job['inputs']:
            parsed_uri = URIParser.parse(job['inputs'][input_key])
            if not parsed_uri:
                Log.an().error(
                    'invalid input uri: %s', job['inputs'][input_key]
                )
                return False
            # expand relative path if local
            if parsed_uri['scheme'] == 'local':
                job['inputs'][input_key] = str(
                    Path(parsed_uri['chopped_path']).expanduser().resolve()
                )

    # import jobs into database
    job_ids = data_source.import_jobs_from_dict(jobs_dict)
    if job_ids is False:
        Log.an().error('cannot import jobs')
        return False

    data_source.commit()

    # create process pool to run workflows in parallel
    pool = Pool(min(5, len(job_ids)))
    jobs = [
        {
            'name': job,
            'id': job_ids[job],
            'log': None
        } for job in job_ids
    ]

    result = pool.map(
        partial(
            geneflow.cli.common.run_workflow,
            config=config_dict,
            log_level=args.log_level
        ),
        jobs
    )

    pool.close()
    pool.join()

    if not all(result):
        Log.an().error('some jobs failed')

    return result
def run_pending(args):
    """
    Run any jobs in database in the PENDING state.

    Args:
        args.config_file: GeneFlow config file path.
        args.environment: Config environment.
        args.log_location: directory for per-job log files.

    Returns:
        On success: True.
        On failure: False.

    """
    config_file = args.config_file
    environment = args.environment
    log_location = args.log_location

    # load config file
    cfg = Config()
    if not cfg.load(config_file):
        Log.an().error('cannot load config file: %s', config_file)
        return False

    config_dict = cfg.config(environment)
    if not config_dict:
        Log.an().error('invalid config environment: %s', environment)
        return False

    # connect to data source
    try:
        data_source = DataSource(config_dict['database'])
    except DataSourceException as err:
        Log.an().error('data source initialization error [%s]', str(err))
        return False

    # get pending jobs from database
    pending_jobs = data_source.get_pending_jobs()
    if pending_jobs is False:
        Log.an().error('cannot query for pending jobs')
        return False

    if not pending_jobs:
        # no jobs found
        return True

    Log.some().info('pending jobs found:\n%s', pprint.pformat(pending_jobs))

    # create process pool to run workflows in parallel
    pool = Pool(min(5, len(pending_jobs)))
    jobs = [
        {
            'name': job['name'],
            'id': job['id'],
            'log': str(Path(log_location) / (job['id'] + '.log'))
        } for job in pending_jobs
    ]

    result = pool.map(
        partial(
            geneflow.cli.common.run_workflow,
            config=config_dict,
            log_level=args.log_level
        ),
        jobs
    )

    pool.close()
    pool.join()

    if not all(result):
        Log.an().error('some jobs failed')

    return result