def _load_workflow_def(self):
    """
    Load the GeneFlow definition file and extract its workflow definition.

    Args:
        self: class instance

    Returns:
        On success: True.
        On failure: False.

    """
    # load geneflow definition file
    gf_def = Definition()
    if not gf_def.load(str(self._workflow_yaml)):
        Log.an().error('invalid geneflow definition: %s', self._workflow_yaml)
        return False

    # make sure there is a workflow definition in the file
    if not gf_def.workflows():
        Log.an().error('no workflows in geneflow definition')
        return False

    # extract the workflow definition
    self._workflow = next(iter(gf_def.workflows().values()))

    return True
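# For reference, a sketch of the workflow dictionary shape that the functions
# in this section rely on. This is illustrative only and based solely on the
# fields accessed below (name, description, inputs, parameters); real
# GeneFlow workflow definitions contain additional fields, and the example
# values here are hypothetical.
_EXAMPLE_WORKFLOW_DICT = {
    'name': 'example-workflow',
    'description': 'Hypothetical workflow used for illustration',
    'inputs': {
        'file': {
            'label': 'Input File',
            'description': 'File to be processed',
            'type': 'File',
            'default': './data/input.txt'
        }
    },
    'parameters': {
        'threads': {
            'label': 'Threads',
            'description': 'Number of threads to use',
            'type': 'int',
            'default': 2
        }
    }
}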
def _load_apps(self):
    """
    Load and validate app definitions from the database.

    Args:
        self: class instance

    Returns:
        On success: True.
        On failure: False.

    """
    try:
        data_source = DataSource(self._config['database'])
    except DataSourceException as err:
        msg = 'data source initialization error [{}]'.format(str(err))
        Log.an().error(msg)
        return self._fatal(msg)

    self._apps = data_source.get_app_defs_by_workflow_id(
        self._job['workflow_id']
    )
    if self._apps is False:
        msg = 'cannot load apps from data source: workflow_id={}'.\
            format(self._job['workflow_id'])
        Log.an().error(msg)
        return self._fatal(msg)

    if not self._apps:
        msg = 'no apps found for workflow: workflow_id={}'.\
            format(self._job['workflow_id'])
        Log.an().error(msg)
        return self._fatal(msg)

    # validate the app definitions
    for app in self._apps:
        valid_def = Definition.validate_app(self._apps[app])
        if valid_def is False:
            msg = 'invalid app definition:\n{}'\
                .format(yaml.dump(self._apps[app]))
            Log.an().error(msg)
            return self._fatal(msg)

        self._apps[app] = valid_def

    return True
def _load_workflow(self):
    """
    Load and validate workflow definition from the database.

    Args:
        self: class instance

    Returns:
        On success: True.
        On failure: False.

    """
    try:
        data_source = DataSource(self._config['database'])
    except DataSourceException as err:
        msg = 'data source initialization error [{}]'.format(str(err))
        Log.an().error(msg)
        return self._fatal(msg)

    self._workflow = data_source.get_workflow_def_by_id(
        self._job['workflow_id']
    )
    if self._workflow is False:
        msg = 'cannot load workflow from data source: workflow_id={}'.\
            format(self._job['workflow_id'])
        Log.an().error(msg)
        return self._fatal(msg)

    if not self._workflow:
        msg = 'workflow not found: workflow_id={}'\
            .format(self._job['workflow_id'])
        Log.an().error(msg)
        return self._fatal(msg)

    # validate the workflow definition
    valid_def = Definition.validate_workflow(self._workflow)
    if valid_def is False:
        msg = 'invalid workflow definition:\n{}'\
            .format(yaml.dump(self._workflow))
        Log.an().error(msg)
        return self._fatal(msg)

    self._workflow = valid_def

    return True
def _load_job(self):
    """
    Load and validate job definition from the database.

    Args:
        self: class instance

    Returns:
        On success: True.
        On failure: False.

    """
    try:
        data_source = DataSource(self._config['database'])
    except DataSourceException as err:
        msg = 'data source initialization error [{}]'.format(str(err))
        Log.an().error(msg)
        return self._fatal(msg)

    self._job = data_source.get_job_def_by_id(self._job_id)
    if self._job is False:
        msg = 'cannot load job from data source: job_id={}'\
            .format(self._job_id)
        Log.an().error(msg)
        return self._fatal(msg)

    if not self._job:
        msg = 'job not found: job_id={}'.format(self._job_id)
        Log.an().error(msg)
        return self._fatal(msg)

    # validate the job definition
    valid_def = Definition.validate_job(self._job)
    if valid_def is False:
        msg = 'invalid job definition:\n{}'.format(yaml.dump(self._job))
        Log.an().error(msg)
        return self._fatal(msg)

    self._job = valid_def

    return True
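# Note on ordering: the three loaders above are not independent. Both
# _load_apps() and _load_workflow() read self._job['workflow_id'], which is
# only populated after _load_job() succeeds. A minimal sketch of the assumed
# call sequence (the surrounding class and error handling are omitted):
#
#     if self._load_job() and self._load_workflow() and self._load_apps():
#         # self._job, self._workflow, and self._apps now hold validated
#         # definitions returned by Definition.validate_*()
#         ...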
def load_app(self):
    """
    Load app definition.

    Args:
        self: class instance

    Returns:
        On success: True.
        On failure: False.

    """
    # read yaml file
    self._app = self._yaml_to_dict(str(Path(self._path / 'app.yaml')))

    # empty dict?
    if not self._app:
        Log.an().error(
            'cannot load/parse app.yaml file in app: %s', self._path
        )
        return False

    valid_def = Definition.validate_app(self._app)
    if not valid_def:
        Log.an().error('app validation error')
        return False

    # check formatting of version
    self._app['agave_version'] = slugify(
        self._app['version'].lower()
    ).replace('-', '.')
    if self._app['agave_version'].islower():
        # contains letters, invalid version
        Log.an().error(
            'app config validation error: app version cannot contain letters'
        )
        return False

    return True
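# A standalone sketch of the version check above, assuming slugify is the
# python-slugify function imported by this module. After slugifying and
# mapping '-' back to '.', str.islower() returns True only if at least one
# cased (letter) character remains, so it acts as a "contains letters" test.

from slugify import slugify  # assumption: python-slugify, as used above


def _example_agave_version_ok(version):
    """Illustrative only: return True if the version contains no letters."""
    agave_version = slugify(version.lower()).replace('-', '.')
    return not agave_version.islower()

# _example_agave_version_ok('1.2.3')    -> True   ('1.2.3' is all digits)
# _example_agave_version_ok('0.5-beta') -> False  ('0.5.beta' contains letters)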
def run(args, other_args, subparser):
    """
    Run GeneFlow workflow engine.

    Args:
        args.workflow_path: workflow definition or package directory.
        args.job: path to job definition.

    Returns:
        On success: True.
        On failure: False.

    """
    # get absolute path to workflow
    workflow_path = resolve_workflow_path(args.workflow_path)
    if workflow_path:
        Log.some().info('workflow definition found: %s', workflow_path)
    else:
        Log.an().error(
            'cannot find workflow definition: %s', args.workflow_path
        )
        return False

    # setup environment
    env = Environment(workflow_path=workflow_path)
    if not env.initialize():
        Log.an().error('cannot initialize geneflow environment')
        return False

    # create default config file and SQLite db
    cfg = Config()
    cfg.default(env.get_sqlite_db_path())
    cfg.write(env.get_config_path())
    config_dict = cfg.config('local')

    # load workflow into db
    try:
        data_source = DataSource(config_dict['database'])
    except DataSourceException as err:
        Log.an().error('data source initialization error [%s]', str(err))
        return False

    defs = data_source.import_definition(workflow_path)
    if not defs:
        Log.an().error('workflow definition load failed: %s', workflow_path)
        return False

    if not defs['workflows']:
        Log.an().error('workflow definition load failed: %s', workflow_path)
        return False

    data_source.commit()

    for workflow in defs['workflows']:
        Log.some().info(
            'workflow loaded: %s -> %s', workflow, defs['workflows'][workflow]
        )

    # get workflow definition back from database to ensure
    # that it's a valid definition
    workflow_id = next(iter(defs['workflows'].values()))
    workflow_dict = data_source.get_workflow_def_by_id(workflow_id)
    if not workflow_dict:
        Log.an().error(
            'cannot get workflow definition from data source: workflow_id=%s',
            workflow_id
        )
        return False

    ### define arg parsing methods
    def parse_dynamic_args(workflow_dict):
        """
        Parse dynamic args based on workflow dictionary as well as
        some static args.

        Args:
            other_args: List of remaining args from initial parse of
                workflow path.
            workflow_dict: Workflow dictionary.

        Returns:
            On success: List of parsed arguments.
            On failure: False.

        """
        # parse dynamic args:
        # these are determined from workflow definition
        dynamic_parser = argparse.ArgumentParser()

        dynamic_parser.add_argument(
            '-j', '--job', type=str, default=None, dest='job_path',
            help='Job Definition(s)'
        )
        for input_key in workflow_dict['inputs']:
            dynamic_parser.add_argument(
                '--in.{}'.format(input_key),
                dest='inputs.{}'.format(input_key),
                required=False,
                default=workflow_dict['inputs'][input_key]['default'],
                help=workflow_dict['inputs'][input_key]['label']
            )
        for param_key in workflow_dict['parameters']:
            dynamic_parser.add_argument(
                '--param.{}'.format(param_key),
                dest='parameters.{}'.format(param_key),
                required=False,
                default=workflow_dict['parameters'][param_key]['default'],
                help=workflow_dict['parameters'][param_key]['label']
            )
        dynamic_parser.add_argument(
            '-o', '--output', type=str, default='~/geneflow-output',
            help='Output Folder'
        )
        dynamic_parser.add_argument(
            '-n', '--name', type=str, default='geneflow-job',
            help='Name of Job'
        )
        dynamic_parser.add_argument(
            '-w', '--work', nargs='+', type=str, default=[],
            help='Work Directory'
        )
        dynamic_parser.add_argument(
            '--exec-context', '--ec', nargs='+', type=str,
            dest='exec_context', default=[], help='Execution Contexts'
        )
        dynamic_parser.add_argument(
            '--exec-method', '--em', nargs='+', type=str,
            dest='exec_method', default=[], help='Execution Methods'
        )
        dynamic_parser.add_argument(
            '--exec-param', '--ep', nargs='+', type=str,
            dest='exec_param', default=[], help='Execution Parameters'
        )

        dynamic_args = dynamic_parser.parse_known_args(other_args)
        return dynamic_args[0]

    if 'gooey' in sys.modules:
        @Gooey(
            program_name='GeneFlow: {}'.format(workflow_dict['name']),
            program_description=workflow_dict['description'],
            target='gf --log-level={} run {}'.format(
                args.log_level, args.workflow_path
            ),
            monospace_display=True
        )
        def parse_dynamic_args_gui(workflow_dict):
            """
            Parse dynamic args based on workflow dictionary as well as
            some static args. Display a GUI interface.

            Args:
                other_args: List of remaining args from initial parse of
                    workflow path.
                workflow_dict: Workflow dictionary.

            Returns:
                On success: List of parsed arguments.
                On failure: False.

            """
            # parse dynamic args:
            # these are determined from workflow definition
            dynamic_parser = GooeyParser()

            input_group = dynamic_parser.add_argument_group(
                "Workflow Inputs",
                "Files or folders to be passed to the workflow"
            )
            for input_key in workflow_dict['inputs']:
                widget = 'FileChooser'
                if workflow_dict['inputs'][input_key]['type'] == 'Directory':
                    widget = 'DirChooser'
                input_group.add_argument(
                    '--in.{}'.format(input_key),
                    dest='inputs.{}'.format(input_key),
                    required=False,
                    default=workflow_dict['inputs'][input_key]['default'],
                    help=workflow_dict['inputs'][input_key]['label'],
                    widget=widget
                )

            param_group = dynamic_parser.add_argument_group(
                "Workflow Parameters",
                "Number or string parameters to be passed to the workflow"
            )
            for param_key in workflow_dict['parameters']:
                param_group.add_argument(
                    '--param.{}'.format(param_key),
                    dest='parameters.{}'.format(param_key),
                    required=False,
                    default=workflow_dict['parameters'][param_key]['default'],
                    help=workflow_dict['parameters'][param_key]['label']
                )

            job_group = dynamic_parser.add_argument_group(
                "Job Options", "Output/intermediate folders and job name"
            )
            job_group.add_argument(
                '-o', '--output', type=str, default='~/geneflow-output',
                help='Output Folder', widget='DirChooser'
            )
            job_group.add_argument(
                '-n', '--name', type=str, default='geneflow-job',
                help='Name of Job'
            )
            job_group.add_argument(
                '-w', '--work', nargs='+', type=str, default=[],
                help='Work Directory'
            )

            exec_group = dynamic_parser.add_argument_group(
                "Execution Options", "Customize workflow execution"
            )
            exec_group.add_argument(
                '--exec-context', '--ec', nargs='+', type=str,
                dest='exec_context', default=[], help='Execution Contexts'
            )
            exec_group.add_argument(
                '--exec-method', '--em', nargs='+', type=str,
                dest='exec_method', default=[], help='Execution Methods'
            )
            exec_group.add_argument(
                '--exec-param', '--ep', nargs='+', type=str,
                dest='exec_param', default=[], help='Execution Parameters'
            )

            dynamic_args = dynamic_parser.parse_args(other_args)
            return dynamic_args

    # get dynamic args
    if args.gui and 'gooey' in sys.modules:
        dynamic_args = parse_dynamic_args_gui(workflow_dict)
    else:
        dynamic_args = parse_dynamic_args(workflow_dict)

    # get absolute path to job file if provided
    job_path = None
    if dynamic_args.job_path:
        job_path = Path(dynamic_args.job_path).absolute()

    # load job definition if provided
    jobs_dict = {}
    gf_def = Definition()
    if job_path:
        if not gf_def.load(job_path):
            Log.an().error('Job definition load failed')
            return False
        jobs_dict = gf_def.jobs()
    else:
        # create default definition
        jobs_dict = {
            'job': {
                'name': 'GeneFlow job',
                'output_uri': 'geneflow_output',
                'work_uri': {
                    'local': '~/.geneflow/work'
                }
            }
        }

    # override with known cli parameters
    apply_job_modifiers(
        jobs_dict,
        [
            'name={}'.format(dynamic_args.name),
            'output_uri={}'.format(dynamic_args.output)
        ]
    )

    # insert workflow name into job, if not provided
    workflow_name = next(iter(defs['workflows']))
    for job in jobs_dict.values():
        if 'workflow_name' not in job:
            job['workflow_name'] = workflow_name

    # add inputs and parameters to job definition
    apply_job_modifiers(
        jobs_dict,
        [
            '{}={}'.format(dynamic_arg, getattr(dynamic_args, dynamic_arg))
            for dynamic_arg in vars(dynamic_args)
            if dynamic_arg.startswith('inputs.')
            or dynamic_arg.startswith('parameters.')
        ]
    )

    # add work URIs to job definition
    work_uris = {}
    for work_arg in dynamic_args.work:
        parsed_work_uri = URIParser.parse(work_arg)
        if not parsed_work_uri:
            # skip if invalid URI
            Log.a().warning('invalid work uri: %s', work_arg)
        else:
            work_uris[parsed_work_uri['scheme']]\
                = parsed_work_uri['chopped_uri']
    apply_job_modifiers(
        jobs_dict,
        [
            'work_uri.{}={}'.format(context, work_uris[context])
            for context in work_uris
        ]
    )

    # add execution options to job definition
    apply_job_modifiers(
        jobs_dict,
        [
            'execution.context.{}={}'.format(*exec_arg.split(':', 1)[0:2])
            for exec_arg in dynamic_args.exec_context
        ] + [
            'execution.method.{}={}'.format(*exec_arg.split(':', 1)[0:2])
            for exec_arg in dynamic_args.exec_method
        ] + [
            'execution.parameters.{}={}'.format(*exec_arg.split(':', 1)[0:2])
            for exec_arg in dynamic_args.exec_param
        ]
    )

    # get default values from workflow definition
    for job in jobs_dict.values():
        if 'inputs' not in job:
            job['inputs'] = {}
        if 'parameters' not in job:
            job['parameters'] = {}
        for input_key in workflow_dict['inputs']:
            if input_key not in job['inputs']:
                job['inputs'][input_key]\
                    = workflow_dict['inputs'][input_key]['default']
        for param_key in workflow_dict['parameters']:
            if param_key not in job['parameters']:
                job['parameters'][param_key]\
                    = workflow_dict['parameters'][param_key]['default']

    # expand URIs
    for job in jobs_dict.values():
        # output URI
        parsed_uri = URIParser.parse(job['output_uri'])
        if not parsed_uri:
            Log.an().error('invalid output uri: %s', job['output_uri'])
            return False
        # expand relative path if local
        if parsed_uri['scheme'] == 'local':
            job['output_uri'] = str(
                Path(parsed_uri['chopped_path']).expanduser().resolve()
            )

        # work URIs
        for context in job['work_uri']:
            parsed_uri = URIParser.parse(job['work_uri'][context])
            if not parsed_uri:
                Log.an().error('invalid work uri: %s', job['work_uri'])
                return False
            # expand relative path if local
            if parsed_uri['scheme'] == 'local':
                job['work_uri'][context] = str(
                    Path(parsed_uri['chopped_path']).expanduser().resolve()
                )

        # input URIs
        for input_key in job['inputs']:
            parsed_uri = URIParser.parse(job['inputs'][input_key])
            if not parsed_uri:
                Log.an().error(
                    'invalid input uri: %s', job['inputs'][input_key]
                )
                return False
            # expand relative path if local
            if parsed_uri['scheme'] == 'local':
                job['inputs'][input_key] = str(
                    Path(parsed_uri['chopped_path']).expanduser().resolve()
                )

    # import jobs into database
    job_ids = data_source.import_jobs_from_dict(jobs_dict)
    if job_ids is False:
        Log.an().error('cannot import jobs')
        return False

    data_source.commit()

    # create process pool to run workflows in parallel
    pool = Pool(min(5, len(job_ids)))
    jobs = [
        {
            'name': job,
            'id': job_ids[job],
            'log': None
        } for job in job_ids
    ]

    result = pool.map(
        partial(
            geneflow.cli.common.run_workflow,
            config=config_dict,
            log_level=args.log_level
        ),
        jobs
    )

    pool.close()
    pool.join()

    if not all(result):
        Log.an().error('some jobs failed')

    return result
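# For orientation, a sketch of what run() assembles for apply_job_modifiers()
# given a hypothetical invocation such as:
#
#     gf run ./my-workflow --in.file ./reads.fastq --param.threads 4 \
#         -o ./results -n my-job -w ~/.geneflow/work --ec default:local
#
# The modifier strings are dotted key=value pairs; values below are
# illustrative and the exact set depends on the workflow definition:
#
#     [
#         'name=my-job',
#         'output_uri=./results',
#         'inputs.file=./reads.fastq',
#         'parameters.threads=4',
#         'work_uri.local=~/.geneflow/work',
#         'execution.context.default=local'
#     ]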
def help_func(args, other_args, subparser=None):
    """
    GeneFlow workflow help.

    Args:
        args.workflow: workflow definition or package directory.

    Returns:
        On success: True.
        On failure: False.

    """
    # get absolute path to workflow
    workflow_yaml = resolve_workflow_path(args.workflow)
    if workflow_yaml:
        Log.some().info('workflow definition found: %s', workflow_yaml)
    else:
        Log.an().error('cannot find workflow definition: %s', args.workflow)
        return False

    # load workflow
    gf_def = Definition()
    if not gf_def.load(workflow_yaml):
        Log.an().error('workflow definition load failed: %s', workflow_yaml)
        return False

    # get first workflow dict
    workflow_dict = next(iter(gf_def.workflows().values()))

    print()
    print('{}: {}'.format(workflow_dict['name'], workflow_dict['description']))
    print()
    print('Execution Command:')
    print('\tgf [--log-level LOG_LEVEL] [--log-file LOG_FILE] run WORKFLOW_PATH')
    print('\t\t-o OUTPUT [-n NAME] [INPUTS] [PARAMETERS] [-w WORK_DIR [WORK_DIR ...]]')
    print('\t\t[--ec CONTEXT [CONTEXT ...]] [--em METHOD [METHOD ...]] [--ep PARAM [PARAM ...]]')
    print()
    print('\tWORKFLOW_PATH: Path to directory that contains workflow definition')
    print()
    print('Job Configuration:')
    print('\t-o,--output: Output directory')
    print('\t-n,--name: Job name, a directory with this name will be created in the output directory')
    print('\t\tdefault: geneflow-job')
    print('\t-w,--work: Work directories, for temporary or intermediate data')
    print('\t\tdefault: ~/.geneflow/work')
    print('\t--no-output-hash: Flag indicating that the output directory should NOT include a random hash')
    print('\t\tdefault: not set, output will include random hash')
    print()
    print('Inputs: Workflow-Specific Files or Folders')
    for input_key in workflow_dict['inputs']:
        print(
            '\t--in.{}: {}: {}'.format(
                input_key,
                workflow_dict['inputs'][input_key]['label'],
                workflow_dict['inputs'][input_key]['description']
            )
        )
        print(
            '\t\ttype: {}, default: {}'.format(
                workflow_dict['inputs'][input_key]['type'],
                workflow_dict['inputs'][input_key]['default']
            )
        )
    print()
    print('Parameters: Workflow-Specific Values')
    for param_key in workflow_dict['parameters']:
        print(
            '\t--param.{}: {}: {}'.format(
                param_key,
                workflow_dict['parameters'][param_key]['label'],
                workflow_dict['parameters'][param_key]['description']
            )
        )
        print(
            '\t\ttype: {}, default: {}'.format(
                workflow_dict['parameters'][param_key]['type'],
                workflow_dict['parameters'][param_key]['default']
            )
        )
    print()
    print('Execution Configuration:')
    print('\t--ec,--exec-context: Execution contexts, e.g., local, agave, gridengine.')
    print('\t\tThese can be specified for all workflow steps with "default:[CONTEXT]"')
    print('\t\tor for specific steps with "step-name:[CONTEXT]".')
    print('\t--em,--exec-method: Execution methods, e.g., singularity, docker, environment.')
    print('\t\tThese can be specified for all workflow steps with "default:[METHOD]"')
    print('\t\tor for specific steps with "step-name:[METHOD]". By default each app associated')
    print('\t\twith a workflow step tries to automatically detect the execution method.')
    print('\t--ep,--exec-param: Execution parameters, e.g., slots, mem, or other.')
    print('\t\tThese can be specified for all workflow steps with "default.slots:[VALUE]"')
    print('\t\tor for specific steps with "step-name.slots:[VALUE]". Execution parameters')
    print('\t\tdepend on the execution context.')

    return True
def run(args):
    """
    Run GeneFlow workflow engine.

    Args:
        args.workflow: workflow definition or package directory.
        args.job_yaml: job definition.

    Returns:
        On success: True.
        On failure: False.

    """
    # get absolute path to workflow
    workflow_yaml = resolve_workflow_path(args.workflow)
    if workflow_yaml:
        Log.some().info('workflow definition found: %s', workflow_yaml)
    else:
        Log.an().error('cannot find workflow definition: %s', args.workflow)
        return False

    # get absolute path to job file if provided
    job_yaml = None
    if args.job_yaml:
        job_yaml = Path(args.job_yaml).absolute()

    # setup environment
    env = Environment(workflow_path=workflow_yaml)
    if not env.initialize():
        Log.an().error('cannot initialize geneflow environment')
        return False

    # create default config file and SQLite db
    cfg = Config()
    cfg.default(env.get_sqlite_db_path())
    cfg.write(env.get_config_path())
    config_dict = cfg.config('local')

    # load workflow into db
    try:
        data_source = DataSource(config_dict['database'])
    except DataSourceException as err:
        Log.an().error('data source initialization error [%s]', str(err))
        return False

    defs = data_source.import_definition(workflow_yaml)
    if not defs:
        Log.an().error('workflow definition load failed: %s', workflow_yaml)
        return False

    if not defs['workflows']:
        Log.an().error('workflow definition load failed: %s', workflow_yaml)
        return False

    data_source.commit()

    for workflow in defs['workflows']:
        Log.some().info(
            'workflow loaded: %s -> %s', workflow, defs['workflows'][workflow]
        )

    # load job definition if provided
    jobs_dict = {}
    gf_def = Definition()
    if job_yaml:
        if not gf_def.load(job_yaml):
            Log.an().error('Job definition load failed')
            return False
        jobs_dict = gf_def.jobs()
    else:
        # create default definition
        jobs_dict = {
            'job': {
                'name': 'GeneFlow job',
                'output_uri': 'geneflow_output',
                'work_uri': {
                    'local': '~/.geneflow/work'
                }
            }
        }

    # override with cli parameters
    if args.data:
        apply_job_modifiers(jobs_dict, args.data)

    # insert workflow name, if not provided
    workflow_name = next(iter(defs['workflows']))
    for job in jobs_dict.values():
        if 'workflow_name' not in job:
            job['workflow_name'] = workflow_name

    # extract workflow defaults for inputs and parameters if not provided
    # in job definition
    workflow_id = next(iter(defs['workflows'].values()))
    workflow_dict = data_source.get_workflow_def_by_id(workflow_id)
    if not workflow_dict:
        Log.an().error(
            'cannot get workflow definition from data source: workflow_id=%s',
            workflow_id
        )
        return False

    for job in jobs_dict.values():
        if 'inputs' not in job:
            job['inputs'] = {}
        if 'parameters' not in job:
            job['parameters'] = {}
        for input_key in workflow_dict['inputs']:
            if input_key not in job['inputs']:
                job['inputs'][input_key]\
                    = workflow_dict['inputs'][input_key]['default']
        for param_key in workflow_dict['parameters']:
            if param_key not in job['parameters']:
                job['parameters'][param_key]\
                    = workflow_dict['parameters'][param_key]['default']

    # expand URIs
    for job in jobs_dict.values():
        # output URI
        parsed_uri = URIParser.parse(job['output_uri'])
        if not parsed_uri:
            Log.an().error('invalid output uri: %s', job['output_uri'])
            return False
        # expand relative path if local
        if parsed_uri['scheme'] == 'local':
            job['output_uri'] = str(
                Path(parsed_uri['chopped_path']).expanduser().resolve()
            )

        # work URIs
        for context in job['work_uri']:
            parsed_uri = URIParser.parse(job['work_uri'][context])
            if not parsed_uri:
                Log.an().error('invalid work uri: %s', job['work_uri'])
                return False
            # expand relative path if local
            if parsed_uri['scheme'] == 'local':
                job['work_uri'][context] = str(
                    Path(parsed_uri['chopped_path']).expanduser().resolve()
                )

        # input URIs
        for input_key in job['inputs']:
            parsed_uri = URIParser.parse(job['inputs'][input_key])
            if not parsed_uri:
                Log.an().error(
                    'invalid input uri: %s', job['inputs'][input_key]
                )
                return False
            # expand relative path if local
            if parsed_uri['scheme'] == 'local':
                job['inputs'][input_key] = str(
                    Path(parsed_uri['chopped_path']).expanduser().resolve()
                )

    # import jobs into database
    job_ids = data_source.import_jobs_from_dict(jobs_dict)
    if job_ids is False:
        Log.an().error('cannot import jobs')
        return False

    data_source.commit()

    # create process pool to run workflows in parallel
    pool = Pool(min(5, len(job_ids)))
    jobs = [
        {
            'name': job,
            'id': job_ids[job],
            'log': None
        } for job in job_ids
    ]

    result = pool.map(
        partial(
            geneflow.cli.common.run_workflow,
            config=config_dict,
            log_level=args.log_level
        ),
        jobs
    )

    pool.close()
    pool.join()

    if not all(result):
        Log.an().error('some jobs failed')

    return result
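# A minimal sketch of the local URI expansion used repeatedly above. Based on
# how the result is consumed in this section, URIParser.parse() is assumed to
# return a falsy value for invalid URIs and otherwise a dict containing at
# least 'scheme', 'chopped_uri', and 'chopped_path'.

def _example_expand_local_uri(uri):
    """Illustrative only: expand a local URI to an absolute path."""
    parsed_uri = URIParser.parse(uri)
    if not parsed_uri:
        return None
    if parsed_uri['scheme'] == 'local':
        # same expansion applied to output, work, and input URIs above
        return str(Path(parsed_uri['chopped_path']).expanduser().resolve())
    # non-local URIs are left unchanged
    return uri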
def help_func(args):
    """
    GeneFlow workflow help.

    Args:
        args.workflow: workflow definition or package directory.

    Returns:
        On success: True.
        On failure: False.

    """
    # get absolute path to workflow
    workflow_yaml = resolve_workflow_path(args.workflow)
    if workflow_yaml:
        Log.some().info('workflow definition found: %s', workflow_yaml)
    else:
        Log.an().error('cannot find workflow definition: %s', args.workflow)
        return False

    # load workflow
    gf_def = Definition()
    if not gf_def.load(workflow_yaml):
        Log.an().error('workflow definition load failed: %s', workflow_yaml)
        return False

    # get first workflow dict
    workflow_dict = next(iter(gf_def.workflows().values()))

    print()
    print('GeneFlow: {}'.format(workflow_dict['name']))
    print()
    print('{}'.format(workflow_dict['description']))
    print()
    print('Inputs:')
    for input_key in workflow_dict['inputs']:
        print(
            '\t--{}: {}: {}'.format(
                input_key,
                workflow_dict['inputs'][input_key]['label'],
                workflow_dict['inputs'][input_key]['description']
            )
        )
        print(
            '\t\ttype: {}, default: {}'.format(
                workflow_dict['inputs'][input_key]['type'],
                workflow_dict['inputs'][input_key]['default']
            )
        )
    print()
    print('Parameters:')
    for param_key in workflow_dict['parameters']:
        print(
            '\t--{}: {}: {}'.format(
                param_key,
                workflow_dict['parameters'][param_key]['label'],
                workflow_dict['parameters'][param_key]['description']
            )
        )
        print(
            '\t\ttype: {}, default: {}'.format(
                workflow_dict['parameters'][param_key]['type'],
                workflow_dict['parameters'][param_key]['default']
            )
        )

    return True