def __init__(self, rule, settings, urls=None):
    self.job_options = JobOptions(rule, settings)
    self.rule = rule
    self.settings = settings
    rule_params = dict(rule.params.__dict__)
    self.disco, self.ddfs = get_disco_handle(
        rule_params.get('server', settings.get('server')))
    rule_params.update(settings)
    self.params = Params(**rule_params)
    self.urls = urls
    try:
        # attempt to allow for an overridden worker class from the settings file or rule
        if rule.worker:
            worker = rule.worker
        else:
            worker_mod, dot, worker_class = settings.get('worker').rpartition('.')
            mod = __import__(worker_mod, {}, {}, worker_mod)
            worker = getattr(mod, worker_class)()
        self.job = Job(name=rule.name, master=self.disco.master, worker=worker)
    except Exception as e:
        log.warn("Error instantiating worker: %s %s - loading default worker"
                 % (settings.get('worker'), e))
        self.job = Job(name=rule.name, master=self.disco.master)
    self.full_job_id = None
    self.jobinfo = None
    self._notify(JOB_START)
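# A minimal, hedged sketch of how this initializer is typically exercised: load a
# rule and hand it to InfernoJob together with the settings, then start and wait
# on the resulting disco job (start()/wait() as used by the CLI paths below).
# The rule name 'example_rule' and the rules directory argument are illustrative
# assumptions, not part of the original module.
def run_single_rule(settings, rules_dir):
    rule = get_rules_by_name('example_rule', rules_dir, immediate=True)[0]
    job = InfernoJob(rule, settings)
    if job.start():
        job.wait()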
def _run_concurrent_rules(rule_list, settings, urls_blackboard):
    """Execute a list of rules concurrently, assuming all the rules are runnable
    (i.e. all output urls of their sub_rules are available).

    Output: job_results, a dictionary of (rule_name: output_urls) pairs.
    Exceptions: Exception, if it fails to start a job or one of the jobs dies.
    """
    def _get_rule_name(disco_job_name):
        return disco_job_name.rsplit('@')[0]

    # need to save both inferno_jobs and disco_jobs
    jobs = []
    inferno_jobs = []
    for rule in rule_list:
        urls = []
        for sub_rule in extract_subrules(rule):
            urls += urls_blackboard[sub_rule.name]
        inferno_job, job = _start_job(rule, settings, urls)
        if job:
            jobs.append(job)
            inferno_jobs.append(inferno_job)
        else:
            raise Exception('There are not enough blobs to run %s' % rule.name)

    job_results = {}
    stop = False
    server, _ = get_disco_handle(settings.get('server'))
    while jobs:
        try:
            inactive, active = server.results(jobs, 5000)
        except CommError:
            # to deal with long-running jobs (e.g. waiting for shuffling)
            continue
        for jobname, (status, results) in inactive:
            if status == "ready":
                job_results[_get_rule_name(jobname)] = results
            elif status == "dead":
                stop = True
        jobs = active
        if stop:
            break
    if stop:
        for jobname, _ in jobs:
            server.kill(jobname)
        raise Exception('One of the concurrent jobs failed.')
    return inferno_jobs, job_results
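# A hedged sketch of driving _run_concurrent_rules: urls_blackboard maps every
# already-finished sub-rule name to its output urls, which makes each rule in
# rule_list runnable. Feeding job_results back into the blackboard, as below, is
# an illustrative pattern for unlocking dependent rules in later passes; the
# run_stage helper itself is not part of the original module.
def run_stage(rule_list, settings, urls_blackboard):
    inferno_jobs, job_results = _run_concurrent_rules(
        rule_list, settings, urls_blackboard)
    # expose the finished output urls so rules that depend on this stage become runnable
    urls_blackboard.update(job_results)
    return inferno_jobs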
def expire_data(days, dryrun, extra_tags):
    settings = InfernoSettings()
    _, ddfs = get_disco_handle(settings["server"])
    tags = extract_tags_from_infernyx()
    tags = tags.union(extra_tags)
    to_delete = []
    date_lower = date.today() + timedelta(days=-days)
    try:
        all_tags = ddfs.list()
    except Exception as e:
        print "Cannot fetch the tag list from ddfs: %s" % e
        return
    for tag in all_tags:
        try:
            prefix, ds = tag.rsplit(':', 1)
            tag_date = datetime.strptime(ds, "%Y-%m-%d").date()
        except:
            # ignore non-standard tag names
            continue
        if prefix in tags and tag_date < date_lower:
            to_delete.append(tag)
    to_delete.sort()  # delete tags with "incoming" first, then the "processed" ones
    if dryrun:
        if to_delete:
            print "The following tags will be deleted:\n"
            print "\n".join(to_delete)
        else:
            print "Nothing to be done"
    else:
        for tag in to_delete:
            try:
                print "Deleting tag: %s" % tag
                ddfs.delete(tag)
            except Exception as e:
                print "Failed to delete tag %s: %s" % (tag, e)
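# A minimal, hedged usage example for expire_data: preview which ddfs tags older
# than 30 days would be removed, then run the deletion for real. The extra tag
# prefix 'incoming:custom_dataset' and the helper name are purely illustrative.
def expire_old_custom_data():
    extra = set(['incoming:custom_dataset'])
    expire_data(30, dryrun=True, extra_tags=extra)   # only print the candidate tags
    expire_data(30, dryrun=False, extra_tags=extra)  # actually delete them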
def main(argv=sys.argv):
    options, parser = _get_options(argv[1:])
    settings = _get_settings(options)

    if options['example_rules']:
        try:
            os.mkdir(options['example_rules'])
            here = os.path.dirname(__file__)
            src_dir = os.path.join(here, '..', 'example_rules')
            src_dir = os.path.abspath(src_dir)
            dst_dir = os.path.abspath(options['example_rules'])
            for name in os.listdir(src_dir):
                if name.endswith('.py'):
                    src = os.path.join(src_dir, name)
                    dst = os.path.join(dst_dir, name)
                    shutil.copy(src, dst)
            print '\n\tCreated example rules dir:\n\n\t\t%s' % dst_dir
            for name in os.listdir(dst_dir):
                print '\t\t\t', name
        except Exception as e:
            print 'Error creating example rules dir %r' % (e)
        finally:
            return

    _setup_logging(settings)

    for path in settings.get('extra_python_paths'):
        sys.path.insert(0, path)

    if options['process_results']:
        settings['no_purge'] = True
        rules_dir = options.get('rules_directory')
        if not rules_dir:
            rules_dir = settings.get('rules_directory')
        try:
            rule_name = options['process_results'].split('@')[0]
            job_name = options['process_results'].split('.')[1]
            rule = get_rules_by_name(rule_name, rules_dir, immediate=True)[0]
            job = InfernoJob(rule, settings)
            status, results = job.disco.results(job_name)
            if status == 'ready':
                if job.rule.rule_init_function:
                    job.rule.rule_init_function(job.params)
                rule.result_processor(
                    rule.result_iterator(results),
                    params=job.params,
                    job_id=job_name)
        except Exception as e:
            import traceback
            trace = traceback.format_exc(15)
            log.error(trace)
            log.error("Error processing results for job: %s %s"
                      % (options['process_results'], e))
            raise e
    elif options['process_map']:
        settings['no_purge'] = True
        rules_dir = options.get('rules_directory')
        if not rules_dir:
            rules_dir = settings.get('rules_directory')
        try:
            rule_name = options['process_map'].split('@')[0]
            job_name = options['process_map'].split('.')[1]
            rule = get_rules_by_name(rule_name, rules_dir, immediate=True)[0]
            rule.map_function = None
            rule.source_tags = []
            disco, ddfs = get_disco_handle(settings.get('server'))
            rule.source_urls = disco.mapresults(job_name)
            job = InfernoJob(rule, settings)
            if job.start():
                job.wait()
        except Exception as e:
            import traceback
            trace = traceback.format_exc(15)
            log.error(trace)
            log.error("Error processing map results for job: %s %s"
                      % (options['process_map'], e))
            raise e
    elif options['immediate_rule']:
        # run inferno in 'immediate' mode
        settings['no_purge'] = True
        setproctitle('inferno - immediate.%s' % options['immediate_rule'])
        immed_rule = settings.get('immediate_rule')
        rules_dir = settings.get('rules_directory')
        rules = get_rules_by_name(immed_rule, rules_dir, immediate=True)
        try:
            for rule in rules:
                execute_rule(rule, settings)
        except Exception as e:
            import traceback
            trace = traceback.format_exc(15)
            log.error('Job failed: %s' % e.message)
            log.error(trace)
            exit(1)
    elif options['run_daemon']:
        # run inferno in 'daemon' mode
        from inferno.lib.daemon import InfernoDaemon
        setproctitle('inferno - master')
        InfernoDaemon(settings).start()
    else:
        # Display help when no options specified
        parser.print_help()
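# A hedged sketch of running a rule in 'immediate' mode programmatically instead of
# through main(), using the same helpers the CLI path above calls. The helper name,
# the assumption that an InfernoSettings instance accepts item assignment, and the
# rule_name/rules_dir arguments are all illustrative, not part of the original module.
def run_rule_immediately(rule_name, rules_dir):
    settings = InfernoSettings()
    settings['no_purge'] = True
    for rule in get_rules_by_name(rule_name, rules_dir, immediate=True):
        execute_rule(rule, settings)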