Example #1
File: job.py Project: pooya/inferno
    def __init__(self, rule, settings, urls=None):
        self.job_options = JobOptions(rule, settings)
        self.rule = rule
        self.settings = settings
        rule_params = dict(rule.params.__dict__)
        self.disco, self.ddfs = get_disco_handle(rule_params.get('server', settings.get('server')))
        rule_params.update(settings)
        self.params = Params(**rule_params)
        self.urls = urls

        try:
            # attempt to allow for an overridden worker class from the settings file or rule
            if rule.worker:
                worker = rule.worker
            else:
                worker_mod, dot, worker_class = settings.get('worker').rpartition('.')
                mod = __import__(worker_mod, {}, {}, worker_mod)
                worker = getattr(mod, worker_class)()
            self.job = Job(name=rule.name,
                           master=self.disco.master,
                           worker=worker)
        except Exception as e:
            log.warn("Error instantiating worker: %s %s - loading default worker"
                     % (settings.get('worker'), e))
            self.job = Job(name=rule.name,
                           master=self.disco.master)
        self.full_job_id = None
        self.jobinfo = None
        self._notify(JOB_START)
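The worker override above reduces to loading a class from a dotted path: split 'package.module.ClassName' with rpartition, import the module, and pull the class off it with getattr. A minimal standalone sketch of that pattern (the dotted path in the usage comment is only an illustration):

    def load_class(dotted_path):
        # 'package.module.ClassName' -> ('package.module', '.', 'ClassName')
        module_path, _, class_name = dotted_path.rpartition('.')
        # A non-empty fromlist makes __import__ return the leaf module
        # rather than the top-level package.
        module = __import__(module_path, {}, {}, [class_name])
        return getattr(module, class_name)

    # e.g. worker = load_class('disco.worker.classic.worker.Worker')()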
Example #2
File: job.py Project: 0scarLi/inferno
    def __init__(self, rule, settings, urls=None):
        self.job_options = JobOptions(rule, settings)
        self.rule = rule
        self.settings = settings
        rule_params = dict(rule.params.__dict__)
        self.disco, self.ddfs = get_disco_handle(rule_params.get("server", settings.get("server")))
        rule_params.update(settings)
        self.params = Params(**rule_params)
        self.urls = urls

        try:
            # attempt to allow for an overridden worker class from the settings file or rule
            if rule.worker:
                worker = rule.worker
            else:
                worker_mod, dot, worker_class = settings.get("worker").rpartition(".")
                mod = __import__(worker_mod, {}, {}, worker_mod)
                worker = getattr(mod, worker_class)()
            self.job = Job(name=rule.name, master=self.disco.master, worker=worker)
        except Exception as e:
            log.warn("Error instantiating worker: %s %s - loading default worker" % (settings.get("worker"), e))
            self.job = Job(name=rule.name, master=self.disco.master)
        self.full_job_id = None
        self.jobinfo = None
        self._notify(JOB_START)
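Note the precedence that rule_params.update(settings) establishes: any key defined in both the rule params and the settings ends up with the settings value by the time Params(**rule_params) is built. A two-line illustration with made-up keys:

    rule_params = {'server': 'disco://rule-host', 'day_range': 7}
    rule_params.update({'server': 'disco://prod-host'})  # settings win
    # Params(**rule_params) now sees server='disco://prod-host', day_range=7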
Example #3
def _run_concurrent_rules(rule_list, settings, urls_blackboard):
    """Execute a list of rules concurrently, it assumes all the rules are
    runable(ie. all output urls of its sub_rule are available)

    Output:
        job_results. A dictionary of (rule_name : outputurls) pairs
    Exceptions:
        Exception, if it fails to start a job or one of the jobs dies
    """
    def _get_rule_name(disco_job_name):
        return disco_job_name.rsplit('@')[0]

    # need to save both inferno_jobs and disco_jobs
    jobs = []
    inferno_jobs = []
    for rule in rule_list:
        urls = []
        for sub_rule in extract_subrules(rule):
            urls += urls_blackboard[sub_rule.name]
        inferno_job, job = _start_job(rule, settings, urls)
        if job:
            jobs.append(job)
            inferno_jobs.append(inferno_job)
        else:
            raise Exception('There are not enough blobs to run %s' % rule.name)

    job_results = {}
    stop = False
    server, _ = get_disco_handle(settings.get('server'))
    while jobs:
        try:
            inactive, active = server.results(jobs, 5000)
        except CommError:
            # tolerate long-running jobs (e.g. ones still waiting on shuffling)
            continue
        for jobname, (status, results) in inactive:
            if status == "ready":
                job_results[_get_rule_name(jobname)] = results
            elif status == "dead":
                stop = True
        jobs = active
        if stop:
            break

    if stop:
        for jobname, _ in jobs:
            server.kill(jobname)
        raise Exception('One of the concurrent jobs failed.')

    return inferno_jobs, job_results
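The urls_blackboard argument acts as a shared scoreboard: each finished rule publishes its output urls under its own name, and a dependent rule collects the urls of all its sub-rules before it starts. A minimal sketch of that flow, using made-up rule names:

    # Hypothetical two-stage pipeline: 'clean' feeds 'aggregate'.
    urls_blackboard = {'clean': [], 'aggregate': []}

    # Stage one finishes and publishes its output urls.
    urls_blackboard['clean'] = ['tag://processed:clean:2014-01-01']

    # Stage two gathers inputs from every sub-rule it depends on,
    # just as the loop over extract_subrules(rule) does above.
    inputs = []
    for dep_name in ['clean']:
        inputs += urls_blackboard[dep_name]
    # _start_job(aggregate_rule, settings, inputs) would run next.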
Example #5
def expire_data(days, dryrun, extra_tags):
    settings = InfernoSettings()
    _, ddfs = get_disco_handle(settings["server"])
    tags = extract_tags_from_infernyx()
    tags = tags.union(extra_tags)
    to_delete = []
    date_lower = date.today() + timedelta(days=-days)
    try:
        all_tags = ddfs.list()
    except Exception as e:
        print "Can not fetch the tag list from ddfs: %s" % e
        return

    for tag in all_tags:
        try:
            prefix, ds = tag.rsplit(':', 1)
            tag_date = datetime.strptime(ds, "%Y-%m-%d").date()
        except ValueError:
            continue  # ignore non-standard tag names
        if prefix in tags and tag_date < date_lower:
            to_delete.append(tag)

    to_delete.sort()  # delete tags with "incoming" first, then the "processed" ones
    if dryrun:
        if to_delete:
            print "Following tags will be deleted:\n"
            print "\n".join(to_delete)
        else:
            print "Nothing to be done"
    else:
        for tag in to_delete:
            try:
                print "Deleting tag: %s" % tag
                ddfs.delete(tag)
            except Exception as e:
                print "Failed to delete tag %s: %s" % (tag, e)
Example #6
def main(argv=sys.argv):
    options, parser = _get_options(argv[1:])
    settings = _get_settings(options)

    if options['example_rules']:
        try:
            os.mkdir(options['example_rules'])
            here = os.path.dirname(__file__)
            src_dir = os.path.join(here, '..', 'example_rules')
            src_dir = os.path.abspath(src_dir)
            dst_dir = os.path.abspath(options['example_rules'])
            for name in os.listdir(src_dir):
                if name.endswith('.py'):
                    src = os.path.join(src_dir, name)
                    dst = os.path.join(dst_dir, name)
                    shutil.copy(src, dst)
            print '\n\tCreated example rules dir:\n\n\t\t%s' % dst_dir
            for name in os.listdir(dst_dir):
                print '\t\t\t', name
        except Exception as e:
            print 'Error creating example rules dir %r' % (e)
        finally:
            return

    _setup_logging(settings)

    for path in settings.get('extra_python_paths'):
        sys.path.insert(0, path)

    if options['process_results']:
        settings['no_purge'] = True
        rules_dir = options.get('rules_directory')
        if not rules_dir:
            rules_dir = settings.get('rules_directory')
        try:
            rule_name = options['process_results'].split('@')[0]
            job_name = options['process_results'].split('.')[1]
            rule = get_rules_by_name(rule_name, rules_dir, immediate=True)[0]
            job = InfernoJob(rule, settings)
            status, results = job.disco.results(job_name)
            if status == 'ready':
                if job.rule.rule_init_function:
                    job.rule.rule_init_function(job.params)
                rule.result_processor(rule.result_iterator(results),
                                      params=job.params,
                                      job_id=job_name)
        except Exception as e:
            import traceback
            trace = traceback.format_exc(15)
            log.error(trace)
            log.error("Error processing results for job: %s %s" %
                      (options['process_results'], e))
            raise e
    elif options['process_map']:
        settings['no_purge'] = True
        rules_dir = options.get('rules_directory')
        if not rules_dir:
            rules_dir = settings.get('rules_directory')
        try:
            rule_name = options['process_map'].split('@')[0]
            job_name = options['process_map'].split('.')[1]
            rule = get_rules_by_name(rule_name, rules_dir, immediate=True)[0]
            rule.map_function = None
            rule.source_tags = []
            disco, ddfs = get_disco_handle(settings.get('server'))
            rule.source_urls = disco.mapresults(job_name)
            job = InfernoJob(rule, settings)
            if job.start():
                job.wait()
        except Exception as e:
            import traceback
            trace = traceback.format_exc(15)
            log.error(trace)
            log.error("Error processing map results for job: %s %s" %
                      (options['process_map'], e))
            raise e
    elif options['immediate_rule']:
        # run inferno in 'immediate' mode
        settings['no_purge'] = True
        setproctitle('inferno - immediate.%s' % options['immediate_rule'])
        immed_rule = settings.get('immediate_rule')
        rules_dir = settings.get('rules_directory')
        rules = get_rules_by_name(immed_rule, rules_dir, immediate=True)
        try:
            for rule in rules:
                execute_rule(rule, settings)
        except Exception as e:
            import traceback
            trace = traceback.format_exc(15)
            log.error('Job failed: %s' % e.message)
            log.error(trace)
            exit(1)

    elif options['run_daemon']:
        # run inferno in 'daemon' mode
        from inferno.lib.daemon import InfernoDaemon
        setproctitle('inferno - master')
        InfernoDaemon(settings).start()

    else:
        # Display help when no options specified
        parser.print_help()
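Both the process_results and process_map branches resolve rules_directory the same way: an explicit command-line option wins, otherwise the settings file supplies the value. That fallback could be captured once in a helper (the name resolve is hypothetical):

    def resolve(key, options, settings):
        # Prefer an explicit command-line option; otherwise fall back
        # to the value from the settings file.
        value = options.get(key)
        return value if value is not None else settings.get(key)

    # rules_dir = resolve('rules_directory', options, settings)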
Example #7
File: run.py Project: chango/inferno
def main(argv=sys.argv):
    options, parser = _get_options(argv[1:])
    settings = _get_settings(options)

    if options['example_rules']:
        try:
            os.mkdir(options['example_rules'])
            here = os.path.dirname(__file__)
            src_dir = os.path.join(here, '..', 'example_rules')
            src_dir = os.path.abspath(src_dir)
            dst_dir = os.path.abspath(options['example_rules'])
            for name in os.listdir(src_dir):
                if name.endswith('.py'):
                    src = os.path.join(src_dir, name)
                    dst = os.path.join(dst_dir, name)
                    shutil.copy(src, dst)
            print '\n\tCreated example rules dir:\n\n\t\t%s' % dst_dir
            for name in os.listdir(dst_dir):
                print '\t\t\t', name
        except Exception as e:
            print 'Error creating example rules dir %r' % (e)
        finally:
            return

    _setup_logging(settings)

    for path in settings.get('extra_python_paths'):
        sys.path.insert(0, path)

    if options['process_results']:
        settings['no_purge'] = True
        rules_dir = options.get('rules_directory')
        if not rules_dir:
            rules_dir = settings.get('rules_directory')
        try:
            rule_name = options['process_results'].split('@')[0]
            job_name = options['process_results'].split('.')[1]
            rule = get_rules_by_name(rule_name, rules_dir, immediate=True)[0]
            job = InfernoJob(rule, settings)
            status, results = job.disco.results(job_name)
            if status == 'ready':
                if job.rule.rule_init_function:
                    job.rule.rule_init_function(job.params)
                rule.result_processor(rule.result_iterator(results), params=job.params, job_id=job_name)
        except Exception as e:
            import traceback
            trace = traceback.format_exc(15)
            log.error(trace)
            log.error("Error processing results for job: %s %s" % (options['process_results'], e))
            raise e
    elif options['process_map']:
        settings['no_purge'] = True
        rules_dir = options.get('rules_directory')
        if not rules_dir:
            rules_dir = settings.get('rules_directory')
        try:
            rule_name = options['process_map'].split('@')[0]
            job_name = options['process_map'].split('.')[1]
            rule = get_rules_by_name(rule_name, rules_dir, immediate=True)[0]
            rule.map_function = None
            rule.source_tags = []
            disco, ddfs = get_disco_handle(settings.get('server'))
            rule.source_urls = disco.mapresults(job_name)
            job = InfernoJob(rule, settings)
            if job.start():
                job.wait()
        except Exception as e:
            import traceback
            trace = traceback.format_exc(15)
            log.error(trace)
            log.error("Error processing map results for job: %s %s" % (options['process_map'], e))
            raise e
    elif options['immediate_rule']:
        # run inferno in 'immediate' mode
        settings['no_purge'] = True
        setproctitle('inferno - immediate.%s' % options['immediate_rule'])
        immed_rule = settings.get('immediate_rule')
        rules_dir = settings.get('rules_directory')
        rules = get_rules_by_name(immed_rule, rules_dir, immediate=True)
        try:
            for rule in rules:
                execute_rule(rule, settings)
        except Exception as e:
            import traceback
            trace = traceback.format_exc(15)
            log.error('Job failed: %s' % e.message)
            log.error(trace)
            exit(1)

    elif options['run_daemon']:
        # run inferno in 'daemon' mode
        from inferno.lib.daemon import InfernoDaemon
        setproctitle('inferno - master')
        InfernoDaemon(settings).start()

    else:
        # Display help when no options specified
        parser.print_help()