# Example no. 1
def pre_process_data(ns, tf, log_details):
    """Apply optional pre-processing steps to the input RDD.

    Depending on the parsed options, the stream may be down-sampled
    and/or have every element decoded as JSON before being returned.

    :param ns: argparse-style namespace; reads `ns.sample` (a sampling
        fraction, or falsy to skip) and `ns.mapjson` (truthy to decode
        each element as JSON)
    :param tf: the RDD (textFile) to transform
    :param log_details: dict of extra context merged into log records
    :return: the (possibly transformed) RDD
    """
    stream = tf
    if ns.sample:
        log.info('sampling a percentage of input data without replacement',
                 extra=dict(sampling_pct=ns.sample, **log_details))
        # sample(withReplacement=False, fraction=ns.sample, seed=0)
        stream = stream.sample(False, ns.sample, 0)
    if ns.mapjson:
        log.info('converting all elements in data stream to json')
        stream = stream.map(simplejson.loads)
    return stream
# Example no. 2
def apply_data_transform(ns, sc, log_details, pjob_id, module):
    """Pass control to the module.main method.  If module.main specifies a
    `textFile` parameter, pass the textFile instance.  Otherwise, just map
    module.main on the RDD

    :param ns: argparse-style namespace of job options (reads `ns.read_fp`,
        `ns.write_fp`, `ns.minPartitions`, plus whatever
        `pre_process_data` reads)
    :param sc: the SparkContext
    :param log_details: dict of extra context merged into log records
    :param pjob_id: dict of parsed job-id components, expanded as extra
        keyword arguments to module.main
    :param module: plugin module exposing a `main` callable
    """
    # inspect.getargspec was removed in Python 3.11; prefer
    # getfullargspec (same `.args` attribute) and fall back only on
    # interpreters that lack it
    _argspec = getattr(inspect, 'getfullargspec', inspect.getargspec)
    func_args = _argspec(module.main).args
    if 'sc' in func_args:
        # module.main wants direct control of the SparkContext
        log.info(
            'passing spark context to a module.main function',
            extra=log_details)
        try:
            module.main(sc=sc, ns=ns, **pjob_id)
        except Exception as err:
            log_and_raise(
                "Job failed with error: %s" % err, log_details)
    else:
        read_fp = format_fp(ns.read_fp, ns, pjob_id)
        log_details = dict(read_fp=read_fp, **log_details)
        tf = sc.textFile(read_fp, ns.minPartitions)
        tf = pre_process_data(ns=ns, tf=tf, log_details=log_details)
        if 'textFile' in func_args:
            # module.main consumes the whole RDD itself
            log.info(
                'passing textFile instance to a module.main function',
                extra=log_details)
            try:
                module.main(textFile=tf, ns=ns, **pjob_id)
            except Exception as err:
                log_and_raise(
                    "Job failed with error: %s" % err, log_details)

        else:
            # default: map module.main over each element and persist
            write_fp = format_fp(ns.write_fp, ns, pjob_id)
            log.info(
                'mapping a module.main function to all elements in a textFile'
                ' and writing output',
                extra=dict(write_fp=write_fp, **log_details))
            try:
                (
                    tf
                    .map(functools.partial(module.main, ns=ns, **pjob_id))
                    .saveAsTextFile(write_fp)
                )
            except Exception as err:
                # consistency fix: the other branches format the message;
                # this one previously passed the bare exception object
                log_and_raise(
                    "Job failed with error: %s" % err, log_details)
# Example no. 3
def main(ns):
    """
    A generic plugin that schedules arbitrary bash jobs using Stolos

    Assume code is written in Python.  For Scala or R code, use another option.

    :param ns: argparse-style namespace; reads `ns.job_id`, `ns.app_name`,
        `ns.bash_cmd`, `ns.redirect_to_stderr` and `ns.watch`
    :raises UserWarning: if no default or user-supplied command exists
    """
    job_id = ns.job_id
    ld = dict(app_name=ns.app_name, job_id=ns.job_id)
    log.info('Running bash job', extra=ld)
    cmd = get_bash_cmd(ns.app_name)
    if ns.bash_cmd:
        # bug fix: join the default command and the user-supplied options
        # with a space; previously `cmd += ' '.join(...)` fused the last
        # default token with the first user token
        user_opts = ' '.join(ns.bash_cmd)
        cmd = '%s %s' % (cmd, user_opts) if cmd else user_opts
        log.debug("Appending user-supplied bash options to defaults",
                  extra=dict(app_name=ns.app_name, job_id=job_id, cmd=cmd))
    ld.update(cmd=cmd)
    if not cmd:
        raise UserWarning(
            "You need to specify bash options or configure default bash"
            " options")

    # expand {placeholders} in the command template from the namespace
    # plus the parsed job-id components
    _cmdargs = dict(**ns.__dict__)
    _cmdargs.update(api.parse_job_id(ns.app_name, job_id))
    cmd = cmd.format(**_cmdargs)

    if ns.redirect_to_stderr:
        _std = sys.stderr
    else:
        _std = PIPE

    log.info('running command', extra=ld)
    returncode, stdout, stderr = run(cmd,
                                     shell=True,
                                     timeout=ns.watch,
                                     stdout=_std,
                                     stderr=_std)
    ld = dict(bash_returncode=returncode, stdout=stdout, stderr=stderr, **ld)
    if returncode == -9:
        # -9 presumably means the child was SIGKILLed by the timeout
        # machinery in `run` -- confirm against its implementation
        log_and_raise("Bash job timed out", ld)
    elif returncode != 0:
        # this raises an error and logs output:
        log_and_raise("Bash job failed", ld)
    else:
        log.info("Bash job succeeded", extra=ld)
# Example no. 4
def main(ns):
    """
    A generic plugin that schedules arbitrary bash jobs using Stolos

    Assume code is written in Python.  For Scala or R code, use another option.

    :param ns: argparse-style namespace; reads `ns.job_id`, `ns.app_name`,
        `ns.bash_cmd`, `ns.redirect_to_stderr` and `ns.watch`
    :raises UserWarning: if no default or user-supplied command exists
    """
    job_id = ns.job_id
    ld = dict(app_name=ns.app_name, job_id=ns.job_id)
    log.info('Running bash job', extra=ld)
    cmd = get_bash_cmd(ns.app_name)
    if ns.bash_cmd:
        # bug fix: insert a space between the default command and the
        # user-supplied options; `cmd += ' '.join(...)` merged the two
        # adjacent tokens into one
        user_opts = ' '.join(ns.bash_cmd)
        cmd = '%s %s' % (cmd, user_opts) if cmd else user_opts
        log.debug(
            "Appending user-supplied bash options to defaults", extra=dict(
                app_name=ns.app_name, job_id=job_id, cmd=cmd))
    ld.update(cmd=cmd)
    if not cmd:
        raise UserWarning(
            "You need to specify bash options or configure default bash"
            " options")

    # expand {placeholders} in the command template from the namespace
    # plus the parsed job-id components
    _cmdargs = dict(**ns.__dict__)
    _cmdargs.update(api.parse_job_id(ns.app_name, job_id))
    cmd = cmd.format(**_cmdargs)

    if ns.redirect_to_stderr:
        _std = sys.stderr
    else:
        _std = PIPE

    log.info('running command', extra=ld)
    returncode, stdout, stderr = run(
        cmd, shell=True, timeout=ns.watch, stdout=_std, stderr=_std)
    ld = dict(bash_returncode=returncode, stdout=stdout, stderr=stderr, **ld)
    if returncode == -9:
        # -9 presumably means the child was SIGKILLed by the timeout
        # machinery in `run` -- confirm against its implementation
        log_and_raise("Bash job timed out", ld)
    elif returncode != 0:
        # this raises an error and logs output:
        log_and_raise("Bash job failed", ld)
    else:
        log.info("Bash job succeeded", extra=ld)