Example #1
def run(schema_file,
        data_path,
        script=None,
        spec_file=None,
        verbose=None,
        yarn=None):
    """
    Main function to run a PySpark job. It requires a schema file, an HDFS directory
    with data, and an optional script with mapper/reducer functions.
    """
    time0 = time.time()
    # pyspark modules
    from pyspark import SparkContext

    # define the Spark context, the main object used to
    # communicate with Spark
    ctx = SparkContext(appName="AvroKeyInputFormat", pyFiles=[script])
    logger = SparkLogger(ctx)
    if not verbose:
        logger.set_level('ERROR')
    if yarn:
        logger.info("YARN client mode enabled")

    # load FWJR schema
    rdd = ctx.textFile(schema_file, 1).collect()

    # define the input Avro schema; the rdd is a list of lines (sc.textFile behaves like readlines)
    avsc = reduce(lambda x, y: x + y, rdd)  # merge all entries from the rdd list
    schema = ''.join(avsc.split())  # remove spaces in the avsc map
    conf = {"avro.schema.input.key": schema}

    # define newAPIHadoopFile parameters, java classes
    aformat = "org.apache.avro.mapreduce.AvroKeyInputFormat"
    akey = "org.apache.avro.mapred.AvroKey"
    awrite = "org.apache.hadoop.io.NullWritable"
    aconv = "org.apache.spark.examples.pythonconverters.AvroWrapperToJavaConverter"

    # load data from HDFS
    if isinstance(data_path, list):
        avro_rdd = ctx.union([
            ctx.newAPIHadoopFile(f, aformat, akey, awrite, aconv, conf=conf)
            for f in data_path
        ])
    else:
        avro_rdd = ctx.newAPIHadoopFile(data_path,
                                        aformat,
                                        akey,
                                        awrite,
                                        aconv,
                                        conf=conf)

    # process the data; here the map reads records from the avro file.
    # if we need the whole record we use lambda x: x[0], e.g.
    # output = avro_rdd.map(lambda x: x[0]).collect()
    #
    # if we need a particular key, e.g. jobid, we extract it
    # within the lambda function, e.g. lambda x: x[0]['jobid']:
    # output = avro_rdd.map(lambda x: x[0]['jobid']).collect()
    #
    # more generally, we write mapper/reducer functions which are
    # executed by Spark via the collect call
    spec = None
    if spec_file:
        if os.path.isfile(spec_file):
            with open(spec_file) as istream:
                spec = json.load(istream)
        else:
            spec = json.loads(spec_file)
    if script:
        obj = import_(script)
        logger.info("Use user-based script %s" % obj)
        if not hasattr(obj, 'MapReduce'):
            logger.error('Unable to find MapReduce class in %s, %s' \
                    % (script, obj))
            ctx.stop()
            return
        # we have a nested use case where one MapReduce step returns a WMArchive spec;
        # in that case we loop until we get non-spec output
        while True:
            mro = obj.MapReduce(spec)
            # example of collecting records from mapper and
            # passing all of them to reducer function
            records = avro_rdd.map(mro.mapper).collect()
            out = mro.reducer(records)
            logger.info('OUTPUT %s %s' % (out, type(out)))
            if is_spec(out):
                spec = out
            else:
                break

        # the map(f).reduce(f) alternative, which does not collect
        # intermediate records (RDD.reduce already returns a value,
        # so no collect call is needed):
        # out = avro_rdd.map(obj.mapper).reduce(obj.reducer)
    else:
        records = avro_rdd.map(basic_mapper).collect()
        out = basic_reducer(records)
    ctx.stop()
    if verbose:
        logger.info("Elapsed time %s" % htime(time.time() - time0))
    return out
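
A note on the user-script contract: run() only checks that the module passed via `script` exposes a MapReduce class, then calls its mapper on every RDD element and its reducer on the collected results. A minimal sketch of such a script follows; the record layout (each RDD element is a tuple whose first item is the FWJR dictionary) matches the comments above, while the meta_data/jobstate keys and the counting logic are illustrative assumptions, not the actual WMArchive helpers.

# mr_jobstate_count.py -- hypothetical user script for run()
class MapReduce(object):
    """Count FWJR records per jobstate; spec is the optional query spec."""

    def __init__(self, spec=None):
        self.spec = spec or {}

    def mapper(self, record):
        # each element of avro_rdd is an (avro_record, NullWritable) pair,
        # so the FWJR document itself is record[0]
        rec = record[0]
        return rec.get('meta_data', {}).get('jobstate', 'unknown')

    def reducer(self, records):
        # records is the list of mapper outputs collected on the driver
        counts = {}
        for state in records:
            counts[state] = counts.get(state, 0) + 1
        return counts
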
Example #2
def runActionsHistoryQuery(schema_file, data_path, verbose=None, yarn=None):
    """
    Function to run a PySpark job that finds the logs associated with the actionshistory.json file.
    To run, use a spec file with only a timerange, e.g. {"spec": {"timerange": [20180706, 20180706]}},
    and then execute: myspark --spec=large_query.spec --logfail
    """

    if verbose:
        print("### schema: %s" % schema_file)
        print("### path  : %s" % data_path)
    time0 = time.time()
    # pyspark modules
    from pyspark import SparkContext
    from pyspark import SQLContext, StorageLevel

    # define the Spark context, the main object used to
    # communicate with Spark
    ctx = SparkContext(appName="AvroKeyInputFormat")
    logger = SparkLogger(ctx)
    if not verbose:
        logger.set_level('ERROR')
    if yarn:
        logger.info("YARN client mode enabled")

    # load FWJR schema
    rdd = ctx.textFile(schema_file, 1).collect()

    # define the input Avro schema; the rdd is a list of lines (sc.textFile behaves like readlines)
    avsc = reduce(lambda x, y: x + y, rdd)  # merge all entries from the rdd list
    schema = ''.join(avsc.split())  # remove spaces in the avsc map
    conf = {"avro.schema.input.key": schema}

    # define newAPIHadoopFile parameters, java classes
    aformat = "org.apache.avro.mapreduce.AvroKeyInputFormat"
    akey = "org.apache.avro.mapred.AvroKey"
    awrite = "org.apache.hadoop.io.NullWritable"
    aconv = "org.apache.spark.examples.pythonconverters.AvroWrapperToJavaConverter"

    # load data from HDFS
    if isinstance(data_path, list):
        avro_rdd = ctx.union([
            ctx.newAPIHadoopFile(f, aformat, akey, awrite, aconv, conf=conf)
            for f in data_path
        ])
    else:
        avro_rdd = ctx.newAPIHadoopFile(data_path,
                                        aformat,
                                        akey,
                                        awrite,
                                        aconv,
                                        conf=conf)

    # load failing task
    rdd_failing_tasks = ctx.textFile(
        "hdfs:///cms/users/llayer/failing_tasks.csv")

    #
    # first step: filter the data for failing workflows and join with the actionshistory workflows
    #

    # filter the tasks: keep only failing jobs that have log files
    def getFailing(row):
        rec = row[0]
        meta = rec.get('meta_data', {})
        if meta.get('jobstate', '') != 'jobfailed':
            return False
        if rec.get('LFNArray', []) == []:
            return False
        return True

    # create key-value structure for join
    def avro_rdd_KV(row):
        rec = row[0]
        task = rec["task"]
        return (task, rec)

    # filter + join task names
    fail_workflows = avro_rdd.filter(getFailing) \
                             .map(avro_rdd_KV) \
                             .join(rdd_failing_tasks.map(lambda x: (x, x)))

    #
    # second step: filter the data for logcollect tasks and join with the previous result
    #

    # keep only logcollect jobs
    def filterLogCollect(row):
        rec = row[0]
        meta = rec.get('meta_data', {})
        if meta.get('jobtype', '').lower() != 'logcollect':
            return False
        return True

    # create KV structure using log archive as key and return important information
    def log_KV(row):
        rec = row[1][0]
        task = rec["task"]
        lfn_array = rec.get('LFNArray', [])
        meta = rec.get('meta_data', {})
        jobstate = meta.get('jobstate', '')
        steps = rec.get('steps', [])
        status = []
        site = []
        for step in steps:
            status.append(step.get('status', ''))
            site.append(step.get('site', ''))
        out_dict = {
            'task': task,
            'jobstate': jobstate,
            'status': status,
            'site': site
        }
        return [(lfn, out_dict) for lfn in lfn_array if 'logArch' in lfn]

    # create KV structure for log collect jobs using log archives as keys
    def logcoll_KV(row):
        rec = row[0]
        task = rec["task"]
        lfn_array = rec.get('LFNArray', [])
        logCollect = ""
        for lfn in lfn_array:
            if 'logcollect' in lfn.lower():
                logCollect = lfn
        meta = rec.get('meta_data', {})
        jobstate = meta.get('jobstate', '')
        out_dict = {
            'logcollect_task': task,
            'logcollect_jobstate': jobstate,
            'logcollect_lfn': logCollect
        }
        return [(lfn, out_dict) for lfn in lfn_array if 'logArch' in lfn]

    # log collect tasks
    logColl = avro_rdd.filter(filterLogCollect).flatMap(logcoll_KV)

    # join the frames with log archives as keys
    result = fail_workflows.flatMap(log_KV).join(logColl)

    # write back the records to hdfs
    result.saveAsTextFile("hdfs:///cms/users/llayer/logs10.csv")

    ctx.stop()
    if verbose:
        logger.info("Elapsed time %s" % htime(time.time() - time0))

    return 0
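
To see what the two joins above produce without touching HDFS, the small local sketch below replays them on hand-made records; the record shapes (log-archive LFN as key, dictionaries like those built by log_KV and logcoll_KV as values) follow the code above, while the sample data, paths, and the local[*] context are illustrative assumptions.

from pyspark import SparkContext

ctx = SparkContext("local[*]", appName="join-shape-demo")

# (logArch LFN, info about the failing job) -- the shape produced by log_KV
fail = ctx.parallelize([
    ("/store/unmerged/logs/logArch1.tar",
     {"task": "/TaskA", "jobstate": "jobfailed"}),
])

# (logArch LFN, info about the log-collect job) -- the shape produced by logcoll_KV
coll = ctx.parallelize([
    ("/store/unmerged/logs/logArch1.tar",
     {"logcollect_task": "/TaskA/LogCollect",
      "logcollect_jobstate": "success",
      "logcollect_lfn": "/store/logs/logcollect1.tar"}),
])

# joining on the log-archive LFN yields (lfn, (fail_info, logcollect_info))
# tuples, i.e. the kind of records that Example #2 saves back to HDFS
print(fail.join(coll).collect())
ctx.stop()
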
Example #3
def run(schema_file, data_path, script=None, spec_file=None, verbose=None, yarn=None):
    """
    Main function to run a PySpark job. It requires a schema file, an HDFS directory
    with data, and an optional script with mapper/reducer functions.
    """
    time0 = time.time()
    # pyspark modules
    from pyspark import SparkContext

    # define the Spark context, the main object used to
    # communicate with Spark
    ctx = SparkContext(appName="AvroKeyInputFormat", pyFiles=[script])
    logger = SparkLogger(ctx)
    if not verbose:
        logger.set_level('ERROR')
    if yarn:
        logger.info("YARN client mode enabled")

    # load FWJR schema
    rdd = ctx.textFile(schema_file, 1).collect()

    # define the input Avro schema; the rdd is a list of lines (sc.textFile behaves like readlines)
    avsc = reduce(lambda x, y: x + y, rdd)  # merge all entries from the rdd list
    schema = ''.join(avsc.split())  # remove spaces in the avsc map
    conf = {"avro.schema.input.key": schema}

    # define newAPIHadoopFile parameters, java classes
    aformat="org.apache.avro.mapreduce.AvroKeyInputFormat"
    akey="org.apache.avro.mapred.AvroKey"
    awrite="org.apache.hadoop.io.NullWritable"
    aconv="org.apache.spark.examples.pythonconverters.AvroWrapperToJavaConverter"

    # load data from HDFS
    if isinstance(data_path, list):
        avro_rdd = ctx.union([ctx.newAPIHadoopFile(f, aformat, akey, awrite, aconv, conf=conf) for f in data_path])
    else:
        avro_rdd = ctx.newAPIHadoopFile(data_path, aformat, akey, awrite, aconv, conf=conf)

    # process the data; here the map reads records from the avro file.
    # if we need the whole record we use lambda x: x[0], e.g.
    # output = avro_rdd.map(lambda x: x[0]).collect()
    #
    # if we need a particular key, e.g. jobid, we extract it
    # within the lambda function, e.g. lambda x: x[0]['jobid']:
    # output = avro_rdd.map(lambda x: x[0]['jobid']).collect()
    #
    # more generally, we write mapper/reducer functions which are
    # executed by Spark via the collect call
    spec = None
    if spec_file:
        if os.path.isfile(spec_file):
            with open(spec_file) as istream:
                spec = json.load(istream)
        else:
            spec = json.loads(spec_file)
    if script:
        obj = import_(script)
        logger.info("Use user-based script %s" % obj)
        if not hasattr(obj, 'MapReduce'):
            logger.error('Unable to find MapReduce class in %s, %s' \
                    % (script, obj))
            ctx.stop()
            return
        mro = obj.MapReduce(spec)
        # example of collecting records from mapper and
        # passing all of them to reducer function
        records = avro_rdd.map(mro.mapper).collect()
        out = mro.reducer(records)

        # the map(f).reduce(f) alternative, which does not collect
        # intermediate records (RDD.reduce already returns a value,
        # so no collect call is needed):
        # out = avro_rdd.map(obj.mapper).reduce(obj.reducer)
    else:
        records = avro_rdd.map(basic_mapper).collect()
        out = basic_reducer(records)
    ctx.stop()
    if verbose:
        logger.info("Elapsed time %s" % htime(time.time() - time0))
    return out
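
The basic_mapper/basic_reducer fallback used when no script is supplied is referenced but never shown in these examples. A minimal sketch of what such helpers could look like follows, assuming the same (record, NullWritable) tuples as above; the actual WMArchive implementations may differ.

def basic_mapper(record):
    """Extract a small summary from a single FWJR record."""
    rec = record[0]  # the Avro record is the first element of the tuple
    meta = rec.get('meta_data', {})
    return {'task': rec.get('task', ''), 'jobstate': meta.get('jobstate', '')}

def basic_reducer(records):
    """Aggregate the collected mapper output into per-jobstate counts."""
    counts = {}
    for summary in records:
        state = summary.get('jobstate', 'unknown')
        counts[state] = counts.get(state, 0) + 1
    return counts
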
Example #4
def run(schema_file, data_path, script=None, spec_file=None, verbose=None, rout=None, yarn=None):
    """
    Main function to run a PySpark job. It requires a schema file, an HDFS directory
    with data, and an optional script with mapper/reducer functions.
    """
    if script:
        script = get_script(script)
    if verbose:
        print("### schema: %s" % schema_file)
        print("### path  : %s" % data_path)
        print("### script: %s" % script)
        print("### spec  : %s" % spec_file)
    time0 = time.time()
    # pyspark modules
    from pyspark import SparkContext

    # define the Spark context, the main object used to
    # communicate with Spark
    ctx = SparkContext(appName="AvroKeyInputFormat", pyFiles=[script])
    logger = SparkLogger(ctx)
    if not verbose:
        logger.set_level('ERROR')
    if yarn:
        logger.info("YARN client mode enabled")

    # load FWJR schema
    rdd = ctx.textFile(schema_file, 1).collect()

    # define the input Avro schema; the rdd is a list of lines (sc.textFile behaves like readlines)
    avsc = reduce(lambda x, y: x + y, rdd)  # merge all entries from the rdd list
    schema = ''.join(avsc.split())  # remove spaces in the avsc map
    conf = {"avro.schema.input.key": schema}

    # define newAPIHadoopFile parameters, java classes
    aformat="org.apache.avro.mapreduce.AvroKeyInputFormat"
    akey="org.apache.avro.mapred.AvroKey"
    awrite="org.apache.hadoop.io.NullWritable"
    aconv="org.apache.spark.examples.pythonconverters.AvroWrapperToJavaConverter"

    # load data from HDFS
    if isinstance(data_path, list):
        avro_rdd = ctx.union([ctx.newAPIHadoopFile(f, aformat, akey, awrite, aconv, conf=conf) for f in data_path])
    else:
        avro_rdd = ctx.newAPIHadoopFile(data_path, aformat, akey, awrite, aconv, conf=conf)

    # process the data; here the map reads records from the avro file.
    # if we need the whole record we use lambda x: x[0], e.g.
    # output = avro_rdd.map(lambda x: x[0]).collect()
    #
    # if we need a particular key, e.g. jobid, we extract it
    # within the lambda function, e.g. lambda x: x[0]['jobid']:
    # output = avro_rdd.map(lambda x: x[0]['jobid']).collect()
    #
    # more generally, we write mapper/reducer functions which are
    # executed by Spark via the collect call
    # start from an empty spec so the verbose/rout flags below can always be recorded
    spec = {}
    if spec_file:
        if os.path.isfile(spec_file):
            with open(spec_file) as istream:
                spec = json.load(istream)
        else:
            spec = json.loads(spec_file)
    if verbose:
        spec['verbose'] = 1
        print("### spec %s" % json.dumps(spec))
    if rout:
        spec['output'] = rout
    if script:
        obj = import_(script)
        logger.info("Use user-based script %s" % obj)
        if not hasattr(obj, 'MapReduce'):
            logger.error('Unable to find MapReduce class in %s, %s' \
                    % (script, obj))
            ctx.stop()
            return
        # we have a nested use case where one MapReduce step returns a WMArchive spec;
        # in that case we loop until we get non-spec output
        count = 0
        while True:
            mro = obj.MapReduce(spec)
            mname = mro.__dict__.get('name', '').split('.')[0]
            print("### Load %s" % mname)
            if mname.lower().endswith('counter'):
                out = avro_rdd.filter(mro.mapper).count()
                if rout:
                    with open(rout, 'w') as ostream:
                        ostream.write(str(out))
                break
            # example of collecting records from mapper and
            # passing all of them to reducer function
            records = avro_rdd.filter(mro.mapper).collect()
            out = mro.reducer(records)
            if verbose:
                print("### Loop count %s" % count)
            if count > 3:
                print("### WARNING, loop counter exceeded its limit")
                break
            if is_spec(out):
                spec = out
            else:
                break
            count += 1

        # the map(f).reduce(f) alternative, which does not collect
        # intermediate records (RDD.reduce already returns a value,
        # so no collect call is needed):
        # out = avro_rdd.map(obj.mapper).reduce(obj.reducer)
    else:
        records = avro_rdd.map(basic_mapper).collect()
        out = basic_reducer(records)
    ctx.stop()
    if verbose:
        logger.info("Elapsed time %s" % htime(time.time() - time0))
    return out
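
Example #4 treats a script whose name attribute ends with 'counter' specially: its mapper is used as a filter predicate and only the number of matching records is returned (and optionally written to rout). A minimal sketch of such a counter-style script under those assumptions; the module name, the name value, and the failed-job predicate are illustrative.

# failed_counter.py -- hypothetical counter-style script for Example #4
class MapReduce(object):
    def __init__(self, spec=None):
        self.spec = spec or {}
        # run() inspects mro.__dict__['name']; a value ending in 'counter'
        # switches the job into filter+count mode
        self.name = 'failed_counter'

    def mapper(self, record):
        # used as a filter predicate: keep only failed jobs
        rec = record[0]
        return rec.get('meta_data', {}).get('jobstate', '') == 'jobfailed'

    def reducer(self, records):
        # not used in counter mode; kept to satisfy the MapReduce contract
        return len(records)
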