def run(schema_file, data_path, script=None, spec_file=None, verbose=None, yarn=None):
    """
    Main function to run a pyspark job. It requires a schema file, an HDFS
    directory with data and an optional script with mapper/reducer functions.
    """
    time0 = time.time()
    # pyspark modules
    from pyspark import SparkContext

    # define the spark context, the main object used to communicate with spark
    ctx = SparkContext(appName="AvroKeyInputFormat", pyFiles=[script])
    logger = SparkLogger(ctx)
    if not verbose:
        logger.set_level('ERROR')
    if yarn:
        logger.info("YARN client mode enabled")

    # load FWJR schema
    rdd = ctx.textFile(schema_file, 1).collect()

    # define the input avro schema; the rdd is a list of lines
    # (sc.textFile behaves similarly to readlines)
    avsc = reduce(lambda x, y: x + y, rdd)  # merge all entries from the rdd list
    schema = ''.join(avsc.split())  # remove spaces in the avsc map
    conf = {"avro.schema.input.key": schema}

    # define newAPIHadoopFile parameters, java classes
    aformat = "org.apache.avro.mapreduce.AvroKeyInputFormat"
    akey = "org.apache.avro.mapred.AvroKey"
    awrite = "org.apache.hadoop.io.NullWritable"
    aconv = "org.apache.spark.examples.pythonconverters.AvroWrapperToJavaConverter"

    # load data from HDFS
    if isinstance(data_path, list):
        avro_rdd = ctx.union([ctx.newAPIHadoopFile(f, aformat, akey, awrite, aconv, conf=conf)
                              for f in data_path])
    else:
        avro_rdd = ctx.newAPIHadoopFile(data_path, aformat, akey, awrite, aconv, conf=conf)

    # process the data; the map reads records from the avro file
    # if we need a whole record we use lambda x: x[0], e.g.
    # output = avro_rdd.map(lambda x: x[0]).collect()
    #
    # if we need a particular key, e.g. jobid, we extract it
    # within the lambda function, e.g. lambda x: x[0]['jobid']
    # output = avro_rdd.map(lambda x: x[0]['jobid']).collect()
    #
    # more generally we write mapper/reducer functions which are
    # executed by Spark via the collect call
    spec = None
    if spec_file:
        if os.path.isfile(spec_file):
            spec = json.load(open(spec_file))
        else:
            spec = json.loads(spec_file)
    if script:
        obj = import_(script)
        logger.info("Use user-based script %s" % obj)
        if not hasattr(obj, 'MapReduce'):
            logger.error('Unable to find MapReduce class in %s, %s'
                         % (script, obj))
            ctx.stop()
            return
        # nested use case: one MR may return a WMArchive spec,
        # loop until we get non-spec output
        while True:
            mro = obj.MapReduce(spec)
            # collect records from the mapper and
            # pass all of them to the reducer function
            records = avro_rdd.map(mro.mapper).collect()
            out = mro.reducer(records)
            logger.info('OUTPUT %s %s' % (out, type(out)))
            if is_spec(out):
                spec = out
            else:
                break
        # the map(f).reduce(f) alternative does not collect
        # intermediate records:
        # out = avro_rdd.map(obj.mapper).reduce(obj.reducer).collect()
    else:
        records = avro_rdd.map(basic_mapper).collect()
        out = basic_reducer(records)
    ctx.stop()
    if verbose:
        logger.info("Elapsed time %s" % htime(time.time() - time0))
    return out
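# ----------------------------------------------------------------------------
# Hypothetical sketch (not part of this module): a minimal user script of the
# kind run() expects. run() imports the script, instantiates its MapReduce
# class with the optional spec, applies mapper() to every (record, None) pair
# produced by newAPIHadoopFile, collects the results and hands the list to
# reducer(). The FWJR field names below (meta_data, jobstate) are assumptions
# based on the record access used elsewhere in this file.

class MapReduce(object):
    "Sketch of a user mapper/reducer which counts failed jobs"
    def __init__(self, spec=None):
        self.spec = spec  # optional WMArchive spec, may be None

    def mapper(self, record):
        "Return 1 for a failed-job record, 0 otherwise"
        rec = record[0]  # the avro record is the key of the (key, None) pair
        meta = rec.get('meta_data', {})
        return 1 if meta.get('jobstate', '') == 'jobfailed' else 0

    def reducer(self, records):
        "Aggregate the collected mapper outputs into a summary dict"
        return {'nfailed': sum(records)}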
def runActionsHistoryQuery(schema_file, data_path, verbose=None, yarn=None):
    """
    Run a pyspark job which finds the logs associated with the
    actionshistory.json file. To run, use a spec file containing only a
    timerange, e.g. {"spec": {"timerange": [20180706, 20180706]}}, and then
    execute: myspark --spec=large_query.spec --logfail
    """
    if verbose:
        print("### schema: %s" % schema_file)
        print("### path  : %s" % data_path)
    time0 = time.time()
    # pyspark modules
    from pyspark import SparkContext, StorageLevel
    from pyspark.sql import SQLContext

    # define the spark context, the main object used to communicate with spark
    ctx = SparkContext(appName="AvroKeyInputFormat")
    logger = SparkLogger(ctx)
    if not verbose:
        logger.set_level('ERROR')
    if yarn:
        logger.info("YARN client mode enabled")

    # load FWJR schema
    rdd = ctx.textFile(schema_file, 1).collect()

    # define the input avro schema; the rdd is a list of lines
    # (sc.textFile behaves similarly to readlines)
    avsc = reduce(lambda x, y: x + y, rdd)  # merge all entries from the rdd list
    schema = ''.join(avsc.split())  # remove spaces in the avsc map
    conf = {"avro.schema.input.key": schema}

    # define newAPIHadoopFile parameters, java classes
    aformat = "org.apache.avro.mapreduce.AvroKeyInputFormat"
    akey = "org.apache.avro.mapred.AvroKey"
    awrite = "org.apache.hadoop.io.NullWritable"
    aconv = "org.apache.spark.examples.pythonconverters.AvroWrapperToJavaConverter"

    # load data from HDFS
    if isinstance(data_path, list):
        avro_rdd = ctx.union([ctx.newAPIHadoopFile(f, aformat, akey, awrite, aconv, conf=conf)
                              for f in data_path])
    else:
        avro_rdd = ctx.newAPIHadoopFile(data_path, aformat, akey, awrite, aconv, conf=conf)

    # load failing tasks
    rdd_failing_tasks = ctx.textFile("hdfs:///cms/users/llayer/failing_tasks.csv")

    #
    # first step: filter the data for failing workflows and join with the
    # actionshistory workflows
    #

    # filter the tasks - keep only failing ones that have a log file
    def getFailing(row):
        rec = row[0]
        meta = rec.get('meta_data', {})
        if meta.get('jobstate', '') != 'jobfailed':
            return False
        if rec.get('LFNArray', []) == []:
            return False
        return True

    # create a key-value structure for the join
    def avro_rdd_KV(row):
        rec = row[0]
        task = rec["task"]
        return (task, rec)

    # filter + join task names
    fail_workflows = avro_rdd.filter(lambda x: getFailing(x)) \
                             .map(lambda x: avro_rdd_KV(x)) \
                             .join(rdd_failing_tasks.map(lambda x: (x, x)))

    #
    # second step: filter the data for logcollect tasks and join with the
    # previous result
    #

    # keep only logcollect jobs
    def filterLogCollect(row):
        rec = row[0]
        meta = rec.get('meta_data', {})
        if meta.get('jobtype', '').lower() != 'logcollect':
            return False
        return True

    # create a KV structure using the log archive as key and return the
    # important information
    def log_KV(row):
        rec = row[1][0]
        task = rec["task"]
        lfn_array = rec.get('LFNArray', [])
        meta = rec.get('meta_data', {})
        jobstate = meta.get('jobstate', '')
        steps = rec.get('steps', [])
        status = []
        site = []
        for step in steps:
            status.append(step.get('status', ''))
            site.append(step.get('site', ''))
        out_dict = {'task': task, 'jobstate': jobstate,
                    'status': status, 'site': site}
        return [(lfn, out_dict) for lfn in lfn_array if 'logArch' in lfn]

    # create a KV structure for log collect jobs using log archives as keys
    def logcoll_KV(row):
        rec = row[0]
        task = rec["task"]
        lfn_array = rec.get('LFNArray', [])
        logCollect = ""
        for lfn in lfn_array:
            if 'logcollect' in lfn.lower():
                logCollect = lfn
        meta = rec.get('meta_data', {})
        jobstate = meta.get('jobstate', '')
        out_dict = {'logcollect_task': task, 'logcollect_jobstate': jobstate,
                    'logcollect_lfn': logCollect}
        return [(lfn, out_dict) for lfn in lfn_array if 'logArch' in lfn]

    # log collect tasks
    logColl = avro_rdd.filter(lambda x: filterLogCollect(x)) \
                      .flatMap(lambda x: logcoll_KV(x))

    # join the frames with log archives as keys
    result = fail_workflows.flatMap(lambda x: log_KV(x)).join(logColl)

    # write the records back to hdfs
    result.saveAsTextFile("hdfs:///cms/users/llayer/logs10.csv")

    ctx.stop()
    if verbose:
        logger.info("Elapsed time %s" % htime(time.time() - time0))
    return 0
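# ----------------------------------------------------------------------------
# Hypothetical illustration (plain Python, not executed on the cluster): the
# final join in runActionsHistoryQuery matches log-archive LFNs emitted for
# failing jobs (log_KV) with the LogCollect jobs that gathered them
# (logcoll_KV). The LFNs and task names below are made up; only the key/value
# shapes mirror the code above.

def _illustrate_logarchive_join():
    "Show the (lfn, dict) pairs joined on the log-archive LFN"
    failing = {'/store/unmerged/logs/logArch1.tar.gz':
               {'task': '/wf/Task1', 'jobstate': 'jobfailed',
                'status': ['Failed'], 'site': ['T2_XX_Site']}}
    logcoll = {'/store/unmerged/logs/logArch1.tar.gz':
               {'logcollect_task': '/wf/LogCollect',
                'logcollect_jobstate': 'success',
                'logcollect_lfn': '/store/logs/LogCollect1.tar.gz'}}
    # Spark's join keeps keys present in both RDDs and pairs their values
    return {lfn: (failing[lfn], logcoll[lfn])
            for lfn in set(failing) & set(logcoll)}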
def run(schema_file, data_path, script=None, spec_file=None, verbose=None, yarn=None):
    """
    Main function to run a pyspark job. It requires a schema file, an HDFS
    directory with data and an optional script with mapper/reducer functions.
    """
    time0 = time.time()
    # pyspark modules
    from pyspark import SparkContext

    # define the spark context, the main object used to communicate with spark
    ctx = SparkContext(appName="AvroKeyInputFormat", pyFiles=[script])
    logger = SparkLogger(ctx)
    if not verbose:
        logger.set_level('ERROR')
    if yarn:
        logger.info("YARN client mode enabled")

    # load FWJR schema
    rdd = ctx.textFile(schema_file, 1).collect()

    # define the input avro schema; the rdd is a list of lines
    # (sc.textFile behaves similarly to readlines)
    avsc = reduce(lambda x, y: x + y, rdd)  # merge all entries from the rdd list
    schema = ''.join(avsc.split())  # remove spaces in the avsc map
    conf = {"avro.schema.input.key": schema}

    # define newAPIHadoopFile parameters, java classes
    aformat = "org.apache.avro.mapreduce.AvroKeyInputFormat"
    akey = "org.apache.avro.mapred.AvroKey"
    awrite = "org.apache.hadoop.io.NullWritable"
    aconv = "org.apache.spark.examples.pythonconverters.AvroWrapperToJavaConverter"

    # load data from HDFS
    if isinstance(data_path, list):
        avro_rdd = ctx.union([ctx.newAPIHadoopFile(f, aformat, akey, awrite, aconv, conf=conf)
                              for f in data_path])
    else:
        avro_rdd = ctx.newAPIHadoopFile(data_path, aformat, akey, awrite, aconv, conf=conf)

    # process the data; the map reads records from the avro file
    # if we need a whole record we use lambda x: x[0], e.g.
    # output = avro_rdd.map(lambda x: x[0]).collect()
    #
    # if we need a particular key, e.g. jobid, we extract it
    # within the lambda function, e.g. lambda x: x[0]['jobid']
    # output = avro_rdd.map(lambda x: x[0]['jobid']).collect()
    #
    # more generally we write mapper/reducer functions which are
    # executed by Spark via the collect call
    spec = None
    if spec_file:
        if os.path.isfile(spec_file):
            spec = json.load(open(spec_file))
        else:
            spec = json.loads(spec_file)
    if script:
        obj = import_(script)
        logger.info("Use user-based script %s" % obj)
        if not hasattr(obj, 'MapReduce'):
            logger.error('Unable to find MapReduce class in %s, %s'
                         % (script, obj))
            ctx.stop()
            return
        mro = obj.MapReduce(spec)
        # collect records from the mapper and
        # pass all of them to the reducer function
        records = avro_rdd.map(mro.mapper).collect()
        out = mro.reducer(records)
        # the map(f).reduce(f) alternative does not collect
        # intermediate records:
        # out = avro_rdd.map(obj.mapper).reduce(obj.reducer).collect()
    else:
        records = avro_rdd.map(basic_mapper).collect()
        out = basic_reducer(records)
    ctx.stop()
    if verbose:
        logger.info("Elapsed time %s" % htime(time.time() - time0))
    return out
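# ----------------------------------------------------------------------------
# Hypothetical usage sketch (paths and script name are assumptions): spec_file
# may be either a path to a JSON file or a JSON string, since run() falls back
# to json.loads() when os.path.isfile() returns False.

def example_run_with_inline_spec():
    "Call run() with an inline JSON spec instead of a spec file"
    spec = json.dumps({"spec": {"timerange": [20180101, 20180102]}})
    return run('hdfs:///path/to/fwjr.avsc',
               'hdfs:///path/to/fwjr/data',
               script='myscript.py',
               spec_file=spec,
               verbose=True)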
def run(schema_file, data_path, script=None, spec_file=None, verbose=None, rout=None, yarn=None):
    """
    Main function to run a pyspark job. It requires a schema file, an HDFS
    directory with data and an optional script with mapper/reducer functions.
    """
    if script:
        script = get_script(script)
    if verbose:
        print("### schema: %s" % schema_file)
        print("### path  : %s" % data_path)
        print("### script: %s" % script)
        print("### spec  : %s" % spec_file)
    time0 = time.time()
    # pyspark modules
    from pyspark import SparkContext

    # define the spark context, the main object used to communicate with spark
    ctx = SparkContext(appName="AvroKeyInputFormat", pyFiles=[script])
    logger = SparkLogger(ctx)
    if not verbose:
        logger.set_level('ERROR')
    if yarn:
        logger.info("YARN client mode enabled")

    # load FWJR schema
    rdd = ctx.textFile(schema_file, 1).collect()

    # define the input avro schema; the rdd is a list of lines
    # (sc.textFile behaves similarly to readlines)
    avsc = reduce(lambda x, y: x + y, rdd)  # merge all entries from the rdd list
    schema = ''.join(avsc.split())  # remove spaces in the avsc map
    conf = {"avro.schema.input.key": schema}

    # define newAPIHadoopFile parameters, java classes
    aformat = "org.apache.avro.mapreduce.AvroKeyInputFormat"
    akey = "org.apache.avro.mapred.AvroKey"
    awrite = "org.apache.hadoop.io.NullWritable"
    aconv = "org.apache.spark.examples.pythonconverters.AvroWrapperToJavaConverter"

    # load data from HDFS
    if isinstance(data_path, list):
        avro_rdd = ctx.union([ctx.newAPIHadoopFile(f, aformat, akey, awrite, aconv, conf=conf)
                              for f in data_path])
    else:
        avro_rdd = ctx.newAPIHadoopFile(data_path, aformat, akey, awrite, aconv, conf=conf)

    # process the data; the map reads records from the avro file
    # if we need a whole record we use lambda x: x[0], e.g.
    # output = avro_rdd.map(lambda x: x[0]).collect()
    #
    # if we need a particular key, e.g. jobid, we extract it
    # within the lambda function, e.g. lambda x: x[0]['jobid']
    # output = avro_rdd.map(lambda x: x[0]['jobid']).collect()
    #
    # more generally we write mapper/reducer functions which are
    # executed by Spark via the collect call
    spec = None
    if spec_file:
        if os.path.isfile(spec_file):
            spec = json.load(open(spec_file))
        else:
            spec = json.loads(spec_file)
    if verbose and spec is not None:
        spec['verbose'] = 1
        print("### spec %s" % json.dumps(spec))
    if rout and spec is not None:
        spec['output'] = rout
    if script:
        obj = import_(script)
        logger.info("Use user-based script %s" % obj)
        if not hasattr(obj, 'MapReduce'):
            logger.error('Unable to find MapReduce class in %s, %s'
                         % (script, obj))
            ctx.stop()
            return
        # nested use case: one MR may return a WMArchive spec,
        # loop until we get non-spec output
        count = 0
        while True:
            mro = obj.MapReduce(spec)
            mname = mro.__dict__.get('name', '').split('.')[0]
            print("### Load %s" % mname)
            if mname.lower().endswith('counter'):
                out = avro_rdd.filter(mro.mapper).count()
                if rout:
                    with open(rout, 'w') as ostream:
                        ostream.write(str(out))
                break
            # collect records from the mapper and
            # pass all of them to the reducer function
            records = avro_rdd.filter(mro.mapper).collect()
            out = mro.reducer(records)
            if verbose:
                print("### Loop count %s" % count)
            if count > 3:
                print("### WARNING, loop counter exceeded its limit")
                break
            if is_spec(out):
                spec = out
            else:
                break
            count += 1
        # the map(f).reduce(f) alternative does not collect
        # intermediate records:
        # out = avro_rdd.map(obj.mapper).reduce(obj.reducer).collect()
    else:
        records = avro_rdd.map(basic_mapper).collect()
        out = basic_reducer(records)
    ctx.stop()
    if verbose:
        logger.info("Elapsed time %s" % htime(time.time() - time0))
    return out
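# ----------------------------------------------------------------------------
# Hypothetical sketch of a "counter" style user script for the run() variant
# above: when the MapReduce instance exposes a name whose basename ends in
# 'counter', run() only evaluates avro_rdd.filter(mro.mapper).count(), so the
# mapper acts as a boolean predicate and the reducer is never called. In a
# real user script this class would be named MapReduce; the class name,
# attribute and field choices here are assumptions.

class FailedJobCounter(object):
    "Counter-style MapReduce: the mapper is used as a filter predicate"
    def __init__(self, spec=None):
        self.spec = spec
        self.name = 'failedjob_counter.py'  # basename ends in 'counter'

    def mapper(self, record):
        "Return True for records that should be counted"
        rec = record[0]
        meta = rec.get('meta_data', {})
        return meta.get('jobstate', '') == 'jobfailed'

    def reducer(self, records):
        "Not used in the counter branch; kept for interface compatibility"
        return len(records)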