def test_wmaHash(self):
    "Test wmaHash function"
    # three records with identical content but different key/element order
    records = [
        {'foo': 1, 'data': [{'one': 1}, {'two': 2}]},
        {'foo': 1, 'data': [{'two': 2}, {'one': 1}]},
        {'data': [{'two': 2}, {'one': 1}], 'foo': 1},
    ]
    hsh1, hsh2, hsh3 = (wmaHash(rec) for rec in records)
    # the hash must be insensitive to ordering
    self.assertEqual(hsh1, hsh2)
    self.assertEqual(hsh1, hsh3)
    self.assertEqual(hsh2, hsh3)
def test_wmaHash(self):
    "Test wmaHash function"
    base = {"foo": 1, "data": [{"one": 1}, {"two": 2}]}
    reordered = {"foo": 1, "data": [{"two": 2}, {"one": 1}]}
    shuffled = {"data": [{"two": 2}, {"one": 1}], "foo": 1}
    # hashing must not depend on dict key order or list element order
    first = wmaHash(base)
    for variant in (reordered, shuffled):
        self.assertEqual(first, wmaHash(variant))
    self.assertEqual(wmaHash(reordered), wmaHash(shuffled))
def main():
    "Main function"
    optmgr = OptionParser()
    opts = optmgr.parser.parse_args()
    start = time.time()
    # a single token stays a string; multiple tokens become a list
    tokens = opts.hdir.split()
    hdir = tokens[0] if len(tokens) == 1 else tokens
    results = run(opts.schema, hdir, opts.script, opts.spec,
                  opts.verbose, opts.yarn)
    if not opts.store:
        # nothing to do without a store endpoint
        return
    payload = {"results": results, "ts": time.time(),
               "etime": time.time() - start}
    # use the explicit wmaid when given, otherwise hash the bare payload
    payload['wmaid'] = opts.wmaid if opts.wmaid else wmaHash(payload)
    payload['dtype'] = 'job'
    postdata(opts.store, dict(job=payload), opts.ckey, opts.cert,
             opts.verbose)
def file_write(self, fname, data):
    """
    Write documents in append mode to given file name.

    Records are validated against self.schema_json first; invalid records
    are skipped and logged together with their schema errors to a side-car
    "<dir>/bad/<basename>_bad.txt" file. Returns the list of wmaids for the
    documents actually written. Raises NotImplementedError for compressed
    (non-.avro) append mode and WriteError on any storage failure.
    """
    # perform input data validation
    good_data = []
    # write bad data records into output file; when fname has no directory
    # component os.path.dirname returns '' which would otherwise yield the
    # root-level path '/bad' -- fall back to /tmp/bad instead
    bdir = os.path.dirname(fname)
    bdir = '%s/bad' % bdir if bdir else '/tmp/bad'
    if not os.path.exists(bdir):
        os.makedirs(bdir)
    bfname = '%s/%s_bad.txt' % (bdir, os.path.basename(fname))
    count = ecount = edocs = 0
    with open(bfname, 'a') as bstream:
        for rec in data:
            validator = RecordValidator()
            validator.run(self.schema_json, rec)
            if validator.errors:
                # log the offending document followed by each schema error
                bstream.write(json.dumps(rec) + '\n')
                for err in validator.errors:
                    msg = 'SCHEMA ERROR '
                    for key, val in err.items():
                        msg += '%s: %s ' % (key.upper(), json.dumps(val))
                    bstream.write(msg + '\n')
                bstream.write('-------------\n')
                ecount += len(validator.errors)
                edocs += 1
            else:
                good_data.append(rec)
                count += 1
    if ecount:
        print("WARNING: received %s docs, found %s bad docs, %s errors, see %s"\
                % (count, edocs, ecount, bfname))
    # use only good portion of the data
    data = good_data
    try:
        schema = self.schema
        wmaids = []
        # normalize a single document into a one-element list
        if not hasattr(data, '__iter__') or isinstance(data, dict):
            data = [data]
        if os.path.exists(fname):
            schema = None  # we'll append to existing file
        mode = 'a+' if fname.endswith('.avro') else 'a'
        if mode == 'a':
            print("We're unable yet to implement read-write mode with compressed avro files")
            raise NotImplementedError
        rec = None  # keep doc in case of failure
        with DataFileWriter(open_file(fname, mode), DatumWriter(), schema) as writer:
            for rec in data:
                writer.append(rec)
                writer.flush()
                wmaid = rec.get('wmaid', wmaHash(rec))
                wmaids.append(wmaid)
        return wmaids
    except Exception as exc:
        err = traceback.format_exc(limit=1).splitlines()[-1]
        line = ' '.join(str(exc).replace('\n', '').split())
        msg = 'Failure in %s storage, error=%s, exception=%s' \
            % (self.stype, err, line)
        msg += ' Failed document: '
        msg += json.dumps(rec)
        raise WriteError(msg)
def wmaid(self, data):
    "Return wmaid for given data record or list of records"
    if isinstance(data, (list, GeneratorType)):
        # combine individual ids into a single hash for the collection
        return wmaHash('_'.join(self.getids(data)))
    # single record: its wmaid is already embedded
    return data['wmaid']
def wmaid(self, data):
    "Return wmaid for given data record or list of records"
    multiple = isinstance(data, list) or isinstance(data, GeneratorType)
    if not multiple:
        # a single record carries its own wmaid
        return data['wmaid']
    # hash the joined ids of the whole collection
    ids = self.getids(data)
    return wmaHash('_'.join(ids))
def setUp(self):
    # temporary storage area backed by FileStorage
    self.tdir = tempfile.mkdtemp()
    self.mgr = FileStorage('fileio:%s' % self.tdir)
    record = {
        "int": 1,
        "float": 1.2,
        "list": [1, 2, 3],
        "dict": {"dname": "foo", "dval": 1},
        "listdict": [{"lname": "foo"}],
        "str": "string",
    }
    # keep a copy without the storage metadata
    self.bare_data = dict(record)
    # hash the bare payload before injecting metadata keys
    record['wmaid'] = wmaHash(record)
    record['stype'] = self.mgr.stype
    self.data = record
def check(self, data):
    """
    Cross-check the data record against its own wmaid hash.

    Removes 'wmaid' and 'stype' (storage metadata, not part of the hashed
    payload), recomputes the hash and raises Exception on mismatch.
    NB: mutates *data* in place.
    """
    # dict.pop with a default replaces the former bare "except:" clause,
    # which silently swallowed every error (including SystemExit)
    wmaid = data.pop('wmaid', '')
    data.pop('stype', None)
    hid = wmaHash(data)
    if hid != wmaid:
        raise Exception("Invalid data hash, hid=%s, wmaid=%s, data=%s" \
                % (hid, wmaid, data))
def check(self, data):
    """
    Cross-check the data record based on its wmaid.

    Pops 'wmaid' and drops 'stype' (both are storage metadata excluded
    from hashing), recomputes the payload hash and raises Exception when
    it differs. NB: mutates *data* in place.
    """
    # pop with default instead of the original bare "except:", which
    # masked unrelated failures (typos, KeyboardInterrupt, ...)
    wmaid = data.pop('wmaid', '')
    data.pop('stype', None)
    hid = wmaHash(data)
    if hid != wmaid:
        raise Exception("Invalid data hash, hid=%s, wmaid=%s, data=%s" \
                % (hid, wmaid, data))
def submit(self, spec, fields):
    """
    Submit job to HDFS/Spark platform, returns list of hash ids
    """
    # derive a deterministic uid from the spec/fields representation
    payload = json.dumps(dict(spec=spec, fields=fields))
    uid = wmaHash(payload)
    # hand the spark job to the task manager asynchronously
    self.taskmgr.spawn(self.submit_spark, uid, spec, fields)
    # return wmaids of submitted job
    return [uid]
def submit(self, spec, fields):
    """
    Submit job to HDFS/Spark platform, returns list of hash ids
    """
    # generate uid for given spec/fields
    rep = json.dumps(dict(spec=spec, fields=fields))
    jobid = wmaHash(rep)
    # spawn the spark submission in the background task manager
    self.taskmgr.spawn(self.submit_spark, jobid, spec, fields)
    # callers receive the wmaids of the submitted job
    results = [jobid]
    return results
def encode(self, docs):
    """
    Encode given set of documents into appropriate format for
    long term storage. This method will consume documents in DMWM JSON
    format. Yield encoded documents to the client.
    """
    for record in docs:
        # NOTE: wmaid is computed before wmats/stype are injected, so the
        # hash reflects the bare payload only -- keep this order
        if not record.get('wmaid', ''):
            record['wmaid'] = wmaHash(record)
        if not record.get('wmats', 0):
            record['wmats'] = time.time()
        if not record.get('stype', ''):
            record['stype'] = self.sts.stype
        yield record
def encode(self, docs):
    """
    Encode given set of documents into appropriate format for
    long term storage. This method will consume documents in DMWM JSON
    format. Yield encoded documents to the client.
    """
    for entry in docs:
        # fill in missing storage metadata; the hash is taken first so it
        # covers only the original payload
        if not entry.get('wmaid', ''):
            entry['wmaid'] = wmaHash(entry)
        if not entry.get('wmats', 0):
            entry['wmats'] = time.time()
        if not entry.get('stype', ''):
            entry['stype'] = self.sts.stype
        yield entry
def setUp(self):
    self.tdir = tempfile.mkdtemp()
    record = {"int": 1, "float": 1.2, "list": [1, 2, 3],
              "dict": {"dname": "foo", "dval": 1},
              "listdict": [{"lname": "foo"}], "str": "string"}
    # preserve the payload without storage metadata
    self.bare_data = dict(record)
    # hash before adding metadata keys
    record['wmaid'] = wmaHash(record)
    record['stype'] = 'avroio'
    self.data = record
    # derive an avro schema from the sample record and persist it
    schema = gen_schema(self.data)
    sname = os.path.join(self.tdir, 'schema.avsc')
    with open(sname, 'w') as ostream:
        ostream.write(json.dumps(schema))
    self.mgr = AvroStorage('avroio:%s' % sname)
def main():
    "Main function"
    optmgr = OptionParser()
    opts = optmgr.parser.parse_args()
    time0 = time.time()
    if opts.scripts:
        scripts()
        sys.exit(0)
    # default time range: [yesterday, today] as YYYYMMDD integers
    todate = int(datetime.datetime.today().strftime("%Y%m%d"))
    yesterday = datetime.datetime.today() - datetime.timedelta(days=1)
    fromdate = int(yesterday.strftime("%Y%m%d"))
    spec = json.load(open(opts.spec)) if opts.spec else {}
    timerange = spec.get('spec', {}).get('timerange', [fromdate, todate])
    if opts.hdir != HDIR:
        hdir = opts.hdir
    else:
        # expand the default HDFS dir into per-date subdirectories
        hdir = opts.hdir.split()
        if len(hdir) == 1:
            hdir = hdir[0]
        hdir = [os.path.join(hdir, tval)
                for tval in range_dates(timerange)
                if hdir.find(tval) == -1]
    results = run(opts.schema, hdir, opts.script, opts.spec,
                  opts.verbose, opts.rout, opts.yarn)
    if opts.store:
        data = {"results": results, "ts": time.time(),
                "etime": time.time() - time0}
        # explicit wmaid wins over the computed hash of the payload
        data['wmaid'] = opts.wmaid if opts.wmaid else wmaHash(data)
        data['dtype'] = 'job'
        postdata(opts.store, dict(job=data), opts.ckey, opts.cert,
                 opts.verbose)
    elif opts.amq:
        creds = credentials(opts.amq)
        host, port = creds['host_and_ports'].split(':')
        if creds and StompAMQ:
            print("### Send %s docs via StompAMQ" % len(results))
            amq = StompAMQ(creds['username'], creds['password'], \
                    creds['producer'], creds['topic'], [(host, port)])
            amq.send(results)
    else:
        print(results)
def setUp(self):
    self.tdir = tempfile.mkdtemp()
    self.mgr = FileStorage('fileio:%s' % self.tdir)
    sample = {"int": 1, "float": 1.2, "list": [1, 2, 3],
              "dict": {"dname": "foo", "dval": 1},
              "listdict": [{"lname": "foo"}], "str": "string"}
    # copy without storage metadata for comparison in tests
    self.bare_data = dict(sample)
    # compute the hash over the bare payload, then attach metadata
    sample['wmaid'] = wmaHash(sample)
    sample['stype'] = self.mgr.stype
    self.data = sample
def main():
    "Main function"
    optmgr = OptionParser()
    opts = optmgr.parser.parse_args()
    start = time.time()
    # single token stays a string, multiple tokens become a list
    parts = opts.hdir.split()
    hdir = parts[0] if len(parts) == 1 else parts
    results = run(opts.schema, hdir, opts.script, opts.spec,
                  opts.verbose, opts.yarn)
    if opts.store:
        payload = {"results": results, "ts": time.time(),
                   "etime": time.time() - start}
        # prefer an explicitly supplied wmaid over the computed hash
        payload['wmaid'] = opts.wmaid if opts.wmaid else wmaHash(payload)
        payload['dtype'] = 'job'
        postdata(opts.store, dict(job=payload), opts.ckey, opts.cert,
                 opts.verbose)
    else:
        print(results)
def setUp(self):
    """
    Prepare a MongoStorage manager pointed at a test database plus a
    sample data record; self.mgr is left as None when MongoDB is
    unreachable so tests can skip accordingly.
    """
    uri = os.environ.get("WMA_MONGODB", "mongodb://localhost:8230")
    self.dbname = "test_fwjr"
    try:
        self.mgr = MongoStorage(uri, dbname=self.dbname)
        self.mgr.remove()
    # narrowed from a bare "except:" so SystemExit/KeyboardInterrupt
    # are no longer swallowed; connection failure remains best-effort
    except Exception:
        self.mgr = None
        print("WARNING: cannot connect to %s" % uri)
    data = {
        "int": 1,
        "float": 1.2,
        "list": [1, 2, 3],
        "dict": {"dname": "foo", "dval": 1},
        "listdict": [{"lname": "foo"}],
        "str": "string",
    }
    # keep a metadata-free copy, then hash the bare payload
    self.bare_data = dict(data)
    data["wmaid"] = wmaHash(data)
    data["stype"] = "mongodb"
    self.data = data
def setUp(self):
    self.tdir = tempfile.mkdtemp()
    sample = {
        "int": 1,
        "float": 1.2,
        "list": [1, 2, 3],
        "dict": {"dname": "foo", "dval": 1},
        "listdict": [{"lname": "foo"}],
        "str": "string",
    }
    # snapshot without storage metadata
    self.bare_data = dict(sample)
    # wmaid covers the bare payload only
    sample['wmaid'] = wmaHash(sample)
    sample['stype'] = 'avroio'
    self.data = sample
    # generate and persist the avro schema for the sample record
    schema = gen_schema(self.data)
    sname = os.path.join(self.tdir, 'schema.avsc')
    with open(sname, 'w') as ostream:
        ostream.write(json.dumps(schema))
    self.mgr = AvroStorage('avroio:%s' % sname)
def file_write(self, fname, data): "Write documents in append mode to given file name" # perform input data validation good_data = [] # write bad data records into output file bdir = os.path.dirname(fname) bdir = '%s/bad' % bdir if bdir else '/tmp/bad' if not os.path.exists(bdir): os.makedirs(bdir) bfname = '%s/%s_bad.txt' % (bdir, os.path.basename(fname)) count = ecount = edocs = 0 with open(bfname, 'a') as bstream: for rec in data: validator = RecordValidator() validator.run(self.schema_json, rec) if validator.errors: bstream.write(json.dumps(rec) + '\n') for err in validator.errors: msg = 'SCHEMA ERROR ' for key, val in err.items(): msg += '%s: %s ' % (key.upper(), json.dumps(val)) bstream.write(msg + '\n') bstream.write('-------------\n') ecount += len(validator.errors) edocs += 1 else: good_data.append(rec) count += 1 if ecount: print("WARNING: received %s docs, found %s bad docs, %s errors, see %s"\ % (count, edocs, ecount, bfname)) # use only good portion of the data data = good_data try: schema = self.schema wmaids = [] if not hasattr(data, '__iter__') or isinstance(data, dict): data = [data] if os.path.exists(fname): schema = None # we'll append to existing file mode = 'a+' if fname.endswith('.avro') else 'a' if mode == 'a': print( "We're unable yet to implement read-write mode with compressed avro files" ) raise NotImplementedError rec = None # keep doc in case of failure with DataFileWriter(open_file(fname, mode), DatumWriter(), schema) as writer: for rec in data: writer.append(rec) writer.flush() wmaid = rec.get('wmaid', wmaHash(rec)) wmaids.append(wmaid) return wmaids except Exception as exc: err = traceback.format_exc(limit=1).splitlines()[-1] line = ' '.join(str(exc).replace('\n', '').split()) msg = 'Failure in %s storage, error=%s, exception=%s' \ % (self.stype, err, line) msg += ' Failed document: ' msg += json.dumps(rec) raise WriteError(msg)
def main():
    "Main function"
    optmgr = OptionParser()
    opts = optmgr.parser.parse_args()
    time0 = time.time()
    if opts.scripts:
        scripts()
        sys.exit(0)
    verbose = opts.verbose
    # default time range: [yesterday, today] as YYYYMMDD integers
    todate = datetime.datetime.today()
    todate = int(todate.strftime("%Y%m%d"))
    fromdate = datetime.datetime.today() - datetime.timedelta(days=1)
    fromdate = int(fromdate.strftime("%Y%m%d"))
    spec = {}
    # opts.spec may be a file path or an inline JSON string;
    # failures are deliberately best-effort (empty spec)
    try:
        if os.path.isfile(opts.spec):
            spec = json.load(open(opts.spec))
        else:
            spec = json.loads(opts.spec)
    except Exception as exp:
        pass
    timerange = spec.get('spec', {}).get('timerange', [fromdate, todate])
    if timerange and verbose:
        print("### TimeRang: %s" % timerange)
    hdir = opts.hdir
    if timerange:
        # expand hdir into per-date subdirectories unless it already
        # embeds a year (e.g. ".../2020/...")
        pat = re.compile(".*/20[0-9][0-9].*")
        if len(hdir.split()) == 1 and not pat.match(hdir):
            hdir = hdir.split()[0]
            hdirs = []
            for tval in range_dates(timerange):
                if hdir.find(tval) == -1:
                    hdirs.append(os.path.join(hdir, tval))
            hdir = hdirs
    if verbose:
        print("### HDIR: %s" % hdir)
    results = run(opts.schema, hdir, opts.script, opts.spec, verbose,
                  opts.rout, opts.yarn)
    if opts.store:
        data = {
            "results": results,
            "ts": time.time(),
            "etime": time.time() - time0
        }
        # an explicitly supplied wmaid wins over the computed hash
        if opts.wmaid:
            data['wmaid'] = opts.wmaid
        else:
            data['wmaid'] = wmaHash(data)
        data['dtype'] = 'job'
        pdata = dict(job=data)
        postdata(opts.store, pdata, opts.ckey, opts.cert, verbose)
    elif opts.amq:
        creds = credentials(opts.amq)
        host, port = creds['host_and_ports'].split(':')
        port = int(port)
        if creds and StompAMQ:
            print("### Send %s docs via StompAMQ" % len(results))
            amq = StompAMQ(creds['username'], creds['password'], \
                    creds['producer'], creds['topic'], \
                    validation_schema=None, \
                    host_and_ports=[(host, port)])
            data = []
            for doc in results:
                hid = doc.get("hash", 1)
                if '_id' in doc:
                    del doc['_id']  # delete ObjectID from MongoDB
                producer = "wmarchive"
                # AMQ expects a millisecond timestamp
                tstamp = int(time.time()) * 1000
                notification, _, _ = amq.make_notification(
                    doc, hid, producer=producer, ts=tstamp, dataSubfield="")
                data.append(notification)
            results = amq.send(data)
            print("### results from AMQ %s" % len(results))
    else:
        if isinstance(results, list):
            print("### number of results %s" % len(results))
            for doc in results:
                if '_id' in doc:
                    del doc['_id']  # delete ObjectID from MongoDB
                # fall back to raw printing for non-serializable docs
                try:
                    print(json.dumps(doc))
                except:
                    print(doc)
        else:
            print(results)
def main():
    """
    Main function: parse options, resolve the HDFS input directories for
    the requested time range (verifying each path exists via `hdfs dfs
    -test -d`), then either run the log-failure history query or run the
    aggregation script and dispatch results to a store endpoint, StompAMQ,
    or stdout.
    """
    optmgr = OptionParser()
    opts = optmgr.parser.parse_args()
    time0 = time.time()
    if opts.scripts:
        scripts()
        sys.exit(0)
    verbose = opts.verbose
    # default time range: [yesterday, today] as YYYYMMDD integers
    todate = datetime.datetime.today()
    todate = int(todate.strftime("%Y%m%d"))
    fromdate = datetime.datetime.today() - datetime.timedelta(days=1)
    fromdate = int(fromdate.strftime("%Y%m%d"))
    spec = {}
    # opts.spec may be a file path or an inline JSON string; parsing is
    # deliberately best-effort (empty spec on failure)
    try:
        if os.path.isfile(opts.spec):
            spec = json.load(open(opts.spec))
        else:
            spec = json.loads(opts.spec)
    except Exception as exp:
        pass
    timerange = spec.get('spec', {}).get('timerange', [fromdate, todate])
    if timerange and verbose:
        print("### TimeRang: %s" % timerange)
    hdir = opts.hdir
    if timerange:
        # expand hdir into per-date subdirectories unless it already
        # embeds a year (e.g. ".../2020/...")
        pat = re.compile(".*/20[0-9][0-9].*")
        if len(hdir.split()) == 1 and not pat.match(hdir):
            hdir = hdir.split()[0]
            hdirs = []
            for tval in range_dates(timerange):
                if hdir.find(tval) == -1:
                    hdfs_file_path = os.path.join(hdir, tval)
                    # check whether the hdfs path exists
                    cmd = ['hdfs', 'dfs', '-test', '-d', hdfs_file_path]
                    ret, out, err = run_cmd(cmd)
                    if ret == 0:
                        hdirs.append(hdfs_file_path)
                    else:
                        # Python-3 print() call; the original used a
                        # Python-2 print statement (SyntaxError on py3)
                        print("Path does not exist:", hdfs_file_path)
            hdir = hdirs
    if verbose:
        print("### HDIR: %s" % hdir)
    if opts.logfail:
        print("Start query")
        runActionsHistoryQuery(opts.schema, hdir, verbose, opts.yarn)
        print("Finish query")
    else:
        results = run(opts.schema, hdir, opts.script, opts.spec, verbose,
                      opts.rout, opts.yarn)
        if opts.store:
            data = {
                "results": results,
                "ts": time.time(),
                "etime": time.time() - time0
            }
            # an explicitly supplied wmaid wins over the computed hash
            if opts.wmaid:
                data['wmaid'] = opts.wmaid
            else:
                data['wmaid'] = wmaHash(data)
            data['dtype'] = 'job'
            pdata = dict(job=data)
            postdata(opts.store, pdata, opts.ckey, opts.cert, verbose)
        elif opts.amq:
            creds = credentials(opts.amq)
            host, port = creds['host_and_ports'].split(':')
            port = int(port)
            if creds and StompAMQ:
                print("### Send %s docs via StompAMQ" % len(results))
                amq = StompAMQ(creds['username'], creds['password'], \
                        creds['producer'], creds['topic'], [(host, port)])
                data = []
                for doc in results:
                    hid = doc.get("hash", 1)
                    if '_id' in doc:
                        del doc['_id']  # delete ObjectID from MongoDB
                    data.append(amq.make_notification(doc, hid))
                results = amq.send(data)
                print("### results from AMQ %s" % len(results))
        else:
            if isinstance(results, list):
                print("### number of results %s" % len(results))
                for doc in results:
                    if '_id' in doc:
                        del doc['_id']  # delete ObjectID from MongoDB
                    # fall back to raw printing for non-serializable docs
                    try:
                        print(json.dumps(doc))
                    except:
                        print(doc)
            else:
                print(results)
def main(): "Main function" optmgr = OptionParser() opts = optmgr.parser.parse_args() time0 = time.time() if opts.scripts: scripts() sys.exit(0) todate = datetime.datetime.today() todate = int(todate.strftime("%Y%m%d")) fromdate = datetime.datetime.today() - datetime.timedelta(days=1) fromdate = int(fromdate.strftime("%Y%m%d")) spec = {} try: if os.path.isfile(opts.spec): spec = json.load(open(opts.spec)) else: spec = json.loads(opts.spec) except Exception as exp: pass timerange = spec.get('spec', {}).get('timerange', [fromdate, todate]) if opts.hdir == HDIR: hdir = opts.hdir.split() if len(hdir) == 1: hdir = hdir[0] hdirs = [] for tval in range_dates(timerange): if hdir.find(tval) == -1: hdirs.append(os.path.join(hdir, tval)) hdir = hdirs else: hdir = opts.hdir results = run(opts.schema, hdir, opts.script, opts.spec, opts.verbose, opts.rout, opts.yarn) if opts.store: data = { "results": results, "ts": time.time(), "etime": time.time() - time0 } if opts.wmaid: data['wmaid'] = opts.wmaid else: data['wmaid'] = wmaHash(data) data['dtype'] = 'job' pdata = dict(job=data) postdata(opts.store, pdata, opts.ckey, opts.cert, opts.verbose) elif opts.amq: creds = credentials(opts.amq) host, port = creds['host_and_ports'].split(':') port = int(port) if creds and StompAMQ: print("### Send %s docs via StompAMQ" % len(results)) amq = StompAMQ(creds['username'], creds['password'], \ creds['producer'], creds['topic'], [(host, port)]) data = [] for doc in results: hid = doc.get("hash", 1) if '_id' in doc: del doc['_id'] # delete ObjectID from MongoDB data.append(amq.make_notification(doc, hid)) results = amq.send(data) print("### results from AMQ", len(results)) else: print("### agg. results", len(results))