Exemplo n.º 1
0
 def test_wmaHash(self):
     "Check that wmaHash ignores dict key order and list element order"
     base = {'foo': 1, 'data': [{'one': 1}, {'two': 2}]}
     list_swapped = {'foo': 1, 'data': [{'two': 2}, {'one': 1}]}
     keys_and_list_swapped = {'data': [{'two': 2}, {'one': 1}], 'foo': 1}
     # hash the three records in the order they are defined
     hashes = [wmaHash(rec)
               for rec in (base, list_swapped, keys_and_list_swapped)]
     # every pair of hashes must agree: (1, 2), (1, 3) and (2, 3)
     for left, right in ((0, 1), (0, 2), (1, 2)):
         self.assertEqual(hashes[left], hashes[right])
Exemplo n.º 2
0
 def test_wmaHash(self):
     "Equivalent records must yield the same wmaHash value"
     # same content; only the order of keys and of list elements differs
     records = (
         {"foo": 1, "data": [{"one": 1}, {"two": 2}]},
         {"foo": 1, "data": [{"two": 2}, {"one": 1}]},
         {"data": [{"two": 2}, {"one": 1}], "foo": 1},
     )
     first, second, third = [wmaHash(rec) for rec in records]
     self.assertEqual(first, second)
     self.assertEqual(first, third)
     self.assertEqual(second, third)
Exemplo n.º 3
0
def main():
    "Run the job and, when a store is given, post its results there"
    opts = OptionParser().parser.parse_args()
    started = time.time()
    # a single path is passed on as a string, several paths as a list
    paths = opts.hdir.split()
    hdir = paths[0] if len(paths) == 1 else paths
    results = run(opts.schema, hdir, opts.script, opts.spec, opts.verbose,
                  opts.yarn)
    if not opts.store:
        # results are not reported anywhere without a store
        return
    data = {}
    data["results"] = results
    data["ts"] = time.time()
    data["etime"] = time.time() - started
    # an explicit id wins, otherwise the id is the hash of the payload
    data['wmaid'] = opts.wmaid if opts.wmaid else wmaHash(data)
    data['dtype'] = 'job'
    postdata(opts.store, dict(job=data), opts.ckey, opts.cert, opts.verbose)
Exemplo n.º 4
0
    def file_write(self, fname, data):
        """
        Validate documents against the schema and append the valid ones
        to the given file.

        Records which fail the validation are written, together with their
        schema errors, to <dirname(fname)>/bad/<basename(fname)>_bad.txt
        (/tmp/bad is used when fname has no directory part) and are not
        written to fname.

        :param fname: output file name; names which do not end with '.avro'
            are rejected with WriteError
        :param data: a single record (dict) or an iterable of records
        :return: list of wmaids of the written records; a record's own
            'wmaid' is used when present, otherwise wmaHash of the record
        :raises WriteError: when the write step fails
        """
        # wrap a single record *before* validation, otherwise the loop
        # below would iterate over the keys of the dict
        if not hasattr(data, '__iter__') or isinstance(data, dict):
            data = [data]
        # bad records go to a 'bad' area next to the output file; without a
        # directory part '%s/bad' would point to the file system root
        bdir = os.path.dirname(fname)
        bdir = '%s/bad' % bdir if bdir else '/tmp/bad'
        if not os.path.exists(bdir):
            os.makedirs(bdir)
        bfname = '%s/%s_bad.txt' % (bdir, os.path.basename(fname))
        good_data = []
        count = ecount = edocs = 0
        with open(bfname, 'a') as bstream:
            for rec in data:
                validator = RecordValidator()
                validator.run(self.schema_json, rec)
                if validator.errors:
                    bstream.write(json.dumps(rec)+'\n')
                    for err in validator.errors:
                        msg = 'SCHEMA ERROR '
                        for key, val in err.items():
                            msg += '%s: %s ' % (key.upper(), json.dumps(val))
                        bstream.write(msg+'\n')
                    bstream.write('-------------\n')
                    ecount += len(validator.errors)
                    edocs += 1
                else:
                    good_data.append(rec)
                count += 1
        if ecount:
            print("WARNING: received %s docs, found %s bad docs, %s errors, see %s"\
                    % (count, edocs, ecount, bfname))
        # document reported on failure; reset it here so that the error
        # handler never sees an unbound name or a record left over from
        # the validation loop
        rec = None
        try:
            schema = self.schema
            wmaids = []
            if os.path.exists(fname):
                schema = None # we'll append to existing file
            mode = 'a+' if fname.endswith('.avro') else 'a'
            if mode == 'a':
                print("We're unable yet to implement read-write mode with compressed avro files")
                raise NotImplementedError
            with DataFileWriter(open_file(fname, mode), DatumWriter(), schema) as writer:
                # use only good portion of the data
                for rec in good_data:
                    writer.append(rec)
                    writer.flush()
                    # hash only those records which have no id of their own
                    wmaid = rec['wmaid'] if 'wmaid' in rec else wmaHash(rec)
                    wmaids.append(wmaid)
            return wmaids
        except Exception as exc:
            err = traceback.format_exc(limit=1).splitlines()[-1]
            line = ' '.join(str(exc).replace('\n', '').split())
            msg = 'Failure in %s storage, error=%s, exception=%s' \
                    % (self.stype, err, line)
            msg += ' Failed document: '
            msg += json.dumps(rec)
            raise WriteError(msg)
Exemplo n.º 5
0
 def wmaid(self, data):
     """
     Return wmaid for given data record or list of records.

     A single record yields its own 'wmaid' value; a list or generator of
     records yields the wmaHash of their ids joined with '_'.
     """
     if not isinstance(data, (list, GeneratorType)):
         return data['wmaid']
     return wmaHash('_'.join(self.getids(data)))
Exemplo n.º 6
0
 def wmaid(self, data):
     """
     Return wmaid for given data record or list of records.

     For a list or generator of records the ids obtained from self.getids
     are joined with '_' and hashed with wmaHash; for a single record its
     'wmaid' value is returned as is.
     """
     many = isinstance(data, (list, GeneratorType))
     if many:
         joined_ids = '_'.join(self.getids(data))
         return wmaHash(joined_ids)
     return data['wmaid']
Exemplo n.º 7
0
    def setUp(self):
        "Create FileStorage manager in a temporary area and a reference document"
        self.tdir = tempfile.mkdtemp()
        self.mgr = FileStorage('fileio:%s' % self.tdir)
        # NOTE: indentation uses spaces only; the former tab-indented lines
        # raise TabError under Python 3
        data = {"int":1, "float":1.2, "list":[1,2,3],
                "dict":{"dname": "foo", "dval":1},
                "listdict":[{"lname":"foo"}], "str":"string"}
        # shallow copy of the document without the wmaid/stype fields
        self.bare_data = dict(data)
        # the id is computed before stype is added to the document
        data['wmaid'] = wmaHash(data)
        data['stype'] = self.mgr.stype
        self.data = data
Exemplo n.º 8
0
 def check(self, data):
     """
     Cross-check the data based on its wmaid.

     The 'wmaid' and 'stype' keys are removed from the given record (the
     record is modified in place) and the wmaHash of the remaining content
     is compared with the removed wmaid.

     :param data: data record (dict)
     :raises Exception: when the computed hash differs from the wmaid
     """
     # a record without wmaid is compared against an empty id; the pop
     # default replaces a bare except which would hide unrelated errors
     wmaid = data.pop('wmaid', '')
     data.pop('stype', None)
     hid = wmaHash(data)
     if hid != wmaid:
         raise Exception("Invalid data hash, hid=%s, wmaid=%s, data=%s" \
                 % (hid, wmaid, data))
Exemplo n.º 9
0
 def check(self, data):
     """
     Cross-check the data based on its wmaid.

     Both 'wmaid' and 'stype' are stripped from the record in place, then
     the record is re-hashed with wmaHash and the result must match the
     stripped wmaid.

     :param data: data record (dict)
     :raises Exception: on mismatch between the hash and the record wmaid
     """
     # missing wmaid is treated as an empty id (no bare except needed)
     wmaid = data.pop('wmaid', '')
     if 'stype' in data:
         del data['stype']
     hid = wmaHash(data)
     if hid != wmaid:
         raise Exception("Invalid data hash, hid=%s, wmaid=%s, data=%s" \
                 % (hid, wmaid, data))
Exemplo n.º 10
0
 def submit(self, spec, fields):
     """
     Submit job to HDFS/Spark platform, returns list of hash ids.

     The job id is the wmaHash of the JSON representation of the given
     spec/fields; the job is started via self.taskmgr.spawn and a
     one-element list with its id is returned.
     """
     request = dict(spec=spec, fields=fields)
     wmaid = wmaHash(json.dumps(request))
     self.taskmgr.spawn(self.submit_spark, wmaid, spec, fields)
     return [wmaid]
Exemplo n.º 11
0
 def submit(self, spec, fields):
     """
     Submit job to HDFS/Spark platform, returns list of hash ids.

     The returned list holds a single id: the wmaHash of the JSON dump of
     the spec/fields pair, which is also passed to the spawned job.
     """
     # uid of the job is derived from its spec/fields
     job_repr = json.dumps({'spec': spec, 'fields': fields})
     job_id = wmaHash(job_repr)
     # hand the spark job over to the task manager
     self.taskmgr.spawn(self.submit_spark, job_id, spec, fields)
     return [job_id]
Exemplo n.º 12
0
 def encode(self, docs):
     """
     Yield the given documents with their bookkeeping fields filled in.

     A missing or empty 'wmaid' is set to the wmaHash of the document,
     'wmats' to the current time and 'stype' to self.sts.stype.
     Documents are updated in place.
     """
     for record in docs:
         # the id must be computed before the other fields are added
         if not record.get('wmaid'):
             record['wmaid'] = wmaHash(record)
         if not record.get('wmats'):
             record['wmats'] = time.time()
         if not record.get('stype'):
             record['stype'] = self.sts.stype
         yield record
Exemplo n.º 13
0
 def encode(self, docs):
     """
     Generator over the given documents which completes each of them
     before it is yielded: empty or absent 'wmaid', 'wmats' and 'stype'
     fields are set to the document hash (wmaHash), the current time and
     self.sts.stype, respectively.  The documents themselves are modified.
     """
     for doc in docs:
         lacks_id = not doc.get('wmaid', '')
         if lacks_id:
             # hash is taken before wmats/stype are put into the document
             doc['wmaid'] = wmaHash(doc)
         lacks_tstamp = not doc.get('wmats', 0)
         if lacks_tstamp:
             doc['wmats'] = time.time()
         lacks_stype = not doc.get('stype', '')
         if lacks_stype:
             doc['stype'] = self.sts.stype
         yield doc
Exemplo n.º 14
0
    def setUp(self):
        "Create a reference document, its avro schema file and AvroStorage manager"
        self.tdir = tempfile.mkdtemp()
        # NOTE: indentation uses spaces only; the former tab-indented lines
        # raise TabError under Python 3
        data = {"int":1, "float":1.2, "list":[1,2,3],
                "dict":{"dname": "foo", "dval":1},
                "listdict":[{"lname":"foo"}], "str":"string"}
        # shallow copy of the document without the wmaid/stype fields
        self.bare_data = dict(data)
        # the id is computed before stype is added to the document
        data['wmaid'] = wmaHash(data)
        data['stype'] = 'avroio'
        self.data = data
        # the schema is generated from the complete document and stored in
        # the temporary area; the storage URI points to the schema file
        schema = gen_schema(self.data)
        sname = os.path.join(self.tdir, 'schema.avsc')
        with open(sname, 'w') as ostream:
            ostream.write(json.dumps(schema))
        self.mgr = AvroStorage('avroio:%s' % sname)
Exemplo n.º 15
0
def main():
    """
    Main function.

    Parses the command line options, runs the job over the HDFS area(s)
    and dispatches the results: they are posted to opts.store, or sent via
    StompAMQ when opts.amq is given, or printed otherwise.
    With opts.scripts the scripts() helper is called and the process exits.
    """
    optmgr = OptionParser()
    opts = optmgr.parser.parse_args()
    time0 = time.time()

    if opts.scripts:
        scripts()
        sys.exit(0)

    # default time range is [yesterday, today] in YYYYMMDD form
    todate = datetime.datetime.today()
    todate = int(todate.strftime("%Y%m%d"))
    fromdate = datetime.datetime.today()-datetime.timedelta(days=1)
    fromdate = int(fromdate.strftime("%Y%m%d"))
    spec = {}
    if opts.spec:
        # read the spec via context manager to close the file afterwards
        with open(opts.spec) as istream:
            spec = json.load(istream)
    timerange = spec.get('spec', {}).get('timerange', [fromdate, todate])

    if opts.hdir == HDIR:
        hdir = opts.hdir.split()
        if len(hdir) == 1:
            # expand the area into sub-directories of the time range
            base = hdir[0]
            hdir = [os.path.join(base, tval)
                    for tval in range_dates(timerange)
                    if base.find(tval) == -1]
    else:
        hdir = opts.hdir
    results = run(opts.schema, hdir, opts.script, opts.spec, opts.verbose, opts.rout, opts.yarn)
    if opts.store:
        data = {"results":results,"ts":time.time(),"etime":time.time()-time0}
        if opts.wmaid:
            data['wmaid'] = opts.wmaid
        else:
            data['wmaid'] = wmaHash(data)
        data['dtype'] = 'job'
        pdata = dict(job=data)
        postdata(opts.store, pdata, opts.ckey, opts.cert, opts.verbose)
    elif opts.amq:
        creds = credentials(opts.amq)
        # check credentials before they are used: indexing empty
        # credentials would fail before the guard is reached
        if creds and StompAMQ:
            host, port = creds['host_and_ports'].split(':')
            # split() yields the port as a string, pass it on as a number
            port = int(port)
            print("### Send %s docs via StompAMQ" % len(results))
            amq = StompAMQ(creds['username'], creds['password'], \
                    creds['producer'], creds['topic'], [(host, port)])
            amq.send(results)
    else:
        print(results)
Exemplo n.º 16
0
 def setUp(self):
     "Prepare FileStorage manager in a temporary area and a reference document"
     self.tdir = tempfile.mkdtemp()
     self.mgr = FileStorage('fileio:%s' % self.tdir)
     record = {
         "int": 1, "float": 1.2, "list": [1, 2, 3],
         "dict": {"dname": "foo", "dval": 1},
         "listdict": [{"lname": "foo"}],
         "str": "string",
     }
     # shallow copy taken before the wmaid/stype fields are added
     self.bare_data = record.copy()
     # the id is computed first, so stype is not part of the hashed content
     record['wmaid'] = wmaHash(record)
     record['stype'] = self.mgr.stype
     self.data = record
Exemplo n.º 17
0
def main():
    "Run the job, then post its results to the store or print them"
    opts = OptionParser().parser.parse_args()
    started = time.time()
    # a single path is passed on as a string, several paths as a list
    areas = opts.hdir.split()
    hdir = areas[0] if len(areas) == 1 else areas
    results = run(opts.schema, hdir, opts.script, opts.spec, opts.verbose, opts.yarn)
    if not opts.store:
        print(results)
        return
    data = {"results": results}
    data["ts"] = time.time()
    data["etime"] = time.time() - started
    # an explicit id wins, otherwise the id is the hash of the payload
    data['wmaid'] = opts.wmaid if opts.wmaid else wmaHash(data)
    data['dtype'] = 'job'
    postdata(opts.store, dict(job=data), opts.ckey, opts.cert, opts.verbose)
Exemplo n.º 18
0
 def setUp(self):
     """
     Connect to MongoDB and create a reference document.

     The MongoDB URI is taken from the WMA_MONGODB environment variable
     (default mongodb://localhost:8230).  When the storage cannot be set
     up a warning is printed and self.mgr is set to None.
     """
     uri = os.environ.get("WMA_MONGODB", "mongodb://localhost:8230")
     self.dbname = "test_fwjr"
     try:
         self.mgr = MongoStorage(uri, dbname=self.dbname)
         self.mgr.remove()
     except Exception:
         # best effort set-up; unlike a bare except this does not swallow
         # KeyboardInterrupt/SystemExit
         self.mgr = None
         print("WARNING: cannot connect to %s" % uri)
     data = {
         "int": 1,
         "float": 1.2,
         "list": [1, 2, 3],
         "dict": {"dname": "foo", "dval": 1},
         "listdict": [{"lname": "foo"}],
         "str": "string",
     }
     # shallow copy of the document without the wmaid/stype fields
     self.bare_data = dict(data)
     # the id is computed before stype is added to the document
     data["wmaid"] = wmaHash(data)
     data["stype"] = "mongodb"
     self.data = data
Exemplo n.º 19
0
 def setUp(self):
     "Prepare a reference document, its avro schema file and AvroStorage manager"
     self.tdir = tempfile.mkdtemp()
     record = {
         "int": 1, "float": 1.2, "list": [1, 2, 3],
         "dict": {"dname": "foo", "dval": 1},
         "listdict": [{"lname": "foo"}],
         "str": "string",
     }
     # shallow copy taken before the wmaid/stype fields are added
     self.bare_data = record.copy()
     # the id is computed first, so stype is not part of the hashed content
     record['wmaid'] = wmaHash(record)
     record['stype'] = 'avroio'
     self.data = record
     # schema of the complete document is stored in the temporary area
     avro_schema = gen_schema(self.data)
     schema_file = os.path.join(self.tdir, 'schema.avsc')
     with open(schema_file, 'w') as ostream:
         ostream.write(json.dumps(avro_schema))
     self.mgr = AvroStorage('avroio:%s' % schema_file)
Exemplo n.º 20
0
    def file_write(self, fname, data):
        """
        Validate documents against the schema and append the valid ones
        to the given file.

        Records which fail the validation are written, together with their
        schema errors, to <dirname(fname)>/bad/<basename(fname)>_bad.txt
        (/tmp/bad is used when fname has no directory part) and are not
        written to fname.

        :param fname: output file name; names which do not end with '.avro'
            are rejected with WriteError
        :param data: a single record (dict) or an iterable of records
        :return: list of wmaids of the written records; a record's own
            'wmaid' is used when present, otherwise wmaHash of the record
        :raises WriteError: when the write step fails
        """
        # wrap a single record *before* validation, otherwise the loop
        # below would iterate over the keys of the dict
        if not hasattr(data, '__iter__') or isinstance(data, dict):
            data = [data]
        # write bad data records into output file
        bdir = os.path.dirname(fname)
        bdir = '%s/bad' % bdir if bdir else '/tmp/bad'
        if not os.path.exists(bdir):
            os.makedirs(bdir)
        bfname = '%s/%s_bad.txt' % (bdir, os.path.basename(fname))
        good_data = []
        count = ecount = edocs = 0
        with open(bfname, 'a') as bstream:
            for rec in data:
                validator = RecordValidator()
                validator.run(self.schema_json, rec)
                if validator.errors:
                    bstream.write(json.dumps(rec) + '\n')
                    for err in validator.errors:
                        msg = 'SCHEMA ERROR '
                        for key, val in err.items():
                            msg += '%s: %s ' % (key.upper(), json.dumps(val))
                        bstream.write(msg + '\n')
                    bstream.write('-------------\n')
                    ecount += len(validator.errors)
                    edocs += 1
                else:
                    good_data.append(rec)
                count += 1
        if ecount:
            print("WARNING: received %s docs, found %s bad docs, %s errors, see %s"\
                    % (count, edocs, ecount, bfname))
        # document reported on failure; reset it here so that the error
        # handler never sees an unbound name or a record left over from
        # the validation loop
        rec = None
        try:
            schema = self.schema
            wmaids = []
            if os.path.exists(fname):
                schema = None  # we'll append to existing file
            mode = 'a+' if fname.endswith('.avro') else 'a'
            if mode == 'a':
                print(
                    "We're unable yet to implement read-write mode with compressed avro files"
                )
                raise NotImplementedError
            with DataFileWriter(open_file(fname, mode), DatumWriter(),
                                schema) as writer:
                # use only good portion of the data
                for rec in good_data:
                    writer.append(rec)
                    writer.flush()
                    # hash only those records which have no id of their own
                    wmaid = rec['wmaid'] if 'wmaid' in rec else wmaHash(rec)
                    wmaids.append(wmaid)
            return wmaids
        except Exception as exc:
            err = traceback.format_exc(limit=1).splitlines()[-1]
            line = ' '.join(str(exc).replace('\n', '').split())
            msg = 'Failure in %s storage, error=%s, exception=%s' \
                    % (self.stype, err, line)
            msg += ' Failed document: '
            msg += json.dumps(rec)
            raise WriteError(msg)
Exemplo n.º 21
0
def main():
    """
    Main function.

    Parses the command line options, runs the job over the HDFS area(s)
    and dispatches the results: they are posted to opts.store, or sent as
    notifications via StompAMQ when opts.amq is given, or printed
    otherwise.  With opts.scripts the scripts() helper is called and the
    process exits.
    """
    optmgr = OptionParser()
    opts = optmgr.parser.parse_args()
    time0 = time.time()

    if opts.scripts:
        scripts()
        sys.exit(0)

    verbose = opts.verbose
    # default time range is [yesterday, today] in YYYYMMDD form
    todate = datetime.datetime.today()
    todate = int(todate.strftime("%Y%m%d"))
    fromdate = datetime.datetime.today() - datetime.timedelta(days=1)
    fromdate = int(fromdate.strftime("%Y%m%d"))
    # opts.spec is either a file name or a JSON string; a spec which
    # cannot be loaded is deliberately treated as an empty one
    spec = {}
    if opts.spec:
        try:
            if os.path.isfile(opts.spec):
                # context manager closes the spec file after reading
                with open(opts.spec) as istream:
                    spec = json.load(istream)
            else:
                spec = json.loads(opts.spec)
        except Exception as exp:
            if verbose:
                print("### unable to load spec, use empty one: %s" % exp)
    timerange = spec.get('spec', {}).get('timerange', [fromdate, todate])
    if timerange and verbose:
        print("### TimeRang: %s" % timerange)

    hdir = opts.hdir
    if timerange:
        # a single area without a year in its path is expanded into
        # sub-directories of the time range
        pat = re.compile(".*/20[0-9][0-9].*")
        if len(hdir.split()) == 1 and not pat.match(hdir):
            hdir = hdir.split()[0]
            hdirs = []
            for tval in range_dates(timerange):
                if hdir.find(tval) == -1:
                    hdirs.append(os.path.join(hdir, tval))
            hdir = hdirs
    if verbose:
        print("### HDIR: %s" % hdir)
    results = run(opts.schema, hdir, opts.script, opts.spec, verbose,
                  opts.rout, opts.yarn)
    if opts.store:
        data = {
            "results": results,
            "ts": time.time(),
            "etime": time.time() - time0
        }
        if opts.wmaid:
            data['wmaid'] = opts.wmaid
        else:
            data['wmaid'] = wmaHash(data)
        data['dtype'] = 'job'
        pdata = dict(job=data)
        postdata(opts.store, pdata, opts.ckey, opts.cert, verbose)
    elif opts.amq:
        creds = credentials(opts.amq)
        # check credentials before they are used: indexing empty
        # credentials would fail before the guard is reached
        if creds and StompAMQ:
            host, port = creds['host_and_ports'].split(':')
            port = int(port)
            print("### Send %s docs via StompAMQ" % len(results))
            amq = StompAMQ(creds['username'], creds['password'], \
                    creds['producer'], creds['topic'], \
                    validation_schema=None, \
                    host_and_ports=[(host, port)])
            producer = "wmarchive"
            data = []
            for doc in results:
                hid = doc.get("hash", 1)
                doc.pop('_id', None)  # delete ObjectID from MongoDB
                tstamp = int(time.time()) * 1000
                notification, _, _ = amq.make_notification(doc,
                                                           hid,
                                                           producer=producer,
                                                           ts=tstamp,
                                                           dataSubfield="")
                data.append(notification)
            results = amq.send(data)
            print("### results from AMQ %s" % len(results))
    else:
        if isinstance(results, list):
            print("### number of results %s" % len(results))
            for doc in results:
                doc.pop('_id', None)  # delete ObjectID from MongoDB
                try:
                    print(json.dumps(doc))
                except Exception:
                    # fall back to plain representation of documents
                    # which cannot be serialized to JSON
                    print(doc)
        else:
            print(results)
Exemplo n.º 22
0
def main():
    """
    Main function.

    Parses the command line options and resolves the HDFS area(s); only
    sub-directories for which `hdfs dfs -test -d` succeeds are used.
    With opts.logfail the actions history query is run; otherwise the job
    is run and its results are posted to opts.store, or sent as
    notifications via StompAMQ when opts.amq is given, or printed.
    With opts.scripts the scripts() helper is called and the process exits.
    """
    optmgr = OptionParser()
    opts = optmgr.parser.parse_args()
    time0 = time.time()

    if opts.scripts:
        scripts()
        sys.exit(0)

    verbose = opts.verbose
    # default time range is [yesterday, today] in YYYYMMDD form
    todate = datetime.datetime.today()
    todate = int(todate.strftime("%Y%m%d"))
    fromdate = datetime.datetime.today() - datetime.timedelta(days=1)
    fromdate = int(fromdate.strftime("%Y%m%d"))
    # opts.spec is either a file name or a JSON string; a spec which
    # cannot be loaded is deliberately treated as an empty one
    spec = {}
    if opts.spec:
        try:
            if os.path.isfile(opts.spec):
                # context manager closes the spec file after reading
                with open(opts.spec) as istream:
                    spec = json.load(istream)
            else:
                spec = json.loads(opts.spec)
        except Exception as exp:
            if verbose:
                print("### unable to load spec, use empty one: %s" % exp)
    timerange = spec.get('spec', {}).get('timerange', [fromdate, todate])
    if timerange and verbose:
        print("### TimeRang: %s" % timerange)

    hdir = opts.hdir
    if timerange:
        # a single area without a year in its path is expanded into
        # sub-directories of the time range
        pat = re.compile(".*/20[0-9][0-9].*")
        if len(hdir.split()) == 1 and not pat.match(hdir):
            hdir = hdir.split()[0]
            hdirs = []
            for tval in range_dates(timerange):
                if hdir.find(tval) == -1:
                    hdfs_file_path = os.path.join(hdir, tval)
                    # check whether the hdfs path exists
                    cmd = ['hdfs', 'dfs', '-test', '-d', hdfs_file_path]
                    ret, _out, _err = run_cmd(cmd)
                    if ret == 0:
                        hdirs.append(hdfs_file_path)
                    else:
                        # formatted print gives the same output with
                        # Python 2 and Python 3
                        print("Path does not exist: %s" % hdfs_file_path)
            hdir = hdirs
    if verbose:
        print("### HDIR: %s" % hdir)
    if opts.logfail:
        print("Start query")
        runActionsHistoryQuery(opts.schema, hdir, verbose, opts.yarn)
        print("Finish query")
        return
    results = run(opts.schema, hdir, opts.script, opts.spec, verbose,
                  opts.rout, opts.yarn)
    if opts.store:
        data = {
            "results": results,
            "ts": time.time(),
            "etime": time.time() - time0
        }
        if opts.wmaid:
            data['wmaid'] = opts.wmaid
        else:
            data['wmaid'] = wmaHash(data)
        data['dtype'] = 'job'
        pdata = dict(job=data)
        postdata(opts.store, pdata, opts.ckey, opts.cert, verbose)
    elif opts.amq:
        creds = credentials(opts.amq)
        # check credentials before they are used: indexing empty
        # credentials would fail before the guard is reached
        if creds and StompAMQ:
            host, port = creds['host_and_ports'].split(':')
            port = int(port)
            print("### Send %s docs via StompAMQ" % len(results))
            amq = StompAMQ(creds['username'], creds['password'], \
                    creds['producer'], creds['topic'], [(host, port)])
            data = []
            for doc in results:
                hid = doc.get("hash", 1)
                doc.pop('_id', None)  # delete ObjectID from MongoDB
                data.append(amq.make_notification(doc, hid))
            results = amq.send(data)
            print("### results from AMQ %s" % len(results))
    else:
        if isinstance(results, list):
            print("### number of results %s" % len(results))
            for doc in results:
                doc.pop('_id', None)  # delete ObjectID from MongoDB
                try:
                    print(json.dumps(doc))
                except Exception:
                    # fall back to plain representation of documents
                    # which cannot be serialized to JSON
                    print(doc)
        else:
            print(results)
Exemplo n.º 23
0
def main():
    """
    Main function.

    Parses the command line options, runs the job over the HDFS area(s)
    and dispatches the results: they are posted to opts.store, or sent as
    notifications via StompAMQ when opts.amq is given; otherwise only the
    number of results is printed.  With opts.scripts the scripts() helper
    is called and the process exits.
    """
    optmgr = OptionParser()
    opts = optmgr.parser.parse_args()
    time0 = time.time()

    if opts.scripts:
        scripts()
        sys.exit(0)

    # default time range is [yesterday, today] in YYYYMMDD form
    todate = datetime.datetime.today()
    todate = int(todate.strftime("%Y%m%d"))
    fromdate = datetime.datetime.today() - datetime.timedelta(days=1)
    fromdate = int(fromdate.strftime("%Y%m%d"))
    # opts.spec is either a file name or a JSON string; a spec which
    # cannot be loaded is deliberately treated as an empty one
    spec = {}
    if opts.spec:
        try:
            if os.path.isfile(opts.spec):
                # context manager closes the spec file after reading
                with open(opts.spec) as istream:
                    spec = json.load(istream)
            else:
                spec = json.loads(opts.spec)
        except Exception as exp:
            if opts.verbose:
                print("### unable to load spec, use empty one: %s" % exp)
    timerange = spec.get('spec', {}).get('timerange', [fromdate, todate])

    if opts.hdir == HDIR:
        hdir = opts.hdir.split()
        if len(hdir) == 1:
            # expand the area into sub-directories of the time range
            base = hdir[0]
            hdir = [os.path.join(base, tval)
                    for tval in range_dates(timerange)
                    if base.find(tval) == -1]
    else:
        hdir = opts.hdir
    results = run(opts.schema, hdir, opts.script, opts.spec, opts.verbose,
                  opts.rout, opts.yarn)
    if opts.store:
        data = {
            "results": results,
            "ts": time.time(),
            "etime": time.time() - time0
        }
        if opts.wmaid:
            data['wmaid'] = opts.wmaid
        else:
            data['wmaid'] = wmaHash(data)
        data['dtype'] = 'job'
        pdata = dict(job=data)
        postdata(opts.store, pdata, opts.ckey, opts.cert, opts.verbose)
    elif opts.amq:
        creds = credentials(opts.amq)
        # check credentials before they are used: indexing empty
        # credentials would fail before the guard is reached
        if creds and StompAMQ:
            host, port = creds['host_and_ports'].split(':')
            port = int(port)
            print("### Send %s docs via StompAMQ" % len(results))
            amq = StompAMQ(creds['username'], creds['password'], \
                    creds['producer'], creds['topic'], [(host, port)])
            data = []
            for doc in results:
                hid = doc.get("hash", 1)
                doc.pop('_id', None)  # delete ObjectID from MongoDB
                data.append(amq.make_notification(doc, hid))
            results = amq.send(data)
            # formatted print: a two-argument print() call shows a tuple
            # when executed as Python 2 print statement
            print("### results from AMQ %s" % len(results))
    else:
        print("### agg. results %s" % len(results))