def write(self, data):
    """
    Write given data chunk (list of WM documents) into the proxy server.
    Return a result dict describing the status of the write operation.
    """
    reason = ''
    status = 'ok'
    ids = []
    if isinstance(data, dict):
        data = [data]
    try:
        if not isinstance(data, list):
            raise HTTPError(500,
                    "WMArchive exception, invalid data format: %s" % type(data))
        docs = [r for r in self.encode(data)]
        ids = self.sts.write(docs)
        if not ids and len(data): # somehow we got empty list for given data
            status = 'unknown'
    except WriteError as exp:
        reason = tstamp("WMArchiveManager::write") + " exception: %s" % str(exp)
        print(reason)
        traceback.print_exc()
        ids = extractFWJRids(data)
        raise HTTPError(500, 'WMArchive WriteError, ids=%s, exception=%s' \
                % (ids, str(exp)))
    except Exception as exp:
        reason = tstamp("WMArchiveManager::write") + " exception: %s" % str(exp)
        print(reason)
        traceback.print_exc()
        ids = extractFWJRids(data)
        raise HTTPError(500, 'WMArchive exception, ids=%s, exception=%s' \
                % (ids, str(exp)))
    result = {'stype': self.sts.stype, 'ids': ids, 'status': status}
    if reason:
        result['reason'] = reason
    return result
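# Usage sketch for write() above (hypothetical payload and manager instance):
# the API accepts a single document or a list of documents and reports back
# the storage type, the assigned wmaids and a status flag.
def _write_example(mgr):
    doc = {'task': '/prod/task/step', 'meta_data': {'ts': 1469812345}}
    result = mgr.write(doc)
    # e.g. {'stype': 'mongodb', 'ids': ['2f4e9c31'], 'status': 'ok'}
    return result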
def read(self, spec, fields):
    """
    Send request to proxy server to read data for given query.
    Return a result dict with the list of found documents.
    """
    result = {'input': {'spec': spec, 'fields': fields},
              'results': [], 'storage': self.sts.stype, 'status': 'ok'}
    # convert given spec into query suitable for sts/lts
    if isinstance(spec, dict):
        try:
            trange = spec.pop('timerange')
        except KeyError:
            print(tstamp("WMArchiveManager::read"), "timerange is not provided")
            result['reason'] = 'No timerange is provided, please adjust your query spec'
            result['status'] = 'fail'
            return result
        if trange_check(trange):
            print(tstamp("WMArchiveManager::read"), "bad timerange: %s" % trange)
            result['reason'] = 'Unable to parse timerange, should be [YYYYMMDD, YYYYMMDD]'
            result['status'] = 'fail'
            return result
        # based on given time range define which manager
        # we'll use for data look-up
        mgr = self.sts
        if use_lts(trange, self.tls_thr):
            spec['timerange'] = trange # put back timerange for HDFS hdir constraint
            mgr = self.lts
        # convert spec into WMArchive one
        spec, fields = self.qmap(mgr, spec, fields)
    else:
        # a list spec means users look up docs by wmaids,
        # which represent results of an LTS data look-up
        mgr = self.sts
    status = 'ok'
    reason = None
    try:
        # request data from back-end
        data = mgr.read(spec, fields)
    except ReadError as exp:
        print(exp)
        data = []
        status = 'read error'
    except Exception as exp:
        data = []
        print(tstamp("WMArchiveManager::read"), "fail with %s" % str(exp))
        reason = str(exp)
        status = 'fail'
    result['data'] = data
    result['status'] = status
    if reason:
        result['reason'] = reason
    return result
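# Query sketch for read() above (hypothetical values): a dict spec must carry
# a 'timerange' in [YYYYMMDD, YYYYMMDD] form, which also decides between
# short- and long-term storage; a list spec is treated as a list of wmaids.
def _read_example(mgr):
    spec = {'timerange': [20160101, 20160107], 'lfn': '/store/data/file.root'}
    by_query = mgr.read(spec, ['lfn'])
    by_ids = mgr.read(['2f4e9c31'], [])
    return by_query, by_ids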
def daemon(name, opts):
    "Daemon function"
    thr = opts.thr*1024*1024 # convert input in MB into bytes
    while True:
        time.sleep(opts.sleep)
        print(tstamp(name), 'Migrate mongodb records to avro files')
        migrate(opts.muri, opts.odir, opts.mdir, \
                opts.schema, thr, opts.compress, opts.chunk, opts.mthr)
        print(tstamp(name), 'Cleanup MongoDB')
        cleanup(opts.muri, opts.tstamp, opts.stype)
def daemon(name, opts):
    "Daemon function"
    thr = opts.thr * 1024 * 1024 # convert input in MB into bytes
    while True:
        time.sleep(opts.sleep)
        print(tstamp(name), 'Migrate mongodb records to avro files')
        migrate(opts.muri, opts.odir, opts.mdir, \
                opts.schema, thr, opts.compress, opts.chunk)
        print(tstamp(name), 'Cleanup MongoDB')
        cleanup(opts.muri, opts.tstamp, opts.stype)
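# Usage sketch: the daemon loop blocks forever, so it is normally run in a
# named background thread and supervised, e.g. with the external
# start_new_thread helper and the monitor() function shown further below
# (opts is the parsed command-line options object these functions expect):
def _daemon_example(opts):
    start_new_thread('mongo2avro', daemon, ('mongo2avro', opts))
    monitor('mongo2avro', daemon, opts)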
def read(self, spec, fields):
    """
    Send request to proxy server to read data for given query.
    Return a result dict with the list of found documents.
    """
    self.read_access += 1
    # spec may be a list of wmaids, which carries no dtype
    dbname = spec.get('dtype', 'fwjr') if isinstance(spec, dict) else 'fwjr'
    result = {'input': {'spec': spec, 'fields': fields},
              'results': [], 'storage': self.sts[dbname].stype, 'status': 'ok'}
    # convert given spec into query suitable for sts/lts
    if isinstance(spec, dict):
        try:
            trange = spec.pop('timerange')
        except KeyError:
            print(tstamp("WMArchiveManager::read"),
                  "timerange is not provided in spec", spec)
            raise HTTPError(400, 'WMArchive no timerange, spec=%s' % spec)
        if trange_check(trange):
            print(tstamp("WMArchiveManager::read"), "bad timerange: %s" % trange)
            raise HTTPError(400, 'WMArchive unable to parse timerange, spec=%s' % spec)
        # based on given time range define which manager
        # we'll use for data look-up
        mgr = self.sts[dbname]
        if use_lts(trange, self.tls_thr):
            spec['timerange'] = trange # put back timerange for HDFS hdir constraint
            mgr = self.lts
        # convert spec into WMArchive one
        spec, fields = self.qmap(mgr, spec, fields)
    else:
        # a list spec means users look up docs by wmaids,
        # which represent results of an LTS data look-up
        mgr = self.sts[dbname]
    status = 'ok'
    reason = None
    try:
        # request data from back-end
        data = mgr.read(spec, fields)
    except ReadError as exp:
        print(tstamp("WMArchiveManager::read"), "exception: %s" % str(exp))
        traceback.print_exc()
        raise HTTPError(400, 'WMArchive ReadError, exception %s' % str(exp))
    except Exception as exp:
        print(tstamp("WMArchiveManager::read"), "exception: %s" % str(exp))
        traceback.print_exc()
        raise HTTPError(400, 'WMArchive exception %s' % str(exp))
    result['data'] = data
    result['status'] = status
    if reason:
        result['reason'] = reason
    return result
def cleanup(muri, tst, stype):
    "Cleanup data in MongoDB (muri) for given timestamp (tst)"
    time0 = time.time()
    mstg = MongoStorage(muri)
    # remove records whose type matches the given stype, i.e. already
    # migrated to HDFS, and whose time stamp is less than the provided one
    query = {'stype': stype, 'wmats': {'$lt': dateformat(tst)}}
    rdocs = mstg.ndocs(query)
    print(tstamp('mongo2avro'),
          'found %s docs (in %s) to be removed' % (rdocs, elapsed_time(time0)))
    time0 = time.time()
    mstg.remove(query)
    print(tstamp('mongo2avro'),
          'remove query %s in %s' % (query, elapsed_time(time0)))
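# Usage sketch for cleanup() (hypothetical values; requires a running
# MongoDB). The stype argument should match the storage type stamped on
# already-migrated documents, and tst is whatever cutoff specification
# dateformat() accepts:
def _cleanup_example():
    cleanup('mongodb://localhost:8230', '2016-01-01', 'avroio')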
def read(self, spec, fields):
    """
    Send request to proxy server to read data for given query.
    Return a result dict with the list of found documents.
    """
    result = {'input': {'spec': spec, 'fields': fields},
              'results': [], 'storage': self.sts.stype, 'status': 'ok'}
    # convert given spec into query suitable for sts/lts
    if isinstance(spec, dict):
        try:
            trange = spec.pop('timerange')
        except KeyError:
            print(tstamp("WMArchiveManager::read"),
                  "timerange is not provided in spec", spec)
            raise HTTPError(400, 'WMArchive no timerange, spec=%s' % spec)
        if trange_check(trange):
            print(tstamp("WMArchiveManager::read"), "bad timerange: %s" % trange)
            raise HTTPError(400, 'WMArchive unable to parse timerange, spec=%s' % spec)
        # based on given time range define which manager
        # we'll use for data look-up
        mgr = self.sts
        if use_lts(trange, self.tls_thr):
            spec['timerange'] = trange # put back timerange for HDFS hdir constraint
            mgr = self.lts
        # convert spec into WMArchive one
        spec, fields = self.qmap(mgr, spec, fields)
    else:
        # a list spec means users look up docs by wmaids,
        # which represent results of an LTS data look-up
        mgr = self.sts
    status = 'ok'
    reason = None
    try:
        # request data from back-end
        data = mgr.read(spec, fields)
    except ReadError as exp:
        print(tstamp("WMArchiveManager::read"), "exception: %s" % str(exp))
        traceback.print_exc()
        raise HTTPError(400, 'WMArchive ReadError, exception %s' % str(exp))
    except Exception as exp:
        print(tstamp("WMArchiveManager::read"), "exception: %s" % str(exp))
        traceback.print_exc()
        raise HTTPError(400, 'WMArchive exception %s' % str(exp))
    result['data'] = data
    result['status'] = status
    if reason:
        result['reason'] = reason
    return result
def write(self, data):
    """
    Write given data chunk (list of WM documents) into proxy server.
    Return a result dict describing the status of the write operation.
    """
    status = 'ok'
    ids = []
    try:
        if isinstance(data, dict):
            data = [data]
        if not isinstance(data, list):
            raise Exception("WMArchiveManager::write, Invalid data format: %s" % type(data))
        docs = [r for r in self.encode(data)]
        ids = self.sts.write(docs)
        if not ids and len(data): # somehow we got empty list for given data
            status = 'unknown'
    except WriteError as exp:
        print(exp)
        data = []
        status = 'write error'
    except Exception as exp:
        print(tstamp("WMArchiveManager::write"), "fail with %s" % str(exp))
        status = 'fail'
        ids = []
    result = {'stype': self.sts.stype, 'ids': ids, 'status': status}
    return result
def submit_spark(self, wmaid, spec, fields, wait=60):
    """
    Submit function provides an interface to submit jobs to HDFS/Spark/MR.
    It uses the subprocess module to call a specific executable, e.g. the
    bash script (myspark). The job parameters include: HDFS directory
    pattern, schema file, script name, spec file and store uri. The job is
    routed to the yarn cluster. The myspark script stores results back to
    the provided store uri, i.e. the WMArchive REST interface.
    """
    hdir = ' '.join(make_hdfs_path(self.hdir, spec.pop('timerange')))
    schema = self.uri
    sfile = 'PySpark/RecordFinder.py'
    if 'aggregate' in spec:
        sfile = 'PySpark/RecordReader.py'
    ppath = '/'.join(WMArchive.__file__.split('/')[:-1])
    script = os.path.join(ppath, sfile)
    data = json.dumps(dict(spec=spec, fields=fields))
    os.environ['PYTHONPATH'] = os.environ['PYTHONPATH'] + ':%s/PySpark' % ppath
    cmd = 'myspark %s --hdir="%s" --schema=%s --script=%s --spec=\'%s\' --store=%s --wmaid=%s' \
            % (self.yarn, hdir, schema, script, data, self.wmauri, wmaid)
    print(tstamp("WMArchive::LTS"), cmd)
    proc = subprocess.Popen(cmd, stdout=subprocess.PIPE,
                            stderr=subprocess.PIPE, shell=True, env=os.environ)
    # wait for process if we use taskmgr. The taskmgr has an internal queue
    # which controls the number of running jobs
    proc.wait()
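# For illustration, the command constructed above has the following shape
# (all values are hypothetical; self.yarn is an opaque option string that is
# passed through to myspark as-is):
#
#   myspark --yarn-cluster --hdir="hdfs:///path/2016/01/01 hdfs:///path/2016/01/02" \
#       --schema=hdfs:///schemas/fwjr.avsc \
#       --script=/path/to/WMArchive/PySpark/RecordFinder.py \
#       --spec='{"spec": {"lfn": "/store/file.root"}, "fields": ["lfn"]}' \
#       --store=https://host/wmarchive --wmaid=2f4e9c31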
def __init__(self, app, config, mount):
    """
    :arg app: reference to the application object.
    :arg config: reference to the configuration.
    :arg str mount: URL mount point."""
    mainroot = 'wmarchive' # entry point in access URL
    wpath = os.getenv('WMA_STATIC_ROOT', '')
    if not wpath:
        content = os.path.abspath(__file__).rsplit('/', 5)[0]
        xlib = (__file__.find("/xlib/") >= 0 and "x") or ""
        wpath = "%s/%sdata/" % (content, xlib)
    if not wpath.endswith('/'):
        wpath += '/'
    print(tstamp(self.__class__.__name__), "static content: %s" % wpath)
    mdict = {"root": wpath,
             "rx": re.compile(r"^[a-z]+/[-a-z0-9]+\.(?:html)$")}
    tdict = {"root": wpath+"templates/",
             "rx": re.compile(r"^([a-zA-Z]+/)*[-a-z0-9_]+\.(?:html|tmpl)$")}
    jdict = {"root": wpath+"js/",
             "rx": re.compile(r"^([a-zA-Z]+/)*[-a-z0-9_]+\.(?:js)$")}
    cdict = {"root": wpath+"css/",
             "rx": re.compile(r"^([a-zA-Z]+/)*[-a-z0-9_]+\..*(?:css)$")}
    idict = {"root": wpath+"images/",
             "rx": re.compile(r"^([a-zA-Z]+/)*[-a-z0-9_]+\.(?:png|gif|jpg)$")}
    roots = {mainroot: mdict, "templates": tdict,
             "js": jdict, "css": cdict, "images": idict}
    # location of frontpage in the root, e.g. wmarchive
    frontpage = "%s/templates/wma.html" % mainroot
    RESTFrontPage.__init__(self, app, config, mount, frontpage, roots)
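# Sanity sketch for the static-content regexes above: the patterns restrict
# served paths to extension-checked names under each root (copied verbatim
# from mdict and jdict in __init__).
import re

def _rx_example():
    main_rx = re.compile(r"^[a-z]+/[-a-z0-9]+\.(?:html)$")
    js_rx = re.compile(r"^([a-zA-Z]+/)*[-a-z0-9_]+\.(?:js)$")
    assert main_rx.match("wmarchive/index.html")
    assert js_rx.match("vendor/app.js")
    assert not main_rx.match("../../etc/passwd")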
def __init__(self, config=None):
    # define DB names to work with. These names should correspond to the
    # dtype of documents we assign, see the find_dtype and encode methods
    self.dbnames = ['fwjr', 'crab']
    # Short-Term Storage
    self.sts = {}
    for dbname in self.dbnames:
        self.sts[dbname] = STSManager(config.short_storage_uri, dbname=dbname)
    self.sts_agg = STSManager(config.short_storage_uri, dbname='aggregated')
    # Long-Term Storage
    self.tls_thr = config.long_storage_thr
    if LTS: # we'll use this module if it's loaded
        self.lts = LTSManager(config.long_storage_uri, config.wmauri, config.yarn)
    else: # fallback
        self.lts = self.sts['fwjr']
    self.specmap = {}
    with open(config.specmap, 'r') as istream:
        for line in istream.readlines():
            pair = line.replace('\n', '').split(',')
            self.specmap[pair[0]] = pair[1] # e.g. lfn -> LFNArray
    msg = "Short-Term Storage %s, Long-Term Storage %s, specmap %s" \
            % (self.sts, self.lts, self.specmap)
    print(tstamp("WMArchiveManager::init"), msg)
    self.time0 = time.time()
    self.read_access = 0
    self.write_access = 0
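# The specmap file parsed above is a plain "key,value" listing, one mapping
# per line. Only the lfn pair is confirmed by the inline comment above; the
# file content here is otherwise a hypothetical sketch:
#
#   lfn,LFNArray
#
# which the loop above turns into self.specmap == {'lfn': 'LFNArray'}.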
def migrate(muri, dbname, odir, mdir, avsc, thr, compress, chunk, close2midnight, dtype):
    "Write data from MongoDB (muri) to avro file(s) on local file system"
    mstg = MongoStorage(muri, dbname)
    auri = avsc if avsc.startswith('avroio:') else 'avroio:%s' % avsc
    astg = AvroStorage(auri)
    # read data from MongoDB for given storage and document types
    query = {'stype': mstg.stype, 'dtype': dtype}
    mdocs = mstg.find(query, None) # with no fields we'll get entire docs
    # loop over provided docs and write them into avro file on local file system
    wmaids = []
    total = 0
    fsize = 0
    fname = file_name(odir, mdir, thr, compress, close2midnight)
    while True:
        data = [r for r in itertools.islice(mdocs, chunk)]
        total += len(data)
        if not len(data):
            break
        ids = astg.file_write(fname, data)
        if os.path.isfile(fname):
            fsize = os.path.getsize(fname)
        wmaids += ids
        if ids: # update status attributes of docs in MongoDB
            spec = {'$set': {'stype': astg.stype}}
            mstg.update(ids, spec)
        try:
            if PSUTIL:
                pid = os.getpid()
                proc = psutil.Process(pid)
                mem = proc.memory_info_ex()
                rss = 'RSS:%s' % size_format(mem.rss)
            else:
                rss = ''
        except Exception:
            rss = ''
        print(tstamp('mongo2avro'), "%s docs %s %s (%s bytes) %s" \
                % (len(ids), fname, size_format(fsize), fsize, rss))
        fname = file_name(odir, mdir, thr, compress, close2midnight)
    print(tstamp('mongo2avro'), "wrote %s docs out of %s" % (len(wmaids), total))
def write(self, data):
    """
    Write given data chunk (list of WM documents) into proxy server.
    Return a result dict describing the status of the write operation.
    """
    self.write_access += 1
    reason = ''
    status = 'ok'
    stype = 'unknown'
    ids = []
    if isinstance(data, dict):
        data = [data]
    try:
        if not isinstance(data, list):
            raise HTTPError(500,
                    "WMArchive exception, invalid data format: %s" % type(data))
        docs = [r for r in self.encode(data)]
        dtype = docs[0]['dtype']
        ids = self.sts[dtype].write(docs)
        stype = self.sts[dtype].stype
        if not ids and len(data): # somehow we got empty list for given data
            status = 'unknown'
    except WriteError as exp:
        reason = tstamp("WMArchiveManager::write") + " exception: %s" % str(exp)
        print(reason)
        traceback.print_exc()
        ids = extractFWJRids(data)
        raise HTTPError(500, 'WMArchive WriteError, ids=%s, exception=%s' \
                % (ids, str(exp)))
    except Exception as exp:
        reason = tstamp("WMArchiveManager::write") + " exception: %s" % str(exp)
        print(reason)
        traceback.print_exc()
        ids = extractFWJRids(data)
        raise HTTPError(500, 'WMArchive exception, ids=%s, exception=%s' \
                % (ids, str(exp)))
    result = {'stype': stype, 'ids': ids, 'status': status}
    if reason:
        result['reason'] = reason
    return result
def migrate(muri, odir, mdir, avsc, thr, compress, chunk, close2midnight):
    "Write data from MongoDB (muri) to avro file(s) on local file system"
    mstg = MongoStorage(muri)
    auri = avsc if avsc.startswith('avroio:') else 'avroio:%s' % avsc
    astg = AvroStorage(auri)
    # read data from MongoDB, returned mdocs is generator type
    query = {'stype': mstg.stype}
    mdocs = mstg.find(query, None) # with no fields we'll get entire docs
    # loop over provided docs and write them into avro file on local file system
    wmaids = []
    total = 0
    fsize = 0
    fname = file_name(odir, mdir, thr, compress, close2midnight)
    while True:
        data = [r for r in itertools.islice(mdocs, chunk)]
        total += len(data)
        if not len(data):
            break
        ids = astg.file_write(fname, data)
        if os.path.isfile(fname):
            fsize = os.path.getsize(fname)
        wmaids += ids
        if ids: # update status attributes of docs in MongoDB
            spec = {'$set': {'stype': astg.stype}}
            mstg.update(ids, spec)
        try:
            if PSUTIL:
                pid = os.getpid()
                proc = psutil.Process(pid)
                mem = proc.memory_info_ex()
                rss = 'RSS:%s' % size_format(mem.rss)
            else:
                rss = ''
        except Exception:
            rss = ''
        print(tstamp('mongo2avro'), "%s docs %s %s (%s bytes) %s" \
                % (len(ids), fname, size_format(fsize), fsize, rss))
        fname = file_name(odir, mdir, thr, compress, close2midnight)
    print(tstamp('mongo2avro'), "wrote %s docs out of %s" % (len(wmaids), total))
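# Usage sketch for the migrate() variant defined directly above (hypothetical
# URIs and paths; requires a running MongoDB): drain documents in chunks of
# 1000 into avro files, rolling each file once it exceeds 256 MB.
def _migrate_example():
    migrate('mongodb://localhost:8230', '/data/wma', '/data/wma/migrate',
            '/data/wma/fwjr.avsc', 256*1024*1024, True, 1000, False)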
def monitor(name, func, args):
    "Monitor thread for given name/func/args"
    while True:
        threads = threading.enumerate()
        # Thread objects are not orderable themselves, sort by name
        threads.sort(key=lambda t: t.name)
        found = False
        for thr in threads:
            if name == thr.name:
                found = True
                break
        if not found:
            print(tstamp('WARNING'), 'mongo2avro thread was not found, start new one')
            start_new_thread(name, func, (name, args))
        time.sleep(5)
def __init__(self, config=None):
    # Short-Term Storage
    self.sts = STSManager(config.short_storage_uri)
    # Long-Term Storage
    self.tls_thr = config.long_storage_thr
    if LTS: # we'll use this module if it's loaded
        self.lts = LTSManager(config.long_storage_uri, config.wmauri, config.yarn)
    else: # fallback
        self.lts = self.sts
    self.specmap = {}
    with open(config.specmap, 'r') as istream:
        for line in istream.readlines():
            pair = line.replace('\n', '').split(',')
            self.specmap[pair[0]] = pair[1] # e.g. lfn -> LFNArray
    msg = "Short-Term Storage %s, Long-Term Storage %s, specmap %s" \
            % (self.sts, self.lts, self.specmap)
    print(tstamp("WMArchiveManager::init"), msg)
def move_file(fname, mdir):
    "Move given file into migration area"
    try:
        os.mkdir(mdir)
    except OSError:
        pass
    bname = os.path.basename(fname).split('.')[0]
    tname = time.strftime("%H%M%S", time.gmtime())
    nname = os.path.join(mdir, '%s_%s.avro' % (bname, tname))
    print(tstamp('mongo2avro'), 'mv %s %s' % (fname, nname))
    shutil.move(fname, nname)
    # remove empty "bad" file (see AvroIO.py) associated with fname
    bfname = '%s/bad/%s_bad.txt' % (os.path.dirname(fname), os.path.basename(fname))
    if os.path.isfile(bfname) and not os.path.getsize(bfname):
        os.remove(bfname)
def file_name(odir, mdir, thr, compress):
    """
    Read content of given dir and either re-use the existing file or create
    a new one based on the given file size threshold. When a file exceeds
    the threshold it is moved into the migration area.
    """
    files = [f for f in os.listdir(odir) \
            if os.path.isfile(os.path.join(odir, f))]
    if not files:
        return gen_file_name(odir, compress)
    files.sort()
    last_file = files[-1]
    fname = os.path.join(odir, last_file)
    size = os.path.getsize(fname)
    if size < thr:
        return fname
    try:
        os.mkdir(mdir)
    except OSError:
        pass
    # move the full file into the migration area
    bname = os.path.basename(fname).split('.')[0]
    tname = time.strftime("%H%M%S", time.gmtime())
    nname = os.path.join(mdir, '%s_%s.avro' % (bname, tname))
    print(tstamp('mongo2avro'), 'mv %s %s' % (fname, nname))
    shutil.move(fname, nname)
    # remove empty "bad" file (see AvroIO.py) associated with fname
    bfname = '%s/bad/%s_bad.txt' % (os.path.dirname(fname), os.path.basename(fname))
    if os.path.isfile(bfname) and not os.path.getsize(bfname):
        os.remove(bfname)
    return file_name(odir, mdir, thr, compress)
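# Usage sketch for the file_name() variant defined directly above
# (hypothetical paths; thr is in bytes): returns the current output file to
# append to, rolling full files into the migration area first.
def _file_name_example():
    return file_name('/data/wma', '/data/wma/migrate', 256*1024*1024, True)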
def __init__(self, config=None):
    # define DB names to work with. These names should correspond to the
    # dtype of documents we assign, see the find_dtype and encode methods
    self.dbnames = ['fwjr', 'crab']
    # Short-Term Storage
    self.sts = {}
    for dbname in self.dbnames:
        self.sts[dbname] = STSManager(config.short_storage_uri, dbname=dbname)
    self.sts_agg = STSManager(config.short_storage_uri, dbname='aggregated')
    # Long-Term Storage
    self.tls_thr = config.long_storage_thr
    if LTS: # we'll use this module if it's loaded
        self.lts = LTSManager(config.long_storage_uri, config.wmauri, config.yarn)
    else: # fallback
        self.lts = self.sts['fwjr']
    self.specmap = {}
    with open(config.specmap, 'r') as istream:
        for line in istream.readlines():
            pair = line.replace('\n', '').split(',')
            self.specmap[pair[0]] = pair[1] # e.g. lfn -> LFNArray
    # Monit manager
    self.monit = MonitManager(config.monit_credentials, config.monit_attributes)
    # NATS manager
    if hasattr(config, 'use_nats') and config.use_nats:
        self.nats = NATSManager(config.nats_server,
                topics=config.nats_topics,
                default_topic='cms.wmarchive', cms_filter=cms_filter)
    else:
        self.nats = None
    msg = "Short-Term Storage %s, Long-Term Storage %s, specmap %s" \
            % (self.sts, self.lts, self.specmap)
    msg += '\nMonit {}'.format(self.monit)
    msg += '\nNATS {}'.format(self.nats)
    print(tstamp("WMArchiveManager::init"), msg)
    self.time0 = time.time()
    self.read_access = 0
    self.write_access = 0
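# Configuration sketch for the manager above. The attribute names are the
# ones read in __init__; every value is hypothetical:
#
#   config.short_storage_uri = 'mongodb://localhost:8230'
#   config.long_storage_uri  = 'hdfs:///cms/wmarchive/avro'
#   config.long_storage_thr  = 30            # threshold consumed by use_lts()
#   config.wmauri            = 'https://host/wmarchive'
#   config.specmap           = '/data/wma/specmap.txt'
#   config.monit_credentials = '/data/wma/monit.json'
#   config.monit_attributes  = ['task', 'site']
#   config.use_nats          = False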
def log(self, msg):
    "Write given message to log stream"
    print(tstamp(self.__class__.__name__), msg)
def __str__(self):
    error = tstamp(repr(self.message))
    return error