Example #1
 def write(self, data):
     """
     Write given data chunk (list of WM documents) into proxy server.
     Return the status of the write operation.
     """
     reason = ''
     status = 'ok'
     ids = []
     if  isinstance(data, dict):
         data = [data]
     try:
         if  not isinstance(data, list):
             raise HTTPError(500, "WMArchive exception, invalid data format: %s" % type(data))
         docs = [r for r in self.encode(data)]
         ids = self.sts.write(docs)
         if  not ids and len(data): # somehow we got empty list for given data
             status = 'unknown'
     except WriteError as exp:
         reason = tstamp("WMArchiveManager::write") + " exception: %s" % str(exp)
         print(reason)
         traceback.print_exc()
         ids = extractFWJRids(data)
         raise HTTPError(500, 'WMArchive WriteError, ids=%s, exception=%s'\
                 % (ids, str(exp)))
     except Exception as exp:
         reason = tstamp("WMArchiveManager::write") + " exception: %s" % str(exp)
         print(reason)
         traceback.print_exc()
         ids = extractFWJRids(data)
         raise HTTPError(500, 'WMArchive exception, ids=%s, exception=%s'\
                 % (ids, str(exp)))
     result = {'stype': self.sts.stype, 'ids': ids, 'status': status}
     if  reason:
         result['reason'] = reason
     return result
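
A minimal usage sketch for the write method above, assuming an already configured manager instance (wma_mgr is a hypothetical name) and a toy document; a single dict is accepted and wrapped into a list before encoding:

    # toy document; real WM documents carry many more fields (illustrative content)
    doc = {'task': '/Example/Task', 'meta_data': {'ts': 1500000000}}
    result = wma_mgr.write(doc)     # a plain dict is wrapped into [doc] internally
    print(result['status'])         # 'ok' or 'unknown'; failures raise HTTPError
    print(result['ids'])            # wmaids assigned by the short-term storage
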
Example #2
    def read(self, spec, fields):
        """
        Send request to proxy server to read data for given query.
        Return a result dict with the list of found documents.
        """
        result = {'input': {'spec': spec, 'fields': fields},
                  'results': [], 'storage': self.sts.stype, 'status': 'ok'}
        # convert given spec into query suitable for sts/lts
        if  isinstance(spec, dict):
            try:
                trange = spec.pop('timerange')
            except KeyError:
                print(tstamp("WMArchiveManager::read"), "timerange is not provided")
                result['reason'] = 'No timerange is provided, please adjust your query spec'
                result['status'] = 'fail'
                return result

            if  trange_check(trange):
                print(tstamp("WMArchiveManager::read"), "bad timerange: %s" % trange)
                result['reason'] = 'Unable to parse timerange, should be [YYYYMMDD, YYYYMMDD]'
                result['status'] = 'fail'
                return result

            # based on given time range define which manager
            # we'll use for data look-up
            mgr = self.sts
            if  use_lts(trange, self.tls_thr):
                spec['timerange'] = trange # put back timerange for HDFS hdir constraint
                mgr = self.lts

            # convert spec into WMArchive one
            spec, fields = self.qmap(mgr, spec, fields)
        else:
            # if spec is a list, the user looks up docs by wmaids,
            # which represent results of an LTS data look-up
            mgr = self.sts
        status = 'ok'
        reason = None
        try:
            # request data from back-end
            data = mgr.read(spec, fields)
        except ReadError as exp:
            print(exp)
            data = []
            status = 'read error'
        except Exception as exp:
            data = []
            print(tstamp("WMArchiveManager::read"), "fail with %s" % str(exp))
            reason = str(exp)
            status = 'fail'
        result['data'] = data
        result['status'] = status
        if  reason:
            result['reason'] = reason
        return result
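
The read method above expects the spec dict to carry a timerange in [YYYYMMDD, YYYYMMDD] form; the range also decides whether the short-term or the long-term storage manager serves the query. A hedged usage sketch (wma_mgr is a hypothetical manager instance, field names are illustrative):

    spec = {'task': '/Example/Task', 'timerange': [20160101, 20160107]}
    fields = ['task', 'meta_data']
    result = wma_mgr.read(spec, fields)
    if result['status'] == 'ok':
        docs = result['data']        # list of matching documents
    else:
        print(result.get('reason'))  # e.g. missing or unparsable timerange
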
Example #3
def daemon(name, opts):
    "Daemon function"
    thr = opts.thr*1024*1024 # convert input in MB into bytes
    while True:
        time.sleep(opts.sleep)
        print(tstamp(name), 'Migrate mongodb records to avro files')
        migrate(opts.muri, opts.odir, opts.mdir, \
                opts.schema, thr, opts.compress, opts.chunk, opts.mthr)

        print(tstamp(name), 'Cleanup MongoDB')
        cleanup(opts.muri, opts.tstamp, opts.stype)
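
The daemon above reads its parameters from an options object; a minimal sketch of such an object, with attribute names taken from the call above and purely illustrative values:

    from argparse import Namespace

    opts = Namespace(sleep=600, thr=256, muri='mongodb://localhost:8230',
                     odir='/data/wma/avro', mdir='/data/wma/avro/migrate',
                     schema='/data/wma/fwjr.avsc', compress=False, chunk=1000,
                     mthr=4, tstamp='20160101', stype='avroio')
    # daemon('mongo2avro', opts) would then loop forever: sleep, migrate, cleanup
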
Example #4
def daemon(name, opts):
    "Daemon function"
    thr = opts.thr * 1024 * 1024  # convert input in MB into bytes
    while True:
        time.sleep(opts.sleep)
        print(tstamp(name), 'Migrate mongodb records to avro files')
        migrate(opts.muri, opts.odir, opts.mdir, \
                opts.schema, thr, opts.compress, opts.chunk)

        print(tstamp(name), 'Cleanup MongoDB')
        cleanup(opts.muri, opts.tstamp, opts.stype)
Example #5
    def read(self, spec, fields):
        """
        Send request to proxy server to read data for given query.
        Return a result dict with the list of found documents.
        """
        self.read_access += 1
        dbname = spec.get('dtype', 'fwjr')
        result = {'input': {'spec': spec, 'fields': fields},
                  'results': [], 'storage': self.sts[dbname].stype, 'status': 'ok'}
        # convert given spec into query suitable for sts/lts
        if  isinstance(spec, dict):
            try:
                trange = spec.pop('timerange')
            except KeyError:
                print(tstamp("WMArchiveManager::read"), "timerange is not provided in spec", spec)
                raise HTTPError(400, 'WMArchive no timerange, spec=%s' % spec)

            if  trange_check(trange):
                print(tstamp("WMArchiveManager::read"), "bad timerange: %s" % trange)
                raise HTTPError(400, 'WMArchive unable to parse timerange, spec=%s' % spec)

            # based on given time range define which manager
            # we'll use for data look-up
            mgr = self.sts[dbname]
            if  use_lts(trange, self.tls_thr):
                spec['timerange'] = trange # put back timerange for HDFS hdir constraint
                mgr = self.lts

            # convert spec into WMArchive one
            spec, fields = self.qmap(mgr, spec, fields)
        else:
            # if spec is a list, the user looks up docs by wmaids,
            # which represent results of an LTS data look-up
            mgr = self.sts[dbname]
        status = 'ok'
        reason = None
        try:
            # request data from back-end
            data = mgr.read(spec, fields)
        except ReadError as exp:
            print(tstamp("WMArchiveManager::read"), "exception: %s" % str(exp))
            traceback.print_exc()
            raise HTTPError(400, 'WMArchive ReadError, exception %s' % str(exp))
        except Exception as exp:
            print(tstamp("WMArchiveManager::read"), "exception: %s" % str(exp))
            traceback.print_exc()
            raise HTTPError(400, 'WMArchive exception %s' % str(exp))
        result['data'] = data
        result['status'] = status
        if  reason:
            result['reason'] = reason
        return result
Example #6
def cleanup(muri, tst, stype):
    "Cleanup data in MongoDB (muri) for given timestamp (tst)"
    time0 = time.time()
    mstg = MongoStorage(muri)
    # remove records whose type is hdfsio, i.e. already migrated to HDFS,
    # and whose time stamp is less than provided one
    query = {'stype': stype, 'wmats':{'$lt': dateformat(tst)}}
    rdocs = mstg.ndocs(query)
    tdocs = time.time()-time0
    print(tstamp('mongo2avro'), 'found %s docs (in %s) to be removed' % (rdocs, elapsed_time(time0)))
    time0 = time.time()
    response = mstg.remove(query)
    print(tstamp('mongo2avro'), 'remove query %s in %s' % (query, elapsed_time(time0)))
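
For orientation, the same selection expressed directly with pymongo; this is only a sketch, since MongoStorage wraps the actual query and the database/collection names, stype value and numeric cutoff below are assumptions:

    from pymongo import MongoClient

    client = MongoClient('mongodb://localhost:8230')           # muri, assumed
    coll = client['fwjr']['db']                                # db/collection layout is an assumption
    query = {'stype': 'avroio', 'wmats': {'$lt': 1451606400}}  # migrated docs older than the cutoff
    print(coll.count_documents(query))                         # analogue of mstg.ndocs(query)
    coll.delete_many(query)                                    # analogue of mstg.remove(query)
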
Example #7
    def read(self, spec, fields):
        """
        Send request to proxy server to read data for given query.
        Return a result dict with the list of found documents.
        """
        result = {'input': {'spec': spec, 'fields': fields},
                  'results': [], 'storage': self.sts.stype, 'status': 'ok'}
        # convert given spec into query suitable for sts/lts
        if  isinstance(spec, dict):
            try:
                trange = spec.pop('timerange')
            except KeyError:
                print(tstamp("WMArchiveManager::read"), "timerange is not provided in spec", spec)
                raise HTTPError(400, 'WMArchive no timerange, spec=%s' % spec)

            if  trange_check(trange):
                print(tstamp("WMArchiveManager::read"), "bad timerange: %s" % trange)
                raise HTTPError(400, 'WMArchive unable to parse timerange, spec=%s' % spec)

            # based on given time range define which manager
            # we'll use for data look-up
            mgr = self.sts
            if  use_lts(trange, self.tls_thr):
                spec['timerange'] = trange # put back timerange for HDFS hdir constraint
                mgr = self.lts

            # convert spec into WMArchive one
            spec, fields = self.qmap(mgr, spec, fields)
        else:
            # if spec is a list, the user looks up docs by wmaids,
            # which represent results of an LTS data look-up
            mgr = self.sts
        status = 'ok'
        reason = None
        try:
            # request data from back-end
            data = mgr.read(spec, fields)
        except ReadError as exp:
            print(tstamp("WMArchiveManager::read"), "exception: %s" % str(exp))
            traceback.print_exc()
            raise HTTPError(400, 'WMArchive ReadError, exception %s' % str(exp))
        except Exception as exp:
            print(tstamp("WMArchiveManager::read"), "exception: %s" % str(exp))
            traceback.print_exc()
            raise HTTPError(400, 'WMArchive exception %s' % str(exp))
        result['data'] = data
        result['status'] = status
        if  reason:
            result['reason'] = reason
        return result
Example #8
def cleanup(muri, tst, stype):
    "Cleanup data in MongoDB (muri) for given timestamp (tst)"
    time0 = time.time()
    mstg = MongoStorage(muri)
    # remove records whose type is hdfsio, i.e. already migrated to HDFS,
    # and whose time stamp is less than provided one
    query = {'stype': stype, 'wmats': {'$lt': dateformat(tst)}}
    rdocs = mstg.ndocs(query)
    tdocs = time.time() - time0
    print(tstamp('mongo2avro'),
          'found %s docs (in %s) to be removed' % (rdocs, elapsed_time(time0)))
    time0 = time.time()
    response = mstg.remove(query)
    print(tstamp('mongo2avro'),
          'remove query %s in %s' % (query, elapsed_time(time0)))
Example #9
 def write(self, data):
     """
     Write given data chunk (list of WM documents) into proxy server.
     Return the status of the write operation.
     """
     status = 'ok'
     ids = []
     try:
         if  isinstance(data, dict):
             data = [data]
         if  not isinstance(data, list):
             raise Exception("WMArchiveManager::write, Invalid data format: %s" % type(data))
         docs = [r for r in self.encode(data)]
         ids = self.sts.write(docs)
         if  not ids and len(data): # somehow we got empty list for given data
             status = 'unknown'
     except WriteError as exp:
         print(exp)
         data = []
         status = 'write error'
     except Exception as exp:
         print(tstamp("WMArchiveManager::write"), "fail with %s" % str(exp))
         status = 'fail'
         ids = []
     result = {'stype': self.sts.stype, 'ids': ids, 'status': status}
     return result
Example #10
    def submit_spark(self, wmaid, spec, fields, wait=60):
        """
        Submit function provides an interface for submitting a job to
        HDFS/Spark/MR. It uses the subprocess module to call a
        specific command, e.g. the bash script (myspark).

        The job parameters include: HDFS directory pattern, schema file,
        script name, spec file and store uri. The job will be routed to the
        yarn cluster. The myspark script will store results back to the
        provided store uri, i.e. the WMArchive REST interface.
        """
        "Run given command in subprocess"
        hdir = ' '.join(make_hdfs_path(self.hdir, spec.pop('timerange')))
        schema = self.uri
        sfile = 'PySpark/RecordFinder.py'
        if 'aggregate' in spec:
            sfile = 'PySpark/RecordReader.py'
        ppath = '/'.join(WMArchive.__file__.split('/')[:-1])
        script = os.path.join(ppath, sfile)
        data = json.dumps(dict(spec=spec, fields=fields))
        os.environ[
            'PYTHONPATH'] = os.environ['PYTHONPATH'] + ':%s/PySpark' % ppath
        cmd = 'myspark %s --hdir="%s" --schema=%s --script=%s --spec=\'%s\' --store=%s --wmaid=%s' \
                % (self.yarn, hdir, schema, script, data, self.wmauri, wmaid)
        print(tstamp("WMArchive::LTS"), cmd)
        proc = subprocess.Popen(cmd,
                                stdout=subprocess.PIPE,
                                stderr=subprocess.PIPE,
                                shell=True,
                                env=os.environ)
        # wait for process if we use taskmgr. The taskmgr has internal queue
        # which controls number of running jobs
        proc.wait()
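
A note on the subprocess pattern used above: when stdout and stderr are redirected to pipes, draining them with communicate() avoids the child blocking on a full pipe buffer. A minimal sketch of the same call pattern with a placeholder command instead of the real myspark invocation:

    import os
    import subprocess

    cmd = 'echo myspark --hdir=/hdfs/path --wmaid=123'   # placeholder command
    proc = subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE,
                            shell=True, env=os.environ)
    out, err = proc.communicate()                        # waits for the child and drains both pipes
    print(proc.returncode, out.decode(), err.decode())
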
Example #11
 def __init__(self, app, config, mount):
     """
     :arg app: reference to the application object.
     :arg config: reference to the configuration.
     :arg str mount: URL mount point."""
     mainroot = 'wmarchive'  # entry point in access URL
     wpath = os.getenv('WMA_STATIC_ROOT', '')
     if not wpath:
         content = os.path.abspath(__file__).rsplit('/', 5)[0]
         xlib = (__file__.find("/xlib/") >= 0 and "x") or ""
         wpath = "%s/%sdata/" % (content, xlib)
     if not wpath.endswith('/'):
         wpath += '/'
     print(tstamp(self.__class__.__name__), "static content: %s" % wpath)
     mdict = {"root": wpath, \
              "rx": re.compile(r"^[a-z]+/[-a-z0-9]+\.(?:html)$")}
     tdict = {"root": wpath+"templates/", \
             "rx": re.compile(r"^([a-zA-Z]+/)*[-a-z0-9_]+\.(?:html|tmpl)$")}
     jdict = {"root": wpath+"js/", \
              "rx": re.compile(r"^([a-zA-Z]+/)*[-a-z0-9_]+\.(?:js)$")}
     cdict = {"root": wpath+"css/", \
              "rx": re.compile(r"^([a-zA-Z]+/)*[-a-z0-9_]+\..*(?:css)$")}
     idict = {"root": wpath+"images/", \
              "rx": re.compile(r"^([a-zA-Z]+/)*[-a-z0-9_]+\.(?:png|gif|jpg)$")}
     roots = {mainroot: mdict, "templates": tdict, \
             "js": jdict, "css": cdict, "images": idict}
     # location of frontpage in the root, e.g. wmarchive
     frontpage = "%s/templates/wma.html" % mainroot
     RESTFrontPage.__init__(self, app, config, mount, frontpage, roots)
Example #12
    def submit_spark(self, wmaid, spec, fields, wait=60):
        """
        Submit function provides an interface for submitting a job to
        HDFS/Spark/MR. It uses the subprocess module to call a
        specific command, e.g. the bash script (myspark).

        The job parameters include: HDFS directory pattern, schema file,
        script name, spec file and store uri. The job will be routed to the
        yarn cluster. The myspark script will store results back to the
        provided store uri, i.e. the WMArchive REST interface.
        """
        "Run given command in subprocess"
        hdir = ' '.join(make_hdfs_path(self.hdir, spec.pop('timerange')))
        schema = self.uri
        sfile = 'PySpark/RecordFinder.py'
        if  'aggregate' in spec:
            sfile = 'PySpark/RecordReader.py'
        ppath = '/'.join(WMArchive.__file__.split('/')[:-1])
        script = os.path.join(ppath, sfile)
        data = json.dumps(dict(spec=spec, fields=fields))
        os.environ['PYTHONPATH'] = os.environ['PYTHONPATH'] + ':%s/PySpark' % ppath
        cmd = 'myspark %s --hdir="%s" --schema=%s --script=%s --spec=\'%s\' --store=%s --wmaid=%s' \
                % (self.yarn, hdir, schema, script, data, self.wmauri, wmaid)
        print(tstamp("WMArchive::LTS"), cmd)
        proc = subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE, shell=True, env=os.environ)
        # wait for process if we use taskmgr. The taskmgr has internal queue
        # which controls number of running jobs
        proc.wait()
Example #13
 def __init__(self, config=None):
     # define DB names to work with. These names should correspond to
     # dtype of documents we assign, see find_dtype and encode method
     self.dbnames = ['fwjr', 'crab']
     # Short-Term Storage
     self.sts = {}
     for dbname in self.dbnames:
         self.sts[dbname] = STSManager(config.short_storage_uri,
                                       dbname=dbname)
     self.sts_agg = STSManager(config.short_storage_uri,
                               dbname='aggregated')
     # Long-Term Storage
     self.tls_thr = config.long_storage_thr
     if LTS:  # we'll use this module if it's loaded
         self.lts = LTSManager(config.long_storage_uri, config.wmauri,
                               config.yarn)
     else:  # fallback
         self.lts = self.sts['fwjr']
     self.specmap = {}
     with open(config.specmap, 'r') as istream:
         cdict = {}
         for line in istream.readlines():
             pair = line.replace('\n', '').split(',')
             self.specmap[pair[0]] = pair[1]  # lfn:LFNArray
     msg = "Short-Term Storage %s, Long-Term Storage %s, specmap %s" % (
         self.sts, self.lts, self.specmap)
     print(tstamp("WMArchiveManager::init"), msg)
     self.time0 = time.time()
     self.read_access = 0
     self.write_access = 0
Example #14
 def __init__(self, app, config, mount):
     """
     :arg app: reference to the application object.
     :arg config: reference to the configuration.
     :arg str mount: URL mount point."""
     mainroot = 'wmarchive' # entry point in access URL
     wpath = os.getenv('WMA_STATIC_ROOT', '')
     if  not wpath:
         content = os.path.abspath(__file__).rsplit('/', 5)[0]
         xlib = (__file__.find("/xlib/") >= 0 and "x") or ""
         wpath = "%s/%sdata/" % (content, xlib)
     if  not wpath.endswith('/'):
         wpath += '/'
     print(tstamp(self.__class__.__name__), "static content: %s" % wpath)
     mdict = {"root": wpath, \
              "rx": re.compile(r"^[a-z]+/[-a-z0-9]+\.(?:html)$")}
     tdict = {"root": wpath+"templates/", \
             "rx": re.compile(r"^([a-zA-Z]+/)*[-a-z0-9_]+\.(?:html|tmpl)$")}
     jdict = {"root": wpath+"js/", \
              "rx": re.compile(r"^([a-zA-Z]+/)*[-a-z0-9_]+\.(?:js)$")}
     cdict = {"root": wpath+"css/", \
              "rx": re.compile(r"^([a-zA-Z]+/)*[-a-z0-9_]+\..*(?:css)$")}
     idict = {"root": wpath+"images/", \
              "rx": re.compile(r"^([a-zA-Z]+/)*[-a-z0-9_]+\.(?:png|gif|jpg)$")}
     roots = {mainroot: mdict, "templates": tdict, \
             "js": jdict, "css": cdict, "images": idict}
     # location of frontpage in the root, e.g. wmarchive
     frontpage = "%s/templates/wma.html" % mainroot
     RESTFrontPage.__init__(self, app, config, mount, frontpage, roots)
Example #15
def migrate(muri, dbname, odir, mdir, avsc, thr, compress, chunk,
            close2midnight, dtype):
    "Write data from MongoDB (muri) to avro file(s) on local file system"
    mstg = MongoStorage(muri, dbname)
    auri = avsc if avsc.startswith('avroio:') else 'avroio:%s' % avsc
    astg = AvroStorage(auri)

    # read data from MongoDB for given storage and document types
    query = {'stype': mstg.stype, 'dtype': dtype}
    mdocs = mstg.find(query, None)  # with no fields we'll get entire docs

    # loop over provided docs and write them into avro file on local file system
    wmaids = []
    total = 0
    fsize = 0
    fname = file_name(odir, mdir, thr, compress, close2midnight)
    while True:
        data = [r for r in itertools.islice(mdocs, chunk)]
        total += len(data)
        if not len(data):
            break
        ids = astg.file_write(fname, data)
        if os.path.isfile(fname):
            fsize = os.path.getsize(fname)
        wmaids += ids

        if ids:
            # update status attributes of docs in MongoDB
            spec = {'$set': {'stype': astg.stype}}
            mstg.update(ids, spec)

        try:
            if PSUTIL:
                pid = os.getpid()
                proc = psutil.Process(pid)
                mem = proc.memory_info_ex()
                rss = 'RSS:%s' % size_format(mem.rss)
            else:
                rss = ''
        except:
            rss = ''
        print(tstamp('mongo2avro'), "%s docs %s %s (%s bytes) %s" \
                % (len(ids), fname, size_format(fsize), fsize, rss))
        fname = file_name(odir, mdir, thr, compress, close2midnight)
    print(tstamp('mongo2avro'),
          "wrote %s docs out of %s" % (len(wmaids), total))
Example #16
 def write(self, data):
     """
     Write given data chunk (list of WM documents) into proxy server.
     Return the status of the write operation.
     """
     self.write_access += 1
     reason = ''
     status = 'ok'
     stype = 'unknown'
     ids = []
     if isinstance(data, dict):
         data = [data]
     try:
         if not isinstance(data, list):
             raise HTTPError(
                 500, "WMArchive exception, invalid data format: %s" %
                 type(data))
         docs = [r for r in self.encode(data)]
         dtype = docs[0]['dtype']
         ids = self.sts[dtype].write(docs)
         stype = self.sts[dtype].stype
         if not ids and len(
                 data):  # somehow we got empty list for given data
             status = 'unknown'
     except WriteError as exp:
         reason = tstamp(
             "WMArchiveManager::write") + " exception: %s" % str(exp)
         print(reason)
         traceback.print_exc()
         ids = extractFWJRids(data)
         raise HTTPError(500, 'WMArchive WriteError, ids=%s, exception=%s'\
                 % (ids, str(exp)))
     except Exception as exp:
         reason = tstamp(
             "WMArchiveManager::write") + " exception: %s" % str(exp)
         print(reason)
         traceback.print_exc()
         ids = extractFWJRids(data)
         raise HTTPError(500, 'WMArchive exception, ids=%s, exception=%s'\
                 % (ids, str(exp)))
     result = {'stype': stype, 'ids': ids, 'status': status}
     if reason:
         result['reason'] = reason
     return result
Example #17
def migrate(muri, odir, mdir, avsc, thr, compress, chunk, close2midnight):
    "Write data from MongoDB (muri) to avro file(s) on local file system"
    mstg = MongoStorage(muri)
    auri = avsc if avsc.startswith('avroio:') else 'avroio:%s' % avsc
    astg = AvroStorage(auri)

    # read data from MongoDB, returned mdocs is generator type
    query = {'stype': mstg.stype}
    mdocs = mstg.find(query, None) # with no fields we'll get entire docs

    # loop over provided docs and write them into avro file on local file system
    wmaids = []
    total = 0
    fsize = 0
    fname = file_name(odir, mdir, thr, compress, close2midnight)
    while True:
        data = [r for r in itertools.islice(mdocs, chunk)]
        total += len(data)
        if  not len(data):
            break
        ids = astg.file_write(fname, data)
        if  os.path.isfile(fname):
            fsize = os.path.getsize(fname)
        wmaids += ids

        if  ids:
            # update status attributes of docs in MongoDB
            spec = {'$set' : {'stype': astg.stype}}
            mstg.update(ids, spec)

        try:
            if  PSUTIL:
                pid = os.getpid()
                proc = psutil.Process(pid)
                mem = proc.memory_info_ex()
                rss = 'RSS:%s' % size_format(mem.rss)
            else:
                rss = ''
        except:
            rss = ''
        print(tstamp('mongo2avro'), "%s docs %s %s (%s bytes) %s" \
                % (len(ids), fname, size_format(fsize), fsize, rss))
        fname = file_name(odir, mdir, thr, compress, close2midnight)
    print(tstamp('mongo2avro'), "wrote %s docs out of %s" % (len(wmaids), total))
Example #18
def monitor(name, func, args):
    "Monitor thread for given name/func/args"
    while True:
        threads = threading.enumerate()
        threads.sort(key=lambda thr: thr.name) # Thread objects are not orderable in Python 3
        found = False
        for thr in threads:
            if  name == thr.name:
                found = True
                break
        if  not found:
            print(tstamp('WARNING'), 'mongo2avro thread was not found, start new one')
            start_new_thread(name, func, (name, args))
        time.sleep(5)
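
The watchdog above relies on the project's start_new_thread helper; an equivalent restart loop with the standard threading module looks roughly like this (a sketch only, target and args are placeholders):

    import threading
    import time

    def keep_alive(name, target, args, period=5):
        "Restart the named daemon thread whenever it is no longer alive"
        while True:
            if not any(thr.name == name for thr in threading.enumerate()):
                thr = threading.Thread(target=target, name=name, args=args)
                thr.daemon = True
                thr.start()
            time.sleep(period)
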
Example #19
def monitor(name, func, args):
    "Monitor thread for given name/func/args"
    while True:
        threads = threading.enumerate()
        threads.sort(key=lambda thr: thr.name) # Thread objects are not orderable in Python 3
        found = False
        for thr in threads:
            if name == thr.name:
                found = True
                break
        if not found:
            print(tstamp('WARNING'),
                  'mongo2avro thread was not found, start new one')
            start_new_thread(name, func, (name, args))
        time.sleep(5)
Example #20
 def __init__(self, config=None):
     # Short-Term Storage
     self.sts = STSManager(config.short_storage_uri)
     # Long-Term Storage
     self.tls_thr = config.long_storage_thr
     if  LTS: # we'll use this module if it's loaded
         self.lts = LTSManager(config.long_storage_uri, config.wmauri, config.yarn)
     else: # fallback
         self.lts = self.sts
     self.specmap = {}
     with open(config.specmap, 'r') as istream:
         cdict = {}
         for line in istream.readlines():
             pair = line.replace('\n', '').split(',')
             self.specmap[pair[0]] = pair[1] # lfn:LFNArray
     msg = "Short-Term Storage %s, Long-Term Storage %s, specmap %s" % (self.sts, self.lts, self.specmap)
     print(tstamp("WMArchiveManager::init"), msg)
Example #21
def move_file(fname, mdir):
    "Move given file into migration area"
    try:
        os.mkdir(mdir)
    except OSError:
        pass
    bname = os.path.basename(fname).split('.')[0]
    tname = time.strftime("%H%M%S", time.gmtime())
    nname = os.path.join(mdir, '%s_%s.avro' % (bname, tname))
    print(tstamp('mongo2avro'), 'mv %s %s' % (fname, nname))
    shutil.move(fname, nname)

    # remove bad file (see AvroIO.py) associated with fname
    bfname = '%s/bad/%s_bad.txt' % (os.path.dirname(fname), os.path.basename(fname))
    if  os.path.isfile(bfname):
        bfsize = os.path.getsize(bfname)
        if  not bfsize:
            os.remove(bfname)
Example #22
 def __init__(self, config=None):
     # Short-Term Storage
     self.sts = STSManager(config.short_storage_uri)
     # Long-Term Storage
     self.tls_thr = config.long_storage_thr
     if LTS:  # we'll use this module if it's loaded
         self.lts = LTSManager(config.long_storage_uri, config.wmauri,
                               config.yarn)
     else:  # fallback
         self.lts = self.sts
     self.specmap = {}
     with open(config.specmap, 'r') as istream:
         cdict = {}
         for line in istream.readlines():
             pair = line.replace('\n', '').split(',')
             self.specmap[pair[0]] = pair[1]  # lfn:LFNArray
     msg = "Short-Term Storage %s, Long-Term Storage %s, specmap %s" % (
         self.sts, self.lts, self.specmap)
     print(tstamp("WMArchiveManager::init"), msg)
Example #23
def move_file(fname, mdir):
    "Move given file into migration area"
    try:
        os.mkdir(mdir)
    except OSError:
        pass
    bname = os.path.basename(fname).split('.')[0]
    tname = time.strftime("%H%M%S", time.gmtime())
    nname = os.path.join(mdir, '%s_%s.avro' % (bname, tname))
    print(tstamp('mongo2avro'), 'mv %s %s' % (fname, nname))
    shutil.move(fname, nname)

    # remove bad file (see AvroIO.py) associated with fname
    bfname = '%s/bad/%s_bad.txt' % (os.path.dirname(fname),
                                    os.path.basename(fname))
    if os.path.isfile(bfname):
        bfsize = os.path.getsize(bfname)
        if not bfsize:
            os.remove(bfname)
Example #24
def file_name(odir, mdir, thr, compress):
    """
    Read the content of the given dir and either re-use an existing file or create a new one
    based on the given file size threshold. When a file exceeds the threshold it is
    moved into the migration area within the same directory.
    """
    files = [f for f in os.listdir(odir) \
            if os.path.isfile(os.path.join(odir, f))]
    if not files:
        return gen_file_name(odir, compress)

    files.sort()
    last_file = files[-1]
    fname = os.path.join(odir, last_file)
    size = os.path.getsize(fname)
    if size < thr:
        return fname

    try:
        os.mkdir(mdir)
    except OSError:
        pass

    # move files into migration area
    bname = os.path.basename(fname).split('.')[0]
    tname = time.strftime("%H%M%S", time.gmtime())
    nname = os.path.join(mdir, '%s_%s.avro' % (bname, tname))
    print(tstamp('mongo2avro'), 'mv %s %s' % (fname, nname))
    shutil.move(fname, nname)

    # remove bad file (see AvroIO.py) associated with fname
    bfname = '%s/bad/%s_bad.txt' % (os.path.dirname(fname),
                                    os.path.basename(fname))
    if os.path.isfile(bfname):
        bfsize = os.path.getsize(bfname)
        if not bfsize:
            os.remove(bfname)
    return file_name(odir, mdir, thr, compress)
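
A hedged usage sketch for file_name above; the directories and the 256 MB threshold are illustrative, and gen_file_name is the project's own helper used when the area is empty:

    odir = '/data/wma/avro'            # area holding the currently written avro file
    mdir = '/data/wma/avro/migrate'    # migration area for files over the threshold
    thr = 256 * 1024 * 1024            # rotate once the newest file exceeds 256 MB
    fname = file_name(odir, mdir, thr, compress=False)
    # fname is either the still-growing file or a freshly generated name
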
Example #25
def file_name(odir, mdir, thr, compress):
    """
    Read the content of the given dir and either re-use an existing file or create a new one
    based on the given file size threshold. When a file exceeds the threshold it is
    moved into the migration area within the same directory.
    """
    files = [f for f in os.listdir(odir) \
            if os.path.isfile(os.path.join(odir, f))]
    if  not files:
        return gen_file_name(odir, compress)

    files.sort()
    last_file = files[-1]
    fname = os.path.join(odir, last_file)
    size = os.path.getsize(fname)
    if  size < thr:
        return fname

    try:
        os.mkdir(mdir)
    except OSError:
        pass

    # move files into migration area
    bname = os.path.basename(fname).split('.')[0]
    tname = time.strftime("%H%M%S", time.gmtime())
    nname = os.path.join(mdir, '%s_%s.avro' % (bname, tname))
    print(tstamp('mongo2avro'), 'mv %s %s' % (fname, nname))
    shutil.move(fname, nname)

    # remove bad file (see AvroIO.py) associated with fname
    bfname = '%s/bad/%s_bad.txt' % (os.path.dirname(fname), os.path.basename(fname))
    if  os.path.isfile(bfname):
        bfsize = os.path.getsize(bfname)
        if  not bfsize:
            os.remove(bfname)
    return file_name(odir, mdir, thr, compress)
Example #26
 def __init__(self, config=None):
     # define DB names to work with. These names should correspond to
     # dtype of documents we assign, see find_dtype and encode method
     self.dbnames = ['fwjr', 'crab']
     # Short-Term Storage
     self.sts = {}
     for dbname in self.dbnames:
         self.sts[dbname] = STSManager(config.short_storage_uri, dbname=dbname)
     self.sts_agg = STSManager(config.short_storage_uri, dbname='aggregated')
     # Long-Term Storage
     self.tls_thr = config.long_storage_thr
     if  LTS: # we'll use this module if it's loaded
         self.lts = LTSManager(config.long_storage_uri, config.wmauri, config.yarn)
     else: # fallback
         self.lts = self.sts['fwjr']
     self.specmap = {}
     with open(config.specmap, 'r') as istream:
         cdict = {}
         for line in istream.readlines():
             pair = line.replace('\n', '').split(',')
             self.specmap[pair[0]] = pair[1] # lfn:LFNArray
     # Monit manager
     self.monit = MonitManager(config.monit_credentials, config.monit_attributes)
     # NATS manager
     if hasattr(config, 'use_nats') and config.use_nats:
         self.nats = NATSManager(config.nats_server, topics=config.nats_topics, default_topic='cms.wmarchive', cms_filter=cms_filter)
     else:
         self.nats = None
     msg = "Short-Term Storage %s, Long-Term Storage %s, specmap %s" \
             % (self.sts, self.lts, self.specmap)
     msg += '\nMonit {}'.format(self.monit)
     msg += '\nNATS {}'.format(self.nats)
     print(tstamp("WMArchiveManager::init"), msg)
     self.time0 = time.time()
     self.read_access = 0
     self.write_access = 0
Example #27
 def log(self, msg):
     "Write given message to log stream"
     print(tstamp(self.__class__.__name__), msg)
Example #28
 def log(self, msg):
     "Write given message to log stream"
     print(tstamp(self.__class__.__name__), msg)
Example #29
 def __str__(self):
     error = tstamp(repr(self.message))
     return error
Example #30
 def __str__(self):
     error = tstamp(repr(self.message))
     return error