def validate_mapping(self, mapping, env):
    idxkwargs = self[env]
    # just get the default indexer (target_name doesn't exist, return default one)
    idxklass = self.find_indexer(target_name="__placeholder_name__%s" % get_random_string())
    idxr_obj = idxklass(**idxkwargs)
    settings = idxr_obj.get_index_creation_settings()
    # generate a random index, it'll be deleted at the end
    index_name = ("hub_tmp_%s" % get_random_string()).lower()
    idxr = ESIndexer(index=index_name, es_host=idxr_obj.host, doc_type=None)
    self.logger.info(
        "Testing mapping by creating index '%s' on host '%s' (settings: %s)",
        index_name, idxr_obj.host, settings)
    try:
        res = idxr.create_index(mapping, settings)
        return res
    except Exception as e:
        self.logger.exception("create_index failed")
        raise e
    finally:
        try:
            idxr.delete_index()
        except Exception:
            pass

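# Note: every snippet in this listing relies on get_random_string() to build
# collision-free temporary index/collection names. The helper itself isn't shown
# here; the stand-in below is only a hypothetical sketch with the same contract
# (a short random alphanumeric suffix). The length and alphabet are assumptions,
# not the library's actual implementation.
import random
import string

def get_random_string(length=6):
    # hypothetical stand-in, not biothings' implementation
    return ''.join(random.choice(string.ascii_letters + string.digits)
                   for _ in range(length))
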
def make_temp_collection(self):
    '''Create a temp collection for dataloading, e.g., entrez_geneinfo_INEMO.'''
    if self.temp_collection_name:
        # already set
        return
    self.temp_collection_name = self.collection_name + '_temp_' + get_random_string()
    return self.temp_collection_name

def make_temp_collection(self):
    '''Create a temp collection for dataloading, e.g., entrez_geneinfo_INEMO.'''
    new_collection = None
    while True:
        new_collection = self.__collection__ + '_temp_' + get_random_string()
        if new_collection not in self.db.collection_names():
            break
    self.temp_collection = self.db[new_collection]
    return new_collection

def switch_collection(self):
    '''After a successful loading, rename temp_collection to the regular collection
       name, and rename the existing collection to a temp name for archiving purposes.
    '''
    if self.temp_collection and self.temp_collection.count() > 0:
        if self.collection.count() > 0:
            # rename existing collection for archiving
            new_name = '_'.join([self.__collection__, 'archive', get_timestamp(), get_random_string()])
            self.collection.rename(new_name, dropTarget=True)
        self.temp_collection.rename(self.__collection__)
    else:
        print("Error: load data first.")

def _validate_mapping():
    client = AsyncElasticsearch(**indexer.es_client_args)
    index_name = ("hub_tmp_%s" % get_random_string()).lower()
    try:
        return (yield from client.indices.create(
            index_name,
            body={
                "settings": (yield from indexer.es_index_settings.finalize(client)),
                "mappings": (yield from indexer.es_index_mappings.finalize(client))
            }))
    finally:
        yield from client.indices.delete(index_name, ignore_unavailable=True)
        yield from client.close()

def defer_to_process(self, pinfo=None, func=None, *args):
    @asyncio.coroutine
    def run(future, job_id):
        nonlocal pinfo
        yield from self.check_constraints(pinfo)
        self.ok_to_run.release()
        # pinfo can contain predicates which are hardly pickleable during run_in_executor,
        # but we also must not touch the original one
        copy_pinfo = copy.deepcopy(pinfo)
        copy_pinfo.pop("__predicates__", None)
        self.jobs[job_id] = copy_pinfo
        res = self.loop.run_in_executor(
            self.process_queue,
            partial(do_work, job_id, "process", copy_pinfo, func, *args))

        def ran(f):
            try:
                # consume the future, just to trigger potential exceptions
                r = f.result()
            finally:
                # whatever the result, make sure to clean the job registry
                # to keep it in sync with actually running jobs
                self.jobs.pop(job_id)

        res.add_done_callback(ran)
        res = yield from res
        # the process could generate other parallelized jobs and return a Future/Task.
        # If so, we want to make sure we get the results from that task
        if type(res) == asyncio.Task:
            res = yield from res
        future.set_result(res)

    yield from self.ok_to_run.acquire()
    f = asyncio.Future()

    def runned(innerf, job_id):
        if innerf.exception():
            f.set_exception(innerf.exception())

    job_id = get_random_string()
    fut = asyncio.ensure_future(run(f, job_id))
    fut.add_done_callback(partial(runned, job_id=job_id))
    return f

def switch_collection(self):
    '''After a successful loading, rename temp_collection to the regular collection
       name, and rename the existing collection to a temp name for archiving purposes.
    '''
    if self.temp_collection_name and self.db[self.temp_collection_name].count() > 0:
        if self.collection_name in self.db.collection_names():
            # rename existing collection for archiving
            new_name = '_'.join([self.collection_name, 'archive', get_timestamp(), get_random_string()])
            self.collection.rename(new_name, dropTarget=True)
        self.logger.info("Renaming collection '%s' to '%s'" %
                         (self.temp_collection_name, self.collection_name))
        self.db[self.temp_collection_name].rename(self.collection_name)
    else:
        raise ResourceError("No temp collection (or it's empty)")

def defer_to_thread(self, pinfo=None, func=None, *args):
    skip_check = pinfo.get("__skip_check__", False)

    @asyncio.coroutine
    def run(future, job_id):
        if not skip_check:
            yield from self.check_constraints(pinfo)
            self.ok_to_run.release()
        self.jobs[job_id] = pinfo
        res = self.loop.run_in_executor(
            self.thread_queue,
            partial(do_work, job_id, "thread", pinfo, func, *args))

        def ran(f):
            try:
                # consume the future, just to trigger potential exceptions
                r = f.result()
            finally:
                # whatever the result, make sure to clean the job registry
                # to keep it in sync with actually running jobs
                self.jobs.pop(job_id)

        res.add_done_callback(ran)
        res = yield from res
        # the thread could generate other parallelized jobs and return a Future/Task.
        # If so, we want to make sure we get the results from that task
        if type(res) == asyncio.Task:
            res = yield from res
        future.set_result(res)

    if not skip_check:
        yield from self.ok_to_run.acquire()
    f = asyncio.Future()

    def runned(innerf, job_id):
        if innerf.exception():
            f.set_exception(innerf.exception())

    job_id = get_random_string()
    fut = asyncio.ensure_future(run(f, job_id))
    fut.add_done_callback(partial(runned, job_id=job_id))
    return f

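# Hedged usage sketch for the job manager methods above. "job_manager" and
# do_upload are illustrative placeholders, not names taken from this listing;
# the pinfo keys mirror those used elsewhere in these snippets. Old-style
# (yield from) asyncio is kept to match the surrounding code.
import asyncio
from functools import partial

def do_upload(name):
    # placeholder for the actual blocking work executed in the thread pool
    print("uploading", name)

@asyncio.coroutine
def submit_example(job_manager):
    pinfo = {"category": "upload", "source": "mysource", "step": "data",
             "description": "example job"}
    # defer_to_thread resolves to a future wrapping the executor job
    job = yield from job_manager.defer_to_thread(pinfo, partial(do_upload, "mysource"))
    yield from job  # wait for the deferred job to complete
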
def backup(folder=".", archive=None):
    """
    Dump the whole hub_db database into the given folder. "archive" can be passed
    to specify the target filename, otherwise it's randomly generated.

    Note: this doesn't back up source/merge data, just the internal data used by the hub.
    """
    # get database name (ie. hub_db internal database)
    db_name = get_src_dump().database.name
    dump = {}
    for getter in [
            get_src_dump, get_src_master, get_src_build, get_src_build_config,
            get_data_plugin, get_api, get_cmd, get_event, get_hub_config
    ]:
        col = getter()
        dump[col.name] = []
        for doc in col.find():
            dump[col.name].append(doc)
    if not archive:
        archive = "backup_%s_%s.pyobj" % (get_timestamp(), get_random_string())
    path = os.path.join(folder, archive)
    dumpobj(dump, path)
    return path

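# Hedged usage example: dump the hub's internal collections to a folder
# (assumes hub_db and its collections have been configured elsewhere; the
# folder path is illustrative).
path = backup(folder="/tmp/hub_backups")
print("hub internal data dumped to", path)
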
def func_wrapper(*args, **kwargs):
    ptype = args[0]  # tracking a process or a thread ?
    # we're looking for some "pinfo" value (process info) for later
    # reporting. If we can't find any, we'll try our best to figure out
    # what this is about...
    # func is the do_work wrapper, we want the actual partial
    # is the first arg a callable (func) or pinfo ?
    if callable(args[1]):
        innerfunc = args[1]
        innerargs = args[2:]
        pinfo = None
    else:
        innerfunc = args[2]
        innerargs = args[3:]
        pinfo = args[1]
    # make sure we can pickle the whole thing (it's just informative,
    # so stringifying is fine there)
    innerargs = [str(arg) for arg in innerargs]
    if type(innerfunc) == partial:
        fname = innerfunc.func.__name__
    elif type(innerfunc) == types.MethodType:
        fname = innerfunc.__self__.__class__.__name__
    else:
        fname = innerfunc.__name__
    firstarg = innerargs and innerargs[0] or ""
    if not pinfo:
        pinfo = {
            "category": None,
            "source": None,
            "step": None,
            "description": "%s %s" % (fname, firstarg)
        }
    worker = {
        'func_name': fname,
        'args': innerargs,
        'kwargs': kwargs,
        'started_at': time.time(),
        'info': pinfo
    }
    results = None
    exc = None
    trace = None
    try:
        _id = None
        rnd = get_random_string()
        if ptype == "thread":
            _id = "%s" % threading.current_thread().getName()
        else:
            _id = os.getpid()
        # add random chars: 2 jobs handled by the same slot (pid or thread)
        # would override the filename otherwise
        fn = "%s_%s" % (_id, rnd)
        worker["info"]["id"] = _id
        pidfile = os.path.join(config.RUN_DIR, "%s.pickle" % fn)
        pickle.dump(worker, open(pidfile, "wb"))
        results = func(*args, **kwargs)
    except Exception as e:
        import traceback
        trace = traceback.format_exc()
        logger.error("err %s\n%s" % (e, trace))
        # we want to store the exception, so for now just keep a reference
        exc = e
    finally:
        if os.path.exists(pidfile):
            # move to "done" dir and register end of execution time
            os.rename(pidfile,
                      os.path.join(config.RUN_DIR, "done", os.path.basename(pidfile)))
            pidfile = os.path.join(config.RUN_DIR, "done", os.path.basename(pidfile))
            worker = pickle.load(open(pidfile, "rb"))
            worker["duration"] = timesofar(worker["started_at"])
            worker["err"] = exc
            worker["trace"] = trace
            # try to keep the original exception, but this may fail depending on
            # what's in the exception. If we can't, keep the string representation
            try:
                pickle.dump(worker, open(pidfile, "wb"))
            except Exception:
                worker["err"] = str(exc)
                pickle.dump(worker, open(pidfile, "wb"))
    # now raise the original exception
    if exc:
        raise exc
    return results

def id_feeder(col, batch_size=1000, build_cache=True, logger=logging,
              force_use=False, force_build=False, validate_only=False):
    """Return an iterator for all _ids in collection "col".

    Search for a valid cache file if available, otherwise return a doc_feeder
    for that collection. A valid cache is a cache file that is newer than the
    collection. "db" can be "target" or "src".
    "build_cache" True will build a cache file as _ids are fetched, if no
    cache file was found.
    "force_use" True will use any existing cache file and won't check whether
    it's valid or not.
    "force_build" True will build a new cache even if the current one exists
    and is valid.
    "validate_only" will directly return [] if the cache is valid (convenient
    way to check if the cache is valid).
    """
    src_db = get_src_db()
    ts = None
    found_meta = True

    if isinstance(col, DocMongoBackend):
        col = col.target_collection
    try:
        if col.database.name == config.DATA_TARGET_DATABASE:
            info = src_db["src_build"].find_one({"_id": col.name})
            if not info:
                logger.warning("Can't find information for target collection '%s'" % col.name)
            else:
                ts = info.get("_meta", {}).get("build_date")
                ts = ts and dtparser.parse(ts).timestamp()
        elif col.database.name == config.DATA_SRC_DATABASE:
            src_dump = get_src_dump()
            info = src_dump.find_one({
                "$where": "function() {if(this.upload) {for(var index in this.upload.jobs) {if(this.upload.jobs[index].step == \"%s\") return this;}}}" % col.name
            })
            if not info:
                logger.warning("Can't find information for source collection '%s'" % col.name)
            else:
                ts = info["upload"]["jobs"][col.name]["started_at"].timestamp()
        else:
            logging.warning("Can't find metadata for collection '%s' (not a target, not a source collection)" % col)
            found_meta = False
            build_cache = False
    except KeyError:
        logger.warning("Couldn't find timestamp in database for '%s'" % col.name)
    except Exception as e:
        logger.info("%s is not a mongo collection, _id cache won't be built (error: %s)" % (col, e))
        build_cache = False

    # try to find a cache file
    use_cache = False
    cache_file = None
    cache_format = getattr(config, "CACHE_FORMAT", None)
    if found_meta and getattr(config, "CACHE_FOLDER", None):
        cache_file = get_cache_filename(col.name)
        try:
            # size of an empty file differs depending on compression
            empty_size = {None: 0, "xz": 32, "gzip": 25, "bz2": 14}
            if force_build:
                logger.warning("Force building cache file")
                use_cache = False
            # check size, delete if invalid
            elif os.path.getsize(cache_file) <= empty_size.get(cache_format, 32):
                logger.warning("Cache file exists but is empty, delete it")
                os.remove(cache_file)
            elif force_use:
                use_cache = True
                logger.info("Force using cache file")
            else:
                mt = os.path.getmtime(cache_file)
                if ts and mt >= ts:
                    dtmt = datetime.datetime.fromtimestamp(mt).isoformat()
                    dtts = datetime.datetime.fromtimestamp(ts).isoformat()
                    logging.debug("Cache is valid, modiftime_cache:%s >= col_timestamp:%s" % (dtmt, dtts))
                    use_cache = True
                else:
                    logger.info("Cache is too old, discard it")
        except FileNotFoundError:
            pass

    if use_cache:
        logger.debug("Found valid cache file for '%s': %s" % (col.name, cache_file))
        if validate_only:
            logging.debug("Only validating cache, now return")
            return []
        with open_compressed_file(cache_file) as cache_in:
            if cache_format:
                iocache = io.TextIOWrapper(cache_in)
            else:
                iocache = cache_in
            for ids in iter_n(iocache, batch_size):
                yield [_id.strip() for _id in ids if _id.strip()]
    else:
        logger.debug("No cache file found (or invalid) for '%s', use doc_feeder" % col.name)
        cache_out = None
        cache_temp = None
        if getattr(config, "CACHE_FOLDER", None) and config.CACHE_FOLDER and build_cache:
            if not os.path.exists(config.CACHE_FOLDER):
                os.makedirs(config.CACHE_FOLDER)
            cache_temp = "%s._tmp_" % cache_file
            # clean aborted cache file generation
            for tmpcache in glob.glob(os.path.join(config.CACHE_FOLDER, "%s*" % cache_temp)):
                logger.info("Removing aborted cache file '%s'" % tmpcache)
                os.remove(tmpcache)
            # use a temp file and rename once done
            cache_temp = "%s%s" % (cache_temp, get_random_string())
            cache_out = get_compressed_outfile(cache_temp, compress=cache_format)
            logger.info("Building cache file '%s'" % cache_temp)
        else:
            logger.info("Can't build cache, cache not allowed or no cache folder")
            build_cache = False
        if isinstance(col, Collection):
            doc_feeder_func = partial(doc_feeder, col, step=batch_size,
                                      inbatch=True, fields={"_id": 1})
        elif isinstance(col, DocMongoBackend):
            doc_feeder_func = partial(doc_feeder, col.target_collection, step=batch_size,
                                      inbatch=True, fields={"_id": 1})
        elif isinstance(col, DocESBackend):
            # get_id_list directly returns the _id, wrap it to match other
            # doc_feeder_func returned values. Also return batches of ids
            def wrap_id():
                ids = []
                for _id in col.get_id_list(step=batch_size):
                    ids.append({"_id": _id})
                    if len(ids) >= batch_size:
                        yield ids
                        ids = []
                if ids:
                    yield ids
            doc_feeder_func = partial(wrap_id)
        else:
            raise Exception("Unknown backend %s" % col)
        for doc_ids in doc_feeder_func():
            doc_ids = [str(_doc["_id"]) for _doc in doc_ids]
            if build_cache:
                strout = "\n".join(doc_ids) + "\n"
                if cache_format:
                    # assuming binary format (b/c compressed)
                    cache_out.write(strout.encode())
                else:
                    cache_out.write(strout)
            yield doc_ids
        if build_cache:
            cache_out.close()
            cache_final = os.path.splitext(cache_temp)[0]
            os.rename(cache_temp, cache_final)

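# Hedged usage sketch for id_feeder: stream all _ids of a source collection in
# batches. "mysource" is an illustrative collection name; get_src_db() is the
# same helper used inside id_feeder above.
col = get_src_db()["mysource"]
total = 0
for ids in id_feeder(col, batch_size=5000):
    total += len(ids)
print("%d _ids seen" % total)
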
def _get_target_name(self):
    return 'genedoc_{}_{}_{}'.format(self._build_config['name'],
                                     get_timestamp(), get_random_string()).lower()

def generate_target_name(self, build_config_name):
    assert build_config_name is not None
    return '{}_{}_{}'.format(build_config_name, get_timestamp(),
                             get_random_string()).lower()

def sync_from_one_diff(index, collection, diff_filepath, validate=False,
                       wait=60, dryrun=False, returncnt=False, save2file=None):
    sync = ESSyncer(index=index)
    #sync._index = index
    #sync._esi._index = index
    diff = loadobj(diff_filepath)
    source_collection = diff['source']
    add_iter = sync.add(source_collection, diff['add'])
    delete_iter = sync.delete(collection, diff['delete'])
    update_iter = sync.update2(diff['update'], collection, source_collection)
    t00 = time()
    if save2file:
        from itertools import chain
        import json
        for op in chain(add_iter, delete_iter, update_iter):
            json.dump(op, save2file)
        print("=" * 20)
        print("Finished! [{}]".format(timesofar(t00)))
        return

    print('Adding new {} docs...'.format(len(diff['add'])))
    t0 = time()
    if not dryrun:
        try:
            bulk(sync._es, add_iter)
        except Exception:
            pass
    print("Done. [{}]".format(timesofar(t0)))

    print('Deleting {} docs'.format(len(diff['delete'])))
    t0 = time()
    if not dryrun:
        bulk(sync._es, delete_iter)
    print("Done. [{}]".format(timesofar(t0)))

    print('Updating {} docs'.format(len(diff['update'])))
    t0 = time()
    if not dryrun:
        bulk(sync._es, update_iter)
    print("Done. [{}]".format(timesofar(t0)))

    # add flush and refresh
    try:
        res = sync._es.indices.flush()
        print("Flushing...", res)
        res = sync._es.indices.refresh()
        print("Refreshing...", res)
    except Exception:
        pass

    print("=" * 20)
    print("Finished! [{}]".format(timesofar(t00)))

    if returncnt:
        cnt = {
            'add': len(diff['add']),
            'delete': len(diff['delete']),
            'update': len(diff['update'])
        }
        return cnt

    if validate:
        print('Waiting {}s to let ES finish...'.format(wait), end="")
        sleep(wait)
        print("Done.")
        print("Validating...")
        t0 = time()
        q = {
            "query": {
                "constant_score": {
                    "filter": {
                        "exists": {
                            "field": 'clinvar'
                        }
                    }
                }
            }
        }
        data = sync._esi.doc_feeder(query=q, _source=collection)
        temp_collection = collection + '_temp_' + get_random_string()
        sync._src[temp_collection].drop()
        load_source(temp_collection, src_data=data)
        c1 = get_backend(source_collection, 'mongodb')
        c2 = get_backend(temp_collection, 'mongodb')
        diff_result = diff_collections(c1, c2, use_parallel=False)
        sync._src[temp_collection].drop()
        print("Done. [{}]".format(timesofar(t0)))
        return diff_result

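# Hedged usage sketch for sync_from_one_diff: a dry run that only reports the
# number of operations contained in a diff file. The index, collection and file
# names are illustrative, and ES/Mongo connections are assumed to be configured.
cnt = sync_from_one_diff("mysource_20210101", "mysource_collection",
                         "/data/diffs/diff_1.pyobj", dryrun=True, returncnt=True)
print(cnt)
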
def generate_target_name(self, build_config_name):
    return 'genedoc_{}_{}_{}'.format(build_config_name, get_timestamp(),
                                     get_random_string()).lower()