def cleanup(
        self,
        env=None,  # a snapshot environment describing a repository
        keep=3,  # the number of most recent snapshots to keep in one group
        group_by="build_config",  # the attr of which its values form groups
        dryrun=True,  # display the snapshots to be deleted without deleting them
        **filters  # a set of criteria to limit which snapshots are to be cleaned
):
    """
    Delete past snapshots and keep only the most recent ones.

    Examples:
        >>> snapshot_cleanup()
        >>> snapshot_cleanup("s3_outbreak")
        >>> snapshot_cleanup("s3_outbreak", keep=0)
    """
    snapshots = cleaner.find(  # filters support dotfield
        get_src_build(), env, keep, group_by, **filters)

    if dryrun:
        return '\n'.join((
            "-" * 75,
            cleaner.plain_text(snapshots),
            "-" * 75,
            "DRYRUN ONLY - APPLY THE ACTIONS WITH:",
            " > snapshot_cleanup(..., dryrun=False)"))

    # return the number of snapshots successfully deleted
    return cleaner.delete(get_src_build(), snapshots, self)
@asyncio.coroutine  # generator-based coroutine (pre-async/await idiom)
def _snapshot(snapshot):
    x = CumulativeResult()
    build_doc = self._doc(index)
    cfg = self.repcfg.format(build_doc)
    for step in ("pre", "snapshot", "post"):
        state = registrar.dispatch(step)  # _TaskState class
        state = state(get_src_build(), build_doc.get("_id"))
        logging.info(state)
        state.started()
        job = yield from self.job_manager.defer_to_thread(
            self.pinfo.get_pinfo(step, snapshot),
            partial(getattr(self, state.func), cfg, index, snapshot))
        try:
            dx = yield from job
            dx = StepResult(dx)
        except Exception as exc:
            logging.exception(exc)
            state.failed({}, exc)
            raise exc
        else:
            merge(x.data, dx.data)
            logging.info(dx)
            logging.info(x)
            state.succeed({snapshot: x.data}, res=dx.data)
    return x
def update_metadata(self, indexer_env, index_name, build_name=None, _meta=None):
    """
    Update _meta for index_name, taking _meta either from the src_build
    document identified by build_name, or directly from the _meta argument.
    """
    idxkwargs = self[indexer_env]
    # 1st pass: get the doc_type (don't want to ask for that in the signature...)
    indexer = create_backend(
        (idxkwargs["es_host"], index_name, None)).target_esidxer
    m = indexer._es.indices.get_mapping(index_name)
    assert len(m[index_name]["mappings"]) == 1, \
        "Found more than one doc_type: %s" % m[index_name]["mappings"].keys()
    doc_type = list(m[index_name]["mappings"].keys())[0]
    # 2nd pass: re-create the indexer with the correct doc_type
    indexer = create_backend(
        (idxkwargs["es_host"], index_name, doc_type)).target_esidxer
    if build_name:
        build = get_src_build().find_one({"_id": build_name})
        assert build, "No such build named '%s'" % build_name
        _meta = build.get("_meta")
    assert _meta is not None, "No _meta found"
    return indexer.update_mapping_meta({"_meta": _meta})
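# A minimal usage sketch for update_metadata. The manager instance, the
# environment name "prod" and the build/index names below are hypothetical:
#
#   manager.update_metadata("prod", "mygene_current",
#                           build_name="mygene_build_202001")
#
# Alternatively, pass _meta directly to bypass the src_build lookup:
#
#   manager.update_metadata("prod", "mygene_current",
#                           _meta={"build_version": "20200101"})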
def __init__(self, *args, **kwargs):
    super(IndexManager, self).__init__(*args, **kwargs)
    self.src_build = get_src_build()
    self.indexers = {}
    self.es_config = {}
    self.t0 = time.time()
    self.prepared = False
    self.log_folder = LOG_FOLDER
    self.timestamp = datetime.now()
    self.setup()
def clean_stale_status(self):
    src_build = get_src_build()
    for build in src_build.find():
        for job in build.get("jobs", []):
            if job.get("status", "").endswith("snapshotting"):
                logging.warning(
                    "Found stale build '%s', marking snapshot status as 'canceled'"
                    % build["_id"])
                job["status"] = "canceled"
        src_build.replace_one({"_id": build["_id"]}, build)
def clean_stale_status(self):
    src_build = get_src_build()
    for build in src_build.find():
        dirty = False
        for job in build.get("jobs", []):
            if job.get("status") == "syncing":
                logging.warning(
                    "Found stale build '%s', marking sync status as 'canceled'"
                    % build["_id"])
                job["status"] = "canceled"
                dirty = True
        if dirty:
            src_build.replace_one({"_id": build["_id"]}, build)
def post_publish(self, s3_folder, old_db_col_names, new_db_col_names,
                 diff_folder, release_folder, steps, s3_bucket, *args, **kwargs):
    bdoc = get_src_build().find_one({"_id": new_db_col_names})
    assert bdoc, "Can't find build doc associated with index '%s' " \
                 "(should be named the same)" % new_db_col_names
    ids_file = export_ids(new_db_col_names)
    redir = "%s_ids.xz" % bdoc["build_config"]["assembly"]
    if "demo" in new_db_col_names:
        redir = "demo_%s" % redir
    upload_ids(ids_file, redir,
               s3_bucket=config.IDS_S3_BUCKET,
               aws_key=config.AWS_KEY,
               aws_secret=config.AWS_SECRET)
def post_publish(self, snapshot, index, *args, **kwargs):
    # assuming build name == index name, and assuming a demo index has
    # "demo" in its name...
    # assuming a full (non-demo) index, guess the redirect name now
    bdoc = get_src_build().find_one({"_id": index})
    assert bdoc, "Can't find build doc associated with index '%s' " \
                 "(should be named the same)" % index
    ids_file = export_ids(index)
    if "hg19" in index or "hg19" in snapshot:
        redir = "hg19_ids.xz"
    else:
        redir = "hg38_ids.xz"
    if "demo" in index or "demo" in snapshot:
        redir = "demo_%s" % redir
    upload_ids(ids_file, redir,
               s3_bucket=config.IDS_S3_BUCKET,
               aws_key=config.AWS_KEY,
               aws_secret=config.AWS_SECRET)
async def _update_meta(_meta):
    env = self.register[indexer_env]
    async with AsyncElasticsearch(**env["args"]) as client:
        doc_type = None
        if int((await client.info())['version']['number'].split('.')[0]) < 7:
            mappings = await client.indices.get_mapping(index_name)
            mappings = mappings[index_name]["mappings"]
            doc_type = next(iter(mappings.keys()))
        if _meta is None:
            _id = build_name or index_name  # best guess
            build = get_src_build().find_one({"_id": _id})
            _meta = (build or {}).get("_meta")
        return await client.indices.put_mapping(
            body=dict(_meta=_meta), index=index_name, doc_type=doc_type)
def load_build(self):
    """
    Load cold and hot build documents.
    Index settings are the ones declared in the hot build doc.
    """
    src_build = get_src_build()
    # we don't want to reload build docs if they are already loaded,
    # so we can temporarily override values when dealing with cold/hot collections
    # (kind of a hack, not really clean, but...)
    if self.hot_build_doc and self.cold_build_doc and self.build_doc:
        self.logger.debug("Build documents already loaded")
        return
    self.hot_build_doc = src_build.find_one({'_id': self.hot_target_name})
    # search the cold collection definition
    assert "build_config" in self.hot_build_doc \
        and "cold_collection" in self.hot_build_doc["build_config"], \
        "Can't find cold_collection field in build_config"
    self.cold_target_name = self.hot_build_doc["build_config"]["cold_collection"]
    self.cold_build_doc = src_build.find_one({'_id': self.cold_target_name})
    # we'll register everything (status) on the hot one
    self.build_doc = self.hot_build_doc
    assert self.cold_build_doc, \
        "Can't find build document associated to '%s'" % self.cold_target_name
    assert self.hot_build_doc, \
        "Can't find build document associated to '%s'" % self.hot_target_name
    self.cold_cfg = self.cold_build_doc.get("build_config")
    self.hot_cfg = self.hot_build_doc.get("build_config")
    if self.hot_cfg and self.cold_cfg:
        self.build_config = self.hot_cfg
        if "doc_type" not in self.hot_cfg:
            raise ValueError("Missing 'doc_type' in build config")
        self.doc_type = self.hot_cfg["doc_type"]
        self.num_shards = self.hot_cfg.get("num_shards", 10)  # optional
        self.num_shards = int(self.num_shards) if self.num_shards else self.num_shards
        self.num_replicas = self.hot_cfg.get("num_replicas", 0)  # optional
        self.num_replicas = int(self.num_replicas) if self.num_replicas else self.num_replicas
        self.conf_name = self.hot_cfg["name"]
    else:
        raise ValueError(
            "Cannot find build config associated to '%s' or '%s'"
            % (self.hot_target_name, self.cold_target_name))
    return (self.cold_cfg, self.hot_cfg)
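# For reference, a hot build document is expected to look roughly like the
# sketch below (all values hypothetical); load_build() follows
# build_config.cold_collection to locate the matching cold build doc:
#
#   {
#       "_id": "mygene_hot_202001",
#       "build_config": {
#           "name": "mygene_hot",
#           "doc_type": "gene",
#           "cold_collection": "mygene_cold_202001",
#           "num_shards": 10,      # optional
#           "num_replicas": 0      # optional
#       },
#       ...
#   }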
def extract_coldbuild(self):
    cold_target = self.build_config["cold_collection"]
    cold_build_doc = get_src_build().find_one({'_id': cold_target})
    cold_build_doc = _BuildDoc(cold_build_doc)
    cold_build_doc["_id"] = self.build_name  # *
    cold_build_doc["mapping"].update(self["mapping"])  # combine mapping
    merge_src_build_metadata([cold_build_doc, self])  # combine _meta
    # * About State Updates
    # All updates are diverted to the hot collection.
    # Indices & snapshots are only registered there.
    if self.build_config.get("num_shards"):
        cold_build_doc.build_config["num_shards"] = \
            self.build_config["num_shards"]
    if self.build_config.get("num_replicas"):
        cold_build_doc.build_config["num_replicas"] = \
            self.build_config["num_replicas"]
    return cold_build_doc
def __init__(self, *args, **kwargs):
    """
    An example of a config dict for this module:

    {
        "indexer_select": {
            None: "hub.dataindex.indexer.DrugIndexer",  # default
            "build_config.cold_collection": "mv.ColdHotVariantIndexer",
        },
        "env": {
            "prod": {
                "host": "localhost:9200",
                "indexer": {
                    "args": {
                        "timeout": 300,
                        "retry_on_timeout": True,
                        "max_retries": 10,
                    },
                    "bulk": {
                        "chunk_size": 50,
                        "raise_on_exception": False
                    },
                    "concurrency": 3
                },
                "index": [
                    # for information only, only used in index_info
                    {"index": "mydrugs_current", "doc_type": "drug"},
                    {"index": "mygene_current", "doc_type": "gene"}
                ],
            },
            "dev": { ... }
        }
    }
    """
    super().__init__(*args, **kwargs)
    self._srcbuild = get_src_build()
    self._config = {}
    self.logger, self.logfile = get_logger('indexmanager')
def load_build(self, target_name=None):
    '''Load build info from the src_build collection.'''
    target_name = target_name or self.target_name
    src_build = get_src_build()
    self.build_doc = src_build.find_one({'_id': target_name})
    assert self.build_doc, \
        "Can't find build document associated to '%s'" % target_name
    _cfg = self.build_doc.get("build_config")
    if _cfg:
        self.build_config = _cfg
        #if "doc_type" not in _cfg:
        #    raise ValueError("Missing 'doc_type' in build config")
        self.doc_type = _cfg.get("doc_type")
        self.num_shards = _cfg.get("num_shards", 10)  # optional
        self.num_shards = int(self.num_shards) if self.num_shards else self.num_shards
        self.num_replicas = _cfg.get("num_replicas", 0)  # optional
        self.num_replicas = int(self.num_replicas) if self.num_replicas else self.num_replicas
        self.conf_name = _cfg["name"]
    else:
        raise ValueError("Cannot find build config associated to '%s'" % target_name)
    return _cfg
def cleanup(self, env=None, keep=3, dryrun=True, **filters):
    """
    Delete old indices except for the most recent ones.

    Examples:
        >>> index_cleanup()
        >>> index_cleanup("production")
        >>> index_cleanup("local", build_config="demo")
        >>> index_cleanup("local", keep=0)
        >>> index_cleanup(_id="<elasticsearch_index>")
    """
    if not env and not dryrun:  # low specificity, unsafe
        raise ValueError('Missing argument "env".')

    cleaner = Cleaner(get_src_build(), self, self.logger)
    cleanups = cleaner.find(env, keep, **filters)

    if dryrun:
        return '\n'.join((
            "-" * 75,
            cleaner.plain_text(cleanups),
            "-" * 75,
            "DRYRUN ONLY - APPLY THE ACTIONS WITH:",
            " > index_cleanup(..., dryrun=False)"))

    job = asyncio.ensure_future(cleaner.clean(cleanups))
    job.add_done_callback(self.logger.info)
    return job
def clean_stale_status(self):
    registrar.audit(get_src_build(), logging)
def create_backend(db_col_names, name_only=False, follow_ref=False, **kwargs):
    """
    Guess what's inside 'db_col_names' and return the corresponding backend.
    - It could be a string (will first check for an src_build doc to find a
      backend_url field; if nothing there, will look up a mongo collection
      in the target database)
    - or a tuple ("target|src", "col_name")
    - or a ("mongodb://*****:*****@host", "db", "col_name") URI
    - or a ("es_host:port", "index_name", "doc_type") tuple
    If name_only is true, just return the name uniquely identifying the
    collection or index URI connection.
    """
    col = None
    db = None
    is_mongo = True
    if isinstance(db_col_names, str):
        # first check the build doc: if there's a backend_url key, use it instead
        # of directly using db_col_names as the target collection (see LinkDataBuilder)
        bdoc = get_src_build().find_one({"_id": db_col_names})
        if follow_ref and bdoc and bdoc.get("backend_url") \
                and bdoc["backend_url"] != db_col_names:
            return create_backend(bdoc["backend_url"], name_only=name_only,
                                  follow_ref=follow_ref, **kwargs)
        else:
            db = mongo.get_target_db()
            col = db[db_col_names]
            # normalize params
            db_col_names = [
                "%s:%s" % (db.client.HOST, db.client.PORT), db.name, col.name
            ]
    elif db_col_names[0].startswith("mongodb://"):
        assert len(db_col_names) == 3, \
            "Missing connection information for %s" % repr(db_col_names)
        conn = mongo.MongoClient(db_col_names[0])
        db = conn[db_col_names[1]]
        col = db[db_col_names[2]]
        # normalize params
        db_col_names = [
            "%s:%s" % (db.client.HOST, db.client.PORT), db.name, col.name
        ]
    elif len(db_col_names) == 3 and ":" in db_col_names[0]:
        is_mongo = False
        idxr = ESIndexer(index=db_col_names[1], doc_type=db_col_names[2],
                         es_host=db_col_names[0], **kwargs)
        db = idxr
        col = db_col_names[1]
    else:
        assert len(db_col_names) == 2, \
            "Missing connection information for %s" % repr(db_col_names)
        db = mongo.get_target_db() if db_col_names[0] == "target" else mongo.get_src_db()
        col = db[db_col_names[1]]
        # normalize params (0: host, 1: port)
        db_col_names = [
            "%s:%s" % (db.client.address[0], db.client.address[1]), db.name, col.name
        ]
    assert col is not None, \
        "Could not create collection object from %s" % repr(db_col_names)
    if name_only:
        if is_mongo:
            return "mongo_%s_%s_%s" % (db_col_names[0].replace(":", "_"),
                                       db_col_names[1], db_col_names[2])
        return "es_%s_%s_%s" % (db_col_names[0].replace(":", "_"),
                                db_col_names[1], db_col_names[2])
    if is_mongo:
        return DocMongoBackend(db, col)
    return DocESBackend(db)
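# Usage sketches for create_backend covering each accepted form (hosts,
# collection and index names below are hypothetical):
#
#   create_backend("mygene_202001")                    # target collection by name
#   create_backend(("src", "entrez_gene"))             # src database collection
#   create_backend(("mongodb://user:pass@host", "db", "col"))  # full MongoDB URI
#   create_backend(("localhost:9200", "mygene_current", "gene"))  # ES index
#
#   # name_only=True returns a unique identifier instead of a backend object:
#   create_backend("mygene_202001", name_only=True)
#   # -> "mongo_<host>_<port>_<db>_mygene_202001"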
def export_ids(col_name):
    """
    Export all _ids from the collection named col_name.
    If col_name refers to a build where a cold_collection is defined, also
    extract the cold _ids and sort/uniq them with the hot ones to obtain the
    full list of _ids of the actual merged (cold+hot) collection.
    The output file is stored in DATA_EXPORT_FOLDER/ids, defaulting to
    <DATA_ARCHIVE_ROOT>/export/ids. The output filename is returned at the
    end, if successful.
    """
    # prepare output directory
    DATA_EXPORT_FOLDER = getattr(btconfig, "DATA_EXPORT_FOLDER", None)
    if not DATA_EXPORT_FOLDER:
        DATA_EXPORT_FOLDER = os.path.join(btconfig.DATA_ARCHIVE_ROOT, "export")
    ids_export_folder = os.path.join(DATA_EXPORT_FOLDER, "ids")
    if not os.path.exists(ids_export_folder):
        logging.debug("Creating export/ids folder: %s" % ids_export_folder)
        os.makedirs(ids_export_folder)
    build = get_src_build().find_one({"_id": col_name})
    cold = None
    if build:
        col = get_target_db()[col_name]
        if build.get("build_config", {}).get("cold_collection"):
            cold_name = build["build_config"]["cold_collection"]
            cold = get_target_db()[cold_name]
            logging.info("Found a cold collection '%s' associated to '%s'"
                         % (cold_name, col_name))
    else:
        # it's a src collection
        col = get_src_db()[col_name]
    # first iterate over all _ids. This will refresh the underlying _id cache if
    # it's not valid anymore, so we're sure to work with the latest data.
    # If the cache is valid, this will be pretty fast
    logging.info("Screening _ids in collection '%s'" % col.name)
    for _id in id_feeder(col, validate_only=True):
        pass
    # now accessing cache
    col_ids_cache = get_cache_filename(col.name)
    assert os.path.exists(col_ids_cache)
    logging.info("Now using cache file %s" % col_ids_cache)
    if cold:
        logging.info("Screening _ids in cold collection '%s'" % cold.name)
        for _id in id_feeder(cold, validate_only=True):
            pass
        # now accessing cache
        cold_ids_cache = get_cache_filename(cold.name)
        assert os.path.exists(cold_ids_cache)
        logging.info("Now using cache file %s" % cold_ids_cache)
    outfn = os.path.join(ids_export_folder, "%s_ids.xz" % col_name)
    # NOTE: can't use anyfile to open the cache files and send _ids through
    # pipes, because it would load the _ids in memory (unless using hacks), so
    # use cat (and existing uncompressing variants, like gzcat/xzcat/...) to
    # fully run the pipe on the shell
    if cold:
        fout = anyfile(outfn, "wb")
        colext = os.path.splitext(col_ids_cache)[1]
        coldext = os.path.splitext(cold_ids_cache)[1]
        assert colext == coldext, \
            "Hot and cold _id caches are compressed differently " \
            "(%s and %s), they should be the same" % (colext, coldext)
        comp = colext.replace(".", "")
        supportedcomps = ["xz", "gz", ""]  # no compression allowed as well
        assert comp in supportedcomps, \
            "Compression '%s' isn't supported (%s)" % (comp, supportedcomps)
        # IDs sent to the pipe's input (sort), then compressed (xz)
        pcat = subprocess.Popen(["%scat" % comp, col_ids_cache, cold_ids_cache],
                                stdout=subprocess.PIPE)
        psort = subprocess.Popen(["sort", "-u"], stdin=pcat.stdout,
                                 stdout=subprocess.PIPE, universal_newlines=True)
        pcat.stdout.close()  # will raise end-of-pipe error when finished
        if comp:
            pcomp = subprocess.Popen(["xz", "-c"], stdin=psort.stdout, stdout=fout)
        else:
            # just copy stdin to stdout
            pcomp = subprocess.Popen(["tee"], stdin=psort.stdout, stdout=fout)
        psort.stdout.close()
        try:
            logging.info("Running pipe to compute list of unique _ids")
            (out, err) = pcomp.communicate()  # run the pipe! (blocking)
            if err:
                raise Exception(err)
        except Exception as e:
            logging.error("Error while running pipe to export _ids: %s" % e)
            # make sure to clean empty or half-processed files
            try:
                os.unlink(outfn)
            except OSError:
                pass
            raise
    else:
        logging.info("Copying cache _id file")
        try:
            shutil.copyfile(col_ids_cache, outfn)
        except Exception as e:
            logging.error("Error while exporting _ids: %s" % e)
            # make sure to clean empty or half-processed files
            try:
                os.unlink(outfn)
            except OSError:
                pass
            raise
    logging.info("Done exporting _ids to '%s'" % outfn)
    return outfn
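# Example (hypothetical build name): export all _ids of a merged collection,
# including its cold counterpart when one is configured in the build doc:
#
#   outfn = export_ids("mygene_202001")
#   # -> "<DATA_EXPORT_FOLDER>/ids/mygene_202001_ids.xz"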
def pending_snapshot(build_name):
    src_build = get_src_build()
    src_build.update({"_id": build_name},
                     {"$addToSet": {"pending": "snapshot"}})
def register_status(self, status, transient=False, init=False, **extra):
    src_build = get_src_build()
    job_info = {
        'status': status,
        'step_started_at': datetime.now().astimezone(),
        'logfile': self.logfile,
    }
    # to select the correct diff sub-record (one collection can be diffed with
    # multiple others)
    diff_key = "%s" % self.old.target_name
    # once in the diff record, select the correct sync sub-record (one diff can
    # be applied to different backends).
    # Replace dots since a hostname can contain dots, which mongo would interpret
    # as dotted fields. Also remove the doc_type (which can sometimes be None if
    # the hub deals with multiple APIs, and is not useful in distinguishing where
    # the diff was applied since only one doc type is allowed as of ES6; the last
    # element in self.target_backend is the doc_type)
    sync_key = "-".join(self.target_backend[:-1]).replace(".", "-")
    sync_info = {sync_key: {}}
    if transient:
        # record some "in-progress" information
        job_info['pid'] = os.getpid()
    else:
        # only register the time when it's a final state
        job_info["time"] = timesofar(self.ti)
        t1 = round(time.time() - self.ti, 0)
        job_info["time_in_s"] = t1
        sync_info[sync_key]["created_at"] = datetime.now().astimezone()
    if "sync" in extra:
        sync_info[sync_key].update(extra["sync"])
    if "job" in extra:
        job_info.update(extra["job"])
    # since the base is the merged collection, we register info there, on the
    # new collection (diff results are associated to the most recent collection)
    build = src_build.find_one({'_id': self.new.target_name})
    if not build:
        self.logger.info("Can't find build document '%s', no status to register"
                         % self.new.target_name)
        return
    assert "diff" in build and diff_key in build["diff"], \
        "Missing previous diff information in build document"
    if init:
        # init timer for this step
        self.ti = time.time()
        src_build.update({'_id': self.new.target_name},
                         {"$push": {'jobs': job_info}})
        # now refresh/sync
        build = src_build.find_one({'_id': self.new.target_name})
    else:
        # merge extra at root level (to keep building data...) and update the
        # last job (it's been properly created before, when init=True)
        build["jobs"] and build["jobs"][-1].update(job_info)

        def merge_info(target, d):
            if "__REPLACE__" in d.keys():
                d.pop("__REPLACE__")
                target = d
            else:
                for k, v in d.items():
                    if isinstance(v, dict):
                        if k in target:
                            target[k] = merge_info(target[k], v)
                        else:
                            v.pop("__REPLACE__", None)
                            # merge v with "nothing" just to make sure to
                            # remove any "__REPLACE__"
                            v = merge_info({}, v)
                            target[k] = v
                    else:
                        target[k] = v
            return target

        sync_info = {
            "sync": merge_info(build["diff"][diff_key].get("sync", {}), sync_info)
        }
        build["diff"][diff_key].update(sync_info)
        #src_build.update({'_id': build["_id"]}, {"$set": index_info})
        src_build.replace_one({"_id": build["_id"]}, build)
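# For illustration, how sync_key is derived above (values hypothetical): with
# self.target_backend == ("es.example.org:9200", "mygene_current", "gene"),
# the trailing doc_type is dropped and dots are replaced so mongo won't treat
# the key as a dotted field:
#
#   sync_key = "-".join(("es.example.org:9200", "mygene_current")).replace(".", "-")
#   # -> "es-example-org:9200-mygene_current"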
def get_build_doc(index_name):
    src_build = get_src_build()
    doc = src_build.find_one({"index." + index_name: {"$exists": True}})
    if not doc:
        logging.error("No build associated with index %s.", index_name)
    return doc
def set_pending_to_publish(col_name):
    src_build = get_src_build()
    src_build.update({"_id": col_name},
                     {"$addToSet": {"pending": "publish"}})
def clean_stale_status(self):
    IndexJobStateRegistrar.prune(get_src_build())
def __init__(self, indexer):
    self.indexer = indexer
    self.state = self.state(
        get_src_build(),
        indexer.build_name,
        indexer.es_index_name,
        logfile=indexer.logfile)
def poll(self, state, func):
    super().poll(state, func, col=get_src_build())
def collection(self):
    return get_src_build()
def register_status(self, status, transient=False, init=False, **extra):
    assert self.build_doc
    src_build = get_src_build()
    job_info = {
        'status': status,
        'step_started_at': datetime.now(),
        'logfile': self.logfile,
    }
    index_info = {
        "index": {
            self.index_name: {
                'host': self.host,
                'environment': self.env,
                'conf_name': self.conf_name,
                'target_name': self.target_name,
                'index_name': self.index_name,
                'doc_type': self.doc_type,
                'num_shards': self.num_shards,
                'num_replicas': self.num_replicas
            }
        }
    }
    if transient:
        # record some "in-progress" information
        job_info['pid'] = os.getpid()
    else:
        # only register the time when it's a final state
        job_info["time"] = timesofar(self.ti)
        t1 = round(time.time() - self.ti, 0)
        job_info["time_in_s"] = t1
        index_info["index"][self.index_name]["created_at"] = datetime.now()
    if "index" in extra:
        index_info["index"][self.index_name].update(extra["index"])
    if "job" in extra:
        job_info.update(extra["job"])
    # since the base is the merged collection, we register info there
    build = src_build.find_one({'_id': self.target_name})
    assert build, "Can't find build document '%s'" % self.target_name
    if init:
        # init timer for this step
        self.ti = time.time()
        src_build.update({'_id': self.target_name},
                         {"$push": {'jobs': job_info}})
        # now refresh/sync
        build = src_build.find_one({'_id': self.target_name})
    else:
        # merge extra at root level (to keep building data...) and update the
        # last job (it's been properly created before, when init=True)
        build["jobs"] and build["jobs"][-1].update(job_info)

        def merge_index_info(target, d):
            if "__REPLACE__" in d.keys():
                d.pop("__REPLACE__")
                target = d
            else:
                for k, v in d.items():
                    if isinstance(v, dict):
                        if k in target:
                            target[k] = merge_index_info(target[k], v)
                        else:
                            v.pop("__REPLACE__", None)
                            # merge v with "nothing" just to make sure to
                            # remove any "__REPLACE__"
                            v = merge_index_info({}, v)
                            target[k] = v
                    else:
                        target[k] = v
            return target

        build = merge_index_info(build, index_info)
        src_build.replace_one({"_id": build["_id"]}, build)
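# A minimal sketch of the "__REPLACE__" convention handled by merge_index_info
# above: a sub-dict carrying a "__REPLACE__" key overwrites the existing value
# wholesale instead of being merged into it (values below are hypothetical):
#
#   target = {"index": {"idx1": {"a": 1, "b": 2}}}
#   patch = {"index": {"idx1": {"__REPLACE__": True, "a": 9}}}
#   # merge_index_info(target, patch) -> {"index": {"idx1": {"a": 9}}}
#   # without "__REPLACE__", the result would be {"index": {"idx1": {"a": 9, "b": 2}}}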
def _doc(self, index):
    doc = get_src_build().find_one(
        {f"index.{index}.environment": self.idxenv})
    if not doc:  # not associated with a build
        raise ValueError("Not a hub-managed index.")
    return doc  # TODO UNIQUENESS
def set_pending_to_release_note(col_name):
    src_build = get_src_build()
    src_build.update({"_id": col_name},
                     {"$addToSet": {"pending": "release_note"}})