def params_unprepare_from_saved(fields, copy_to_legacy=False):
    """
    Restore the original (unescaped) section and param names of the
    "hyperparams" and "configuration" entries of ``fields``, in place.

    :param fields: task fields; each of the two entries is rewritten only
        when it is present and non-empty
    :param copy_to_legacy: when set, the hyperparams and configuration data
        is additionally written to the legacy locations
        ("execution/parameters" and "execution/model_desc") for old clients
    """

    def unescape_names(content):
        # Only dict values hold a second level of escaped names;
        # any other value is passed through unchanged
        if not isinstance(content, dict):
            return content
        return {ParameterKeyEscaper.unescape(k): v for k, v in content.items()}

    for field_name in ("hyperparams", "configuration"):
        stored = safe_get(fields, field_name)
        if not stored:
            continue
        dpath.set(
            fields,
            field_name,
            {
                ParameterKeyEscaper.unescape(outer): unescape_names(content)
                for outer, content in stored.items()
            },
        )

    if not copy_to_legacy:
        return

    legacy_targets = (
        ("hyperparams", "execution/parameters", True),
        ("configuration", "execution/model_desc", False),
    )
    for source_field, legacy_path, sectioned in legacy_targets:
        flat_params = _get_legacy_params(
            safe_get(fields, source_field), with_sections=sectioned
        )
        if not flat_params:
            continue
        legacy_values = {_get_full_param_name(p): p["value"] for p in flat_params}
        dpath.new(fields, legacy_path, legacy_values)
def params_prepare_for_save(fields: dict, previous_task: Task = None):
    """
    Bring the hyper params and configuration found in ``fields`` to the form
    that gets saved. ``fields`` is modified in place.

    If legacy hyper params ("execution/parameters") or configuration
    ("execution/model_desc") are passed, they are converted into entries of
    the new structure ("hyperparams" / "configuration") and the legacy entry
    is deleted from ``fields``.
    Finally all section and param names of the new structure are escaped
    to make them mongo safe.

    :param fields: the task fields about to be saved
    :param previous_task: optional task whose stored data is used as the base
        when ``fields`` carries legacy params but no data in the new structure
    """
    conversions = (
        ("execution/parameters", "hyperparams", hyperparams_default_section),
        ("execution/model_desc", "configuration", None),
    )
    for legacy_path, target_field, fallback_section in conversions:
        legacy_values = safe_get(fields, legacy_path)
        if legacy_values is None:
            continue

        if (
            not safe_get(fields, target_field)
            and previous_task
            and previous_task[target_field]
        ):
            stored = previous_task.to_proper_dict().get(target_field)
            removed = _remove_legacy_params(
                stored, with_sections=fallback_section is not None
            )
            if not (legacy_values or removed):
                # Nothing to add, and no legacy fields were removed from
                # the stored data, so there is nothing to do for this field
                continue

            restored = {target_field: stored}
            params_unprepare_from_saved(restored)
            fields.update(restored)

        for full_name, raw_value in legacy_values.items():
            section, name = split_param_name(full_name, fallback_section)
            # Empty parts (e.g. a missing section) are dropped from the path
            target_path = [part for part in (target_field, section, name) if part]
            entry = {
                "name": name,
                "type": hyperparams_legacy_type,
                "value": str(raw_value),
            }
            if section is not None:
                entry["section"] = section
            dpath.new(fields, target_path, entry)
        dpath.delete(fields, legacy_path)

    def escape_names(content):
        # Only dict values hold a second level of names to escape
        if not isinstance(content, dict):
            return content
        return {ParameterKeyEscaper.escape(k): v for k, v in content.items()}

    for field_name in ("hyperparams", "configuration"):
        current = safe_get(fields, field_name)
        if not current:
            continue
        dpath.set(
            fields,
            field_name,
            {
                ParameterKeyEscaper.escape(outer): escape_names(content)
                for outer, content in current.items()
            },
        )
def _get_events_from_es_res(self, es_res: dict) -> Tuple[list, int, Optional[str]]:
    """
    Unpack the response of a scrolled query.

    Returns a tuple of (events, total events count, next scroll id).
    When the response carries a scroll id but no events, the scroll is
    cleared and ``self.empty_scroll`` is returned as the scroll id.
    """
    total = safe_get(es_res, "hits/total/value", default=0)
    events = [hit["_source"] for hit in safe_get(es_res, "hits/hits", default=[])]
    scroll_id = es_res.get("_scroll_id")

    if events or not scroll_id:
        return events, total, scroll_id

    # The scroll returned nothing more: release it on the ES side
    self.es.clear_scroll(scroll_id=scroll_id)
    return events, total, self.empty_scroll
def _get_metric_fields(metrics: Sequence[dict]) -> dict:
    """
    Translate metric buckets into report fields.

    Buckets whose "key" is one of cpu_usage / memory_used / memory_free are
    reported under the matching field name together with their min, max and
    avg values; all other buckets are ignored.
    """
    field_by_metric = {
        "cpu_usage": "cpu_usage",
        "memory_used": "mem_used_gb",
        "memory_free": "mem_free_gb",
    }
    fields = {}
    for bucket in metrics:
        metric_name = bucket["key"]
        if metric_name not in field_by_metric:
            continue
        fields[field_by_metric[metric_name]] = {
            "min": safe_get(bucket, "min/value"),
            "max": safe_get(bucket, "max/value"),
            "avg": safe_get(bucket, "avg/value"),
        }
    return fields
def migrate_backend(db: Database):
    """
    Copy the legacy task parameters into the new hyperparams/configuration
    structure for every document of the "task" collection.

    Each entry of "execution.parameters" / "execution.model_desc" is turned
    into a parameter dict and written under "hyperparams" / "configuration"
    with one ``$set`` update per document.
    """
    migrations = (
        ("execution.parameters", "hyperparams", hyperparams_default_section),
        ("execution.model_desc", "configuration", None),
    )
    # Fetch only the legacy and the new fields of each task
    projection = tuple(
        field
        for legacy_field, target_field, _ in migrations
        for field in (legacy_field, target_field)
    )

    tasks: Collection = db["task"]
    for doc in tasks.find(projection=projection):
        updates = {}
        for legacy_field, target_field, fallback_section in migrations:
            legacy = safe_get(doc, legacy_field, separator=".")
            if not legacy:
                continue
            for full_name, value in legacy.items():
                section, name = split_param_name(full_name, fallback_section)
                # Empty parts (e.g. a missing section) are dropped from the path
                target_key = ".".join(
                    part for part in (target_field, section, name) if part
                )
                entry = {
                    "name": name,
                    "type": hyperparams_legacy_type,
                    "value": str(value),
                }
                if section is not None:
                    entry["section"] = section
                # Written unconditionally: a value already stored under
                # this path is replaced by the $set below
                updates[target_key] = entry

        if updates:
            tasks.update_one({"_id": doc["_id"]}, {"$set": updates})
def _get_cardinality_fields(categories: Sequence[dict]) -> dict:
    """
    Translate category buckets into report fields.

    Only the "cpu" category is recognised: its "count/value" is reported
    as "num_cores". All other categories are ignored.
    """
    field_by_category = {"cpu": "num_cores"}
    fields = {}
    for bucket in categories:
        category = bucket["key"]
        if category in field_by_category:
            fields[field_by_category[category]] = safe_get(bucket, "count/value")
    return fields
def _get_task_metrics(self, task_id, es_index, event_type: EventType) -> Sequence:
    """
    Return the metric names found in ``es_index`` for the events of the
    given task and event type.

    The names are the bucket keys of a terms aggregation on the "metric"
    field, limited to ``self.MAX_METRICS_COUNT`` buckets.
    """
    task_event_filters = [
        {"term": {"task": task_id}},
        {"term": {"type": event_type.value}},
    ]
    metrics_agg = {"terms": {"field": "metric", "size": self.MAX_METRICS_COUNT}}
    es_req = {
        "size": 0,  # only the aggregation is needed, not the documents
        "query": {"bool": {"must": task_event_filters}},
        "aggs": {"metrics": metrics_agg},
    }

    with translate_errors_context(), TimingContext("es", "_get_task_metrics"):
        es_res = self.es.search(index=es_index, body=es_req)

    buckets = safe_get(es_res, "aggregations/metrics/buckets", default=[])
    return [bucket["key"] for bucket in buckets]
def _upgrade_task_data(task_data: dict):
    """
    Move the legacy parameters of ``task_data`` into the new structure,
    in place.

    Entries of "execution/parameters" / "execution/model_desc" are added
    under "hyperparams" / "configuration" unless a non-empty value already
    exists at the target path. A non-empty legacy entry is deleted once
    it has been processed.
    """
    upgrades = (
        ("execution/parameters", "hyperparams", hyperparams_default_section),
        ("execution/model_desc", "configuration", None),
    )
    for legacy_path, target_field, fallback_section in upgrades:
        legacy = safe_get(task_data, legacy_path)
        if not legacy:
            continue

        for full_name, value in legacy.items():
            section, name = split_param_name(full_name, fallback_section)
            # Empty parts (e.g. a missing section) are dropped from the path
            target_path = [part for part in (target_field, section, name) if part]
            if safe_get(task_data, target_path):
                # Data in the new structure takes precedence over legacy data
                continue
            entry = {
                "name": name,
                "type": hyperparams_legacy_type,
                "value": str(value),
            }
            if section is not None:
                entry["section"] = section
            dpath.new(task_data, target_path, entry)

        dpath.delete(task_data, legacy_path)
def _build_metric_interval(metric: str, variant: str, data: dict, samples: int) -> Tuple[str, str, int, int]:
    """
    Calculate the index interval for a metric variant so that the total
    amount of intervals does not exceed ``samples``.

    Returns a tuple of (metric, variant, interval, amount of intervals).
    """
    count = safe_get(data, "count/value", default=0)
    if count < samples:
        # Fewer points than requested samples: one interval per point
        return metric, variant, 1, count

    first_index = safe_get(data, "min_index/value", default=0)
    last_index = safe_get(data, "max_index/value", default=first_index)
    index_span = int(last_index - first_index + 1)
    interval = max(1, index_span // samples)
    return metric, variant, interval, samples
def update_queue_entries(*entries):
    """
    Fill name, number of tasks and next task of the passed queue entries
    from ``queues_info`` / ``tasks_info``.

    Empty entries and entries without info in ``queues_info`` are skipped.
    """
    for entry in filter(None, entries):
        info = queues_info.get(entry.id, None)
        if not info:
            continue

        entry.name = info.get("name", None)
        entry.num_tasks = info.get("num_entries", 0)

        next_task_id = safe_get(info, "next_entry/task")
        if not next_task_id:
            continue
        next_task = tasks_info.get(next_task_id, None)
        # The task may be missing from tasks_info; its name is then None
        entry.next_task = IdNameEntry(
            id=next_task_id, name=next_task.name if next_task else None
        )
def _get_active_workers(
    cls, company_id, from_timestamp: int, to_timestamp: int
) -> dict:
    """
    Run a worker stats query over the given time range and return, per
    worker bucket key, a dict holding its "last_activity_time" (the max
    "timestamp" aggregated for that worker).
    """
    last_activity_agg = {"last_activity_time": {"max": {"field": "timestamp"}}}
    workers_agg = {"terms": {"field": "worker"}, "aggs": last_activity_agg}
    es_req = {
        "size": 0,  # only the aggregation is needed, not the documents
        "query": QueryBuilder.dates_range(from_timestamp, to_timestamp),
        "aggs": {"workers": workers_agg},
    }
    res = cls._run_worker_stats_query(company_id, es_req)

    active_workers = {}
    for bucket in safe_get(res, "aggregations/workers/buckets", default=[]):
        worker = bucket["key"]
        active_workers[worker] = {
            "last_activity_time": bucket["last_activity_time"]["value"]
        }
    return active_workers
def vep_api_request(self):
    """Request ``self.server + self.ext_url`` and return ``(data, status)``.

    On a successful response the result is the item that ``safe_get``
    fetches at position 0 of the decoded JSON, paired with 200.
    When the error content says the alternative sequence "matches reference"
    the result is ``(None, 496)``.
    Any other error is printed and raised as IOError.
    NOTE(review): the printed message announces a retry, but no retry
    happens inside this method; presumably the caller or a decorator
    handles it - confirm.
    """
    response = requests.get(
        self.server + self.ext_url,
        headers={"content-type": "application/json"},
    )
    if response.ok:
        return safe_get(response.json(), 0), 200

    if "matches reference" in str(response.content, "utf-8"):
        # The given alternative sequence matches the GRCh reference sequence
        return None, 496

    # Some other sort of error occurred
    print(
        f"VEP ERROR '{response.status_code}: {response.reason}' occured for {self.variant}. Retrying..."
    )
    raise IOError("There has been an issue with a variant.")
def get_all_with_projection(
    self, company_id: str, last_seen: int
) -> Sequence[WorkerResponseEntry]:
    """
    Return the workers found by ``self.get_all`` with their task and queue
    entries filled in.

    For every worker the running time and last iteration of its task are
    set, and its queue entries get the queue name, number of entries and
    next task.
    """
    helpers = [
        WorkerConversionHelper.from_worker_entry(worker_entry)
        for worker_entry in self.get_all(company_id=company_id, last_seen=last_seen)
    ]

    task_ids = {helper.task_id for helper in helpers if helper.task_id}
    queue_ids = set()
    for helper in helpers:
        queue_ids.update(helper.queue_ids)

    queues_info = {}
    if queue_ids:
        pipeline = [
            {"$match": {"_id": {"$in": list(queue_ids)}}},
            {
                "$project": {
                    "name": 1,
                    "next_entry": {"$arrayElemAt": ["$entries", 0]},
                    "num_entries": {"$size": "$entries"},
                }
            },
        ]
        queues_info = {res["_id"]: res for res in Queue.objects.aggregate(pipeline)}
        # The next task of every queue has to be fetched as well
        for info in queues_info.values():
            next_task_id = safe_get(info, "next_entry/task")
            if next_task_id:
                task_ids.add(next_task_id)

    tasks_info = {}
    if task_ids:
        found_tasks = Task.objects(id__in=task_ids).only(
            "name", "started", "last_iteration"
        )
        tasks_info = {task.id: task for task in found_tasks}

    def fill_queue_entries(*entries):
        # Copy name, size and next task from queues_info onto each entry
        for entry in filter(None, entries):
            info = queues_info.get(entry.id, None)
            if not info:
                continue
            entry.name = info.get("name", None)
            entry.num_tasks = info.get("num_entries", 0)
            next_task_id = safe_get(info, "next_entry/task")
            if not next_task_id:
                continue
            next_task = tasks_info.get(next_task_id, None)
            entry.next_task = IdNameEntry(
                id=next_task_id, name=next_task.name if next_task else None
            )

    for helper in helpers:
        worker = helper.worker
        if helper.task_id:
            task = tasks_info.get(helper.task_id, None)
            if task:
                if task.started:
                    elapsed = datetime.utcnow() - task.started
                    # running time is reported in milliseconds
                    worker.task.running_time = int(elapsed.total_seconds() * 1000)
                else:
                    worker.task.running_time = 0
                worker.task.last_iteration = task.last_iteration

        fill_queue_entries(worker.queue)
        if worker.queues:
            fill_queue_entries(*worker.queues)

    return [helper.worker for helper in helpers]
def assign_results(self, transcript_id=None, clear_params=False): """Filter and format the annotation results held in self.response_decoded and store them on attributes of this object. :param transcript_id: value compared against the "transcript_id" of the entries in self.response_decoded["transcript_consequences"]; the first matching entry is the one whose values are used. When not given, self.transcript is used instead. :param clear_params: when true, self.clear_params() is called before anything else. NOTE(review): when no entry matches, selected_transcript_consequences stays None and any later .get() call on it would raise AttributeError - confirm that callers guarantee a match. """ if clear_params: self.clear_params() if not transcript_id: transcript_id = self.transcript selected_transcript_consequences = None for result in self.response_decoded["transcript_consequences"]: if result.get("transcript_id") == transcript_id: selected_transcript_consequences = result break if self.response_decoded.get("vcf_string") is not None: self.vcf_string = re.sub(r"-", ":", self.response_decoded.get("vcf_string")) # extracts reference sequence from variant id in vcf format self.ref_seq_vep = safe_get(self.vcf_string.split(":"), 2) self.alt_seq_vep = safe_get(self.vcf_string.split(":"), 3) if self.ref_seq is None: self.ref_seq = self.response_decoded.get("vcf_string").split( "-")[2] self.alt_seq = self.response_decoded.get("vcf_string").split( "-")[3] self.id = self.response_decoded.get("id") if self.variant_format == "vcf": self.id = re.sub(r"[^a-zA-Z^0-9-]", ":", self.id) # checks if reference sequences match and returns result dictionary and status code accordingly if any([ x is None for x in [self.ref_seq, self.ref_seq_vep, self.alt_seq, self.alt_seq_vep] ]): pass else: if not get_seq_difference(self.ref_seq, self.alt_seq) == get_seq_difference( self.ref_seq_vep, self.alt_seq_vep): self.status_code = 201 else: # gets variant gnomad exome frequency and MAF self.gnomad_frequency, self.maf = self.get_frequencies( self.response_decoded.get("colocated_variants")) # This list of parameters is assigned to corresponding class attributes. 
key_list = [ "gene_symbol", "amino_acids", "gene_id", "cadd_phred", "sift_converted_rankscore", "mutationtaster_converted_rankscore", "mutationassessor_rankscore", "ada_score", "rf_score", "maxentscan_ref", "maxentscan_alt" ] for key in key_list: if selected_transcript_consequences.get(key) is not None: value = selected_transcript_consequences.get(key) if isinstance(value, float): self.__dict__[key] = round( selected_transcript_consequences.get(key), 2) else: self.__dict__[ key] = selected_transcript_consequences.get( key) if selected_transcript_consequences.get("impact") is not None: self.impact = selected_transcript_consequences.get( "impact").lower() if selected_transcript_consequences.get( "gerp++_rs_rankscore") is not None: self.gerp_rs_rankscore = round( selected_transcript_consequences.get( "gerp++_rs_rankscore"), 2) # if selected_transcript_consequences.get("consequence_terms") is not None: # self.consequence = safe_get(selected_transcript_consequences.get("consequence_terms"), 0) if selected_transcript_consequences.get("hgvsc") is not None: self.hgvsc_transcript, self.hgvsc_change = selected_transcript_consequences.get( "hgvsc").split(":") if selected_transcript_consequences.get( "consequence_terms") is not None: self.consequence = re.sub( r"_", " ", safe_get( selected_transcript_consequences.get( "consequence_terms"), 0)) if selected_transcript_consequences.get("hgvsp") is not None: self.protein, self.hgvsp_change = selected_transcript_consequences.get( "hgvsp").split(":") if selected_transcript_consequences.get( "polyphen_prediction") is not None: self.polyphen_prediction = re.sub( r"_", " ", selected_transcript_consequences.get( "polyphen_prediction")) # cutoffs correspond to Leipzig guidelines (Alamut) affected_splice_score = [] if self.ada_score is not None: if self.ada_score >= 0.6: self.ada_consequence = "splicing affected" else: self.ada_consequence = "splicing not affected" affected_splice_score.append(self.ada_consequence) if self.rf_score is not 
None: if self.rf_score >= 0.6: self.rf_consequence = "splicing affected" else: self.rf_consequence = "splicing not affected" affected_splice_score.append(self.rf_consequence) if self.maxentscan_ref is not None and self.maxentscan_alt is not None: self.maxentscan_decrease = ( self.maxentscan_alt - self.maxentscan_ref) / self.maxentscan_ref if self.maxentscan_decrease <= -0.15: self.maxentscan_consequence = "splicing affected" else: self.maxentscan_consequence = "splicing not affected" affected_splice_score.append(self.maxentscan_consequence) if (not self.impact in ["moderate", "high"]) and \ (any("splice" in _consequence_term or "synonymous" in _consequence_term for _consequence_term in selected_transcript_consequences.get("consequence_terms"))): if len(affected_splice_score) >= 2: if affected_splice_score.count( "splicing affected") >= 2: self.impact, self.explanation_dict["impact_splice_site"] = "high", \ f"VEP impact low, but {affected_splice_score.count('splicing affected')} of " \ f"{len(affected_splice_score)} splice site predictions are \"pathogenic\"" else: if self.cadd_phred: if self.cadd_phred >= 20: self.impact, self.explanation_dict["impact_splice_site"] = "high", \ f"VEP impact low, but CADD" \ f" phred = {self.cadd_phred}"
def _get_resource_stats_per_agent(cls, company_id: str, key: str) -> dict:
    """
    Collect resource statistics per worker for the last report interval.

    The interval length is taken from the
    "apiserver.statistics.report_interval_hours" setting (24 if not set).
    Returns a dict of worker bucket key -> {key: stats}, where stats holds
    "interval_sec", the cardinality fields ("num_cores") and the
    min/max/avg of the recognised metrics.
    """
    report_hours = config.get("apiserver.statistics.report_interval_hours", 24)
    agent_resource_threshold_sec = timedelta(hours=report_hours).total_seconds()
    to_timestamp = int(time.time())
    from_timestamp = to_timestamp - int(agent_resource_threshold_sec)

    categories_agg = {
        "terms": {"field": "category"},
        "aggs": {"count": {"cardinality": {"field": "variant"}}},
    }
    metrics_agg = {
        "terms": {"field": "metric"},
        "aggs": {
            "min": {"min": {"field": "value"}},
            "max": {"max": {"field": "value"}},
            "avg": {"avg": {"field": "value"}},
        },
    }
    es_req = {
        "size": 0,  # only the aggregations are needed, not the documents
        "query": QueryBuilder.dates_range(from_timestamp, to_timestamp),
        "aggs": {
            "workers": {
                "terms": {"field": "worker"},
                "aggs": {"categories": categories_agg, "metrics": metrics_agg},
            }
        },
    }
    res = cls._run_worker_stats_query(company_id, es_req)

    def cardinality_fields(categories: Sequence[dict]) -> dict:
        # Only the "cpu" category is reported, as "num_cores"
        field_by_category = {"cpu": "num_cores"}
        fields = {}
        for bucket in categories:
            category = bucket["key"]
            if category in field_by_category:
                fields[field_by_category[category]] = safe_get(bucket, "count/value")
        return fields

    def metric_fields(metrics: Sequence[dict]) -> dict:
        # min/max/avg of the recognised metrics, under their report names
        field_by_metric = {
            "cpu_usage": "cpu_usage",
            "memory_used": "mem_used_gb",
            "memory_free": "mem_free_gb",
        }
        fields = {}
        for bucket in metrics:
            metric_name = bucket["key"]
            if metric_name not in field_by_metric:
                continue
            fields[field_by_metric[metric_name]] = {
                "min": safe_get(bucket, "min/value"),
                "max": safe_get(bucket, "max/value"),
                "avg": safe_get(bucket, "avg/value"),
            }
        return fields

    stats_per_agent = {}
    for worker_bucket in safe_get(res, "aggregations/workers/buckets", default=[]):
        worker = worker_bucket["key"]
        stats_per_agent[worker] = {
            key: {
                "interval_sec": agent_resource_threshold_sec,
                **cardinality_fields(safe_get(worker_bucket, "categories/buckets", [])),
                **metric_fields(safe_get(worker_bucket, "metrics/buckets", [])),
            }
        }
    return stats_per_agent