Пример #1
0
def params_unprepare_from_saved(fields, copy_to_legacy=False):
    """
    Restore the original section and parameter names of the "hyperparams"
    and "configuration" entries of ``fields`` (in place).

    Top-level keys are unescaped, and so are the keys of every value that
    is itself a dict; other values are kept untouched.
    When ``copy_to_legacy`` is set, the legacy parameters are additionally
    written to "execution/parameters" and "execution/model_desc" for the
    benefit of old clients.
    """
    for param_field in ("hyperparams", "configuration"):
        stored = safe_get(fields, param_field)
        if not stored:
            continue

        restored = {}
        for raw_key, raw_value in stored.items():
            plain_key = ParameterKeyEscaper.unescape(raw_key)
            if isinstance(raw_value, dict):
                raw_value = {
                    ParameterKeyEscaper.unescape(inner_key): inner_value
                    for inner_key, inner_value in raw_value.items()
                }
            restored[plain_key] = raw_value
        dpath.set(fields, param_field, restored)

    if not copy_to_legacy:
        return

    legacy_targets = (
        ("hyperparams", "execution/parameters", True),
        ("configuration", "execution/model_desc", False),
    )
    for source_field, legacy_path, sectioned in legacy_targets:
        legacy_params = _get_legacy_params(
            safe_get(fields, source_field), with_sections=sectioned
        )
        if not legacy_params:
            continue
        flat_params = {
            _get_full_param_name(param): param["value"] for param in legacy_params
        }
        dpath.new(fields, legacy_path, flat_params)
Пример #2
0
def params_prepare_for_save(fields: dict, previous_task: Task = None):
    """
    Convert legacy hyper params / configuration found in ``fields`` into the
    new structure and escape all section and param names so that they are
    mongo safe. ``fields`` is modified in place.

    When the new-style field is absent from ``fields`` but the previous task
    has it, the previous data (with its legacy params removed) is used as the
    base onto which the passed legacy params are applied.
    """
    legacy_mappings = (
        ("execution/parameters", "hyperparams", hyperparams_default_section),
        ("execution/model_desc", "configuration", None),
    )
    for legacy_path, target_field, fallback_section in legacy_mappings:
        legacy_params = safe_get(fields, legacy_path)
        if legacy_params is None:
            continue

        use_previous = (
            not safe_get(fields, target_field)
            and previous_task
            and previous_task[target_field]
        )
        if use_previous:
            stored = previous_task.to_proper_dict().get(target_field)
            removed = _remove_legacy_params(
                stored, with_sections=fallback_section is not None
            )
            if not (legacy_params or removed):
                # only a deletion of legacy fields was requested and the db
                # holds none of them, so there is nothing to do
                continue

            restored = {target_field: stored}
            params_unprepare_from_saved(restored)
            fields.update(restored)

        for full_name, raw_value in legacy_params.items():
            section, name = split_param_name(full_name, fallback_section)
            target_path = [
                part for part in (target_field, section, name) if part
            ]
            param = {
                "name": name,
                "type": hyperparams_legacy_type,
                "value": str(raw_value),
            }
            if section is not None:
                param["section"] = section
            dpath.new(fields, target_path, param)
        dpath.delete(fields, legacy_path)

    for param_field in ("hyperparams", "configuration"):
        plain = safe_get(fields, param_field)
        if not plain:
            continue

        escaped = {}
        for plain_key, plain_value in plain.items():
            safe_key = ParameterKeyEscaper.escape(plain_key)
            if isinstance(plain_value, dict):
                plain_value = {
                    ParameterKeyEscaper.escape(inner_key): inner_value
                    for inner_key, inner_value in plain_value.items()
                }
            escaped[safe_key] = plain_value
        dpath.set(fields, param_field, escaped)
Пример #3
0
    def _get_events_from_es_res(self, es_res: dict) -> Tuple[list, int, Optional[str]]:
        """
        Extract the events, the reported total and the follow-up scroll id
        from a scrolled query response.

        When the response carries a scroll id but no events, the scroll is
        cleared through ``self.es.clear_scroll`` and ``self.empty_scroll`` is
        returned in place of the scroll id.
        """
        total_events = safe_get(es_res, "hits/total/value", default=0)
        hits = safe_get(es_res, "hits/hits", default=[])
        events = [hit["_source"] for hit in hits]

        scroll_id = es_res.get("_scroll_id")
        if events or not scroll_id:
            return events, total_events, scroll_id

        # the scroll is exhausted: release it
        self.es.clear_scroll(scroll_id=scroll_id)
        return events, total_events, self.empty_scroll
Пример #4
0
 def _get_metric_fields(metrics: Sequence[dict]) -> dict:
     """
     Collect the min/max/avg values of the recognised metric buckets.

     Buckets whose "key" is not one of the known metric names are skipped;
     the others are stored under the output name mapped to their key.
     """
     names = {
         "cpu_usage": "cpu_usage",
         "memory_used": "mem_used_gb",
         "memory_free": "mem_free_gb",
     }
     fields = {}
     for bucket in metrics:
         metric_key = bucket["key"]
         if metric_key not in names:
             continue
         fields[names[metric_key]] = {
             "min": safe_get(bucket, "min/value"),
             "max": safe_get(bucket, "max/value"),
             "avg": safe_get(bucket, "avg/value"),
         }
     return fields
Пример #5
0
def migrate_backend(db: Database):
    """
    Copy the legacy task parameters ("execution.parameters" and
    "execution.model_desc") of every document in the "task" collection into
    the new "hyperparams" / "configuration" structure.

    One ``$set`` update is issued per task that has legacy values.
    """
    hyperparam_fields = ("execution.parameters", "hyperparams")
    configuration_fields = ("execution.model_desc", "configuration")
    field_mappings = (
        (hyperparam_fields, hyperparams_default_section),
        (configuration_fields, None),
    )

    tasks: Collection = db["task"]
    projection = hyperparam_fields + configuration_fields
    for doc in tasks.find(projection=projection):
        updates = {}
        for (old_field, new_field), default_section in field_mappings:
            legacy = safe_get(doc, old_field, separator=".")
            if not legacy:
                continue
            for full_name, raw_value in legacy.items():
                section, name = split_param_name(full_name, default_section)
                # no existence check is made here: the $set below replaces
                # whatever is already stored under the new path
                param = {
                    "name": name,
                    "type": hyperparams_legacy_type,
                    "value": str(raw_value),
                }
                if section is not None:
                    param["section"] = section
                target = ".".join(
                    part for part in (new_field, section, name) if part
                )
                updates[target] = param

        if not updates:
            continue
        tasks.update_one({"_id": doc["_id"]}, {"$set": updates})
Пример #6
0
 def _get_cardinality_fields(categories: Sequence[dict]) -> dict:
     """
     Collect the "count/value" of the recognised category buckets.

     Only the "cpu" category is known; it is reported as "num_cores".
     """
     names = {"cpu": "num_cores"}
     fields = {}
     for bucket in categories:
         category_key = bucket["key"]
         if category_key not in names:
             continue
         fields[names[category_key]] = safe_get(bucket, "count/value")
     return fields
Пример #7
0
    def _get_task_metrics(self, task_id, es_index, event_type: EventType) -> Sequence:
        """
        Return the keys of the "metric" terms aggregation computed over the
        events of the given task and event type in ``es_index``.

        At most ``self.MAX_METRICS_COUNT`` terms are requested.
        """
        task_and_type_terms = [
            {"term": {"task": task_id}},
            {"term": {"type": event_type.value}},
        ]
        metric_terms_agg = {
            "terms": {"field": "metric", "size": self.MAX_METRICS_COUNT}
        }
        es_req = {
            # only the aggregation is needed, not the matching documents
            "size": 0,
            "query": {"bool": {"must": task_and_type_terms}},
            "aggs": {"metrics": metric_terms_agg},
        }

        with translate_errors_context(), TimingContext("es", "_get_task_metrics"):
            es_res = self.es.search(index=es_index, body=es_req)

        buckets = safe_get(es_res, "aggregations/metrics/buckets", default=[])
        return [bucket["key"] for bucket in buckets]
Пример #8
0
 def _upgrade_task_data(task_data: dict):
     """
     Move the legacy "execution/parameters" and "execution/model_desc"
     values of ``task_data`` into "hyperparams" / "configuration" (in place).

     A legacy value is copied only when nothing truthy is stored under its
     new path; a non-empty legacy field is deleted once it was processed.
     """
     field_mappings = (
         ("execution/parameters", "hyperparams", hyperparams_default_section),
         ("execution/model_desc", "configuration", None),
     )
     for legacy_path, target_field, fallback_section in field_mappings:
         legacy = safe_get(task_data, legacy_path)
         if not legacy:
             continue

         for full_name, raw_value in legacy.items():
             section, name = split_param_name(full_name, fallback_section)
             target_path = [
                 part for part in (target_field, section, name) if part
             ]
             if safe_get(task_data, target_path):
                 # keep the value already present in the new structure
                 continue
             param = {
                 "name": name,
                 "type": hyperparams_legacy_type,
                 "value": str(raw_value),
             }
             if section is not None:
                 param["section"] = section
             dpath.new(task_data, target_path, param)

         dpath.delete(task_data, legacy_path)
Пример #9
0
    def _build_metric_interval(metric: str, variant: str, data: dict,
                               samples: int) -> Tuple[str, str, int, int]:
        """
        Compute the index interval for a metric/variant pair so that the
        number of resulting intervals does not exceed ``samples``.

        Returns ``(metric, variant, interval, intervals_count)``. When there
        are fewer events than ``samples`` the interval is 1 and the count is
        the number of events.
        """
        count = safe_get(data, "count/value", default=0)
        if count < samples:
            return metric, variant, 1, count

        min_index = safe_get(data, "min_index/value", default=0)
        max_index = safe_get(data, "max_index/value", default=min_index)

        index_span = int(max_index - min_index + 1)
        interval = index_span // samples
        if interval < 1:
            # an interval can never be narrower than a single index
            interval = 1
        return metric, variant, interval, samples
Пример #10
0
 def update_queue_entries(*entries):
     """
     Fill name, task count and next task of each given queue entry from the
     ``queues_info`` / ``tasks_info`` lookups of the enclosing scope.

     Falsy entries and entries without queue info are left untouched.
     """
     for queue_entry in entries:
         if not queue_entry:
             continue
         queue_info = queues_info.get(queue_entry.id, None)
         if not queue_info:
             continue

         queue_entry.name = queue_info.get("name", None)
         queue_entry.num_tasks = queue_info.get("num_entries", 0)

         next_task_id = safe_get(queue_info, "next_entry/task")
         if not next_task_id:
             continue
         next_task = tasks_info.get(next_task_id, None)
         # the task may be missing from the lookup; its name is then unknown
         next_task_name = next_task.name if next_task else None
         queue_entry.next_task = IdNameEntry(id=next_task_id, name=next_task_name)
Пример #11
0
 def _get_active_workers(
     cls, company_id, from_timestamp: int, to_timestamp: int
 ) -> dict:
     """
     Return, per worker bucket found in the given timestamp range, the
     maximal "timestamp" value as its "last_activity_time".
     """
     last_activity_agg = {"last_activity_time": {"max": {"field": "timestamp"}}}
     es_req = {
         # only the aggregation is needed, not the matching documents
         "size": 0,
         "query": QueryBuilder.dates_range(from_timestamp, to_timestamp),
         "aggs": {
             "workers": {
                 "terms": {"field": "worker"},
                 "aggs": last_activity_agg,
             }
         },
     }
     res = cls._run_worker_stats_query(company_id, es_req)

     active_workers = {}
     for bucket in safe_get(res, "aggregations/workers/buckets", default=[]):
         worker_id = bucket["key"]
         active_workers[worker_id] = {
             "last_activity_time": bucket["last_activity_time"]["value"]
         }
     return active_workers
Пример #12
0
    def vep_api_request(self):
        """Send a GET request to ``self.server + self.ext_url`` and return ``(data, status)``.

        On success the result is ``(safe_get(response.json(), 0), 200)``.
        When the error body contains "matches reference" (the alternative
        sequence equals the GRCh reference sequence) ``(None, 496)`` is
        returned. Any other failure is printed and raised as ``IOError``;
        retrying is not done here and is presumably handled by the caller.
        """
        url = self.server + self.ext_url

        # NOTE(review): no timeout is passed, so this call can block
        # indefinitely on an unresponsive server - consider adding one.
        response = requests.get(url, headers={"content-type": "application/json"})

        if not response.ok:
            if "matches reference" in str(response.content, "utf-8"):
                return None, 496

            print(
                f"VEP ERROR '{response.status_code}: {response.reason}' occured for {self.variant}. Retrying..."
            )
            raise IOError("There has been an issue with a variant.")

        return safe_get(response.json(), 0), 200
Пример #13
0
    def get_all_with_projection(
        self, company_id: str, last_seen: int
    ) -> Sequence[WorkerResponseEntry]:
        """
        Return the workers of ``self.get_all`` enriched with queue and task
        details.

        For every queue referenced by a worker the name, number of entries
        and next task are filled in; for a worker's current task the running
        time (in milliseconds) and last iteration are filled in.
        """
        helpers = [
            WorkerConversionHelper.from_worker_entry(entry)
            for entry in self.get_all(company_id=company_id, last_seen=last_seen)
        ]

        task_ids = set()
        for helper in helpers:
            current_task_id = helper.task_id
            if current_task_id:
                task_ids.add(current_task_id)

        all_queues = set()
        for helper in helpers:
            all_queues.update(helper.queue_ids)

        queues_info = {}
        if all_queues:
            pipeline = [
                {"$match": {"_id": {"$in": list(all_queues)}}},
                {
                    "$project": {
                        "name": 1,
                        "next_entry": {"$arrayElemAt": ["$entries", 0]},
                        "num_entries": {"$size": "$entries"},
                    }
                },
            ]
            for queue_doc in Queue.objects.aggregate(pipeline):
                queues_info[queue_doc["_id"]] = queue_doc

            # the next task of every queue has to be resolved as well
            for queue_doc in queues_info.values():
                queued_task_id = safe_get(queue_doc, "next_entry/task")
                if queued_task_id:
                    task_ids.add(queued_task_id)

        tasks_info = {}
        if task_ids:
            tasks_query = Task.objects(id__in=task_ids).only(
                "name", "started", "last_iteration"
            )
            for task_doc in tasks_query:
                tasks_info[task_doc.id] = task_doc

        def update_queue_entries(*entries):
            """Fill name, task count and next task of the given queue entries"""
            for queue_entry in entries:
                if not queue_entry:
                    continue
                queue_info = queues_info.get(queue_entry.id, None)
                if not queue_info:
                    continue

                queue_entry.name = queue_info.get("name", None)
                queue_entry.num_tasks = queue_info.get("num_entries", 0)

                next_task_id = safe_get(queue_info, "next_entry/task")
                if not next_task_id:
                    continue
                next_task = tasks_info.get(next_task_id, None)
                next_task_name = next_task.name if next_task else None
                queue_entry.next_task = IdNameEntry(
                    id=next_task_id, name=next_task_name
                )

        for helper in helpers:
            worker = helper.worker

            task = tasks_info.get(helper.task_id, None) if helper.task_id else None
            if task:
                if task.started:
                    elapsed = datetime.utcnow() - task.started
                    running_time = int(elapsed.total_seconds() * 1000)
                else:
                    running_time = 0
                worker.task.running_time = running_time
                worker.task.last_iteration = task.last_iteration

            update_queue_entries(worker.queue)
            if worker.queues:
                update_queue_entries(*worker.queues)

        return [helper.worker for helper in helpers]
Пример #14
0
    def assign_results(self, transcript_id=None, clear_params=False):
        """Filter the decoded annotation response and store the results on ``self``.

        Reads ``self.response_decoded`` and assigns the values found for the
        selected transcript to instance attributes. Nothing is returned;
        ``self.status_code`` is set to 201 when the sequence difference of the
        stored sequences does not match the one taken from the VCF string.

        :param transcript_id: id of the ``transcript_consequences`` entry to use;
            ``self.transcript`` is used when this is falsy
        :param clear_params: if true, ``self.clear_params()`` is called first
        """
        if clear_params:
            self.clear_params()
        if not transcript_id:
            transcript_id = self.transcript
        # pick the first consequence entry that belongs to the requested transcript
        # NOTE(review): this stays None when no entry matches, in which case the
        # ``.get`` calls further down raise AttributeError - confirm callers
        # always pass a transcript that is present in the response.
        selected_transcript_consequences = None
        for result in self.response_decoded["transcript_consequences"]:
            if result.get("transcript_id") == transcript_id:
                selected_transcript_consequences = result
                break

        if self.response_decoded.get("vcf_string") is not None:
            # store the vcf string with ":" instead of "-" as separator
            self.vcf_string = re.sub(r"-", ":",
                                     self.response_decoded.get("vcf_string"))
            # extracts reference sequence from variant id in vcf format
            self.ref_seq_vep = safe_get(self.vcf_string.split(":"), 2)
            self.alt_seq_vep = safe_get(self.vcf_string.split(":"), 3)
            # fall back to the sequences of the vcf string when none were stored
            if self.ref_seq is None:
                self.ref_seq = self.response_decoded.get("vcf_string").split(
                    "-")[2]
                self.alt_seq = self.response_decoded.get("vcf_string").split(
                    "-")[3]
        self.id = self.response_decoded.get("id")
        if self.variant_format == "vcf":
            # every character that is not a letter, digit, "-" or "^" becomes ":"
            self.id = re.sub(r"[^a-zA-Z^0-9-]", ":", self.id)

        # compares the stored sequences with the ones reported by VEP and sets
        # the status code or the annotation attributes accordingly; nothing
        # more is assigned when any of the four sequences is missing
        if any([
                x is None for x in
            [self.ref_seq, self.ref_seq_vep, self.alt_seq, self.alt_seq_vep]
        ]):
            pass
        else:
            if not get_seq_difference(self.ref_seq,
                                      self.alt_seq) == get_seq_difference(
                                          self.ref_seq_vep, self.alt_seq_vep):
                self.status_code = 201
            else:
                # gets variant gnomad exome frequency and MAF
                self.gnomad_frequency, self.maf = self.get_frequencies(
                    self.response_decoded.get("colocated_variants"))
                # This list of parameters is assigned to corresponding class attributes.
                # Float values are rounded to two decimals, keys that are
                # missing or None in the response are skipped.
                key_list = [
                    "gene_symbol", "amino_acids", "gene_id", "cadd_phred",
                    "sift_converted_rankscore",
                    "mutationtaster_converted_rankscore",
                    "mutationassessor_rankscore", "ada_score", "rf_score",
                    "maxentscan_ref", "maxentscan_alt"
                ]
                for key in key_list:
                    if selected_transcript_consequences.get(key) is not None:
                        value = selected_transcript_consequences.get(key)
                        if isinstance(value, float):
                            self.__dict__[key] = round(
                                selected_transcript_consequences.get(key), 2)
                        else:
                            self.__dict__[
                                key] = selected_transcript_consequences.get(
                                    key)

                if selected_transcript_consequences.get("impact") is not None:
                    self.impact = selected_transcript_consequences.get(
                        "impact").lower()

                # stored under a different attribute name than the response key
                if selected_transcript_consequences.get(
                        "gerp++_rs_rankscore") is not None:
                    self.gerp_rs_rankscore = round(
                        selected_transcript_consequences.get(
                            "gerp++_rs_rankscore"), 2)

                # if selected_transcript_consequences.get("consequence_terms") is not None:
                #     self.consequence = safe_get(selected_transcript_consequences.get("consequence_terms"), 0)
                # NOTE(review): the two-name unpacking of split(":") here and
                # for "hgvsp" below raises ValueError unless the value contains
                # exactly one ":" - confirm the response format guarantees it.
                if selected_transcript_consequences.get("hgvsc") is not None:
                    self.hgvsc_transcript, self.hgvsc_change = selected_transcript_consequences.get(
                        "hgvsc").split(":")

                # first consequence term, with underscores replaced by spaces
                if selected_transcript_consequences.get(
                        "consequence_terms") is not None:
                    self.consequence = re.sub(
                        r"_", " ",
                        safe_get(
                            selected_transcript_consequences.get(
                                "consequence_terms"), 0))

                if selected_transcript_consequences.get("hgvsp") is not None:
                    self.protein, self.hgvsp_change = selected_transcript_consequences.get(
                        "hgvsp").split(":")

                if selected_transcript_consequences.get(
                        "polyphen_prediction") is not None:
                    self.polyphen_prediction = re.sub(
                        r"_", " ",
                        selected_transcript_consequences.get(
                            "polyphen_prediction"))

                # cutoffs correspond to Leipzig guidelines (Alamut)
                # each available splice predictor adds its verdict to this list
                affected_splice_score = []
                if self.ada_score is not None:
                    if self.ada_score >= 0.6:
                        self.ada_consequence = "splicing affected"
                    else:
                        self.ada_consequence = "splicing not affected"
                    affected_splice_score.append(self.ada_consequence)

                if self.rf_score is not None:
                    if self.rf_score >= 0.6:
                        self.rf_consequence = "splicing affected"
                    else:
                        self.rf_consequence = "splicing not affected"
                    affected_splice_score.append(self.rf_consequence)

                # relative change of the alt score against the ref score
                # NOTE(review): raises ZeroDivisionError when maxentscan_ref is 0
                if self.maxentscan_ref is not None and self.maxentscan_alt is not None:
                    self.maxentscan_decrease = (
                        self.maxentscan_alt -
                        self.maxentscan_ref) / self.maxentscan_ref
                    if self.maxentscan_decrease <= -0.15:
                        self.maxentscan_consequence = "splicing affected"
                    else:
                        self.maxentscan_consequence = "splicing not affected"
                    affected_splice_score.append(self.maxentscan_consequence)

                # splice/synonymous variants whose impact is not already
                # moderate/high are upgraded to "high" when at least two splice
                # predictions say "splicing affected" or, when fewer than two
                # predictions are available, when CADD phred is at least 20
                if (not self.impact in ["moderate", "high"]) and \
                        (any("splice" in _consequence_term or "synonymous" in _consequence_term
                             for _consequence_term in selected_transcript_consequences.get("consequence_terms"))):
                    if len(affected_splice_score) >= 2:
                        if affected_splice_score.count(
                                "splicing affected") >= 2:
                            self.impact, self.explanation_dict["impact_splice_site"] = "high", \
                                   f"VEP impact low, but {affected_splice_score.count('splicing affected')} of " \
                                   f"{len(affected_splice_score)} splice site predictions are \"pathogenic\""
                    else:
                        if self.cadd_phred:
                            if self.cadd_phred >= 20:
                                self.impact, self.explanation_dict["impact_splice_site"] = "high", \
                                                                                           f"VEP impact low, but CADD" \
                                                                                           f" phred = {self.cadd_phred}"
Пример #15
0
    def _get_resource_stats_per_agent(cls, company_id: str, key: str) -> dict:
        """
        Return per worker the resource statistics of the last report interval
        ("apiserver.statistics.report_interval_hours", 24 hours by default).

        The result maps each worker bucket key to ``{key: stats}`` where the
        stats hold the interval length in seconds, the recognised category
        cardinalities and the min/max/avg of the recognised metrics.
        """

        def _get_cardinality_fields(categories: Sequence[dict]) -> dict:
            """Collect the "count/value" of the recognised category buckets"""
            names = {"cpu": "num_cores"}
            fields = {}
            for category in categories:
                category_key = category["key"]
                if category_key not in names:
                    continue
                fields[names[category_key]] = safe_get(category, "count/value")
            return fields

        def _get_metric_fields(metrics: Sequence[dict]) -> dict:
            """Collect min/max/avg of the recognised metric buckets"""
            names = {
                "cpu_usage": "cpu_usage",
                "memory_used": "mem_used_gb",
                "memory_free": "mem_free_gb",
            }
            fields = {}
            for metric in metrics:
                metric_key = metric["key"]
                if metric_key not in names:
                    continue
                fields[names[metric_key]] = {
                    "min": safe_get(metric, "min/value"),
                    "max": safe_get(metric, "max/value"),
                    "avg": safe_get(metric, "avg/value"),
                }
            return fields

        report_interval = timedelta(
            hours=config.get("apiserver.statistics.report_interval_hours", 24)
        )
        agent_resource_threshold_sec = report_interval.total_seconds()
        to_timestamp = int(time.time())
        from_timestamp = to_timestamp - int(agent_resource_threshold_sec)

        categories_agg = {
            "terms": {"field": "category"},
            "aggs": {"count": {"cardinality": {"field": "variant"}}},
        }
        metrics_agg = {
            "terms": {"field": "metric"},
            "aggs": {
                "min": {"min": {"field": "value"}},
                "max": {"max": {"field": "value"}},
                "avg": {"avg": {"field": "value"}},
            },
        }
        es_req = {
            # only the aggregations are needed, not the matching documents
            "size": 0,
            "query": QueryBuilder.dates_range(from_timestamp, to_timestamp),
            "aggs": {
                "workers": {
                    "terms": {"field": "worker"},
                    "aggs": {"categories": categories_agg, "metrics": metrics_agg},
                }
            },
        }
        res = cls._run_worker_stats_query(company_id, es_req)

        stats_per_agent = {}
        for bucket in safe_get(res, "aggregations/workers/buckets", default=[]):
            agent_id = bucket["key"]
            agent_stats = {"interval_sec": agent_resource_threshold_sec}
            agent_stats.update(
                _get_cardinality_fields(safe_get(bucket, "categories/buckets", []))
            )
            agent_stats.update(
                _get_metric_fields(safe_get(bucket, "metrics/buckets", []))
            )
            stats_per_agent[agent_id] = {key: agent_stats}
        return stats_per_agent