示例#1
0
    def get_unique_metric_variants(cls, company_id, project_ids: Sequence[str],
                                   include_subprojects: bool):
        """
        List the distinct metric/variant name pairs stored under ``last_metrics``
        of the tasks selected by the company and project constraints.

        :param company_id: passed to ``cls._get_company_constraint``
        :param project_ids: passed to ``cls._get_project_constraint``
        :param include_subprojects: passed to ``cls._get_project_constraint``
        :return: one dict per distinct (metric, variant) name pair, holding the
            keys ``metric``, ``metric_hash``, ``variant`` and ``variant_hash``;
            the list is ordered by metric name and then by variant name
        """
        task_filter = {
            **cls._get_company_constraint(company_id),
            **cls._get_project_constraint(project_ids, include_subprojects),
        }
        # The name pair is both the grouping key and part of every collected entry
        metric_name = "$variants.v.metric"
        variant_name = "$variants.v.variant"
        collected_entry = {
            "metric": metric_name,
            "metric_hash": "$metric",
            "variant": variant_name,
            "variant_hash": "$variants.k",
        }
        pipeline = [
            {"$match": task_filter},
            # One document per top-level key of last_metrics
            {"$project": {"metrics": {"$objectToArray": "$last_metrics"}}},
            {"$unwind": "$metrics"},
            # One document per key nested under every top-level key
            {
                "$project": {
                    "metric": "$metrics.k",
                    "variants": {"$objectToArray": "$metrics.v"},
                }
            },
            {"$unwind": "$variants"},
            {
                "$group": {
                    "_id": {"metric": metric_name, "variant": variant_name},
                    "metrics": {"$addToSet": collected_entry},
                }
            },
            {"$sort": OrderedDict([("_id.metric", 1), ("_id.variant", 1)])},
        ]

        unique_variants = []
        for grouped in Task.aggregate(pipeline):
            # Only the first collected entry of every name pair is reported
            unique_variants.append(grouped["metrics"][0])
        return unique_variants
示例#2
0
    def get_configuration_names(cls, company_id: str,
                                task_ids: Sequence[str]) -> Dict[str, list]:
        """
        Collect the configuration item names of the requested tasks.

        Only tasks whose ``company`` is ``None``, an empty string or
        ``company_id`` are inspected.

        :param company_id: company whose tasks may be read
        :param task_ids: ids of the tasks to inspect
        :return: mapping of task id to ``{"names": [...]}`` where the list holds
            the unescaped configuration keys of that task in sorted order.
            Tasks that produce no configuration item in the pipeline are absent
            from the mapping
        """
        with TimingContext("mongo", "get_configuration_names"):
            task_filter = {
                "company": {"$in": [None, "", company_id]},
                "_id": {"$in": task_ids},
            }
            pipeline = [
                {"$match": task_filter},
                # Turn the configuration mapping into a list of {k, v} items
                {"$project": {"items": {"$objectToArray": "$configuration"}}},
                {"$unwind": "$items"},
                # Gather the item keys back per task
                {"$group": {"_id": "$_id", "names": {"$addToSet": "$items.k"}}},
            ]

            names_by_task = {}
            for task in Task.aggregate(pipeline):
                task_id = task["_id"]
                unescaped = [
                    ParameterKeyEscaper.unescape(name) for name in task["names"]
                ]
                unescaped.sort()
                names_by_task[task_id] = {"names": unescaped}
            return names_by_task
示例#3
0
 def _get_experiments_stats(cls,
                            company_id,
                            workers: Optional[Sequence] = None) -> dict:
     """
     Compute task count and average run statistics for a company.

     Only tasks that have non-null ``started`` and ``last_update`` values and
     whose status is neither "created" nor "queued" are counted.

     :param company_id: company whose tasks are aggregated
     :param workers: when given and non-empty, only tasks whose
         ``last_worker`` is in this sequence are counted and the statistics
         are grouped per worker
     :return: mapping of group id to a dict with ``count``,
         ``avg_run_time_sec`` and ``avg_iterations``. The group id is the
         worker when ``workers`` is given, otherwise the single key ``None``
     """
     task_filter = {
         "company": company_id,
         "started": {"$exists": True, "$ne": None},
         "last_update": {"$exists": True, "$ne": None},
         "status": {"$nin": ["created", "queued"]},
     }
     if workers:
         task_filter["last_worker"] = {"$in": workers}
         group_key = "$last_worker"
     else:
         group_key = None

     # The difference between the two dates is divided by 1000 before averaging
     run_time_sec = {
         "$divide": [
             {"$subtract": ["$last_update", "$started"]},
             1000,
         ]
     }
     pipeline = [
         {"$match": task_filter},
         {
             "$group": {
                 "_id": group_key,
                 "count": {"$sum": 1},
                 "avg_run_time_sec": {"$avg": run_time_sec},
                 "avg_iterations": {"$avg": "$last_iteration"},
             }
         },
         {
             "$project": {
                 "count": 1,
                 "avg_run_time_sec": {"$trunc": "$avg_run_time_sec"},
                 "avg_iterations": {"$trunc": "$avg_iterations"},
             }
         },
     ]

     stats = {}
     for group in Task.aggregate(pipeline):
         group_id = group["_id"]
         stats[group_id] = {
             field: value for field, value in group.items() if field != "_id"
         }
     return stats
示例#4
0
    def get_aggregated_project_parameters(
        cls,
        company_id,
        project_ids: Sequence[str],
        include_subprojects: bool,
        page: int = 0,
        page_size: int = 500,
    ) -> Tuple[int, int, Sequence[dict]]:
        """
        Return one page of the distinct hyperparameter (section, name) pairs
        used by the tasks selected by the company and project constraints.

        :param company_id: passed to ``cls._get_company_constraint``
        :param project_ids: passed to ``cls._get_project_constraint``
        :param include_subprojects: passed to ``cls._get_project_constraint``
        :param page: zero-based page index; negative values are treated as 0
        :param page_size: page length; values below 1 are treated as 1
        :return: ``(total, remaining, results)`` where ``total`` is the number
            of distinct pairs over all pages, ``remaining`` is the number of
            pairs that follow the returned page and ``results`` is the page
            itself: dicts with unescaped ``section`` and ``name`` values,
            ordered by section and then by name
        """
        page = max(0, page)
        page_size = max(1, page_size)
        pipeline = [
            {
                "$match": {
                    **cls._get_company_constraint(company_id),
                    **cls._get_project_constraint(project_ids, include_subprojects),
                    "hyperparams": {
                        "$exists": True,
                        "$gt": {}
                    },
                }
            },
            {
                "$project": {
                    "sections": {
                        "$objectToArray": "$hyperparams"
                    }
                }
            },
            {
                "$unwind": "$sections"
            },
            {
                "$project": {
                    "section": "$sections.k",
                    "names": {
                        "$objectToArray": "$sections.v"
                    },
                }
            },
            {
                "$unwind": "$names"
            },
            {
                "$group": {
                    "_id": {
                        "section": "$section",
                        "name": "$names.k"
                    }
                }
            },
            {
                # Count ALL the distinct pairs and cut the requested page side
                # by side. Counting after $skip/$limit would only count the
                # page itself and make "remaining" always 0.
                "$facet": {
                    "total": [
                        {
                            "$count": "total"
                        },
                    ],
                    "results": [
                        {
                            "$sort": OrderedDict({
                                "_id.section": 1,
                                "_id.name": 1
                            })
                        },
                        {
                            "$skip": page * page_size
                        },
                        {
                            "$limit": page_size
                        },
                    ],
                }
            },
        ]

        result = next(Task.aggregate(pipeline), None)

        total = 0
        remaining = 0
        results = []

        if result:
            # "$count" emits no document when there is nothing to count
            count_docs = result.get("total") or []
            if count_docs:
                total = int(count_docs[0].get("total", 0))
            results = [{
                "section":
                ParameterKeyEscaper.unescape(nested_get(r,
                                                        ("_id", "section"))),
                "name":
                ParameterKeyEscaper.unescape(nested_get(r, ("_id", "name"))),
            } for r in result.get("results") or []]
            remaining = max(0, total - (len(results) + page * page_size))

        return total, remaining, results
示例#5
0
    def get_task_hyperparam_distinct_values(
        self,
        company_id: str,
        project_ids: Sequence[str],
        section: str,
        name: str,
        include_subprojects: bool,
        allow_public: bool = True,
    ) -> ParamValues:
        """
        Return the distinct values of one hyperparameter over the tasks
        selected by the company and project constraints.

        The computed result is stored in redis; a later call returns whatever
        ``self._get_cached_param_values`` yields for the same key, if anything.

        :param company_id: passed to ``self._get_company_constraint``
        :param project_ids: passed to ``self._get_project_constraint``
        :param section: hyperparameter section (escaped before querying)
        :param name: hyperparameter name (escaped before querying)
        :param include_subprojects: passed to ``self._get_project_constraint``
        :param allow_public: passed to ``self._get_company_constraint``
        :return: ``(total, values)`` where ``values`` holds the distinct values
            in ascending order, capped by the
            ``services.tasks.hyperparam_values.max_count`` setting, and
            ``total`` is the number of values returned. ``(0, [])`` when no
            selected task defines the parameter
        """
        company_constraint = self._get_company_constraint(
            company_id, allow_public)
        project_constraint = self._get_project_constraint(
            project_ids, include_subprojects)
        key_path = f"hyperparams.{ParameterKeyEscaper.escape(section)}.{ParameterKeyEscaper.escape(name)}"
        # The most recently updated task that defines the parameter; its
        # last_update is handed to the cache lookup below
        last_updated_task = (Task.objects(
            **company_constraint,
            **project_constraint,
            **{
                f"{key_path.replace('.', '__')}__exists": True
            },
        ).only("last_update").order_by("-last_update").limit(1).first())
        if not last_updated_task:
            return 0, []

        # The key has to cover every argument that changes the selected tasks.
        # include_subprojects is part of it so that results computed with and
        # without subprojects never share a cache entry.
        redis_key = (
            f"hyperparam_values_{company_id}_{'_'.join(project_ids or ())}"
            f"_{section}_{name}_{allow_public}_{include_subprojects}"
        )
        last_update = last_updated_task.last_update or datetime.utcnow()
        cached_res = self._get_cached_param_values(
            key=redis_key,
            last_update=last_update,
            allowed_delta_sec=config.get(
                "services.tasks.hyperparam_values.cache_allowed_outdate_sec",
                60),
        )
        if cached_res:
            return cached_res

        max_values = config.get("services.tasks.hyperparam_values.max_count",
                                100)
        pipeline = [
            {
                "$match": {
                    **company_constraint,
                    **project_constraint,
                    key_path: {
                        "$exists": True
                    },
                }
            },
            {
                "$project": {
                    "value": f"${key_path}.value"
                }
            },
            {
                # One document per distinct value
                "$group": {
                    "_id": "$value"
                }
            },
            {
                "$sort": {
                    "_id": 1
                }
            },
            {
                "$limit": max_values
            },
            {
                # Fold the (already capped) values into a single document
                "$group": {
                    "_id": 1,
                    "total": {
                        "$sum": 1
                    },
                    "results": {
                        "$push": "$$ROOT._id"
                    },
                }
            },
        ]

        result = next(Task.aggregate(pipeline, collation=Task._numeric_locale),
                      None)
        if not result:
            return 0, []

        total = int(result.get("total", 0))
        values = result.get("results", [])

        ttl = config.get("services.tasks.hyperparam_values.cache_ttl_sec",
                         86400)
        cached = dict(last_update=last_update.timestamp(),
                      total=total,
                      values=values)
        self.redis.setex(redis_key, ttl, json.dumps(cached))

        return total, values
示例#6
0
    def get_aggregated_project_parameters(
        company_id,
        project_ids: Sequence[str] = None,
        page: int = 0,
        page_size: int = 500,
    ) -> Tuple[int, int, Sequence[dict]]:
        """
        Return one page of the distinct hyperparameter (section, name) pairs
        used by the tasks of a company.

        Only tasks whose ``company`` is ``None``, an empty string or
        ``company_id`` and that have non-empty ``hyperparams`` are inspected.

        :param company_id: company whose tasks may be read
        :param project_ids: when given and non-empty, only tasks of these
            projects are inspected
        :param page: zero-based page index; negative values are treated as 0
        :param page_size: page length; values below 1 are treated as 1
        :return: ``(total, remaining, results)`` where ``total`` is the number
            of distinct pairs, ``remaining`` is the number of pairs that follow
            the returned page and ``results`` is the page itself: dicts with
            unescaped ``section`` and ``name`` values, ordered by section and
            then by name
        """
        page = max(0, page)
        page_size = max(1, page_size)
        first_index = page * page_size

        task_filter = {
            "company": {"$in": [None, "", company_id]},
            "hyperparams": {"$exists": True, "$gt": {}},
        }
        if project_ids:
            task_filter["project"] = {"$in": project_ids}

        pipeline = [
            {"$match": task_filter},
            # One document per hyperparameter section
            {"$project": {"sections": {"$objectToArray": "$hyperparams"}}},
            {"$unwind": "$sections"},
            # One document per parameter inside every section
            {
                "$project": {
                    "section": "$sections.k",
                    "names": {"$objectToArray": "$sections.v"},
                }
            },
            {"$unwind": "$names"},
            {"$group": {"_id": {"section": "$section", "name": "$names.k"}}},
            {"$sort": OrderedDict([("_id.section", 1), ("_id.name", 1)])},
            # Count all the pairs first and only then cut the requested page
            {
                "$group": {
                    "_id": 1,
                    "total": {"$sum": 1},
                    "results": {"$push": "$$ROOT"},
                }
            },
            {
                "$project": {
                    "total": 1,
                    "results": {"$slice": ["$results", first_index, page_size]},
                }
            },
        ]

        with translate_errors_context():
            result = next(Task.aggregate(pipeline), None)

        if not result:
            return 0, 0, []

        total = int(result.get("total", -1))
        results = []
        for entry in result.get("results", []):
            results.append({
                "section":
                ParameterKeyEscaper.unescape(dpath.get(entry, "_id/section")),
                "name":
                ParameterKeyEscaper.unescape(dpath.get(entry, "_id/name")),
            })
        remaining = max(0, total - (len(results) + first_index))

        return total, remaining, results
示例#7
0
    def get_unique_metric_variants(company_id, project_ids=None):
        """
        List the distinct metric/variant name pairs stored under ``last_metrics``
        of the tasks of a company.

        Only tasks whose ``company`` is ``None``, an empty string or
        ``company_id`` are inspected.

        :param company_id: company whose tasks may be read
        :param project_ids: when given and non-empty, only tasks of these
            projects are inspected
        :return: one dict per distinct (metric, variant) name pair, holding the
            keys ``metric``, ``metric_hash``, ``variant`` and ``variant_hash``;
            the list is ordered by metric name and then by variant name
        """
        task_filter = {"company": {"$in": [None, "", company_id]}}
        if project_ids:
            task_filter["project"] = {"$in": project_ids}

        group_stage = {
            "$group": {
                "_id": {
                    "metric": "$variants.v.metric",
                    "variant": "$variants.v.variant",
                },
                "metrics": {
                    "$addToSet": {
                        "metric": "$variants.v.metric",
                        "metric_hash": "$metric",
                        "variant": "$variants.v.variant",
                        "variant_hash": "$variants.k",
                    }
                },
            }
        }
        pipeline = [
            {"$match": task_filter},
            # One document per top-level key of last_metrics
            {"$project": {"metrics": {"$objectToArray": "$last_metrics"}}},
            {"$unwind": "$metrics"},
            # One document per key nested under every top-level key
            {
                "$project": {
                    "metric": "$metrics.k",
                    "variants": {"$objectToArray": "$metrics.v"},
                }
            },
            {"$unwind": "$variants"},
            group_stage,
            {"$sort": OrderedDict([("_id.metric", 1), ("_id.variant", 1)])},
        ]

        with translate_errors_context():
            # The results are consumed inside the context as well
            unique_variants = []
            for grouped in Task.aggregate(pipeline):
                # Only the first collected entry of every name pair is reported
                unique_variants.append(grouped["metrics"][0])
            return unique_variants
示例#8
0
    def get_unique_metric_variants(company_id, project_ids: Sequence[str],
                                   include_subprojects: bool):
        """
        List the distinct metric/variant name pairs stored under ``last_metrics``
        of the tasks of a company.

        Only tasks whose ``company`` is ``None``, an empty string or
        ``company_id`` are inspected.

        :param company_id: company whose tasks may be read
        :param project_ids: when non-empty, only tasks of these projects are
            inspected
        :param include_subprojects: when set (and ``project_ids`` is non-empty)
            the ids are first passed through ``project_ids_with_children``
        :return: one dict per distinct (metric, variant) name pair, holding the
            keys ``metric``, ``metric_hash``, ``variant`` and ``variant_hash``;
            the list is ordered by metric name and then by variant name
        """
        task_filter = {"company": {"$in": [None, "", company_id]}}
        if project_ids:
            searched_ids = (project_ids_with_children(project_ids)
                            if include_subprojects else project_ids)
            task_filter["project"] = {"$in": searched_ids}

        name_pair = {
            "metric": "$variants.v.metric",
            "variant": "$variants.v.variant",
        }
        collected_entry = {
            "metric": "$variants.v.metric",
            "metric_hash": "$metric",
            "variant": "$variants.v.variant",
            "variant_hash": "$variants.k",
        }
        pipeline = [
            {"$match": task_filter},
            # One document per top-level key of last_metrics
            {"$project": {"metrics": {"$objectToArray": "$last_metrics"}}},
            {"$unwind": "$metrics"},
            # One document per key nested under every top-level key
            {
                "$project": {
                    "metric": "$metrics.k",
                    "variants": {"$objectToArray": "$metrics.v"},
                }
            },
            {"$unwind": "$variants"},
            {
                "$group": {
                    "_id": name_pair,
                    "metrics": {"$addToSet": collected_entry},
                }
            },
            {"$sort": OrderedDict([("_id.metric", 1), ("_id.variant", 1)])},
        ]

        with translate_errors_context():
            # The results are consumed inside the context as well
            unique_variants = []
            for grouped in Task.aggregate(pipeline):
                # Only the first collected entry of every name pair is reported
                unique_variants.append(grouped["metrics"][0])
            return unique_variants
示例#9
0
    def get_project_stats(
        cls,
        company: str,
        project_ids: Sequence[str],
        specific_state: Optional[EntityVisibility] = None,
        include_children: bool = True,
        filter_: Mapping[str, Any] = None,
    ) -> Tuple[Dict[str, dict], Dict[str, dict]]:
        """
        Build per-project task statistics and the list of child projects.

        :param company: passed to ``cls.make_projects_get_all_pipelines``
        :param project_ids: projects to report on; an empty value yields
            ``({}, {})``
        :param specific_state: when given, only this visibility state is
            reported; otherwise every state in ``cls.visibility_states``
        :param include_children: when set, sub projects are looked up through
            ``_get_sub_projects`` and their ids are added to the queried ids
        :param filter_: passed to ``cls.make_projects_get_all_pipelines``
        :return: ``(stats, children)``. ``stats`` maps project id to
            visibility state value to a dict with ``status_count``,
            ``total_tasks``, ``total_runtime``, ``completed_tasks_24h`` and
            ``last_task_run``. ``children`` maps project id to the list of
            ``{"id", "name"}`` dicts of its child projects sorted by name
        """
        if not project_ids:
            return {}, {}

        if include_children:
            child_projects = _get_sub_projects(project_ids, _only=("id", "name"))
        else:
            child_projects = {}
        all_children = itertools.chain.from_iterable(child_projects.values())
        queried_ids = set(project_ids) | {child.id for child in all_children}
        status_count_pipeline, runtime_pipeline = cls.make_projects_get_all_pipelines(
            company,
            project_ids=list(queried_ids),
            specific_state=specific_state,
            filter_=filter_,
        )

        # Every known status starts from a zero count
        default_counts = {status: 0 for status in get_options(TaskStatus)}

        def set_default_count(entry):
            return dict(default_counts, **entry)

        status_count = defaultdict(dict)
        archived_flag = itemgetter(EntityVisibility.archived.value)
        for result in Task.aggregate(status_count_pipeline):
            # groupby only groups adjacent entries, hence the sort by the same key
            ordered_counts = sorted(result["counts"], key=archived_flag)
            for is_archived, entries in groupby(ordered_counts, archived_flag):
                if is_archived:
                    visibility = EntityVisibility.archived
                else:
                    visibility = EntityVisibility.active
                per_status = {
                    entry["status"]: entry["count"] for entry in entries
                }
                status_count[result["_id"]][visibility.value] = set_default_count(
                    per_status)

        def sum_status_count(a: Mapping[str, Mapping],
                             b: Mapping[str, Mapping]) -> Dict[str, dict]:
            # Add up the counts of two {section: {status: count}} mappings
            merged = {}
            for section in set(a) | set(b):
                statuses = set(a.get(section, {})) | set(b.get(section, {}))
                merged[section] = {
                    status: nested_get(a, (section, status), default=0) +
                    nested_get(b, (section, status), default=0)
                    for status in statuses
                }
            return merged

        status_count = cls.aggregate_project_data(
            func=sum_status_count,
            project_ids=project_ids,
            child_projects=child_projects,
            data=status_count,
        )

        runtime = {}
        for result in Task.aggregate(runtime_pipeline):
            result_id = result["_id"]
            runtime[result_id] = {
                field: value for field, value in result.items() if field != "_id"
            }

        def sum_runtime(a: Mapping[str, Mapping],
                        b: Mapping[str, Mapping]) -> Dict[str, dict]:
            # Keys ending with "max_task_started" keep the later of the two
            # values (a missing/empty one counts as datetime.min); every
            # other key is summed
            combined = {}
            for section in set(a) | set(b):
                if section.endswith("max_task_started"):
                    combined[section] = max(
                        a.get(section) or datetime.min,
                        b.get(section) or datetime.min)
                else:
                    combined[section] = a.get(section, 0) + b.get(section, 0)
            return combined

        runtime = cls.aggregate_project_data(
            func=sum_runtime,
            project_ids=project_ids,
            child_projects=child_projects,
            data=runtime,
        )

        def get_time_or_none(value):
            # datetime.min stands for "no task was started"
            return value if value != datetime.min else None

        def get_status_counts(project_id, section):
            project_runtime = runtime.get(project_id, {})
            section_statuses = nested_get(status_count,
                                          (project_id, section),
                                          default=default_counts)
            report = {}
            report["status_count"] = section_statuses
            report["total_tasks"] = sum(section_statuses.values())
            report["total_runtime"] = project_runtime.get(section, 0)
            report["completed_tasks_24h"] = project_runtime.get(
                f"{section}_recently_completed", 0)
            report["last_task_run"] = get_time_or_none(
                project_runtime.get(f"{section}_max_task_started",
                                    datetime.min))
            return report

        report_for_states = [
            state for state in cls.visibility_states
            if not specific_state or specific_state == state
        ]

        stats = {}
        for project in project_ids:
            stats[project] = {
                state.value: get_status_counts(project, state.value)
                for state in report_for_states
            }

        children = {}
        for project in project_ids:
            child_infos = [{
                "id": child.id,
                "name": child.name
            } for child in child_projects.get(project, [])]
            children[project] = sorted(child_infos, key=itemgetter("name"))

        return stats, children
示例#10
0
    def get_project_stats(
        cls,
        company: str,
        project_ids: Sequence[str],
        specific_state: Optional[EntityVisibility] = None,
    ) -> Tuple[Dict[str, dict], Dict[str, dict]]:
        """
        Build per-project task statistics and the list of child projects.

        :param company: passed to ``cls.make_projects_get_all_pipelines``
        :param project_ids: projects to report on; an empty value yields
            ``({}, {})``
        :param specific_state: when given, only this visibility state is
            reported; otherwise every member of ``EntityVisibility``
        :return: ``(stats, children)``. ``stats`` maps project id to
            visibility state value to a dict with ``total_runtime`` and
            ``status_count``. ``children`` maps project id to the list of
            ``{"id", "name"}`` dicts of its child projects sorted by name
        """
        if not project_ids:
            return {}, {}

        child_projects = _get_sub_projects(project_ids, _only=("id", "name"))
        all_children = itertools.chain.from_iterable(child_projects.values())
        queried_ids = set(project_ids) | {child.id for child in all_children}
        status_count_pipeline, runtime_pipeline = cls.make_projects_get_all_pipelines(
            company,
            project_ids=list(queried_ids),
            specific_state=specific_state,
        )

        # Every known status starts from a zero count
        default_counts = {status: 0 for status in get_options(TaskStatus)}

        def set_default_count(entry):
            return dict(default_counts, **entry)

        status_count = defaultdict(dict)
        archived_flag = itemgetter(EntityVisibility.archived.value)
        for result in Task.aggregate(status_count_pipeline):
            # groupby only groups adjacent entries, hence the sort by the same key
            ordered_counts = sorted(result["counts"], key=archived_flag)
            for is_archived, entries in groupby(ordered_counts, archived_flag):
                if is_archived:
                    visibility = EntityVisibility.archived
                else:
                    visibility = EntityVisibility.active
                per_status = {
                    entry["status"]: entry["count"] for entry in entries
                }
                status_count[result["_id"]][visibility.value] = set_default_count(
                    per_status
                )

        def sum_status_count(
            a: Mapping[str, Mapping], b: Mapping[str, Mapping]
        ) -> Dict[str, dict]:
            # Add up the counts of two {section: {status: count}} mappings
            merged = {}
            for section in set(a) | set(b):
                statuses = set(a.get(section, {})) | set(b.get(section, {}))
                merged[section] = {
                    status: nested_get(a, (section, status), 0)
                    + nested_get(b, (section, status), 0)
                    for status in statuses
                }
            return merged

        status_count = cls.aggregate_project_data(
            func=sum_status_count,
            project_ids=project_ids,
            child_projects=child_projects,
            data=status_count,
        )

        runtime = {}
        for result in Task.aggregate(runtime_pipeline):
            result_id = result["_id"]
            runtime[result_id] = {
                field: value for field, value in result.items() if field != "_id"
            }

        def sum_runtime(
            a: Mapping[str, Mapping], b: Mapping[str, Mapping]
        ) -> Dict[str, dict]:
            # Sum the values of two mappings key by key
            combined = {}
            for section in set(a) | set(b):
                combined[section] = a.get(section, 0) + b.get(section, 0)
            return combined

        runtime = cls.aggregate_project_data(
            func=sum_runtime,
            project_ids=project_ids,
            child_projects=child_projects,
            data=runtime,
        )

        def get_status_counts(project_id, section):
            path = (project_id, section)
            report = {}
            report["total_runtime"] = nested_get(runtime, path, 0)
            report["status_count"] = nested_get(status_count, path, default_counts)
            return report

        report_for_states = [
            state
            for state in EntityVisibility
            if not specific_state or specific_state == state
        ]

        stats = {}
        for project in project_ids:
            stats[project] = {
                state.value: get_status_counts(project, state.value)
                for state in report_for_states
            }

        children = {}
        for project in project_ids:
            child_infos = [
                {"id": child.id, "name": child.name}
                for child in child_projects.get(project, [])
            ]
            children[project] = sorted(child_infos, key=itemgetter("name"))

        return stats, children