示例#1
0
 def _build_subquery(self):
     if not self.kwargs.get("tad_set_uuid"):
         return None
     else:
         set_pk = TadSet.objects.get(
             sodar_uuid=self.kwargs["tad_set_uuid"]).pk
         term_interval_center = (TadBoundaryInterval.sa.start +
                                 (TadBoundaryInterval.sa.end -
                                  TadBoundaryInterval.sa.start + 1) / 2)
         term_overlaps = and_(
             term_interval_center >= StructuralVariant.sa.start,
             term_interval_center <= StructuralVariant.sa.end,
         )
         fields = [
             func.coalesce(
                 func.min(
                     case(
                         [(term_overlaps, 0)],
                         else_=func.least(
                             func.abs(term_interval_center -
                                      StructuralVariant.sa.start),
                             func.abs(term_interval_center -
                                      StructuralVariant.sa.end),
                         ),
                     )),
                 -1,
             ).label("distance_to_center")
         ]
         return (select(fields).select_from(TadBoundaryInterval.sa).where(
             and_(
                 TadBoundaryInterval.sa.tad_set_id == set_pk,
                 overlaps(TadBoundaryInterval, StructuralVariant),
             )).alias("subquery_tad_boundaries_inner").lateral(
                 "subquery_tad_boundaries_outer"))
示例#2
0
def overlaps(lhs, rhs, min_overlap=None, rhs_start=None, rhs_end=None):
    """Returns term of ``lhs`` overlapping with ``rhs`` based on the start/end fields."""
    if rhs_start is None:
        rhs_start = rhs.sa.start
    if rhs_end is None:
        rhs_end = rhs.sa.end
    if min_overlap is None:
        return and_(
            lhs.sa.release == rhs.sa.release,
            lhs.sa.chromosome == rhs.sa.chromosome,
            lhs.sa.bin.in_(
                select([column("bin")]).select_from(func.overlapping_bins(rhs_start - 1, rhs_end))
            ),
            lhs.sa.end >= rhs_start,
            lhs.sa.start <= rhs_end,
        )
    else:
        term_overlap = func.least(lhs.sa.end, rhs_end) - func.greatest(lhs.sa.start, rhs_start) + 1
        return and_(
            lhs.sa.release == rhs.sa.release,
            lhs.sa.chromosome == rhs.sa.chromosome,
            rhs.sa.bin.in_(
                select([column("bin")]).select_from(
                    func.overlapping_bins(lhs.sa.start - 1, lhs.sa.end)
                )
            ),
            lhs.sa.end >= rhs_start,
            lhs.sa.start <= rhs_end,
            cast(term_overlap, Float) / func.greatest((rhs_end - rhs_start + 1), 1) > min_overlap,
            cast(term_overlap, Float) / func.greatest((lhs.sa.end - lhs.sa.start + 1), 1)
            > min_overlap,
        )
示例#3
0
def best_matching_flags(sa_engine, case_id, sv_uuid, min_overlap=0.95):
    """Find best matching ``StructuralVariantFlags`` object for the given case and SV.

    Returns ``None`` if none could be found.
    """
    sv = StructuralVariant.objects.get(case_id=case_id, sv_uuid=sv_uuid)
    term_overlap = (
        func.least(StructuralVariantFlags.sa.end, sv.end)
        - func.greatest(StructuralVariantFlags.sa.start, sv.start)
        + 1
    )
    query = (
        select(
            [
                StructuralVariantFlags.sa.sodar_uuid.label("flags_uuid"),
                func.least(
                    cast(term_overlap, Float) / func.greatest((sv.end - sv.start + 1), 1),
                    cast(term_overlap, Float)
                    / func.greatest(
                        (StructuralVariantFlags.sa.end - StructuralVariantFlags.sa.start + 1), 1
                    ),
                ).label("reciprocal_overlap"),
            ]
        )
        .select_from(StructuralVariantFlags.sa)
        .where(
            and_(
                StructuralVariantFlags.sa.case_id == case_id,
                StructuralVariantFlags.sa.release == sv.release,
                StructuralVariantFlags.sa.chromosome == sv.chromosome,
                StructuralVariantFlags.sa.bin.in_(
                    select([column("bin")]).select_from(func.overlapping_bins(sv.start - 1, sv.end))
                ),
                StructuralVariantFlags.sa.end >= sv.start,
                StructuralVariantFlags.sa.start <= sv.end,
                StructuralVariantFlags.sa.sv_type == sv.sv_type,
                cast(term_overlap, Float) / func.greatest((sv.end - sv.start + 1), 1) > min_overlap,
                cast(term_overlap, Float)
                / func.greatest(
                    (StructuralVariantFlags.sa.end - StructuralVariantFlags.sa.start + 1), 1
                )
                > min_overlap,
            )
        )
    )
    return sa_engine.execute(query.order_by(query.c.reciprocal_overlap.desc()))
示例#4
0
def process_update_recommendation_scores(payload):
    text_fields = [
        User.hometown,
        User.occupation,
        User.education,
        User.about_me,
        User.my_travels,
        User.things_i_like,
        User.about_place,
        User.additional_information,
        User.pet_details,
        User.kid_details,
        User.housemate_details,
        User.other_host_info,
        User.sleeping_details,
        User.area,
        User.house_rules,
    ]
    home_fields = [User.about_place, User.other_host_info, User.sleeping_details, User.area, User.house_rules]

    def poor_man_gaussian():
        """
        Produces an approximatley std normal random variate
        """
        trials = 5
        return (sum([func.random() for _ in range(trials)]) - trials / 2) / sqrt(trials / 12)

    def int_(stmt):
        return func.coalesce(cast(stmt, Integer), 0)

    def float_(stmt):
        return func.coalesce(cast(stmt, Float), 0.0)

    with session_scope() as session:
        # profile
        profile_text = ""
        for field in text_fields:
            profile_text += func.coalesce(field, "")
        text_length = func.length(profile_text)
        home_text = ""
        for field in home_fields:
            home_text += func.coalesce(field, "")
        home_length = func.length(home_text)

        has_text = int_(text_length > 500)
        long_text = int_(text_length > 2000)
        has_pic = int_(User.avatar_key != None)
        can_host = int_(User.hosting_status == HostingStatus.can_host)
        cant_host = int_(User.hosting_status == HostingStatus.cant_host)
        filled_home = int_(User.last_minute != None) * int_(home_length > 200)
        profile_points = 2 * has_text + 3 * long_text + 2 * has_pic + 3 * can_host + 2 * filled_home - 5 * cant_host

        # references
        left_ref_expr = int_(1).label("left_reference")
        left_refs_subquery = (
            select(Reference.from_user_id.label("user_id"), left_ref_expr).group_by(Reference.from_user_id).subquery()
        )
        left_reference = int_(left_refs_subquery.c.left_reference)
        has_reference_expr = int_(func.count(Reference.id) >= 1).label("has_reference")
        ref_count_expr = int_(func.count(Reference.id)).label("ref_count")
        ref_avg_expr = func.avg(1.4 * (Reference.rating - 0.3)).label("ref_avg")
        has_multiple_types_expr = int_(func.count(distinct(Reference.reference_type)) >= 2).label("has_multiple_types")
        has_bad_ref_expr = int_(func.sum(int_((Reference.rating <= 0.2) | (~Reference.was_appropriate))) >= 1).label(
            "has_bad_ref"
        )
        received_ref_subquery = (
            select(
                Reference.to_user_id.label("user_id"),
                has_reference_expr,
                has_multiple_types_expr,
                has_bad_ref_expr,
                ref_count_expr,
                ref_avg_expr,
            )
            .group_by(Reference.to_user_id)
            .subquery()
        )
        has_multiple_types = int_(received_ref_subquery.c.has_multiple_types)
        has_reference = int_(received_ref_subquery.c.has_reference)
        has_bad_reference = int_(received_ref_subquery.c.has_bad_ref)
        rating_score = float_(
            received_ref_subquery.c.ref_avg
            * (
                2 * func.least(received_ref_subquery.c.ref_count, 5)
                + func.greatest(received_ref_subquery.c.ref_count - 5, 0)
            )
        )
        ref_score = 2 * has_reference + has_multiple_types + left_reference - 5 * has_bad_reference + rating_score

        # activeness
        recently_active = int_(User.last_active >= now() - timedelta(days=180))
        very_recently_active = int_(User.last_active >= now() - timedelta(days=14))
        recently_messaged = int_(func.max(Message.time) > now() - timedelta(days=14))
        messaged_lots = int_(func.count(Message.id) > 5)
        messaging_points_subquery = (recently_messaged + messaged_lots).label("messaging_points")
        messaging_subquery = (
            select(Message.author_id.label("user_id"), messaging_points_subquery)
            .where(Message.message_type == MessageType.text)
            .group_by(Message.author_id)
            .subquery()
        )
        activeness_points = recently_active + 2 * very_recently_active + int_(messaging_subquery.c.messaging_points)

        # verification
        phone_verified = int_(User.phone_is_verified)
        cb_subquery = (
            select(ClusterSubscription.user_id.label("user_id"), func.min(Cluster.parent_node_id).label("min_node_id"))
            .join(Cluster, Cluster.id == ClusterSubscription.cluster_id)
            .where(ClusterSubscription.role == ClusterRole.admin)
            .where(Cluster.is_official_cluster)
            .group_by(ClusterSubscription.user_id)
            .subquery()
        )
        min_node_id = cb_subquery.c.min_node_id
        cb = int_(min_node_id >= 1)
        f = int_(User.id <= 2)
        wcb = int_(min_node_id == 1)
        verification_points = 0.0 + 100 * f + 10 * wcb + 5 * cb

        # response rate
        t = (
            select(Message.conversation_id, Message.time)
            .where(Message.message_type == MessageType.chat_created)
            .subquery()
        )
        s = (
            select(Message.conversation_id, Message.author_id, func.min(Message.time).label("time"))
            .group_by(Message.conversation_id, Message.author_id)
            .subquery()
        )
        hr_subquery = (
            select(
                HostRequest.host_user_id.label("user_id"),
                func.avg(s.c.time - t.c.time).label("avg_response_time"),
                func.count(t.c.time).label("received"),
                func.count(s.c.time).label("responded"),
                float_(
                    extract(
                        "epoch",
                        percentile_disc(0.33).within_group(func.coalesce(s.c.time - t.c.time, timedelta(days=1000))),
                    )
                    / 60.0
                ).label("response_time_33p"),
                float_(
                    extract(
                        "epoch",
                        percentile_disc(0.66).within_group(func.coalesce(s.c.time - t.c.time, timedelta(days=1000))),
                    )
                    / 60.0
                ).label("response_time_66p"),
            )
            .join(t, t.c.conversation_id == HostRequest.conversation_id)
            .outerjoin(
                s, and_(s.c.conversation_id == HostRequest.conversation_id, s.c.author_id == HostRequest.host_user_id)
            )
            .group_by(HostRequest.host_user_id)
            .subquery()
        )
        avg_response_time = hr_subquery.c.avg_response_time
        avg_response_time_hr = float_(extract("epoch", avg_response_time) / 60.0)
        received = hr_subquery.c.received
        responded = hr_subquery.c.responded
        response_time_33p = hr_subquery.c.response_time_33p
        response_time_66p = hr_subquery.c.response_time_66p
        response_rate = float_(responded / (1.0 * func.greatest(received, 1)))
        # be careful with nulls
        response_rate_points = -10 * int_(response_time_33p > 60 * 48.0) + 5 * int_(response_time_66p < 60 * 48.0)

        recommendation_score = (
            profile_points
            + ref_score
            + activeness_points
            + verification_points
            + response_rate_points
            + 2 * poor_man_gaussian()
        )

        scores = (
            select(User.id.label("user_id"), recommendation_score.label("score"))
            .outerjoin(messaging_subquery, messaging_subquery.c.user_id == User.id)
            .outerjoin(left_refs_subquery, left_refs_subquery.c.user_id == User.id)
            .outerjoin(received_ref_subquery, received_ref_subquery.c.user_id == User.id)
            .outerjoin(cb_subquery, cb_subquery.c.user_id == User.id)
            .outerjoin(hr_subquery, hr_subquery.c.user_id == User.id)
        ).subquery()

        session.execute(
            User.__table__.update().values(recommendation_score=scores.c.score).where(User.id == scores.c.user_id)
        )

    logger.info("Updated recommendation scores")
示例#5
0
 def get_query_column(self, entity):
     return func.least(*(getattr(entity, c) for c in self.entity_columns))
示例#6
0
def query_work_day_stats(
    company_id,
    start_date=None,
    end_date=None,
    first=None,
    after=None,
    tzname="Europe/Paris",
):
    tz = gettz(tzname)
    if after:
        max_time, user_id_ = parse_datetime_plus_id_cursor(after)
        max_date = max_time.date()
        end_date = min(max_date, end_date) if end_date else max_date

    query = (Activity.query.join(Mission).join(
        Expenditure,
        and_(
            Activity.user_id == Expenditure.user_id,
            Activity.mission_id == Expenditure.mission_id,
        ),
        isouter=True,
    ).with_entities(
        Activity.id,
        Activity.user_id,
        Activity.mission_id,
        Mission.name,
        Activity.start_time,
        Activity.end_time,
        Activity.type,
        Expenditure.id.label("expenditure_id"),
        Expenditure.type.label("expenditure_type"),
        func.generate_series(
            func.date_trunc(
                "day",
                func.timezone(
                    tzname,
                    func.timezone("UTC", Activity.start_time),
                ),
            ),
            func.timezone(
                tzname,
                func.coalesce(
                    func.timezone("UTC", Activity.end_time),
                    func.now(),
                ),
            ),
            "1 day",
        ).label("day"),
    ).filter(
        Mission.company_id == company_id,
        ~Activity.is_dismissed,
        Activity.start_time != Activity.end_time,
    ))

    query = _apply_time_range_filters(
        query,
        to_datetime(start_date, tz_for_date=tz),
        to_datetime(end_date,
                    tz_for_date=tz,
                    convert_dates_to_end_of_day_times=True),
    )

    has_next_page = False
    if first:
        activity_first = max(first * 5, 200)
        query = query.order_by(desc("day"), desc(
            Activity.user_id)).limit(activity_first + 1)
        has_next_page = query.count() > activity_first

    query = query.subquery()

    query = (db.session.query(query).group_by(
        query.c.user_id, query.c.day, query.c.mission_id,
        query.c.name).with_entities(
            query.c.user_id.label("user_id"),
            query.c.day,
            func.timezone("UTC",
                          func.timezone(tzname,
                                        query.c.day)).label("utc_day_start"),
            query.c.mission_id.label("mission_id"),
            query.c.name.label("mission_name"),
            func.min(
                func.greatest(
                    query.c.start_time,
                    func.timezone("UTC", func.timezone(tzname, query.c.day)),
                )).label("start_time"),
            func.max(
                func.least(
                    func.timezone(
                        "UTC",
                        func.timezone(
                            tzname,
                            query.c.day + func.cast("1 day", Interval)),
                    ),
                    func.coalesce(query.c.end_time, func.now()),
                )).label("end_time"),
            func.bool_or(
                and_(
                    query.c.end_time.is_(None),
                    query.c.day == func.current_date(),
                )).label("is_running"),
            *[
                func.sum(
                    case(
                        [(
                            query.c.type == a_type.value,
                            extract(
                                "epoch",
                                func.least(
                                    func.timezone(
                                        "UTC",
                                        func.timezone(
                                            tzname,
                                            query.c.day +
                                            func.cast("1 day", Interval),
                                        ),
                                    ),
                                    func.coalesce(query.c.end_time,
                                                  func.now()),
                                ) - func.greatest(
                                    query.c.start_time,
                                    func.timezone(
                                        "UTC",
                                        func.timezone(tzname, query.c.day),
                                    ),
                                ),
                            ),
                        )],
                        else_=0,
                    )).label(f"{a_type.value}_duration")
                for a_type in ActivityType
            ],
            func.greatest(func.count(distinct(query.c.expenditure_id)),
                          1).label("n_exp_dups"),
            func.count(distinct(query.c.id)).label("n_act_dups"),
            *[
                func.sum(
                    case(
                        [(query.c.expenditure_type == e_type.value, 1)],
                        else_=0,
                    )).label(f"n_{e_type.value}_expenditures")
                for e_type in ExpenditureType
            ],
        ).subquery())

    query = (db.session.query(query).group_by(
        query.c.user_id, query.c.day).with_entities(
            query.c.user_id.label("user_id"),
            query.c.day,
            func.array_agg(distinct(
                query.c.mission_name)).label("mission_names"),
            func.min(query.c.start_time).label("start_time"),
            func.max(query.c.end_time).label("end_time"),
            func.bool_or(query.c.is_running).label("is_running"),
            *[
                func.sum(
                    getattr(query.c, f"{a_type.value}_duration") /
                    query.c.n_exp_dups).cast(Integer).label(
                        f"{a_type.value}_duration") for a_type in ActivityType
            ],
            *[
                func.sum(
                    getattr(query.c, f"n_{e_type.value}_expenditures") /
                    query.c.n_act_dups).cast(Integer).label(
                        f"n_{e_type.value}_expenditures")
                for e_type in ExpenditureType
            ],
        ).order_by(desc("day"), desc("user_id")).subquery())

    query = db.session.query(query).with_entities(
        *query.c,
        extract("epoch", query.c.end_time -
                query.c.start_time).label("service_duration"),
        reduce(
            lambda a, b: a + b,
            [
                getattr(query.c, f"{a_type.value}_duration")
                for a_type in ActivityType
            ],
        ).label("total_work_duration"),
    )

    results = query.all()
    if after:
        results = [
            r for r in results if r.day.date() < max_date or (
                r.day.date() == max_date and r.user_id < user_id_)
        ]

    if first:
        if has_next_page:
            # The last work day may be incomplete because we didn't fetch all the activities => remove it
            results = results[:-1]
        if len(results) > first:
            results = results[:first]
            has_next_page = True

    return results, has_next_page
示例#7
0
 def get_query_column(self, entity):
     return func.least(*(getattr(entity, c) for c in self.entity_columns))