def _build_subquery(self):
    """Build the lateral subquery computing the distance of each structural
    variant to the center of overlapping TAD boundary intervals."""
    if not self.kwargs.get("tad_set_uuid"):
        return None
    set_pk = TadSet.objects.get(sodar_uuid=self.kwargs["tad_set_uuid"]).pk
    # Center of the TAD boundary interval (one-based, inclusive coordinates).
    term_interval_center = TadBoundaryInterval.sa.start + (
        TadBoundaryInterval.sa.end - TadBoundaryInterval.sa.start + 1
    ) / 2
    term_overlaps = and_(
        term_interval_center >= StructuralVariant.sa.start,
        term_interval_center <= StructuralVariant.sa.end,
    )
    fields = [
        func.coalesce(
            func.min(
                case(
                    # Distance is 0 if the center lies within the SV, otherwise
                    # the distance to the closer of the two SV breakpoints.
                    [(term_overlaps, 0)],
                    else_=func.least(
                        func.abs(term_interval_center - StructuralVariant.sa.start),
                        func.abs(term_interval_center - StructuralVariant.sa.end),
                    ),
                )
            ),
            -1,
        ).label("distance_to_center")
    ]
    return (
        select(fields)
        .select_from(TadBoundaryInterval.sa)
        .where(
            and_(
                TadBoundaryInterval.sa.tad_set_id == set_pk,
                overlaps(TadBoundaryInterval, StructuralVariant),
            )
        )
        .alias("subquery_tad_boundaries_inner")
        .lateral("subquery_tad_boundaries_outer")
    )
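# A minimal, self-contained sketch of the LATERAL pattern used above, with
# throwaway Core tables (table and column names here are illustrative, not
# the real schema).
from sqlalchemy import Column, Integer, MetaData, Table, and_, func, select, true

metadata = MetaData()
sv = Table("sv", metadata, Column("id", Integer), Column("start", Integer), Column("end", Integer))
tad = Table("tad", metadata, Column("start", Integer), Column("end", Integer))

center = tad.c.start + (tad.c.end - tad.c.start + 1) / 2
lat = (
    select([func.min(func.abs(center - sv.c.start)).label("distance_to_center")])
    .where(and_(tad.c.end >= sv.c.start, tad.c.start <= sv.c.end))
    .lateral("tad_lat")
)
# true() as the ON clause keeps every sv row even when no boundary overlaps.
stmt = select([sv.c.id, lat.c.distance_to_center]).select_from(sv.outerjoin(lat, true()))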
def overlaps(lhs, rhs, min_overlap=None, rhs_start=None, rhs_end=None):
    """Returns term of ``lhs`` overlapping with ``rhs`` based on the start/end fields."""
    if rhs_start is None:
        rhs_start = rhs.sa.start
    if rhs_end is None:
        rhs_end = rhs.sa.end
    if min_overlap is None:
        return and_(
            lhs.sa.release == rhs.sa.release,
            lhs.sa.chromosome == rhs.sa.chromosome,
            lhs.sa.bin.in_(
                select([column("bin")]).select_from(func.overlapping_bins(rhs_start - 1, rhs_end))
            ),
            lhs.sa.end >= rhs_start,
            lhs.sa.start <= rhs_end,
        )
    else:
        # Size of the overlap, used for the reciprocal overlap requirement below.
        term_overlap = func.least(lhs.sa.end, rhs_end) - func.greatest(lhs.sa.start, rhs_start) + 1
        return and_(
            lhs.sa.release == rhs.sa.release,
            lhs.sa.chromosome == rhs.sa.chromosome,
            rhs.sa.bin.in_(
                select([column("bin")]).select_from(
                    func.overlapping_bins(lhs.sa.start - 1, lhs.sa.end)
                )
            ),
            lhs.sa.end >= rhs_start,
            lhs.sa.start <= rhs_end,
            cast(term_overlap, Float) / func.greatest((rhs_end - rhs_start + 1), 1) > min_overlap,
            cast(term_overlap, Float) / func.greatest((lhs.sa.end - lhs.sa.start + 1), 1) > min_overlap,
        )
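# Hedged usage sketch: ``overlaps`` yields a boolean clause, so it can be
# dropped into any WHERE that has both tables in scope. The no-``min_overlap``
# branch is what ``_build_subquery`` uses above; with ``min_overlap=0.5`` it
# additionally requires 50% reciprocal overlap on both intervals.
stmt = (
    select([TadBoundaryInterval.sa.tad_set_id])
    .select_from(TadBoundaryInterval.sa)
    .where(overlaps(TadBoundaryInterval, StructuralVariant, min_overlap=0.5))
)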
def best_matching_flags(sa_engine, case_id, sv_uuid, min_overlap=0.95):
    """Find best matching ``StructuralVariantFlags`` object for the given case and SV.

    Returns ``None`` if none could be found.
    """
    sv = StructuralVariant.objects.get(case_id=case_id, sv_uuid=sv_uuid)
    term_overlap = (
        func.least(StructuralVariantFlags.sa.end, sv.end)
        - func.greatest(StructuralVariantFlags.sa.start, sv.start)
        + 1
    )
    query = (
        select(
            [
                StructuralVariantFlags.sa.sodar_uuid.label("flags_uuid"),
                func.least(
                    cast(term_overlap, Float) / func.greatest((sv.end - sv.start + 1), 1),
                    cast(term_overlap, Float)
                    / func.greatest(
                        (StructuralVariantFlags.sa.end - StructuralVariantFlags.sa.start + 1), 1
                    ),
                ).label("reciprocal_overlap"),
            ]
        )
        .select_from(StructuralVariantFlags.sa)
        .where(
            and_(
                StructuralVariantFlags.sa.case_id == case_id,
                StructuralVariantFlags.sa.release == sv.release,
                StructuralVariantFlags.sa.chromosome == sv.chromosome,
                StructuralVariantFlags.sa.bin.in_(
                    select([column("bin")]).select_from(func.overlapping_bins(sv.start - 1, sv.end))
                ),
                StructuralVariantFlags.sa.end >= sv.start,
                StructuralVariantFlags.sa.start <= sv.end,
                StructuralVariantFlags.sa.sv_type == sv.sv_type,
                cast(term_overlap, Float) / func.greatest((sv.end - sv.start + 1), 1) > min_overlap,
                cast(term_overlap, Float)
                / func.greatest(
                    (StructuralVariantFlags.sa.end - StructuralVariantFlags.sa.start + 1), 1
                )
                > min_overlap,
            )
        )
    )
    return sa_engine.execute(query.order_by(query.c.reciprocal_overlap.desc()))
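# Sketch of consuming the result proxy returned above; ``.first()`` yields the
# best match (highest reciprocal overlap) or ``None``, matching the docstring.
# ``sa_engine``, ``case_id`` and ``sv_uuid`` are assumed from the caller.
row = best_matching_flags(sa_engine, case_id, sv_uuid).first()
if row is not None:
    flags_uuid, reciprocal_overlap = row.flags_uuid, row.reciprocal_overlap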
def process_update_recommendation_scores(payload):
    text_fields = [
        User.hometown,
        User.occupation,
        User.education,
        User.about_me,
        User.my_travels,
        User.things_i_like,
        User.about_place,
        User.additional_information,
        User.pet_details,
        User.kid_details,
        User.housemate_details,
        User.other_host_info,
        User.sleeping_details,
        User.area,
        User.house_rules,
    ]
    home_fields = [
        User.about_place,
        User.other_host_info,
        User.sleeping_details,
        User.area,
        User.house_rules,
    ]

    def poor_man_gaussian():
        """Produces an approximately std normal random variate"""
        trials = 5
        return (sum([func.random() for _ in range(trials)]) - trials / 2) / sqrt(trials / 12)

    def int_(stmt):
        return func.coalesce(cast(stmt, Integer), 0)

    def float_(stmt):
        return func.coalesce(cast(stmt, Float), 0.0)

    with session_scope() as session:
        # profile
        profile_text = ""
        for field in text_fields:
            profile_text += func.coalesce(field, "")
        text_length = func.length(profile_text)
        home_text = ""
        for field in home_fields:
            home_text += func.coalesce(field, "")
        home_length = func.length(home_text)

        has_text = int_(text_length > 500)
        long_text = int_(text_length > 2000)
        has_pic = int_(User.avatar_key != None)
        can_host = int_(User.hosting_status == HostingStatus.can_host)
        cant_host = int_(User.hosting_status == HostingStatus.cant_host)
        filled_home = int_(User.last_minute != None) * int_(home_length > 200)
        profile_points = 2 * has_text + 3 * long_text + 2 * has_pic + 3 * can_host + 2 * filled_home - 5 * cant_host

        # references
        left_ref_expr = int_(1).label("left_reference")
        left_refs_subquery = (
            select(Reference.from_user_id.label("user_id"), left_ref_expr).group_by(Reference.from_user_id).subquery()
        )
        left_reference = int_(left_refs_subquery.c.left_reference)
        has_reference_expr = int_(func.count(Reference.id) >= 1).label("has_reference")
        ref_count_expr = int_(func.count(Reference.id)).label("ref_count")
        ref_avg_expr = func.avg(1.4 * (Reference.rating - 0.3)).label("ref_avg")
        has_multiple_types_expr = int_(func.count(distinct(Reference.reference_type)) >= 2).label("has_multiple_types")
        has_bad_ref_expr = int_(func.sum(int_((Reference.rating <= 0.2) | (~Reference.was_appropriate))) >= 1).label(
            "has_bad_ref"
        )
        received_ref_subquery = (
            select(
                Reference.to_user_id.label("user_id"),
                has_reference_expr,
                has_multiple_types_expr,
                has_bad_ref_expr,
                ref_count_expr,
                ref_avg_expr,
            )
            .group_by(Reference.to_user_id)
            .subquery()
        )
        has_multiple_types = int_(received_ref_subquery.c.has_multiple_types)
        has_reference = int_(received_ref_subquery.c.has_reference)
        has_bad_reference = int_(received_ref_subquery.c.has_bad_ref)
        rating_score = float_(
            received_ref_subquery.c.ref_avg
            * (
                2 * func.least(received_ref_subquery.c.ref_count, 5)
                + func.greatest(received_ref_subquery.c.ref_count - 5, 0)
            )
        )
        ref_score = 2 * has_reference + has_multiple_types + left_reference - 5 * has_bad_reference + rating_score

        # activeness
        recently_active = int_(User.last_active >= now() - timedelta(days=180))
        very_recently_active = int_(User.last_active >= now() - timedelta(days=14))
        recently_messaged = int_(func.max(Message.time) > now() - timedelta(days=14))
        messaged_lots = int_(func.count(Message.id) > 5)
        messaging_points_subquery = (recently_messaged + messaged_lots).label("messaging_points")
        messaging_subquery = (
            select(Message.author_id.label("user_id"), messaging_points_subquery)
            .where(Message.message_type == MessageType.text)
            .group_by(Message.author_id)
            .subquery()
        )
        activeness_points = recently_active + 2 * very_recently_active + int_(messaging_subquery.c.messaging_points)

        # verification
        phone_verified = int_(User.phone_is_verified)
        cb_subquery = (
            select(ClusterSubscription.user_id.label("user_id"), func.min(Cluster.parent_node_id).label("min_node_id"))
            .join(Cluster, Cluster.id == ClusterSubscription.cluster_id)
            .where(ClusterSubscription.role == ClusterRole.admin)
            .where(Cluster.is_official_cluster)
            .group_by(ClusterSubscription.user_id)
            .subquery()
        )
        min_node_id = cb_subquery.c.min_node_id
        cb = int_(min_node_id >= 1)
        f = int_(User.id <= 2)
        wcb = int_(min_node_id == 1)
        verification_points = 0.0 + 100 * f + 10 * wcb + 5 * cb

        # response rate
        t = (
            select(Message.conversation_id, Message.time)
            .where(Message.message_type == MessageType.chat_created)
            .subquery()
        )
        s = (
            select(Message.conversation_id, Message.author_id, func.min(Message.time).label("time"))
            .group_by(Message.conversation_id, Message.author_id)
            .subquery()
        )
        hr_subquery = (
            select(
                HostRequest.host_user_id.label("user_id"),
                func.avg(s.c.time - t.c.time).label("avg_response_time"),
                func.count(t.c.time).label("received"),
                func.count(s.c.time).label("responded"),
                float_(
                    extract(
                        "epoch",
                        percentile_disc(0.33).within_group(func.coalesce(s.c.time - t.c.time, timedelta(days=1000))),
                    )
                    / 60.0
                ).label("response_time_33p"),
                float_(
                    extract(
                        "epoch",
                        percentile_disc(0.66).within_group(func.coalesce(s.c.time - t.c.time, timedelta(days=1000))),
                    )
                    / 60.0
                ).label("response_time_66p"),
            )
            .join(t, t.c.conversation_id == HostRequest.conversation_id)
            .outerjoin(
                s, and_(s.c.conversation_id == HostRequest.conversation_id, s.c.author_id == HostRequest.host_user_id)
            )
            .group_by(HostRequest.host_user_id)
            .subquery()
        )
        avg_response_time = hr_subquery.c.avg_response_time
        avg_response_time_hr = float_(extract("epoch", avg_response_time) / 60.0)
        received = hr_subquery.c.received
        responded = hr_subquery.c.responded
        response_time_33p = hr_subquery.c.response_time_33p
        response_time_66p = hr_subquery.c.response_time_66p
        response_rate = float_(responded / (1.0 * func.greatest(received, 1)))
        # be careful with nulls
        response_rate_points = -10 * int_(response_time_33p > 60 * 48.0) + 5 * int_(response_time_66p < 60 * 48.0)

        recommendation_score = (
            profile_points
            + ref_score
            + activeness_points
            + verification_points
            + response_rate_points
            + 2 * poor_man_gaussian()
        )

        scores = (
            select(User.id.label("user_id"), recommendation_score.label("score"))
            .outerjoin(messaging_subquery, messaging_subquery.c.user_id == User.id)
            .outerjoin(left_refs_subquery, left_refs_subquery.c.user_id == User.id)
            .outerjoin(received_ref_subquery, received_ref_subquery.c.user_id == User.id)
            .outerjoin(cb_subquery, cb_subquery.c.user_id == User.id)
            .outerjoin(hr_subquery, hr_subquery.c.user_id == User.id)
        ).subquery()

        session.execute(
            User.__table__.update().values(recommendation_score=scores.c.score).where(User.id == scores.c.user_id)
        )

    logger.info("Updated recommendation scores")
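# Plain-Python mirror of the profile_points formula above, for intuition only
# (made-up indicator values, not database state): a user with a short profile,
# a photo, can-host status and a filled home section scores 2 + 2 + 3 + 2 = 9.
has_text, long_text, has_pic, can_host, filled_home, cant_host = 1, 0, 1, 1, 1, 0
assert 2 * has_text + 3 * long_text + 2 * has_pic + 3 * can_host + 2 * filled_home - 5 * cant_host == 9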
def get_query_column(self, entity):
    return func.least(*(getattr(entity, c) for c in self.entity_columns))
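# Hypothetical sketch of the kind of filter class the method above belongs to;
# ``LeastColumnFilter`` and the column names are illustrative, not from the source.
class LeastColumnFilter:
    entity_columns = ("start_time", "end_time")

    def get_query_column(self, entity):
        # Row-wise minimum across the configured columns, e.g.
        # LEAST(activity.start_time, activity.end_time).
        return func.least(*(getattr(entity, c) for c in self.entity_columns))

# Usage: order a query by the earliest of the configured timestamps.
# query = session.query(Activity).order_by(LeastColumnFilter().get_query_column(Activity))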
def query_work_day_stats(
    company_id,
    start_date=None,
    end_date=None,
    first=None,
    after=None,
    tzname="Europe/Paris",
):
    tz = gettz(tzname)
    if after:
        max_time, user_id_ = parse_datetime_plus_id_cursor(after)
        max_date = max_time.date()
        end_date = min(max_date, end_date) if end_date else max_date

    query = (
        Activity.query.join(Mission)
        .join(
            Expenditure,
            and_(
                Activity.user_id == Expenditure.user_id,
                Activity.mission_id == Expenditure.mission_id,
            ),
            isouter=True,
        )
        .with_entities(
            Activity.id,
            Activity.user_id,
            Activity.mission_id,
            Mission.name,
            Activity.start_time,
            Activity.end_time,
            Activity.type,
            Expenditure.id.label("expenditure_id"),
            Expenditure.type.label("expenditure_type"),
            func.generate_series(
                func.date_trunc(
                    "day",
                    func.timezone(tzname, func.timezone("UTC", Activity.start_time)),
                ),
                func.timezone(
                    tzname,
                    func.coalesce(func.timezone("UTC", Activity.end_time), func.now()),
                ),
                "1 day",
            ).label("day"),
        )
        .filter(
            Mission.company_id == company_id,
            ~Activity.is_dismissed,
            Activity.start_time != Activity.end_time,
        )
    )

    query = _apply_time_range_filters(
        query,
        to_datetime(start_date, tz_for_date=tz),
        to_datetime(end_date, tz_for_date=tz, convert_dates_to_end_of_day_times=True),
    )

    has_next_page = False
    if first:
        activity_first = max(first * 5, 200)
        query = query.order_by(desc("day"), desc(Activity.user_id)).limit(activity_first + 1)
        has_next_page = query.count() > activity_first

    query = query.subquery()

    query = (
        db.session.query(query)
        .group_by(query.c.user_id, query.c.day, query.c.mission_id, query.c.name)
        .with_entities(
            query.c.user_id.label("user_id"),
            query.c.day,
            func.timezone("UTC", func.timezone(tzname, query.c.day)).label("utc_day_start"),
            query.c.mission_id.label("mission_id"),
            query.c.name.label("mission_name"),
            func.min(
                func.greatest(
                    query.c.start_time,
                    func.timezone("UTC", func.timezone(tzname, query.c.day)),
                )
            ).label("start_time"),
            func.max(
                func.least(
                    func.timezone(
                        "UTC",
                        func.timezone(tzname, query.c.day + func.cast("1 day", Interval)),
                    ),
                    func.coalesce(query.c.end_time, func.now()),
                )
            ).label("end_time"),
            func.bool_or(
                and_(
                    query.c.end_time.is_(None),
                    query.c.day == func.current_date(),
                )
            ).label("is_running"),
            *[
                func.sum(
                    case(
                        [
                            (
                                query.c.type == a_type.value,
                                extract(
                                    "epoch",
                                    func.least(
                                        func.timezone(
                                            "UTC",
                                            func.timezone(
                                                tzname,
                                                query.c.day + func.cast("1 day", Interval),
                                            ),
                                        ),
                                        func.coalesce(query.c.end_time, func.now()),
                                    )
                                    - func.greatest(
                                        query.c.start_time,
                                        func.timezone("UTC", func.timezone(tzname, query.c.day)),
                                    ),
                                ),
                            )
                        ],
                        else_=0,
                    )
                ).label(f"{a_type.value}_duration")
                for a_type in ActivityType
            ],
            func.greatest(func.count(distinct(query.c.expenditure_id)), 1).label("n_exp_dups"),
            func.count(distinct(query.c.id)).label("n_act_dups"),
            *[
                func.sum(
                    case(
                        [(query.c.expenditure_type == e_type.value, 1)],
                        else_=0,
                    )
                ).label(f"n_{e_type.value}_expenditures")
                for e_type in ExpenditureType
            ],
        )
        .subquery()
    )

    query = (
        db.session.query(query)
        .group_by(query.c.user_id, query.c.day)
        .with_entities(
            query.c.user_id.label("user_id"),
            query.c.day,
            func.array_agg(distinct(query.c.mission_name)).label("mission_names"),
            func.min(query.c.start_time).label("start_time"),
            func.max(query.c.end_time).label("end_time"),
            func.bool_or(query.c.is_running).label("is_running"),
            *[
                func.sum(getattr(query.c, f"{a_type.value}_duration") / query.c.n_exp_dups)
                .cast(Integer)
                .label(f"{a_type.value}_duration")
                for a_type in ActivityType
            ],
            *[
                func.sum(getattr(query.c, f"n_{e_type.value}_expenditures") / query.c.n_act_dups)
                .cast(Integer)
                .label(f"n_{e_type.value}_expenditures")
                for e_type in ExpenditureType
            ],
        )
        .order_by(desc("day"), desc("user_id"))
        .subquery()
    )

    query = db.session.query(query).with_entities(
        *query.c,
        extract("epoch", query.c.end_time - query.c.start_time).label("service_duration"),
        reduce(
            lambda a, b: a + b,
            [getattr(query.c, f"{a_type.value}_duration") for a_type in ActivityType],
        ).label("total_work_duration"),
    )

    results = query.all()
    if after:
        results = [
            r
            for r in results
            if r.day.date() < max_date or (r.day.date() == max_date and r.user_id < user_id_)
        ]

    if first:
        if has_next_page:
            # The last work day may be incomplete because we didn't fetch all the activities => remove it
            results = results[:-1]
        if len(results) > first:
            results = results[:first]
            has_next_page = True

    return results, has_next_page
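# Hedged usage sketch: fetch the first page of a company's per-user work-day
# stats, 25 rows at a time; company_id and the date range are made-up values.
from datetime import date

results, has_next_page = query_work_day_stats(
    company_id=42,
    start_date=date(2023, 1, 1),
    end_date=date(2023, 1, 31),
    first=25,
)
for r in results:
    print(r.user_id, r.day, r.service_duration, r.total_work_duration)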