def test_foreignkey_reverse(self):
    books = list(Book.objects.all())
    with self.assertNumQueries(1):
        prefetch_related_objects(books, 'first_time_authors')

    with self.assertNumQueries(0):
        [list(book.first_time_authors.all()) for book in books]

def test_prefetch_object_to_attr(self):
    book1 = Book.objects.get(id=self.book1.id)
    with self.assertNumQueries(1):
        prefetch_related_objects([book1], Prefetch('authors', to_attr='the_authors'))

    with self.assertNumQueries(0):
        self.assertCountEqual(book1.the_authors, [self.author1, self.author2, self.author3])

def test_m2m_forward(self):
    book1 = Book.objects.get(id=self.book1.id)
    with self.assertNumQueries(1):
        prefetch_related_objects([book1], 'authors')

    with self.assertNumQueries(0):
        self.assertCountEqual(book1.authors.all(), [self.author1, self.author2, self.author3])

def test_foreignkey_forward(self):
    authors = list(Author.objects.all())
    with self.assertNumQueries(1):
        prefetch_related_objects(authors, 'first_book')

    with self.assertNumQueries(0):
        [author.first_book for author in authors]

def test_m2m_reverse(self):
    author1 = Author.objects.get(id=self.author1.id)
    with self.assertNumQueries(1):
        prefetch_related_objects([author1], 'books')

    with self.assertNumQueries(0):
        self.assertCountEqual(author1.books.all(), [self.book1, self.book2])

def test_prefetch_object(self):
    book1 = Book.objects.get(id=self.book1.id)
    with self.assertNumQueries(1):
        prefetch_related_objects([book1], Prefetch('authors'))

    with self.assertNumQueries(0):
        self.assertEqual(set(book1.authors.all()), {self.author1, self.author2, self.author3})

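The assertNumQueries tests above (and the related tests further down) appear to come from Django's own prefetch_related test suite. A minimal sketch of the models they assume, with field and related names inferred from the lookups used in the tests; treat it as an approximation, not the exact test models:

# Approximate models implied by the tests above (related names inferred from the lookups).
from django.db import models

class Author(models.Model):
    name = models.CharField(max_length=50)
    # Forward FK used by test_foreignkey_forward; its reverse accessor is
    # Book.first_time_authors, used by test_foreignkey_reverse.
    first_book = models.ForeignKey('Book', on_delete=models.CASCADE, related_name='first_time_authors')

class Book(models.Model):
    title = models.CharField(max_length=255)
    # Forward m2m used by test_m2m_forward; reverse accessor Author.books.
    authors = models.ManyToManyField(Author, related_name='books')

class Reader(models.Model):
    name = models.CharField(max_length=50)
    # Reverse accessor Book.read_by, used by test_m2m_then_m2m below.
    books_read = models.ManyToManyField(Book, related_name='read_by')
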
def site(request):
    # type: (django.http.request.HttpRequest) -> dict
    """Add site settings to the context under the 'site' key."""
    site = get_current_site(request)
    prefetch_related_objects([site], 'settings__translations')
    return {'site': site}

def _get_json_for_individuals(individuals, user=None, project_guid=None, family_guid=None,
                              add_sample_guids_field=False, family_fields=None):
    """Returns a JSON representation for the given list of Individuals.

    Args:
        individuals (array): array of django models for the individual.
        user (object): Django User object for determining whether to include restricted/internal-only fields
        project_guid (string): An optional field to use as the projectGuid instead of querying the DB
        family_guid (string): An optional field to use as the familyGuid instead of querying the DB
        add_sample_guids_field (boolean): A flag to indicate whether sample ids should be added
    Returns:
        array: array of json objects
    """

    def _get_case_review_status_modified_by(modified_by):
        return modified_by.email or modified_by.username if hasattr(modified_by, 'email') else modified_by

    def _load_phenotips_data(phenotips_data):
        phenotips_json = None
        if phenotips_data:
            try:
                phenotips_json = json.loads(phenotips_data)
            except Exception as e:
                logger.error("Couldn't parse phenotips: {}".format(e))
        return phenotips_json

    def _process_result(result, individual):
        mother = result.pop('mother', None)
        father = result.pop('father', None)
        result.update({
            'caseReviewStatusLastModifiedBy': _get_case_review_status_modified_by(result.get('caseReviewStatusLastModifiedBy')),
            'phenotipsData': _load_phenotips_data(result['phenotipsData']),
            'maternalGuid': mother.guid if mother else None,
            'paternalGuid': father.guid if father else None,
            'maternalId': mother.individual_id if mother else None,
            'paternalId': father.individual_id if father else None,
            'displayName': result['displayName'] or result['individualId'],
        })
        if add_sample_guids_field:
            result['sampleGuids'] = [s.guid for s in individual.sample_set.all()]

    nested_fields = [
        {'fields': ('family', 'guid'), 'value': family_guid},
        {'fields': ('family', 'project', 'guid'), 'key': 'projectGuid', 'value': project_guid},
    ]
    if family_fields:
        for field in family_fields:
            nested_fields.append({'fields': ('family', field), 'key': _to_camel_case(field)})

    prefetch_related_objects(individuals, 'family')
    prefetch_related_objects(individuals, 'mother')
    prefetch_related_objects(individuals, 'father')
    prefetch_related_objects(individuals, 'case_review_status_last_modified_by')
    if add_sample_guids_field:
        prefetch_related_objects(individuals, 'sample_set')

    return _get_json_for_models(individuals, nested_fields=nested_fields, user=user, process_result=_process_result)

def prefetch_export_runs(queryset_list_or_model):
    prefetch_args = ['job__provider_tasks__provider', 'job__provider_tasks__formats',
                     'provider_tasks__tasks__result', 'provider_tasks__tasks__exceptions']
    if isinstance(queryset_list_or_model, models.query.QuerySet):
        return queryset_list_or_model.select_related('user').prefetch_related(*prefetch_args)
    elif isinstance(queryset_list_or_model, list):
        models.prefetch_related_objects(queryset_list_or_model, *prefetch_args)
    elif isinstance(queryset_list_or_model, ExportRun):
        models.prefetch_related_objects([queryset_list_or_model], *prefetch_args)
    return queryset_list_or_model

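prefetch_export_runs accepts a queryset, a plain list of runs, or a single ExportRun. A hypothetical call site, sketched only to show the three shapes it handles (the surrounding variables are illustrative, not from the source):

# Hypothetical call sites for prefetch_export_runs; the surrounding code is illustrative.
runs_queryset = prefetch_export_runs(ExportRun.objects.all())        # returns a new, optimized queryset
runs_list = prefetch_export_runs(list(ExportRun.objects.all()))      # caches relations on the list items in place
single_run = prefetch_export_runs(ExportRun.objects.first())         # also accepts a single instance (or None)
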
def test_prefetch_queryset(self):
    book1 = Book.objects.get(id=self.book1.id)
    with self.assertNumQueries(1):
        prefetch_related_objects(
            [book1],
            Prefetch('authors', queryset=Author.objects.filter(id__in=[self.author1.id, self.author2.id]))
        )
    with self.assertNumQueries(0):
        self.assertCountEqual(book1.authors.all(), [self.author1, self.author2])

def practice_overview(self, request, pk):
    del request, pk  # not needed
    student = self.get_object()
    prefetch_related_objects(
        [student],
        Prefetch(
            'task_sessions',
            queryset=TaskSession.objects.select_related('task')))
    # -> Same as student = Student.objects.prefetch_related(...).get(pk=pk)
    domain = get_domain()
    overview = get_practice_overview(domain, student)
    serializer = PracticeOverviewSerializer(overview)
    return Response(serializer.data)

def _process_variants(variants, families):
    prefetch_related_objects(families, 'project')
    genes = _saved_variant_genes(variants)
    # TODO add locus lists on the client side (?)
    projects = {family.project for family in families}
    _add_locus_lists(projects, variants, genes)
    saved_variants_by_guid = _get_saved_variants(variants)
    return {
        'searchedVariants': variants,
        'savedVariantsByGuid': saved_variants_by_guid,
        'genesById': genes,
    }

def search(self, page_number: int = 1, page_size: int = 25):
    """
    Runs the search and constructs the paginated results.
    :param page_number: The result page
    :param page_size: The number of items per page
    """
    queryset = Card.objects.filter(self.root_parameter.query()).distinct()
    self.add_sort_param(CardNameSortParam())
    self.add_sort_param(CardColourSortParam())
    self.add_sort_param(CardPowerSortParam())
    queryset = queryset.order_by(
        *[order for sort_param in self.sort_params for order in sort_param.get_sort_list()])
    self.paginator = Paginator(queryset, page_size)
    try:
        self.page = self.paginator.page(page_number)
    except EmptyPage:
        return
    cards = list(self.page)
    prefetch_related_objects(cards, 'printings__printed_languages__physical_cards__ownerships')
    prefetch_related_objects(cards, 'printings__printed_languages__language')
    prefetch_related_objects(cards, 'printings__set')
    prefetch_related_objects(cards, 'printings__rarity')
    preferred_set = self.get_preferred_set()
    self.results = [SearchResult(card, selected_set=preferred_set) for card in cards]

def test_m2m_then_m2m(self):
    """A m2m can be followed through another m2m."""
    authors = list(Author.objects.all())
    with self.assertNumQueries(2):
        prefetch_related_objects(authors, 'books__read_by')

    with self.assertNumQueries(0):
        self.assertEqual(
            [
                [[str(r) for r in b.read_by.all()] for b in a.books.all()]
                for a in authors
            ],
            [
                [['Amy'], ['Belinda']],  # Charlotte - Poems, Jane Eyre
                [['Amy']],               # Anne - Poems
                [['Amy'], []],           # Emily - Poems, Wuthering Heights
                [['Amy', 'Belinda']],    # Jane - Sense and Sensibility
            ]
        )

def get_json_for_analysis_groups(analysis_groups, project_guid=None):
    """Returns a JSON representation of the given list of AnalysisGroups.

    Args:
        analysis_groups (array): array of django models for the AnalysisGroups.
        project_guid (string): An optional field to use as the projectGuid instead of querying the DB
    Returns:
        array: array of json objects
    """

    def _process_result(result, group):
        result.update({
            'familyGuids': [f.guid for f in group.families.only('guid').all()]
        })

    prefetch_related_objects(analysis_groups, 'families')

    nested_fields = [{'fields': ('project', 'guid'), 'value': project_guid}]
    return _get_json_for_models(analysis_groups, nested_fields=nested_fields, process_result=_process_result)

def _get_json_for_families(families, user=None, add_individual_guids_field=False, project_guid=None):
    """Returns a JSON representation of the given Families.

    Args:
        families (array): array of django models representing the family.
        user (object): Django User object for determining whether to include restricted/internal-only fields
        add_individual_guids_field (bool): whether to add an 'individualGuids' field. NOTE: this will require a database query.
        project_guid (string): An optional field to use as the projectGuid instead of querying the DB
    Returns:
        array: json objects
    """

    def _get_pedigree_image_url(pedigree_image):
        if isinstance(pedigree_image, ImageFieldFile):
            try:
                pedigree_image = pedigree_image.url
            except Exception:
                pedigree_image = None
        return os.path.join("/media/", pedigree_image) if pedigree_image else None

    def _process_result(result, family):
        result['analysedBy'] = [{
            'createdBy': {'fullName': ab.created_by.get_full_name(), 'email': ab.created_by.email, 'isStaff': ab.created_by.is_staff},
            'lastModifiedDate': ab.last_modified_date,
        } for ab in family.familyanalysedby_set.all()]
        pedigree_image = _get_pedigree_image_url(result.pop('pedigreeImage'))
        if pedigree_image:
            result['pedigreeImage'] = pedigree_image
        if add_individual_guids_field:
            result['individualGuids'] = [i.guid for i in family.individual_set.all()]
        if not result['displayName']:
            result['displayName'] = result['familyId']

    prefetch_related_objects(families, 'familyanalysedby_set__created_by')
    if add_individual_guids_field:
        prefetch_related_objects(families, 'individual_set')

    nested_fields = [{'fields': ('project', 'guid'), 'value': project_guid}]
    return _get_json_for_models(families, nested_fields=nested_fields, user=user, process_result=_process_result)

def get_json_for_saved_variants(saved_variants, add_tags=False, add_details=False, project=None, user=None, **kwargs):
    """Returns a JSON representation of the given list of SavedVariants.

    Args:
        saved_variants (array): array of django models for the SavedVariants.
    Returns:
        array: array of json objects
    """
    from seqr.views.utils.variant_utils import variant_details

    def _process_result(variant_json, saved_variant):
        if add_tags:
            variant_json.update({
                'tags': [get_json_for_variant_tag(tag) for tag in saved_variant.varianttag_set.all()],
                'functionalData': [get_json_for_variant_functional_data(tag) for tag in saved_variant.variantfunctionaldata_set.all()],
                'notes': [get_json_for_variant_note(tag) for tag in saved_variant.variantnote_set.all()],
            })
        if add_details:
            saved_variant_json = json.loads(saved_variant.saved_variant_json or '{}')
            variant_json.update(variant_details(saved_variant_json, project or saved_variant.project, user, **kwargs))
        variant_json.update({
            'variantId': saved_variant.guid,  # TODO get from json
            'familyGuids': [saved_variant.family.guid],
        })
        return variant_json

    prefetch_related_objects(saved_variants, 'family')
    if not project:
        prefetch_related_objects(saved_variants, 'project')
    if add_tags:
        prefetch_related_objects(saved_variants, 'varianttag_set__variant_tag_type', 'varianttag_set__created_by',
                                 'variantnote_set__created_by', 'variantfunctionaldata_set__created_by')

    return _get_json_for_models(saved_variants, guid_key='variantGuid', process_result=_process_result)

def get_json_for_locus_lists(locus_lists, user, include_genes=False):
    """Returns a JSON representation of the given LocusLists.

    Args:
        locus_lists (array): array of LocusList django models.
    Returns:
        array: json objects
    """

    def _process_result(result, locus_list):
        gene_set = locus_list.locuslistgene_set
        interval_set = locus_list.locuslistinterval_set
        if include_genes:
            intervals = _get_json_for_models(interval_set.all())
            genome_versions = {interval['genomeVersion'] for interval in intervals}
            result.update({
                'items': [{'geneId': gene.gene_id} for gene in gene_set.all()] + intervals,
                'intervalGenomeVersion': genome_versions.pop() if len(genome_versions) == 1 else None,
            })
        result.update({
            'numEntries': gene_set.count() + interval_set.count(),
            'canEdit': user == locus_list.created_by,
        })

    prefetch_related_objects(locus_lists, 'created_by')
    prefetch_related_objects(locus_lists, 'locuslistgene_set')
    prefetch_related_objects(locus_lists, 'locuslistinterval_set')

    return _get_json_for_models(locus_lists, user=user, process_result=_process_result)

def project_versions(request, project_slug):
    """
    Project version list view.

    Shows the available versions and lets the user choose which ones to build.
    """
    project = get_object_or_404(
        Project.objects.protected(request.user),
        slug=project_slug,
    )
    versions = Version.objects.public(
        user=request.user,
        project=project,
        only_active=False,
    )
    active_versions = versions.filter(active=True)
    inactive_versions = versions.filter(active=False)

    # If there's a wiped query string, check the string against the versions
    # list and display a success message. Deleting directories doesn't know how
    # to fail. :)
    wiped = request.GET.get('wipe', '')
    wiped_version = versions.filter(slug=wiped)
    if wiped and wiped_version.count():
        messages.success(request, 'Version wiped: ' + wiped)

    # Optimize project permission checks
    prefetch_related_objects([project], 'users')

    return render(
        request,
        'projects/project_version_list.html',
        {
            'inactive_versions': inactive_versions,
            'active_versions': active_versions,
            'project': project,
        },
    )

def get_access_by_project(
        projects: Sequence[Project], user: User) -> MutableMapping[Project, MutableMapping[str, Any]]:
    request = env.request

    project_teams = list(ProjectTeam.objects.filter(project__in=projects).select_related("team"))

    project_team_map = defaultdict(list)
    for pt in project_teams:
        project_team_map[pt.project_id].append(pt.team)

    team_memberships = get_team_memberships([pt.team for pt in project_teams], user)
    org_roles = get_org_roles({i.organization_id for i in projects}, user)

    prefetch_related_objects(projects, "organization")

    is_superuser = request and is_active_superuser(request) and request.user == user
    result = {}
    for project in projects:
        is_member = any(t.id in team_memberships for t in project_team_map.get(project.id, []))
        org_role = org_roles.get(project.organization_id)
        if is_member:
            has_access = True
        elif is_superuser:
            has_access = True
        elif project.organization.flags.allow_joinleave:
            has_access = True
        elif org_role and roles.get(org_role).is_global:
            has_access = True
        else:
            has_access = False
        result[project] = {"is_member": is_member, "has_access": has_access}
    return result

def my_threads(self, request):
    queryset = self.get_queryset() \
        .only_threads_with_user(request.user) \
        .select_related('latest_message') \
        .prefetch_related('participants')
    queryset = self.filter_queryset(queryset)
    paginator = ThreadPagination()
    threads = list(paginator.paginate_queryset(queryset, request, view=self))
    messages = [t.latest_message for t in threads if t.latest_message is not None]
    prefetch_related_objects(threads + messages, 'reactions')
    serializer = self.get_serializer(threads, many=True)
    message_serializer = self.get_serializer(messages, many=True)
    return paginator.get_paginated_response({
        'threads': serializer.data,
        'messages': message_serializer.data
    })

def _check_value(cls, value, multi_model=True):
    """
    If the value is a queryset, apply the prefetches and select_related.
    If the value is not a queryset, return the value itself.
    :param value:
    :return:
    """
    # In case it is a single model and not a queryset, the prefetches can be applied in this way to the model
    # itself. The queryset being None makes sure everything returns as if the queryset has no prefetches at all.
    if (not multi_model) and isinstance(value, models.Model):
        models.prefetch_related_objects([value], *cls._prepare_prefetch_list())

    if not isinstance(value, (models.QuerySet, models.Manager)):
        return value

    queryset = value.all() if isinstance(value, models.Manager) else value
    prefetch_list = cls._prepare_prefetch_list(queryset)
    select = cls.database_relations['select'][::]
    return queryset.select_related(*select).prefetch_related(*prefetch_list)

def get_courses(user, org=None, filter_=None):
    """
    Return a LazySequence of courses available, optionally filtered by org code (case-insensitive).
    """
    courses = branding.get_visible_courses(
        org=org,
        filter_=filter_,
    ).prefetch_related(
        Prefetch(
            'modes',
            queryset=CourseMode.objects.exclude(mode_slug__in=CourseMode.CREDIT_MODES),
            to_attr='selectable_modes',
        ),
    ).select_related('image_set')

    permission_name = configuration_helpers.get_value(
        'COURSE_CATALOG_VISIBILITY_PERMISSION',
        settings.COURSE_CATALOG_VISIBILITY_PERMISSION)

    if user.is_authenticated:
        prefetch_related_objects([user], 'roles', 'courseenrollment_set', 'experimentdata_set')

    return LazySequence(
        (c for c in courses if has_access(user, permission_name, c)),
        est_len=courses.count())

def run_program(self, request, pk=None):
    task_session_id = request.data['task-session-id']
    program = request.data['program']
    correct = request.data['correct']
    task_session = (
        TaskSession.objects
        .select_related('task', 'student')
        .get(pk=task_session_id))
    student = task_session.student
    assert student.pk == int(pk)
    domain = get_domain()
    progress = actions.run_program(domain, task_session, program, correct)
    response = {'correct': correct}
    if correct:
        prefetch_related_objects(
            [student],
            Prefetch(
                'task_sessions',
                queryset=TaskSession.objects.select_related('task')))
        response['recommendation'] = get_recommendation(domain, student)
    response['progress'] = progress or []
    print('progress', progress)
    serializer = RunProgramResponseSerializer(response)
    return Response(serializer.data)

def get_json_for_genes(genes, user=None, add_dbnsfp=False, add_omim=False, add_constraints=False,
                       add_notes=False, add_expression=False):
    """Returns a JSON representation of the given list of GeneInfo.

    Args:
        genes (array): array of django models for the GeneInfo.
    Returns:
        array: array of json objects
    """
    total_gene_constraints = GeneConstraint.objects.count()
    if add_notes:
        gene_notes_json = get_json_for_gene_notes_by_gene_id([gene.gene_id for gene in genes], user)

    def _add_total_constraint_count(result, *args):
        result['totalGenes'] = total_gene_constraints

    def _process_result(result, gene):
        if add_dbnsfp:
            dbnsfp = gene.dbnsfpgene_set.first()
            if dbnsfp:
                result.update(_get_json_for_model(dbnsfp))
            else:
                result.update(_get_empty_json_for_model(dbNSFPGene))
        if add_omim:
            result['omimPhenotypes'] = _get_json_for_models(gene.omim_set.all())
        if add_constraints:
            constraint = gene.geneconstraint_set.order_by('-mis_z', '-pLI').first()
            result['constraints'] = _get_json_for_model(
                constraint, process_result=_add_total_constraint_count) if constraint else {}
        if add_notes:
            result['notes'] = gene_notes_json.get(result['geneId'], [])
        if add_expression:
            result['expression'] = gene.geneexpression.expression_values if hasattr(gene, 'geneexpression') else None

    if add_dbnsfp:
        prefetch_related_objects(genes, 'dbnsfpgene_set')
    if add_omim:
        prefetch_related_objects(genes, 'omim_set')
    if add_constraints:
        prefetch_related_objects(genes, 'geneconstraint_set')

    return _get_json_for_models(genes, process_result=_process_result)

def get_json_for_locus_lists(locus_lists, user, include_genes=False, include_project_count=False, is_analyst=None):
    """Returns a JSON representation of the given LocusLists.

    Args:
        locus_lists (array): array of LocusList django models.
    Returns:
        array: json objects
    """

    def _process_result(result, locus_list):
        gene_set = locus_list.locuslistgene_set
        interval_set = locus_list.locuslistinterval_set
        if include_genes:
            intervals = _get_json_for_models(interval_set.all())
            genome_versions = {interval['genomeVersion'] for interval in intervals}
            result.update({
                'items': [{'geneId': gene.gene_id} for gene in gene_set.all()] + intervals,
                'intervalGenomeVersion': genome_versions.pop() if len(genome_versions) == 1 else None,
            })
        if include_project_count:
            result['numProjects'] = locus_list.num_projects
        result.update({
            'numEntries': gene_set.count() + interval_set.count(),
            'canEdit': user == locus_list.created_by,
        })

    prefetch_related_objects(locus_lists, 'created_by')
    prefetch_related_objects(locus_lists, 'locuslistgene_set')
    prefetch_related_objects(locus_lists, 'locuslistinterval_set')

    return _get_json_for_models(locus_lists, user=user, is_analyst=is_analyst, process_result=_process_result)

def get(self, request):
    user = request.user
    if not user.is_authenticated:
        administrated_coteries = []
        joined_coteries = []
    else:
        administrated_coteries = list(user.administrated_coterie_set.all())
        joined_coteries = list(user.joined_coterie_set.all())
        combined = administrated_coteries + joined_coteries
        prefetch_related_objects(combined, 'administrators')
        prefetch_related_objects(combined, 'members')
        prefetch_related_objects(combined, 'coteriedocument_set__unique_file')
    return JsonResponse(
        {
            'administratedCoteries': administrated_coteries,
            'joinedCoteries': joined_coteries,
        },
        encoder=CoterieEncoder,
        safe=False)

def test_unknown(self):
    book1 = Book.objects.get(id=self.book1.id)
    with self.assertRaises(AttributeError):
        prefetch_related_objects([book1], "unknown_attribute")

def _get_json_for_individuals(individuals, user=None, project_guid=None, family_guid=None, add_sample_guids_field=False,
                              family_fields=None, skip_nested=False, add_hpo_details=False, is_analyst=None,
                              has_case_review_perm=None):
    """Returns a JSON representation for the given list of Individuals.

    Args:
        individuals (array): array of django models for the individual.
        user (object): Django User object for determining whether to include restricted/internal-only fields
        project_guid (string): An optional field to use as the projectGuid instead of querying the DB
        family_guid (string): An optional field to use as the familyGuid instead of querying the DB
        add_sample_guids_field (boolean): A flag to indicate whether sample ids should be added
    Returns:
        array: array of json objects
    """
    if not individuals:
        return []

    def _get_case_review_status_modified_by(modified_by):
        return modified_by.email or modified_by.username if hasattr(modified_by, 'email') else modified_by

    def _process_result(result, individual):
        mother = result.pop('mother', None)
        father = result.pop('father', None)
        result.update({
            'caseReviewStatusLastModifiedBy': _get_case_review_status_modified_by(result.get('caseReviewStatusLastModifiedBy')),
            'maternalGuid': mother.guid if mother else None,
            'paternalGuid': father.guid if father else None,
            'maternalId': mother.individual_id if mother else None,
            'paternalId': father.individual_id if father else None,
            'displayName': result['displayName'] or result['individualId'],
        })
        if add_sample_guids_field:
            result['sampleGuids'] = [s.guid for s in individual.sample_set.all()]
            result['igvSampleGuids'] = [s.guid for s in individual.igvsample_set.all()]

    kwargs = {
        'additional_model_fields': _get_case_review_fields(
            individuals[0], has_case_review_perm, user, lambda indiv: indiv.family.project)
    }
    if project_guid or not skip_nested:
        nested_fields = [
            {'fields': ('family', 'guid'), 'value': family_guid},
            {'fields': ('family', 'project', 'guid'), 'key': 'projectGuid', 'value': project_guid},
        ]
        if family_fields:
            for field in family_fields:
                nested_fields.append({'fields': ('family', field), 'key': _to_camel_case(field)})
        kwargs.update({'nested_fields': nested_fields})
    else:
        kwargs['additional_model_fields'].append('family_id')

    if add_hpo_details:
        kwargs['additional_model_fields'] += [
            'features', 'absent_features', 'nonstandard_features', 'absent_nonstandard_features']

    prefetch_related_objects(individuals, 'mother')
    prefetch_related_objects(individuals, 'father')
    if 'case_review_status_last_modified_by' in kwargs['additional_model_fields']:
        prefetch_related_objects(individuals, 'case_review_status_last_modified_by')
    if add_sample_guids_field:
        prefetch_related_objects(individuals, 'sample_set')
        prefetch_related_objects(individuals, 'igvsample_set')

    parsed_individuals = _get_json_for_models(
        individuals, user=user, is_analyst=is_analyst, process_result=_process_result, **kwargs)

    if add_hpo_details:
        all_hpo_ids = set()
        for i in parsed_individuals:
            all_hpo_ids.update([feature['id'] for feature in i.get('features') or []])
            all_hpo_ids.update([feature['id'] for feature in i.get('absentFeatures') or []])
        hpo_terms_by_id = {
            hpo.hpo_id: hpo for hpo in HumanPhenotypeOntology.objects.filter(hpo_id__in=all_hpo_ids)}
        for i in parsed_individuals:
            for feature in i.get('features') or []:
                hpo = hpo_terms_by_id.get(feature['id'])
                if hpo:
                    feature.update({'category': hpo.category_id, 'label': hpo.name})
            for feature in i.get('absentFeatures') or []:
                hpo = hpo_terms_by_id.get(feature['id'])
                if hpo:
                    feature.update({'category': hpo.category_id, 'label': hpo.name})

    return parsed_individuals

def _get_json_for_models(models, nested_fields=None, user=None, is_analyst=None, process_result=None, guid_key=None,
                         additional_model_fields=None):
    """Returns an array of JSON representations of the given models.

    Args:
        models (array): Array of django models
        user (object): Django User object for determining whether to include restricted/internal-only fields
        nested_fields (array): Optional array of fields to get from the model that are nested on related objects
        process_result (lambda): Optional function to post-process a given model json
        guid_key (string): Optional key to use for the model's guid
    Returns:
        array: json objects
    """
    if not models:
        return []

    model_class = type(models[0])
    fields = copy(model_class._meta.json_fields)
    if is_analyst is None:
        is_analyst = user and user_is_analyst(user)
    if is_analyst:
        fields += getattr(model_class._meta, 'internal_json_fields', [])
    if additional_model_fields:
        fields += additional_model_fields

    if 'created_by' in fields:
        prefetch_related_objects(models, 'created_by')
    for nested_field in nested_fields or []:
        if not nested_field.get('value'):
            prefetch_related_objects(models, '__'.join(nested_field['fields'][:-1]))

    results = []
    for model in models:
        result = {_to_camel_case(field): getattr(model, field) for field in fields}

        for nested_field in (nested_fields or []):
            field_value = nested_field.get('value')
            if not field_value:
                field_value = model
                for field in nested_field['fields']:
                    field_value = getattr(field_value, field) if field_value else None
            result[nested_field.get('key', _to_camel_case('_'.join(nested_field['fields'])))] = field_value

        if result.get('guid'):
            guid_key = guid_key or '{}{}Guid'.format(model_class.__name__[0].lower(), model_class.__name__[1:])
            result[guid_key] = result.pop('guid')
        if result.get('createdBy'):
            result['createdBy'] = result['createdBy'].get_full_name() or result['createdBy'].email
        if process_result:
            process_result(result, model)
        results.append(result)

    return results

def add_variants_dataset_handler(request, project_guid):
    """Create or update samples for the given variant dataset

    Args:
        request: Django request object
        project_guid (string): GUID of the project that should be updated

    HTTP POST
        Request body - should contain the following json structure:
        {
            'elasticsearchIndex': <String> (required)
            'ignoreExtraSamplesInCallset': <Boolean>
            'mappingFilePath': <String>
        }

        Response body - will contain the following structure:
    """
    project = get_project_and_check_permissions(project_guid, request.user, permission_level=CAN_EDIT)
    request_json = json.loads(request.body)

    try:
        required_fields = ['elasticsearchIndex', 'datasetType']
        if any(field not in request_json for field in required_fields):
            raise ValueError('request must contain fields: {}'.format(', '.join(required_fields)))

        elasticsearch_index = request_json['elasticsearchIndex'].strip()
        dataset_type = request_json['datasetType']
        if dataset_type not in Sample.DATASET_TYPE_LOOKUP:
            raise ValueError('Invalid dataset type "{}"'.format(dataset_type))

        sample_ids, index_metadata = get_elasticsearch_index_samples(elasticsearch_index, dataset_type=dataset_type)
        if not sample_ids:
            raise ValueError('No samples found in the index. Make sure the specified caller type is correct')
        validate_index_metadata(index_metadata, project, elasticsearch_index, dataset_type=dataset_type)
        sample_type = index_metadata['sampleType']

        sample_id_to_individual_id_mapping = load_mapping_file(
            request_json['mappingFilePath']) if request_json.get('mappingFilePath') else {}

        loaded_date = timezone.now()
        matched_sample_id_to_sample_record = match_sample_ids_to_sample_records(
            project=project,
            sample_ids=sample_ids,
            sample_type=sample_type,
            dataset_type=dataset_type,
            elasticsearch_index=elasticsearch_index,
            sample_id_to_individual_id_mapping=sample_id_to_individual_id_mapping,
            loaded_date=loaded_date,
        )

        unmatched_samples = set(sample_ids) - set(matched_sample_id_to_sample_record.keys())
        if request_json.get('ignoreExtraSamplesInCallset'):
            if len(matched_sample_id_to_sample_record) == 0:
                raise Exception(
                    "None of the individuals or samples in the project matched the {} expected sample id(s)".format(
                        len(sample_ids)))
        elif len(unmatched_samples) > 0:
            raise Exception(
                'Matches not found for ES sample ids: {}. Upload a mapping file for these samples, or select the '
                '"Ignore extra samples in callset" checkbox to ignore.'.format(", ".join(unmatched_samples)))

        prefetch_related_objects(matched_sample_id_to_sample_record.values(), 'individual__family')
        included_families = {sample.individual.family for sample in matched_sample_id_to_sample_record.values()}

        missing_individuals = Individual.objects.filter(
            family__in=included_families,
            sample__is_active=True,
            sample__dataset_type=dataset_type,
        ).exclude(sample__in=matched_sample_id_to_sample_record.values()).select_related('family')
        missing_family_individuals = defaultdict(list)
        for individual in missing_individuals:
            missing_family_individuals[individual.family].append(individual)

        if missing_family_individuals:
            raise Exception(
                'The following families are included in the callset but are missing some family members: {}.'.format(
                    ', '.join(sorted([
                        '{} ({})'.format(family.family_id, ', '.join(sorted([i.individual_id for i in missing_indivs])))
                        for family, missing_indivs in missing_family_individuals.items()
                    ]))))

        inactivate_sample_guids = _update_variant_samples(
            matched_sample_id_to_sample_record, elasticsearch_index, loaded_date, dataset_type)

    except Exception as e:
        traceback.print_exc()
        return create_json_response({'errors': [e.message or str(e)]}, status=400)

    if not matched_sample_id_to_sample_record:
        return create_json_response({'samplesByGuid': {}})

    family_guids_to_update = [
        family.guid for family in included_families
        if family.analysis_status == Family.ANALYSIS_STATUS_WAITING_FOR_DATA
    ]
    Family.objects.filter(guid__in=family_guids_to_update).update(
        analysis_status=Family.ANALYSIS_STATUS_ANALYSIS_IN_PROGRESS)

    response_json = _get_samples_json(matched_sample_id_to_sample_record, inactivate_sample_guids, project_guid)
    response_json['familiesByGuid'] = {
        family_guid: {'analysisStatus': Family.ANALYSIS_STATUS_ANALYSIS_IN_PROGRESS}
        for family_guid in family_guids_to_update}

    return create_json_response(response_json)

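A request body accepted by the handler above might look like the sketch below; the index name, file path, and dataset type value are placeholders (the code only requires the listed keys and a dataset type present in Sample.DATASET_TYPE_LOOKUP):

# Illustrative POST payload for add_variants_dataset_handler; all values are placeholders.
example_request_body = {
    'elasticsearchIndex': 'my_project__wes__grch37__variants__20190101',  # required
    'datasetType': 'VARIANTS',  # required; must be a key of Sample.DATASET_TYPE_LOOKUP
    'ignoreExtraSamplesInCallset': True,  # optional
    'mappingFilePath': 'gs://my-bucket/sample_id_mapping.tsv',  # optional
}
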
def anvil_export(request, project_guid):
    if project_guid == 'all':
        project_guid = None

    if project_guid:
        projects_by_guid = {project_guid: Project.objects.get(guid=project_guid)}
    else:
        projects_by_guid = {p.guid: p for p in Project.objects.filter(projectcategory__name__iexact='anvil')}

    families = _get_over_year_loaded_project_families(projects_by_guid.values())
    prefetch_related_objects(families, 'individual_set')

    saved_variants_by_family = _get_saved_variants_by_family(projects_by_guid.values(), request.user)

    # Handle compound het genes
    compound_het_gene_id_by_family = {}
    for family_guid, saved_variants in saved_variants_by_family.items():
        if len(saved_variants) > 1:
            potential_compound_het_variants = [
                variant for variant in saved_variants
                if all(gen['numAlt'] < 2 for gen in variant['genotypes'].values())
            ]
            main_gene_ids = {variant['mainTranscript']['geneId'] for variant in potential_compound_het_variants}
            if len(main_gene_ids) > 1:
                # This occurs in compound hets where some hits have a primary transcript in a different gene
                for gene_id in main_gene_ids:
                    if all(gene_id in variant['transcripts'] for variant in potential_compound_het_variants):
                        compound_het_gene_id_by_family[family_guid] = gene_id

    individuals = set()
    for family in families:
        individuals.update(family.individual_set.all())
    rows = _get_json_for_individuals(
        list(individuals), project_guid=project_guid, family_fields=['family_id', 'coded_phenotype'])

    gene_ids = set()
    for row in rows:
        row['Project ID'] = projects_by_guid[row['projectGuid']].name

        saved_variants = saved_variants_by_family[row['familyGuid']]
        row['numSavedVariants'] = len(saved_variants)
        for i, variant in enumerate(saved_variants):
            genotype = variant['genotypes'].get(row['individualGuid'], {})
            if genotype.get('numAlt', -1) > 0:
                gene_id = compound_het_gene_id_by_family.get(row['familyGuid']) or variant['mainTranscript']['geneId']
                gene_ids.add(gene_id)
                variant_fields = {
                    'Zygosity': 'heterozygous' if genotype['numAlt'] == 1 else 'homozygous',
                    'Chrom': variant['chrom'],
                    'Pos': variant['pos'],
                    'Ref': variant['ref'],
                    'Alt': variant['alt'],
                    'hgvsc': variant['mainTranscript']['hgvsc'],
                    'hgvsp': variant['mainTranscript']['hgvsp'],
                    'Transcript': variant['mainTranscript']['transcriptId'],
                    'geneId': gene_id,
                }
                row.update({'{} - {}'.format(k, i + 1): v for k, v in variant_fields.items()})

    genes_by_id = get_genes(gene_ids)
    for row in rows:
        for key, gene_id in list(row.items()):
            if key.startswith('geneId') and genes_by_id.get(gene_id):
                row[key.replace('geneId', 'Gene')] = genes_by_id[gene_id]['geneSymbol']

    return create_json_response({'anvilRows': rows})

def anvil_export(request, project_guid):
    if project_guid == 'all':
        project_guid = None

    if project_guid:
        projects_by_guid = {project_guid: Project.objects.get(guid=project_guid)}
    else:
        projects_by_guid = {p.guid: p for p in Project.objects.filter(projectcategory__name__iexact='anvil')}

    families = _get_over_year_loaded_project_families(projects_by_guid.values())
    prefetch_related_objects(families, 'individual_set')

    saved_variants_by_family = _get_saved_variants_by_family(projects_by_guid.values(), request.user)

    # Handle compound het genes
    compound_het_gene_id_by_family = {}
    for family_guid, saved_variants in saved_variants_by_family.items():
        if len(saved_variants) > 1:
            potential_compound_het_variants = [
                variant for variant in saved_variants
                if all(gen['numAlt'] < 2 for gen in variant['genotypes'].values())
            ]
            main_gene_ids = {variant['mainTranscript']['geneId'] for variant in potential_compound_het_variants}
            if len(main_gene_ids) > 1:
                # This occurs in compound hets where some hits have a primary transcript in a different gene
                for gene_id in main_gene_ids:
                    if all(gene_id in variant['transcripts'] for variant in potential_compound_het_variants):
                        compound_het_gene_id_by_family[family_guid] = gene_id

    individuals = set()
    for family in families:
        individuals.update(family.individual_set.all())
    rows = _get_json_for_individuals(
        list(individuals), project_guid=project_guid, family_fields=['family_id', 'coded_phenotype'])

    gene_ids = set()
    for row in rows:
        row['Project_ID'] = projects_by_guid[row['projectGuid']].name

        saved_variants = saved_variants_by_family[row['familyGuid']]
        row['numSavedVariants'] = len(saved_variants)
        for i, variant in enumerate(saved_variants):
            genotype = variant['genotypes'].get(row['individualGuid'], {})
            if genotype.get('numAlt', -1) > 0:
                gene_id = compound_het_gene_id_by_family.get(row['familyGuid']) or variant['mainTranscript']['geneId']
                gene_ids.add(gene_id)
                variant_fields = {
                    'Zygosity': 'heterozygous' if genotype['numAlt'] == 1 else 'homozygous',
                    'Chrom': variant['chrom'],
                    'Pos': variant['pos'],
                    'Ref': variant['ref'],
                    'Alt': variant['alt'],
                    'hgvsc': variant['mainTranscript']['hgvsc'],
                    'hgvsp': variant['mainTranscript']['hgvsp'],
                    'Transcript': variant['mainTranscript']['transcriptId'],
                    'geneId': gene_id,
                }
                row.update({'{}-{}'.format(k, i + 1): v for k, v in variant_fields.items()})

    genes_by_id = get_genes(gene_ids)
    for row in rows:
        for key, gene_id in list(row.items()):
            if key.startswith('geneId') and genes_by_id.get(gene_id):
                row[key.replace('geneId', 'Gene')] = genes_by_id[gene_id]['geneSymbol']

    return create_json_response({'anvilRows': rows})

def elasticsearch_status(request):
    client = get_es_client()

    disk_fields = ['node', 'disk.avail', 'disk.used', 'disk.percent']
    disk_status = [{
        _to_camel_case(field.replace('.', '_')): disk[field] for field in disk_fields
    } for disk in client.cat.allocation(format="json", h=','.join(disk_fields))]

    index_fields = ['index', 'docs.count', 'store.size', 'creation.date.string']
    indices = [{
        _to_camel_case(field.replace('.', '_')): index[field] for field in index_fields
    } for index in client.cat.indices(format="json", h=','.join(index_fields))
        if index['index'] not in ['.kibana', 'index_operations_log']]

    aliases = defaultdict(list)
    for alias in client.cat.aliases(format="json", h='alias,index'):
        aliases[alias['alias']].append(alias['index'])

    mappings = Index('_all', using=client).get_mapping(doc_type='variant')

    latest_loaded_samples = get_latest_loaded_samples()
    prefetch_related_objects(latest_loaded_samples, 'individual__family__project')
    seqr_index_projects = defaultdict(lambda: defaultdict(set))
    es_projects = set()
    for sample in latest_loaded_samples:
        for index_name in sample.elasticsearch_index.split(','):
            project = sample.individual.family.project
            es_projects.add(project)
            if index_name in aliases:
                for aliased_index_name in aliases[index_name]:
                    seqr_index_projects[aliased_index_name][project].add(sample.individual.guid)
            else:
                seqr_index_projects[index_name.rstrip('*')][project].add(sample.individual.guid)

    for index in indices:
        index_name = index['index']
        index_mapping = mappings[index_name]['mappings']['variant']
        index.update(index_mapping.get('_meta', {}))
        index['hasNestedGenotypes'] = 'samples_num_alt_1' in index_mapping['properties']

        projects_for_index = []
        for index_prefix in list(seqr_index_projects.keys()):
            if index_name.startswith(index_prefix):
                projects_for_index += seqr_index_projects.pop(index_prefix).keys()
        index['projects'] = [
            {'projectGuid': project.guid, 'projectName': project.name} for project in projects_for_index]

    errors = ['{} does not exist and is used by project(s) {}'.format(
        index, ', '.join(['{} ({} samples)'.format(p.name, len(indivs)) for p, indivs in project_individuals.items()])
    ) for index, project_individuals in seqr_index_projects.items() if project_individuals]

    # TODO remove once all projects are switched off of mongo
    all_mongo_samples = Sample.objects.filter(
        dataset_type=Sample.DATASET_TYPE_VARIANT_CALLS,
        sample_status=Sample.SAMPLE_STATUS_LOADED,
        elasticsearch_index__isnull=True,
    ).exclude(individual__family__project__in=es_projects).prefetch_related(
        'individual', 'individual__family__project')
    mongo_sample_individual_max_loaded_date = {
        agg['individual__guid']: agg['max_loaded_date'] for agg in
        all_mongo_samples.values('individual__guid').annotate(max_loaded_date=Max('loaded_date'))
    }
    mongo_project_samples = defaultdict(set)
    for s in all_mongo_samples:
        if s.loaded_date == mongo_sample_individual_max_loaded_date[s.individual.guid]:
            mongo_project_samples[s.individual.family.project].add(s.dataset_file_path)
    mongo_projects = [
        {'projectGuid': project.guid, 'projectName': project.name, 'sourceFilePaths': sample_file_paths}
        for project, sample_file_paths in mongo_project_samples.items()]

    return create_json_response({
        'indices': indices,
        'diskStats': disk_status,
        'elasticsearchHost': ELASTICSEARCH_SERVER,
        'mongoProjects': mongo_projects,
        'errors': errors,
    })

def get_json_for_genes(genes, user=None, add_dbnsfp=False, add_omim=False, add_constraints=False, add_notes=False,
                       add_expression=False, add_primate_ai=False, add_mgi=False):
    """Returns a JSON representation of the given list of GeneInfo.

    Args:
        genes (array): array of django models for the GeneInfo.
    Returns:
        array: array of json objects
    """
    total_gene_constraints = GeneConstraint.objects.count()
    if add_notes:
        gene_notes_json = get_json_for_gene_notes_by_gene_id([gene.gene_id for gene in genes], user)

    def _add_total_constraint_count(result, *args):
        result['totalGenes'] = total_gene_constraints

    def _process_result(result, gene):
        if add_dbnsfp:
            # prefetching only works with all()
            dbnsfp = next((dbnsfp for dbnsfp in gene.dbnsfpgene_set.all()), None)
            if dbnsfp:
                result.update(_get_json_for_model(dbnsfp))
            else:
                result.update(_get_empty_json_for_model(dbNSFPGene))
        if add_primate_ai:
            # prefetching only works with all()
            primate_ai = next((primate_ai for primate_ai in gene.primateai_set.all()), None)
            if primate_ai:
                result['primateAi'] = _get_json_for_model(primate_ai)
        if add_mgi:
            # prefetching only works with all()
            mgi = next((mgi for mgi in gene.mgi_set.all()), None)
            result['mgiMarkerId'] = mgi.marker_id if mgi else None
        if add_omim:
            omim_phenotypes = _get_json_for_models(gene.omim_set.all())
            result['omimPhenotypes'] = [phenotype for phenotype in omim_phenotypes if phenotype['phenotypeMimNumber']]
            result['mimNumber'] = omim_phenotypes[0]['mimNumber'] if omim_phenotypes else None
        if add_constraints:
            constraint = next((constraint for constraint in gene.geneconstraint_set.all()), None)
            result['constraints'] = _get_json_for_model(
                constraint, process_result=_add_total_constraint_count) if constraint else {}
        if add_notes:
            result['notes'] = gene_notes_json.get(result['geneId'], [])
        if add_expression:
            result['expression'] = {ge.tissue_type: ge.expression_values for ge in gene.geneexpression_set.all()}

    if add_dbnsfp:
        prefetch_related_objects(genes, 'dbnsfpgene_set')
    if add_omim:
        prefetch_related_objects(genes, 'omim_set')
    if add_constraints:
        prefetch_related_objects(genes, Prefetch(
            'geneconstraint_set', queryset=GeneConstraint.objects.order_by('-mis_z', '-pLI')))
    if add_primate_ai:
        prefetch_related_objects(genes, 'primateai_set')
    if add_mgi:
        prefetch_related_objects(genes, 'mgi_set')
    if add_expression:
        prefetch_related_objects(genes, 'geneexpression_set')

    return _get_json_for_models(genes, process_result=_process_result)

def get_object(self, queryset=None):
    obj = super(PrefetchedSingleObjectMixin, self).get_object(queryset)
    prefetch_related_objects([obj], *self.prefetch_related_lookups)
    return obj

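A minimal sketch of how such a mixin might be used on a single-object view; the view class, model, and lookups are illustrative, not from the source:

# Hypothetical usage of PrefetchedSingleObjectMixin; model and lookup names are illustrative.
from django.views.generic import DetailView

class BookDetailView(PrefetchedSingleObjectMixin, DetailView):
    model = Book
    prefetch_related_lookups = ('authors', 'authors__books')
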
def collectQuizTasksForTopic(articles=None, topic=None, project=None):
    taskList = []

    # getTopicTree returns the topic with all levels of its subtopic tree
    topictree = topic.getTopicTree()

    # Prefetching uses one query per related table to populate caches.
    # This helps us avoid per row queries when looping over rows.
    prefetch_related_objects(topictree, "questions__answers")

    # Set up the prefetch to retrieve all available hints for each article
    allHints = NLPHints.objects.all()
    fetchHints = Prefetch("hints", queryset=allHints, to_attr="allHints")
    logger.info("Found %d hints" % (len(allHints), ))

    # Set up Prefetch that will cache just the highlights matching
    # this topic to article.highlight_taskruns[n].highlightsForTopic
    topicHighlights = (HighlightGroup.objects.filter(topic=topic).prefetch_related("submitted_answers"))
    fetchHighlights = Prefetch("highlight_taskruns__highlights",
                               queryset=topicHighlights,
                               to_attr="highlightsForTopic")

    # Find articles highlighted with the topic within the provided queryset;
    # distinct is essential after the chained prefetch_related calls
    articles = (articles.filter(highlight_taskruns__highlights__topic=topic)
                .prefetch_related(fetchHighlights)
                .prefetch_related(fetchHints)
                .order_by("id")
                .distinct())

    project_data = ProjectSerializer(project, many=False).data
    topictree_data = TopicSerializer(topictree, many=True).data

    # With the prefetching config above, the loops below will be hitting caches.
    # Only 8 queries should be issued against 8 tables, i.e. the query count
    # will not be a function of number of rows returned.
    for article in articles:
        # Our prefetched highlightsForTopic is nested under the ArticleHighlight
        # record, in HighlightGroup. Not expecting more than one ArticleHighlight
        # record but safest to code as if there could be more than one.
        highlights = [
            hg for ah in article.highlight_taskruns.all() for hg in ah.highlightsForTopic
        ]

        # At this point, we are processing one topic for one article.
        # All the highlights for a given topic/case need to be in one task.
        # Need to sort here instead of the above prefetch because we want
        # to ignore the potential grouping effect if there was more than one
        # ArticleHighlight in the above list comprehension.
        # See data.pybossa_api.save_highlight_taskrun for import code
        sortkey = lambda x: x.case_number
        hg_by_case = sorted(highlights, key=sortkey)
        for case_number, hg_case_group in groupby(hg_by_case, key=sortkey):
            taskList.append({
                "project": project_data,
                "topTopicId": topic.id,
                "topictree": topictree_data,
                "article": ArticleSerializer(article, many=False).data,
                "highlights": HighlightGroupSerializer(hg_case_group, many=True).data,
                "hints": NLPHintSerializer(article.allHints, many=True).data,
            })

    return taskList

def get_attrs(self, item_list, user, **kwargs):
    alert_rules = {item.id: item for item in item_list}
    prefetch_related_objects(item_list, "snuba_query__environment")

    result = defaultdict(dict)
    triggers = AlertRuleTrigger.objects.filter(alert_rule__in=item_list).order_by("label")
    serialized_triggers = serialize(list(triggers))
    for trigger, serialized in zip(triggers, serialized_triggers):
        alert_rule_triggers = result[alert_rules[trigger.alert_rule_id]].setdefault("triggers", [])
        alert_rule_triggers.append(serialized)

    alert_rule_projects = AlertRule.objects.filter(
        id__in=[item.id for item in item_list]
    ).values_list("id", "snuba_query__subscriptions__project__slug")
    for alert_rule_id, project_slug in alert_rule_projects:
        rule_result = result[alert_rules[alert_rule_id]].setdefault("projects", [])
        rule_result.append(project_slug)

    for rule_activity in AlertRuleActivity.objects.filter(
        alert_rule__in=item_list, type=AlertRuleActivityType.CREATED.value
    ).select_related("alert_rule", "user"):
        if rule_activity.user:
            user = {
                "id": rule_activity.user.id,
                "name": rule_activity.user.get_display_name(),
                "email": rule_activity.user.email,
            }
        else:
            user = None

        result[alert_rules[rule_activity.alert_rule.id]].update({"created_by": user})

    resolved_actors = {}
    owners_by_type = defaultdict(list)
    for item in item_list:
        if item.owner_id is not None:
            owners_by_type[actor_type_to_string(item.owner.type)].append(item.owner_id)
    for k, v in ACTOR_TYPES.items():
        resolved_actors[k] = {
            a.actor_id: a.id
            for a in actor_type_to_class(v).objects.filter(actor_id__in=owners_by_type[k])
        }

    for alert_rule in alert_rules.values():
        if alert_rule.owner_id:
            type = actor_type_to_string(alert_rule.owner.type)
            if alert_rule.owner_id in resolved_actors[type]:
                result[alert_rule]["owner"] = f"{type}:{resolved_actors[type][alert_rule.owner_id]}"

    if "original_alert_rule" in self.expand:
        snapshot_activities = AlertRuleActivity.objects.filter(
            alert_rule__in=item_list,
            type=AlertRuleActivityType.SNAPSHOT.value,
        )
        for activity in snapshot_activities:
            result[alert_rules[activity.alert_rule_id]]["originalAlertRuleId"] = activity.previous_alert_rule_id

    return result

def get_user_group_values(self, user) -> List[str]:
    if user.is_anonymous:
        return []
    prefetch_related_objects([user], "groups")
    return [g.name for g in user.groups.all()]

def _get_json_for_individuals(individuals, user=None, project_guid=None, family_guid=None, add_sample_guids_field=False,
                              family_fields=None, skip_nested=False):
    """Returns a JSON representation for the given list of Individuals.

    Args:
        individuals (array): array of django models for the individual.
        user (object): Django User object for determining whether to include restricted/internal-only fields
        project_guid (string): An optional field to use as the projectGuid instead of querying the DB
        family_guid (string): An optional field to use as the familyGuid instead of querying the DB
        add_sample_guids_field (boolean): A flag to indicate whether sample ids should be added
    Returns:
        array: array of json objects
    """

    def _get_case_review_status_modified_by(modified_by):
        return modified_by.email or modified_by.username if hasattr(modified_by, 'email') else modified_by

    def _load_phenotips_data(phenotips_data):
        phenotips_json = None
        if phenotips_data:
            try:
                phenotips_json = json.loads(phenotips_data)
            except Exception as e:
                logger.error("Couldn't parse phenotips: {}".format(e))
        return phenotips_json

    def _process_result(result, individual):
        mother = result.pop('mother', None)
        father = result.pop('father', None)
        result.update({
            'caseReviewStatusLastModifiedBy': _get_case_review_status_modified_by(result.get('caseReviewStatusLastModifiedBy')),
            'phenotipsData': _load_phenotips_data(result['phenotipsData']),
            'maternalGuid': mother.guid if mother else None,
            'paternalGuid': father.guid if father else None,
            'maternalId': mother.individual_id if mother else None,
            'paternalId': father.individual_id if father else None,
            'displayName': result['displayName'] or result['individualId'],
        })
        if add_sample_guids_field:
            result['sampleGuids'] = [s.guid for s in individual.sample_set.all()]

    if project_guid or not skip_nested:
        nested_fields = [
            {'fields': ('family', 'guid'), 'value': family_guid},
            {'fields': ('family', 'project', 'guid'), 'key': 'projectGuid', 'value': project_guid},
        ]
        if family_fields:
            for field in family_fields:
                nested_fields.append({'fields': ('family', field), 'key': _to_camel_case(field)})
        kwargs = {'nested_fields': nested_fields}
    else:
        kwargs = {'additional_model_fields': ['family_id']}

    prefetch_related_objects(individuals, 'mother')
    prefetch_related_objects(individuals, 'father')
    prefetch_related_objects(individuals, 'case_review_status_last_modified_by')
    if add_sample_guids_field:
        prefetch_related_objects(individuals, 'sample_set')

    return _get_json_for_models(individuals, user=user, process_result=_process_result, **kwargs)

def handle(self, *args, **options): """transfer project""" project_arg = options['project'] elasticsearch_index = options['es_index'] project = Project.objects.get( Q(name=project_arg) | Q(guid=project_arg)) logger.info('Updating project genome version for {}'.format( project.name)) # Validate the provided index logger.info('Validating es index {}'.format(elasticsearch_index)) sample_ids, index_metadata = get_elasticsearch_index_samples( elasticsearch_index) validate_index_metadata(index_metadata, project, elasticsearch_index, genome_version=GENOME_VERSION_GRCh38) sample_type = index_metadata['sampleType'] dataset_path = index_metadata['sourceFilePath'] matched_sample_id_to_sample_record = match_sample_ids_to_sample_records( project=project, sample_ids=sample_ids, sample_type=sample_type, dataset_type=Sample.DATASET_TYPE_VARIANT_CALLS, elasticsearch_index=elasticsearch_index, sample_id_to_individual_id_mapping={}, ) unmatched_samples = set(sample_ids) - set( matched_sample_id_to_sample_record.keys()) if len(unmatched_samples) > 0: raise CommandError( 'Matches not found for ES sample ids: {}.'.format( ', '.join(unmatched_samples))) prefetch_related_objects(matched_sample_id_to_sample_record.values(), 'individual__family') included_families = { sample.individual.family for sample in matched_sample_id_to_sample_record.values() } missing_individuals = Individual.objects.filter( family__in=included_families, sample__is_active=True, sample__dataset_type=Sample.DATASET_TYPE_VARIANT_CALLS, ).exclude(sample__in=matched_sample_id_to_sample_record.values() ).select_related('family') missing_family_individuals = defaultdict(list) for individual in missing_individuals: missing_family_individuals[individual.family].append(individual) if missing_family_individuals: raise CommandError( 'The following families are included in the callset but are missing some family members: {}.' .format(', '.join([ '{} ({})'.format( family.family_id, ', '.join([i.individual_id for i in missing_indivs])) for family, missing_indivs in missing_family_individuals.items() ]))) # Get and clean up expected saved variants saved_variant_models_by_guid = { v.guid: v for v in SavedVariant.objects.filter(family__project=project) } deleted_no_tags = set() for guid, variant in saved_variant_models_by_guid.items(): if not (variant.varianttag_set.count() or variant.variantnote_set.count()): deleted_no_tags.add(guid) if deleted_no_tags: if raw_input( 'Do you want to delete the following {} saved variants with no tags (y/n)?: {} ' .format(len(deleted_no_tags), ', '.join(deleted_no_tags))) == 'y': for guid in deleted_no_tags: saved_variant_models_by_guid.pop(guid).delete() logger.info('Deleted {} variants'.format(len(deleted_no_tags))) expected_families = { sv.family for sv in saved_variant_models_by_guid.values() } missing_families = expected_families - included_families if missing_families: raise CommandError( 'The following families have saved variants but are missing from the callset: {}.' .format(', '.join([f.family_id for f in missing_families]))) # Lift-over saved variants _update_variant_samples(matched_sample_id_to_sample_record, elasticsearch_index, dataset_path) saved_variants = get_json_for_saved_variants( saved_variant_models_by_guid.values(), add_details=True) saved_variants_to_lift = [ v for v in saved_variants if v['genomeVersion'] != GENOME_VERSION_GRCh38 ] num_already_lifted = len(saved_variants) - len(saved_variants_to_lift) if num_already_lifted: if raw_input( 'Found {} saved variants already on Hg38. 
Continue with liftover (y/n)? ' .format(num_already_lifted)) != 'y': raise CommandError( 'Error: found {} saved variants already on Hg38'.format( num_already_lifted)) logger.info( 'Lifting over {} variants (skipping {} that are already lifted)'. format(len(saved_variants_to_lift), num_already_lifted)) liftover_to_38 = LiftOver('hg19', 'hg38') hg37_to_hg38_xpos = {} lift_failed = {} for v in saved_variants_to_lift: if not (hg37_to_hg38_xpos.get(v['xpos']) or v['xpos'] in lift_failed): hg38_coord = liftover_to_38.convert_coordinate( 'chr{}'.format(v['chrom'].lstrip('chr')), int(v['pos'])) if hg38_coord and hg38_coord[0]: hg37_to_hg38_xpos[v['xpos']] = get_xpos( hg38_coord[0][0], hg38_coord[0][1]) else: lift_failed[v['xpos']] = v if lift_failed: if raw_input( 'Unable to lift over the following {} coordinates. Continue with update (y/n)?: {} ' .format( len(lift_failed), ', '.join([ '{}:{}-{}-{} ({})'.format( v['chrom'], v['pos'], v['ref'], v['alt'], ', '.join(v['familyGuids'])) for v in lift_failed.values() ]))) != 'y': raise CommandError( 'Error: unable to lift over {} variants'.format( len(lift_failed))) saved_variants_map = defaultdict(list) for v in saved_variants_to_lift: if hg37_to_hg38_xpos.get(v['xpos']): variant_model = saved_variant_models_by_guid[v['variantGuid']] saved_variants_map[(hg37_to_hg38_xpos[v['xpos']], v['ref'], v['alt'])].append(variant_model) es_variants = get_es_variants_for_variant_tuples( expected_families, saved_variants_map.keys()) missing_variants = set( saved_variants_map.keys()) - {(v['xpos'], v['ref'], v['alt']) for v in es_variants} if missing_variants: missing_variant_strings = [] for xpos, ref, alt in missing_variants: var_id = '{}-{}-{}'.format(xpos, ref, alt) for v in saved_variants_map[(xpos, ref, alt)]: tags = v.varianttag_set.all() notes = v.variantnote_set.all() missing_variant_strings.append( '{var_id} {family_id}: {tags} ({guid})'.format( var_id=var_id, family_id=v.family.family_id, guid=v.guid, tags=', '.join([ tag.variant_tag_type.name for tag in tags ]) if tags else 'No Tags; {}'.format('; '.join( [note.note for note in notes])))) if raw_input( 'Unable to find the following {} variants in the index. Continue with update (y/n)?:\n{}\n' .format(len(missing_variants), '\n'.join(missing_variant_strings))) != 'y': raise CommandError( 'Error: unable to find {} lifted-over variants'.format( len(missing_variants))) logger.info('Successfully lifted over {} variants'.format( len(es_variants))) # Update saved variants missing_family_count = 0 for var in es_variants: saved_variant_models = saved_variants_map[(var['xpos'], var['ref'], var['alt'])] missing_saved_variants = [ v for v in saved_variant_models if v.family.guid not in var['familyGuids'] ] if missing_saved_variants: variant_id = '{}-{}-{}-{}'.format(var['chrom'], var['pos'], var['ref'], var['alt']) if raw_input( ('Variant {} (hg37: {}) not find for expected families {}. Continue with update (y/n)? 
' .format( variant_id, missing_saved_variants[0].xpos, ', '.join([ '{} ({})'.format(v.family.guid, v.guid) for v in missing_saved_variants ])))) == 'y': var = get_single_es_variant( [v.family for v in saved_variant_models], variant_id, return_all_queried_families=True) missing_family_count += len(missing_saved_variants) else: raise CommandError( 'Error: unable to find family data for lifted over variant' ) for saved_variant in saved_variant_models: saved_variant.xpos_start = var['xpos'] saved_variant.saved_variant_json = var saved_variant.save() logger.info('Successfully updated {} variants'.format( len(es_variants))) # Update project and sample data update_model_from_json(project, {'genome_version': GENOME_VERSION_GRCh38}) reset_cached_search_results(project) logger.info('---Done---') logger.info( 'Successfully lifted over {} variants. Skipped {} failed variants. Family data not updated for {} variants' .format(len(es_variants), len(missing_variants) + len(lift_failed), missing_family_count))
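The liftover in the command above is done with pyliftover's LiftOver class. A minimal, standalone sketch of that conversion step, assuming pyliftover is installed (the helper name and coordinate are illustrative only):

from pyliftover import LiftOver

# Build the hg19 -> hg38 converter once; construction loads the UCSC chain file.
liftover_to_38 = LiftOver('hg19', 'hg38')

def lift_hg19_to_hg38(chrom, pos):
    """Return (chrom, pos) on GRCh38, or None when the coordinate cannot be lifted."""
    # convert_coordinate returns a list of (chrom, pos, strand, score) tuples,
    # or an empty list when no mapping exists.
    result = liftover_to_38.convert_coordinate('chr{}'.format(chrom.lstrip('chr')), int(pos))
    if result and result[0]:
        return result[0][0], result[0][1]
    return None

print(lift_hg19_to_hg38('1', 1000000))  # illustrative coordinate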
def get_elasticsearch_variants( self, project_id, family_id=None, variant_filter=None, genotype_filter=None, variant_id_filter=None, quality_filter=None, indivs_to_consider=None, include_all_consequences=False, user=None, max_results_limit=settings.VARIANT_QUERY_RESULTS_LIMIT, ): from xbrowse_server.base.models import Project, Family, Individual from seqr.models import Sample from seqr.utils.es_utils import _liftover_grch38_to_grch37 from xbrowse_server.mall import get_reference redis_client = None if settings.REDIS_SERVICE_HOSTNAME: try: redis_client = redis.StrictRedis(host=settings.REDIS_SERVICE_HOSTNAME, socket_connect_timeout=3) redis_client.ping() except redis.exceptions.TimeoutError as e: logger.warn("Unable to connect to redis host: {}".format(settings.REDIS_SERVICE_HOSTNAME) + str(e)) redis_client = None cache_key = "Variants___%s___%s___%s" % ( project_id, family_id, json.dumps([ variant_filter.toJSON() if variant_filter else None, genotype_filter, quality_filter, variant_id_filter, indivs_to_consider, include_all_consequences, ]) ) cached_results = redis_client and redis_client.get(cache_key) if cached_results is not None: variant_results = json.loads(cached_results) return [Variant.fromJSON(variant_json) for variant_json in variant_results] if family_id is None: project = Project.objects.get(project_id=project_id) elasticsearch_index = project.get_elasticsearch_index() logger.info("Searching in project elasticsearch index: " + str(elasticsearch_index)) else: family = Family.objects.get(project__project_id=project_id, family_id=family_id) elasticsearch_index = family.get_elasticsearch_index() project = family.project logger.info("Searching in family elasticsearch index: " + str(elasticsearch_index)) if indivs_to_consider is None and genotype_filter and not family_id: indivs_to_consider = genotype_filter.keys() individuals = Individual.objects.filter(family__project__project_id=project_id).only("indiv_id", "seqr_individual") if indivs_to_consider: individuals = individuals.filter(indiv_id__in=indivs_to_consider) if family_id is not None: individuals = individuals.filter(family__family_id=family_id) if not indivs_to_consider: indivs_to_consider = [i.indiv_id for i in individuals] prefetch_related_objects(individuals, "seqr_individual") es_indices = [index.rstrip('*') for index in elasticsearch_index.split(',')] samples = Sample.objects.filter( individual__in=[i.seqr_individual for i in individuals if i.seqr_individual], dataset_type=Sample.DATASET_TYPE_VARIANT_CALLS, sample_status=Sample.SAMPLE_STATUS_LOADED, elasticsearch_index__startswith=es_indices[0], loaded_date__isnull=False, ).order_by('-loaded_date') prefetch_related_objects(samples, "individual") family_individual_ids_to_sample_ids = {} for i in individuals: indiv_id = i.indiv_id sample_id = None if i.seqr_individual: sample_id = next(( sample.sample_id for sample in samples if sample.individual == i.seqr_individual and sample.elasticsearch_index.startswith(tuple(es_indices)) ), None) family_individual_ids_to_sample_ids[indiv_id] = sample_id or indiv_id query_json = self._make_db_query(genotype_filter, variant_filter) es_client = elasticsearch.Elasticsearch(host=settings.ELASTICSEARCH_SERVICE_HOSTNAME, timeout=30) mapping = es_client.indices.get_mapping(str(elasticsearch_index) + "*") index_fields = {} is_parent_child = False is_nested = False if elasticsearch_index in mapping and 'join_field' in mapping[elasticsearch_index]["mappings"]["variant"]["properties"]: # Nested indices are not sharded so all samples are in the 
single index logger.info("matching indices: " + str(elasticsearch_index)) is_parent_child = True elif elasticsearch_index in mapping and 'genotypes' in mapping[elasticsearch_index]["mappings"]["variant"]["properties"]: # Nested indices are not sharded so all samples are in the single index logger.info("matching indices: " + str(elasticsearch_index)) is_nested = True elif family_id is not None and len(family_individual_ids_to_sample_ids) > 0: # figure out which index to use # TODO add caching matching_indices = [] for raw_sample_id in family_individual_ids_to_sample_ids.values(): sample_id = _encode_name(raw_sample_id) for index_name, index_mapping in mapping.items(): if sample_id+"_num_alt" in index_mapping["mappings"]["variant"]["properties"]: matching_indices.append(index_name) index_fields.update(index_mapping["mappings"]["variant"]["properties"]) if len(matching_indices) > 0: break if not matching_indices: if family_id is not None and not family_individual_ids_to_sample_ids: logger.error("no individuals found for family %s" % (family_id)) elif not mapping: logger.error("no es mapping found for found with prefix %s" % (elasticsearch_index)) else: logger.error("%s not found in %s:\n%s" % (indiv_id, elasticsearch_index, pformat(index_mapping["mappings"]["variant"]["properties"]))) else: elasticsearch_index = ",".join(matching_indices) logger.info("matching indices: " + str(elasticsearch_index)) else: elasticsearch_index = str(elasticsearch_index)+"*" if not index_fields: for index_mapping in mapping.values(): index_fields.update(index_mapping["mappings"]["variant"]["properties"]) s = elasticsearch_dsl.Search(using=es_client, index=elasticsearch_index) #",".join(indices)) if variant_id_filter is not None: variant_id_filter_term = None for variant_id in variant_id_filter: q_obj = Q('term', **{"variantId": variant_id}) if variant_id_filter_term is None: variant_id_filter_term = q_obj else: variant_id_filter_term |= q_obj s = s.filter(variant_id_filter_term) genotype_filters = {} for key, value in query_json.items(): if key.startswith("genotypes"): indiv_id = ".".join(key.split(".")[1:-1]) sample_id = family_individual_ids_to_sample_ids.get(indiv_id) or indiv_id genotype_filter = value if type(genotype_filter) == int or type(genotype_filter) == basestring: genotype_filters[sample_id] = [('term', genotype_filter)] elif '$gte' in genotype_filter: genotype_filter = {k.replace("$", ""): v for k, v in genotype_filter.items()} genotype_filters[sample_id] = [('range', genotype_filter)] elif "$in" in genotype_filter: num_alt_values = genotype_filter['$in'] genotype_filters[sample_id] = [('term', num_alt_value) for num_alt_value in num_alt_values] sample_ids = [family_individual_ids_to_sample_ids.get(indiv_id) or indiv_id for indiv_id in (indivs_to_consider or [])] min_ab = None min_gq = None if quality_filter is not None and indivs_to_consider: min_ab = quality_filter.get('min_ab') if min_ab is not None and not is_nested: min_ab /= 100.0 # convert to fraction min_gq = quality_filter.get('min_gq') vcf_filter = quality_filter.get('vcf_filter') if vcf_filter is not None: s = s.filter(~Q('exists', field='filters')) if is_parent_child: quality_q = Q() if min_ab or min_gq: if min_ab is not None: # AB only relevant for hets quality_q &= Q(~Q('term', num_alt=1) | Q('range', ab={'gte': min_ab})) if min_gq is not None: quality_q &= Q('range', gq={'gte': min_gq}) if genotype_filters: # Return inner hits for all requested samples, even those without a specified genotype genotype_sample_ids = sample_ids or 
genotype_filters.keys() genotype_q = None for sample_id in genotype_sample_ids: sample_q = Q(Q('term', sample_id=sample_id) & quality_q) if genotype_filters.get(sample_id): q = None for (op, val) in genotype_filters[sample_id]: if q: q |= Q(op, num_alt=val) else: q = Q(op, num_alt=val) sample_q &= q if not genotype_q: genotype_q = sample_q else: genotype_q |= sample_q genotype_kwargs = {'query': genotype_q, 'min_children': len(genotype_sample_ids)} elif sample_ids: # Subquery for child docs with the requested sample IDs and quality metrics sample_id_q = Q('terms', sample_id=sample_ids) & quality_q # Only return variants where at least one of the requested samples has an alt allele s = s.filter(Q('has_child', type='genotype', query=(Q(Q('range', num_alt={'gte': 1}) & sample_id_q)))) # Return inner hits for all the requested samples regardless of genotype genotype_kwargs = {'query': sample_id_q, 'min_children': len(sample_ids)} else: # Return all inner hits for the variant # This case is only used by gene search, which also does not use quality filters genotype_kwargs = {'query': Q()} s = s.filter(Q('has_child', type='genotype', inner_hits={'size': genotype_kwargs.get('min_children', MAX_INNER_HITS)}, **genotype_kwargs)) if is_nested: if sample_ids and min_ab is not None: min_ab_filter_val = int(min_ab) - int(min_ab % 5) for sample_id in sample_ids: q = Q('term', samples_ab_0_to_5=sample_id) for i in range(5, min_ab_filter_val, 5): q = q | Q('term', **{'samples_ab_{}_to_{}'.format(i, i+5): sample_id}) # AB only relevant for hets s = s.filter(~Q(q) | ~Q('term', samples_num_alt_1=sample_id)) if sample_ids and min_gq is not None: min_gq_filter_val = int(min_gq) - int(min_gq % 5) for sample_id in sample_ids: q = Q('term', samples_gq_0_to_5=sample_id) for i in range(5, min_gq_filter_val, 5): q = q | Q('term', **{'samples_gq_{}_to_{}'.format(i, i+5): sample_id}) s = s.filter(~Q(q)) if genotype_filters: for sample_id, queries in genotype_filters.items(): if queries[0][0] == 'range': allowed_num_alt = range(queries[0][1]['gte'], 3) else: allowed_num_alt = [query[1] for query in queries] if 0 in allowed_num_alt: q = Q('term', samples_no_call=sample_id) if 1 not in allowed_num_alt: q = q | Q('term', samples_num_alt_1=sample_id) if 2 not in allowed_num_alt: q = q | Q('term', samples_num_alt_2=sample_id) s = s.filter(~q) else: q = Q('term', **{'samples_num_alt_{}'.format(allowed_num_alt[0]): sample_id}) for num_alt in allowed_num_alt[1:]: q = q | Q('term', **{'samples_num_alt_{}'.format(num_alt): sample_id}) s = s.filter(q) elif sample_ids: s = s.filter(Q('terms', samples_num_alt_1=sample_ids) | Q('terms', samples_num_alt_2=sample_ids)) else: for sample_id, queries in genotype_filters.items(): encoded_sample_id = _encode_name(sample_id) q = Q(queries[0][0], **{encoded_sample_id + "_num_alt": queries[0][1]}) for (op, val) in queries[1:]: q = q | Q(op, **{encoded_sample_id + "_num_alt": val}) s = s.filter(q) if sample_ids: atleast_one_nonref_genotype_filter = None for sample_id in sample_ids: encoded_sample_id = _encode_name(sample_id) q = Q('range', **{encoded_sample_id+"_num_alt": {'gte': 1}}) if atleast_one_nonref_genotype_filter is None: atleast_one_nonref_genotype_filter = q else: atleast_one_nonref_genotype_filter |= q s = s.filter(atleast_one_nonref_genotype_filter) if min_ab or min_gq: for sample_id in sample_ids: encoded_sample_id = _encode_name(sample_id) if min_ab: s = s.filter( ~Q('term', **{encoded_sample_id+"_num_alt": 1}) | Q('range', **{encoded_sample_id+"_ab": {'gte': min_ab}})) 
#logger.info("### ADDED FILTER: " + str({encoded_sample_id+"_ab": {'gte': min_ab}})) if min_gq: s = s.filter('range', **{encoded_sample_id+"_gq": {'gte': min_gq}}) #logger.info("### ADDED FILTER: " + str({encoded_sample_id+"_gq": {'gte': min_gq}})) # parse variant query annotation_groups_map = ANNOTATION_GROUPS_MAP_INTERNAL if user and user.is_staff else ANNOTATION_GROUPS_MAP for key, value in query_json.items(): if key == 'db_tags': so_annotations = query_json.get('db_tags', {}).get('$in', []) # handle clinvar filters selected_so_annotations_set = set(so_annotations) all_clinvar_filters_set = set(annotation_groups_map.get("clinvar", {}).get("children", [])) selected_clinvar_filters_set = all_clinvar_filters_set & selected_so_annotations_set all_hgmd_filters_set = set(annotation_groups_map.get("hgmd", {}).get("children", [])) selected_hgmd_filters_set = all_hgmd_filters_set & selected_so_annotations_set vep_consequences = list(selected_so_annotations_set - selected_clinvar_filters_set - selected_hgmd_filters_set) consequences_filter = Q("terms", transcriptConsequenceTerms=vep_consequences) if selected_clinvar_filters_set: clinvar_clinical_significance_terms = set() for clinvar_filter in selected_clinvar_filters_set: # translate selected filters to the corresponding clinvar clinical consequence terms if clinvar_filter == "pathogenic": clinvar_clinical_significance_terms.update(["Pathogenic", "Pathogenic/Likely_pathogenic"]) elif clinvar_filter == "likely_pathogenic": clinvar_clinical_significance_terms.update(["Likely_pathogenic", "Pathogenic/Likely_pathogenic"]) elif clinvar_filter == "benign": clinvar_clinical_significance_terms.update(["Benign", "Benign/Likely_benign"]) elif clinvar_filter == "likely_benign": clinvar_clinical_significance_terms.update(["Likely_benign", "Benign/Likely_benign"]) elif clinvar_filter == "vus_or_conflicting": clinvar_clinical_significance_terms.update([ "Conflicting_interpretations_of_pathogenicity", "Uncertain_significance", "not_provided", "other"]) else: raise ValueError("Unexpected clinvar filter: " + str(clinvar_filter)) consequences_filter = consequences_filter | Q("terms", clinvar_clinical_significance=list(clinvar_clinical_significance_terms)) if selected_hgmd_filters_set: hgmd_class = set() for hgmd_filter in selected_hgmd_filters_set: # translate selected filters to the corresponding hgmd clinical consequence terms if hgmd_filter == "disease_causing": hgmd_class.update(["DM"]) elif hgmd_filter == "likely_disease_causing": hgmd_class.update(["DM?"]) elif hgmd_filter == "hgmd_other": hgmd_class.update(["DP", "DFP", "FP", "FTV"]) else: raise ValueError("Unexpected hgmd filter: " + str(hgmd_filter)) consequences_filter = consequences_filter | Q("terms", hgmd_class=list(hgmd_class)) if 'intergenic_variant' in vep_consequences: # for many intergenic variants VEP doesn't add any annotations, so if user selected 'intergenic_variant', also match variants where transcriptConsequenceTerms is emtpy consequences_filter = consequences_filter | ~Q('exists', field='transcriptConsequenceTerms') s = s.filter(consequences_filter) #logger.info("==> transcriptConsequenceTerms: %s" % str(vep_consequences)) if key.startswith("genotypes"): continue if key == "db_gene_ids": db_gene_ids = query_json.get('db_gene_ids', {}) exclude_genes = db_gene_ids.get('$nin', []) gene_ids = exclude_genes or db_gene_ids.get('$in', []) if exclude_genes: s = s.exclude("terms", geneIds=gene_ids) else: s = s.filter("terms", geneIds=gene_ids) #logger.info("==> %s %s" % ("exclude" if 
exclude_genes else "include", "geneIds: " + str(gene_ids))) if key == "$or" and type(value) == list: q_terms = None for region_filter in value: xpos_filters = region_filter.get("$and", {}) # for example: $or : [{'$and': [{'xpos': {'$gte': 12345}}, {'xpos': {'$lte': 54321}}]}] xpos_filters_dict = {} for xpos_filter in xpos_filters: xpos_filter_setting = xpos_filter["xpos"] # for example {'$gte': 12345} or {'$lte': 54321} xpos_filters_dict.update(xpos_filter_setting) xpos_filter_setting = {k.replace("$", ""): v for k, v in xpos_filters_dict.items()} q = Q('range', **{"xpos": xpos_filter_setting}) if q_terms is None: q_terms = q else: q_terms |= q if q_terms is not None: s = s.filter(q_terms) #logger.info("==> xpos range: " + str({"xpos": xpos_filter_setting})) af_key_map = { "db_freqs.AF": ["AF"], "db_freqs.1kg_wgs_phase3": ["g1k_POPMAX_AF"], "db_freqs.exac_v3": ["exac_AF_POPMAX"], "db_freqs.topmed": ["topmed_AF"], "db_freqs.gnomad_exomes": ["gnomad_exomes_AF_POPMAX", "gnomad_exomes_AF_POPMAX_OR_GLOBAL"], "db_freqs.gnomad_genomes": ["gnomad_genomes_AF_POPMAX", "gnomad_genomes_AF_POPMAX_OR_GLOBAL"], "db_freqs.gnomad-exomes2": ["gnomad_exomes_AF_POPMAX", "gnomad_exomes_AF_POPMAX_OR_GLOBAL"], "db_freqs.gnomad-genomes2": ["gnomad_genomes_AF_POPMAX", "gnomad_genomes_AF_POPMAX_OR_GLOBAL"], } if key in af_key_map: for filter_key in af_key_map[key]: af_filter_setting = {k.replace("$", ""): v for k, v in value.items()} s = s.filter(Q('range', **{filter_key: af_filter_setting}) | ~Q('exists', field=filter_key)) #logger.info("==> %s: %s" % (filter_key, af_filter_setting)) ac_key_map = { "db_acs.AF": "AC", "db_acs.1kg_wgs_phase3": "g1k_AC", "db_acs.exac_v3": "exac_AC", "db_acs.topmed": "topmed_AC", "db_acs.gnomad_exomes": "gnomad_exomes_AC", "db_acs.gnomad_genomes": "gnomad_genomes_AC", "db_acs.gnomad-exomes2": "gnomad_exomes_AC", "db_acs.gnomad-genomes2": "gnomad_genomes_AC", } if key in ac_key_map: filter_key = ac_key_map[key] ac_filter_setting = {k.replace("$", ""): v for k, v in value.items()} s = s.filter(Q('range', **{filter_key: ac_filter_setting}) | ~Q('exists', field=filter_key)) hemi_key_map = { "db_hemi.exac_v3": "exac_AC_Hemi", "db_hemi.gnomad_exomes": "gnomad_exomes_Hemi", "db_hemi.gnomad_genomes": "gnomad_genomes_Hemi", "db_hemi.gnomad-exomes2": "gnomad_exomes_Hemi", "db_hemi.gnomad-genomes2": "gnomad_genomes_Hemi", } if key in hemi_key_map: filter_key = hemi_key_map[key] hemi_filter_setting = {k.replace("$", ""): v for k, v in value.items()} s = s.filter(Q('range', **{filter_key: hemi_filter_setting}) | ~Q('exists', field=filter_key)) hom_key_map = { "db_hom.exac_v3": "exac_AC_Hom", "db_hom.gnomad_exomes": "gnomad_exomes_Hom", "db_hom.gnomad_genomes": "gnomad_genomes_Hom", "db_hom.gnomad-exomes2": "gnomad_exomes_Hom", "db_hom.gnomad-genomes2": "gnomad_genomes_Hom", } if key in hom_key_map: filter_key = hom_key_map[key] hom_filter_setting = {k.replace("$", ""): v for k, v in value.items()} s = s.filter(Q('range', **{filter_key: hom_filter_setting}) | ~Q('exists', field=filter_key)) #s = s.sort("xpos") #logger.info("=====") #logger.info("FULL QUERY OBJ: " + pformat(s.__dict__)) #logger.info("FILTERS: " + pformat(s.to_dict())) # https://elasticsearch-py.readthedocs.io/en/master/helpers.html#elasticsearch.helpers.scan start = time.time() s = s.params(size=max_results_limit + 1) #if not include_all_consequences: # s = s.source(exclude=["sortedTranscriptConsequences"]) response = s.execute() logger.info("=====") logger.info("TOTAL: %s. 
Query took %s seconds" % (response.hits.total, time.time() - start)) if response.hits.total > max_results_limit + 1: raise Exception("This search matched too many variants. Please set additional filters and try again.") #print(pformat(response.to_dict())) project = Project.objects.get(project_id=project_id) #gene_list_map = project.get_gene_list_map() reference = get_reference() #for i, hit in enumerate(response.hits): variant_results = [] for i, hit in enumerate(response): # preserve_order=True #logger.info("HIT %s: %s %s %s" % (i, hit["variantId"], hit["geneIds"], pformat(hit.__dict__))) #print("HIT %s: %s" % (i, pformat(hit.to_dict()))) filters = ",".join(hit["filters"] or []) if "filters" in hit else "" genotypes = {} all_num_alt = [] if is_parent_child: genotypes_by_sample_id = {gen_hit['sample_id']: gen_hit for gen_hit in hit.meta.inner_hits.genotype} elif is_nested: genotypes_by_sample_id = {gen_hit['sample_id']: gen_hit for gen_hit in hit['genotypes']} for individual_id, sample_id in family_individual_ids_to_sample_ids.items(): def _get_hit_field(field): if is_parent_child or is_nested: gen_hit = genotypes_by_sample_id.get(sample_id, {}) key = field else: gen_hit = hit key = '{}_{}'.format(_encode_name(sample_id), field) return gen_hit[key] if key in gen_hit else None num_alt = _get_hit_field('num_alt') if num_alt is None: num_alt = -1 all_num_alt.append(num_alt) alleles = [] if num_alt == 0: alleles = [hit["ref"], hit["ref"]] elif num_alt == 1: alleles = [hit["ref"], hit["alt"]] elif num_alt == 2: alleles = [hit["alt"], hit["alt"]] elif num_alt == -1 or num_alt == None: alleles = [] else: raise ValueError("Invalid num_alt: " + str(num_alt)) genotypes[individual_id] = { 'ab': _get_hit_field('ab'), 'alleles': map(str, alleles), 'extras': { 'ad': _get_hit_field('ad'), 'dp': _get_hit_field('dp'), #'pl': '', }, 'filter': filters or "pass", 'gq': _get_hit_field('gq') or '', 'num_alt': num_alt, } vep_annotation = hit['sortedTranscriptConsequences'] if 'sortedTranscriptConsequences' in hit else None if vep_annotation is not None: if is_parent_child or is_nested: vep_annotation = [annot.to_dict() for annot in vep_annotation] else: vep_annotation = json.loads(str(vep_annotation)) gene_ids = list(hit['geneIds'] or []) worst_vep_index_per_gene = { gene_id: next((i for i, annot in enumerate(vep_annotation) if annot['gene_id'] == gene_id), None) for gene_id in gene_ids } if project.genome_version == GENOME_VERSION_GRCh37: grch38_coord = None if self.liftover_grch37_to_grch38: grch38_coord = self.liftover_grch37_to_grch38.convert_coordinate("chr%s" % hit["contig"].replace("chr", ""), int(hit["start"])) if grch38_coord and grch38_coord[0]: grch38_coord = "%s-%s-%s-%s "% (grch38_coord[0][0], grch38_coord[0][1], hit["ref"], hit["alt"]) else: grch38_coord = None else: grch38_coord = hit["variantId"] if project.genome_version == GENOME_VERSION_GRCh38: grch37_coord = None liftover_grch38_to_grch37 = _liftover_grch38_to_grch37() if liftover_grch38_to_grch37: grch37_coord = liftover_grch38_to_grch37.convert_coordinate("chr%s" % hit["contig"].replace("chr", ""), int(hit["start"])) if grch37_coord and grch37_coord[0]: grch37_coord = "%s-%s-%s-%s "% (grch37_coord[0][0], grch37_coord[0][1], hit["ref"], hit["alt"]) else: grch37_coord = None else: grch37_coord = hit["variantId"] freq_fields = { 'AF': "AF" if "AF" in index_fields else None, '1kg_wgs_AF': "g1k_AF" if "g1k_AF" in index_fields else None, '1kg_wgs_popmax_AF': "g1k_POPMAX_AF" if "g1k_POPMAX_AF" in index_fields else None, 'exac_v3_AF': "exac_AF" if 
"exac_AF" in index_fields else None, 'exac_v3_popmax_AF': "exac_AF_POPMAX" if "exac_AF_POPMAX" in index_fields else None, 'gnomad_exomes_AF': "gnomad_exomes_AF" if "gnomad_exomes_AF" in index_fields else None, 'gnomad_exomes_popmax_AF': "gnomad_exomes_AF_POPMAX_OR_GLOBAL" if "gnomad_exomes_AF_POPMAX_OR_GLOBAL" in index_fields else ( "gnomad_exomes_AF_POPMAX" if "gnomad_exomes_AF_POPMAX" in index_fields else None), 'gnomad_genomes_AF': "gnomad_genomes_AF" if "gnomad_genomes_AF" in index_fields else None, 'gnomad_genomes_popmax_AF': "gnomad_genomes_AF_POPMAX_OR_GLOBAL" if "gnomad_genomes_AF_POPMAX_OR_GLOBAL" in index_fields else ( "gnomad_genomes_AF_POPMAX" if "gnomad_genomes_AF_POPMAX" in index_fields else None), 'topmed_AF': "topmed_AF" if "topmed_AF" in index_fields else None, } result = { #u'_id': ObjectId('596d2207ff66f729285ca588'), 'alt': str(hit["alt"]) if "alt" in hit else None, 'annotation': { 'fathmm': fathmm_map.get(hit["dbnsfp_FATHMM_pred"].split(';')[0]) if "dbnsfp_FATHMM_pred" in hit and hit["dbnsfp_FATHMM_pred"] else None, 'muttaster': muttaster_map.get(hit["dbnsfp_MutationTaster_pred"].split(';')[0]) if "dbnsfp_MutationTaster_pred" in hit and hit["dbnsfp_MutationTaster_pred"] else None, 'polyphen': polyphen_map.get(hit["dbnsfp_Polyphen2_HVAR_pred"].split(';')[0]) if "dbnsfp_Polyphen2_HVAR_pred" in hit and hit["dbnsfp_Polyphen2_HVAR_pred"] else None, 'sift': sift_map.get(hit["dbnsfp_SIFT_pred"].split(';')[0]) if "dbnsfp_SIFT_pred" in hit and hit["dbnsfp_SIFT_pred"] else None, 'metasvm': metasvm_map.get(hit["dbnsfp_MetaSVM_pred"].split(';')[0]) if "dbnsfp_MetaSVM_pred" in hit and hit["dbnsfp_MetaSVM_pred"] else None, 'GERP_RS': float(hit["dbnsfp_GERP_RS"]) if "dbnsfp_GERP_RS" in hit and hit["dbnsfp_GERP_RS"] else None, 'phastCons100way_vertebrate': float(hit["dbnsfp_phastCons100way_vertebrate"]) if "dbnsfp_phastCons100way_vertebrate" in hit and hit["dbnsfp_phastCons100way_vertebrate"] else None, 'cadd_phred': hit["cadd_PHRED"] if "cadd_PHRED" in hit else None, 'dann_score': hit["dbnsfp_DANN_score"] if "dbnsfp_DANN_score" in hit else None, 'revel_score': hit["dbnsfp_REVEL_score"] if "dbnsfp_REVEL_score" in hit else None, 'eigen_phred': hit["eigen_Eigen_phred"] if "eigen_Eigen_phred" in hit else (hit["dbnsfp_Eigen_phred"] if "dbnsfp_Eigen_phred" in hit else None), 'mpc_score': hit["mpc_MPC"] if "mpc_MPC" in hit else None, 'primate_ai_score': hit["primate_ai_score"] if "primate_ai_score" in hit else None, 'splice_ai_delta_score': hit["splice_ai_delta_score"] if "splice_ai_delta_score" in hit else None, 'rsid': hit["rsid"] if "rsid" in hit else None, 'annotation_tags': list(hit["transcriptConsequenceTerms"] or []) if "transcriptConsequenceTerms" in hit else None, 'coding_gene_ids': list(hit['codingGeneIds'] or []), 'gene_ids': list(hit['geneIds'] or []), 'vep_annotation': vep_annotation, 'vep_group': str(hit['mainTranscript_major_consequence'] or "") if "mainTranscript_major_consequence" in hit else "", 'vep_consequence': str(hit['mainTranscript_major_consequence'] or "") if "mainTranscript_major_consequence" in hit else "", 'main_transcript': {k.replace('mainTranscript_', ''): hit[k] for k in dir(hit) if k.startswith('mainTranscript_')}, 'worst_vep_annotation_index': 0, 'worst_vep_index_per_gene': worst_vep_index_per_gene, }, 'chr': hit["contig"], 'coding_gene_ids': list(hit['codingGeneIds'] or []), 'gene_ids': gene_ids, 'coverage': { 'gnomad_exome_coverage': float(hit["gnomad_exome_coverage"] or -1) if "gnomad_exome_coverage" in hit else -1, 'gnomad_genome_coverage': 
float(hit["gnomad_genome_coverage"] or -1) if "gnomad_genome_coverage" in hit else -1, }, 'pop_counts': { 'AC': int(hit['AC'] or 0) if 'AC' in hit else None, 'AN': int(hit['AN'] or 0) if 'AN' in hit else None, 'g1kAC': int(hit['g1k_AC'] or 0) if 'g1k_AC' in hit else None, 'g1kAN': int(hit['g1k_AN'] or 0) if 'g1k_AN' in hit else None, 'exac_v3_AC': int(hit["exac_AC_Adj"] or 0) if "exac_AC_Adj" in hit else None, 'exac_v3_Het': int(hit["exac_AC_Het"] or 0) if "exac_AC_Het" in hit else None, 'exac_v3_Hom': int(hit["exac_AC_Hom"] or 0) if "exac_AC_Hom" in hit else None, 'exac_v3_Hemi': int(hit["exac_AC_Hemi"] or 0) if "exac_AC_Hemi" in hit else None, 'exac_v3_AN': int(hit["exac_AN_Adj"] or 0) if "exac_AN_Adj" in hit else None, 'gnomad_exomes_AC': int(hit["gnomad_exomes_AC"] or 0) if "gnomad_exomes_AC" in hit else None, 'gnomad_exomes_Hom': int(hit["gnomad_exomes_Hom"] or 0) if "gnomad_exomes_Hom" in hit else None, 'gnomad_exomes_Hemi': int(hit["gnomad_exomes_Hemi"] or 0) if "gnomad_exomes_Hemi" in hit else None, 'gnomad_exomes_AN': int(hit["gnomad_exomes_AN"] or 0) if "gnomad_exomes_AN" in hit else None, 'gnomad_genomes_AC': int(hit["gnomad_genomes_AC"] or 0) if "gnomad_genomes_AC" in hit else None, 'gnomad_genomes_Hom': int(hit["gnomad_genomes_Hom"] or 0) if "gnomad_genomes_Hom" in hit else None, 'gnomad_genomes_Hemi': int(hit["gnomad_genomes_Hemi"] or 0) if "gnomad_genomes_Hemi" in hit else None, 'gnomad_genomes_AN': int(hit["gnomad_genomes_AN"] or 0) if "gnomad_genomes_AN" in hit else None, 'topmed_AC': float(hit["topmed_AC"] or 0) if "topmed_AC" in hit else None, 'topmed_Het': float(hit["topmed_Het"] or 0) if "topmed_Het" in hit else None, 'topmed_Hom': float(hit["topmed_Hom"] or 0) if "topmed_Hom" in hit else None, 'topmed_AN': float(hit["topmed_AN"] or 0) if "topmed_AN" in hit else None, }, 'db_freqs': {k: float(hit[v] or 0.0) if v in hit else (0.0 if v else None) for k, v in freq_fields.items()}, #'popmax_populations': { # 'exac_popmax': hit["exac_POPMAX"] or None, # 'gnomad_exomes_popmax': hit["gnomad_exomes_POPMAX"] or None, # 'gnomad_genomes_popmax': hit["gnomad_genomes_POPMAX"] or None, #}, 'db_gene_ids': list((hit["geneIds"] or []) if "geneIds" in hit else []), 'db_tags': str(hit["transcriptConsequenceTerms"] or "") if "transcriptConsequenceTerms" in hit else None, 'extras': { 'clinvar_variant_id': hit['clinvar_variation_id'] if 'clinvar_variation_id' in hit and hit['clinvar_variation_id'] else None, 'clinvar_allele_id': hit['clinvar_allele_id'] if 'clinvar_allele_id' in hit and hit['clinvar_allele_id'] else None, 'clinvar_clinsig': hit['clinvar_clinical_significance'].lower() if ('clinvar_clinical_significance' in hit) and hit['clinvar_clinical_significance'] else None, 'clinvar_gold_stars': hit['clinvar_gold_stars'] if 'clinvar_gold_stars' in hit and hit['clinvar_gold_stars'] else None, 'hgmd_class': hit['hgmd_class'] if 'hgmd_class' in hit and user and user.is_staff else None, 'hgmd_accession': hit['hgmd_accession'] if 'hgmd_accession' in hit else None, 'genome_version': project.genome_version, 'grch37_coords': grch37_coord, 'grch38_coords': grch38_coord, 'alt_allele_pos': 0, 'orig_alt_alleles': map(str, [a.split("-")[-1] for a in hit["originalAltAlleles"]]) if "originalAltAlleles" in hit else None }, 'genotypes': genotypes, 'pos': long(hit['start']), 'pos_end': str(hit['end']), 'ref': str(hit['ref']), 'vartype': 'snp' if len(hit['ref']) == len(hit['alt']) else "indel", 'vcf_id': None, 'xpos': long(hit["xpos"]), 'xposx': long(hit["xpos"]), } result["annotation"]["freqs"] = 
result["db_freqs"] result["annotation"]["pop_counts"] = result["pop_counts"] result["annotation"]["db"] = "elasticsearch" result["extras"]["svlen"] = hit["SVLEN"] if "SVLEN" in hit else None result["extras"]["svtype"] = hit["SVTYPE"] if "SVTYPE" in hit else None logger.info("Result %s: GRCh37: %s GRCh38: %s - gene ids: %s, coding gene_ids: %s" % ( i, grch37_coord, grch38_coord, result["gene_ids"], result["coding_gene_ids"])) result["extras"]["project_id"] = project_id result["extras"]["family_id"] = family_id # add gene info gene_names = {} if vep_annotation is not None: gene_names = {vep_anno["gene_id"]: vep_anno.get("gene_symbol") for vep_anno in vep_annotation if vep_anno.get("gene_symbol")} result["extras"]["gene_names"] = gene_names try: genes = {} for gene_id in result["gene_ids"]: if gene_id: genes[gene_id] = reference.get_gene_summary(gene_id) or {} #if not genes: # genes = {vep_anno["gene_id"]: {"symbol": vep_anno["gene_symbol"]} for vep_anno in vep_annotation} result["extras"]["genes"] = genes except Exception as e: exc_type, exc_obj, exc_tb = sys.exc_info() logger.warn("WARNING: got unexpected error in add_gene_names_to_variants: %s : line %s" % (e, exc_tb.tb_lineno)) variant_results.append(result) logger.info("Finished returning the %s variants: %s seconds" % (response.hits.total, time.time() - start)) if redis_client: redis_client.set(cache_key, json.dumps(variant_results)) return [Variant.fromJSON(variant_json) for variant_json in variant_results]
def cache_skills(self, update=True): if not update and hasattr(self, 'cached_skills'): return prefetch_related_objects([self], Prefetch('skills', to_attr='cached_skills'))
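A minimal sketch of the same to_attr caching idiom on a single instance; the Profile/Skill models and the helper name here are hypothetical:

from django.db.models import Prefetch, prefetch_related_objects

def get_cached_skills(profile):
    """Prefetch `skills` once and reuse `profile.cached_skills` on later calls."""
    if not hasattr(profile, 'cached_skills'):
        # Stores the prefetched Skill instances as a plain list on the attribute.
        prefetch_related_objects([profile], Prefetch('skills', to_attr='cached_skills'))
    return profile.cached_skills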
def _copy_facts_to_subscribers(facts, subscribers): ''' The meat-and-potatoes of the copy operation. ''' from manabi.apps.flashcards.models import Card, Fact, Deck shared_deck = facts[0].deck subscriber_decks = shared_deck.subscriber_decks.filter( owner__in=subscribers, active=True, ) subscriber_deck_values = subscriber_decks.values_list('id', 'owner_id') subscriber_decks_already_with_facts = ( _subscriber_decks_already_with_facts(subscriber_decks, facts) ) fact_cards_prefetch = Prefetch( 'card_set', queryset=Card.objects.filter(active=True, suspended=False), to_attr='available_cards', ) try: facts = ( facts.filter(active=True) .prefetch_related(fact_cards_prefetch) ) except AttributeError: facts = [fact for fact in facts if fact.active] prefetch_related_objects(facts, fact_cards_prefetch) copied_facts = [] copied_cards = [] updated_subscriber_deck_ids = set() for shared_fact in facts: copy_attrs = [ 'active', 'suspended', 'new_fact_ordinal', 'expression', 'reading', 'meaning', 'example_sentence', 'jmdict_id', ] fact_kwargs = {attr: getattr(shared_fact, attr) for attr in copy_attrs} for subscriber_deck_id, subscriber_id in subscriber_deck_values: if _subscriber_deck_already_has_fact( subscriber_deck_id, shared_fact, subscriber_decks_already_with_facts, ): continue fact = Fact( deck_id=subscriber_deck_id, synchronized_with=shared_fact, **fact_kwargs ) copied_facts.append(fact) # Copy the cards. copied_cards_for_fact = [] for shared_card in shared_fact.available_cards: card = shared_card.copy(fact, owner_id=subscriber_id) copied_cards_for_fact.append(card) copied_cards.append(copied_cards_for_fact) updated_subscriber_deck_ids.add(subscriber_deck_id) # Persist everything. created_facts = Fact.objects.bulk_create( copied_facts, batch_size=BULK_BATCH_SIZE) for fact, fact_cards in zip(created_facts, copied_cards): for fact_card in fact_cards: fact_card.fact_id = fact.id Card.objects.bulk_create( itertools.chain.from_iterable(copied_cards), batch_size=BULK_BATCH_SIZE) # Refresh denormalized card count. for subscriber_deck_id in updated_subscriber_deck_ids: Deck.objects.filter(id=subscriber_deck_id).update( card_count=Card.objects.filter( deck_id=subscriber_deck_id, ).available().count(), )
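The try/except in the function above lets the same code accept either a QuerySet or an already-materialised list of facts. A small sketch of that dual-path idiom (the helper name is hypothetical):

from django.db.models import prefetch_related_objects

def prefetch_available_cards(facts, cards_prefetch):
    """Apply the same Prefetch whether `facts` is a QuerySet or a list of instances."""
    try:
        # QuerySet path: returns a new, lazily evaluated queryset.
        return facts.prefetch_related(cards_prefetch)
    except AttributeError:
        # List path: prefetch in place on the instances that are already in memory.
        prefetch_related_objects(facts, cards_prefetch)
        return facts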
def collectQuizTasksForTopic(articles=None, topic=None, project=None): taskList = [] # getTopicTree returns the topic with all levels of its subtopic tree topictree = topic.getTopicTree() # Prefetching uses one query per related table to populate caches. # This helps us avoid per row queries when looping over rows. prefetch_related_objects(topictree, "questions__answers") # Set up the prefetch to retrieve all available hints for each article allHints = NLPHints.objects.all() fetchHints = Prefetch("hints", queryset=allHints, to_attr="allHints") logger.info("Found %d hints" % (len(allHints),)) # Set up Prefetch that will cache just the highlights matching # this topic to article.highlight_taskruns[n].highlightsForTopic exclude_ids = [] # Pick the contributor based on what's selected in the GUI. contributor_id = project.task_config['contributor_id'] topicHighlights = HighlightGroup.objects.filter(topic=topic, article_highlight__contributor=contributor_id) # Filter the highlights based on the min tokens provided on project creation min_tokens_per_highlight = project.task_config['min_tokens'] max_tokens_per_highlight = project.task_config['max_tokens'] for topic_hlght in topicHighlights: total_count = topic_hlght.token_count() if (total_count < min_tokens_per_highlight or total_count > max_tokens_per_highlight): exclude_ids.append(topic_hlght.id) logger.info("Excluded HighlightGroup: {} {} {} tokens". format(topic_hlght.id, topic_hlght.topic.name, total_count)) topicHighlights = topicHighlights.exclude(id__in=exclude_ids) fetchHighlights = Prefetch("highlight_taskruns__highlights", queryset=topicHighlights, to_attr="highlightsForTopic") # Find articles highlighted with the topic within the provided queryset # distinct is essential after prefetch_related chained method articles = (articles .filter(highlight_taskruns__highlights__topic=topic) .prefetch_related(fetchHighlights) .prefetch_related(fetchHints) .order_by("article_number") .distinct()) logger.info("collectQuizTasks sorting by article_number for topic {}: {}" .format(topic.name, [article.article_number for article in articles]) ) project_data = ProjectSerializer(project, many=False).data topictree_data = TopicSerializer2(topictree, many=True).data # With the prefetching config above, the loops below will # be hitting caches. Only 8 queries should be issued against 8 tables, # i.e. The query count will not be a function of number of rows returned. for article in articles: # Our prefetched highlightsForTopic is nested under # the ArticleHightlight record, in HighlightGroup # Not expecting more than one ArticleHighlight record # but safest to code as if there could be more than one. highlights = [ hg for ah in article.highlight_taskruns.all() for hg in ah.highlightsForTopic ] # At this point, we are processing one topic for one article # All the highlights for a given topic/case need to be in one task. # Need to sort here instead of the above prefetch because we want # to ignore the potential grouping effect if there was more than one # ArticleHighlight in above list comprehension # See data.pybossa_api.save_quiz_taskrun for import code sortkey = lambda x: x.case_number hg_by_case = sorted(highlights, key=sortkey) # Although this code can send multiple HighlightGroups, the # Quiz task presenter will only use the first one. # So when there are multiple highlight taskruns in the database, # (for a given article and topic), # the taskrun to be processed by a Quiz will essentially be selected # at random. 
There will need to be a way to flag the # official "Gold Standard" HighlightGroup that was distilled from # multiple Highlighter taskruns, that will be the one sent to the Quiz. for case_number, hg_case_group in groupby(hg_by_case, key=sortkey): taskList.append({ "project": project_data, "topTopicId": topic.id, "topictree": topictree_data, "article": ArticleSerializer(article, many=False).data, "highlights": HighlightGroupSerializer( hg_case_group, many=True).data, "hints": NLPHintSerializer(article.allHints, many=True).data, }) return taskList
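A reduced sketch of the filtered Prefetch-with-to_attr idiom used above, with hypothetical Article/Highlight models standing in for the real ones:

from django.db.models import Prefetch

from myapp.models import Article, Highlight  # hypothetical: Article has a reverse relation `highlights`

def articles_with_topic_highlights(topic):
    """Return articles highlighted with `topic`, caching only those highlights on each article."""
    topic_highlights = Highlight.objects.filter(topic=topic)
    return (
        Article.objects
        .filter(highlights__topic=topic)
        .prefetch_related(
            Prefetch('highlights', queryset=topic_highlights, to_attr='highlights_for_topic')
        )
        .distinct()  # the filter join can otherwise duplicate article rows
    )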
def render_rules(rules=None, version=None): ''' Render rules in a format that Prometheus understands :param rules: List of rules :type rules: list(Rule) :param int version: Prometheus rule format (1 or 2) :return: Returns rules in yaml or Prometheus v1 format :rtype: bytes This function can render in either v1 or v2 format We call prefetch_related_objects within this function to populate the other related objects that are mostly used for the sub lookups. ''' if rules is None: rules = models.Rule.objects.filter(enabled=True) if version is None: version = settings.PROMGEN['prometheus'].get('version', 1) prefetch_related_objects( rules, 'content_object', 'content_type', 'overrides__content_object', 'overrides__content_type', 'ruleannotation_set', 'rulelabel_set', ) # V1 format is a custom format which we render through django templates # See promgen/tests/examples/import.rule if version == 1: return render_to_string('promgen/prometheus.rule', { 'rules': rules }).encode('utf-8') # V2 format is a yaml dictionary which we build and then render # See promgen/tests/examples/import.rule.yml rule_list = collections.defaultdict(list) for r in rules: rule_list[str(r.content_object)].append({ 'alert': r.name, 'expr': macro.rulemacro(r.clause, r), 'for': r.duration, 'labels': r.labels, 'annotations': r.annotations, }) return yaml.safe_dump( { 'groups': [{ 'name': name, 'rules': rule_list[name] } for name in rule_list] }, default_flow_style=False, allow_unicode=True, encoding='utf-8')
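As the docstring above notes, prefetch_related_objects is called so the later per-rule lookups hit caches. A reduced sketch of prefetching a generic relation plus a reverse set before grouping, with a hypothetical Rule model:

import collections

from django.db.models import prefetch_related_objects

def rules_grouped_by_owner(rules):
    """Group Rule instances by the object they belong to without per-rule queries."""
    rules = list(rules)
    # content_object is a GenericForeignKey; prefetch_related handles it alongside
    # ordinary reverse relations such as rulelabel_set.
    prefetch_related_objects(rules, 'content_object', 'rulelabel_set')
    grouped = collections.defaultdict(list)
    for rule in rules:
        grouped[str(rule.content_object)].append(rule)
    return grouped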
def prefetch_related(self, *args): prefetch_related_objects(self.results, *args) return self
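A sketch of how such a chainable helper typically sits on a small results wrapper; the class and the Card model mentioned in the usage comment are hypothetical:

from django.db.models import prefetch_related_objects

class ResultPage(object):
    """Minimal wrapper around a list of already-fetched model instances."""

    def __init__(self, results):
        self.results = list(results)

    def prefetch_related(self, *lookups):
        # Mirror QuerySet.prefetch_related() for instances that are already in memory.
        prefetch_related_objects(self.results, *lookups)
        return self

# Usage: ResultPage(Card.objects.all()[:25]).prefetch_related('printings__set', 'faces').results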
def _get_projects_details(projects, user, project_category_guid=None): for project in projects: check_permissions(project, user) prefetch_related_objects(projects, 'can_view_group') project_models_by_guid = {project.guid: project for project in projects} projects_json = get_json_for_projects(projects, user) locus_lists = set() functional_data_tag_types = get_json_for_variant_functional_data_tag_types( ) variant_tag_types_by_guid = { vtt.guid: vtt for vtt in VariantTagType.objects.filter( Q(project__in=projects) | Q(project__isnull=True)).prefetch_related('project') } variant_tag_types = _get_json_for_models( variant_tag_types_by_guid.values()) for project_json in projects_json: project = project_models_by_guid[project_json['projectGuid']] project_locus_lists = get_project_locus_list_models(project) locus_lists.update(project_locus_lists) project_json.update({ 'locusListGuids': [locus_list.guid for locus_list in project_locus_lists], 'variantTagTypes': [ vtt for vtt in variant_tag_types if variant_tag_types_by_guid[vtt['variantTagTypeGuid']].project is None or variant_tag_types_by_guid[vtt['variantTagTypeGuid']] .project.guid == project_json['projectGuid'] ], 'variantFunctionalTagTypes': functional_data_tag_types, }) families = _get_json_for_families( Family.objects.filter(project__in=projects), user) individuals = _get_json_for_individuals( Individual.objects.filter(family__project__in=projects), user=user) samples = get_json_for_samples( Sample.objects.filter(individual__family__project__in=projects)) analysis_groups = get_json_for_analysis_groups( AnalysisGroup.objects.filter(project__in=projects)) individual_guids_by_family = defaultdict(list) for individual in individuals: individual_guids_by_family[individual['familyGuid']].append( individual['individualGuid']) for family in families: family['individualGuids'] = individual_guids_by_family[ family['familyGuid']] sample_guids_by_individual = defaultdict(list) for sample in samples: sample_guids_by_individual[sample['individualGuid']].append( sample['sampleGuid']) for individual in individuals: individual['sampleGuids'] = sample_guids_by_individual[ individual['individualGuid']] response = { 'projectsByGuid': {p['projectGuid']: p for p in projects_json}, 'familiesByGuid': {f['familyGuid']: f for f in families}, 'individualsByGuid': {i['individualGuid']: i for i in individuals}, 'samplesByGuid': {s['sampleGuid']: s for s in samples}, 'locusListsByGuid': { ll['locusListGuid']: ll for ll in get_json_for_locus_lists(list(locus_lists), user) }, 'analysisGroupsByGuid': {ag['analysisGroupGuid']: ag for ag in analysis_groups}, } if project_category_guid: response['projectCategoriesByGuid'] = { project_category_guid: ProjectCategory.objects.get(guid=project_category_guid).json() } return response
def saved_variants_page(request, tag): gene = request.GET.get('gene') tag_type = VariantTagType.objects.get(name=tag, project__isnull=True) saved_variant_models = SavedVariant.objects.filter( varianttag__variant_tag_type=tag_type) if gene: saved_variant_models = saved_variant_models.filter( saved_variant_json__transcripts__has_key=gene) if saved_variant_models.count() > 10000 and not gene: return create_json_response( {'message': 'Select a gene to filter variants'}, status=400) prefetch_related_objects(saved_variant_models, 'family__project') response_json = get_json_for_saved_variants_with_tags( saved_variant_models, add_details=True, include_missing_variants=True) project_models_by_guid = { variant.family.project.guid: variant.family.project for variant in saved_variant_models } families = {variant.family for variant in saved_variant_models} individuals = Individual.objects.filter(family__in=families) saved_variants = response_json['savedVariantsByGuid'].values() genes = _saved_variant_genes(saved_variants) locus_list_guids = _add_locus_lists(project_models_by_guid.values(), saved_variants, genes) projects_json = get_json_for_projects( project_models_by_guid.values(), user=request.user, add_project_category_guids_field=False) functional_tag_types = get_json_for_variant_functional_data_tag_types() variant_tag_types = VariantTagType.objects.filter( Q(project__in=project_models_by_guid.values()) | Q(project__isnull=True)) prefetch_related_objects(variant_tag_types, 'project') variant_tags_json = _get_json_for_models(variant_tag_types) tag_projects = { vt.guid: vt.project.guid for vt in variant_tag_types if vt.project } for project_json in projects_json: project_guid = project_json['projectGuid'] project_variant_tags = [ vt for vt in variant_tags_json if tag_projects.get( vt['variantTagTypeGuid'], project_guid) == project_guid ] project_json.update({ 'locusListGuids': locus_list_guids, 'variantTagTypes': sorted(project_variant_tags, key=lambda variant_tag_type: variant_tag_type['order']), 'variantFunctionalTagTypes': functional_tag_types, }) families_json = _get_json_for_families(list(families), user=request.user, add_individual_guids_field=True) individuals_json = _get_json_for_individuals(individuals, user=request.user) locus_lists_by_guid = { locus_list['locusListGuid']: locus_list for locus_list in get_json_for_locus_lists( LocusList.objects.filter(guid__in=locus_list_guids), request.user) } response_json.update({ 'genesById': genes, 'projectsByGuid': {project['projectGuid']: project for project in projects_json}, 'familiesByGuid': {family['familyGuid']: family for family in families_json}, 'individualsByGuid': {indiv['individualGuid']: indiv for indiv in individuals_json}, 'locusListsByGuid': locus_lists_by_guid, }) return create_json_response(response_json)
def _get_json_for_families(families, user=None, add_individual_guids_field=False, project_guid=None, skip_nested=False, is_analyst=None, has_case_review_perm=None): """Returns a JSON representation of the given Family. Args: families (array): array of django models representing the family. user (object): Django User object for determining whether to include restricted/internal-only fields add_individual_guids_field (bool): whether to add an 'individualGuids' field. NOTE: this will require a database query. project_guid (boolean): An optional field to use as the projectGuid instead of querying the DB Returns: array: json objects """ if not families: return [] def _get_pedigree_image_url(pedigree_image): if isinstance(pedigree_image, ImageFieldFile): try: pedigree_image = pedigree_image.url except Exception: pedigree_image = None return os.path.join("/media/", pedigree_image) if pedigree_image else None analyst_users = set( User.objects.filter( groups__name=ANALYST_USER_GROUP) if ANALYST_USER_GROUP else []) def _process_result(result, family): result['analysedBy'] = [{ 'createdBy': { 'fullName': ab.created_by.get_full_name(), 'email': ab.created_by.email, 'isAnalyst': ab.created_by in analyst_users }, 'lastModifiedDate': ab.last_modified_date, } for ab in family.familyanalysedby_set.all()] pedigree_image = _get_pedigree_image_url(result.pop('pedigreeImage')) result['pedigreeImage'] = pedigree_image if add_individual_guids_field: result['individualGuids'] = [ i.guid for i in family.individual_set.all() ] if not result['displayName']: result['displayName'] = result['familyId'] if result['assignedAnalyst']: result['assignedAnalyst'] = { 'fullName': result['assignedAnalyst'].get_full_name(), 'email': result['assignedAnalyst'].email, } else: result['assignedAnalyst'] = None prefetch_related_objects(families, 'assigned_analyst') prefetch_related_objects(families, 'familyanalysedby_set__created_by') if add_individual_guids_field: prefetch_related_objects(families, 'individual_set') kwargs = { 'additional_model_fields': _get_case_review_fields(families[0], has_case_review_perm, user, lambda family: family.project) } if project_guid or not skip_nested: kwargs.update({ 'nested_fields': [{ 'fields': ('project', 'guid'), 'value': project_guid }] }) else: kwargs['additional_model_fields'].append('project_id') return _get_json_for_models(families, user=user, is_analyst=is_analyst, process_result=_process_result, **kwargs)
def elasticsearch_status(request): client = get_es_client() disk_fields = ['node', 'disk.avail', 'disk.used', 'disk.percent'] disk_status = [{ _to_camel_case(field.replace('.', '_')): disk[field] for field in disk_fields } for disk in client.cat.allocation(format="json", h=','.join(disk_fields)) ] index_fields = [ 'index', 'docs.count', 'store.size', 'creation.date.string' ] indices = [{ _to_camel_case(field.replace('.', '_')): index[field] for field in index_fields } for index in client.cat.indices(format="json", h=','.join(index_fields)) if all(not index['index'].startswith(omit_prefix) for omit_prefix in ['.', 'index_operations_log'])] aliases = defaultdict(list) for alias in client.cat.aliases(format="json", h='alias,index'): aliases[alias['alias']].append(alias['index']) mappings = Index('_all', using=client).get_mapping(doc_type='variant') active_samples = Sample.objects.filter( dataset_type=Sample.DATASET_TYPE_VARIANT_CALLS, is_active=True, elasticsearch_index__isnull=False, ).prefetch_related('individual', 'individual__family') prefetch_related_objects(active_samples, 'individual__family__project') seqr_index_projects = defaultdict(lambda: defaultdict(set)) es_projects = set() for sample in active_samples: for index_name in sample.elasticsearch_index.split(','): project = sample.individual.family.project es_projects.add(project) if index_name in aliases: for aliased_index_name in aliases[index_name]: seqr_index_projects[aliased_index_name][project].add( sample.individual.guid) else: seqr_index_projects[index_name.rstrip('*')][project].add( sample.individual.guid) for index in indices: index_name = index['index'] index_mapping = mappings[index_name]['mappings']['variant'] index.update(index_mapping.get('_meta', {})) projects_for_index = [] for index_prefix in seqr_index_projects.keys(): if index_name.startswith(index_prefix): projects_for_index += seqr_index_projects.pop( index_prefix).keys() index['projects'] = [{ 'projectGuid': project.guid, 'projectName': project.name } for project in projects_for_index] errors = [ '{} does not exist and is used by project(s) {}'.format( index, ', '.join([ '{} ({} samples)'.format(p.name, len(indivs)) for p, indivs in project_individuals.items() ])) for index, project_individuals in seqr_index_projects.items() if project_individuals ] return create_json_response({ 'indices': indices, 'diskStats': disk_status, 'elasticsearchHost': ELASTICSEARCH_SERVER, 'errors': errors, })
def elasticsearch_status(request): client = get_es_client() disk_fields = ['node', 'disk.avail', 'disk.used', 'disk.percent'] disk_status = [{ _to_camel_case(field.replace('.', '_')): disk[field] for field in disk_fields } for disk in client.cat.allocation(format="json", h=','.join(disk_fields)) ] index_fields = [ 'index', 'docs.count', 'store.size', 'creation.date.string' ] indices = [{ _to_camel_case(field.replace('.', '_')): index[field] for field in index_fields } for index in client.cat.indices(format="json", h=','.join(index_fields)) if index['index'] not in ['.kibana', 'index_operations_log']] aliases = defaultdict(list) for alias in client.cat.aliases(format="json", h='alias,index'): aliases[alias['alias']].append(alias['index']) mappings = Index('_all', using=client).get_mapping(doc_type='variant') latest_loaded_samples = get_latest_loaded_samples() prefetch_related_objects(latest_loaded_samples, 'individual__family__project') seqr_index_projects = defaultdict(lambda: defaultdict(set)) es_projects = set() for sample in latest_loaded_samples: for index_name in sample.elasticsearch_index.split(','): project = sample.individual.family.project es_projects.add(project) if index_name in aliases: for aliased_index_name in aliases[index_name]: seqr_index_projects[aliased_index_name][project].add( sample.individual.guid) else: seqr_index_projects[index_name.rstrip('*')][project].add( sample.individual.guid) for index in indices: index_name = index['index'] index_mapping = mappings[index_name]['mappings']['variant'] index.update(index_mapping.get('_meta', {})) index['hasNestedGenotypes'] = 'samples_num_alt_1' in index_mapping[ 'properties'] projects_for_index = [] for index_prefix in seqr_index_projects.keys(): if index_name.startswith(index_prefix): projects_for_index += seqr_index_projects.pop( index_prefix).keys() index['projects'] = [{ 'projectGuid': project.guid, 'projectName': project.name } for project in projects_for_index] errors = [ '{} does not exist and is used by project(s) {}'.format( index, ', '.join([ '{} ({} samples)'.format(p.name, len(indivs)) for p, indivs in project_individuals.items() ])) for index, project_individuals in seqr_index_projects.items() if project_individuals ] # TODO remove once all projects are switched off of mongo all_mongo_samples = Sample.objects.filter( dataset_type=Sample.DATASET_TYPE_VARIANT_CALLS, sample_status=Sample.SAMPLE_STATUS_LOADED, elasticsearch_index__isnull=True, ).exclude(individual__family__project__in=es_projects).prefetch_related( 'individual', 'individual__family__project') mongo_sample_individual_max_loaded_date = { agg['individual__guid']: agg['max_loaded_date'] for agg in all_mongo_samples.values('individual__guid').annotate( max_loaded_date=Max('loaded_date')) } mongo_project_samples = defaultdict(set) for s in all_mongo_samples: if s.loaded_date == mongo_sample_individual_max_loaded_date[ s.individual.guid]: mongo_project_samples[s.individual.family.project].add( s.dataset_file_path) mongo_projects = [{ 'projectGuid': project.guid, 'projectName': project.name, 'sourceFilePaths': sample_file_paths } for project, sample_file_paths in mongo_project_samples.items()] return create_json_response({ 'indices': indices, 'diskStats': disk_status, 'elasticsearchHost': ELASTICSEARCH_SERVER, 'mongoProjects': mongo_projects, 'errors': errors, })
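In both status views above, the prefetch exists so the loop can walk sample.individual.family.project without issuing queries per row. A reduced sketch of that grouping pattern (the helper name is hypothetical):

from collections import defaultdict

from django.db.models import prefetch_related_objects

def samples_by_project(samples):
    """Group already-fetched Sample instances by project with a bounded number of queries."""
    # One query per relation level (individual, family, project) instead of
    # up to three lazy queries per sample inside the loop.
    prefetch_related_objects(samples, 'individual__family__project')
    grouped = defaultdict(list)
    for sample in samples:
        grouped[sample.individual.family.project].append(sample)
    return grouped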
def _form_search_response_data_from_vouchers(self, vouchers, user_email, user): """ Build a list of dictionaries that contains the relevant information for each voucher_application (redemption) or offer_assignment (assignment). Returns a list of dictionaries to be handed to the serializer for construction of pagination. """ def _prepare_redemption_data(coupon_data, offer_assignment=None): """ Prepares redemption data for the received voucher in coupon_data """ redemption_data = dict(coupon_data) redemption_data['course_title'] = None redemption_data['course_key'] = None redemption_data['redeemed_date'] = None redemption_data[ 'user_email'] = offer_assignment.user_email if offer_assignment else None redemptions_and_assignments.append(redemption_data) redemptions_and_assignments = [] prefetch_related_objects(vouchers, 'applications', 'coupon_vouchers', 'coupon_vouchers__coupon', 'offers', 'offers__condition', 'offers__offerassignment_set') for voucher in vouchers: coupon_vouchers = voucher.coupon_vouchers.all() coupon_voucher = coupon_vouchers[0] coupon_data = { 'coupon_id': coupon_voucher.coupon.id, 'coupon_name': coupon_voucher.coupon.title, 'code': voucher.code, 'voucher_id': voucher.id, } if user is not None: for application in voucher.applications.all(): if application.user.id == user.id: line = application.order.lines.first() redemption_data = dict(coupon_data) redemption_data[ 'course_title'] = line.product.course.name redemption_data['course_key'] = line.product.course.id redemption_data[ 'redeemed_date'] = application.date_created redemptions_and_assignments.append(redemption_data) offer = voucher and voucher.enterprise_offer all_offer_assignments = offer.offerassignment_set.all() offer_assignments = [] for assignment in all_offer_assignments: if (assignment.voucher_application is None and assignment.status in [OFFER_ASSIGNED, OFFER_ASSIGNMENT_EMAIL_PENDING] and assignment.code == voucher.code and (assignment.user_email == user_email if user_email else True)): offer_assignments.append(assignment) coupon_data['is_assigned'] = len(offer_assignments) # For the case when an unassigned voucher code is searched if len(offer_assignments) == 0: if not user_email: _prepare_redemption_data(coupon_data) else: for offer_assignment in offer_assignments: _prepare_redemption_data(coupon_data, offer_assignment) return redemptions_and_assignments
def search(self, page_number: int = 1, page_size: int = 25) -> None: """ Runs the search for this search and constructs :param page_number: The result page :param page_size: The number of items per page """ queryset = self.get_queryset() print(str(queryset.query)) self.paginator = Paginator(queryset, page_size) try: self.page = self.paginator.page(page_number) except EmptyPage: return cards = list(self.page) prefetch_related_objects(cards, "printings__face_printings") prefetch_related_objects(cards, "printings__localisations__ownerships") prefetch_related_objects(cards, "printings__localisations__language") prefetch_related_objects(cards, "printings__localisations__localised_faces") prefetch_related_objects(cards, "faces") prefetch_related_objects(cards, "printings__set") prefetch_related_objects(cards, "printings__rarity") preferred_set = self.get_preferred_set() self.results = [ SearchResult(card, selected_set=preferred_set) for card in cards ]
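Because prefetch_related_objects accepts any number of lookups, the seven separate calls above could likely be collapsed into one; within a single call Django only walks the shared printings and printings__localisations prefixes once:

from django.db.models import prefetch_related_objects

# Equivalent single call for the cards fetched above.
prefetch_related_objects(
    cards,
    "printings__face_printings",
    "printings__localisations__ownerships",
    "printings__localisations__language",
    "printings__localisations__localised_faces",
    "printings__set",
    "printings__rarity",
    "faces",
)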
def site(request: "HttpRequest") -> dict: """Add site settings to the context under the 'site' key.""" site = get_current_site(request) if isinstance(site, Site): prefetch_related_objects([site], "settings__translations") return {"site": site}
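For a context processor like the one above to run, it must be listed in the template settings. A minimal settings excerpt, assuming a hypothetical myproject/context_processors.py module:

# settings.py (excerpt)
TEMPLATES = [
    {
        "BACKEND": "django.template.backends.django.DjangoTemplates",
        "DIRS": [],
        "APP_DIRS": True,
        "OPTIONS": {
            "context_processors": [
                "django.template.context_processors.request",
                # ... other default processors ...
                "myproject.context_processors.site",  # hypothetical dotted path
            ],
        },
    },
]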