def get_variants_for_inheritance_for_project(project, inheritance_mode):
    """
    Get the variants for this project / inheritance combo.

    This is a generator, not a dict: one ``(family, variant_list)`` tuple is
    yielded per family of ``project.get_families()``, in that order.

    Args:
        project: project whose families are searched.
        inheritance_mode: inheritance mode handed to
            get_variants_with_inheritance_mode.

    Yields:
        (family, variant_list) where variant_list is a fully materialized list.
    """
    # create search specification
    # this could theoretically differ by project, if there are different reference populations
    variant_filter = get_default_variant_filter('moderate_impact')
    variant_filter.ref_freqs.append(('1kg_wgs_phase3', g1k_freq_threshold))
    variant_filter.ref_freqs.append(('1kg_wgs_phase3_popmax', g1k_popmax_freq_threshold))
    variant_filter.ref_freqs.append(('exac_v3', exac_freq_threshold))
    variant_filter.ref_freqs.append(('exac_v3_popmax', exac_popmax_threshold))
    quality_filter = {
        'vcf_filter': 'pass',
        'min_gq': GQ_threshold,
        'min_ab': AB_threshold,
    }

    # run MendelianVariantSearch for each family, collect results
    families = project.get_families()
    for i, family in enumerate(families):
        sys.stdout.write("Processing %s - family %s (%d / %d) .." % (inheritance_mode, family.family_id, i+1, len(families)))
        variant_list = list(get_variants_with_inheritance_mode(
            get_mall(project.project_id),
            family.xfamily(),
            inheritance_mode,
            variant_filter=variant_filter,
            quality_filter=quality_filter,
        ))
        # Complete the progress line *before* yielding: code after a yield only
        # runs when the consumer asks for the next item, so the count used to be
        # printed late (mixed into the consumer's own output) or not at all if
        # the consumer stopped iterating.
        print(" got %d variants" % len(variant_list))
        yield family, variant_list
def family_group_gene(request, project_id, family_group_slug, gene_id):
    """
    Gene page for a family group.

    Looks up the project and the family group (404 when either is missing),
    answers with a plain 'unauthorized' response when the user may not view
    the project, and otherwise renders family_group/family_group_gene.html
    with the 'all_coding' variants that
    family_group_analysis.get_variants_in_gene returns for the gene.
    """
    project = get_object_or_404(Project, project_id=project_id)
    family_group = get_object_or_404(
        FamilyGroup,
        project=project,
        slug=family_group_slug,
    )
    if not project.can_view(request.user):
        return HttpResponse('unauthorized')

    # resolve the gene id string against the reference, then load the gene record
    gene_id = get_gene_id_from_str(gene_id, get_reference())
    gene = get_reference().get_gene(gene_id)

    reference_populations = mall.get_annotator().reference_population_slugs
    varfilter = get_default_variant_filter('all_coding', reference_populations)
    variants_by_family = family_group_analysis.get_variants_in_gene(
        family_group,
        gene_id,
        variant_filter=varfilter,
    )

    # template context, filled in the same order the values were serialized before
    context = {}
    context['project'] = project
    context['family_group'] = family_group
    context['family_group_json'] = json.dumps(family_group.toJSON())
    context['gene_json'] = json.dumps(gene)
    context['gene'] = gene
    context['variants_by_family_json'] = json.dumps(variants_by_family)
    return render(request, 'family_group/family_group_gene.html', context)
def get_knockouts_in_gene(project, gene_id, quality_filter=None):
    """
    Get all the variants in a gene, but filter out quality_filter genotypes

    Args:
        project: project whose individuals and variants are examined.
        gene_id: gene to look in.
        quality_filter: optional genotype quality filter handed to
            CohortGeneVariation. ``None`` (the default) means no quality
            filtering, i.e. an empty dict is passed.

    Returns:
        (knockouts, variation): the result of
        get_individuals_with_inheritance('recessive', ...) and the
        CohortGeneVariation it was computed from.
    """
    # The argument used to be accepted but dropped (an empty filter was always
    # passed on). Honor it now; callers that pass nothing get the old behavior.
    if quality_filter is None:
        quality_filter = {}

    indiv_id_list = [i.indiv_id for i in project.get_individuals()]

    # filter out variants > 0.01 AF in any of the reference populations
    reference_populations = mall.get_annotator().reference_population_slugs
    variant_filter = get_default_variant_filter('moderate_impact', reference_populations)
    variant_list = get_project_datastore(project.project_id).get_project_variants_in_gene(
        project.project_id,
        gene_id,
        variant_filter=variant_filter,
    )
    variant_list = search_utils.filter_gene_variants_by_variant_filter(
        variant_list, gene_id, variant_filter)

    variation = CohortGeneVariation(
        get_reference(),
        gene_id,
        variant_list,
        indiv_id_list,
        quality_filter=quality_filter,
    )
    knockouts = get_individuals_with_inheritance('recessive', variation, indiv_id_list)
    return knockouts, variation
def get_variants_for_inheritance_for_project(project, inheritance_mode):
    """
    Get the variants for this project / inheritance combo.

    Generator: for every family in ``project.get_families()`` a
    ``(family, variant_list)`` tuple is yielded (no dict is built).

    Args:
        project: project whose families are searched.
        inheritance_mode: inheritance mode handed to
            get_variants_with_inheritance_mode.

    Yields:
        (family, variant_list) with variant_list materialized as a list.
    """
    # create search specification
    # this could theoretically differ by project, if there are different reference populations
    variant_filter = get_default_variant_filter("moderate_impact")
    variant_filter.ref_freqs.append(("1kg_wgs_phase3", g1k_freq_threshold))
    variant_filter.ref_freqs.append(("1kg_wgs_phase3_popmax", g1k_popmax_freq_threshold))
    variant_filter.ref_freqs.append(("exac_v3", exac_freq_threshold))
    variant_filter.ref_freqs.append(("exac_v3_popmax", exac_popmax_threshold))
    quality_filter = {"vcf_filter": "pass", "min_gq": GQ_threshold, "min_ab": AB_threshold}

    # run MendelianVariantSearch for each family, collect results
    families = project.get_families()
    for i, family in enumerate(families):
        sys.stdout.write(
            "Processing %s - family %s (%d / %d) .."
            % (inheritance_mode, family.family_id, i + 1, len(families))
        )
        variant_list = list(
            get_variants_with_inheritance_mode(
                get_mall(project.project_id),
                family.xfamily(),
                inheritance_mode,
                variant_filter=variant_filter,
                quality_filter=quality_filter,
            )
        )
        # Report the count before yielding. A statement placed after the yield
        # is deferred until the consumer requests the next family, which left
        # the progress line dangling and dropped the count entirely when the
        # consumer stopped early.
        print(" got %d variants" % len(variant_list))
        yield family, variant_list
def gene_quicklook(request, project_id, gene_id):
    """
    Summary of a gene in a project.

    Renders project/gene_quicklook.html with the rare coding variants found in
    the gene and the knockout individuals reported by get_knockouts_in_gene.
    Users who cannot view the project get a plain "Unauthorized" response.
    """
    project = get_object_or_404(Project, project_id=project_id)
    if not project.can_view(request.user):
        return HttpResponse("Unauthorized")

    gene_id = get_gene_id_from_str(gene_id, get_reference())
    gene = get_reference().get_gene(gene_id)
    sys.stderr.write(project_id + " - staring gene search for: %s %s \n" % (gene_id, gene))

    reference_populations = mall.get_annotator().reference_population_slugs
    variant_filter = get_default_variant_filter('all_coding', reference_populations)

    # the allele-count cutoff scales with the number of individuals that have variant data
    num_indivs = sum(1 for indiv in project.get_individuals() if indiv.has_variant_data())
    aac_threshold = (.2 * num_indivs) + 5

    rare_variants = []
    gene_variants = project_analysis.get_variants_in_gene(
        project, gene_id, variant_filter=variant_filter)
    for variant in gene_variants:
        aac = get_alt_allele_count(variant)
        max_af = max(variant.annotation['freqs'].values())
        is_rare = aac <= aac_threshold and max_af < .01
        if is_rare:
            rare_variants.append(variant)

    add_extra_info_to_variants_project(get_reference(), project, rare_variants)

    knockouts = []
    knockout_ids, variation = get_knockouts_in_gene(project, gene_id)
    for kid in knockout_ids:
        variants = variation.get_relevant_variants_for_indiv_ids([kid])
        add_extra_info_to_variants_project(get_reference(), project, variants)
        entry = {
            'indiv_id': kid,
            'variants': [v.toJSON() for v in variants],
        }
        knockouts.append(entry)

    sys.stderr.write("Retrieved %s variants \n" % len(rare_variants))

    # serialize in the same order as the template context lists the values
    gene_json = json.dumps(gene)
    rare_variants_json = json.dumps([v.toJSON() for v in rare_variants])
    individuals_json = json.dumps([i.get_json_obj() for i in project.get_individuals()])
    knockouts_json = json.dumps(knockouts)

    return render(request, 'project/gene_quicklook.html', {
        'gene': gene,
        'gene_json': gene_json,
        'project': project,
        'rare_variants_json': rare_variants_json,
        'individuals_json': individuals_json,
        'knockouts_json': knockouts_json,
    })
def inheritance_matrix_for_gene(project, gene_id):
    """
    Run get_family_matrix_for_gene for the families in this project

    Args:
        project: project whose active families are used.
        gene_id: gene to build the matrix for.

    Returns:
        Whatever get_family_matrix_for_gene returns for the project's active
        families, using the 'moderate_impact' variant filter and the
        'high_quality' quality filter.
    """
    variant_filter = get_default_variant_filter(
        'moderate_impact', mall.get_annotator().reference_population_slugs)
    quality_filter = get_default_quality_filter(
        'high_quality', mall.get_annotator().reference_population_slugs)

    # Ask for the mall of *this* project. Every other search in this module
    # calls get_mall(project.project_id); the bare get_mall() used here before
    # did not tell the mall which project is being searched.
    matrix = get_family_matrix_for_gene(
        get_mall(project.project_id),
        [f.xfamily() for f in project.get_active_families()],
        gene_id,
        variant_filter,
        quality_filter,
    )
    return matrix
def get_variants_for_inheritance_for_project(project, inheritance_mode):
    """
    Get the variants for this project / inheritance combo.

    This is a generator: a ``(family, variant_list)`` tuple is yielded for each
    family of ``project.get_families()`` whose search succeeds. A family whose
    search raises ValueError is reported on stdout and skipped.

    Args:
        project: project whose families are searched.
        inheritance_mode: "all_variants" to run get_variants on the family, or
            any other value to be passed to get_variants_with_inheritance_mode.

    Yields:
        (family, variant_list) where variant_list is a fully materialized list.
    """
    # create search specification
    # this could theoretically differ by project, if there are different reference populations
    #variant_filter = VariantFilter(so_annotations=SO_SEVERITY_ORDER, ref_freqs=[])
    variant_filter = get_default_variant_filter('moderate_impact')
    variant_filter.ref_freqs.append(('1kg_wgs_phase3', g1k_freq_threshold))
    variant_filter.ref_freqs.append(
        ('1kg_wgs_phase3_popmax', g1k_popmax_freq_threshold))
    variant_filter.ref_freqs.append(('exac_v3', exac_freq_threshold))
    variant_filter.ref_freqs.append(('exac_v3_popmax', exac_popmax_threshold))
    variant_filter.ref_freqs.append(
        ('merck-wgs-3793', merck_wgs_3793_threshold))
    #variant_filter.ref_freqs.append(('merck-pcr-free-wgs-144', merck_wgs_144_threshold))
    quality_filter = {
        # 'vcf_filter': 'pass',
        'min_gq': GQ_threshold,
        'min_ab': AB_threshold,
    }

    # run MendelianVariantSearch for each family, collect results
    families = project.get_families()
    for i, family in enumerate(families):
        print("Processing %s - family %s (%d / %d)" %
              (inheritance_mode, family.family_id, i + 1, len(families)))
        # Only the search itself is guarded. The yield is kept outside the try
        # so the handler covers nothing but the failure it was written for.
        try:
            if inheritance_mode == "all_variants":
                variant_list = list(
                    get_variants(get_datastore(project.project_id),
                                 family.xfamily(),
                                 variant_filter=variant_filter,
                                 quality_filter=quality_filter,
                                 indivs_to_consider=family.indiv_id_list()))
            else:
                variant_list = list(
                    get_variants_with_inheritance_mode(
                        get_mall(project.project_id),
                        family.xfamily(),
                        inheritance_mode,
                        variant_filter=variant_filter,
                        quality_filter=quality_filter,
                    ))
        except ValueError as e:
            print("Error: %s. Skipping family %s" % (str(e), str(family)))
            continue

        yield family, variant_list
def inheritance_matrix_for_gene(project, gene_id):
    """
    Build the family inheritance matrix of a gene for this project.

    Delegates to get_family_matrix_for_gene with the project's mall, the
    xfamily() of every active family, the 'moderate_impact' variant filter and
    the 'high_quality' quality filter, and returns its result unchanged.
    """
    variant_filter = get_default_variant_filter(
        'moderate_impact',
        mall.get_annotator().reference_population_slugs,
    )
    quality_filter = get_default_quality_filter(
        'high_quality',
        mall.get_annotator().reference_population_slugs,
    )

    project_mall = get_mall(project.project_id)
    xfamilies = []
    for family in project.get_active_families():
        xfamilies.append(family.xfamily())

    return get_family_matrix_for_gene(
        project_mall, xfamilies, gene_id, variant_filter, quality_filter)
def get_variants_for_inheritance_for_project(project, inheritance_mode):
    """
    Get the variants for this project / inheritance combo.

    Generator (not a dict): yields ``(family, variant_list)`` for each family
    of ``project.get_families()``. When the search for a family raises
    ValueError, the error is printed and that family is skipped.

    Args:
        project: project whose families are searched.
        inheritance_mode: "all_variants" selects get_variants; every other
            value is forwarded to get_variants_with_inheritance_mode.

    Yields:
        (family, variant_list) with variant_list materialized as a list.
    """
    # create search specification
    # this could theoretically differ by project, if there are different reference populations
    #variant_filter = VariantFilter(so_annotations=SO_SEVERITY_ORDER, ref_freqs=[])
    variant_filter = get_default_variant_filter('moderate_impact')
    variant_filter.ref_freqs.append(('1kg_wgs_phase3', g1k_freq_threshold))
    variant_filter.ref_freqs.append(('1kg_wgs_phase3_popmax', g1k_popmax_freq_threshold))
    variant_filter.ref_freqs.append(('exac_v3', exac_freq_threshold))
    variant_filter.ref_freqs.append(('exac_v3_popmax', exac_popmax_threshold))
    variant_filter.ref_freqs.append(('merck-wgs-3793', merck_wgs_3793_threshold))
    #variant_filter.ref_freqs.append(('merck-pcr-free-wgs-144', merck_wgs_144_threshold))
    quality_filter = {
        # 'vcf_filter': 'pass',
        'min_gq': GQ_threshold,
        'min_ab': AB_threshold,
    }

    # run MendelianVariantSearch for each family, collect results
    families = project.get_families()
    for i, family in enumerate(families):
        print("Processing %s - family %s (%d / %d)" % (inheritance_mode, family.family_id, i+1, len(families)))
        try:
            # keep the guarded region to the search call; the yield lives in
            # the else clause so the handler cannot catch unrelated errors
            if inheritance_mode == "all_variants":
                variant_list = list(get_variants(
                    get_datastore(project.project_id),
                    family.xfamily(),
                    variant_filter=variant_filter,
                    quality_filter=quality_filter,
                    indivs_to_consider=family.indiv_id_list()
                ))
            else:
                variant_list = list(get_variants_with_inheritance_mode(
                    get_mall(project.project_id),
                    family.xfamily(),
                    inheritance_mode,
                    variant_filter=variant_filter,
                    quality_filter=quality_filter,
                ))
        except ValueError as e:
            print("Error: %s. Skipping family %s" % (str(e), str(family)))
        else:
            yield family, variant_list
def get_knockouts_in_gene(project, gene_id, gene_variants):
    """
    Find the individuals of a project that get_individuals_with_inheritance
    reports as 'recessive' for this gene.

    gene_variants is narrowed with the 'moderate_impact' variant filter before
    a CohortGeneVariation is built over all individuals of the project (with
    an empty quality filter).

    Returns a (knockouts, variation) tuple.
    """
    indiv_id_list = []
    for individual in project.get_individuals():
        indiv_id_list.append(individual.indiv_id)

    # filter out variants > 0.01 AF in any of the reference populations
    variant_filter = get_default_variant_filter(
        'moderate_impact',
        mall.get_annotator().reference_population_slugs,
    )
    filtered_variants = search_utils.filter_gene_variants_by_variant_filter(
        gene_variants, gene_id, variant_filter)

    reference = get_reference()
    variation = CohortGeneVariation(
        reference, gene_id, filtered_variants, indiv_id_list, quality_filter={})

    return (
        get_individuals_with_inheritance('recessive', variation, indiv_id_list),
        variation,
    )
def gene_quicklook(request, project_id, gene_id):
    """
    Quick-look page for one gene within one project.

    Collects the gene's rare coding variants and the knockout individuals
    (via get_knockouts_in_gene) and renders them with
    project/gene_quicklook.html. A user without view access to the project
    receives a plain "Unauthorized" response instead.
    """
    project = get_object_or_404(Project, project_id=project_id)
    if not project.can_view(request.user):
        return HttpResponse("Unauthorized")

    gene_id = get_gene_id_from_str(gene_id, get_reference())
    gene = get_reference().get_gene(gene_id)

    sys.stderr.write(project_id + " - staring gene search for: %s %s \n" % (gene_id, gene))

    variant_filter = get_default_variant_filter(
        'all_coding', mall.get_annotator().reference_population_slugs)

    indivs_with_data = [i for i in project.get_individuals() if i.has_variant_data()]
    aac_threshold = (.2 * len(indivs_with_data)) + 5

    def passes_rarity_cutoffs(variant):
        # both the alt allele count and the max frequency must be under their cutoffs
        aac = get_alt_allele_count(variant)
        max_af = max(variant.annotation['freqs'].values())
        return aac <= aac_threshold and max_af < .01

    rare_variants = [
        variant
        for variant in project_analysis.get_variants_in_gene(
            project, gene_id, variant_filter=variant_filter)
        if passes_rarity_cutoffs(variant)
    ]
    add_extra_info_to_variants_project(get_reference(), project, rare_variants)

    knockout_ids, variation = get_knockouts_in_gene(project, gene_id)
    knockouts = []
    for kid in knockout_ids:
        kid_variants = variation.get_relevant_variants_for_indiv_ids([kid])
        add_extra_info_to_variants_project(get_reference(), project, kid_variants)
        kid_variants_json = [v.toJSON() for v in kid_variants]
        knockouts.append({'indiv_id': kid, 'variants': kid_variants_json})

    sys.stderr.write("Retrieved %s variants \n" % len(rare_variants))

    context = {'gene': gene}
    context['gene_json'] = json.dumps(gene)
    context['project'] = project
    context['rare_variants_json'] = json.dumps([v.toJSON() for v in rare_variants])
    context['individuals_json'] = json.dumps(
        [i.get_json_obj() for i in project.get_individuals()])
    context['knockouts_json'] = json.dumps(knockouts)
    return render(request, 'project/gene_quicklook.html', context)
def family_group_gene(request, project_id, family_group_slug, gene_id):
    """
    Show one gene for a family group of a project.

    404s when the project or the family group does not exist and returns a
    plain 'unauthorized' response when the user cannot view the project.
    Otherwise the 'all_coding' variants returned by
    family_group_analysis.get_variants_in_gene are rendered with
    family_group/family_group_gene.html.
    """
    project = get_object_or_404(Project, project_id=project_id)
    family_group = get_object_or_404(FamilyGroup, project=project, slug=family_group_slug)

    user_may_view = project.can_view(request.user)
    if not user_may_view:
        return HttpResponse('unauthorized')

    gene_id = get_gene_id_from_str(gene_id, get_reference())
    gene = get_reference().get_gene(gene_id)

    varfilter = get_default_variant_filter(
        'all_coding',
        mall.get_annotator().reference_population_slugs,
    )
    variants_by_family = family_group_analysis.get_variants_in_gene(
        family_group, gene_id, variant_filter=varfilter)

    # JSON payloads for the page, dumped in the original order
    family_group_json = json.dumps(family_group.toJSON())
    gene_json = json.dumps(gene)
    variants_by_family_json = json.dumps(variants_by_family)

    template_values = dict(
        project=project,
        family_group=family_group,
        family_group_json=family_group_json,
        gene_json=gene_json,
        gene=gene,
        variants_by_family_json=variants_by_family_json,
    )
    return render(request, 'family_group/family_group_gene.html', template_values)
def get_knockouts_in_gene(project, gene_id, quality_filter=None):
    """
    Get all the variants in a gene, but filter out quality_filter genotypes

    Args:
        project: project whose individuals and variants are examined.
        gene_id: gene to look in.
        quality_filter: optional genotype quality filter passed to
            CohortGeneVariation; ``None`` (default) passes an empty filter.

    Returns:
        (knockouts, variation): the result of
        get_individuals_with_inheritance('recessive', ...) and the
        CohortGeneVariation built from the gene's 'high_impact' variants.
    """
    # Previously the argument was accepted and then ignored (an empty dict was
    # hard-coded below). Forward it; the default keeps the old behavior.
    if quality_filter is None:
        quality_filter = {}

    indiv_id_list = [i.indiv_id for i in project.get_individuals()]

    variant_filter = get_default_variant_filter('high_impact')
    variant_list = get_project_datastore().get_variants_in_gene(
        project.project_id,
        gene_id,
        variant_filter=variant_filter,
    )
    variant_list = search_utils.filter_gene_variants_by_variant_filter(variant_list, gene_id, variant_filter)

    variation = CohortGeneVariation(
        get_reference(),
        gene_id,
        variant_list,
        indiv_id_list,
        quality_filter=quality_filter,
    )
    knockouts = get_individuals_with_inheritance('recessive', variation, indiv_id_list)
    return knockouts, variation
def get_knockouts_in_gene(project, gene_id, quality_filter=None):
    """
    Get all the variants in a gene, but filter out quality_filter genotypes

    Args:
        project: project whose individuals and variants are examined.
        gene_id: gene to look in.
        quality_filter: optional genotype quality filter passed to
            CohortGeneVariation; ``None`` (default) passes an empty filter.

    Returns:
        (knockouts, variation): the result of
        get_individuals_with_inheritance('recessive', ...) and the
        CohortGeneVariation built from the gene's 'high_impact' variants.
    """
    # The caller's quality_filter used to be discarded in favor of a hard-coded
    # empty dict. It is now forwarded; omitting it behaves exactly as before.
    if quality_filter is None:
        quality_filter = {}

    indiv_id_list = [i.indiv_id for i in project.get_individuals()]

    variant_filter = get_default_variant_filter('high_impact')
    variant_list = get_project_datastore(
        project.project_id).get_project_variants_in_gene(
            project.project_id,
            gene_id,
            variant_filter=variant_filter,
        )
    variant_list = search_utils.filter_gene_variants_by_variant_filter(
        variant_list, gene_id, variant_filter)

    variation = CohortGeneVariation(
        get_reference(),
        gene_id,
        variant_list,
        indiv_id_list,
        quality_filter=quality_filter,
    )
    knockouts = get_individuals_with_inheritance('recessive', variation,
                                                 indiv_id_list)
    return knockouts, variation
def gene_quicklook(request, project_id, gene_id):
    """
    Summary of a gene in a project

    Searches the gene in the main project, or in the projects named by the
    ``selected_projects`` GET parameter (comma-separated project ids), and
    either renders project/gene_quicklook.html or, when the ``download`` GET
    parameter is set, returns the result as a CSV attachment.

    Args:
        request: the HTTP request. GET parameters read: ``selected_projects``
            and ``download`` ('knockouts' or 'rare_variants').
        project_id: id of the main project.
        gene_id: gene id string, or None to render the empty search page.

    Returns:
        An HttpResponse: the rendered page, a CSV download, a plain
        "Unauthorized" response, or a 400 response when ``download`` has a
        value other than 'knockouts' / 'rare_variants'.
    """
    main_project = get_object_or_404(Project, project_id=project_id)
    if not main_project.can_view(request.user):
        return HttpResponse("Unauthorized")

    if main_project.seqr_project and main_project.seqr_project.has_new_search:
        new_page_url = '/variant_search/project/{}'.format(main_project.seqr_project.guid)
    else:
        new_page_url = None

    # other projects this user can view
    other_projects = get_loaded_projects_for_user(request.user, fields=['project_id', 'project_name'])
    if other_projects:
        other_projects_json = json.dumps([
            {'project_id': p.project_id, 'project_name': p.project_name}
            for p in sorted(other_projects, key=lambda p: p.project_id.lower())
        ])
    else:
        other_projects_json = None

    if gene_id is None:
        return render(request, 'project/gene_quicklook.html', {
            'project': main_project,
            'gene': None,
            'gene_json': None,
            'rare_variants_json': None,
            'individuals_json': None,
            'knockouts_json': None,
            'other_projects_json': other_projects_json,
            'new_page_url': new_page_url,
        })

    projects_to_search_param = request.GET.get('selected_projects')
    if projects_to_search_param:
        project_ids = projects_to_search_param.split(",")
        projects_to_search = [project for project in other_projects if project.project_id in project_ids]
        if len(projects_to_search) < len(project_ids):
            # If not all the specified project ids are in the other projects list then they are not authorized
            return HttpResponse("Unauthorized")
    else:
        project_ids = [main_project.project_id]
        projects_to_search = [main_project]

    # Reject an unknown download type up front. It used to fall through both
    # CSV branches and crash with a NameError (HTTP 500) after the whole search
    # had already been run. The value is not echoed back to the client.
    download_csv = request.GET.get('download', '')
    if download_csv and download_csv not in ('knockouts', 'rare_variants'):
        return HttpResponse("Unsupported download type", status=400)

    gene_id = get_gene_id_from_str(gene_id, get_reference())
    gene = get_reference().get_gene(gene_id)

    # all rare coding variants
    variant_filter = get_default_variant_filter('all_coding', mall.get_annotator().reference_population_slugs)

    indiv_id_to_project_id = {}
    rare_variant_dict = {}
    rare_variants = []
    individ_ids_and_variants = []
    for project in projects_to_search:
        all_project_variants = project_analysis.get_variants_in_gene(project, gene_id, variant_filter=variant_filter)

        # compute knockout individuals
        knockout_ids, variation = get_knockouts_in_gene(project, gene_id, all_project_variants)
        for indiv_id in knockout_ids:
            variants = variation.get_relevant_variants_for_indiv_ids([indiv_id])
            individ_ids_and_variants.append({
                'indiv_id': indiv_id,
                'variants': variants,
            })

        # compute rare variants
        project_variants = []
        for variant in all_project_variants:
            # don't filter on within-cohort AF
            max_af = max(freq for label, freq in variant.annotation['freqs'].items() if label != "AF")

            # skip variants nobody carries; test the genotype itself rather
            # than the truthiness of the individual id
            if not any(genotype.num_alt > 0 for genotype in variant.genotypes.values()):
                continue
            if max_af >= .01:
                continue

            # add project id to genotypes
            for indiv_id in variant.genotypes:
                indiv_id_to_project_id[indiv_id] = project.project_id

            # save this variant (or just the genotypes from this variant if it's been seen already in another project)
            variant_id = "%s-%s-%s-%s" % (variant.chr, variant.pos, variant.ref, variant.alt)
            if variant_id not in rare_variant_dict:
                rare_variant_dict[variant_id] = variant
                project_variants.append(variant)
            else:
                known_genotypes = rare_variant_dict[variant_id].genotypes
                for indiv_id, genotype in variant.genotypes.items():
                    existing_genotype = known_genotypes.get(indiv_id)
                    # only fill in genotypes that are missing (or -1) so far
                    if not existing_genotype or existing_genotype.num_alt == -1:
                        known_genotypes[indiv_id] = genotype

        # variants of the main project are annotated below, together with the knockout variants
        if project != main_project:
            add_extra_info_to_variants_project(get_reference(), project, project_variants)
        rare_variants.extend(project_variants)

    all_variants = list(rare_variants)
    for entry in individ_ids_and_variants:
        all_variants.extend(entry['variants'])
    add_extra_info_to_variants_project(get_reference(), main_project, all_variants, add_family_tags=True)

    if download_csv:
        response = HttpResponse(content_type='text/csv')
        response['Content-Disposition'] = 'attachment; filename="{}_{}.csv"'.format(
            download_csv, gene.get("symbol") or gene.get("transcript_name"))

        if download_csv == 'knockouts':
            individuals_to_include = [entry["indiv_id"] for entry in individ_ids_and_variants]
            csv_variants = [variant for entry in individ_ids_and_variants for variant in entry["variants"]]
        else:  # 'rare_variants' (validated above)
            # carriers in first-seen order; the set makes the duplicate check O(1)
            individuals_to_include = []
            seen_indiv_ids = set()
            for variant in rare_variants:
                for indiv_id, genotype in variant.genotypes.items():
                    if genotype.num_alt > 0 and indiv_id not in seen_indiv_ids:
                        seen_indiv_ids.add(indiv_id)
                        individuals_to_include.append(indiv_id)
            csv_variants = rare_variants

        def get_row(variant, worst_annotation):
            # one CSV row: fixed annotation columns, then one genotype column per included individual
            if 'clinvar_allele_id' in variant.extras:
                measureset_id = variant.extras['clinvar_allele_id']
                clinvar_significance = variant.extras['clinvar_clinsig']
            else:
                measureset_id, clinvar_significance = get_reference().get_clinvar_info(*variant.unique_tuple())

            genotypes = []
            all_genotypes_parts = []
            for indiv_id in individuals_to_include:
                if indiv_id in variant.genotypes and variant.genotypes[indiv_id].num_alt > 0:
                    genotype = variant.genotypes[indiv_id]
                    allele_string = ">".join(genotype.alleles)
                    all_genotypes_parts.append(indiv_id + ":" + allele_string + " ")
                    genotypes.append(allele_string + " (" + str(genotype.gq) + ")")
                else:
                    genotypes.append("")
            all_genotypes_string = "".join(all_genotypes_parts)

            freqs = variant.annotation["freqs"]
            return [
                gene["symbol"],
                variant.chr,
                variant.pos,
                variant.ref,
                variant.alt,
                variant.vcf_id or variant.annotation.get("rsid") or "",
                variant.annotation.get("vep_consequence") or "",
                worst_annotation.get("hgvsc") or "",
                (worst_annotation.get("hgvsp") or "").replace("%3D", "="),
                variant.annotation.get("sift") or "",
                variant.annotation.get("polyphen") or "",
                variant.annotation.get("mutationtaster_pred") or variant.annotation.get("muttaster") or "",
                (";".join(set((worst_annotation.get("fathmm_pred") or "").split('%3B')))) or variant.annotation.get("fathmm") or "",
                measureset_id or "",
                clinvar_significance or "",
                freqs.get("1kg_wgs_phase3") or freqs.get("1kg_wgs_AF") or "",
                freqs.get("1kg_wgs_phase3_popmax") or freqs.get("1kg_wgs_popmax_AF") or "",
                freqs.get("exac_v3") or freqs.get("exac_v3_AF") or "",
                freqs.get("exac_v3_popmax") or freqs.get("exac_v3_popmax_AF") or "",
                freqs.get("gnomad_exomes_AF") or "",
                freqs.get("gnomad_exomes_popmax_AF") or "",
                freqs.get("gnomad_genomes_AF") or "",
                freqs.get("gnomad_genomes_popmax_AF") or "",
                all_genotypes_string,
            ] + genotypes

        rows = []
        for variant in csv_variants:
            worst_annotation_idx = variant.annotation["worst_vep_index_per_gene"][gene_id]
            worst_annotation = variant.annotation["vep_annotation"][worst_annotation_idx]
            rows.append([str(value) for value in get_row(variant, worst_annotation)])

        header = ["gene", "chr", "pos", "ref", "alt", "rsID", "impact", "HGVS.c", "HGVS.p", "sift", "polyphen",
                  "muttaster", "fathmm", "clinvar_id", "clinvar_clinical_sig", "freq_1kg_wgs_phase3",
                  "freq_1kg_wgs_phase3_popmax", "freq_exac_v3", "freq_exac_v3_popmax", "freq_gnomad_exomes",
                  "freq_gnomad_exomes_popmax", "freq_gnomad_genomes", "freq_gnomad_genomes_popmax",
                  "all_genotypes"]
        header += [i + " (from %s)" % indiv_id_to_project_id[i] for i in individuals_to_include]

        writer = csv.writer(response)
        writer.writerow(header)
        for row in rows:
            writer.writerow(row)
        return response

    # HTML page: knockout variants are converted to JSON-serializable form in place
    for individ_id_and_variants in individ_ids_and_variants:
        variants = individ_id_and_variants["variants"]
        individ_id_and_variants["variants"] = [v.toJSON() for v in variants]

    individ_ids = {i['indiv_id'] for i in individ_ids_and_variants}
    for var in rare_variants:
        individ_ids.update(var.genotypes.keys())
    individuals = Individual.objects.filter(
        indiv_id__in=individ_ids, project__project_id__in=project_ids
    ).select_related('project').select_related('family').only(
        'project__project_id', 'family__family_id', *Individual.INDIVIDUAL_JSON_FIELDS_NO_IDS)

    return render(request, 'project/gene_quicklook.html', {
        'gene': gene,
        'gene_json': json.dumps(gene),
        'project': main_project,
        'rare_variants_json': json.dumps([v.toJSON() for v in rare_variants]),
        'individuals_json': json.dumps([i.get_json_obj(skip_has_variant_data=True) for i in individuals]),
        'knockouts_json': json.dumps(individ_ids_and_variants),
        'other_projects_json': other_projects_json,
        'new_page_url': new_page_url,
    })
def gene_quicklook(request, project_id, gene_id):
    """
    Summary of a gene in a project

    Searches the gene in the requested project, or in the projects named by
    the ``selected_projects`` GET parameter (comma-separated project ids), and
    either renders project/gene_quicklook.html or, when the ``download`` GET
    parameter is set, returns the result as a CSV attachment.

    Args:
        request: the HTTP request. GET parameters read: ``selected_projects``
            and ``download`` ('knockouts' or 'rare_variants').
        project_id: id of the project the page belongs to.
        gene_id: gene id string, or None to render the empty search page.

    Returns:
        An HttpResponse: the rendered page, a CSV download, a plain
        "Unauthorized" response, the analysis_unavailable.html page, or a 400
        response when ``download`` has an unsupported value.
    """
    # Keep the requested project under its own name. The loops below used to
    # rebind `project` / `project_id`, so the template and the log line ended
    # up with the last *searched* project instead of the requested one.
    main_project = get_object_or_404(Project, project_id=project_id)
    if not main_project.can_view(request.user):
        return HttpResponse("Unauthorized")

    if main_project.project_status == Project.NEEDS_MORE_PHENOTYPES and not request.user.is_staff:
        return render(request, 'analysis_unavailable.html', {'reason': 'Awaiting phenotype data.'})

    # other projects this user can view
    if request.user.is_staff:
        candidate_projects = list(Project.objects.all())
    else:
        candidate_projects = [
            c.project for c in ProjectCollaborator.objects.filter(user=request.user)
        ]

    # A real list (not filter()): on Python 3 a filter object is always truthy,
    # which would turn the "no other projects" case into "[]" instead of None.
    other_projects = [
        p for p in candidate_projects
        if get_project_datastore(p.project_id).project_collection_is_loaded(p.project_id)
    ]

    if other_projects:
        other_projects_json = json.dumps([{
            'project_id': p.project_id,
            'project_name': p.project_name
        } for p in sorted(other_projects, key=lambda p: p.project_id)])
    else:
        other_projects_json = None

    if gene_id is None:
        return render(request, 'project/gene_quicklook.html', {
            'project': main_project,
            'gene': None,
            'gene_json': None,
            'rare_variants_json': None,
            'individuals_json': None,
            'knockouts_json': None,
            'other_projects_json': other_projects_json,
        })

    projects_to_search_param = request.GET.get('selected_projects')
    if projects_to_search_param:
        projects_to_search = []
        for selected_project_id in projects_to_search_param.split(","):
            selected_project = get_object_or_404(Project, project_id=selected_project_id)
            if not selected_project.can_view(request.user):
                return HttpResponse("Unauthorized")
            projects_to_search.append(selected_project)
    else:
        projects_to_search = [main_project]

    # Reject an unknown download type before searching. It used to fall
    # through both CSV branches and crash with a NameError (HTTP 500). The
    # value is not echoed back to the client.
    download_csv = request.GET.get('download', '')
    if download_csv and download_csv not in ('knockouts', 'rare_variants'):
        return HttpResponse("Unsupported download type", status=400)

    gene_id = get_gene_id_from_str(gene_id, get_reference())
    gene = get_reference().get_gene(gene_id)
    sys.stderr.write(
        project_id + " - staring gene search for: %s in projects: %s\n" %
        (gene_id, ",".join([p.project_id for p in projects_to_search]) + "\n"))

    # all rare coding variants
    variant_filter = get_default_variant_filter(
        'all_coding', mall.get_annotator().reference_population_slugs)

    indiv_id_to_project_id = {}
    rare_variant_dict = {}
    rare_variants = []
    for searched_project in projects_to_search:
        project_variants = []
        for variant in project_analysis.get_variants_in_gene(
                searched_project, gene_id, variant_filter=variant_filter):
            max_af = max(variant.annotation['freqs'].values())

            # skip variants nobody carries; test the genotype itself rather
            # than the truthiness of the individual id
            if not any(genotype.num_alt > 0 for genotype in variant.genotypes.values()):
                continue
            if max_af >= .01:
                continue

            # add project id to genotypes
            for indiv_id in variant.genotypes:
                indiv_id_to_project_id[indiv_id] = searched_project.project_id

            # save this variant (or just the genotypes from this variant if it's been seen already in another project)
            variant_id = "%s-%s-%s-%s" % (variant.chr, variant.pos, variant.ref, variant.alt)
            if variant_id not in rare_variant_dict:
                rare_variant_dict[variant_id] = variant
                project_variants.append(variant)
            else:
                rare_variant_dict[variant_id].genotypes.update(variant.genotypes)

        add_extra_info_to_variants_project(get_reference(), searched_project, project_variants)
        rare_variants.extend(project_variants)

    sys.stderr.write("Retreived %s rare variants\n" % len(rare_variants))

    # compute knockout individuals
    individ_ids_and_variants = []
    for searched_project in projects_to_search:
        knockout_ids, variation = get_knockouts_in_gene(searched_project, gene_id)
        for indiv_id in knockout_ids:
            variants = variation.get_relevant_variants_for_indiv_ids([indiv_id])
            add_extra_info_to_variants_project(get_reference(), searched_project, variants)
            individ_ids_and_variants.append({
                'indiv_id': indiv_id,
                'variants': variants,
            })

    if download_csv:
        response = HttpResponse(content_type='text/csv')
        response['Content-Disposition'] = 'attachment; filename="{}_{}.csv"'.format(
            download_csv, gene["transcript_name"])

        if download_csv == 'knockouts':
            individuals_to_include = [entry["indiv_id"] for entry in individ_ids_and_variants]
            csv_variants = [variant for entry in individ_ids_and_variants for variant in entry["variants"]]
        else:  # 'rare_variants' (validated above)
            # carriers in first-seen order; the set makes the duplicate check O(1)
            individuals_to_include = []
            seen_indiv_ids = set()
            for variant in rare_variants:
                for indiv_id, genotype in variant.genotypes.items():
                    if genotype.num_alt > 0 and indiv_id not in seen_indiv_ids:
                        seen_indiv_ids.add(indiv_id)
                        individuals_to_include.append(indiv_id)
            csv_variants = rare_variants

        def build_row(variant):
            # one CSV row; shared by both download types, which previously
            # carried two verbatim copies of this code
            worst_annotation_idx = variant.annotation["worst_vep_index_per_gene"][gene_id]
            worst_annotation = variant.annotation["vep_annotation"][worst_annotation_idx]

            genotypes = []
            all_genotypes_parts = []
            for indiv_id in individuals_to_include:
                if indiv_id in variant.genotypes and variant.genotypes[indiv_id].num_alt > 0:
                    genotype = variant.genotypes[indiv_id]
                    allele_string = ">".join(genotype.alleles)
                    all_genotypes_parts.append(indiv_id + ":" + allele_string + " ")
                    genotypes.append(allele_string + " (" + str(genotype.gq) + ")")
                else:
                    genotypes.append("")
            all_genotypes_string = "".join(all_genotypes_parts)

            measureset_id, clinvar_significance = settings.CLINVAR_VARIANTS.get(
                variant.unique_tuple(), ("", ""))

            freqs = variant.annotation["freqs"]
            values = [
                gene["symbol"],
                variant.chr,
                variant.pos,
                variant.ref,
                variant.alt,
                variant.vcf_id or "",
                variant.annotation.get("vep_consequence", ""),
                worst_annotation.get("hgvsc", ""),
                worst_annotation.get("hgvsp", "").replace("%3D", "="),
                worst_annotation.get("sift", ""),
                worst_annotation.get("polyphen", ""),
                worst_annotation.get("mutationtaster_pred", ""),
                ";".join(set(worst_annotation.get("fathmm_pred", "").split('%3B'))),
                measureset_id,
                clinvar_significance,
                freqs.get("1kg_wgs_phase3", ""),
                freqs.get("1kg_wgs_phase3_popmax", ""),
                freqs.get("exac_v3", ""),
                freqs.get("exac_v3_popmax", ""),
                all_genotypes_string,
            ] + genotypes
            return [str(value) for value in values]

        rows = [build_row(variant) for variant in csv_variants]

        header = [
            "gene", "chr", "pos", "ref", "alt", "rsID", "impact", "HGVS.c",
            "HGVS.p", "sift", "polyphen", "muttaster", "fathmm", "clinvar_id",
            "clinvar_clinical_sig", "freq_1kg_wgs_phase3",
            "freq_1kg_wgs_phase3_popmax", "freq_exac_v3",
            "freq_exac_v3_popmax", "all_genotypes"
        ]
        header += [i + " (from %s)" % indiv_id_to_project_id[i] for i in individuals_to_include]

        writer = csv.writer(response)
        writer.writerow(header)
        for row in rows:
            writer.writerow(row)
        return response

    # HTML page: knockout variants are converted to JSON-serializable form in place
    for individ_id_and_variants in individ_ids_and_variants:
        variants = individ_id_and_variants["variants"]
        individ_id_and_variants["variants"] = [v.toJSON() for v in variants]

    return render(request, 'project/gene_quicklook.html', {
        'gene': gene,
        'gene_json': json.dumps(gene),
        'project': main_project,
        'rare_variants_json': json.dumps([v.toJSON() for v in rare_variants]),
        'individuals_json': json.dumps([
            i.get_json_obj()
            for searched_project in projects_to_search
            for i in searched_project.get_individuals()
        ]),
        'knockouts_json': json.dumps(individ_ids_and_variants),
        'other_projects_json': other_projects_json,
    })
def search_for_genes(self, gene_ids, project_id_list, output_filename, max_af=0.01):
    """
    Search for a gene across project(s) and write the matching variants to a
    tab-separated file.

    Args:
        gene_ids (list): 'ENSG..' gene id strings.
        project_id_list (list): (optional) project ids to narrow down the search.
            When empty or None, every Project is searched.
        output_filename (string): output file name
        max_af (float): AF filter - a variant is skipped when the largest value
            in its annotated frequencies is >= this threshold.
    """
    header = [
        "project_id", "gene", "chr", "pos", "ref", "alt", "rsID", "filter",
        "impact", "HGVS.c", "HGVS.p", "sift", "polyphen", "muttaster",
        "fathmm", "clinvar_id", "clinvar_clinical_sig", "freq_1kg_wgs_phase3",
        "freq_1kg_wgs_phase3_popmax", "freq_exac_v3", "freq_exac_v3_popmax",
        "all_genotypes",
    ]

    # `with` closes the output file even if one of the lookups below raises.
    with open(output_filename, 'w') as outfile:
        writer = csv.writer(outfile, delimiter='\t')
        writer.writerow(header)

        # all rare coding variants
        variant_filter = get_default_variant_filter(
            'all_coding', mall.get_annotator().reference_population_slugs)

        print("All Filters: ")
        pprint(variant_filter.toJSON())

        if project_id_list:
            projects = [
                Project.objects.get(project_id=project_id)
                for project_id in project_id_list
            ]
        else:
            projects = Project.objects.all()

        print("Max AF threshold: %s" % max_af)
        print("Staring gene search for:\n%s\nin projects:\n%s\n" %
              (", ".join(gene_ids), ", ".join([p.project_id for p in projects])))

        # Individuals are cached per (project, individual id): the cache outlives
        # the project loop, and an individual id alone does not identify a row
        # because the lookup below is always scoped to one project.
        indiv_id_cache = {}
        for project in projects:
            project_id = project.project_id
            if get_project_datastore(project_id).project_collection_is_loaded(project_id):
                print("=====================")
                print("Searching project %s" % project_id)
            else:
                print("Skipping project %s - gene search is not enabled for this project" % project_id)
                continue

            for gene_id in gene_ids:
                gene_id = get_gene_id_from_str(gene_id, get_reference())
                gene = get_reference().get_gene(gene_id)
                print("-- searching %s for gene %s (%s)" % (project_id, gene["symbol"], gene_id))

                for variant in project_analysis.get_variants_in_gene(
                        project, gene_id, variant_filter=variant_filter):
                    if max(variant.annotation['freqs'].values()) >= max_af:
                        continue

                    add_extra_info_to_variants_project(get_reference(), project, [variant])

                    worst_annotation_idx = variant.annotation["worst_vep_index_per_gene"][gene_id]
                    worst_annotation = variant.annotation["vep_annotation"][worst_annotation_idx]

                    all_genotypes_list = []
                    pass_filter = "N/A"
                    for indiv_id, genotype in variant.genotypes.items():
                        cache_key = (project_id, indiv_id)
                        individual = indiv_id_cache.get(cache_key)
                        if individual is None:
                            individual = Individual.objects.get(project=project, indiv_id=indiv_id)
                            indiv_id_cache[cache_key] = individual

                        # filter value is stored in the genotypes even though
                        # it's the same for all individuals
                        pass_filter = genotype.filter
                        if genotype.num_alt > 0:
                            if individual.affected == "A":
                                affected_label = "[Affected]"
                            elif individual.affected == "N":
                                affected_label = "[-]"
                            else:
                                affected_label = "[?]"
                            all_genotypes_list.append(
                                "%s%s[gt:%s GQ:%s AB:%0.3f]" % (
                                    indiv_id,
                                    affected_label,
                                    ">".join(genotype.alleles),
                                    genotype.gq,
                                    genotype.ab if genotype.ab is not None else float('NaN'),
                                ))

                    measureset_id, clinvar_significance = get_clinvar_variants().get(
                        variant.unique_tuple(), ("", ""))

                    freqs = variant.annotation["freqs"]
                    row = map(str, [
                        project_id,
                        gene["symbol"],
                        variant.chr,
                        variant.pos,
                        variant.ref,
                        variant.alt,
                        variant.vcf_id or "",
                        pass_filter,
                        variant.annotation.get("vep_consequence", ""),
                        worst_annotation.get("hgvsc", ""),
                        worst_annotation.get("hgvsp", "").replace("%3D", "="),
                        worst_annotation.get("sift", ""),
                        worst_annotation.get("polyphen", ""),
                        worst_annotation.get("mutationtaster_pred", ""),
                        ";".join(set(worst_annotation.get("fathmm_pred", "").split('%3B'))),
                        measureset_id,
                        clinvar_significance,
                        freqs.get("1kg_wgs_phase3", ""),
                        freqs.get("1kg_wgs_phase3_popmax", ""),
                        freqs.get("exac_v3", ""),
                        freqs.get("exac_v3_popmax", ""),
                        ", ".join(all_genotypes_list),
                    ])
                    writer.writerow(row)

    print("Wrote out %s" % output_filename)
def search_for_gene(self, search_gene_id, project_id_list, max_af=0.01):
    '''
    Search for a gene across project(s) and write the matching variants to
    'results_<search_gene_id>.tsv' in the current directory.

    Args:
        1. search_gene_id: Gene ID to search for
        2. project_id_list: An optional list of project ids to narrow down the
           search to. When empty or None, every Project is searched.
        3. max_af: a variant is skipped when the largest value in its annotated
           frequencies is >= this threshold.
    '''
    gene_id = get_gene_id_from_str(search_gene_id, get_reference())
    gene = get_reference().get_gene(gene_id)

    # Apply the "search everything" fallback before the banner is printed, so
    # an omitted (None) list no longer breaks the ", ".join(...) below.
    if not project_id_list:
        project_id_list = [p.project_id for p in Project.objects.all()]

    print("Staring gene search for: %s %s in projects: %s\n" %
          (search_gene_id, gene['gene_id'], ", ".join(project_id_list)))
    print("Max AF threshold: %s" % max_af)

    # all rare coding variants
    variant_filter = get_default_variant_filter(
        'all_coding', mall.get_annotator().reference_population_slugs)
    print("All Filters: ")
    pprint(variant_filter.toJSON())

    output_filename = 'results_' + search_gene_id + '.tsv'
    header = [
        "project_id", "gene", "chr", "pos", "ref", "alt", "rsID", "filter",
        "impact", "HGVS.c", "HGVS.p", "sift", "polyphen", "muttaster",
        "fathmm", "clinvar_id", "clinvar_clinical_sig", "freq_1kg_wgs_phase3",
        "freq_1kg_wgs_phase3_popmax", "freq_exac_v3", "freq_exac_v3_popmax",
        "all_genotypes",
    ]

    # `with` closes the output file even if a lookup below raises.
    with open(output_filename, 'w') as outfile:
        writer = csv.writer(outfile, delimiter='\t')
        writer.writerow(header)

        # Fetch every project up front: an unknown project id raises IndexError
        # here, before any variants are processed, and each project is only
        # queried once.
        projects = [
            Project.objects.filter(project_id=project_id)[0]
            for project_id in project_id_list
        ]

        for project_id, project in zip(project_id_list, projects):
            if get_project_datastore(project_id).project_collection_is_loaded(project_id):
                print("Running on project %s" % project_id)
            else:
                print("Skipping project %s - gene search is not enabled for this project" % project_id)
                continue

            for variant in project_analysis.get_variants_in_gene(
                    project, gene_id, variant_filter=variant_filter):
                if max(variant.annotation['freqs'].values()) >= max_af:
                    continue

                add_extra_info_to_variants_project(get_reference(), project, [variant])

                worst_annotation_idx = variant.annotation["worst_vep_index_per_gene"][gene_id]
                worst_annotation = variant.annotation["vep_annotation"][worst_annotation_idx]

                all_genotypes_list = []
                pass_filter = "N/A"
                for indiv_id, genotype in variant.genotypes.items():
                    # filter value is stored in the genotypes even though it's
                    # the same for all individuals
                    pass_filter = genotype.filter
                    if genotype.num_alt > 0:
                        all_genotypes_list.append(
                            "%s[gt:%s GQ:%s AB:%0.3f]" % (
                                indiv_id,
                                ">".join(genotype.alleles),
                                genotype.gq,
                                genotype.ab if genotype.ab is not None else float('NaN'),
                            ))

                measureset_id, clinvar_significance = settings.CLINVAR_VARIANTS.get(
                    variant.unique_tuple(), ("", ""))

                freqs = variant.annotation["freqs"]
                row = map(str, [
                    project_id,
                    gene["symbol"],
                    variant.chr,
                    variant.pos,
                    variant.ref,
                    variant.alt,
                    variant.vcf_id or "",
                    pass_filter,
                    variant.annotation.get("vep_consequence", ""),
                    worst_annotation.get("hgvsc", ""),
                    worst_annotation.get("hgvsp", "").replace("%3D", "="),
                    worst_annotation.get("sift", ""),
                    worst_annotation.get("polyphen", ""),
                    worst_annotation.get("mutationtaster_pred", ""),
                    ";".join(set(worst_annotation.get("fathmm_pred", "").split('%3B'))),
                    measureset_id,
                    clinvar_significance,
                    freqs.get("1kg_wgs_phase3", ""),
                    freqs.get("1kg_wgs_phase3_popmax", ""),
                    freqs.get("exac_v3", ""),
                    freqs.get("exac_v3_popmax", ""),
                    ", ".join(all_genotypes_list),
                ])
                writer.writerow(row)

    print("Wrote out %s" % output_filename)
def gene_quicklook(request, project_id, gene_id):
    """
    Summary of a gene in a project.

    Renders 'project/gene_quicklook.html' with the project's rare variants and
    knockout individuals for the gene. When the `download` query parameter is
    'knockouts' or 'rare_variants', a CSV attachment is returned instead; any
    other non-empty `download` value yields a 400 response.
    """
    project = get_object_or_404(Project, project_id=project_id)
    if not project.can_view(request.user):
        return HttpResponse("Unauthorized")

    if gene_id is None:
        return render(request, 'project/gene_quicklook.html', {
            'project': project,
            'gene': None,
            'gene_json': None,
            'rare_variants_json': None,
            'individuals_json': None,
            'knockouts_json': None,
        })

    gene_id = get_gene_id_from_str(gene_id, get_reference())
    gene = get_reference().get_gene(gene_id)
    sys.stderr.write(project_id + " - staring gene search for: %s %s \n" % (gene_id, gene))

    # all rare coding variants
    variant_filter = get_default_variant_filter(
        'all_coding', mall.get_annotator().reference_population_slugs)

    rare_variants = []
    for variant in project_analysis.get_variants_in_gene(
            project, gene_id, variant_filter=variant_filter):
        max_af = max(variant.annotation['freqs'].values())
        if max_af < .01:
            rare_variants.append(variant)

    add_extra_info_to_variants_project(get_reference(), project, rare_variants)

    # compute knockout individuals
    individ_ids_and_variants = []
    knockout_ids, variation = get_knockouts_in_gene(project, gene_id)
    for indiv_id in knockout_ids:
        variants = variation.get_relevant_variants_for_indiv_ids([indiv_id])
        add_extra_info_to_variants_project(get_reference(), project, variants)
        individ_ids_and_variants.append({
            'indiv_id': indiv_id,
            'variants': variants,
        })

    sys.stderr.write("Project-wide gene search retrieved %s rare variants for gene: %s \n" %
                     (len(rare_variants), gene_id))

    download_csv = request.GET.get('download', '')
    if not download_csv:
        for individ_id_and_variants in individ_ids_and_variants:
            variants = individ_id_and_variants["variants"]
            individ_id_and_variants["variants"] = [v.toJSON() for v in variants]

        return render(request, 'project/gene_quicklook.html', {
            'gene': gene,
            'gene_json': json.dumps(gene),
            'project': project,
            'rare_variants_json': json.dumps([v.toJSON() for v in rare_variants]),
            'individuals_json': json.dumps([i.get_json_obj() for i in project.get_individuals()]),
            'knockouts_json': json.dumps(individ_ids_and_variants),
        })

    # Reject unknown export types explicitly. Previously they fell through to
    # the CSV writer with `rows` undefined and surfaced as a server error.
    if download_csv not in ('knockouts', 'rare_variants'):
        return HttpResponse("Unknown download type: %s" % download_csv, status=400)

    def build_csv_row(variant, individuals_to_include):
        """Return the CSV cells for one variant: the fixed annotation columns
        followed by one genotype cell per individual in individuals_to_include."""
        worst_annotation_idx = variant.annotation["worst_vep_index_per_gene"][gene_id]
        worst_annotation = variant.annotation["vep_annotation"][worst_annotation_idx]

        genotypes = []
        all_genotypes_string = ""
        for indiv_id in individuals_to_include:
            if indiv_id not in variant.genotypes:
                # no call recorded for this individual - leave the cell blank
                # instead of failing the whole export
                genotypes.append("")
                continue
            genotype = variant.genotypes[indiv_id]
            allele_string = ">".join(genotype.alleles)
            all_genotypes_string += indiv_id + ":" + allele_string + " "
            if genotype.num_alt > 0:
                genotypes.append(allele_string + " (" + str(genotype.gq) + ")")
            else:
                genotypes.append("")

        measureset_id, clinvar_significance = settings.CLINVAR_VARIANTS.get(
            variant.unique_tuple(), ("", ""))

        freqs = variant.annotation["freqs"]
        return map(str, [
            gene["symbol"],
            variant.chr,
            variant.pos,
            variant.ref,
            variant.alt,
            variant.vcf_id or "",
            variant.annotation.get("vep_consequence", ""),
            worst_annotation.get("hgvsc", ""),
            worst_annotation.get("hgvsp", "").replace("%3D", "="),
            worst_annotation.get("sift", ""),
            worst_annotation.get("polyphen", ""),
            worst_annotation.get("mutationtaster_pred", ""),
            ";".join(set(worst_annotation.get("fathmm_pred", "").split('%3B'))),
            measureset_id,
            clinvar_significance,
            freqs.get("1kg_wgs_phase3", ""),
            freqs.get("1kg_wgs_phase3_popmax", ""),
            freqs.get("exac_v3", ""),
            freqs.get("exac_v3_popmax", ""),
            all_genotypes_string,
        ] + genotypes)

    response = HttpResponse(content_type='text/csv')
    response['Content-Disposition'] = 'attachment; filename="{}_{}.csv"'.format(
        download_csv, gene["transcript_name"])

    rows = []
    if download_csv == 'knockouts':
        individuals_to_include = [
            individ_id_and_variants["indiv_id"]
            for individ_id_and_variants in individ_ids_and_variants
        ]
        for individ_id_and_variants in individ_ids_and_variants:
            for variant in individ_id_and_variants["variants"]:
                rows.append(build_csv_row(variant, individuals_to_include))
    else:  # 'rare_variants'
        # one genotype column per individual carrying at least one alt allele,
        # in order of first appearance
        individuals_to_include = []
        for variant in rare_variants:
            for indiv_id, genotype in variant.genotypes.items():
                if genotype.num_alt > 0 and indiv_id not in individuals_to_include:
                    individuals_to_include.append(indiv_id)
        for variant in rare_variants:
            rows.append(build_csv_row(variant, individuals_to_include))

    header = [
        "gene", "chr", "pos", "ref", "alt", "rsID", "impact", "HGVS.c",
        "HGVS.p", "sift", "polyphen", "muttaster", "fathmm", "clinvar_id",
        "clinvar_clinical_sig", "freq_1kg_wgs_phase3",
        "freq_1kg_wgs_phase3_popmax", "freq_exac_v3", "freq_exac_v3_popmax",
        "all_genotypes",
    ] + individuals_to_include

    writer = csv.writer(response)
    writer.writerow(header)
    for row in rows:
        writer.writerow(row)
    return response
def handle(self, *args, **options):
    """
    Export the filtered variants of selected individuals in a project to
    'individuals_in_<project_id>.tsv.gz' (tab-separated, gzip-compressed).

    Args (positional, via *args):
        project_id: id of the Project to read.
        individuals_file: path of a text file with one individual id per line
            (first tab-separated column); blank lines and lines starting with
            '#' are ignored.

    Exits via sys.exit() when the argument count is wrong or when the file
    contains ids that are not individuals of the project.
    """
    if len(args) != 2:
        sys.exit(
            "ERROR: please specify the project_id and file of individual ids as command line args."
        )

    project_id = args[0]
    individuals_file = args[1]

    # init objects
    project = Project.objects.get(project_id=project_id)
    all_individual_ids_in_project = set(
        [i.indiv_id for i in project.get_individuals()])

    individuals_of_interest = []
    invalid_individual_ids = []
    with open(individuals_file) as f:
        for line in f:
            line = line.strip('\n')
            if not line or line.startswith("#"):
                continue
            individual_id = line.split("\t")[0]
            if individual_id in all_individual_ids_in_project:
                individuals_of_interest.append(individual_id)
            else:
                invalid_individual_ids.append(individual_id)

    print("Processing %s: %d individuals " % (project_id, len(individuals_of_interest)))
    if invalid_individual_ids:
        # "X out of Y" is reported relative to the ids read from the file,
        # which is what the message (prefixed with the file name) describes.
        sys.exit((
            "ERROR: %(individuals_file)s: %(num_invalid)s out of %(total_ids)s ids are invalid. \nThe invalid ids are: "
            "%(invalid_individual_ids)s.\nValid ids are: %(individuals_of_interest)s"
        ) % {
            'individuals_file': individuals_file,
            'num_invalid': len(invalid_individual_ids),
            'total_ids': len(individuals_of_interest) + len(invalid_individual_ids),
            'invalid_individual_ids': invalid_individual_ids,
            'individuals_of_interest': individuals_of_interest,
        })

    # set for O(1) membership checks inside the per-individual loop below
    ids_of_interest = set(individuals_of_interest)

    # filter
    variant_filter = get_default_variant_filter('moderate_impact')
    variant_filter.ref_freqs.append(('1kg_wgs_phase3', g1k_freq_threshold))
    variant_filter.ref_freqs.append(('1kg_wgs_phase3_popmax', g1k_popmax_freq_threshold))
    variant_filter.ref_freqs.append(('exac_v3', exac_freq_threshold))
    variant_filter.ref_freqs.append(('exac_v3_popmax', exac_popmax_threshold))
    variant_filter.ref_freqs.append(('merck-wgs-3793', merck_wgs_3793_threshold))
    quality_filter = {
        'vcf_filter': 'pass',
        'min_gq': GQ_threshold,
        'min_ab': AB_threshold,
    }

    header_fields = [
        'project_id',
        'family_id',
        'individual_id',
        'gene',
        'chrom',
        'pos',
        'ref',
        'alt',
        'rsid',
        'annotation',
        '1kg_af',
        '1kg_popmax_af',
        'exac_af',
        'exac_popmax_af',
        'merck_wgs_3793_af',
        'genotype_str',
        'genotype_num_alt',
        'genotype_allele_balance',
        'genotype_AD',
        'genotype_DP',
        'genotype_GQ',
        'genotype_PL',
        'genotype_filter',
    ]

    # create individuals_variants.tsv
    individual_variants_f = gzip.open('individuals_in_%s.tsv.gz' % project_id, 'w')
    try:
        writer = csv.writer(individual_variants_f, dialect='excel', delimiter='\t')
        writer.writerow(header_fields)

        # collect the resources that we'll need here
        # NOTE(review): the annotator is not used below; the call is kept in
        # case it has initialization side effects - confirm and remove if not.
        get_annotator()
        custom_population_store = get_custom_population_store()

        individual_counter = 0
        for family in project.get_families():
            for individual in family.get_individuals():
                if individual.indiv_id not in ids_of_interest:
                    continue
                individual_counter += 1
                print("%s: %s, individual %s" %
                      (individual_counter, family.family_id, individual.indiv_id))

                for variant in get_variants(
                        get_datastore(project.project_id),
                        family.xfamily(),
                        variant_filter=variant_filter,
                        quality_filter=quality_filter,
                        indivs_to_consider=[individual.indiv_id]):
                    genotype = variant.get_genotype(individual.indiv_id)
                    if (len(genotype.alleles) == 0
                            or genotype.extras["dp"] < DP_threshold
                            or genotype.num_alt == 0):
                        continue

                    custom_populations = custom_population_store.get_frequencies(
                        variant.xpos, variant.ref, variant.alt)

                    genotype_str = "/".join(genotype.alleles) if genotype.alleles else "./."

                    g1k_freq = variant.annotation['freqs']['1kg_wgs_phase3']
                    g1k_popmax_freq = variant.annotation['freqs']['1kg_wgs_phase3_popmax']
                    exac_freq = variant.annotation['freqs']['exac_v3']
                    exac_popmax_freq = variant.annotation['freqs']['exac_v3_popmax']
                    merck_wgs_3793_freq = custom_populations.get('merck-wgs-3793', 0.0)

                    # Sanity checks: every emitted row must already satisfy the
                    # frequency and quality filters requested above.
                    assert g1k_freq <= g1k_freq_threshold, "g1k freq %s > %s" % (
                        g1k_freq, g1k_freq_threshold)
                    assert g1k_popmax_freq <= g1k_popmax_freq_threshold, "g1k popmax freq %s > %s" % (
                        g1k_popmax_freq, g1k_popmax_freq_threshold)
                    assert exac_freq <= exac_freq_threshold, "Exac freq %s > %s" % (
                        exac_freq, exac_freq_threshold)
                    assert exac_popmax_freq <= exac_popmax_threshold, "Exac popmax freq %s > %s" % (
                        exac_popmax_freq, exac_popmax_threshold)
                    assert merck_wgs_3793_freq <= merck_wgs_3793_threshold

                    assert genotype.gq >= GQ_threshold, "%s %s - GQ is %s " % (
                        variant.chr, variant.pos, genotype.gq)
                    assert genotype.extras["dp"] >= DP_threshold, "%s %s - DP is %s " % (
                        variant.chr, variant.pos, genotype.extras["dp"])
                    if genotype.num_alt == 1:
                        assert genotype.ab >= AB_threshold / 100., "%s %s - AB is %s " % (
                            variant.chr, variant.pos, genotype.ab)
                    assert genotype.filter == "pass", "%s %s - filter is %s " % (
                        variant.chr, variant.pos, genotype.filter)

                    writer.writerow(map(str, [
                        project_id,
                        family.family_id,
                        individual.indiv_id,
                        get_gene_symbol(variant),
                        variant.chr,
                        variant.pos,
                        variant.ref,
                        variant.alt,
                        variant.vcf_id,
                        variant.annotation['vep_group'],
                        g1k_freq,
                        g1k_popmax_freq,
                        exac_freq,
                        exac_popmax_freq,
                        merck_wgs_3793_freq,
                        genotype_str,
                        genotype.num_alt,
                        genotype.ab,
                        genotype.extras["ad"],
                        genotype.extras["dp"],
                        genotype.gq,
                        genotype.extras["pl"],
                        genotype.filter,
                    ]))

                # push this individual's rows to disk before starting the next one
                individual_variants_f.flush()
    finally:
        # always release the gzip handle, even if a lookup or assertion fails
        individual_variants_f.close()
def search_for_genes(self,
                     gene_or_variant_ids,
                     project_id_list,
                     output_filename,
                     max_af=0.01,
                     knockouts=False,
                     in_clinvar_only=False,
                     include_non_coding=False):
    """
    Search for genes or single variants across project(s) and write the
    matches to a tab-separated file.

    Args:
        gene_or_variant_ids (list): gene ids / names, or variant ids written as
            'chrom-pos' or 'chrom-pos-ref-alt'.
        project_id_list (list): project ids to search
        output_filename (string): output file name
        max_af (float): AF filter - applied to gene searches only; a variant is
            skipped when the largest value in its annotated frequencies is >=
            this threshold.
        knockouts (bool): for gene searches, report the variants of knockout
            individuals instead of all variants passing the default filter.
        in_clinvar_only (bool): only keep variants whose clinvar significance
            contains "path".
        include_non_coding (bool): clear the filter's so_annotations list
            (ignored when knockouts is set).
    """
    projects = [
        Project.objects.get(project_id=project_id)
        for project_id in project_id_list
    ]

    header = [
        "project_id", "gene", "chr", "pos", "ref", "alt", "rsID", "filter",
        "impact", "HGVS.c", "HGVS.p", "sift", "polyphen", "muttaster",
        "fathmm", "clinvar_id", "clinvar_clinical_sig", "freq_1kg_wgs_phase3",
        "freq_1kg_wgs_phase3_popmax", "freq_exac_v3", "freq_exac_v3_popmax",
        "gnomad-exomes", "gnomad-genomes", "families", "all_genotypes",
    ]

    # compiled once instead of once per (project, id) pair
    chrom_pos_regex = re.compile("([0-9XY]{1,2})-([0-9]{1,9})")
    chrom_pos_ref_alt_regex = re.compile(
        "([0-9XY]{1,2})-([0-9]{1,9})-([ACTG]+)-([ACTG]+)")

    # `with` closes the output file even if a lookup below raises.
    with open(output_filename, 'w') as outfile:
        writer = csv.writer(outfile, delimiter='\t')
        writer.writerow(header)

        # all rare coding variants
        if not knockouts:
            variant_filter = get_default_variant_filter(
                'all_coding', mall.get_annotator().reference_population_slugs)
            if include_non_coding:
                variant_filter.so_annotations = []
            print("All Filters: ")
            pprint(variant_filter.toJSON())

        print("Starting search for:\n%s\nin projects:\n%s\n" %
              (", ".join(gene_or_variant_ids),
               ", ".join([p.project_id for p in projects])))

        for project in projects:
            project_id = project.project_id
            if get_project_datastore(project).project_collection_is_loaded(project):
                print("=====================")
                print("Searching project %s" % project_id)
            else:
                print("Skipping project %s - gene search is not enabled for this project" % project_id)
                continue

            # indiv_id -> Individual, or the marker string 'deleted'
            indiv_cache = {}
            for gene_or_variant_id in gene_or_variant_ids:
                chrom_pos_match = chrom_pos_regex.match(gene_or_variant_id)
                chrom_pos_ref_alt_match = chrom_pos_ref_alt_regex.match(gene_or_variant_id)
                is_variant_id = bool(chrom_pos_match or chrom_pos_ref_alt_match)

                if is_variant_id:
                    chrom = chrom_pos_match.group(1)
                    pos = int(chrom_pos_match.group(2))
                    xpos = genomeloc.get_xpos(chrom, pos)
                    ref = alt = None
                    if chrom_pos_ref_alt_match:
                        ref = chrom_pos_ref_alt_match.group(3)
                        alt = chrom_pos_ref_alt_match.group(4)

                    variant = get_project_datastore(project).get_single_variant(
                        project.project_id, None, xpos, ref, alt)
                    if variant is None:
                        continue
                    variants = [variant]
                    print("-- searching %s for variant %s-%s-%s: found %s" %
                          (project_id, xpos, ref, alt, variant))

                    worst_annotation_idx = variant.annotation['worst_vep_annotation_index']
                    print(variant.annotation["vep_annotation"][worst_annotation_idx])
                    gene_id = variant.annotation["vep_annotation"][worst_annotation_idx]['gene_id']
                    gene = get_reference().get_gene(gene_id)
                else:
                    gene_id = get_gene_id_from_str(gene_or_variant_id, get_reference())
                    gene = get_reference().get_gene(gene_id)
                    print("-- searching %s for gene %s (%s)" %
                          (project_id, gene["symbol"], gene_id))

                    if knockouts:
                        knockout_ids, variation = project_analysis.get_knockouts_in_gene(
                            project, gene_id)
                        variants = variation.get_relevant_variants_for_indiv_ids(knockout_ids)
                    else:
                        variants = project_analysis.get_variants_in_gene(
                            project, gene_id, variant_filter=variant_filter)

                # The "gene" column holds the symbol, not the whole gene record.
                gene_symbol = gene["symbol"] if gene else ""

                for variant in variants:
                    # the AF cut-off only applies to gene searches; an explicitly
                    # requested variant is always reported
                    if not is_variant_id and max(variant.annotation['freqs'].values()) >= max_af:
                        continue

                    add_extra_info_to_variants_project(get_reference(), project, [variant])

                    worst_annotation_idx = variant.annotation["worst_vep_index_per_gene"].get(gene_id)
                    if worst_annotation_idx is not None:
                        worst_annotation = variant.annotation["vep_annotation"][worst_annotation_idx]
                    else:
                        worst_annotation = None

                    all_genotypes_list = []
                    pass_filter = "N/A"
                    family_ids = set()
                    for indiv_id, genotype in variant.genotypes.items():
                        if indiv_id in indiv_cache:
                            individual = indiv_cache[indiv_id]
                            if individual == 'deleted':
                                continue
                        else:
                            try:
                                individual = Individual.objects.get(
                                    project=project, indiv_id=indiv_id)
                                indiv_cache[indiv_id] = individual
                            except ObjectDoesNotExist:
                                # this can happen when an individual is deleted from
                                # the project - from postgres, but not from mongo
                                indiv_cache[indiv_id] = 'deleted'
                                continue
                            except MultipleObjectsReturned:
                                # when several families have an individual with the same id
                                individuals = Individual.objects.filter(
                                    project=project, indiv_id=indiv_id)
                                individual = individuals[0]
                                indiv_cache[indiv_id] = individual

                        # filter value is stored in the genotypes even though
                        # it's the same for all individuals
                        pass_filter = genotype.filter
                        if genotype.num_alt > 0:
                            family_ids.add(individual.family.family_id)
                            if individual.affected == "A":
                                affected_label = "[Affected]"
                            elif individual.affected == "N":
                                affected_label = "[-]"
                            else:
                                affected_label = "[?]"
                            all_genotypes_list.append(
                                "%s/%s%s[gt:%s GQ:%s AB:%0.3f]" % (
                                    individual.family.family_id,
                                    indiv_id,
                                    affected_label,
                                    ">".join(genotype.alleles),
                                    genotype.gq,
                                    genotype.ab if genotype.ab is not None else float('NaN'),
                                ))

                    # skip variants that no (existing) individual carries
                    if len(all_genotypes_list) == 0:
                        continue

                    measureset_id, clinvar_significance = get_reference().get_clinvar_info(
                        *variant.unique_tuple())
                    if in_clinvar_only and (
                            not clinvar_significance
                            or "path" not in clinvar_significance.lower()):
                        continue

                    # A missing/empty worst annotation yields empty cells for
                    # every per-transcript column.
                    annotation = worst_annotation or {}
                    freqs = variant.annotation["freqs"]
                    row = map(str, [
                        project_id,
                        gene_symbol,
                        variant.chr,
                        variant.pos,
                        variant.ref,
                        variant.alt,
                        variant.vcf_id or "",
                        pass_filter,
                        variant.annotation.get("vep_consequence", ""),
                        annotation.get("hgvsc", ""),
                        (annotation.get("hgvsp", "") or "").replace("%3D", "="),
                        annotation.get("sift", ""),
                        annotation.get("polyphen", ""),
                        annotation.get("mutationtaster_pred", ""),
                        ";".join(set(annotation.get("fathmm_pred", "").split('%3B'))),
                        measureset_id,
                        clinvar_significance,
                        freqs.get("1kg_wgs_phase3", ""),
                        freqs.get("1kg_wgs_phase3_popmax", ""),
                        freqs.get("exac_v3", ""),
                        freqs.get("exac_v3_popmax", ""),
                        freqs.get("gnomad-exomes2", ""),
                        freqs.get("gnomad-genomes2", ""),
                        ", ".join(sorted(list(family_ids))),
                        ", ".join(all_genotypes_list),
                    ])
                    writer.writerow(row)

    print("Wrote out %s" % output_filename)
def gene_quicklook(request, project_id, gene_id):
    """
    Summary of a gene in a project, optionally searched across several projects.

    The `selected_projects` query parameter (comma-separated project ids)
    selects the projects to search; every id must be among the loaded projects
    the user can view, otherwise "Unauthorized" is returned. Without it only
    the main project is searched.

    Renders 'project/gene_quicklook.html', or - when the `download` query
    parameter is 'knockouts' or 'rare_variants' - returns a CSV attachment.
    Any other non-empty `download` value yields a 400 response.
    """
    main_project = get_object_or_404(Project, project_id=project_id)
    if not main_project.can_view(request.user):
        return HttpResponse("Unauthorized")

    # other projects this user can view
    other_projects = get_loaded_projects_for_user(
        request.user, fields=['project_id', 'project_name'])

    if other_projects:
        other_projects_json = json.dumps([{
            'project_id': p.project_id,
            'project_name': p.project_name
        } for p in sorted(other_projects, key=lambda p: p.project_id.lower())])
    else:
        other_projects_json = None

    if gene_id is None:
        return render(request, 'project/gene_quicklook.html', {
            'project': main_project,
            'gene': None,
            'gene_json': None,
            'rare_variants_json': None,
            'individuals_json': None,
            'knockouts_json': None,
            'other_projects_json': other_projects_json,
        })

    projects_to_search_param = request.GET.get('selected_projects')
    if projects_to_search_param:
        project_ids = projects_to_search_param.split(",")
        projects_to_search = [
            project for project in other_projects
            if project.project_id in project_ids
        ]
        if len(projects_to_search) < len(project_ids):
            # If not all the specified project ids are in the other projects
            # list then they are not authorized
            return HttpResponse("Unauthorized")
    else:
        project_ids = [main_project.project_id]
        projects_to_search = [main_project]

    # Reject unknown export types before doing any work. Previously they fell
    # through to the CSV writer with `rows` undefined and surfaced as a server
    # error.
    download_csv = request.GET.get('download', '')
    if download_csv and download_csv not in ('knockouts', 'rare_variants'):
        return HttpResponse("Unknown download type: %s" % download_csv, status=400)

    gene_id = get_gene_id_from_str(gene_id, get_reference())
    gene = get_reference().get_gene(gene_id)
    sys.stderr.write(
        project_id + " - staring gene search for: %s in projects: %s\n" %
        (gene_id, ",".join([p.project_id for p in projects_to_search]) + "\n"))

    # all rare coding variants
    variant_filter = get_default_variant_filter(
        'all_coding', mall.get_annotator().reference_population_slugs)

    indiv_id_to_project_id = {}
    rare_variant_dict = {}
    rare_variants = []
    individ_ids_and_variants = []
    for project in projects_to_search:
        all_project_variants = project_analysis.get_variants_in_gene(
            project, gene_id, variant_filter=variant_filter)

        # compute knockout individuals
        knockout_ids, variation = get_knockouts_in_gene(
            project, gene_id, all_project_variants)
        for indiv_id in knockout_ids:
            variants = variation.get_relevant_variants_for_indiv_ids([indiv_id])
            individ_ids_and_variants.append({
                'indiv_id': indiv_id,
                'variants': variants,
            })
            # Record the project here as well: the knockouts CSV header looks up
            # every knockout individual, including ones that never appear in a
            # variant that passes the rare-variant filter below.
            indiv_id_to_project_id[indiv_id] = project.project_id

        # compute rare variants
        project_variants = []
        for variant in all_project_variants:
            max_af = max([
                freq for label, freq in variant.annotation['freqs'].items()
                if label != "AF"
            ])  # don't filter on within-cohort AF

            # skip variants that nobody in this project carries
            if not any(indiv_id
                       for indiv_id, genotype in variant.genotypes.items()
                       if genotype.num_alt > 0):
                continue
            if max_af >= .01:
                continue

            # add project id to genotypes
            for indiv_id in variant.genotypes:
                indiv_id_to_project_id[indiv_id] = project.project_id

            # save this variant (or just the genotypes from this variant if
            # it's been seen already in another project)
            variant_id = "%s-%s-%s-%s" % (variant.chr, variant.pos, variant.ref, variant.alt)
            if variant_id not in rare_variant_dict:
                rare_variant_dict[variant_id] = variant
                project_variants.append(variant)
            else:
                rare_variant_dict[variant_id].genotypes.update(variant.genotypes)

        rare_variants.extend(project_variants)

    # rare variants first, then each knockout individual's variants
    all_variants = list(rare_variants)
    for individ_id_and_variants in individ_ids_and_variants:
        all_variants.extend(individ_id_and_variants['variants'])
    # NOTE(review): `project` is the last project searched, also when several
    # projects were selected - confirm that is intended.
    add_extra_info_to_variants_project(get_reference(), project, all_variants)

    if not download_csv:
        for individ_id_and_variants in individ_ids_and_variants:
            variants = individ_id_and_variants["variants"]
            individ_id_and_variants["variants"] = [v.toJSON() for v in variants]

        individ_ids = {i['indiv_id'] for i in individ_ids_and_variants}
        for var in rare_variants:
            individ_ids.update(var.genotypes.keys())
        individuals = Individual.objects.filter(
            indiv_id__in=individ_ids,
            project__project_id__in=project_ids,
        ).select_related('project').select_related('family').only(
            'project__project_id', 'family__family_id',
            *Individual.INDIVIDUAL_JSON_FIELDS_NO_IDS)

        return render(request, 'project/gene_quicklook.html', {
            'gene': gene,
            'gene_json': json.dumps(gene),
            'project': main_project,
            'rare_variants_json': json.dumps([v.toJSON() for v in rare_variants]),
            'individuals_json': json.dumps([
                i.get_json_obj(skip_has_variant_data=True) for i in individuals
            ]),
            'knockouts_json': json.dumps(individ_ids_and_variants),
            'other_projects_json': other_projects_json,
        })

    def build_csv_row(variant, individuals_to_include):
        """Return the CSV cells for one variant: the fixed annotation columns
        followed by one genotype cell per individual in individuals_to_include
        (blank unless that individual carries an alt allele)."""
        worst_annotation_idx = variant.annotation["worst_vep_index_per_gene"][gene_id]
        worst_annotation = variant.annotation["vep_annotation"][worst_annotation_idx]

        genotypes = []
        all_genotypes_string = ""
        for indiv_id in individuals_to_include:
            if indiv_id in variant.genotypes and variant.genotypes[indiv_id].num_alt > 0:
                genotype = variant.genotypes[indiv_id]
                allele_string = ">".join(genotype.alleles)
                all_genotypes_string += indiv_id + ":" + allele_string + " "
                genotypes.append(allele_string + " (" + str(genotype.gq) + ")")
            else:
                genotypes.append("")

        measureset_id, clinvar_significance = get_reference().get_clinvar_info(
            *variant.unique_tuple())

        freqs = variant.annotation["freqs"]
        return map(str, [
            gene["symbol"],
            variant.chr,
            variant.pos,
            variant.ref,
            variant.alt,
            variant.vcf_id or "",
            variant.annotation.get("vep_consequence", ""),
            worst_annotation.get("hgvsc", ""),
            worst_annotation.get("hgvsp", "").replace("%3D", "="),
            worst_annotation.get("sift", ""),
            worst_annotation.get("polyphen", ""),
            worst_annotation.get("mutationtaster_pred", ""),
            ";".join(set(worst_annotation.get("fathmm_pred", "").split('%3B'))),
            measureset_id,
            clinvar_significance,
            freqs.get("1kg_wgs_phase3", ""),
            freqs.get("1kg_wgs_phase3_popmax", ""),
            freqs.get("exac_v3", ""),
            freqs.get("exac_v3_popmax", ""),
            all_genotypes_string,
        ] + genotypes)

    response = HttpResponse(content_type='text/csv')
    response['Content-Disposition'] = 'attachment; filename="{}_{}.csv"'.format(
        download_csv, gene.get("symbol") or gene.get("transcript_name"))

    rows = []
    if download_csv == 'knockouts':
        individuals_to_include = [
            individ_id_and_variants["indiv_id"]
            for individ_id_and_variants in individ_ids_and_variants
        ]
        for individ_id_and_variants in individ_ids_and_variants:
            for variant in individ_id_and_variants["variants"]:
                rows.append(build_csv_row(variant, individuals_to_include))
    else:  # 'rare_variants'
        # one genotype column per individual carrying at least one alt allele,
        # in order of first appearance
        individuals_to_include = []
        for variant in rare_variants:
            for indiv_id, genotype in variant.genotypes.items():
                if genotype.num_alt > 0 and indiv_id not in individuals_to_include:
                    individuals_to_include.append(indiv_id)
        for variant in rare_variants:
            rows.append(build_csv_row(variant, individuals_to_include))

    header = [
        "gene", "chr", "pos", "ref", "alt", "rsID", "impact", "HGVS.c",
        "HGVS.p", "sift", "polyphen", "muttaster", "fathmm", "clinvar_id",
        "clinvar_clinical_sig", "freq_1kg_wgs_phase3",
        "freq_1kg_wgs_phase3_popmax", "freq_exac_v3", "freq_exac_v3_popmax",
        "all_genotypes",
    ] + [
        i + " (from %s)" % indiv_id_to_project_id[i]
        for i in individuals_to_include
    ]

    writer = csv.writer(response)
    writer.writerow(header)
    for row in rows:
        writer.writerow(row)
    return response
def search_for_gene(self, search_gene_id, project_id_list, max_af=0.01):
    """Search for rare coding variants in one gene across project(s).

    Results are written as a tab-separated file named
    ``results_<search_gene_id>.tsv`` in the current working directory,
    one row per variant per project.

    Args:
        search_gene_id: Gene identifier string; resolved to a gene id with
            ``get_gene_id_from_str``.
        project_id_list: Project ids to restrict the search to. When empty
            or ``None``, every ``Project`` in the database is searched.
        max_af: Variants whose highest annotated reference frequency is
            greater than or equal to this value are skipped.

    Raises:
        IndexError: if an id in ``project_id_list`` matches no ``Project``.
    """
    gene_id = get_gene_id_from_str(search_gene_id, get_reference())
    gene = get_reference().get_gene(gene_id)

    # `project_id_list or []` keeps this banner from crashing when the
    # optional project list is None (the fallback to "all projects" happens
    # further down).
    print("Staring gene search for: %s %s in projects: %s\n" %
          (search_gene_id, gene['gene_id'], ", ".join(project_id_list or [])))
    print("Max AF threshold: %s" % max_af)

    # all rare coding variants
    variant_filter = get_default_variant_filter(
        'all_coding', mall.get_annotator().reference_population_slugs)
    print("All Filters: ")
    pprint(variant_filter.toJSON())

    output_filename = 'results_' + search_gene_id + '.tsv'
    header = [
        "project_id", "gene", "chr", "pos", "ref", "alt", "rsID", "filter",
        "impact", "HGVS.c", "HGVS.p", "sift", "polyphen", "muttaster",
        "fathmm", "clinvar_id", "clinvar_clinical_sig",
        "freq_1kg_wgs_phase3", "freq_1kg_wgs_phase3_popmax", "freq_exac_v3",
        "freq_exac_v3_popmax", "all_genotypes",
    ]

    # The context manager guarantees the file is closed even if a project
    # lookup or a variant query raises part-way through.
    with open(output_filename, 'w') as outfile:
        writer = csv.writer(outfile, delimiter='\t')
        writer.writerow(header)

        if project_id_list:
            # Fail fast (IndexError) on an unknown project id before doing
            # any searching.
            for project_id in project_id_list:
                Project.objects.filter(project_id=project_id)[0]  # TODO validate
        else:
            project_id_list = [p.project_id for p in Project.objects.all()]

        for project_id in project_id_list:
            project = Project.objects.filter(project_id=project_id)[0]
            if get_project_datastore(project_id).project_collection_is_loaded(project_id):
                print("Running on project %s" % project_id)
            else:
                print("Skipping project %s - gene search is not enabled for this project" % project_id)
                continue

            for variant in project_analysis.get_variants_in_gene(
                    project, gene_id, variant_filter=variant_filter):
                if max(variant.annotation['freqs'].values()) >= max_af:
                    continue

                add_extra_info_to_variants_project(get_reference(), project, [variant])

                worst_annotation_idx = variant.annotation["worst_vep_index_per_gene"][gene_id]
                worst_annotation = variant.annotation["vep_annotation"][worst_annotation_idx]

                all_genotypes_list = []
                pass_filter = "N/A"
                for indiv_id, genotype in variant.genotypes.items():
                    # filter value is stored in the genotypes even though
                    # it's the same for all individuals
                    pass_filter = genotype.filter
                    if genotype.num_alt > 0:
                        allele_balance = genotype.ab if genotype.ab is not None else float('NaN')
                        all_genotypes_list.append(
                            "%s[gt:%s GQ:%s AB:%0.3f]" %
                            (indiv_id, ">".join(genotype.alleles), genotype.gq, allele_balance))

                measureset_id, clinvar_significance = settings.CLINVAR_VARIANTS.get(
                    variant.unique_tuple(), ("", ""))

                freqs = variant.annotation["freqs"]
                # sorted() makes the fathmm column deterministic; joining a
                # bare set yields an arbitrary order from run to run.
                fathmm = ";".join(sorted(set(
                    worst_annotation.get("fathmm_pred", "").split('%3B'))))

                row = [str(field) for field in [
                    project_id,
                    gene["symbol"],
                    variant.chr,
                    variant.pos,
                    variant.ref,
                    variant.alt,
                    variant.vcf_id or "",
                    pass_filter,
                    variant.annotation.get("vep_consequence", ""),
                    worst_annotation.get("hgvsc", ""),
                    worst_annotation.get("hgvsp", "").replace("%3D", "="),
                    worst_annotation.get("sift", ""),
                    worst_annotation.get("polyphen", ""),
                    worst_annotation.get("mutationtaster_pred", ""),
                    fathmm,
                    measureset_id,
                    clinvar_significance,
                    freqs.get("1kg_wgs_phase3", ""),
                    freqs.get("1kg_wgs_phase3_popmax", ""),
                    freqs.get("exac_v3", ""),
                    freqs.get("exac_v3_popmax", ""),
                    ", ".join(all_genotypes_list),
                ]]
                writer.writerow(row)

    print("Wrote out %s" % output_filename)
def _gene_quicklook_csv_row(gene, gene_id, variant, individuals_to_include):
    """Build one gene_quicklook CSV row (a list of strings) for ``variant``.

    The fixed annotation columns are followed by one genotype column per id
    in ``individuals_to_include``; that column is empty when the individual
    carries no alt allele.
    """
    worst_annotation_idx = variant.annotation["worst_vep_index_per_gene"][gene_id]
    worst_annotation = variant.annotation["vep_annotation"][worst_annotation_idx]

    genotypes = []
    all_genotype_parts = []
    for indiv_id in individuals_to_include:
        genotype = variant.genotypes[indiv_id]
        allele_string = ">".join(genotype.alleles)
        all_genotype_parts.append(indiv_id + ":" + allele_string + " ")
        if genotype.num_alt > 0:
            genotypes.append(allele_string + " (" + str(genotype.gq) + ")")
        else:
            genotypes.append("")
    all_genotypes_string = "".join(all_genotype_parts)

    measureset_id, clinvar_significance = settings.CLINVAR_VARIANTS.get(
        variant.unique_tuple(), ("", ""))

    freqs = variant.annotation["freqs"]
    # sorted() keeps the fathmm column stable between requests; joining a
    # bare set yields an arbitrary order.
    fathmm = ";".join(sorted(set(
        worst_annotation.get("fathmm_pred", "").split('%3B'))))

    return [str(field) for field in [
        gene["symbol"],
        variant.chr,
        variant.pos,
        variant.ref,
        variant.alt,
        variant.vcf_id or "",
        variant.annotation.get("vep_consequence", ""),
        worst_annotation.get("hgvsc", ""),
        worst_annotation.get("hgvsp", "").replace("%3D", "="),
        worst_annotation.get("sift", ""),
        worst_annotation.get("polyphen", ""),
        worst_annotation.get("mutationtaster_pred", ""),
        fathmm,
        measureset_id,
        clinvar_significance,
        freqs.get("1kg_wgs_phase3", ""),
        freqs.get("1kg_wgs_phase3_popmax", ""),
        freqs.get("exac_v3", ""),
        freqs.get("exac_v3_popmax", ""),
        all_genotypes_string,
    ] + genotypes]


def gene_quicklook(request, project_id, gene_id):
    """Summary of a gene in a project.

    Renders ``project/gene_quicklook.html`` with the project's rare coding
    variants in the gene (max annotated frequency < 0.01 and at least one
    carrier) and the knockout individuals. When the ``download`` GET
    parameter is ``knockouts`` or ``rare_variants`` a CSV attachment is
    returned instead; any other non-empty value yields HTTP 400.
    """
    project = get_object_or_404(Project, project_id=project_id)
    if not project.can_view(request.user):
        return HttpResponse("Unauthorized")

    if project.project_status == Project.NEEDS_MORE_PHENOTYPES and not request.user.is_staff:
        return render(request, 'analysis_unavailable.html',
                      {'reason': 'Awaiting phenotype data.'})

    if gene_id is None:
        return render(request, 'project/gene_quicklook.html', {
            'project': project,
            'gene': None,
            'gene_json': None,
            'rare_variants_json': None,
            'individuals_json': None,
            'knockouts_json': None,
        })

    gene_id = get_gene_id_from_str(gene_id, get_reference())
    gene = get_reference().get_gene(gene_id)
    sys.stderr.write(project_id + " - staring gene search for: %s %s \n" % (gene_id, gene))

    # all rare coding variants
    variant_filter = get_default_variant_filter(
        'all_coding', mall.get_annotator().reference_population_slugs)

    rare_variants = []
    for variant in project_analysis.get_variants_in_gene(
            project, gene_id, variant_filter=variant_filter):
        # Keep only variants that someone in the project actually carries.
        has_carrier = any(
            genotype.num_alt > 0 for genotype in variant.genotypes.values())
        if has_carrier and max(variant.annotation['freqs'].values()) < .01:
            rare_variants.append(variant)

    add_extra_info_to_variants_project(get_reference(), project, rare_variants)

    # compute knockout individuals
    individ_ids_and_variants = []
    knockout_ids, variation = get_knockouts_in_gene(project, gene_id)
    for indiv_id in knockout_ids:
        variants = variation.get_relevant_variants_for_indiv_ids([indiv_id])
        add_extra_info_to_variants_project(get_reference(), project, variants)
        individ_ids_and_variants.append({
            'indiv_id': indiv_id,
            'variants': variants,
        })

    sys.stderr.write(
        "Project-wide gene search retrieved %s rare variants for gene: %s \n" %
        (len(rare_variants), gene_id))

    download_csv = request.GET.get('download', '')
    if not download_csv:
        # HTML view: variants must be JSON-serializable for the template.
        for individ_id_and_variants in individ_ids_and_variants:
            individ_id_and_variants["variants"] = [
                v.toJSON() for v in individ_id_and_variants["variants"]
            ]
        return render(request, 'project/gene_quicklook.html', {
            'gene': gene,
            'gene_json': json.dumps(gene),
            'project': project,
            'rare_variants_json': json.dumps([v.toJSON() for v in rare_variants]),
            'individuals_json': json.dumps(
                [i.get_json_obj() for i in project.get_individuals()]),
            'knockouts_json': json.dumps(individ_ids_and_variants),
        })

    # `download` comes straight from the query string and is interpolated
    # into the Content-Disposition header below, so accept only the two
    # supported values. Previously any other value fell through to undefined
    # locals and produced a server error.
    if download_csv not in ('knockouts', 'rare_variants'):
        return HttpResponse(
            "Invalid download type: must be 'knockouts' or 'rare_variants'",
            status=400)

    response = HttpResponse(content_type='text/csv')
    response['Content-Disposition'] = 'attachment; filename="{}_{}.csv"'.format(
        download_csv, gene["transcript_name"])

    rows = []
    if download_csv == 'knockouts':
        individuals_to_include = [
            individ_id_and_variants["indiv_id"]
            for individ_id_and_variants in individ_ids_and_variants
        ]
        for individ_id_and_variants in individ_ids_and_variants:
            for variant in individ_id_and_variants["variants"]:
                rows.append(_gene_quicklook_csv_row(
                    gene, gene_id, variant, individuals_to_include))
    else:  # 'rare_variants'
        # Carriers of any rare variant, in first-seen order; the set gives
        # O(1) duplicate checks while the list preserves column order.
        individuals_to_include = []
        seen_indiv_ids = set()
        for variant in rare_variants:
            for indiv_id, genotype in variant.genotypes.items():
                if genotype.num_alt > 0 and indiv_id not in seen_indiv_ids:
                    seen_indiv_ids.add(indiv_id)
                    individuals_to_include.append(indiv_id)
        for variant in rare_variants:
            rows.append(_gene_quicklook_csv_row(
                gene, gene_id, variant, individuals_to_include))

    header = [
        "gene", "chr", "pos", "ref", "alt", "rsID", "impact", "HGVS.c",
        "HGVS.p", "sift", "polyphen", "muttaster", "fathmm", "clinvar_id",
        "clinvar_clinical_sig", "freq_1kg_wgs_phase3",
        "freq_1kg_wgs_phase3_popmax", "freq_exac_v3", "freq_exac_v3_popmax",
        "all_genotypes",
    ] + individuals_to_include

    writer = csv.writer(response)
    writer.writerow(header)
    for row in rows:
        writer.writerow(row)
    return response
def handle(self, *args, **options):
    """Export filtered variants for selected individuals of one project.

    Expects two positional args: the project id and the path of a file
    listing individual ids (first tab-separated column of each line; blank
    lines and lines starting with ``#`` are ignored). Writes
    ``individuals_in_<project_id>.tsv.gz`` with one row per variant per
    individual that passes the frequency, quality and depth thresholds.

    Exits via ``sys.exit`` when the arguments are wrong or when the file
    contains ids that are not individuals of the project.
    """
    if len(args) != 2:
        sys.exit("ERROR: please specify the project_id and file of individual ids as command line args.")

    project_id = args[0]
    individuals_file = args[1]

    # init objects
    project = Project.objects.get(project_id=project_id)
    all_individual_ids_in_project = set([i.indiv_id for i in project.get_individuals()])

    individuals_of_interest = []
    invalid_individual_ids = []
    with open(individuals_file) as f:
        for line in f:
            line = line.strip('\n')
            if not line or line.startswith("#"):
                continue
            individual_id = line.split("\t")[0]
            if individual_id in all_individual_ids_in_project:
                individuals_of_interest.append(individual_id)
            else:
                invalid_individual_ids.append(individual_id)

    print("Processing %s: %d individuals " % (project_id, len(individuals_of_interest)))
    if invalid_individual_ids:
        num_invalid = len(invalid_individual_ids)
        # "N out of M ids" refers to the ids listed in the file, so M is the
        # number of ids read, not the number of individuals in the project.
        total_ids = num_invalid + len(individuals_of_interest)
        sys.exit(("ERROR: %(individuals_file)s: %(num_invalid)s out of %(total_ids)s ids are invalid. \nThe invalid ids are: "
                  "%(invalid_individual_ids)s.\nValid ids are: %(individuals_of_interest)s") % {
            'individuals_file': individuals_file,
            'num_invalid': num_invalid,
            'total_ids': total_ids,
            'invalid_individual_ids': invalid_individual_ids,
            'individuals_of_interest': individuals_of_interest,
        })

    # Set gives O(1) membership checks in the per-individual loop below.
    individual_ids_to_export = set(individuals_of_interest)

    # filter
    variant_filter = get_default_variant_filter('moderate_impact')
    variant_filter.ref_freqs.append(('1kg_wgs_phase3', g1k_freq_threshold))
    variant_filter.ref_freqs.append(('1kg_wgs_phase3_popmax', g1k_popmax_freq_threshold))
    variant_filter.ref_freqs.append(('exac_v3', exac_freq_threshold))
    variant_filter.ref_freqs.append(('exac_v3_popmax', exac_popmax_threshold))
    variant_filter.ref_freqs.append(('merck-wgs-3793', merck_wgs_3793_threshold))
    quality_filter = {
        'vcf_filter': 'pass',
        'min_gq': GQ_threshold,
        'min_ab': AB_threshold,
    }

    # create individuals_variants.tsv
    # NOTE(review): under Python 3, gzip mode 'w' is binary while csv.writer
    # emits str; confirm the target interpreter before switching to 'wt'.
    individual_variants_f = gzip.open('individuals_in_%s.tsv.gz' % project_id, 'w')
    try:
        writer = csv.writer(individual_variants_f, dialect='excel', delimiter='\t')

        header_fields = [
            'project_id',
            'family_id',
            'individual_id',
            'gene',
            'chrom',
            'pos',
            'ref',
            'alt',
            'rsid',
            'annotation',
            '1kg_af',
            '1kg_popmax_af',
            'exac_af',
            'exac_popmax_af',
            'merck_wgs_3793_af',
            'genotype_str',
            'genotype_num_alt',
            'genotype_allele_balance',
            'genotype_AD',
            'genotype_DP',
            'genotype_GQ',
            'genotype_PL',
            'genotype_filter',
        ]
        writer.writerow(header_fields)

        # collect the resources that we'll need here
        # `annotator` is not used below; the call is kept in case it has
        # initialization side effects - TODO confirm and remove.
        annotator = get_annotator()
        custom_population_store = get_custom_population_store()

        individual_counter = 0
        for family in project.get_families():
            for individual in family.get_individuals():
                if individual.indiv_id not in individual_ids_to_export:
                    continue
                individual_counter += 1
                print("%s: %s, individual %s" % (individual_counter, family.family_id, individual.indiv_id))

                for variant in get_variants(get_datastore(project.project_id),
                                            family.xfamily(),
                                            variant_filter=variant_filter,
                                            quality_filter=quality_filter,
                                            indivs_to_consider=[individual.indiv_id]):
                    genotype = variant.get_genotype(individual.indiv_id)
                    if len(genotype.alleles) == 0 or genotype.extras["dp"] < DP_threshold or genotype.num_alt == 0:
                        continue

                    custom_populations = custom_population_store.get_frequencies(
                        variant.xpos, variant.ref, variant.alt)

                    genotype_str = "/".join(genotype.alleles) if genotype.alleles else "./."

                    g1k_freq = variant.annotation['freqs']['1kg_wgs_phase3']
                    g1k_popmax_freq = variant.annotation['freqs']['1kg_wgs_phase3_popmax']
                    exac_freq = variant.annotation['freqs']['exac_v3']
                    exac_popmax_freq = variant.annotation['freqs']['exac_v3_popmax']
                    merck_wgs_3793_freq = custom_populations.get('merck-wgs-3793', 0.0)

                    # Sanity checks: the search above should already have
                    # applied every one of these thresholds.
                    assert g1k_freq <= g1k_freq_threshold, "g1k freq %s > %s" % (g1k_freq, g1k_freq_threshold)
                    assert g1k_popmax_freq <= g1k_popmax_freq_threshold, "g1k popmax freq %s > %s" % (g1k_popmax_freq, g1k_popmax_freq_threshold)
                    assert exac_freq <= exac_freq_threshold, "Exac freq %s > %s" % (exac_freq, exac_freq_threshold)
                    assert exac_popmax_freq <= exac_popmax_threshold, "Exac popmax freq %s > %s" % (exac_popmax_freq, exac_popmax_threshold)
                    assert merck_wgs_3793_freq <= merck_wgs_3793_threshold

                    assert genotype.gq >= GQ_threshold, "%s %s - GQ is %s " % (variant.chr, variant.pos, genotype.gq)
                    # Message reports DP (it previously said "GQ" for the
                    # depth check).
                    assert genotype.extras["dp"] >= DP_threshold, "%s %s - DP is %s " % (variant.chr, variant.pos, genotype.extras["dp"])
                    if genotype.num_alt == 1:
                        assert genotype.ab >= AB_threshold/100., "%s %s - AB is %s " % (variant.chr, variant.pos, genotype.ab)
                    assert genotype.filter == "pass", "%s %s - filter is %s " % (variant.chr, variant.pos, genotype.filter)

                    writer.writerow([str(field) for field in [
                        project_id,
                        family.family_id,
                        individual.indiv_id,
                        get_gene_symbol(variant),
                        variant.chr,
                        variant.pos,
                        variant.ref,
                        variant.alt,
                        variant.vcf_id,
                        variant.annotation['vep_group'],
                        g1k_freq,
                        g1k_popmax_freq,
                        exac_freq,
                        exac_popmax_freq,
                        merck_wgs_3793_freq,
                        genotype_str,
                        genotype.num_alt,
                        genotype.ab,
                        genotype.extras["ad"],
                        genotype.extras["dp"],
                        genotype.gq,
                        genotype.extras["pl"],
                        genotype.filter,
                    ]])
                    individual_variants_f.flush()
    finally:
        # Always close so the gzip trailer is written even if a query or an
        # assertion fails part-way through.
        individual_variants_f.close()