def gene_quicklook(request, project_id, gene_id):
    """
    Render the gene quicklook page: a summary of one gene within one project.

    The page receives the project's rare coding variants in the gene and,
    for every knockout individual, the variants relevant to that individual.

    Args:
        request: the incoming request; request.user is checked with
            project.can_view().
        project_id: project_id of the Project to look up.
        gene_id: gene identifier string, resolved via get_gene_id_from_str().

    Returns:
        An HttpResponse with body "Unauthorized" when the user may not view
        the project; otherwise the rendered 'project/gene_quicklook.html'.
    """
    project = get_object_or_404(Project, project_id=project_id)
    if not project.can_view(request.user):
        return HttpResponse("Unauthorized")

    gene_id = get_gene_id_from_str(gene_id, get_reference())
    gene = get_reference().get_gene(gene_id)
    sys.stderr.write(project_id + " - staring gene search for: %s %s \n" % (gene_id, gene))

    population_slugs = mall.get_annotator().reference_population_slugs
    coding_filter = get_default_variant_filter('all_coding', population_slugs)

    # Alt-allele-count ceiling: 0.2 * (number of individuals that have
    # variant data) + 5.
    indivs_with_data = sum(
        1 for indiv in project.get_individuals() if indiv.has_variant_data())
    max_alt_allele_count = (.2 * indivs_with_data) + 5

    rare_variants = []
    gene_variants = project_analysis.get_variants_in_gene(
        project, gene_id, variant_filter=coding_filter)
    for candidate in gene_variants:
        alt_allele_count = get_alt_allele_count(candidate)
        highest_af = max(candidate.annotation['freqs'].values())
        # Keep only variants under both the allele-count ceiling and the
        # 1% reference-population frequency cutoff.
        if not (alt_allele_count <= max_alt_allele_count and highest_af < .01):
            continue
        rare_variants.append(candidate)
    add_extra_info_to_variants_project(get_reference(), project, rare_variants)

    knockouts = []
    knockout_ids, variation = get_knockouts_in_gene(project, gene_id)
    for knockout_indiv_id in knockout_ids:
        indiv_variants = variation.get_relevant_variants_for_indiv_ids(
            [knockout_indiv_id])
        add_extra_info_to_variants_project(get_reference(), project, indiv_variants)
        knockouts.append({
            'indiv_id': knockout_indiv_id,
            'variants': [v.toJSON() for v in indiv_variants],
        })

    sys.stderr.write("Retrieved %s variants \n" % len(rare_variants))

    context = {
        'gene': gene,
        'gene_json': json.dumps(gene),
        'project': project,
        'rare_variants_json': json.dumps([v.toJSON() for v in rare_variants]),
        'individuals_json': json.dumps(
            [indiv.get_json_obj() for indiv in project.get_individuals()]),
        'knockouts_json': json.dumps(knockouts),
    }
    return render(request, 'project/gene_quicklook.html', context)
def gene_quicklook(request, project_id, gene_id):
    """
    Summary of a gene in a project.

    Collects the rare coding variants found in the gene for this project and
    the per-individual variants of every knockout individual, then renders
    them (JSON-encoded) into 'project/gene_quicklook.html'.

    Args:
        request: the incoming request; request.user must pass
            project.can_view().
        project_id: project_id of the Project being viewed.
        gene_id: gene identifier string; normalized with
            get_gene_id_from_str() before use.

    Returns:
        HttpResponse("Unauthorized") if the user cannot view the project,
        otherwise the rendered quicklook page.
    """
    project = get_object_or_404(Project, project_id=project_id)
    user_may_view = project.can_view(request.user)
    if not user_may_view:
        return HttpResponse("Unauthorized")

    gene_id = get_gene_id_from_str(gene_id, get_reference())
    gene = get_reference().get_gene(gene_id)
    sys.stderr.write(project_id + " - staring gene search for: %s %s \n" % (gene_id, gene))

    variant_filter = get_default_variant_filter(
        'all_coding', mall.get_annotator().reference_population_slugs)

    # Threshold on alt allele count: 0.2 per individual with variant data,
    # plus a constant 5.
    num_indivs = 0
    for individual in project.get_individuals():
        if individual.has_variant_data():
            num_indivs += 1
    aac_threshold = (.2 * num_indivs) + 5

    rare_variants = []
    for variant in project_analysis.get_variants_in_gene(
            project, gene_id, variant_filter=variant_filter):
        aac = get_alt_allele_count(variant)
        max_af = max(variant.annotation['freqs'].values())
        is_rare = aac <= aac_threshold and max_af < .01
        if is_rare:
            rare_variants.append(variant)
    add_extra_info_to_variants_project(get_reference(), project, rare_variants)

    knockout_ids, variation = get_knockouts_in_gene(project, gene_id)
    knockouts = []
    for kid in knockout_ids:
        kid_variants = variation.get_relevant_variants_for_indiv_ids([kid])
        add_extra_info_to_variants_project(get_reference(), project, kid_variants)
        kid_variants_json = [v.toJSON() for v in kid_variants]
        knockouts.append({'indiv_id': kid, 'variants': kid_variants_json})

    sys.stderr.write("Retrieved %s variants \n" % len(rare_variants))

    # JSON payloads are built in the same order the template context lists them.
    gene_json = json.dumps(gene)
    rare_variants_json = json.dumps([v.toJSON() for v in rare_variants])
    individuals_json = json.dumps(
        [i.get_json_obj() for i in project.get_individuals()])
    knockouts_json = json.dumps(knockouts)

    return render(request, 'project/gene_quicklook.html', {
        'gene': gene,
        'gene_json': gene_json,
        'project': project,
        'rare_variants_json': rare_variants_json,
        'individuals_json': individuals_json,
        'knockouts_json': knockouts_json,
    })
def gene_quicklook(request, project_id, gene_id):
    """
    Summary of a gene in a project, optionally searched across several projects.

    Query parameters read from request.GET:
        selected_projects: comma-separated project ids to search instead of
            just the project named in the URL. Every one must be viewable by
            the user.
        download: when non-empty, a CSV is returned instead of the HTML page.
            'knockouts' and 'rare_variants' select the rows; any other value
            yields a CSV containing only the header row.

    Args:
        request: the incoming request.
        project_id: project_id of the project the page belongs to.
        gene_id: gene identifier string, or None to render the empty page.

    Returns:
        HttpResponse("Unauthorized") if the user may not view the main
        project or any selected project; the 'analysis_unavailable.html' page
        for non-staff users of a NEEDS_MORE_PHENOTYPES project; a text/csv
        attachment when `download` is set; otherwise the rendered
        'project/gene_quicklook.html' page.
    """
    # Kept under its own name: the loops below iterate over other projects
    # and must not change which project the page is rendered for.
    main_project = get_object_or_404(Project, project_id=project_id)
    if not main_project.can_view(request.user):
        return HttpResponse("Unauthorized")

    if main_project.project_status == Project.NEEDS_MORE_PHENOTYPES and not request.user.is_staff:
        return render(request, 'analysis_unavailable.html',
                      {'reason': 'Awaiting phenotype data.'})

    # other projects this user can view
    if request.user.is_staff:
        viewable_projects = list(Project.objects.all())
    else:
        viewable_projects = [
            c.project
            for c in ProjectCollaborator.objects.filter(user=request.user)
        ]

    # Built as a real list rather than with filter(): on Python 3 filter()
    # returns an iterator that is always truthy, which would make the
    # emptiness check below meaningless.
    other_projects = [
        p for p in viewable_projects
        if get_project_datastore(p.project_id).project_collection_is_loaded(p.project_id)
    ]

    if other_projects:
        other_projects_json = json.dumps([
            {'project_id': p.project_id, 'project_name': p.project_name}
            for p in sorted(other_projects, key=lambda p: p.project_id)
        ])
    else:
        other_projects_json = None

    if gene_id is None:
        # No gene chosen yet: render the page with empty gene payloads.
        return render(request, 'project/gene_quicklook.html', {
            'project': main_project,
            'gene': None,
            'gene_json': None,
            'rare_variants_json': None,
            'individuals_json': None,
            'knockouts_json': None,
            'other_projects_json': other_projects_json,
        })

    projects_to_search_param = request.GET.get('selected_projects')
    if projects_to_search_param:
        projects_to_search = []
        for selected_project_id in projects_to_search_param.split(","):
            selected_project = get_object_or_404(Project, project_id=selected_project_id)
            if not selected_project.can_view(request.user):
                return HttpResponse("Unauthorized")
            projects_to_search.append(selected_project)
    else:
        projects_to_search = [main_project]

    gene_id = get_gene_id_from_str(gene_id, get_reference())
    gene = get_reference().get_gene(gene_id)
    sys.stderr.write(
        project_id + " - staring gene search for: %s in projects: %s\n" %
        (gene_id, ",".join([p.project_id for p in projects_to_search]) + "\n"))

    # all rare coding variants
    variant_filter = get_default_variant_filter(
        'all_coding', mall.get_annotator().reference_population_slugs)

    indiv_id_to_project_id = {}
    rare_variant_dict = {}
    rare_variants = []
    for project in projects_to_search:
        project_variants = []
        for variant in project_analysis.get_variants_in_gene(
                project, gene_id, variant_filter=variant_filter):
            max_af = max(variant.annotation['freqs'].values())
            # Skip variants that no individual actually carries. The test is
            # on the genotype itself, not on the truthiness of the id.
            if not any(genotype.num_alt > 0 for genotype in variant.genotypes.values()):
                continue
            if max_af >= .01:
                continue

            # add project id to genotypes
            for indiv_id in variant.genotypes:
                indiv_id_to_project_id[indiv_id] = project.project_id

            # save this variant (or just its genotypes if the variant has
            # already been seen in another project)
            variant_id = "%s-%s-%s-%s" % (variant.chr, variant.pos, variant.ref, variant.alt)
            if variant_id not in rare_variant_dict:
                rare_variant_dict[variant_id] = variant
                project_variants.append(variant)
            else:
                rare_variant_dict[variant_id].genotypes.update(variant.genotypes)

        add_extra_info_to_variants_project(get_reference(), project, project_variants)
        rare_variants.extend(project_variants)

    sys.stderr.write("Retreived %s rare variants\n" % len(rare_variants))

    # compute knockout individuals
    individ_ids_and_variants = []
    for project in projects_to_search:
        knockout_ids, variation = get_knockouts_in_gene(project, gene_id)
        for indiv_id in knockout_ids:
            variants = variation.get_relevant_variants_for_indiv_ids([indiv_id])
            add_extra_info_to_variants_project(get_reference(), project, variants)
            individ_ids_and_variants.append({
                'indiv_id': indiv_id,
                'variants': variants,
            })
            # The CSV header looks every included individual up in this map;
            # make sure knockout individuals are present without overriding
            # an entry recorded by the rare-variant pass.
            indiv_id_to_project_id.setdefault(indiv_id, project.project_id)

    def build_csv_row(variant, individuals_to_include):
        """Return one CSV row (list of str) for `variant`, with one trailing
        genotype column per id in `individuals_to_include`."""
        worst_annotation_idx = variant.annotation["worst_vep_index_per_gene"][gene_id]
        worst_annotation = variant.annotation["vep_annotation"][worst_annotation_idx]

        genotype_columns = []
        genotype_summaries = []
        for indiv_id in individuals_to_include:
            if indiv_id in variant.genotypes and variant.genotypes[indiv_id].num_alt > 0:
                genotype = variant.genotypes[indiv_id]
                allele_string = ">".join(genotype.alleles)
                genotype_summaries.append(indiv_id + ":" + allele_string + " ")
                genotype_columns.append(allele_string + " (" + str(genotype.gq) + ")")
            else:
                genotype_columns.append("")
        all_genotypes_string = "".join(genotype_summaries)

        measureset_id, clinvar_significance = settings.CLINVAR_VARIANTS.get(
            variant.unique_tuple(), ("", ""))
        freqs = variant.annotation["freqs"]
        fields = [
            gene["symbol"],
            variant.chr,
            variant.pos,
            variant.ref,
            variant.alt,
            variant.vcf_id or "",
            variant.annotation.get("vep_consequence", ""),
            worst_annotation.get("hgvsc", ""),
            worst_annotation.get("hgvsp", "").replace("%3D", "="),
            worst_annotation.get("sift", ""),
            worst_annotation.get("polyphen", ""),
            worst_annotation.get("mutationtaster_pred", ""),
            ";".join(set(worst_annotation.get("fathmm_pred", "").split('%3B'))),
            measureset_id,
            clinvar_significance,
            freqs.get("1kg_wgs_phase3", ""),
            freqs.get("1kg_wgs_phase3_popmax", ""),
            freqs.get("exac_v3", ""),
            freqs.get("exac_v3_popmax", ""),
            all_genotypes_string,
        ]
        return [str(value) for value in fields + genotype_columns]

    download_csv = request.GET.get('download', '')
    if download_csv:
        response = HttpResponse(content_type='text/csv')
        response['Content-Disposition'] = 'attachment; filename="{}_{}.csv"'.format(
            download_csv, gene["transcript_name"])

        # Defaults cover an unrecognized `download` value, which then
        # produces a header-only CSV instead of a NameError.
        individuals_to_include = []
        rows = []
        if download_csv == 'knockouts':
            individuals_to_include = [
                entry["indiv_id"] for entry in individ_ids_and_variants
            ]
            for entry in individ_ids_and_variants:
                for variant in entry["variants"]:
                    rows.append(build_csv_row(variant, individuals_to_include))
        elif download_csv == 'rare_variants':
            # Individuals carrying at least one rare variant, in first-seen order.
            seen_indiv_ids = set()
            for variant in rare_variants:
                for indiv_id, genotype in variant.genotypes.items():
                    if genotype.num_alt > 0 and indiv_id not in seen_indiv_ids:
                        seen_indiv_ids.add(indiv_id)
                        individuals_to_include.append(indiv_id)
            rows = [
                build_csv_row(variant, individuals_to_include)
                for variant in rare_variants
            ]

        header = [
            "gene", "chr", "pos", "ref", "alt", "rsID", "impact", "HGVS.c",
            "HGVS.p", "sift", "polyphen", "muttaster", "fathmm", "clinvar_id",
            "clinvar_clinical_sig", "freq_1kg_wgs_phase3",
            "freq_1kg_wgs_phase3_popmax", "freq_exac_v3",
            "freq_exac_v3_popmax", "all_genotypes",
        ] + [
            indiv_id + " (from %s)" % indiv_id_to_project_id[indiv_id]
            for indiv_id in individuals_to_include
        ]

        writer = csv.writer(response)
        writer.writerow(header)
        for row in rows:
            writer.writerow(row)
        return response

    # HTML page: variant objects must be converted to JSON-serializable form.
    for entry in individ_ids_and_variants:
        entry["variants"] = [v.toJSON() for v in entry["variants"]]

    return render(request, 'project/gene_quicklook.html', {
        'gene': gene,
        'gene_json': json.dumps(gene),
        'project': main_project,
        'rare_variants_json': json.dumps([v.toJSON() for v in rare_variants]),
        'individuals_json': json.dumps([
            indiv.get_json_obj()
            for searched_project in projects_to_search
            for indiv in searched_project.get_individuals()
        ]),
        'knockouts_json': json.dumps(individ_ids_and_variants),
        'other_projects_json': other_projects_json,
    })
def gene_quicklook(request, project_id, gene_id):
    """
    Summary of a gene in a project, as an HTML page or a CSV download.

    Query parameters read from request.GET:
        download: when non-empty, a CSV is returned instead of the HTML page.
            'knockouts' and 'rare_variants' select the rows; any other value
            yields a CSV containing only the header row.

    Args:
        request: the incoming request; request.user must pass
            project.can_view().
        project_id: project_id of the Project being viewed.
        gene_id: gene identifier string, or None to render the empty page.

    Returns:
        HttpResponse("Unauthorized") if the user may not view the project; a
        text/csv attachment when `download` is set; otherwise the rendered
        'project/gene_quicklook.html' page.
    """
    project = get_object_or_404(Project, project_id=project_id)
    if not project.can_view(request.user):
        return HttpResponse("Unauthorized")

    if gene_id is None:
        # No gene chosen yet: render the page with empty gene payloads.
        return render(request, 'project/gene_quicklook.html', {
            'project': project,
            'gene': None,
            'gene_json': None,
            'rare_variants_json': None,
            'individuals_json': None,
            'knockouts_json': None,
        })

    gene_id = get_gene_id_from_str(gene_id, get_reference())
    gene = get_reference().get_gene(gene_id)
    sys.stderr.write(project_id + " - staring gene search for: %s %s \n" % (gene_id, gene))

    # all rare coding variants
    variant_filter = get_default_variant_filter(
        'all_coding', mall.get_annotator().reference_population_slugs)
    rare_variants = [
        variant
        for variant in project_analysis.get_variants_in_gene(
            project, gene_id, variant_filter=variant_filter)
        if max(variant.annotation['freqs'].values()) < .01
    ]
    add_extra_info_to_variants_project(get_reference(), project, rare_variants)

    # compute knockout individuals
    individ_ids_and_variants = []
    knockout_ids, variation = get_knockouts_in_gene(project, gene_id)
    for indiv_id in knockout_ids:
        variants = variation.get_relevant_variants_for_indiv_ids([indiv_id])
        add_extra_info_to_variants_project(get_reference(), project, variants)
        individ_ids_and_variants.append({
            'indiv_id': indiv_id,
            'variants': variants,
        })

    sys.stderr.write("Project-wide gene search retrieved %s rare variants for gene: %s \n" % (len(rare_variants), gene_id))

    def build_csv_row(variant, individuals_to_include):
        """Return one CSV row (list of str) for `variant`, with one trailing
        genotype column per id in `individuals_to_include`."""
        worst_annotation_idx = variant.annotation["worst_vep_index_per_gene"][gene_id]
        worst_annotation = variant.annotation["vep_annotation"][worst_annotation_idx]

        genotype_columns = []
        genotype_summaries = []
        for indiv_id in individuals_to_include:
            if indiv_id not in variant.genotypes:
                # This variant has no genotype for the individual: leave the
                # column blank rather than failing with KeyError.
                genotype_columns.append("")
                continue
            genotype = variant.genotypes[indiv_id]
            allele_string = ">".join(genotype.alleles)
            genotype_summaries.append(indiv_id + ":" + allele_string + " ")
            if genotype.num_alt > 0:
                genotype_columns.append(allele_string + " (" + str(genotype.gq) + ")")
            else:
                genotype_columns.append("")
        all_genotypes_string = "".join(genotype_summaries)

        measureset_id, clinvar_significance = settings.CLINVAR_VARIANTS.get(
            variant.unique_tuple(), ("", ""))
        freqs = variant.annotation["freqs"]
        fields = [
            gene["symbol"],
            variant.chr,
            variant.pos,
            variant.ref,
            variant.alt,
            variant.vcf_id or "",
            variant.annotation.get("vep_consequence", ""),
            worst_annotation.get("hgvsc", ""),
            worst_annotation.get("hgvsp", "").replace("%3D", "="),
            worst_annotation.get("sift", ""),
            worst_annotation.get("polyphen", ""),
            worst_annotation.get("mutationtaster_pred", ""),
            ";".join(set(worst_annotation.get("fathmm_pred", "").split('%3B'))),
            measureset_id,
            clinvar_significance,
            freqs.get("1kg_wgs_phase3", ""),
            freqs.get("1kg_wgs_phase3_popmax", ""),
            freqs.get("exac_v3", ""),
            freqs.get("exac_v3_popmax", ""),
            all_genotypes_string,
        ]
        return [str(value) for value in fields + genotype_columns]

    download_csv = request.GET.get('download', '')
    if download_csv:
        response = HttpResponse(content_type='text/csv')
        response['Content-Disposition'] = 'attachment; filename="{}_{}.csv"'.format(
            download_csv, gene["transcript_name"])

        # Defaults cover an unrecognized `download` value, which then
        # produces a header-only CSV instead of a NameError.
        individuals_to_include = []
        rows = []
        if download_csv == 'knockouts':
            individuals_to_include = [
                entry["indiv_id"] for entry in individ_ids_and_variants
            ]
            for entry in individ_ids_and_variants:
                for variant in entry["variants"]:
                    rows.append(build_csv_row(variant, individuals_to_include))
        elif download_csv == 'rare_variants':
            # Individuals carrying at least one rare variant, in first-seen order.
            seen_indiv_ids = set()
            for variant in rare_variants:
                for indiv_id, genotype in variant.genotypes.items():
                    if genotype.num_alt > 0 and indiv_id not in seen_indiv_ids:
                        seen_indiv_ids.add(indiv_id)
                        individuals_to_include.append(indiv_id)
            rows = [
                build_csv_row(variant, individuals_to_include)
                for variant in rare_variants
            ]

        header = [
            "gene", "chr", "pos", "ref", "alt", "rsID", "impact", "HGVS.c",
            "HGVS.p", "sift", "polyphen", "muttaster", "fathmm", "clinvar_id",
            "clinvar_clinical_sig", "freq_1kg_wgs_phase3",
            "freq_1kg_wgs_phase3_popmax", "freq_exac_v3",
            "freq_exac_v3_popmax", "all_genotypes",
        ] + individuals_to_include

        writer = csv.writer(response)
        writer.writerow(header)
        for row in rows:
            writer.writerow(row)
        return response

    # HTML page: variant objects must be converted to JSON-serializable form.
    for entry in individ_ids_and_variants:
        entry["variants"] = [v.toJSON() for v in entry["variants"]]

    return render(request, 'project/gene_quicklook.html', {
        'gene': gene,
        'gene_json': json.dumps(gene),
        'project': project,
        'rare_variants_json': json.dumps([v.toJSON() for v in rare_variants]),
        'individuals_json': json.dumps([i.get_json_obj() for i in project.get_individuals()]),
        'knockouts_json': json.dumps(individ_ids_and_variants),
    })
def gene_quicklook(request, project_id, gene_id):
    """
    Summary of a gene in a project, optionally searched across several projects.

    Query parameters read from request.GET:
        selected_projects: comma-separated project ids to search instead of
            just the main project. Every id must belong to a project returned
            by get_loaded_projects_for_user() for this user.
        download: when non-empty, a CSV is returned instead of the HTML page.
            'knockouts' and 'rare_variants' select the rows; any other value
            yields a CSV containing only the header row.

    Args:
        request: the incoming request.
        project_id: project_id of the main project the page belongs to.
        gene_id: gene identifier string, or None to render the empty page.

    Returns:
        HttpResponse("Unauthorized") if the user may not view the main
        project or any selected project; a text/csv attachment when
        `download` is set; otherwise the rendered
        'project/gene_quicklook.html' page.
    """
    main_project = get_object_or_404(Project, project_id=project_id)
    if not main_project.can_view(request.user):
        return HttpResponse("Unauthorized")

    # other projects this user can view
    other_projects = get_loaded_projects_for_user(
        request.user, fields=['project_id', 'project_name'])

    if other_projects:
        other_projects_json = json.dumps([
            {'project_id': p.project_id, 'project_name': p.project_name}
            for p in sorted(other_projects, key=lambda p: p.project_id.lower())
        ])
    else:
        other_projects_json = None

    if gene_id is None:
        # No gene chosen yet: render the page with empty gene payloads.
        return render(request, 'project/gene_quicklook.html', {
            'project': main_project,
            'gene': None,
            'gene_json': None,
            'rare_variants_json': None,
            'individuals_json': None,
            'knockouts_json': None,
            'other_projects_json': other_projects_json,
        })

    projects_to_search_param = request.GET.get('selected_projects')
    if projects_to_search_param:
        project_ids = projects_to_search_param.split(",")
        requested_ids = set(project_ids)
        projects_to_search = [
            candidate for candidate in other_projects
            if candidate.project_id in requested_ids
        ]
        # Every requested id must match a project the user can see. Comparing
        # id sets (not list lengths) keeps a repeated id in the parameter
        # from being rejected, and keeps a repeated project in
        # other_projects from masking an id that matched nothing.
        authorized_ids = set(p.project_id for p in projects_to_search)
        if requested_ids - authorized_ids:
            return HttpResponse("Unauthorized")
    else:
        project_ids = [main_project.project_id]
        projects_to_search = [main_project]

    gene_id = get_gene_id_from_str(gene_id, get_reference())
    gene = get_reference().get_gene(gene_id)
    sys.stderr.write(
        project_id + " - staring gene search for: %s in projects: %s\n" %
        (gene_id, ",".join([p.project_id for p in projects_to_search]) + "\n"))

    # all rare coding variants
    variant_filter = get_default_variant_filter(
        'all_coding', mall.get_annotator().reference_population_slugs)

    indiv_id_to_project_id = {}
    rare_variant_dict = {}
    rare_variants = []
    individ_ids_and_variants = []
    for project in projects_to_search:
        all_project_variants = project_analysis.get_variants_in_gene(
            project, gene_id, variant_filter=variant_filter)

        # compute knockout individuals
        knockout_ids, variation = get_knockouts_in_gene(
            project, gene_id, all_project_variants)
        for indiv_id in knockout_ids:
            variants = variation.get_relevant_variants_for_indiv_ids([indiv_id])
            individ_ids_and_variants.append({
                'indiv_id': indiv_id,
                'variants': variants,
            })
            # The CSV header looks every included individual up in this map;
            # make sure knockout individuals are present. The rare-variant
            # pass below still overwrites entries exactly as before.
            indiv_id_to_project_id.setdefault(indiv_id, project.project_id)

        # compute rare variants
        project_variants = []
        for variant in all_project_variants:
            # don't filter on within-cohort AF
            max_af = max([
                freq for label, freq in variant.annotation['freqs'].items()
                if label != "AF"
            ])
            # Skip variants that no individual actually carries. The test is
            # on the genotype itself, not on the truthiness of the id.
            if not any(genotype.num_alt > 0 for genotype in variant.genotypes.values()):
                continue
            if max_af >= .01:
                continue

            # add project id to genotypes
            for indiv_id in variant.genotypes:
                indiv_id_to_project_id[indiv_id] = project.project_id

            # save this variant (or just its genotypes if the variant has
            # already been seen in another project)
            variant_id = "%s-%s-%s-%s" % (variant.chr, variant.pos, variant.ref, variant.alt)
            if variant_id not in rare_variant_dict:
                rare_variant_dict[variant_id] = variant
                project_variants.append(variant)
            else:
                rare_variant_dict[variant_id].genotypes.update(variant.genotypes)
        rare_variants.extend(project_variants)

    # Rare variants first, then each knockout individual's variants.
    all_variants = list(rare_variants)
    for entry in individ_ids_and_variants:
        all_variants.extend(entry['variants'])
    # NOTE(review): `project` here is whichever project the loop above
    # visited last, so variants from every searched project are annotated
    # against it - confirm this is intended.
    add_extra_info_to_variants_project(get_reference(), project, all_variants)

    def build_csv_row(variant, individuals_to_include):
        """Return one CSV row (list of str) for `variant`, with one trailing
        genotype column per id in `individuals_to_include`."""
        worst_annotation_idx = variant.annotation["worst_vep_index_per_gene"][gene_id]
        worst_annotation = variant.annotation["vep_annotation"][worst_annotation_idx]

        genotype_columns = []
        genotype_summaries = []
        for indiv_id in individuals_to_include:
            if indiv_id in variant.genotypes and variant.genotypes[indiv_id].num_alt > 0:
                genotype = variant.genotypes[indiv_id]
                allele_string = ">".join(genotype.alleles)
                genotype_summaries.append(indiv_id + ":" + allele_string + " ")
                genotype_columns.append(allele_string + " (" + str(genotype.gq) + ")")
            else:
                genotype_columns.append("")
        all_genotypes_string = "".join(genotype_summaries)

        measureset_id, clinvar_significance = get_reference().get_clinvar_info(
            *variant.unique_tuple())
        freqs = variant.annotation["freqs"]
        fields = [
            gene["symbol"],
            variant.chr,
            variant.pos,
            variant.ref,
            variant.alt,
            variant.vcf_id or "",
            variant.annotation.get("vep_consequence", ""),
            worst_annotation.get("hgvsc", ""),
            worst_annotation.get("hgvsp", "").replace("%3D", "="),
            worst_annotation.get("sift", ""),
            worst_annotation.get("polyphen", ""),
            worst_annotation.get("mutationtaster_pred", ""),
            ";".join(set(worst_annotation.get("fathmm_pred", "").split('%3B'))),
            measureset_id,
            clinvar_significance,
            freqs.get("1kg_wgs_phase3", ""),
            freqs.get("1kg_wgs_phase3_popmax", ""),
            freqs.get("exac_v3", ""),
            freqs.get("exac_v3_popmax", ""),
            all_genotypes_string,
        ]
        return [str(value) for value in fields + genotype_columns]

    download_csv = request.GET.get('download', '')
    if download_csv:
        response = HttpResponse(content_type='text/csv')
        response['Content-Disposition'] = 'attachment; filename="{}_{}.csv"'.format(
            download_csv, gene.get("symbol") or gene.get("transcript_name"))

        # Defaults cover an unrecognized `download` value, which then
        # produces a header-only CSV instead of a NameError.
        individuals_to_include = []
        rows = []
        if download_csv == 'knockouts':
            individuals_to_include = [
                entry["indiv_id"] for entry in individ_ids_and_variants
            ]
            for entry in individ_ids_and_variants:
                for variant in entry["variants"]:
                    rows.append(build_csv_row(variant, individuals_to_include))
        elif download_csv == 'rare_variants':
            # Individuals carrying at least one rare variant, in first-seen order.
            seen_indiv_ids = set()
            for variant in rare_variants:
                for indiv_id, genotype in variant.genotypes.items():
                    if genotype.num_alt > 0 and indiv_id not in seen_indiv_ids:
                        seen_indiv_ids.add(indiv_id)
                        individuals_to_include.append(indiv_id)
            rows = [
                build_csv_row(variant, individuals_to_include)
                for variant in rare_variants
            ]

        header = [
            "gene", "chr", "pos", "ref", "alt", "rsID", "impact", "HGVS.c",
            "HGVS.p", "sift", "polyphen", "muttaster", "fathmm", "clinvar_id",
            "clinvar_clinical_sig", "freq_1kg_wgs_phase3",
            "freq_1kg_wgs_phase3_popmax", "freq_exac_v3",
            "freq_exac_v3_popmax", "all_genotypes",
        ] + [
            indiv_id + " (from %s)" % indiv_id_to_project_id[indiv_id]
            for indiv_id in individuals_to_include
        ]

        writer = csv.writer(response)
        writer.writerow(header)
        for row in rows:
            writer.writerow(row)
        return response

    # HTML page: variant objects must be converted to JSON-serializable form.
    for entry in individ_ids_and_variants:
        entry["variants"] = [v.toJSON() for v in entry["variants"]]

    # Only load the individuals that actually appear in the results.
    individ_ids = {entry['indiv_id'] for entry in individ_ids_and_variants}
    for var in rare_variants:
        individ_ids.update(var.genotypes.keys())
    individuals = Individual.objects.filter(
        indiv_id__in=individ_ids, project__project_id__in=project_ids
    ).select_related('project').select_related('family').only(
        'project__project_id', 'family__family_id',
        *Individual.INDIVIDUAL_JSON_FIELDS_NO_IDS)

    return render(request, 'project/gene_quicklook.html', {
        'gene': gene,
        'gene_json': json.dumps(gene),
        'project': main_project,
        'rare_variants_json': json.dumps([v.toJSON() for v in rare_variants]),
        'individuals_json': json.dumps([
            i.get_json_obj(skip_has_variant_data=True) for i in individuals
        ]),
        'knockouts_json': json.dumps(individ_ids_and_variants),
        'other_projects_json': other_projects_json,
    })
def search_for_gene(self, search_gene_id, project_id_list, max_af=0.01):
    '''
    Search for a gene across project(s) and write the matching variants to
    a tab-separated file named 'results_<search_gene_id>.tsv' in the current
    working directory.

    Args:
      1. search_gene_id: Gene ID to search for
      2. project_id_list: An optional list of project ids to narrow the
         search down to; when empty or None every Project is searched
      3. max_af: variants whose highest value in annotation['freqs'] is at
         or above this threshold are skipped

    Raises:
      IndexError: if an id in project_id_list matches no Project. All ids
        are resolved before any project is searched.
    '''
    gene_id = get_gene_id_from_str(search_gene_id, get_reference())
    gene = get_reference().get_gene(gene_id)

    # `or []` keeps the log line from crashing when no project list is given.
    print("Staring gene search for: %s %s in projects: %s\n" % (search_gene_id, gene['gene_id'], ", ".join(project_id_list or [])))
    print("Max AF threshold: %s" % max_af)

    # all rare coding variants
    variant_filter = get_default_variant_filter(
        'all_coding', mall.get_annotator().reference_population_slugs)
    print("All Filters: ")
    pprint(variant_filter.toJSON())

    output_filename = 'results_' + search_gene_id + '.tsv'
    header = [
        "project_id", "gene", "chr", "pos", "ref", "alt", "rsID", "filter",
        "impact", "HGVS.c", "HGVS.p", "sift", "polyphen", "muttaster",
        "fathmm", "clinvar_id", "clinvar_clinical_sig", "freq_1kg_wgs_phase3",
        "freq_1kg_wgs_phase3_popmax", "freq_exac_v3", "freq_exac_v3_popmax",
        "all_genotypes",
    ]

    # The context manager closes the file even if a project lookup or a
    # variant query raises part-way through.
    with open(output_filename, 'w') as outfile:
        writer = csv.writer(outfile, delimiter='\t')
        writer.writerow(header)

        # Resolve each project once, as (id, project) pairs.
        if project_id_list:
            # TODO validate: a missing project currently surfaces as IndexError
            projects = [
                (project_id, Project.objects.filter(project_id=project_id)[0])
                for project_id in project_id_list
            ]
        else:
            projects = [(p.project_id, p) for p in Project.objects.all()]

        for project_id, project in projects:
            if get_project_datastore(project_id).project_collection_is_loaded(project_id):
                print("Running on project %s" % project_id)
            else:
                print("Skipping project %s - gene search is not enabled for this project" % project_id)
                continue

            for variant in project_analysis.get_variants_in_gene(
                    project, gene_id, variant_filter=variant_filter):
                if max(variant.annotation['freqs'].values()) >= max_af:
                    continue

                add_extra_info_to_variants_project(get_reference(), project, [variant])

                worst_annotation_idx = variant.annotation["worst_vep_index_per_gene"][gene_id]
                worst_annotation = variant.annotation["vep_annotation"][worst_annotation_idx]

                all_genotypes_list = []
                pass_filter = "N/A"
                for indiv_id, genotype in variant.genotypes.items():
                    # filter value is stored in the genotypes even though
                    # it's the same for all individuals
                    pass_filter = genotype.filter
                    if genotype.num_alt > 0:
                        allele_balance = genotype.ab if genotype.ab is not None else float('NaN')
                        all_genotypes_list.append(
                            "%s[gt:%s GQ:%s AB:%0.3f]" % (
                                indiv_id, ">".join(genotype.alleles),
                                genotype.gq, allele_balance))

                measureset_id, clinvar_significance = settings.CLINVAR_VARIANTS.get(
                    variant.unique_tuple(), ("", ""))
                freqs = variant.annotation["freqs"]
                fields = [
                    project_id,
                    gene["symbol"],
                    variant.chr,
                    variant.pos,
                    variant.ref,
                    variant.alt,
                    variant.vcf_id or "",
                    pass_filter,
                    variant.annotation.get("vep_consequence", ""),
                    worst_annotation.get("hgvsc", ""),
                    worst_annotation.get("hgvsp", "").replace("%3D", "="),
                    worst_annotation.get("sift", ""),
                    worst_annotation.get("polyphen", ""),
                    worst_annotation.get("mutationtaster_pred", ""),
                    ";".join(set(worst_annotation.get("fathmm_pred", "").split('%3B'))),
                    measureset_id,
                    clinvar_significance,
                    freqs.get("1kg_wgs_phase3", ""),
                    freqs.get("1kg_wgs_phase3_popmax", ""),
                    freqs.get("exac_v3", ""),
                    freqs.get("exac_v3_popmax", ""),
                    ", ".join(all_genotypes_list),
                ]
                writer.writerow([str(value) for value in fields])

    print("Wrote out %s" % output_filename)
def gene_quicklook(request, project_id, gene_id):
    """
    Summary of a gene in a project, optionally pooled over several projects.

    Renders 'project/gene_quicklook.html' with the rare variants and knockout
    individuals found for the gene, or returns them as a CSV attachment when
    the `download` query parameter is 'knockouts' or 'rare_variants'.

    Query parameters read from request.GET:
        selected_projects: comma-separated project ids to search instead of
            only the main project; each must be in the list returned by
            get_loaded_projects_for_user, otherwise "Unauthorized" is returned.
        download: 'knockouts' or 'rare_variants' for a CSV; any other
            non-empty value gets an HTTP 400 response.

    When gene_id is None the page is rendered with no gene data.
    """
    main_project = get_object_or_404(Project, project_id=project_id)
    if not main_project.can_view(request.user):
        return HttpResponse("Unauthorized")

    if main_project.seqr_project and main_project.seqr_project.has_new_search:
        new_page_url = '/variant_search/project/{}'.format(main_project.seqr_project.guid)
    else:
        new_page_url = None

    # other projects this user can view
    other_projects = get_loaded_projects_for_user(request.user, fields=['project_id', 'project_name'])
    if other_projects:
        other_projects_json = json.dumps([
            {'project_id': p.project_id, 'project_name': p.project_name}
            for p in sorted(other_projects, key=lambda p: p.project_id.lower())
        ])
    else:
        other_projects_json = None

    if gene_id is None:
        return render(request, 'project/gene_quicklook.html', {
            'project': main_project,
            'gene': None,
            'gene_json': None,
            'rare_variants_json': None,
            'individuals_json': None,
            'knockouts_json': None,
            'other_projects_json': other_projects_json,
            'new_page_url': new_page_url,
        })

    projects_to_search_param = request.GET.get('selected_projects')
    if projects_to_search_param:
        project_ids = projects_to_search_param.split(",")
        projects_to_search = [project for project in other_projects if project.project_id in project_ids]
        if len(projects_to_search) < len(project_ids):
            # If not all the specified project ids are in the other projects
            # list then they are not authorized
            return HttpResponse("Unauthorized")
    else:
        project_ids = [main_project.project_id]
        projects_to_search = [main_project]

    gene_id = get_gene_id_from_str(gene_id, get_reference())
    gene = get_reference().get_gene(gene_id)

    # all rare coding variants
    variant_filter = get_default_variant_filter('all_coding', mall.get_annotator().reference_population_slugs)

    indiv_id_to_project_id = {}
    rare_variant_dict = {}
    rare_variants = []
    individ_ids_and_variants = []
    for project in projects_to_search:
        all_project_variants = project_analysis.get_variants_in_gene(project, gene_id, variant_filter=variant_filter)

        # compute knockout individuals
        knockout_ids, variation = get_knockouts_in_gene(project, gene_id, all_project_variants)
        for indiv_id in knockout_ids:
            variants = variation.get_relevant_variants_for_indiv_ids([indiv_id])
            individ_ids_and_variants.append({
                'indiv_id': indiv_id,
                'variants': variants,
            })

        # compute rare variants
        project_variants = []
        for variant in all_project_variants:
            # don't filter on within-cohort AF
            max_af = max([freq for label, freq in variant.annotation['freqs'].items() if label != "AF"])

            if not any([indiv_id for indiv_id, genotype in variant.genotypes.items() if genotype.num_alt > 0]):
                continue
            if max_af >= .01:
                continue

            # add project id to genotypes
            for indiv_id in variant.genotypes:
                indiv_id_to_project_id[indiv_id] = project.project_id

            # save this variant (or just its genotypes if the variant has
            # already been seen in another project)
            variant_id = "%s-%s-%s-%s" % (variant.chr, variant.pos, variant.ref, variant.alt)
            if variant_id not in rare_variant_dict:
                rare_variant_dict[variant_id] = variant
                project_variants.append(variant)
            else:
                known_genotypes = rare_variant_dict[variant_id].genotypes
                for indiv_id, genotype in variant.genotypes.items():
                    existing_genotype = known_genotypes.get(indiv_id)
                    if not existing_genotype or existing_genotype.num_alt == -1:
                        known_genotypes[indiv_id] = genotype

        if project != main_project:
            add_extra_info_to_variants_project(get_reference(), project, project_variants)
        rare_variants.extend(project_variants)

    all_variants = sum([i['variants'] for i in individ_ids_and_variants], rare_variants)
    add_extra_info_to_variants_project(get_reference(), main_project, all_variants, add_family_tags=True)

    download_csv = request.GET.get('download', '')
    if download_csv:
        # Previously an unrecognised value fell through both branches and hit
        # a NameError on `rows`; reject it explicitly instead.
        if download_csv not in ('knockouts', 'rare_variants'):
            return HttpResponse("Unsupported download type", status=400)

        response = HttpResponse(content_type='text/csv')
        response['Content-Disposition'] = 'attachment; filename="{}_{}.csv"'.format(
            download_csv, gene.get("symbol") or gene.get("transcript_name"))

        def get_row(variant, worst_annotation):
            # Build one CSV row; reads `individuals_to_include` from the
            # enclosing scope, which is assigned before the first call.
            if 'clinvar_allele_id' in variant.extras:
                measureset_id = variant.extras['clinvar_allele_id']
                clinvar_significance = variant.extras['clinvar_clinsig']
            else:
                measureset_id, clinvar_significance = get_reference().get_clinvar_info(*variant.unique_tuple())

            genotypes = []
            all_genotypes_string = ""
            for indiv_id in individuals_to_include:
                if indiv_id in variant.genotypes and variant.genotypes[indiv_id].num_alt > 0:
                    genotype = variant.genotypes[indiv_id]
                    allele_string = ">".join(genotype.alleles)
                    all_genotypes_string += indiv_id + ":" + allele_string + "  "
                    genotypes.append(allele_string + "   (" + str(genotype.gq) + ")")
                else:
                    genotypes.append("")

            freqs = variant.annotation["freqs"]
            return [
                gene["symbol"],
                variant.chr,
                variant.pos,
                variant.ref,
                variant.alt,
                variant.vcf_id or variant.annotation.get("rsid") or "",
                variant.annotation.get("vep_consequence") or "",
                worst_annotation.get("hgvsc") or "",
                (worst_annotation.get("hgvsp") or "").replace("%3D", "="),
                variant.annotation.get("sift") or "",
                variant.annotation.get("polyphen") or "",
                variant.annotation.get("mutationtaster_pred") or variant.annotation.get("muttaster") or "",
                (";".join(set((worst_annotation.get("fathmm_pred") or "").split('%3B')))) or variant.annotation.get("fathmm") or "",
                measureset_id or "",
                clinvar_significance or "",
                freqs.get("1kg_wgs_phase3") or freqs.get("1kg_wgs_AF") or "",
                freqs.get("1kg_wgs_phase3_popmax") or freqs.get("1kg_wgs_popmax_AF") or "",
                freqs.get("exac_v3") or freqs.get("exac_v3_AF") or "",
                freqs.get("exac_v3_popmax") or freqs.get("exac_v3_popmax_AF") or "",
                freqs.get("gnomad_exomes_AF") or "",
                freqs.get("gnomad_exomes_popmax_AF") or "",
                freqs.get("gnomad_genomes_AF") or "",
                freqs.get("gnomad_genomes_popmax_AF") or "",
                all_genotypes_string,
            ] + genotypes

        if download_csv == 'knockouts':
            individuals_to_include = [entry["indiv_id"] for entry in individ_ids_and_variants]
            csv_variants = [variant for entry in individ_ids_and_variants for variant in entry["variants"]]
        else:  # 'rare_variants'
            individuals_to_include = []
            for variant in rare_variants:
                for indiv_id, genotype in variant.genotypes.items():
                    if genotype.num_alt > 0 and indiv_id not in individuals_to_include:
                        individuals_to_include.append(indiv_id)
            csv_variants = rare_variants

        rows = []
        for variant in csv_variants:
            worst_annotation_idx = variant.annotation["worst_vep_index_per_gene"][gene_id]
            worst_annotation = variant.annotation["vep_annotation"][worst_annotation_idx]
            rows.append([str(value) for value in get_row(variant, worst_annotation)])

        header = ["gene", "chr", "pos", "ref", "alt", "rsID", "impact",
                  "HGVS.c", "HGVS.p", "sift", "polyphen", "muttaster", "fathmm", "clinvar_id", "clinvar_clinical_sig",
                  "freq_1kg_wgs_phase3", "freq_1kg_wgs_phase3_popmax",
                  "freq_exac_v3", "freq_exac_v3_popmax",
                  "freq_gnomad_exomes", "freq_gnomad_exomes_popmax",
                  "freq_gnomad_genomes", "freq_gnomad_genomes_popmax",
                  "all_genotypes"] + [i + " (from %s)" % indiv_id_to_project_id[i] for i in individuals_to_include]

        writer = csv.writer(response)
        writer.writerow(header)
        for row in rows:
            writer.writerow(row)
        return response

    # HTML view: variants must be JSON-serialisable before rendering.
    for individ_id_and_variants in individ_ids_and_variants:
        variants = individ_id_and_variants["variants"]
        individ_id_and_variants["variants"] = [v.toJSON() for v in variants]

    individ_ids = {i['indiv_id'] for i in individ_ids_and_variants}
    for var in rare_variants:
        individ_ids.update(var.genotypes.keys())
    individuals = Individual.objects.filter(
        indiv_id__in=individ_ids, project__project_id__in=project_ids
    ).select_related('project').select_related('family').only(
        'project__project_id', 'family__family_id', *Individual.INDIVIDUAL_JSON_FIELDS_NO_IDS)

    return render(request, 'project/gene_quicklook.html', {
        'gene': gene,
        'gene_json': json.dumps(gene),
        'project': main_project,
        'rare_variants_json': json.dumps([v.toJSON() for v in rare_variants]),
        'individuals_json': json.dumps([i.get_json_obj(skip_has_variant_data=True) for i in individuals]),
        'knockouts_json': json.dumps(individ_ids_and_variants),
        'other_projects_json': other_projects_json,
        'new_page_url': new_page_url,
    })
def gene_quicklook(request, project_id, gene_id):
    """
    Summary of a gene in a project.

    Renders 'project/gene_quicklook.html' with the project's rare variants and
    knockout individuals for the gene, or returns them as a CSV attachment
    when the `download` query parameter is 'knockouts' or 'rare_variants'
    (any other non-empty value gets an HTTP 400 response).

    Non-staff users get the 'analysis_unavailable.html' page while the project
    status is Project.NEEDS_MORE_PHENOTYPES. When gene_id is None the page is
    rendered with no gene data.
    """
    project = get_object_or_404(Project, project_id=project_id)
    if not project.can_view(request.user):
        return HttpResponse("Unauthorized")

    if project.project_status == Project.NEEDS_MORE_PHENOTYPES and not request.user.is_staff:
        return render(request, 'analysis_unavailable.html', {'reason': 'Awaiting phenotype data.'})

    if gene_id is None:
        return render(request, 'project/gene_quicklook.html', {
            'project': project,
            'gene': None,
            'gene_json': None,
            'rare_variants_json': None,
            'individuals_json': None,
            'knockouts_json': None,
        })

    gene_id = get_gene_id_from_str(gene_id, get_reference())
    gene = get_reference().get_gene(gene_id)
    sys.stderr.write(project_id + " - staring gene search for: %s %s \n" % (gene_id, gene))

    # all rare coding variants
    variant_filter = get_default_variant_filter('all_coding', mall.get_annotator().reference_population_slugs)

    rare_variants = []
    for variant in project_analysis.get_variants_in_gene(project, gene_id, variant_filter=variant_filter):
        max_af = max(variant.annotation['freqs'].values())
        if not any([indiv_id for indiv_id, genotype in variant.genotypes.items() if genotype.num_alt > 0]):
            continue
        if max_af < .01:
            rare_variants.append(variant)

    add_extra_info_to_variants_project(get_reference(), project, rare_variants)

    # compute knockout individuals
    individ_ids_and_variants = []
    knockout_ids, variation = get_knockouts_in_gene(project, gene_id)
    for indiv_id in knockout_ids:
        variants = variation.get_relevant_variants_for_indiv_ids([indiv_id])
        add_extra_info_to_variants_project(get_reference(), project, variants)
        individ_ids_and_variants.append({
            'indiv_id': indiv_id,
            'variants': variants,
        })

    sys.stderr.write("Project-wide gene search retrieved %s rare variants for gene: %s \n" % (len(rare_variants), gene_id))

    download_csv = request.GET.get('download', '')
    if download_csv:
        # Previously an unrecognised value fell through both branches and hit
        # a NameError on `rows`; reject it explicitly instead.
        if download_csv not in ('knockouts', 'rare_variants'):
            return HttpResponse("Unsupported download type", status=400)

        response = HttpResponse(content_type='text/csv')
        response['Content-Disposition'] = 'attachment; filename="{}_{}.csv"'.format(download_csv, gene["transcript_name"])

        def build_row(variant, indiv_ids):
            # One CSV row: fixed annotation columns followed by one genotype
            # column per individual in `indiv_ids` (same order as the header).
            worst_annotation_idx = variant.annotation["worst_vep_index_per_gene"][gene_id]
            worst_annotation = variant.annotation["vep_annotation"][worst_annotation_idx]

            genotypes = []
            genotype_summaries = []
            for indiv_id in indiv_ids:
                genotype = variant.genotypes[indiv_id]
                allele_string = ">".join(genotype.alleles)
                genotype_summaries.append(indiv_id + ":" + allele_string + "  ")
                if genotype.num_alt > 0:
                    genotypes.append(allele_string + "   (" + str(genotype.gq) + ")")
                else:
                    genotypes.append("")
            all_genotypes_string = "".join(genotype_summaries)

            measureset_id, clinvar_significance = settings.CLINVAR_VARIANTS.get(variant.unique_tuple(), ("", ""))
            freqs = variant.annotation["freqs"]
            return [str(value) for value in [
                gene["symbol"],
                variant.chr,
                variant.pos,
                variant.ref,
                variant.alt,
                variant.vcf_id or "",
                variant.annotation.get("vep_consequence", ""),
                worst_annotation.get("hgvsc", ""),
                worst_annotation.get("hgvsp", "").replace("%3D", "="),
                worst_annotation.get("sift", ""),
                worst_annotation.get("polyphen", ""),
                worst_annotation.get("mutationtaster_pred", ""),
                ";".join(set(worst_annotation.get("fathmm_pred", "").split('%3B'))),
                measureset_id,
                clinvar_significance,
                freqs.get("1kg_wgs_phase3", ""),
                freqs.get("1kg_wgs_phase3_popmax", ""),
                freqs.get("exac_v3", ""),
                freqs.get("exac_v3_popmax", ""),
                all_genotypes_string,
            ] + genotypes]

        if download_csv == 'knockouts':
            individuals_to_include = [entry["indiv_id"] for entry in individ_ids_and_variants]
            csv_variants = [variant for entry in individ_ids_and_variants for variant in entry["variants"]]
        else:  # 'rare_variants'
            # Carriers in first-seen order; the set keeps the membership test cheap.
            individuals_to_include = []
            seen_indiv_ids = set()
            for variant in rare_variants:
                for indiv_id, genotype in variant.genotypes.items():
                    if genotype.num_alt > 0 and indiv_id not in seen_indiv_ids:
                        seen_indiv_ids.add(indiv_id)
                        individuals_to_include.append(indiv_id)
            csv_variants = rare_variants

        rows = [build_row(variant, individuals_to_include) for variant in csv_variants]

        header = ["gene", "chr", "pos", "ref", "alt", "rsID", "impact",
                  "HGVS.c", "HGVS.p", "sift", "polyphen", "muttaster", "fathmm", "clinvar_id", "clinvar_clinical_sig",
                  "freq_1kg_wgs_phase3", "freq_1kg_wgs_phase3_popmax",
                  "freq_exac_v3", "freq_exac_v3_popmax",
                  "all_genotypes"] + individuals_to_include

        writer = csv.writer(response)
        writer.writerow(header)
        for row in rows:
            writer.writerow(row)
        return response

    # HTML view: variants must be JSON-serialisable before rendering.
    for individ_id_and_variants in individ_ids_and_variants:
        variants = individ_id_and_variants["variants"]
        individ_id_and_variants["variants"] = [v.toJSON() for v in variants]

    return render(request, 'project/gene_quicklook.html', {
        'gene': gene,
        'gene_json': json.dumps(gene),
        'project': project,
        'rare_variants_json': json.dumps([v.toJSON() for v in rare_variants]),
        'individuals_json': json.dumps([i.get_json_obj() for i in project.get_individuals()]),
        'knockouts_json': json.dumps(individ_ids_and_variants),
    })
def search_for_genes(self, gene_or_variant_ids, project_id_list, output_filename, max_af=0.01, knockouts=False, in_clinvar_only=False, include_non_coding=False):
    """
    Search for genes or single variants across project(s) and write the
    matching variants to a tab-separated file.

    Args:
        gene_or_variant_ids (list): gene ids (resolved with
            get_gene_id_from_str) or variant ids shaped like 'chrom-pos' or
            'chrom-pos-ref-alt'.
        project_id_list (list): ids of the projects to search.
        output_filename (string): output file name
        max_af (float): gene-search variants whose largest value in
            annotation['freqs'] is >= max_af are skipped; not applied to
            variant-id lookups.
        knockouts (bool): for gene ids, report the variants of knockout
            individuals instead of all variants passing the default filter.
        in_clinvar_only (bool): keep only variants whose clinvar significance
            contains "path" (case-insensitive).
        include_non_coding (bool): clear the filter's so_annotations list.
    """
    projects = [Project.objects.get(project_id=project_id) for project_id in project_id_list]

    header = ["project_id", "gene", "chr", "pos", "ref", "alt", "rsID", "filter", "impact",
              "HGVS.c", "HGVS.p", "sift", "polyphen", "muttaster", "fathmm", "clinvar_id", "clinvar_clinical_sig",
              "freq_1kg_wgs_phase3", "freq_1kg_wgs_phase3_popmax",
              "freq_exac_v3", "freq_exac_v3_popmax",
              "gnomad-exomes", "gnomad-genomes", "families", "all_genotypes"]

    # The context manager guarantees the file is closed even if a query or
    # row build raises part-way through.
    with open(output_filename, 'w') as outfile:
        writer = csv.writer(outfile, delimiter='\t')
        writer.writerow(header)

        # all rare coding variants
        if not knockouts:
            variant_filter = get_default_variant_filter('all_coding', mall.get_annotator().reference_population_slugs)
            if include_non_coding:
                variant_filter.so_annotations = []
            print("All Filters: ")
            pprint(variant_filter.toJSON())

        print("Starting search for:\n%s\nin projects:\n%s\n" % (
            ", ".join(gene_or_variant_ids), ", ".join([p.project_id for p in projects])))

        for project in projects:
            project_id = project.project_id
            if get_project_datastore(project).project_collection_is_loaded(project):
                print("=====================")
                print("Searching project %s" % project_id)
            else:
                print("Skipping project %s - gene search is not enabled for this project" % project_id)
                continue

            # indiv_id -> Individual, or the string 'deleted'; reset per
            # project because the lookup is scoped to the project.
            indiv_cache = {}
            for gene_or_variant_id in gene_or_variant_ids:
                chrom_pos_match = re.match("([0-9XY]{1,2})-([0-9]{1,9})", gene_or_variant_id)
                chrom_pos_ref_alt_match = re.match("([0-9XY]{1,2})-([0-9]{1,9})-([ACTG]+)-([ACTG]+)", gene_or_variant_id)
                is_variant_lookup = bool(chrom_pos_match or chrom_pos_ref_alt_match)

                if is_variant_lookup:
                    chrom = chrom_pos_match.group(1)
                    pos = int(chrom_pos_match.group(2))
                    xpos = genomeloc.get_xpos(chrom, pos)
                    ref = alt = None
                    if chrom_pos_ref_alt_match:
                        ref = chrom_pos_ref_alt_match.group(3)
                        alt = chrom_pos_ref_alt_match.group(4)

                    variant = get_project_datastore(project).get_single_variant(project.project_id, None, xpos, ref, alt)
                    if variant is None:
                        continue
                    variants = [variant]
                    print("-- searching %s for variant %s-%s-%s: found %s" % (project_id, xpos, ref, alt, variant))

                    worst_annotation_idx = variant.annotation['worst_vep_annotation_index']
                    print(variant.annotation["vep_annotation"][worst_annotation_idx])
                    gene_id = variant.annotation["vep_annotation"][worst_annotation_idx]['gene_id']
                    gene = get_reference().get_gene(gene_id)
                else:
                    gene_id = get_gene_id_from_str(gene_or_variant_id, get_reference())
                    gene = get_reference().get_gene(gene_id)
                    print("-- searching %s for gene %s (%s)" % (project_id, gene["symbol"], gene_id))

                    if knockouts:
                        knockout_ids, variation = project_analysis.get_knockouts_in_gene(project, gene_id)
                        variants = variation.get_relevant_variants_for_indiv_ids(knockout_ids)
                    else:
                        variants = project_analysis.get_variants_in_gene(project, gene_id, variant_filter=variant_filter)

                for variant in variants:
                    if not is_variant_lookup and max(variant.annotation['freqs'].values()) >= max_af:
                        continue

                    add_extra_info_to_variants_project(get_reference(), project, [variant])

                    worst_annotation_idx = variant.annotation["worst_vep_index_per_gene"].get(gene_id)
                    if worst_annotation_idx is not None:
                        worst_annotation = variant.annotation["vep_annotation"][worst_annotation_idx]
                    else:
                        worst_annotation = None

                    all_genotypes_list = []
                    pass_filter = "N/A"
                    family_ids = set()
                    for indiv_id, genotype in variant.genotypes.items():
                        if indiv_id in indiv_cache:
                            individual = indiv_cache[indiv_id]
                            if individual == 'deleted':
                                continue
                        else:
                            try:
                                individual = Individual.objects.get(project=project, indiv_id=indiv_id)
                                indiv_cache[indiv_id] = individual
                            except ObjectDoesNotExist:
                                # this can happen when an individual is deleted from the
                                # project - from postgres, but not from mongo
                                indiv_cache[indiv_id] = 'deleted'
                                continue
                            except MultipleObjectsReturned:
                                # when several families have an individual with the same id
                                individuals = Individual.objects.filter(project=project, indiv_id=indiv_id)
                                individual = individuals[0]
                                indiv_cache[indiv_id] = individual

                        # filter value is stored in the genotypes even though
                        # it's the same for all individuals
                        pass_filter = genotype.filter
                        if genotype.num_alt > 0:
                            family_ids.add(individual.family.family_id)
                            all_genotypes_list.append("%s/%s%s[gt:%s GQ:%s AB:%0.3f]" % (
                                individual.family.family_id,
                                indiv_id,
                                "[Affected]" if individual.affected == "A" else ("[-]" if individual.affected == "N" else "[?]"),
                                ">".join(genotype.alleles),
                                genotype.gq,
                                genotype.ab if genotype.ab is not None else float('NaN')))

                    # Nothing to report when no retained individual carries the variant.
                    if not all_genotypes_list:
                        continue

                    measureset_id, clinvar_significance = get_reference().get_clinvar_info(*variant.unique_tuple())
                    if in_clinvar_only and (not clinvar_significance or "path" not in clinvar_significance.lower()):
                        continue

                    freqs = variant.annotation["freqs"]
                    row = [str(value) for value in [
                        project_id,
                        # The "gene" column holds the symbol (as in the log
                        # line above), not the repr of the whole gene record.
                        gene["symbol"] if gene else "",
                        variant.chr,
                        variant.pos,
                        variant.ref,
                        variant.alt,
                        variant.vcf_id or "",
                        pass_filter,
                        variant.annotation.get("vep_consequence", ""),
                        worst_annotation.get("hgvsc", "") if worst_annotation else "",
                        (worst_annotation.get("hgvsp", "") or "").replace("%3D", "=") if worst_annotation else "",
                        worst_annotation.get("sift", "") if worst_annotation else "",
                        worst_annotation.get("polyphen", "") if worst_annotation else "",
                        worst_annotation.get("mutationtaster_pred", "") if worst_annotation else "",
                        # `or ""` mirrors the hgvsp guard: the key may be present with a None value.
                        ";".join(set((worst_annotation.get("fathmm_pred", "") or "").split('%3B'))) if worst_annotation else "",
                        measureset_id,
                        clinvar_significance,
                        freqs.get("1kg_wgs_phase3", ""),
                        freqs.get("1kg_wgs_phase3_popmax", ""),
                        freqs.get("exac_v3", ""),
                        freqs.get("exac_v3_popmax", ""),
                        freqs.get("gnomad-exomes2", ""),
                        freqs.get("gnomad-genomes2", ""),
                        ", ".join(sorted(family_ids)),
                        ", ".join(all_genotypes_list),
                    ]]
                    writer.writerow(row)

    print("Wrote out %s" % output_filename)
def search_for_gene(self, search_gene_id, project_id_list, max_af=0.01):
    '''
    Search one gene across project(s) and write the matching variants to a
    tab-separated file named 'results_<search_gene_id>.tsv'.

    Args:
        search_gene_id: Gene identifier; resolved with get_gene_id_from_str.
        project_id_list: Project ids to search. When empty or None, every
            Project returned by Project.objects.all() is searched.
        max_af: Variants whose largest value in annotation['freqs'] is
            >= max_af are skipped.
    '''
    gene_id = get_gene_id_from_str(search_gene_id, get_reference())
    gene = get_reference().get_gene(gene_id)
    # `or []` keeps this log line from crashing when no project list is given.
    print("Staring gene search for: %s %s in projects: %s\n" % (search_gene_id, gene['gene_id'], ", ".join(project_id_list or [])))
    print("Max AF threshold: %s" % max_af)

    # all rare coding variants
    variant_filter = get_default_variant_filter('all_coding', mall.get_annotator().reference_population_slugs)
    print("All Filters: ")
    pprint(variant_filter.toJSON())

    output_filename = 'results_'+search_gene_id + '.tsv'
    header = ["project_id","gene", "chr", "pos", "ref", "alt", "rsID", "filter", "impact",
              "HGVS.c", "HGVS.p", "sift", "polyphen", "muttaster", "fathmm", "clinvar_id", "clinvar_clinical_sig",
              "freq_1kg_wgs_phase3", "freq_1kg_wgs_phase3_popmax",
              "freq_exac_v3", "freq_exac_v3_popmax",
              "all_genotypes"]

    # The context manager guarantees the file is closed even if a query or
    # row build raises part-way through.
    with open(output_filename, 'w') as outfile:
        writer = csv.writer(outfile, delimiter='\t')
        writer.writerow(header)

        if project_id_list:
            # Fail fast (IndexError) on an unknown project id before searching.
            for project_id in project_id_list:
                Project.objects.filter(project_id=project_id)[0]  # TODO validate
        else:
            project_id_list = [p.project_id for p in Project.objects.all()]

        for project_id in project_id_list:
            project = Project.objects.filter(project_id=project_id)[0]
            if not get_project_datastore(project_id).project_collection_is_loaded(project_id):
                print("Skipping project %s - gene search is not enabled for this project" % project_id)
                continue
            print("Running on project %s" % project_id)

            for variant in project_analysis.get_variants_in_gene(project, gene_id, variant_filter=variant_filter):
                if max(variant.annotation['freqs'].values()) >= max_af:
                    continue

                add_extra_info_to_variants_project(get_reference(), project, [variant])

                worst_annotation_idx = variant.annotation["worst_vep_index_per_gene"][gene_id]
                worst_annotation = variant.annotation["vep_annotation"][worst_annotation_idx]

                all_genotypes_list = []
                pass_filter = "N/A"
                for indiv_id, genotype in variant.genotypes.items():
                    # filter value is stored in the genotypes even though
                    # it's the same for all individuals
                    pass_filter = genotype.filter
                    if genotype.num_alt > 0:
                        all_genotypes_list.append("%s[gt:%s GQ:%s AB:%0.3f]" % (
                            indiv_id,
                            ">".join(genotype.alleles),
                            genotype.gq,
                            genotype.ab if genotype.ab is not None else float('NaN')))

                measureset_id, clinvar_significance = settings.CLINVAR_VARIANTS.get(variant.unique_tuple(), ("", ""))
                freqs = variant.annotation["freqs"]
                row = [str(value) for value in [
                    project_id,
                    gene["symbol"],
                    variant.chr,
                    variant.pos,
                    variant.ref,
                    variant.alt,
                    variant.vcf_id or "",
                    pass_filter,
                    variant.annotation.get("vep_consequence", ""),
                    worst_annotation.get("hgvsc", ""),
                    worst_annotation.get("hgvsp", "").replace("%3D", "="),
                    worst_annotation.get("sift", ""),
                    worst_annotation.get("polyphen", ""),
                    worst_annotation.get("mutationtaster_pred", ""),
                    ";".join(set(worst_annotation.get("fathmm_pred", "").split('%3B'))),
                    measureset_id,
                    clinvar_significance,
                    freqs.get("1kg_wgs_phase3", ""),
                    freqs.get("1kg_wgs_phase3_popmax", ""),
                    freqs.get("exac_v3", ""),
                    freqs.get("exac_v3_popmax", ""),
                    ", ".join(all_genotypes_list),
                ]]
                writer.writerow(row)

    print("Wrote out %s" % output_filename)
def search_for_genes(self, gene_ids, project_id_list, output_filename, max_af=0.01):
    """
    Search for genes across project(s) and write the matching variants to a
    tab-separated file.

    Args:
        gene_ids (list): gene ids; each is resolved with get_gene_id_from_str.
        project_id_list (list): (optional) project ids to narrow down the
            search; when empty, Project.objects.all() is searched.
        output_filename (string): output file name
        max_af (float): variants whose largest value in annotation['freqs']
            is >= max_af are skipped.
    """
    header = ["project_id", "gene", "chr", "pos", "ref", "alt", "rsID", "filter", "impact",
              "HGVS.c", "HGVS.p", "sift", "polyphen", "muttaster", "fathmm", "clinvar_id", "clinvar_clinical_sig",
              "freq_1kg_wgs_phase3", "freq_1kg_wgs_phase3_popmax",
              "freq_exac_v3", "freq_exac_v3_popmax",
              "all_genotypes"]

    # The context manager guarantees the file is closed even if a query or
    # row build raises part-way through.
    with open(output_filename, 'w') as outfile:
        writer = csv.writer(outfile, delimiter='\t')
        writer.writerow(header)

        # all rare coding variants
        variant_filter = get_default_variant_filter('all_coding', mall.get_annotator().reference_population_slugs)
        print("All Filters: ")
        pprint(variant_filter.toJSON())

        if project_id_list:
            projects = [Project.objects.get(project_id=project_id) for project_id in project_id_list]
        else:
            projects = Project.objects.all()

        print("Max AF threshold: %s" % max_af)
        print("Staring gene search for:\n%s\nin projects:\n%s\n" % (
            ", ".join(gene_ids), ", ".join([p.project_id for p in projects])))

        # (project_id, indiv_id) -> Individual. The project id is part of the
        # key because the cache outlives a single project and the lookup
        # below is scoped to the project.
        indiv_id_cache = {}
        for project in projects:
            project_id = project.project_id
            if get_project_datastore(project_id).project_collection_is_loaded(project_id):
                print("=====================")
                print("Searching project %s" % project_id)
            else:
                print("Skipping project %s - gene search is not enabled for this project" % project_id)
                continue

            for gene_id in gene_ids:
                gene_id = get_gene_id_from_str(gene_id, get_reference())
                gene = get_reference().get_gene(gene_id)
                print("-- searching %s for gene %s (%s)" % (project_id, gene["symbol"], gene_id))

                for variant in project_analysis.get_variants_in_gene(project, gene_id, variant_filter=variant_filter):
                    if max(variant.annotation['freqs'].values()) >= max_af:
                        continue

                    add_extra_info_to_variants_project(get_reference(), project, [variant])

                    worst_annotation_idx = variant.annotation["worst_vep_index_per_gene"][gene_id]
                    worst_annotation = variant.annotation["vep_annotation"][worst_annotation_idx]

                    all_genotypes_list = []
                    pass_filter = "N/A"
                    for indiv_id, genotype in variant.genotypes.items():
                        # The original looked up the literal string 'indiv_id',
                        # so the cache never hit and every genotype cost a query.
                        cache_key = (project_id, indiv_id)
                        individual = indiv_id_cache.get(cache_key)
                        if individual is None:
                            individual = Individual.objects.get(project=project, indiv_id=indiv_id)
                            indiv_id_cache[cache_key] = individual

                        # filter value is stored in the genotypes even though
                        # it's the same for all individuals
                        pass_filter = genotype.filter
                        if genotype.num_alt > 0:
                            all_genotypes_list.append("%s%s[gt:%s GQ:%s AB:%0.3f]" % (
                                indiv_id,
                                "[Affected]" if individual.affected == "A" else ("[-]" if individual.affected == "N" else "[?]"),
                                ">".join(genotype.alleles),
                                genotype.gq,
                                genotype.ab if genotype.ab is not None else float('NaN')))

                    measureset_id, clinvar_significance = get_clinvar_variants().get(variant.unique_tuple(), ("", ""))
                    freqs = variant.annotation["freqs"]
                    row = [str(value) for value in [
                        project_id,
                        gene["symbol"],
                        variant.chr,
                        variant.pos,
                        variant.ref,
                        variant.alt,
                        variant.vcf_id or "",
                        pass_filter,
                        variant.annotation.get("vep_consequence", ""),
                        worst_annotation.get("hgvsc", ""),
                        worst_annotation.get("hgvsp", "").replace("%3D", "="),
                        worst_annotation.get("sift", ""),
                        worst_annotation.get("polyphen", ""),
                        worst_annotation.get("mutationtaster_pred", ""),
                        ";".join(set(worst_annotation.get("fathmm_pred", "").split('%3B'))),
                        measureset_id,
                        clinvar_significance,
                        freqs.get("1kg_wgs_phase3", ""),
                        freqs.get("1kg_wgs_phase3_popmax", ""),
                        freqs.get("exac_v3", ""),
                        freqs.get("exac_v3_popmax", ""),
                        ", ".join(all_genotypes_list),
                    ]]
                    writer.writerow(row)

    print("Wrote out %s" % output_filename)