def handle(self, *args, **options):
    """Delete the mongodb data for each project id passed as a positional argument.

    For every id, the matching Project is looked up and its data is removed
    from the variant store of its mall and then from its project datastore.

    Args:
        *args: project ids whose data should be deleted.
        **options: accepted but not read here.
    """
    for project_id in args:
        print("Deleting data from mongodb for project: " + project_id)
        project = Project.objects.get(project_id=project_id)

        # Variant store first, project datastore second.
        variant_store = get_mall(project).variant_store
        variant_store.delete_project(project_id)

        project_store = get_project_datastore(project)
        project_store.delete_project_store(project_id)

        print("Done")
def handle(self, *args, **options):
    """Wipe the stored mongodb data of the projects named on the command line.

    Each positional argument is treated as a project id: the Project is
    fetched, then its entries are deleted from the mall's variant store and
    from the project datastore.

    Args:
        *args: project ids to process.
        **options: accepted but not read here.
    """
    for project_id in args:
        print("Deleting data from mongodb for project: " + project_id)
        matching_project = Project.objects.get(project_id=project_id)

        project_mall = get_mall(matching_project)
        project_mall.variant_store.delete_project(project_id)

        get_project_datastore(matching_project).delete_project_store(project_id)
        print("Done")
def delete_project(project_id, delete_data=False):
    """
    Remove a project together with its individuals and families.

    Args:
        project_id: id of the Project to delete.
        delete_data: when true, the project's entries in the project datastore
            and in the mall's variant store are deleted before the database rows.
    """
    target = Project.objects.get(project_id=project_id)

    if delete_data:
        project_store = get_project_datastore(target)
        project_store.delete_project_store(project_id)

        variant_store = get_mall(target).variant_store
        variant_store.delete_project(project_id)

    # Related rows are removed first, the project row itself last.
    individuals = target.individual_set.all()
    individuals.delete()
    families = target.family_set.all()
    families.delete()
    target.delete()
def delete_project(project_id):
    """
    Delete a project along with its datastore contents, individuals and families.

    Progress is reported on stdout before and after the deletion.

    Args:
        project_id: id of the Project to delete.
    """
    print("Deleting %s" % project_id)
    doomed_project = Project.objects.get(project_id=project_id)

    # Stored variant data goes first ...
    project_store = get_project_datastore(project_id)
    project_store.delete_project_store(project_id)
    variant_store = get_mall(project_id).variant_store
    variant_store.delete_project(project_id)

    # ... followed by the related rows and finally the project itself.
    individuals = doomed_project.individual_set.all()
    individuals.delete()
    families = doomed_project.family_set.all()
    families.delete()
    doomed_project.delete()

    print("Successfully deleted %s" % project_id)
def load_project_datastore(project_id, vcf_files=None, start_from_chrom=None, end_with_chrom=None):
    """
    Load this project into the project datastore
    Which allows queries over all variants in a project

    The project's existing store is deleted and re-created, every selected VCF
    file is loaded (in order of file path), and the project collection is then
    marked as loaded.

    Args:
        project_id: id of the Project to load.
        vcf_files: optional collection of VCF file paths. When given, VCF files
            whose path is not in it are skipped.
        start_from_chrom: passed through to add_variants_to_project_from_vcf.
        end_with_chrom: passed through to add_variants_to_project_from_vcf.
    """
    # Only the timestamp goes through strftime so that a '%' in project_id
    # cannot be interpreted as a format directive.
    timestamp_format = "%m/%d/%Y %H:%M:%S"
    print(datetime.now().strftime(timestamp_format) + " -- starting load_project_datastore: " + project_id)

    project = Project.objects.get(project_id=project_id)
    get_project_datastore(project_id).delete_project_store(project_id)
    get_project_datastore(project_id).add_project(project_id)

    for vcf_file in sorted(project.get_all_vcf_files(), key=lambda v: v.path()):
        vcf_file_path = vcf_file.path()
        if vcf_files is not None and vcf_file_path not in vcf_files:
            print("Skipping - %(vcf_file_path)s is not in %(vcf_files)s" % locals())
            # Actually skip the file: previously the message was printed but
            # the file was loaded anyway.
            continue

        # Only individuals that are both in the project and in this VCF are loaded.
        project_indiv_ids = [i.indiv_id for i in project.get_individuals()]
        vcf_ids = vcf_file.sample_id_list()
        indiv_id_list = [i for i in project_indiv_ids if i in vcf_ids]

        get_project_datastore(project_id).add_variants_to_project_from_vcf(
            vcf_file.file_handle(),
            project_id,
            indiv_id_list=indiv_id_list,
            start_from_chrom=start_from_chrom,
            end_with_chrom=end_with_chrom,
        )

    get_project_datastore(project_id).set_project_collection_to_loaded(project_id)
    print(datetime.now().strftime(timestamp_format) + " -- load_project_datastore: " + project_id + " is done!")
def project_home(request, project_id):
    """Render ``project.html`` for a project the requesting user may view.

    Users without view permission receive a plain "unauthorized" response.
    Otherwise the project is marked as accessed and the page is rendered with
    the user's authorization level and whether gene search is available.

    Args:
        request: the incoming request; ``request.user`` is used for all checks.
        project_id: id of the Project to show (404 when it does not exist).
    """
    project = get_object_or_404(Project, project_id=project_id)
    viewer = request.user

    if not project.can_view(viewer):
        return HttpResponse("unauthorized")

    project.set_accessed()

    # Checked in order; the first check that passes determines the level.
    role_checks = (
        ("admin", lambda: project.can_admin(viewer)),
        ("editor", lambda: project.can_edit(viewer)),
        ("public", lambda: project.is_public),
        ("viewer", lambda: project.can_view(viewer)),
    )
    for auth_level, is_granted in role_checks:
        if is_granted():
            break
    else:
        raise Exception("Authx - how did we get here?!?")

    context = {"project": project, "auth_level": auth_level}
    context["can_edit"] = project.can_edit(viewer)
    context["is_manager"] = project.can_admin(viewer)
    project_store = get_project_datastore(project_id)
    context["has_gene_search"] = project_store.project_collection_is_loaded(project_id)

    return render(request, "project.html", context)
def project_home(request, project_id):
    """Render ``project.html`` for a project the requesting user may view.

    Args:
        request: the incoming request; ``request.user`` is used for all checks.
        project_id: id of the Project to show (404 when it does not exist).

    Raises:
        PermissionDenied: if the user may not view the project.
    """
    project = get_object_or_404(Project, project_id=project_id)
    user = request.user
    if not project.can_view(user):
        raise PermissionDenied

    project.set_accessed()

    # The first matching check wins.
    if project.can_admin(user):
        auth_level = 'admin'
    elif project.can_edit(user):
        auth_level = 'editor'
    elif project.is_public:
        auth_level = 'public'
    elif project.can_view(user):
        auth_level = 'viewer'
    else:
        raise Exception("Authx - how did we get here?!?")

    # PhenoTips is on unless the project is listed in PROJECTS_WITHOUT_PHENOTIPS.
    projects_without_phenotips = settings.PROJECTS_WITHOUT_PHENOTIPS
    phenotips_supported = (
        projects_without_phenotips is None
        or project_id not in projects_without_phenotips
    )

    context = {
        'phenotips_supported': phenotips_supported,
        'project': project,
        'auth_level': auth_level,
    }
    context['can_edit'] = project.can_edit(user)
    context['is_manager'] = project.can_admin(user)
    context['has_gene_search'] = get_project_datastore(project).project_collection_is_loaded(project)

    # Link to the new project page only when a seqr project is attached.
    if project.seqr_project:
        context['new_page_url'] = '/project/{}/project_page'.format(project.seqr_project.guid)
    else:
        context['new_page_url'] = None

    return render(request, 'project.html', context)
def project_home(request, project_id):
    """Render ``project.html`` for a project the requesting user may view.

    Args:
        request: the incoming request; ``request.user`` is used for all checks.
        project_id: id of the Project to show (404 when it does not exist).

    Raises:
        PermissionDenied: if the user may not view the project.
    """
    project = get_object_or_404(Project, project_id=project_id)
    current_user = request.user
    if not project.can_view(current_user):
        raise PermissionDenied
    project.set_accessed()

    # Checked in order; the first check that passes determines the level.
    level_checks = (
        ('admin', lambda: project.can_admin(current_user)),
        ('editor', lambda: project.can_edit(current_user)),
        ('public', lambda: project.is_public),
        ('viewer', lambda: project.can_view(current_user)),
    )
    for auth_level, check in level_checks:
        if check():
            break
    else:
        raise Exception("Authx - how did we get here?!?")

    # PhenoTips is on unless the project is listed in PROJECTS_WITHOUT_PHENOTIPS.
    excluded_projects = settings.PROJECTS_WITHOUT_PHENOTIPS
    phenotips_supported = excluded_projects is None or project_id not in excluded_projects

    can_edit = project.can_edit(current_user)
    is_manager = project.can_admin(current_user)
    has_gene_search = get_project_datastore(project_id).project_collection_is_loaded(project_id)

    return render(request, 'project.html', {
        'phenotips_supported': phenotips_supported,
        'project': project,
        'auth_level': auth_level,
        'can_edit': can_edit,
        'is_manager': is_manager,
        'has_gene_search': has_gene_search,
    })
def handle(self, *args, **options):
    """load CADD scores for all variants in a project, or all variants in the annotator_store.

    Behaviour depends on ``options``:
      * ``cadd_file`` set: scores are loaded from that file through
        ``load_from_cadd_file`` and the command finishes.
      * ``project_id`` set: variants in that project's collection that have no
        ``annotation.cadd_phred`` are looked up and annotated.
      * neither set: variants in the annotator store that have no
        ``annotation.cadd_phred`` are looked up and annotated.

    Fetched scores are always written to the annotator store's ``variants``
    collection.

    Raises:
        AssertionError: if an update did not modify an existing record.
    """
    annotator_store = mall.get_annotator().get_annotator_datastore()
    missing_cadd_query = {'annotation.cadd_phred': {'$exists': False}}

    if options['cadd_file']:
        print("Loading " + options['cadd_file'])
        load_from_cadd_file(options['cadd_file'])
        # No variant cursor is built on this path, so the per-variant loop
        # below must not run; it used to fail with UnboundLocalError on
        # `variant_collection`.
        print("Done")
        return

    if options['project_id']:
        print("Loading " + options['project_id'])
        project = Project.objects.get(project_id=options['project_id'])
        project_collection = get_project_datastore(project)._get_project_collection(options['project_id'])
        variant_collection = project_collection.find(missing_cadd_query)
    else:
        variant_collection = annotator_store.variants.find(missing_cadd_query)

    for r in tqdm.tqdm(variant_collection, unit=' variants'):
        chrom, pos = genomeloc.get_chr_pos(r['xpos'])
        cadd_phred = fetch(chrom, pos, r['ref'], r['alt'])
        if cadd_phred is None:
            # No score available for this variant; leave the record untouched.
            continue

        result = annotator_store.variants.update(
            {'xpos': r['xpos'], 'ref': r['ref'], 'alt': r['alt']},
            {'$set': {'annotation.cadd_phred': cadd_phred}},
            upsert=False)
        assert result['updatedExisting']

    print("Done")
def get_variants_in_gene(project, gene_id, variant_filter=None, quality_filter=None):
    """
    Return the project's variants in a gene, narrowed down by variant_filter.

    Args:
        project: project whose ``project_id`` selects the variants.
        gene_id: gene to look up.
        variant_filter: filter handed both to the datastore query and to the
            per-gene filtering step.
        quality_filter: accepted but not used by this function.
    """
    project_store = get_project_datastore()
    candidates = project_store.get_variants_in_gene(
        project.project_id,
        gene_id,
        variant_filter=variant_filter,
    )
    return search_utils.filter_gene_variants_by_variant_filter(candidates, gene_id, variant_filter)
def mendelian_variant_search(request, project_id, family_id):
    """Render the mendelian variant search page for one family.

    The "analysis unavailable" page is shown instead when the family has no
    variant data, or when the project still needs phenotypes and the user is
    not staff.

    Args:
        request: the incoming request.
        project_id: id of the Project (404 when it does not exist).
        family_id: id of the Family within the project (404 when missing).

    Raises:
        PermissionDenied: if the user may not view the project.
    """
    project = get_object_or_404(Project, project_id=project_id)
    family = get_object_or_404(Family, project=project, family_id=family_id)
    if not project.can_view(request.user):
        raise PermissionDenied

    # Work out whether the search has to be refused, and why.
    unavailable_reason = None
    if not family.has_data('variation'):
        unavailable_reason = 'This family does not have any variant data.'
    elif project.project_status == Project.NEEDS_MORE_PHENOTYPES and not request.user.is_staff:
        unavailable_reason = 'Awaiting phenotype data.'
    if unavailable_reason is not None:
        return render(request, 'analysis_unavailable.html', {'reason': unavailable_reason})

    has_gene_search = get_project_datastore(project_id).project_collection_is_loaded(project_id)

    gene_lists = []
    for project_gene_list in ProjectGeneList.objects.filter(project=project):
        gene_lists.append(project_gene_list.gene_list.toJSON(details=True))

    # The message is filled in from the local names project_id, family_id and has_gene_search.
    sys.stderr.write("returning mendelian_variant_search page for %(project_id)s %(family_id)s. has_gene_search = %(has_gene_search)s\n " % locals())

    context = {
        'gene_lists': json.dumps(gene_lists),
        'project': project,
        'family': family,
    }
    genotype_filters = x_inheritance.get_genotype_filters(family.xfamily())
    context['family_genotype_filters_json'] = json.dumps(genotype_filters)
    # Gene search is also offered when an elasticsearch dataset exists for the project.
    context['has_gene_search'] = has_gene_search or get_elasticsearch_dataset(project_id) is not None

    return render(request, 'mendelian_variant_search.html', context)
def project_home(request, project_id):
    """Render ``project.html`` for a project the requesting user may view.

    Users without view permission receive a plain 'unauthorized' response.

    Args:
        request: the incoming request; ``request.user`` is used for all checks.
        project_id: id of the Project to show (404 when it does not exist).
    """
    project = get_object_or_404(Project, project_id=project_id)
    requester = request.user

    if not project.can_view(requester):
        return HttpResponse('unauthorized')

    project.set_accessed()

    # The first matching check wins.
    if project.can_admin(requester):
        auth_level = 'admin'
    elif project.can_edit(requester):
        auth_level = 'editor'
    elif project.is_public:
        auth_level = 'public'
    elif project.can_view(requester):
        auth_level = 'viewer'
    else:
        raise Exception("Authx - how did we get here?!?")

    can_edit = project.can_edit(requester)
    is_manager = project.can_admin(requester)
    gene_search_loaded = get_project_datastore(project_id).project_collection_is_loaded(project_id)

    template_values = {
        'project': project,
        'auth_level': auth_level,
        'can_edit': can_edit,
        'is_manager': is_manager,
        'has_gene_search': gene_search_loaded,
    }
    return render(request, 'project.html', template_values)
def get_knockouts_in_gene(project, gene_id, quality_filter=None):
    """
    Find the individuals of a project with recessive inheritance in a gene.

    Variants are selected with the default 'moderate_impact' variant filter
    built for the annotator's reference populations.

    Args:
        project: project whose individuals and variants are examined.
        gene_id: gene to look at.
        quality_filter: accepted but not used; an empty quality filter is
            always passed on.

    Returns:
        tuple: (knockouts, variation), where variation is the
        CohortGeneVariation built for the gene.
    """
    indiv_id_list = []
    for individual in project.get_individuals():
        indiv_id_list.append(individual.indiv_id)

    # filter out variants > 0.01 AF in any of the reference populations
    population_slugs = mall.get_annotator().reference_population_slugs
    variant_filter = get_default_variant_filter('moderate_impact', population_slugs)

    project_store = get_project_datastore(project.project_id)
    gene_variants = project_store.get_project_variants_in_gene(
        project.project_id,
        gene_id,
        variant_filter=variant_filter,
    )
    gene_variants = search_utils.filter_gene_variants_by_variant_filter(gene_variants, gene_id, variant_filter)

    variation = CohortGeneVariation(
        get_reference(),
        gene_id,
        gene_variants,
        indiv_id_list,
        quality_filter={},
    )
    knockouts = get_individuals_with_inheritance('recessive', variation, indiv_id_list)
    return knockouts, variation
def get_variants_in_gene(project, gene_id, variant_filter=None, quality_filter=None):
    """
    Return the project's variants in a gene, narrowed down by variant_filter.

    Args:
        project: project to query; it selects the datastore and its
            ``project_id`` selects the variants.
        gene_id: gene to look up.
        variant_filter: filter handed both to the datastore query and to the
            per-gene filtering step.
        quality_filter: accepted but not used by this function.
    """
    project_store = get_project_datastore(project)
    gene_variants = project_store.get_project_variants_in_gene(
        project.project_id,
        gene_id,
        variant_filter=variant_filter,
    )
    return search_utils.filter_gene_variants_by_variant_filter(gene_variants, gene_id, variant_filter)
def transfer_project(self, from_project_id, destination_project_id):
    """Move the loaded datastore records of one project over to another project.

    Steps, in order:
      1. Verify that every individual of the 'from' project also exists in the
         destination project.
      2. Add the VCF files of the 'from' project to the destination project.
      3. Check that the 'from' project has exactly one record in the projects
         db and at least one in the families db.
      4. Rename any existing destination records to ``<id>_previous`` (stamped
         with today's date), then re-label the 'from' records with the
         destination project id.
      5. Re-check the destination records and update the family analysis status.
      6. Interactively ask whether the 'from' project should be deleted.

    Args:
        from_project_id: id of the project whose data is transferred.
        destination_project_id: id of the project that receives the data.

    Raises:
        Exception: if individuals of the 'from' project are missing from the
            destination project.
        ValueError: if one of the before/after existence checks fails.
    """
    print("From: " + from_project_id)
    print("To: " + destination_project_id)

    from_project = Project.objects.get(project_id=from_project_id)
    destination_project = Project.objects.get(project_id=destination_project_id)

    # Make sure individuals are the same
    # (more precisely: no 'from' individual may be absent from the destination)
    indivs_missing_from_dest_project = (set(
        [i.indiv_id for i in Individual.objects.filter(project=from_project)]) - set(
        [i.indiv_id for i in Individual.objects.filter(project=destination_project)]))
    if indivs_missing_from_dest_project:
        raise Exception("Individuals missing from dest project: " + str(indivs_missing_from_dest_project))

    # update VCFs
    vcfs = from_project.families_by_vcf().keys()
    for vcf_file_path in vcfs:
        vcf_file = VCFFile.objects.get_or_create(file_path=os.path.abspath(vcf_file_path))[0]
        sample_management.add_vcf_file_to_project(destination_project, vcf_file)
        print("Added %s to project %s" % (vcf_file, destination_project.project_id))

    # Handles to the underlying databases of the family and project datastores
    families_db = get_datastore()._db
    projects_db = get_project_datastore()._db

    print("==========")
    print("Checking 'from' Projects and Families:")
    if not check_that_exists(projects_db.projects, {'project_id': from_project_id}, not_more_than_one=True):
        raise ValueError("There needs to be 1 project db in %(from_project_id)s" % locals())
    if not check_that_exists(families_db.families, {'project_id': from_project_id}, not_more_than_one=False):
        raise ValueError("There needs to be atleast 1 family db in %(from_project_id)s" % locals())

    print("==========")
    print("Make Updates:")
    datestamp = datetime.now().strftime("%Y-%m-%d")
    # Existing destination records are kept, re-labelled as "<id>_previous".
    # NOTE: `result` is assigned throughout this section but never read.
    if check_that_exists(projects_db.projects, {'project_id': destination_project_id}, not_more_than_one=True):
        result = update(projects_db.projects, {'project_id': destination_project_id}, {'project_id': destination_project_id+'_previous', 'version': datestamp})
    if check_that_exists(families_db.families, {'project_id': destination_project_id}, not_more_than_one=False):
        result = update(families_db.families, {'project_id': destination_project_id}, {'project_id': destination_project_id+'_previous', 'version': datestamp})

    # Re-label the 'from' records so that they now belong to the destination project.
    result = update(projects_db.projects, {'project_id': from_project_id}, {'project_id': destination_project_id, 'version': '2'})
    result = update(families_db.families, {'project_id': from_project_id}, {'project_id': destination_project_id, 'version': '2'})

    print("==========")
    print("Checking Projects:")
    if not check_that_exists(projects_db.projects, {'project_id': destination_project_id}, not_more_than_one=True):
        raise ValueError("After: There needs to be 1 project db in %(destination_project_id)s" % locals())
    if not check_that_exists(families_db.families, {'project_id': destination_project_id}, not_more_than_one=False):
        raise ValueError("After: There needs to be atleast 1 family db in %(destination_project_id)s" % locals())

    update_family_analysis_status(destination_project_id)

    print("Data transfer finished.")

    # Only an explicit 'Y' deletes the 'from' project; any other answer keeps it.
    i = raw_input("Delete the 'from' project: %s? [Y/n] " % from_project_id)
    if i.strip() == 'Y':
        sample_management.delete_project(from_project_id)
        print("Project %s deleted" % from_project_id)
    else:
        print("Project not deleted")
def handle(self, *args, **options):
    """Attach CNV calls from a tab-separated file to the stored variants of a project.

    Reads ``options['project_id']``, ``options['cnv_filename']`` and
    ``options['bed_files_directory']``. The first line of the CNV file names
    the columns; every following row must provide ``chr``, ``start``, ``end``
    and ``sample``. For each row whose sample matches an individual of the
    project, the individual's ``cnv_bed_file`` path is brought up to date and
    the row is stored under ``genotypes.<indiv_id>.extras.cnvs`` on every
    variant between the row's start and end positions, in both the project
    collection and the individual's family collection (whichever exist).

    Raises:
        ValueError: if the CNV file does not exist.
    """
    project_id = options['project_id']
    print("Loading data into project: " + project_id)
    project = Project.objects.get(project_id = project_id)

    cnv_filename = options['cnv_filename']
    bed_files_directory = options['bed_files_directory']

    if not os.path.isfile(cnv_filename):
        raise ValueError("CNV file %s doesn't exist" % options['cnv_filename'])

    with open(cnv_filename) as f:
        # The header line supplies the keys for every row dict
        header_fields = f.readline().rstrip('\n').split('\t')
        for line in f:
            fields = line.rstrip('\n').split('\t')
            row_dict = dict(zip(header_fields, fields))

            chrom = "chr"+row_dict['chr']
            start = int(row_dict['start'])
            end = int(row_dict['end'])
            #left_overhang = int(row_dict['left_overhang_start'])
            #right_overhang = int(row_dict['right_overhang_end'])

            sample_id = row_dict['sample']
            try:
                # Case-insensitive prefix match of the sample id against indiv_id
                i = Individual.objects.get(project=project, indiv_id__istartswith=sample_id)
            except Exception as e:
                # Any lookup failure only skips this row
                print("WARNING: %s: %s not found in %s" % (e, sample_id, project))
                continue

            bed_file_path = os.path.join(bed_files_directory, "%s.bed" % sample_id)
            if not os.path.isfile(bed_file_path):
                # Only a warning: the path is still recorded below
                print("WARNING: .bed file not found: " + bed_file_path)

            if i.cnv_bed_file != bed_file_path:
                print("Setting cnv_bed_file path to %s" % bed_file_path)
                i.cnv_bed_file = bed_file_path
                i.save()

            project_collection = get_project_datastore(project)._get_project_collection(project_id)
            family_collection = get_mall(project).variant_store._get_family_collection(project_id, i.family.family_id)

            # filter(None, ...) drops collections that are falsy (e.g. None)
            for collection in filter(None, [project_collection, family_collection]):
                # Tag every variant whose xpos lies within [start, end] of this CNV
                collection.update_many(
                    {'$and': [
                        {'xpos': {'$gte': genomeloc.get_single_location(chrom, start)} },
                        {'xpos': {'$lte': genomeloc.get_single_location(chrom, end)}}
                    ]},
                    {'$set': {'genotypes.%s.extras.cnvs' % i.indiv_id: row_dict}})

                #result = list(collection.find({'$and' : [
                #    {'xpos': {'$gte': genomeloc.get_single_location(chrom, start)}},
                #    {'xpos' :{'$lte': genomeloc.get_single_location(chrom, end)}}]},
                #    {'genotypes.%s.extras.cnvs' % i.indiv_id :1 }))
                #print(chrom, start, end, len(result), result[0] if result else None)

    print("Done")
def load_project_datastore(project_id, vcf_files=None, start_from_chrom=None, end_with_chrom=None):
    """
    Load this project into the project datastore
    Which allows queries over all variants in a project

    A 'load_project_datastore_started' event is recorded, the project's
    existing store is deleted and re-created, every selected VCF file is
    loaded (in order of file path), the project collection is marked as
    loaded, and a 'load_project_datastore_finished' event is recorded.

    Args:
        project_id: id of the Project to load.
        vcf_files: optional collection of VCF file paths. When given, VCF files
            whose path is not in it are skipped.
        start_from_chrom: passed through to add_variants_to_project_from_vcf;
            also mentioned in the start message when set.
        end_with_chrom: passed through to add_variants_to_project_from_vcf.
    """
    timestamp_format = "%m/%d/%Y %H:%M:%S"

    # The conditional has to be evaluated on its own. Unparenthesised it
    # applied to the whole concatenated string, so the start message was
    # empty whenever start_from_chrom was not given.
    chrom_note = (" from chrom: " + start_from_chrom) if start_from_chrom else ""
    print(datetime.now().strftime(timestamp_format)
          + " -- starting load_project_datastore: " + project_id + chrom_note)

    settings.EVENTS_COLLECTION.insert({
        'event_type': 'load_project_datastore_started',
        'date': timezone.now(),
        'project_id': project_id
    })

    project = Project.objects.get(project_id=project_id)
    get_project_datastore(project_id).delete_project_store(project_id)
    get_project_datastore(project_id).add_project(project_id)

    for vcf_file in sorted(project.get_all_vcf_files(), key=lambda v: v.path()):
        vcf_file_path = vcf_file.path()
        if vcf_files is not None and vcf_file_path not in vcf_files:
            print("Skipping - %(vcf_file_path)s is not in %(vcf_files)s" % locals())
            # Actually skip the file: previously the message was printed but
            # the file was loaded anyway.
            continue

        # Only individuals that are both in the project and in this VCF are loaded.
        project_indiv_ids = [i.indiv_id for i in project.get_individuals()]
        vcf_ids = vcf_file.sample_id_list()
        indiv_id_list = [i for i in project_indiv_ids if i in vcf_ids]

        get_project_datastore(project_id).add_variants_to_project_from_vcf(
            vcf_file.file_handle(),
            project_id,
            indiv_id_list=indiv_id_list,
            start_from_chrom=start_from_chrom,
            end_with_chrom=end_with_chrom)

    get_project_datastore(project_id).set_project_collection_to_loaded(project_id)

    print(datetime.now().strftime(timestamp_format)
          + " -- load_project_datastore: " + project_id + " is done!")

    settings.EVENTS_COLLECTION.insert({
        'event_type': 'load_project_datastore_finished',
        'date': timezone.now(),
        'project_id': project_id
    })
def project_home(request, project_id):
    """Render ``project.html`` for a project the requesting user may view.

    Args:
        request: the incoming request; ``request.user`` is used for all checks.
        project_id: id of the Project to show (404 when it does not exist).

    Raises:
        PermissionDenied: if the user may not view the project.
    """
    project = get_object_or_404(Project, project_id=project_id)
    user = request.user
    if not project.can_view(user):
        raise PermissionDenied

    project.set_accessed()

    # Checked in order; the first check that passes determines the level.
    ordered_checks = (
        ('admin', lambda: project.can_admin(user)),
        ('editor', lambda: project.can_edit(user)),
        ('public', lambda: project.is_public),
        ('viewer', lambda: project.can_view(user)),
    )
    for auth_level, passes in ordered_checks:
        if passes():
            break
    else:
        raise Exception("Authx - how did we get here?!?")

    # PhenoTips is on unless the project is listed in PROJECTS_WITHOUT_PHENOTIPS.
    opted_out = settings.PROJECTS_WITHOUT_PHENOTIPS
    phenotips_supported = opted_out is None or project_id not in opted_out

    context = {
        'phenotips_supported': phenotips_supported,
        'project': project,
        'auth_level': auth_level,
    }
    context['can_edit'] = project.can_edit(user)
    context['is_manager'] = project.can_admin(user)
    project_store = get_project_datastore(project_id)
    context['has_gene_search'] = project_store.project_collection_is_loaded(project_id)

    return render(request, 'project.html', context)
def _has_gene_search(project):
    """
    Report whether the project's collection is loaded in the project datastore,
    which is what enables Gene Search.

    DEPRECATED - will be removed along with mongodb.

    Args:
        project (object): django project; its ``deprecated_project_id`` is
            used for the lookup.
    """
    legacy_project_id = project.deprecated_project_id
    project_store = get_project_datastore(legacy_project_id)
    return project_store.project_collection_is_loaded(legacy_project_id)
def load_project_datastore(project_id, vcf_files=None):
    """
    Load this project into the project datastore
    Which allows queries over all variants in a project

    The project's existing store is deleted and re-created, then every
    selected VCF file of the project is loaded into it.

    Args:
        project_id: id of the Project to load.
        vcf_files: optional collection of VCF file paths. When given, VCF files
            whose path is not in it are skipped.
    """
    # Only the timestamp goes through strftime so that a '%' in project_id
    # cannot be interpreted as a format directive.
    timestamp_format = "%m/%d/%Y %H:%M:%S"
    print(datetime.now().strftime(timestamp_format) + " -- starting load_project_datastore: " + project_id)

    project = Project.objects.get(project_id=project_id)
    get_project_datastore(project_id).delete_project_store(project_id)
    get_project_datastore(project_id).add_project(project_id)

    for vcf_file in project.get_all_vcf_files():
        vcf_file_path = vcf_file.path()
        if vcf_files is not None and vcf_file_path not in vcf_files:
            print("Skipping - %(vcf_file_path)s is not in %(vcf_files)s" % locals())
            # Actually skip the file: previously the message was printed but
            # the file was loaded anyway.
            continue

        # Only individuals that are both in the project and in this VCF are loaded.
        project_indiv_ids = [i.indiv_id for i in project.get_individuals()]
        vcf_ids = vcf_file.sample_id_list()
        indiv_id_list = [i for i in project_indiv_ids if i in vcf_ids]

        get_project_datastore(project_id).add_variants_to_project_from_vcf(
            vcf_file.file_handle(),
            project_id,
            indiv_id_list=indiv_id_list)

    print(datetime.now().strftime(timestamp_format) + " -- load_project_datastore: " + project_id + " is done!")
def update_pop_freqs_in_project_tables(self):
    """Rewrite the ``db_freqs.<population>`` fields of every variant in every project collection.

    Progress is tracked in the sqlite file
    ``reference_populations_project_tables.db`` (table ``all_projects`` with
    ``started`` / ``finished`` flags), so projects already marked as started
    are not picked up again. "myoseq_v11" is queued first; all other projects
    follow in random order.
    """
    # Load project tables
    population_frequency_store = mall.get_annotator().get_population_frequency_store()

    db = sqlite3.connect("reference_populations_project_tables.db", isolation_level=None)
    db.execute("CREATE TABLE if not exists all_projects(project_id varchar(200), started bool, finished bool)")
    db.execute("CREATE UNIQUE INDEX IF NOT EXISTS all_projects_idx ON all_projects(project_id)")

    import random
    other_project_ids = []
    for p in Project.objects.all():
        if p.project_id != "myoseq_v11":
            other_project_ids.append(p.project_id)
    random.shuffle(other_project_ids)

    # INSERT OR IGNORE keeps the flags of projects that are already registered.
    for project_id in ["myoseq_v11"] + other_project_ids:
        db.execute("INSERT OR IGNORE INTO all_projects VALUES (?, 0, 0)", (project_id,))

    # Go through each project and update the variant records
    population_slugs_to_load = []
    for population_spec in annotator_settings.reference_populations:
        population_slugs_to_load.append(population_spec["slug"])

    while True:
        remaining_work = list(db.execute("SELECT project_id FROM all_projects WHERE started=0"))
        print("%d projects remaining" % len(remaining_work))
        if not remaining_work:
            print("Done with all projects")
            break

        # Take the first project that has not been started yet.
        project_id = remaining_work[0][0]
        project_store = get_project_datastore(project_id)
        print(" updating %s " % project_id)
        db.execute("UPDATE all_projects SET started=1 WHERE project_id=?", (project_id,))

        project_collection = project_store._get_project_collection(project_id)
        for variant_dict in project_collection.find():
            xpos = variant_dict["xpos"]
            ref = variant_dict["ref"]
            alt = variant_dict["alt"]
            freqs = population_frequency_store.get_frequencies(xpos, ref, alt)

            # Populations without a stored frequency are written as 0.
            full_freqs = {}
            for population_slug in population_slugs_to_load:
                full_freqs["db_freqs." + population_slug] = freqs.get(population_slug, 0)

            project_collection.update(
                {"xpos": variant_dict["xpos"], "ref": variant_dict["ref"], "alt": variant_dict["alt"]},
                {"$set": full_freqs},
                upsert=False,
            )

        print(" ---> done updating project_id: %s" % project_id)
        db.execute("UPDATE all_projects SET finished=1 WHERE project_id=?", (project_id,))
def _has_gene_search(project):
    """
    Report whether Gene Search is available for this project.

    That is the case when the matching base project has an elasticsearch index
    or its collection is loaded in the project datastore. False is returned
    when no base project is linked to the given project.

    DEPRECATED - will be removed along with mongodb.

    Args:
        project (object): django project
    """
    try:
        base_project = BaseProject.objects.get(seqr_project=project)
    except ObjectDoesNotExist:
        return False

    # Same short-circuit as `a or b`: the datastore is only consulted when
    # there is no elasticsearch index.
    has_index = base_project.has_elasticsearch_index()
    if has_index:
        return has_index
    return get_project_datastore(base_project).project_collection_is_loaded(base_project)
def project_home(request, project_id):
    """Render ``project.html`` for a project the requesting user may view.

    Args:
        request: the incoming request; ``request.user`` is used for all checks.
        project_id: id of the Project to show (404 when it does not exist).

    Raises:
        PermissionDenied: if the user may not view the project.
    """
    project = get_object_or_404(Project, project_id=project_id)
    requester = request.user
    if not project.can_view(requester):
        raise PermissionDenied

    project.set_accessed()

    # The first matching check wins.
    if project.can_admin(requester):
        auth_level = 'admin'
    elif project.can_edit(requester):
        auth_level = 'editor'
    elif project.is_public:
        auth_level = 'public'
    elif project.can_view(requester):
        auth_level = 'viewer'
    else:
        raise Exception("Authx - how did we get here?!?")

    # PhenoTips is on unless the project is listed in PROJECTS_WITHOUT_PHENOTIPS.
    no_phenotips_projects = settings.PROJECTS_WITHOUT_PHENOTIPS
    phenotips_supported = (
        no_phenotips_projects is None
        or project_id not in no_phenotips_projects
    )

    can_edit = project.can_edit(requester)
    is_manager = project.can_admin(requester)
    has_gene_search = get_project_datastore(project_id).project_collection_is_loaded(project_id)

    template_values = {
        'phenotips_supported': phenotips_supported,
        'project': project,
        'auth_level': auth_level,
        'can_edit': can_edit,
        'is_manager': is_manager,
        'has_gene_search': has_gene_search,
    }
    return render(request, 'project.html', template_values)
def update_pop_freqs_in_project_tables(self):
    """Rewrite the ``db_freqs.<population>`` fields of every variant in every project collection.

    A sqlite file (``reference_populations_project_tables.db``) records which
    projects have been started and finished, so projects already marked as
    started are skipped. "myoseq_v11" is queued first; the remaining projects
    follow in shuffled order.
    """
    # Load project tables
    population_frequency_store = mall.get_annotator().get_population_frequency_store()

    db = sqlite3.connect("reference_populations_project_tables.db", isolation_level=None)
    db.execute("CREATE TABLE if not exists all_projects(project_id varchar(200), started bool, finished bool)")
    db.execute("CREATE UNIQUE INDEX IF NOT EXISTS all_projects_idx ON all_projects(project_id)")

    import random
    other_project_ids = [p.project_id for p in Project.objects.all() if p.project_id != "myoseq_v11"]
    random.shuffle(other_project_ids)
    project_ids = ["myoseq_v11"] + other_project_ids

    # INSERT OR IGNORE keeps the flags of projects that are already registered.
    insert_sql = "INSERT OR IGNORE INTO all_projects VALUES (?, 0, 0)"
    for project_id in project_ids:
        db.execute(insert_sql, (project_id,))

    # Go through each project and update the variant records
    population_slugs_to_load = [spec['slug'] for spec in annotator_settings.reference_populations]

    while True:
        remaining_work = list(db.execute("SELECT project_id FROM all_projects WHERE started=0"))
        print("%d projects remaining" % len(remaining_work))
        if not remaining_work:
            print("Done with all projects")
            break

        # Work on the first project that has not been started yet.
        project_id, = remaining_work[0]
        project_store = get_project_datastore(project_id)
        print(" updating %s " % project_id)
        db.execute("UPDATE all_projects SET started=1 WHERE project_id=?", (project_id,))

        project_collection = project_store._get_project_collection(project_id)
        for variant_dict in project_collection.find():
            freqs = population_frequency_store.get_frequencies(
                variant_dict['xpos'],
                variant_dict['ref'],
                variant_dict['alt'],
            )

            # Populations without a stored frequency are written as 0.
            full_freqs = {}
            for population_slug in population_slugs_to_load:
                full_freqs['db_freqs.' + population_slug] = freqs.get(population_slug, 0)

            variant_key = {
                'xpos': variant_dict['xpos'],
                'ref': variant_dict['ref'],
                'alt': variant_dict['alt'],
            }
            project_collection.update(variant_key, {'$set': full_freqs}, upsert=False)

        print(" ---> done updating project_id: %s" % project_id)
        db.execute("UPDATE all_projects SET finished=1 WHERE project_id=?", (project_id,))
def handle(self, *args, **options):
    """load CADD scores for all variants in a project, or all variants in the annotator_store.

    Behaviour depends on ``options``:
      * ``cadd_file`` set: scores are loaded from that file through
        ``load_from_cadd_file`` and the command finishes.
      * ``project_id`` set: variants in that project's collection that have no
        ``annotation.cadd_phred`` are looked up and annotated.
      * neither set: variants in the annotator store that have no
        ``annotation.cadd_phred`` are looked up and annotated.

    Fetched scores are always written to the annotator store's ``variants``
    collection.

    Raises:
        AssertionError: if an update did not modify an existing record.
    """
    annotator_store = mall.get_annotator().get_annotator_datastore()
    without_cadd = {'annotation.cadd_phred': {'$exists': False}}

    if options['cadd_file']:
        print("Loading " + options['cadd_file'])
        load_from_cadd_file(options['cadd_file'])
        # No variant cursor is built on this path, so the per-variant loop
        # below must not run; it used to fail with UnboundLocalError on
        # `variant_collection`.
        print("Done")
        return

    if options['project_id']:
        print("Loading " + options['project_id'])
        project = Project.objects.get(project_id=options['project_id'])
        project_collection = get_project_datastore(project)._get_project_collection(options['project_id'])
        variant_collection = project_collection.find(without_cadd)
    else:
        variant_collection = annotator_store.variants.find(without_cadd)

    for r in tqdm.tqdm(variant_collection, unit=' variants'):
        chrom, pos = genomeloc.get_chr_pos(r['xpos'])
        cadd_phred = fetch(chrom, pos, r['ref'], r['alt'])
        if cadd_phred is None:
            # No score available for this variant; leave the record untouched.
            continue

        result = annotator_store.variants.update(
            {
                'xpos': r['xpos'],
                'ref': r['ref'],
                'alt': r['alt']
            },
            {'$set': {
                'annotation.cadd_phred': cadd_phred
            }},
            upsert=False)
        assert result['updatedExisting']

    print("Done")
def mendelian_variant_search(request, project_id, family_id):
    """Render the mendelian variant search page for one family.

    Users who may not view the project get a plain 'unauthorized' response;
    families without variant data get the "analysis unavailable" page.

    Args:
        request: the incoming request.
        project_id: id of the Project (404 when it does not exist).
        family_id: id of the Family within the project (404 when missing).
    """
    project = get_object_or_404(Project, project_id=project_id)
    family = get_object_or_404(Family, project=project, family_id=family_id)

    if not project.can_view(request.user):
        return HttpResponse('unauthorized')

    if not family.has_data('variation'):
        unavailable_context = {'reason': 'This family does not have any variant data.'}
        return render(request, 'analysis_unavailable.html', unavailable_context)

    project_store = get_project_datastore(project_id)
    has_gene_search = project_store.project_collection_is_loaded(project_id)

    # The message is filled in from the local names project_id, family_id and has_gene_search.
    sys.stderr.write("Running mendelian_variant_search on %(project_id)s %(family_id)s. has_gene_search = %(has_gene_search)s\n " % locals())

    genotype_filters = x_inheritance.get_genotype_filters(family.xfamily())
    context = {
        'project': project,
        'family': family,
        'family_genotype_filters_json': json.dumps(genotype_filters),
        'has_gene_search': has_gene_search,
    }
    return render(request, 'mendelian_variant_search.html', context)
def get_knockouts_in_gene(project, gene_id, quality_filter=None):
    """
    Find the individuals of a project with recessive inheritance in a gene.

    Variants are selected with the default 'high_impact' variant filter.

    Args:
        project: project whose individuals and variants are examined.
        gene_id: gene to look at.
        quality_filter: accepted but not used; an empty quality filter is
            always passed on.

    Returns:
        tuple: (knockouts, variation), where variation is the
        CohortGeneVariation built for the gene.
    """
    indiv_id_list = []
    for individual in project.get_individuals():
        indiv_id_list.append(individual.indiv_id)

    variant_filter = get_default_variant_filter('high_impact')

    project_store = get_project_datastore()
    gene_variants = project_store.get_variants_in_gene(
        project.project_id,
        gene_id,
        variant_filter=variant_filter,
    )
    gene_variants = search_utils.filter_gene_variants_by_variant_filter(gene_variants, gene_id, variant_filter)

    variation = CohortGeneVariation(
        get_reference(),
        gene_id,
        gene_variants,
        indiv_id_list,
        quality_filter={},
    )
    knockouts = get_individuals_with_inheritance('recessive', variation, indiv_id_list)
    return knockouts, variation
def mendelian_variant_search(request, project_id, family_id):
    """Render the mendelian variant search page for one family.

    Families without variant data get the "analysis unavailable" page instead.

    Args:
        request: the incoming request.
        project_id: id of the Project (404 when it does not exist).
        family_id: id of the Family within the project (404 when missing).

    Raises:
        PermissionDenied: if the user may not view the project.
    """
    project = get_object_or_404(Project, project_id=project_id)
    family = get_object_or_404(Family, project=project, family_id=family_id)

    if not project.can_view(request.user):
        raise PermissionDenied

    if not family.has_data('variation'):
        unavailable_context = {'reason': 'This family does not have any variant data.'}
        return render(request, 'analysis_unavailable.html', unavailable_context)

    project_store = get_project_datastore(project_id)
    has_gene_search = project_store.project_collection_is_loaded(project_id)

    gene_lists = []
    for project_gene_list in ProjectGeneList.objects.filter(project=project):
        gene_lists.append(project_gene_list.gene_list.toJSON(details=True))

    # The message is filled in from the local names project_id, family_id and has_gene_search.
    sys.stderr.write("Running mendelian_variant_search on %(project_id)s %(family_id)s. has_gene_search = %(has_gene_search)s\n " % locals())

    context = {
        'gene_lists': json.dumps(gene_lists),
        'project': project,
        'family': family,
    }
    genotype_filters = x_inheritance.get_genotype_filters(family.xfamily())
    context['family_genotype_filters_json'] = json.dumps(genotype_filters)
    context['has_gene_search'] = has_gene_search

    return render(request, 'mendelian_variant_search.html', context)
def mendelian_variant_search(request, project_id, family_id):
    """Render the mendelian variant search page for one family.

    Families without variant data get the "analysis unavailable" page instead.
    A link to the new search page is included when the family has a seqr
    family whose project has the new search enabled.

    Args:
        request: the incoming request.
        project_id: id of the Project (404 when it does not exist).
        family_id: id of the Family within the project (404 when missing).

    Raises:
        PermissionDenied: if the user may not view the project.
    """
    project = get_object_or_404(Project, project_id=project_id)
    family = get_object_or_404(Family, project=project, family_id=family_id)

    if not project.can_view(request.user):
        raise PermissionDenied

    if not family.has_data('variation'):
        unavailable_context = {'reason': 'This family does not have any variant data.'}
        return render(request, 'analysis_unavailable.html', unavailable_context)

    has_gene_search = get_project_datastore(project).project_collection_is_loaded(project)

    gene_lists = []
    for project_gene_list in ProjectGeneList.objects.filter(project=project):
        gene_lists.append(project_gene_list.gene_list.toJSON(details=True))

    # The message is filled in from the local names project_id, family_id and has_gene_search.
    sys.stderr.write("returning mendelian_variant_search page for %(project_id)s %(family_id)s. has_gene_search = %(has_gene_search)s\n " % locals())

    context = {
        'gene_lists': json.dumps(gene_lists),
        'project': project,
        'family': family,
    }
    genotype_filters = x_inheritance.get_genotype_filters(family.xfamily())
    context['family_genotype_filters_json'] = json.dumps(genotype_filters)
    context['has_gene_search'] = has_gene_search

    if family.seqr_family and family.seqr_family.project.has_new_search:
        context['new_page_url'] = '/variant_search/family/{0}'.format(family.seqr_family.guid)
    else:
        context['new_page_url'] = None

    return render(request, 'mendelian_variant_search.html', context)
def load_project_datastore(project_id):
    """
    Rebuild the project datastore entry for a project from its VCF files.

    The project is deleted from the datastore, added again with its reference
    population slugs, and then every VCF file of the project is loaded for the
    individuals that appear both in the project and in that file.

    Args:
        project_id: id of the Project to load.
    """
    project = Project.objects.get(project_id=project_id)

    get_project_datastore().delete_project(project_id)

    datastore = get_project_datastore()
    population_slugs = project.get_reference_population_slugs()
    datastore.add_project(project_id, population_slugs)

    for vcf_file in project.get_all_vcf_files():
        project_indiv_ids = []
        for individual in project.get_individuals():
            project_indiv_ids.append(individual.indiv_id)

        # Keep project order, restricted to samples present in this VCF.
        vcf_ids = vcf_file.sample_id_list()
        indiv_id_list = [indiv_id for indiv_id in project_indiv_ids if indiv_id in vcf_ids]

        datastore = get_project_datastore()
        datastore.add_variants_to_project_from_vcf(
            vcf_file.file_handle(),
            project_id,
            indiv_id_list=indiv_id_list,
        )
def load_project_datastore(project_id, vcf_files=None, start_from_chrom=None, end_with_chrom=None):
    """
    Load this project into the project datastore,
    which allows queries over all variants in a project.

    The existing project store is deleted and re-created before loading.
    Outside of DEBUG mode a 'started' and a 'finished' event are recorded in
    settings.EVENTS_COLLECTION.

    Args:
        project_id: id of the Project to load.
        vcf_files: optional collection of VCF file paths. When given, only the
            project's VCF files whose path is in this collection are loaded;
            all others are skipped.
        start_from_chrom: passed through unchanged to
            add_variants_to_project_from_vcf.
        end_with_chrom: passed through unchanged to
            add_variants_to_project_from_vcf.
    """
    if not settings.DEBUG:
        settings.EVENTS_COLLECTION.insert({
            'event_type': 'load_project_datastore_started',
            'date': timezone.now(),
            'project_id': project_id,
        })

    project = Project.objects.get(project_id=project_id)
    get_project_datastore(project).delete_project_store(project_id)
    get_project_datastore(project).add_project(project_id)

    # sort by path so files are always loaded in the same order
    for vcf_file in sorted(project.get_all_vcf_files(), key=lambda v: v.path()):
        vcf_file_path = vcf_file.path()
        if vcf_files is not None and vcf_file_path not in vcf_files:
            print("Skipping - %(vcf_file_path)s is not in %(vcf_files)s" % locals())
            # actually skip the file - previously the message was printed
            # but the file was loaded anyway
            continue

        # only load genotypes for project individuals present in this VCF
        project_indiv_ids = [i.indiv_id for i in project.get_individuals()]
        vcf_ids = vcf_file.sample_id_list()
        indiv_id_list = [i for i in project_indiv_ids if i in vcf_ids]
        get_project_datastore(project).add_variants_to_project_from_vcf(
            vcf_file.file_handle(),
            project_id,
            indiv_id_list=indiv_id_list,
            start_from_chrom=start_from_chrom,
            end_with_chrom=end_with_chrom
        )

    get_project_datastore(project).set_project_collection_to_loaded(project_id)

    if not settings.DEBUG:
        settings.EVENTS_COLLECTION.insert({
            'event_type': 'load_project_datastore_finished',
            'date': timezone.now(),
            'project_id': project_id
        })
def get_knockouts_in_gene(project, gene_id, quality_filter=None):
    """
    Find the individuals in a project that carry a recessive "knockout" in a gene.

    Returns a tuple (knockouts, variation): the result of
    get_individuals_with_inheritance('recessive', ...) and the
    CohortGeneVariation it was computed from.

    Note: the `quality_filter` argument is accepted but not used - an empty
    quality filter is always passed to CohortGeneVariation.
    """
    project_id = project.project_id
    indiv_id_list = []
    for individual in project.get_individuals():
        indiv_id_list.append(individual.indiv_id)

    # default 'moderate_impact' filter, built against the annotator's
    # reference populations
    annotator = mall.get_annotator()
    variant_filter = get_default_variant_filter(
        'moderate_impact', annotator.reference_population_slugs)

    datastore = get_project_datastore(project_id)
    gene_variants = datastore.get_project_variants_in_gene(
        project_id,
        gene_id,
        variant_filter=variant_filter,
    )
    gene_variants = search_utils.filter_gene_variants_by_variant_filter(
        gene_variants, gene_id, variant_filter)

    variation = CohortGeneVariation(
        get_reference(),
        gene_id,
        gene_variants,
        indiv_id_list,
        quality_filter={},
    )
    return (
        get_individuals_with_inheritance('recessive', variation, indiv_id_list),
        variation,
    )
def get_knockouts_in_gene(project, gene_id, quality_filter=None):
    """
    Find the individuals in a project that carry a recessive "knockout" in a gene.

    Only variants passing the default 'high_impact' variant filter are
    considered. Returns a tuple (knockouts, variation): the result of
    get_individuals_with_inheritance('recessive', ...) and the
    CohortGeneVariation it was computed from.

    Note: the `quality_filter` argument is accepted but not used - an empty
    quality filter is always passed to CohortGeneVariation.
    """
    project_id = project.project_id
    indiv_id_list = []
    for individual in project.get_individuals():
        indiv_id_list.append(individual.indiv_id)

    variant_filter = get_default_variant_filter('high_impact')

    datastore = get_project_datastore(project_id)
    gene_variants = datastore.get_project_variants_in_gene(
        project_id,
        gene_id,
        variant_filter=variant_filter,
    )
    gene_variants = search_utils.filter_gene_variants_by_variant_filter(
        gene_variants, gene_id, variant_filter)

    variation = CohortGeneVariation(
        get_reference(),
        gene_id,
        gene_variants,
        indiv_id_list,
        quality_filter={},
    )
    return (
        get_individuals_with_inheritance('recessive', variation, indiv_id_list),
        variation,
    )
def transfer_project(self, from_project_id, destination_project_id):
    """
    Move the loaded datastore records of one project over to another project.

    Steps, in order:
      1. check that every individual of the 'from' project also exists in
         the destination project (raises Exception otherwise)
      2. attach the 'from' project's VCF files to the destination project
      3. in the project and family datastores, rename the destination's
         current records to '<destination>_previous1' (version '1') and
         relabel the 'from' records as the destination (version '2')
      4. refresh family analysis statuses and interactively offer to delete
         the 'from' project
    """
    print("From: " + from_project_id)
    print("To: " + destination_project_id)

    from_project = Project.objects.get(project_id=from_project_id)
    destination_project = Project.objects.get(project_id=destination_project_id)

    # Make sure individuals are the same
    from_indiv_ids = set(
        indiv.indiv_id for indiv in Individual.objects.filter(project=from_project))
    destination_indiv_ids = set(
        indiv.indiv_id for indiv in Individual.objects.filter(project=destination_project))
    indivs_missing_from_dest_project = from_indiv_ids - destination_indiv_ids
    if indivs_missing_from_dest_project:
        raise Exception("Individuals missing from dest project: " + str(indivs_missing_from_dest_project))

    # update VCFs
    for vcf_file_path in from_project.families_by_vcf().keys():
        absolute_path = os.path.abspath(vcf_file_path)
        vcf_file, _ = VCFFile.objects.get_or_create(file_path=absolute_path)
        sample_management.add_vcf_file_to_project(destination_project, vcf_file)
        print("Added %s to project %s" % (vcf_file, destination_project.project_id))

    families_db = get_datastore(from_project_id)._db
    projects_db = get_project_datastore(from_project_id)._db

    print("==========")
    print("Checking Projects:")
    for checked_id in (from_project_id, destination_project_id):
        check_that_exists(projects_db.projects, {'project_id': checked_id}, not_more_than_one=True)

    print("==========")
    print("Checking Families:")
    for checked_id in (from_project_id, destination_project_id):
        check_that_exists(families_db.families, {'project_id': checked_id}, not_more_than_one=False)

    print("==========")
    print("Make Updates:")
    # the destination's old records must be renamed before the 'from'
    # records take over its project_id
    for collection in (projects_db.projects, families_db.families):
        update(
            collection,
            {'project_id': destination_project_id},
            {'project_id': destination_project_id + '_previous1', 'version': '1'})
        update(
            collection,
            {'project_id': from_project_id},
            {'project_id': destination_project_id, 'version': '2'})

    print("==========")
    print("Checking Projects:")
    check_that_exists(projects_db.projects, {'project_id': destination_project_id}, not_more_than_one=True)

    print("==========")
    print("Checking Families:")
    check_that_exists(families_db.families, {'project_id': destination_project_id}, not_more_than_one=False)

    update_family_analysis_status(destination_project_id)
    print("Data transfer finished.")

    # only an explicit 'Y' deletes the source project
    answer = raw_input("Delete the 'from' project: %s? [Y/n] " % from_project_id)
    if answer.strip() != 'Y':
        print("Project not deleted")
    else:
        sample_management.delete_project(from_project_id)
        print("Project %s deleted" % from_project_id)
def search_for_genes(self, gene_or_variant_ids, project_id_list, output_filename, max_af=0.01, knockouts=False, in_clinvar_only=False, include_non_coding=False):
    """
    Search for genes or single variants across project(s) and write the
    matching variants to a tab-delimited file.

    Args:
        gene_or_variant_ids (list): gene id strings (e.g. 'ENSG..'), or variant
            ids of the form 'chrom-pos' or 'chrom-pos-ref-alt'.
        project_id_list (list): ids of the projects to search.
        output_filename (string): output file name
        max_af (float): gene-search variants whose largest value in
            variant.annotation['freqs'] is >= max_af are skipped. Not applied
            to variants that were looked up by variant id.
        knockouts (bool): for gene ids, only report the variants relevant to
            individuals returned by project_analysis.get_knockouts_in_gene.
        in_clinvar_only (bool): only report variants whose clinvar
            significance contains "path" (case-insensitive).
        include_non_coding (bool): clear the so_annotations restriction of the
            default 'all_coding' variant filter (ignored when knockouts=True).
    """
    projects = [
        Project.objects.get(project_id=project_id)
        for project_id in project_id_list
    ]

    header = [
        "project_id", "gene", "chr", "pos", "ref", "alt", "rsID", "filter",
        "impact", "HGVS.c", "HGVS.p", "sift", "polyphen", "muttaster",
        "fathmm", "clinvar_id", "clinvar_clinical_sig",
        "freq_1kg_wgs_phase3", "freq_1kg_wgs_phase3_popmax", "freq_exac_v3",
        "freq_exac_v3_popmax", "gnomad-exomes", "gnomad-genomes", "families",
        "all_genotypes"
    ]

    # the context manager guarantees the output file is closed even when the
    # search fails part-way through
    with open(output_filename, 'w') as outfile:
        writer = csv.writer(outfile, delimiter='\t')
        writer.writerow(header)

        # all rare coding variants
        if not knockouts:
            variant_filter = get_default_variant_filter(
                'all_coding', mall.get_annotator().reference_population_slugs)
            if include_non_coding:
                variant_filter.so_annotations = []
            print("All Filters: ")
            pprint(variant_filter.toJSON())

        print("Starting search for:\n%s\nin projects:\n%s\n" %
              (", ".join(gene_or_variant_ids),
               ", ".join([p.project_id for p in projects])))

        for project in projects:
            project_id = project.project_id
            if get_project_datastore(project).project_collection_is_loaded(project):
                print("=====================")
                print("Searching project %s" % project_id)
            else:
                print("Skipping project %s - gene search is not enabled for this project" % project_id)
                continue

            # per-project cache: indiv_id -> Individual, or the marker string
            # 'deleted' when the individual no longer exists in the database
            indiv_cache = {}
            for gene_or_variant_id in gene_or_variant_ids:
                chrom_pos_match = re.match("([0-9XY]{1,2})-([0-9]{1,9})", gene_or_variant_id)
                chrom_pos_ref_alt_match = re.match(
                    "([0-9XY]{1,2})-([0-9]{1,9})-([ACTG]+)-([ACTG]+)", gene_or_variant_id)

                if chrom_pos_match or chrom_pos_ref_alt_match:
                    # variant id: the chrom-pos pattern is a prefix of the
                    # chrom-pos-ref-alt pattern, so chrom_pos_match is set
                    # whenever either pattern matched
                    chrom = chrom_pos_match.group(1)
                    pos = int(chrom_pos_match.group(2))
                    xpos = genomeloc.get_xpos(chrom, pos)
                    ref = alt = None
                    if chrom_pos_ref_alt_match:
                        ref = chrom_pos_ref_alt_match.group(3)
                        alt = chrom_pos_ref_alt_match.group(4)

                    variant = get_project_datastore(project).get_single_variant(
                        project.project_id, None, xpos, ref, alt)
                    if variant is None:
                        continue
                    variants = [variant]
                    print("-- searching %s for variant %s-%s-%s: found %s" %
                          (project_id, xpos, ref, alt, variant))

                    worst_annotation_idx = variant.annotation['worst_vep_annotation_index']
                    print(variant.annotation["vep_annotation"][worst_annotation_idx])
                    gene_id = variant.annotation["vep_annotation"][worst_annotation_idx]['gene_id']
                    gene = get_reference().get_gene(gene_id)
                else:
                    # gene id or symbol
                    gene_id = get_gene_id_from_str(gene_or_variant_id, get_reference())
                    gene = get_reference().get_gene(gene_id)
                    print("-- searching %s for gene %s (%s)" %
                          (project_id, gene["symbol"], gene_id))

                    if knockouts:
                        knockout_ids, variation = project_analysis.get_knockouts_in_gene(
                            project, gene_id)
                        variants = variation.get_relevant_variants_for_indiv_ids(knockout_ids)
                    else:
                        variants = project_analysis.get_variants_in_gene(
                            project, gene_id, variant_filter=variant_filter)

                for variant in variants:
                    # the AF cutoff only applies to gene searches
                    if not chrom_pos_match and not chrom_pos_ref_alt_match and max(
                            variant.annotation['freqs'].values()) >= max_af:
                        continue

                    add_extra_info_to_variants_project(get_reference(), project, [variant])

                    worst_annotation_idx = variant.annotation["worst_vep_index_per_gene"].get(gene_id)
                    if worst_annotation_idx is not None:
                        worst_annotation = variant.annotation["vep_annotation"][worst_annotation_idx]
                    else:
                        worst_annotation = None

                    all_genotypes_list = []
                    pass_filter = "N/A"
                    family_ids = set()
                    for indiv_id, genotype in variant.genotypes.items():
                        if indiv_id in indiv_cache:
                            individual = indiv_cache[indiv_id]
                            if individual == 'deleted':
                                continue
                        else:
                            try:
                                individual = Individual.objects.get(
                                    project=project, indiv_id=indiv_id)
                                indiv_cache[indiv_id] = individual
                            except ObjectDoesNotExist:
                                # this can happen when an individual is deleted from the
                                # project - from postgres, but not from mongo
                                indiv_cache[indiv_id] = 'deleted'
                                continue
                            except MultipleObjectsReturned:
                                # when several families have an individual with the same id
                                individuals = Individual.objects.filter(
                                    project=project, indiv_id=indiv_id)
                                individual = individuals[0]
                                indiv_cache[indiv_id] = individual

                        # filter value is stored in the genotypes even though
                        # it's the same for all individuals
                        pass_filter = genotype.filter
                        if genotype.num_alt > 0:
                            family_ids.add(individual.family.family_id)
                            all_genotypes_list.append(
                                "%s/%s%s[gt:%s GQ:%s AB:%0.3f]" %
                                (individual.family.family_id, indiv_id,
                                 "[Affected]" if individual.affected == "A" else
                                 ("[-]" if individual.affected == "N" else "[?]"),
                                 ">".join(genotype.alleles), genotype.gq,
                                 genotype.ab if genotype.ab is not None else float('NaN')))

                    # nobody in this project carries the variant
                    if not all_genotypes_list:
                        continue

                    measureset_id, clinvar_significance = get_reference().get_clinvar_info(
                        *variant.unique_tuple())
                    if in_clinvar_only and (
                            not clinvar_significance
                            or "path" not in clinvar_significance.lower()):
                        continue

                    # NOTE(review): the "gene" column is written as str(gene), i.e. the
                    # whole gene record rather than e.g. gene["symbol"] - confirm intended.
                    row = map(str, [
                        project_id,
                        gene,
                        variant.chr,
                        variant.pos,
                        variant.ref,
                        variant.alt,
                        variant.vcf_id or "",
                        pass_filter,
                        variant.annotation.get("vep_consequence", ""),
                        worst_annotation.get("hgvsc", "") if worst_annotation else "",
                        (worst_annotation.get("hgvsp", "") or "").replace("%3D", "=") if worst_annotation else "",
                        worst_annotation.get("sift", "") if worst_annotation else "",
                        worst_annotation.get("polyphen", "") if worst_annotation else "",
                        worst_annotation.get("mutationtaster_pred", "") if worst_annotation else "",
                        ";".join(set(worst_annotation.get("fathmm_pred", "").split('%3B'))) if worst_annotation else "",
                        measureset_id,
                        clinvar_significance,
                        variant.annotation["freqs"].get("1kg_wgs_phase3", ""),
                        variant.annotation["freqs"].get("1kg_wgs_phase3_popmax", ""),
                        variant.annotation["freqs"].get("exac_v3", ""),
                        variant.annotation["freqs"].get("exac_v3_popmax", ""),
                        variant.annotation["freqs"].get("gnomad-exomes2", ""),
                        variant.annotation["freqs"].get("gnomad-genomes2", ""),
                        ", ".join(sorted(family_ids)),
                        ", ".join(all_genotypes_list),
                    ])
                    writer.writerow(row)

    print("Wrote out %s" % output_filename)
def gene_quicklook(request, project_id, gene_id):
    """
    Summary of a gene in a project

    Renders 'project/gene_quicklook.html' with the rare variants and the
    recessive "knockout" individuals found for the gene, or - when the
    'download' GET parameter is 'knockouts' or 'rare_variants' - returns the
    same data as a CSV attachment.

    GET parameters read:
        selected_projects: optional comma-separated project ids to search
            instead of just `project_id`; the user must be able to view each.
        download: optional, 'knockouts' or 'rare_variants'.

    When gene_id is None the page is rendered without any gene data.
    """
    project = get_object_or_404(Project, project_id=project_id)
    if not project.can_view(request.user):
        return HttpResponse("Unauthorized")

    if project.project_status == Project.NEEDS_MORE_PHENOTYPES and not request.user.is_staff:
        return render(request, 'analysis_unavailable.html',
                      {'reason': 'Awaiting phenotype data.'})

    # other projects this user can view
    if request.user.is_staff:
        other_projects = [p for p in Project.objects.all()]  # if p != project
    else:
        other_projects = [
            c.project
            for c in ProjectCollaborator.objects.filter(user=request.user)
        ]  # if c.project != project

    # keep only projects whose project collection is loaded
    # NOTE(review): the truthiness test below relies on filter() returning a
    # list (Python 2); a Python 3 filter object is always truthy.
    other_projects = filter(
        lambda p: get_project_datastore(p.project_id).
        project_collection_is_loaded(p.project_id), other_projects)

    if other_projects:
        other_projects_json = json.dumps([{
            'project_id': p.project_id,
            'project_name': p.project_name
        } for p in sorted(other_projects, key=lambda p: p.project_id)])
    else:
        other_projects_json = None

    # no gene selected yet: render the empty page
    if gene_id is None:
        return render(
            request, 'project/gene_quicklook.html', {
                'project': project,
                'gene': None,
                'gene_json': None,
                'rare_variants_json': None,
                'individuals_json': None,
                'knockouts_json': None,
                'other_projects_json': other_projects_json,
            })

    # NOTE(review): the loop below rebinds `project_id` and `project`, and the
    # later `for project in projects_to_search` loops rebind `project` again, so
    # the log line and the final template context refer to the last project
    # iterated rather than necessarily the one from the URL - confirm intended.
    projects_to_search_param = request.GET.get('selected_projects')
    if projects_to_search_param:
        projects_to_search = []
        project_ids = projects_to_search_param.split(",")
        for project_id in project_ids:
            project = get_object_or_404(Project, project_id=project_id)
            if not project.can_view(request.user):
                return HttpResponse("Unauthorized")
            projects_to_search.append(project)
    else:
        projects_to_search = [project]

    gene_id = get_gene_id_from_str(gene_id, get_reference())
    gene = get_reference().get_gene(gene_id)
    sys.stderr.write(
        project_id + " - staring gene search for: %s in projects: %s\n" %
        (gene_id, ",".join([p.project_id for p in projects_to_search]) + "\n"))

    # all rare coding variants
    variant_filter = get_default_variant_filter(
        'all_coding', mall.get_annotator().reference_population_slugs)

    indiv_id_to_project_id = {}  # indiv_id -> project_id of the project the genotype came from
    rare_variant_dict = {}  # "chr-pos-ref-alt" -> first variant object seen with that id
    rare_variants = []
    for project in projects_to_search:
        project_variants = []
        for variant in project_analysis.get_variants_in_gene(
                project, gene_id, variant_filter=variant_filter):
            max_af = max(variant.annotation['freqs'].values())
            # skip variants that no individual actually carries
            if not any([
                    indiv_id
                    for indiv_id, genotype in variant.genotypes.items()
                    if genotype.num_alt > 0
            ]):
                continue
            # skip variants with a frequency of 1% or more in any 'freqs' entry
            if max_af >= .01:
                continue

            # add project id to genotypes
            for indiv_id in variant.genotypes:
                indiv_id_to_project_id[indiv_id] = project.project_id

            # save this variant, or - if it has already been seen in another
            # project - just merge its genotypes into the saved variant
            variant_id = "%s-%s-%s-%s" % (variant.chr, variant.pos,
                                          variant.ref, variant.alt)
            if variant_id not in rare_variant_dict:
                rare_variant_dict[variant_id] = variant
                project_variants.append(variant)
            else:
                rare_variant_dict[variant_id].genotypes.update(
                    variant.genotypes)

        #sys.stderr.write("gene_id: %s, variant: %s\n" % (gene_id, variant.toJSON()['annotation']['vep_annotation']))
        add_extra_info_to_variants_project(get_reference(), project,
                                           project_variants)
        rare_variants.extend(project_variants)

    sys.stderr.write("Retreived %s rare variants\n" % len(rare_variants))

    # compute knockout individuals
    individ_ids_and_variants = []
    for project in projects_to_search:
        knockout_ids, variation = get_knockouts_in_gene(project, gene_id)
        for indiv_id in knockout_ids:
            variants = variation.get_relevant_variants_for_indiv_ids(
                [indiv_id])
            add_extra_info_to_variants_project(get_reference(), project,
                                               variants)
            individ_ids_and_variants.append({
                'indiv_id': indiv_id,
                'variants': variants,
            })
            #sys.stderr.write("%s : %s: Retrieved %s knockout variants\n" % (project.project_id, indiv_id, len(variants), ))

    download_csv = request.GET.get('download', '')
    if download_csv:
        # CSV download: one row per variant, followed by one genotype column
        # per included individual
        # NOTE(review): any 'download' value other than 'knockouts' or
        # 'rare_variants' leaves `rows` / `individuals_to_include` undefined
        # and fails with a NameError further down.
        response = HttpResponse(content_type='text/csv')
        response[
            'Content-Disposition'] = 'attachment; filename="{}_{}.csv"'.format(
                download_csv, gene["transcript_name"])

        if download_csv == 'knockouts':
            individuals_to_include = [
                individ_id_and_variants["indiv_id"]
                for individ_id_and_variants in individ_ids_and_variants
            ]

            rows = []
            for individ_id_and_variants in individ_ids_and_variants:
                # note: this rebinds rare_variants to the knockout individual's variants
                rare_variants = individ_id_and_variants["variants"]
                for variant in rare_variants:
                    worst_annotation_idx = variant.annotation[
                        "worst_vep_index_per_gene"][gene_id]
                    worst_annotation = variant.annotation["vep_annotation"][
                        worst_annotation_idx]
                    genotypes = []
                    all_genotypes_string = ""
                    for indiv_id in individuals_to_include:
                        if indiv_id in variant.genotypes and variant.genotypes[
                                indiv_id].num_alt > 0:
                            genotype = variant.genotypes[indiv_id]
                            allele_string = ">".join(genotype.alleles)
                            all_genotypes_string += indiv_id + ":" + allele_string + "  "
                            genotypes.append(allele_string + "   (" +
                                             str(genotype.gq) + ")")
                        else:
                            # keep the per-individual columns aligned
                            genotypes.append("")

                    measureset_id, clinvar_significance = settings.CLINVAR_VARIANTS.get(
                        variant.unique_tuple(), ("", ""))

                    rows.append(
                        map(str, [
                            gene["symbol"],
                            variant.chr,
                            variant.pos,
                            variant.ref,
                            variant.alt,
                            variant.vcf_id or "",
                            variant.annotation.get("vep_consequence", ""),
                            worst_annotation.get("hgvsc", ""),
                            worst_annotation.get("hgvsp", "").replace(
                                "%3D", "="),
                            worst_annotation.get("sift", ""),
                            worst_annotation.get("polyphen", ""),
                            worst_annotation.get("mutationtaster_pred", ""),
                            ";".join(
                                set(
                                    worst_annotation.get("fathmm_pred",
                                                         "").split('%3B'))),
                            measureset_id,
                            clinvar_significance,
                            variant.annotation["freqs"].get(
                                "1kg_wgs_phase3", ""),
                            variant.annotation["freqs"].get(
                                "1kg_wgs_phase3_popmax", ""),
                            variant.annotation["freqs"].get("exac_v3", ""),
                            variant.annotation["freqs"].get(
                                "exac_v3_popmax", ""),
                            all_genotypes_string,
                        ] + genotypes))
        elif download_csv == 'rare_variants':
            # every individual carrying at least one rare variant, in first-seen order
            individuals_to_include = []
            for variant in rare_variants:
                for indiv_id, genotype in variant.genotypes.items():
                    if genotype.num_alt > 0 and indiv_id not in individuals_to_include:
                        individuals_to_include.append(indiv_id)

            rows = []
            for variant in rare_variants:
                worst_annotation_idx = variant.annotation[
                    "worst_vep_index_per_gene"][gene_id]
                worst_annotation = variant.annotation["vep_annotation"][
                    worst_annotation_idx]
                genotypes = []
                all_genotypes_string = ""
                for indiv_id in individuals_to_include:
                    if indiv_id in variant.genotypes and variant.genotypes[
                            indiv_id].num_alt > 0:
                        genotype = variant.genotypes[indiv_id]
                        allele_string = ">".join(genotype.alleles)
                        all_genotypes_string += indiv_id + ":" + allele_string + "  "
                        genotypes.append(allele_string + "   (" +
                                         str(genotype.gq) + ")")
                    else:
                        # keep the per-individual columns aligned
                        genotypes.append("")

                measureset_id, clinvar_significance = settings.CLINVAR_VARIANTS.get(
                    variant.unique_tuple(), ("", ""))
                rows.append(
                    map(str, [
                        gene["symbol"],
                        variant.chr,
                        variant.pos,
                        variant.ref,
                        variant.alt,
                        variant.vcf_id or "",
                        variant.annotation.get("vep_consequence", ""),
                        worst_annotation.get("hgvsc", ""),
                        worst_annotation.get("hgvsp", "").replace("%3D", "="),
                        worst_annotation.get("sift", ""),
                        worst_annotation.get("polyphen", ""),
                        worst_annotation.get("mutationtaster_pred", ""),
                        ";".join(
                            set(
                                worst_annotation.get("fathmm_pred",
                                                     "").split('%3B'))),
                        measureset_id,
                        clinvar_significance,
                        variant.annotation["freqs"].get("1kg_wgs_phase3", ""),
                        variant.annotation["freqs"].get(
                            "1kg_wgs_phase3_popmax", ""),
                        variant.annotation["freqs"].get("exac_v3", ""),
                        variant.annotation["freqs"].get("exac_v3_popmax", ""),
                        all_genotypes_string,
                    ] + genotypes))

        # fixed columns, then one "<indiv_id> (from <project_id>)" column per individual
        header = [
            "gene", "chr", "pos", "ref", "alt", "rsID", "impact", "HGVS.c",
            "HGVS.p", "sift", "polyphen", "muttaster", "fathmm", "clinvar_id",
            "clinvar_clinical_sig", "freq_1kg_wgs_phase3",
            "freq_1kg_wgs_phase3_popmax", "freq_exac_v3",
            "freq_exac_v3_popmax", "all_genotypes"
        ] + list(
            map(lambda i: i + " (from %s)" % indiv_id_to_project_id[i],
                individuals_to_include))

        writer = csv.writer(response)
        writer.writerow(header)
        for row in rows:
            writer.writerow(row)
        return response
    else:
        # HTML page: variants must be converted to JSON-serializable dicts
        for individ_id_and_variants in individ_ids_and_variants:
            variants = individ_id_and_variants["variants"]
            individ_id_and_variants["variants"] = [
                v.toJSON() for v in variants
            ]

        return render(
            request, 'project/gene_quicklook.html', {
                'gene': gene,
                'gene_json': json.dumps(gene),
                'project': project,
                'rare_variants_json': json.dumps([v.toJSON() for v in rare_variants]),
                'individuals_json': json.dumps([
                    i.get_json_obj() for project in projects_to_search
                    for i in project.get_individuals()
                ]),
                'knockouts_json': json.dumps(individ_ids_and_variants),
                'other_projects_json': other_projects_json,
            })
def handle(self, *args, **options):
    """
    Attach CNV calls from a tab-delimited file to the variants of a project.

    Reads options 'project_id', 'cnv_filename' and 'bed_files_directory'.
    The file's first line names the columns; the columns 'chr', 'start',
    'end' and 'sample' are read from every row. For each row whose sample
    matches an individual of the project, the individual's cnv_bed_file is
    pointed at '<bed_files_directory>/<sample>.bed' and the whole row is
    stored under 'genotypes.<indiv_id>.extras.cnvs' on every variant of the
    project and family collections whose xpos lies within [start, end].

    Raises ValueError when the CNV file does not exist.
    """
    project_id = options['project_id']
    print("Loading data into project: " + project_id)
    project = Project.objects.get(project_id=project_id)

    cnv_filename = options['cnv_filename']
    bed_files_directory = options['bed_files_directory']
    if not os.path.isfile(cnv_filename):
        raise ValueError("CNV file %s doesn't exist" % options['cnv_filename'])

    with open(cnv_filename) as cnv_file:
        column_names = cnv_file.readline().rstrip('\n').split('\t')

        for raw_line in cnv_file:
            values = raw_line.rstrip('\n').split('\t')
            row_dict = dict(zip(column_names, values))

            chrom = "chr" + row_dict['chr']
            start = int(row_dict['start'])
            end = int(row_dict['end'])
            sample_id = row_dict['sample']

            # rows for samples that can't be matched to exactly one
            # individual are reported and skipped
            try:
                individual = Individual.objects.get(
                    project=project, indiv_id__istartswith=sample_id)
            except Exception as e:
                print("WARNING: %s: %s not found in %s" % (e, sample_id, project))
                continue

            bed_file_path = os.path.join(bed_files_directory, "%s.bed" % sample_id)
            if not os.path.isfile(bed_file_path):
                # only a warning - the path is still recorded below
                print("WARNING: .bed file not found: " + bed_file_path)

            if individual.cnv_bed_file != bed_file_path:
                print("Setting cnv_bed_file path to %s" % bed_file_path)
                individual.cnv_bed_file = bed_file_path
                individual.save()

            project_collection = get_project_datastore(
                project)._get_project_collection(project_id)
            family_collection = get_mall(
                project).variant_store._get_family_collection(
                    project_id, individual.family.family_id)

            candidate_collections = [project_collection, family_collection]
            for collection in [c for c in candidate_collections if c]:
                overlapping_variants = {
                    '$and': [
                        {'xpos': {'$gte': genomeloc.get_single_location(chrom, start)}},
                        {'xpos': {'$lte': genomeloc.get_single_location(chrom, end)}},
                    ]
                }
                attach_cnv = {
                    '$set': {
                        'genotypes.%s.extras.cnvs' % individual.indiv_id: row_dict
                    }
                }
                collection.update_many(overlapping_variants, attach_cnv)

    print("Done")
def search_for_genes(self, gene_ids, project_id_list, output_filename, max_af=0.01):
    """
    Search for a gene across project(s) and write the matching rare coding
    variants to a tab-delimited file.

    Args:
        gene_ids (list): 'ENSG..' gene id strings.
        project_id_list (list): (optional) project ids to narrow down the
            search. When empty, all projects are searched.
        output_filename (string): output file name
        max_af (float): AF filter - variants whose largest value in
            variant.annotation['freqs'] is >= max_af are skipped.
    """
    header = [
        "project_id", "gene", "chr", "pos", "ref", "alt", "rsID", "filter",
        "impact", "HGVS.c", "HGVS.p", "sift", "polyphen", "muttaster",
        "fathmm", "clinvar_id", "clinvar_clinical_sig",
        "freq_1kg_wgs_phase3", "freq_1kg_wgs_phase3_popmax", "freq_exac_v3",
        "freq_exac_v3_popmax", "all_genotypes"
    ]

    # the context manager guarantees the output file is closed even when the
    # search fails part-way through
    with open(output_filename, 'w') as outfile:
        writer = csv.writer(outfile, delimiter='\t')
        writer.writerow(header)

        # all rare coding variants
        variant_filter = get_default_variant_filter(
            'all_coding', mall.get_annotator().reference_population_slugs)
        print("All Filters: ")
        pprint(variant_filter.toJSON())

        if project_id_list:
            projects = [
                Project.objects.get(project_id=project_id)
                for project_id in project_id_list
            ]
        else:
            projects = Project.objects.all()

        print("Max AF threshold: %s" % max_af)
        print("Staring gene search for:\n%s\nin projects:\n%s\n" %
              (", ".join(gene_ids), ", ".join([p.project_id for p in projects])))

        # (project_id, indiv_id) -> Individual. The cache is shared across
        # projects, so the project id must be part of the key: the same
        # indiv_id may belong to different individuals in different projects.
        indiv_id_cache = {}
        for project in projects:
            project_id = project.project_id
            if get_project_datastore(project_id).project_collection_is_loaded(project_id):
                print("=====================")
                print("Searching project %s" % project_id)
            else:
                print("Skipping project %s - gene search is not enabled for this project" % project_id)
                continue

            for gene_id in gene_ids:
                gene_id = get_gene_id_from_str(gene_id, get_reference())
                gene = get_reference().get_gene(gene_id)
                print("-- searching %s for gene %s (%s)" %
                      (project_id, gene["symbol"], gene_id))

                for variant in project_analysis.get_variants_in_gene(
                        project, gene_id, variant_filter=variant_filter):
                    if max(variant.annotation['freqs'].values()) >= max_af:
                        continue

                    add_extra_info_to_variants_project(get_reference(), project, [variant])

                    worst_annotation_idx = variant.annotation["worst_vep_index_per_gene"][gene_id]
                    worst_annotation = variant.annotation["vep_annotation"][worst_annotation_idx]

                    all_genotypes_list = []
                    pass_filter = "N/A"
                    for indiv_id, genotype in variant.genotypes.items():
                        # look up by the actual id (the lookup used to pass
                        # the literal string 'indiv_id', so it never hit)
                        cache_key = (project_id, indiv_id)
                        individual = indiv_id_cache.get(cache_key)
                        if individual is None:
                            individual = Individual.objects.get(
                                project=project, indiv_id=indiv_id)
                            indiv_id_cache[cache_key] = individual

                        # filter value is stored in the genotypes even though
                        # it's the same for all individuals
                        pass_filter = genotype.filter
                        if genotype.num_alt > 0:
                            all_genotypes_list.append(
                                "%s%s[gt:%s GQ:%s AB:%0.3f]" %
                                (indiv_id,
                                 "[Affected]" if individual.affected == "A" else
                                 ("[-]" if individual.affected == "N" else "[?]"),
                                 ">".join(genotype.alleles), genotype.gq,
                                 genotype.ab if genotype.ab is not None else float('NaN')))

                    measureset_id, clinvar_significance = get_clinvar_variants().get(
                        variant.unique_tuple(), ("", ""))

                    row = map(str, [
                        project_id,
                        gene["symbol"],
                        variant.chr,
                        variant.pos,
                        variant.ref,
                        variant.alt,
                        variant.vcf_id or "",
                        pass_filter,
                        variant.annotation.get("vep_consequence", ""),
                        worst_annotation.get("hgvsc", ""),
                        worst_annotation.get("hgvsp", "").replace("%3D", "="),
                        worst_annotation.get("sift", ""),
                        worst_annotation.get("polyphen", ""),
                        worst_annotation.get("mutationtaster_pred", ""),
                        ";".join(set(worst_annotation.get("fathmm_pred", "").split('%3B'))),
                        measureset_id,
                        clinvar_significance,
                        variant.annotation["freqs"].get("1kg_wgs_phase3", ""),
                        variant.annotation["freqs"].get("1kg_wgs_phase3_popmax", ""),
                        variant.annotation["freqs"].get("exac_v3", ""),
                        variant.annotation["freqs"].get("exac_v3_popmax", ""),
                        ", ".join(all_genotypes_list),
                    ])
                    writer.writerow(row)

    print("Wrote out %s" % output_filename)
def search_for_gene(self, search_gene_id, project_id_list, max_af=0.01):
    '''
    Search for a gene across project(s) and write the matching rare coding
    variants to the tab-delimited file 'results_<search_gene_id>.tsv'.

    Args:
      1. search_gene_id: Gene ID to search for
      2. project_id_list: An optional list of project ids to narrow down the
         search to. When it is empty or None, all projects are searched.
      3. max_af: variants whose largest value in variant.annotation['freqs']
         is >= max_af are skipped.

    Raises IndexError when a requested project id does not exist.
    '''
    gene_id = get_gene_id_from_str(search_gene_id, get_reference())
    gene = get_reference().get_gene(gene_id)

    # project_id_list is optional, and str.join cannot handle None
    print("Staring gene search for: %s %s in projects: %s\n" % (search_gene_id, gene['gene_id'], ", ".join(project_id_list or [])))
    print("Max AF threshold: %s" % max_af)

    # all rare coding variants
    variant_filter = get_default_variant_filter('all_coding', mall.get_annotator().reference_population_slugs)
    print("All Filters: ")
    pprint(variant_filter.toJSON())

    output_filename = 'results_' + search_gene_id + '.tsv'
    header = ["project_id", "gene", "chr", "pos", "ref", "alt", "rsID", "filter", "impact",
              "HGVS.c", "HGVS.p", "sift", "polyphen", "muttaster", "fathmm", "clinvar_id", "clinvar_clinical_sig",
              "freq_1kg_wgs_phase3", "freq_1kg_wgs_phase3_popmax",
              "freq_exac_v3", "freq_exac_v3_popmax",
              "all_genotypes"]

    # the context manager guarantees the output file is closed even when the
    # search fails part-way through
    with open(output_filename, 'w') as outfile:
        writer = csv.writer(outfile, delimiter='\t')
        writer.writerow(header)

        if project_id_list:
            # fail early (IndexError) if any requested project is missing
            for project_id in project_id_list:
                project = Project.objects.filter(project_id=project_id)[0]  # TODO validate
        else:
            project_id_list = [p.project_id for p in Project.objects.all()]

        for project_id in project_id_list:
            project = Project.objects.filter(project_id=project_id)[0]
            if get_project_datastore(project_id).project_collection_is_loaded(project_id):
                print("Running on project %s" % project_id)
            else:
                print("Skipping project %s - gene search is not enabled for this project" % project_id)
                continue

            for variant in project_analysis.get_variants_in_gene(project, gene_id, variant_filter=variant_filter):
                if max(variant.annotation['freqs'].values()) >= max_af:
                    continue

                add_extra_info_to_variants_project(get_reference(), project, [variant])

                worst_annotation_idx = variant.annotation["worst_vep_index_per_gene"][gene_id]
                worst_annotation = variant.annotation["vep_annotation"][worst_annotation_idx]

                all_genotypes_list = []
                pass_filter = "N/A"
                for indiv_id, genotype in variant.genotypes.items():
                    # filter value is stored in the genotypes even though
                    # it's the same for all individuals
                    pass_filter = genotype.filter
                    if genotype.num_alt > 0:
                        all_genotypes_list.append("%s[gt:%s GQ:%s AB:%0.3f]" % (indiv_id, ">".join(genotype.alleles), genotype.gq, genotype.ab if genotype.ab is not None else float('NaN')))

                measureset_id, clinvar_significance = settings.CLINVAR_VARIANTS.get(variant.unique_tuple(), ("", ""))

                row = map(str, [
                    project_id,
                    gene["symbol"],
                    variant.chr,
                    variant.pos,
                    variant.ref,
                    variant.alt,
                    variant.vcf_id or "",
                    pass_filter,
                    variant.annotation.get("vep_consequence", ""),
                    worst_annotation.get("hgvsc", ""),
                    worst_annotation.get("hgvsp", "").replace("%3D", "="),
                    worst_annotation.get("sift", ""),
                    worst_annotation.get("polyphen", ""),
                    worst_annotation.get("mutationtaster_pred", ""),
                    ";".join(set(worst_annotation.get("fathmm_pred", "").split('%3B'))),
                    measureset_id,
                    clinvar_significance,
                    variant.annotation["freqs"].get("1kg_wgs_phase3", ""),
                    variant.annotation["freqs"].get("1kg_wgs_phase3_popmax", ""),
                    variant.annotation["freqs"].get("exac_v3", ""),
                    variant.annotation["freqs"].get("exac_v3_popmax", ""),
                    ", ".join(all_genotypes_list),
                ])
                writer.writerow(row)

    print("Wrote out %s" % output_filename)
def search_for_gene(self, search_gene_id, project_id_list, max_af=0.01):
    """
    Search for a gene across project(s) and dump the matching rare coding
    variants to 'results_<search_gene_id>.tsv' (tab-delimited, one row per
    variant per project) in the current working directory.

    Args:
        1. search_gene_id: Gene ID to search for (resolved with
           get_gene_id_from_str() against the reference).
        2. project_id_list: An optional list of project IDs to restrict the
           search to; a falsy value (empty list or None) means all projects.
        3. max_af: Frequency cutoff. A variant is dropped when the maximum of
           its variant.annotation['freqs'] values is >= max_af.

    Raises:
        Whatever the `[0]` lookup raises if one of the given project IDs
        matches no Project (checked before any variants are fetched).
    """
    reference = get_reference()
    gene_id = get_gene_id_from_str(search_gene_id, reference)
    gene = get_reference().get_gene(gene_id)

    # Tolerate None here: the "all projects" fallback below accepts any falsy
    # value, so the log line must not be the thing that crashes.
    requested_projects = ", ".join(project_id_list or [])
    print("Staring gene search for: %s %s in projects: %s\n" %
          (search_gene_id, gene['gene_id'], requested_projects))
    print("Max AF threshold: %s" % max_af)

    # all rare coding variants
    variant_filter = get_default_variant_filter(
        'all_coding', mall.get_annotator().reference_population_slugs)
    print("All Filters: ")
    pprint(variant_filter.toJSON())

    output_filename = 'results_' + search_gene_id + '.tsv'
    header = [
        "project_id", "gene", "chr", "pos", "ref", "alt", "rsID", "filter",
        "impact", "HGVS.c", "HGVS.p", "sift", "polyphen", "muttaster",
        "fathmm", "clinvar_id", "clinvar_clinical_sig",
        "freq_1kg_wgs_phase3", "freq_1kg_wgs_phase3_popmax",
        "freq_exac_v3", "freq_exac_v3_popmax", "all_genotypes",
    ]

    # `with` closes the output file on every exit path; previously an
    # exception anywhere in the scan left the handle open.
    with open(output_filename, 'w') as outfile:
        writer = csv.writer(outfile, delimiter='\t')
        writer.writerow(header)

        if project_id_list:
            # Look up each requested project once up front so a bad ID stops
            # the run before the (slow) per-project scan starts.
            for project_id in project_id_list:
                Project.objects.filter(project_id=project_id)[0]  # TODO validate
        else:
            project_id_list = [p.project_id for p in Project.objects.all()]

        for project_id in project_id_list:
            project = Project.objects.filter(project_id=project_id)[0]

            datastore = get_project_datastore(project_id)
            if not datastore.project_collection_is_loaded(project_id):
                print(
                    "Skipping project %s - gene search is not enabled for this project"
                    % project_id)
                continue
            print("Running on project %s" % project_id)

            variants = project_analysis.get_variants_in_gene(
                project, gene_id, variant_filter=variant_filter)
            for variant in variants:
                annotation = variant.annotation
                if max(annotation['freqs'].values()) >= max_af:
                    continue

                add_extra_info_to_variants_project(get_reference(), project,
                                                   [variant])

                # Report the annotation marked as "worst" for this gene.
                worst_annotation_idx = variant.annotation[
                    "worst_vep_index_per_gene"][gene_id]
                worst_annotation = variant.annotation["vep_annotation"][
                    worst_annotation_idx]

                all_genotypes_list = []
                pass_filter = "N/A"
                for indiv_id, genotype in variant.genotypes.items():
                    # filter value is stored in the genotypes even though it's
                    # the same for all individuals
                    pass_filter = genotype.filter
                    if genotype.num_alt > 0:
                        allele_balance = (genotype.ab if genotype.ab is not None
                                          else float('NaN'))
                        all_genotypes_list.append(
                            "%s[gt:%s GQ:%s AB:%0.3f]" %
                            (indiv_id, ">".join(genotype.alleles), genotype.gq,
                             allele_balance))

                measureset_id, clinvar_significance = settings.CLINVAR_VARIANTS.get(
                    variant.unique_tuple(), ("", ""))

                # "%3D" and "%3B" are URL-escaped "=" and ";" in the annotations.
                fathmm = ";".join(
                    set(worst_annotation.get("fathmm_pred", "").split('%3B')))

                fields = [
                    project_id,
                    gene["symbol"],
                    variant.chr,
                    variant.pos,
                    variant.ref,
                    variant.alt,
                    variant.vcf_id or "",
                    pass_filter,
                    variant.annotation.get("vep_consequence", ""),
                    worst_annotation.get("hgvsc", ""),
                    worst_annotation.get("hgvsp", "").replace("%3D", "="),
                    worst_annotation.get("sift", ""),
                    worst_annotation.get("polyphen", ""),
                    worst_annotation.get("mutationtaster_pred", ""),
                    fathmm,
                    measureset_id,
                    clinvar_significance,
                    variant.annotation["freqs"].get("1kg_wgs_phase3", ""),
                    variant.annotation["freqs"].get("1kg_wgs_phase3_popmax", ""),
                    variant.annotation["freqs"].get("exac_v3", ""),
                    variant.annotation["freqs"].get("exac_v3_popmax", ""),
                    ", ".join(all_genotypes_list),
                ]
                writer.writerow([str(field) for field in fields])

    print("Wrote out %s" % output_filename)