def load_variants_for_cohort_list(project, cohorts):
    """
    Register each cohort with the variant store and load its VCF files.

    Each cohort is handled on its own: it is added to the datastore as a
    single-entry family set (using the cohort id as the family id), and then
    every VCF file of the cohort is loaded into that set.
    """
    for cohort in cohorts:
        print("Adding {}".format(cohort.cohort_id))
        family_list = [{
            'project_id': cohort.project.project_id,
            'family_id': cohort.cohort_id,
            'individuals': cohort.indiv_id_list(),
        }]

        # the cohort has to exist in the datastore before anything is loaded
        get_mall(project.project_id).variant_store.add_family_set(family_list)

        vcf_files = cohort.get_vcf_files()

        # VCF sample id -> individual id, for individuals that have a vcf_id
        vcf_id_map = {
            member.vcf_id: member.indiv_id
            for member in cohort.get_individuals()
            if member.vcf_id
        }

        # load every VCF of this cohort into the datastore
        for vcf_file in vcf_files:
            family_tuple_list = [
                (entry['project_id'], entry['family_id'])
                for entry in family_list
            ]
            variant_store = get_mall(project.project_id).variant_store
            variant_store.load_family_set(
                vcf_file.path(),
                family_tuple_list,
                reference_populations=project.get_reference_population_slugs(),
                vcf_id_map=vcf_id_map,
            )
def mendelian_variant_search_spec(request):
    """
    Return the results of a stored mendelian variant search.

    Query-string parameters read here:
      search_hash -- key used to look up the cached search spec / results
      return_type -- 'json' (also the default when absent/empty) or 'csv'

    If the cache holds no variants for the search, they are recomputed from
    the cached search spec. JSON responses carry the variants and the search
    spec; CSV responses are sent as an attachment with one row per variant.
    Any other return_type yields a JSON error payload instead of falling
    through and returning None from the view.
    """
    project, family = get_project_and_family_for_user(request.user, request.GET)

    search_hash = request.GET.get('search_hash')
    search_spec_dict, variants = cache_utils.get_cached_results(project.project_id, search_hash)
    search_spec = MendelianVariantSearchSpec.fromJSON(search_spec_dict)
    if variants is None:
        variants = api_utils.calculate_mendelian_variant_search(search_spec, family.xfamily())
    else:
        variants = [Variant.fromJSON(v) for v in variants]
    add_extra_info_to_variants_family(get_reference(), family, variants)

    return_type = request.GET.get('return_type')
    if return_type == 'json' or not return_type:
        return JSONResponse({
            'is_error': False,
            'variants': [v.toJSON() for v in variants],
            'search_spec': search_spec_dict,
        })

    if return_type == 'csv':
        response = HttpResponse(content_type='text/csv')
        response['Content-Disposition'] = 'attachment; filename="results_{}.csv"'.format(search_hash)
        writer = csv.writer(response)
        indiv_ids = family.indiv_ids_with_variant_data()
        # fetch the mall once rather than once per row
        project_mall = get_mall(project.project_id)
        headers = xbrowse_displays.get_variant_display_headers(project_mall, project, indiv_ids)
        writer.writerow(headers)
        for variant in variants:
            fields = xbrowse_displays.get_display_fields_for_variant(project_mall, project, variant, indiv_ids)
            writer.writerow(fields)
        return response

    # a view must always return a response; report unknown types explicitly
    return JSONResponse({
        'is_error': True,
        'error': 'Unsupported return_type: %s' % return_type,
    })
def handle(self, *args, **options):
    """
    Delete stored data for every project id passed as a positional argument.

    For each id the project is looked up, then its entries are removed from
    the variant store and from the project datastore.
    """
    for project_id in args:
        print("Deleting data from mongodb for project: " + project_id)
        project = Project.objects.get(project_id=project_id)

        variant_store = get_mall(project).variant_store
        variant_store.delete_project(project_id)

        project_datastore = get_project_datastore(project)
        project_datastore.delete_project_store(project_id)
    print("Done")
def mendelian_variant_search_spec(request):
    """
    Return the results of a stored mendelian variant search.

    Query-string parameters read here:
      search_hash -- key used to look up the cached search spec / results
      return_type -- 'json' (also the default when absent/empty) or 'csv'

    If the cache holds no variants for the search, they are recomputed from
    the cached search spec. JSON responses carry the variants and the search
    spec; CSV responses are sent as an attachment with one row per variant.
    Any other return_type yields a JSON error payload instead of falling
    through and returning None from the view.
    """
    project, family = get_project_and_family_for_user(request.user, request.GET)

    # TODO: use form

    search_hash = request.GET.get('search_hash')
    search_spec_dict, variants = cache_utils.get_cached_results(project.project_id, search_hash)
    search_spec = MendelianVariantSearchSpec.fromJSON(search_spec_dict)
    if variants is None:
        variants = api_utils.calculate_mendelian_variant_search(search_spec, family.xfamily())
    else:
        variants = [Variant.fromJSON(v) for v in variants]
    add_extra_info_to_variants_family(get_reference(), family, variants)

    return_type = request.GET.get('return_type')
    if return_type == 'json' or not return_type:
        return JSONResponse({
            'is_error': False,
            'variants': [v.toJSON() for v in variants],
            'search_spec': search_spec_dict,
        })

    if return_type == 'csv':
        response = HttpResponse(content_type='text/csv')
        response['Content-Disposition'] = 'attachment; filename="results_{}.csv"'.format(search_hash)
        writer = csv.writer(response)
        indiv_ids = family.indiv_ids_with_variant_data()
        # fetch the mall once rather than once per row
        project_mall = get_mall(project.project_id)
        headers = xbrowse_displays.get_variant_display_headers(project_mall, project, indiv_ids)
        writer.writerow(headers)
        for variant in variants:
            fields = xbrowse_displays.get_display_fields_for_variant(project_mall, project, variant, indiv_ids)
            writer.writerow(fields)
        return response

    # a view must always return a response; report unknown types explicitly
    return JSONResponse({
        'is_error': True,
        'error': 'Unsupported return_type: %s' % return_type,
    })
def handle(self, *args, **options):
    """
    Remove the stored data of each project named in the positional arguments.

    Per project id: fetch the Project, delete its records from the variant
    store, then delete its project datastore.
    """
    for project_id in args:
        print("Deleting data from mongodb for project: " + project_id)
        target = Project.objects.get(project_id=project_id)

        store = get_mall(target).variant_store
        store.delete_project(project_id)

        datastore = get_project_datastore(target)
        datastore.delete_project_store(project_id)
    print("Done")
def load_variants_for_family_list(project, families, vcf_file, mark_as_loaded=True):
    """
    Reload variants for a batch of families that all come from one vcf.

    The families are registered with the variant store as a set, the vcf is
    loaded for that set, and each family is post-processed afterwards.
    `mark_as_loaded` is passed through to the variant store's loader.
    """
    family_list = [
        {
            'project_id': family.project.project_id,
            'family_id': family.family_id,
            'individuals': family.indiv_ids_with_variant_data(),
        }
        for family in families
    ]

    # register the whole batch with the datastore
    get_mall(project.project_id).variant_store.add_family_set(family_list)

    # VCF sample id -> individual id, for individuals that have a vcf_id
    vcf_id_map = {
        member.vcf_id: member.indiv_id
        for family in families
        for member in family.get_individuals()
        if member.vcf_id
    }

    # load the vcf for every (project, family) pair in the batch
    family_tuple_list = [(entry['project_id'], entry['family_id']) for entry in family_list]
    variant_store = get_mall(project.project_id).variant_store
    variant_store.load_family_set(
        vcf_file,
        family_tuple_list,
        reference_populations=project.get_reference_population_slugs(),
        vcf_id_map=vcf_id_map,
        mark_as_loaded=mark_as_loaded,
    )

    # finish up each family
    for family in families:
        _family_postprocessing(family)
def delete_project(project_id):
    """
    Delete a project together with its stored variants, individuals and families.

    The variant store is cleared first; then the individual and family rows
    attached to the project are deleted, and finally the project itself.
    """
    project = Project.objects.get(project_id=project_id)

    variant_store = get_mall(project_id).variant_store
    variant_store.delete_project(project_id)

    individuals = project.individual_set.all()
    individuals.delete()
    families = project.family_set.all()
    families.delete()
    project.delete()
def handle(self, *args, **options):
    """
    Spot-check that a project's variants can be found by the annotator.

    Positional args are project ids; when none are given, every project is
    checked, in reverse of the order returned by Project.objects.all().
    The option `number_of_variants_to_check` (default 20000) sets how many
    variants are examined before a verdict is printed.

    For each project, variants are read from its VCF files (a missing
    `.vcf` path falls back to `.vcf.gz`) and looked up in the annotator; a
    ValueError from the lookup counts the variant as not found. Once enough
    variants have been examined, a report with up to 30 example variants is
    printed if more than 10 were missing, and reading of the current VCF
    stops. Counters are kept per project, not per VCF. Projects with fewer
    variants than the threshold produce no report.
    """
    number_of_variants_to_check = int(options.get("number_of_variants_to_check") or 20000)

    if not args:
        # args arrives as a tuple; only the rebuilt list can be reversed
        args = [p.project_id for p in Project.objects.all()]
        args.reverse()

    for project_id in args:
        try:
            project = Project.objects.get(project_id=project_id)
        except Project.DoesNotExist:
            print("ERROR: Project not found. Skipping..")
            continue

        all_counter = 0
        not_found_counter = 0
        not_found_variants = []
        for vcf_file in project.get_all_vcf_files():
            path = vcf_file.file_path
            if not os.path.isfile(path) and path.endswith(".vcf"):
                path = path + ".gz"

            f = gzip.open(path) if path.endswith(".gz") else open(path)
            try:
                for variant in vcf_stuff.iterate_vcf(f):
                    all_counter += 1
                    try:
                        get_mall(project_id).annotator.get_annotation(variant.xpos, variant.ref, variant.alt)
                    except ValueError:
                        not_found_counter += 1
                        # keep only a handful of examples for the report
                        if len(not_found_variants) < 30:
                            chrom, pos = genomeloc.get_chr_pos(variant.xpos)
                            chrom = chrom.replace("chr", "")
                            not_found_variants.append(
                                "%s-%s-%s-%s" % (chrom, pos, variant.ref, variant.alt))

                    if all_counter >= number_of_variants_to_check:
                        if not_found_counter > 10:
                            # scale to a percentage: the message prints a '%' sign
                            percent_missing = 100.0 * not_found_counter / all_counter
                            print("---- ERROR: (%0.2f%%) %s / %s variants not found. Project %s should be reloaded. Examples: " % (
                                percent_missing, not_found_counter, all_counter, project_id))
                            for v in not_found_variants:
                                print("http://exac.broadinstitute.org/variant/" + v)
                        break
            finally:
                # always release the file handle, including on early break
                f.close()
def delete_family(project_id, family_id):
    """
    Delete a family and clean up after it.

    Each individual of the family is first detached (family set to None),
    then the family's data is removed from the variant store, and finally
    the family model itself is deleted.
    """
    family = Family.objects.get(project__project_id=project_id, family_id=family_id)

    for member in family.get_individuals():
        update_xbrowse_model(member, family=None)

    variant_store = get_mall(project_id).variant_store
    variant_store.delete_family(project_id, family_id)

    delete_xbrowse_model(family)
def delete_family(project_id, family_id):
    """
    Delete a family and clean up after it.

    Each individual of the family is first detached and saved, then the
    family's data is removed from the variant store, and finally the family
    row itself is deleted.
    """
    family = Family.objects.get(project__project_id=project_id, family_id=family_id)

    for member in family.get_individuals():
        member.family = None
        member.save()

    variant_store = get_mall(project_id).variant_store
    variant_store.delete_family(project_id, family_id)

    family.delete()
def delete_project(project_id, delete_data=False):
    """
    Delete a project along with its individuals and families.

    When `delete_data` is true, the project's datastore and its entries in
    the variant store are removed as well before the database rows go.
    """
    print("Deleting %s" % project_id)
    project = Project.objects.get(project_id=project_id)

    if delete_data:
        project_datastore = get_project_datastore(project_id)
        project_datastore.delete_project_store(project_id)
        variant_store = get_mall(project_id).variant_store
        variant_store.delete_project(project_id)

    individuals = project.individual_set.all()
    individuals.delete()
    families = project.family_set.all()
    families.delete()
    project.delete()

    print("Successfully deleted %s" % project_id)
def load_project_variants(project_id, force_load_annotations=False, force_load_variants=False, ignore_csq_in_vcf=False, start_from_chrom=None, end_with_chrom=None):
    """
    Load any families and cohorts in this project that aren't loaded already.

    First every existing VCF file of the project (in path order) is added to
    the annotator; a VCF without a CSQ info field raises ValueError unless
    `ignore_csq_in_vcf` is set. Families are then loaded per VCF in batches
    of settings.FAMILY_LOAD_BATCH_SIZE, skipping families whose status is
    already 'loaded' unless `force_load_variants` is set. Cohorts are loaded
    last. The chrom bounds are passed through to the loaders.
    """
    print("Loading project %s" % project_id)
    print(date.strftime(datetime.now(), "%m/%d/%Y %H:%M:%S -- loading project: " + project_id + " - db.variants cache"))
    project = Project.objects.get(project_id=project_id)

    vcf_objs = sorted(project.get_all_vcf_files(), key=lambda v: v.path())
    for vcf_obj in vcf_objs:
        vcf_path = vcf_obj.path()
        if not os.path.isfile(vcf_path):
            print("Skipping " + vcf_path)
            continue

        reader = vcf.VCFReader(filename=vcf_path)
        if not (ignore_csq_in_vcf or "CSQ" in reader.infos):
            raise ValueError("VEP annotations not found in VCF: " + vcf_path)

        mall.get_annotator().add_preannotated_vcf_file(
            vcf_path,
            force=force_load_annotations,
            start_from_chrom=start_from_chrom,
            end_with_chrom=end_with_chrom)

    # batch load families by VCF file
    for vcf_file, families in project.families_by_vcf().items():
        if not force_load_variants:
            # drop families that have already finished loading
            families = [
                family for family in families
                if get_mall(project).variant_store.get_family_status(project_id, family.family_id) != 'loaded'
            ]

        batch_size = settings.FAMILY_LOAD_BATCH_SIZE
        for offset in xrange(0, len(families), batch_size):
            batch = families[offset:offset + batch_size]
            load_variants_for_family_list(
                project,
                batch,
                vcf_file,
                start_from_chrom=start_from_chrom,
                end_with_chrom=end_with_chrom)

    # now load cohorts
    load_cohorts(project_id)
def load_project_variants(project_id, force_annotations=False, ignore_csq_in_vcf=False):
    """
    Load any families and cohorts in this project that aren't loaded already.

    Every VCF file of the project is first added to the annotator
    (`force_annotations` is passed through as `force`). A VCF without a CSQ
    info field raises ValueError naming that file, unless
    `ignore_csq_in_vcf` is set. Families whose status is not 'loaded' are
    then loaded per VCF in batches of settings.FAMILY_LOAD_BATCH_SIZE, and
    cohorts are loaded last.
    """
    print("Loading project %s" % project_id)
    print(date.strftime(datetime.now(), "%m/%d/%Y %H:%M:%S -- loading project: " + project_id + " - db.variants cache"))
    project = Project.objects.get(project_id=project_id)

    for vcf_obj in project.get_all_vcf_files():
        r = vcf.VCFReader(filename=vcf_obj.path())
        if not ignore_csq_in_vcf and "CSQ" not in r.infos:
            # report the file being inspected; `vcf_file` is not bound in this loop
            raise ValueError("VEP annotations not found in VCF: " + vcf_obj.path())

        mall.get_annotator().add_preannotated_vcf_file(vcf_obj.path(), force=force_annotations)

    # batch load families by VCF file
    for vcf_file, families in project.families_by_vcf().items():
        # skip families that have already finished loading
        families = [
            f for f in families
            if get_mall(project.project_id).variant_store.get_family_status(project_id, f.family_id) != 'loaded'
        ]
        for i in xrange(0, len(families), settings.FAMILY_LOAD_BATCH_SIZE):
            batch = families[i:i + settings.FAMILY_LOAD_BATCH_SIZE]
            print(date.strftime(datetime.now(), "%m/%d/%Y %H:%M:%S -- loading project: " + project_id + " - families batch %d - %d families" % (i, len(batch))))
            load_variants_for_family_list(project, batch, vcf_file)

    # now load cohorts
    load_cohorts(project_id)
def combine_mendelian_families_variants(request):
    """
    Return, per selected family, the variants found in one gene.

    The query string is validated with CombineMendelianFamiliesVariantsForm.
    An invalid form produces a JSON error payload; otherwise the response
    lists, for each family in the form, its ids, display name and the
    serialized variants.
    """
    project, family_group = utils.get_project_and_family_group_for_user(request.user, request.GET)

    form = api_forms.CombineMendelianFamiliesVariantsForm(request.GET)
    if not form.is_valid():
        return JSONResponse({
            'is_error': True,
            'error': server_utils.form_error_string(form)
        })

    cleaned = form.cleaned_data
    variants_grouped = get_variants_by_family_for_gene(
        get_mall(project.project_id),
        [selected.xfamily() for selected in cleaned['families']],
        cleaned['inheritance_mode'],
        cleaned['gene_id'],
        variant_filter=cleaned['variant_filter'],
        quality_filter=cleaned['quality_filter']
    )

    variants_by_family = []
    for family in cleaned['families']:
        group_key = (family.project.project_id, family.family_id)
        family_variants = variants_grouped[group_key]
        add_extra_info_to_variants_family(get_reference(), family, family_variants)
        variants_by_family.append({
            'project_id': family.project.project_id,
            'family_id': family.family_id,
            'family_name': str(family),
            'variants': [v.toJSON() for v in family_variants],
        })

    return JSONResponse({
        'is_error': False,
        'variants_by_family': variants_by_family,
    })
def get_variants_for_inheritance_for_project(project, inheritance_mode):
    """
    Generate the variants for this project / inheritance combo.

    Yields one (family, list of variants) pair per family in the project.
    Progress is written to stdout; the variant count for a family is printed
    once the consumer asks for the next pair.
    """
    # build the search specification
    # this could theoretically differ by project, if there are different reference populations
    variant_filter = get_default_variant_filter("moderate_impact")
    for ref_freq in (
        ("1kg_wgs_phase3", g1k_freq_threshold),
        ("1kg_wgs_phase3_popmax", g1k_popmax_freq_threshold),
        ("exac_v3", exac_freq_threshold),
        ("exac_v3_popmax", exac_popmax_threshold),
    ):
        variant_filter.ref_freqs.append(ref_freq)

    quality_filter = {
        "vcf_filter": "pass",
        "min_gq": GQ_threshold,
        "min_ab": AB_threshold,
    }

    # run the inheritance search family by family
    families = project.get_families()
    for position, family in enumerate(families, 1):
        progress = "Processing %s - family %s (%d / %d) .." % (
            inheritance_mode, family.family_id, position, len(families))
        sys.stdout.write(progress)

        matches = get_variants_with_inheritance_mode(
            get_mall(project.project_id),
            family.xfamily(),
            inheritance_mode,
            variant_filter=variant_filter,
            quality_filter=quality_filter,
        )
        variant_list = list(matches)
        yield family, variant_list
        print(" got %d variants" % len(variant_list))
def get_variants_for_inheritance_for_project(project, inheritance_mode):
    """
    Generate the variants for this project / inheritance combo.

    Yields one (family, list of variants) pair per family in the project.
    Progress is written to stdout; the variant count for a family is printed
    once the consumer asks for the next pair.
    """
    # build the search specification
    # this could theoretically differ by project, if there are different reference populations
    variant_filter = get_default_variant_filter('moderate_impact')
    frequency_limits = (
        ('1kg_wgs_phase3', g1k_freq_threshold),
        ('1kg_wgs_phase3_popmax', g1k_popmax_freq_threshold),
        ('exac_v3', exac_freq_threshold),
        ('exac_v3_popmax', exac_popmax_threshold),
    )
    for limit in frequency_limits:
        variant_filter.ref_freqs.append(limit)

    quality_filter = {'vcf_filter': 'pass', 'min_gq': GQ_threshold, 'min_ab': AB_threshold}

    # run the inheritance search family by family
    families = project.get_families()
    for index, family in enumerate(families):
        sys.stdout.write("Processing %s - family %s (%d / %d) .." % (
            inheritance_mode, family.family_id, index + 1, len(families)))

        found = get_variants_with_inheritance_mode(
            get_mall(project.project_id),
            family.xfamily(),
            inheritance_mode,
            variant_filter=variant_filter,
            quality_filter=quality_filter,
        )
        variant_list = list(found)
        yield family, variant_list
        print(" got %d variants" % len(variant_list))
def get_variants_in_gene(family_group, gene_id, variant_filter=None, quality_filter=None):
    """
    Collect, for each family in the group, its variants in the given gene.

    Returns a list with one dict per family holding the serialized variants
    and the family's id, project id and display name. `quality_filter` is
    accepted but not used in this function.
    """
    results = []
    for family in family_group.get_families():
        project_id = family.project.project_id
        variant_store = get_mall(project_id).variant_store

        in_gene = list(variant_store.get_variants_in_gene(
            project_id,
            family.family_id,
            gene_id,
            variant_filter=variant_filter))
        in_gene = search_utils.filter_gene_variants_by_variant_filter(in_gene, gene_id, variant_filter)
        add_extra_info_to_variants_family(get_reference(), family, in_gene)

        results.append({
            'variants': [variant.toJSON() for variant in in_gene],
            'family_id': family.family_id,
            'project_id': family.project.project_id,
            'family_name': str(family),
        })
    return results
def handle(self, *args, **options):
    """
    Generate a report for individuals of one project.

    args[0] is the project id; any further args are individual ids. With no
    individual ids, every individual of the project is processed. Unknown
    project or individual ids end the process via sys.exit. Individuals
    whose family has no collection in the variant datastore are skipped
    with a warning.
    """
    if not args:
        print("Please provide the project_id. The individual_id(s) are optional")
        return

    project_id = args[0]
    try:
        project = Project.objects.get(project_id=project_id)
    except ObjectDoesNotExist:
        sys.exit("Invalid project id: " + project_id)

    requested_ids = args[1:]
    try:
        if requested_ids:
            individuals = [
                Individual.objects.get(project=project, indiv_id=indiv_id)
                for indiv_id in requested_ids
            ]
        else:
            individuals = list(Individual.objects.filter(project=project))
    except ObjectDoesNotExist:
        sys.exit("Invalid individual ids: " + str(requested_ids))

    for individual in individuals:
        variant_store = get_mall(project_id).variant_store
        family_collection = variant_store._get_family_collection(project_id, individual.family.family_id)
        if family_collection is not None:
            self.handle_individual(project, individual)
        else:
            print("WARNING: Family %s data not loaded in variant datastore. Skipping individual %s." % (individual.family.family_id, individual))

    print("Finished generating report")
def load_cohorts(project_id):
    """
    Load every cohort of the project that is not yet marked 'loaded'.

    Cohorts are grouped by VCF file and handed to
    load_variants_for_cohort_list in batches of
    settings.FAMILY_LOAD_BATCH_SIZE.
    """
    project = Project.objects.get(project_id=project_id)
    for vcf_file, cohorts in project.cohorts_by_vcf().items():
        pending = [
            cohort for cohort in cohorts
            if get_mall(project).variant_store.get_family_status(project_id, cohort.cohort_id) != 'loaded'
        ]
        batch_size = settings.FAMILY_LOAD_BATCH_SIZE
        for offset in xrange(0, len(pending), batch_size):
            batch = pending[offset:offset + batch_size]
            print("Loading project %s - cohorts: %s" % (project_id, batch))
            load_variants_for_cohort_list(project, batch)
def handle(self, *args, **options):
    """
    Attach CNV calls from a tab-delimited file to a project's stored variants.

    Options read: 'project_id', 'cnv_filename', 'bed_files_directory'.
    Raises ValueError when the CNV file does not exist.

    The first line of the file names the columns; 'chr', 'start', 'end' and
    'sample' are used here. For each row the matching individual is looked
    up (indiv_id starting with the sample id, case-insensitively); rows
    whose lookup fails are skipped with a warning. The individual's
    cnv_bed_file is pointed at <bed_files_directory>/<sample>.bed, and the
    row is stored under genotypes.<indiv_id>.extras.cnvs for every variant
    in the project and family collections whose xpos lies in the CNV range.
    """
    project_id = options['project_id']
    print("Loading data into project: " + project_id)
    project = Project.objects.get(project_id=project_id)

    cnv_filename = options['cnv_filename']
    bed_files_directory = options['bed_files_directory']

    if not os.path.isfile(cnv_filename):
        raise ValueError("CNV file %s doesn't exist" % options['cnv_filename'])

    with open(cnv_filename) as cnv_file:
        header_fields = cnv_file.readline().rstrip('\n').split('\t')
        for line in cnv_file:
            values = line.rstrip('\n').split('\t')
            row_dict = dict(zip(header_fields, values))

            chrom = "chr" + row_dict['chr']
            start = int(row_dict['start'])
            end = int(row_dict['end'])
            sample_id = row_dict['sample']

            try:
                individual = Individual.objects.get(project=project, indiv_id__istartswith=sample_id)
            except Exception as e:
                print("WARNING: %s: %s not found in %s" % (e, sample_id, project))
                continue

            bed_file_path = os.path.join(bed_files_directory, "%s.bed" % sample_id)
            if not os.path.isfile(bed_file_path):
                print("WARNING: .bed file not found: " + bed_file_path)

            # the path is recorded even when the file is missing (warned above)
            if individual.cnv_bed_file != bed_file_path:
                print("Setting cnv_bed_file path to %s" % bed_file_path)
                individual.cnv_bed_file = bed_file_path
                individual.save()

            project_collection = get_project_datastore(project)._get_project_collection(project_id)
            family_collection = get_mall(project).variant_store._get_family_collection(project_id, individual.family.family_id)

            candidates = [project_collection, family_collection]
            for collection in [c for c in candidates if c]:
                xpos_in_range = {'$and': [
                    {'xpos': {'$gte': genomeloc.get_single_location(chrom, start)}},
                    {'xpos': {'$lte': genomeloc.get_single_location(chrom, end)}},
                ]}
                cnv_field = 'genotypes.%s.extras.cnvs' % individual.indiv_id
                collection.update_many(xpos_in_range, {'$set': {cnv_field: row_dict}})

    print("Done")
def load_project_variants(project_id, force_load_annotations=False, force_load_variants=False, ignore_csq_in_vcf=False, start_from_chrom=None, end_with_chrom=None):
    """
    Load any families and cohorts in this project that aren't loaded already.

    Every existing VCF file of the project (in path order) is first added to
    the annotator; a VCF without a CSQ info field raises ValueError unless
    `ignore_csq_in_vcf` is set. Families are then loaded per VCF in batches
    of settings.FAMILY_LOAD_BATCH_SIZE, skipping families whose status is
    already 'loaded' unless `force_load_variants` is set, with a timestamped
    line printed per batch. Cohorts are loaded last. The chrom bounds are
    passed through to the loaders.
    """
    print("Loading project %s" % project_id)
    print(date.strftime(datetime.now(), "%m/%d/%Y %H:%M:%S -- loading project: " + project_id + " - db.variants cache"))
    project = Project.objects.get(project_id=project_id)

    for vcf_obj in sorted(project.get_all_vcf_files(), key=lambda v: v.path()):
        vcf_path = vcf_obj.path()
        if not os.path.isfile(vcf_path):
            print("Skipping " + vcf_path)
            continue

        reader = vcf.VCFReader(filename=vcf_path)
        if not (ignore_csq_in_vcf or "CSQ" in reader.infos):
            raise ValueError("VEP annotations not found in VCF: " + vcf_path)

        annotator = mall.get_annotator()
        annotator.add_preannotated_vcf_file(
            vcf_path,
            force=force_load_annotations,
            start_from_chrom=start_from_chrom,
            end_with_chrom=end_with_chrom)

    # batch load families by VCF file
    for vcf_file, families in project.families_by_vcf().items():
        if not force_load_variants:
            # drop families that have already finished loading
            families = [
                family for family in families
                if get_mall(project.project_id).variant_store.get_family_status(project_id, family.family_id) != 'loaded'
            ]

        batch_size = settings.FAMILY_LOAD_BATCH_SIZE
        for offset in xrange(0, len(families), batch_size):
            batch = families[offset:offset + batch_size]
            batch_label = " - families batch %d - %d families" % (offset, len(batch))
            print(date.strftime(datetime.now(), "%m/%d/%Y %H:%M:%S -- loading project: " + project_id + batch_label))
            load_variants_for_family_list(
                project,
                batch,
                vcf_file,
                start_from_chrom=start_from_chrom,
                end_with_chrom=end_with_chrom)

    # now load cohorts
    load_cohorts(project_id)
def calculate_mendelian_variant_search(search_spec, xfamily):
    """
    Run the search described by `search_spec` on one family.

    Dispatches on search_spec.search_mode ('standard_inheritance',
    'custom_inheritance', 'gene_burden', 'allele_count', 'all_variants') and
    returns the matching variants as a list. An unrecognised mode returns
    None.
    """
    mode = search_spec.search_mode

    if mode == 'standard_inheritance':
        return list(get_variants_with_inheritance_mode(
            get_mall(),
            xfamily,
            search_spec.inheritance_mode,
            variant_filter=search_spec.variant_filter,
            quality_filter=search_spec.genotype_quality_filter,
        ))

    if mode == 'custom_inheritance':
        return list(get_variants_family(
            get_datastore(),
            xfamily,
            genotype_filter=search_spec.genotype_inheritance_filter,
            variant_filter=search_spec.variant_filter,
            quality_filter=search_spec.genotype_quality_filter,
        ))

    if mode == 'gene_burden':
        gene_stream = get_genes_family(
            get_datastore(),
            get_reference(),
            xfamily,
            burden_filter=search_spec.gene_burden_filter,
            variant_filter=search_spec.variant_filter,
            quality_filter=search_spec.genotype_quality_filter,
        )
        variant_stream = stream_utils.gene_stream_to_variant_stream(gene_stream, get_reference())
        return list(variant_stream)

    if mode == 'allele_count':
        return list(get_variants_allele_count(
            get_datastore(),
            xfamily,
            search_spec.allele_count_filter,
            variant_filter=search_spec.variant_filter,
            quality_filter=search_spec.genotype_quality_filter,
        ))

    if mode == 'all_variants':
        return list(get_variants_family(
            get_datastore(),
            xfamily,
            variant_filter=search_spec.variant_filter,
            quality_filter=search_spec.genotype_quality_filter,
        ))

    # no known search mode matched
    return None
def load_variants_for_family_list(project, families, vcf_file, mark_as_loaded=True, start_from_chrom=None, end_with_chrom=None):
    """
    Reload variants for a batch of families that all come from one vcf.

    The families are registered with the variant store as a set, the vcf is
    loaded for that set, and each family is post-processed afterwards.
    `mark_as_loaded` and the chrom bounds are passed through to the variant
    store's loader.
    """
    family_list = [
        {
            'project_id': family.project.project_id,
            'family_id': family.family_id,
            'individuals': family.indiv_ids_with_variant_data(),
        }
        for family in families
    ]

    # register the whole batch with the datastore
    get_mall(project.project_id).variant_store.add_family_set(family_list)

    # VCF sample id -> individual id, for individuals that have a vcf_id
    vcf_id_map = {
        member.vcf_id: member.indiv_id
        for family in families
        for member in family.get_individuals()
        if member.vcf_id
    }

    # load the vcf for every (project, family) pair in the batch
    family_tuple_list = [(entry['project_id'], entry['family_id']) for entry in family_list]
    variant_store = get_mall(project.project_id).variant_store
    variant_store.load_family_set(
        vcf_file,
        family_tuple_list,
        reference_populations=project.get_reference_population_slugs(),
        vcf_id_map=vcf_id_map,
        mark_as_loaded=mark_as_loaded,
        start_from_chrom=start_from_chrom,
        end_with_chrom=end_with_chrom,
    )

    # finish up each family
    for family in families:
        _family_postprocessing(family)
def load_cohorts(project_id):
    """
    Load every cohort of the project that is not yet marked 'loaded'.

    Cohorts are grouped by VCF file and handed to
    load_variants_for_cohort_list in batches of
    settings.FAMILY_LOAD_BATCH_SIZE. A timestamped line is printed when
    loading starts and when it has finished.
    """
    start_format = "%m/%d/%Y %H:%M:%S -- loading project: " + project_id + " - cohorts"
    print(date.strftime(datetime.now(), start_format))

    project = Project.objects.get(project_id=project_id)
    for vcf_file, cohorts in project.cohorts_by_vcf().items():
        pending = [
            cohort for cohort in cohorts
            if get_mall(project.project_id).variant_store.get_family_status(project_id, cohort.cohort_id) != 'loaded'
        ]
        batch_size = settings.FAMILY_LOAD_BATCH_SIZE
        for offset in xrange(0, len(pending), batch_size):
            batch = pending[offset:offset + batch_size]
            print("Loading project %s - cohorts: %s" % (project_id, batch))
            load_variants_for_cohort_list(project, batch)

    done_format = "%m/%d/%Y %H:%M:%S -- finished loading project: " + project_id
    print(date.strftime(datetime.now(), done_format))
def clean_project(project_id):
    """
    Clear this project's data from the xbrowse resources:
      - datastore
      - coverage store
      - cnv store

    The project's data links are left untouched, so nothing is lost, but
    everything has to be rebuilt afterwards.
    """
    project = Project.objects.get(project_id=project_id)
    individuals = project.get_individuals()

    # datastore
    variant_store = get_mall(project_id).variant_store
    variant_store.delete_project(project_id)

    # coverage store
    for member in individuals:
        sample_id = member.get_coverage_store_id()
        get_coverage_store().remove_sample(sample_id)

    # cnv store (samples are keyed by the coverage store id here as well)
    for member in individuals:
        sample_id = member.get_coverage_store_id()
        get_cnv_store().remove_sample(sample_id)
def calculate_mendelian_variant_search(search_spec, xfamily):
    """
    Run the search described by `search_spec` on one family.

    Logs the request to stderr, then dispatches on search_spec.search_mode
    ('standard_inheritance', 'custom_inheritance', 'gene_burden',
    'allele_count', 'all_variants') and returns the matching variants as a
    list. An unrecognised mode returns None.
    """
    sys.stderr.write(" mendelian_variant_search for %s - search mode: %s %s\n" % (
        xfamily.project_id, search_spec.search_mode, search_spec.__dict__))

    mode = search_spec.search_mode

    if mode == 'standard_inheritance':
        return list(get_variants_with_inheritance_mode(
            get_mall(xfamily.project_id),
            xfamily,
            search_spec.inheritance_mode,
            variant_filter=search_spec.variant_filter,
            quality_filter=search_spec.quality_filter,
        ))

    if mode == 'custom_inheritance':
        return list(get_variants_family(
            get_datastore(xfamily.project_id),
            xfamily,
            genotype_filter=search_spec.genotype_inheritance_filter,
            variant_filter=search_spec.variant_filter,
            quality_filter=search_spec.quality_filter,
        ))

    if mode == 'gene_burden':
        gene_stream = get_genes_family(
            get_datastore(xfamily.project_id),
            get_reference(),
            xfamily,
            burden_filter=search_spec.gene_burden_filter,
            variant_filter=search_spec.variant_filter,
            quality_filter=search_spec.quality_filter,
        )
        variant_stream = stream_utils.gene_stream_to_variant_stream(gene_stream, get_reference())
        return list(variant_stream)

    if mode == 'allele_count':
        return list(get_variants_allele_count(
            get_datastore(xfamily.project_id),
            xfamily,
            search_spec.allele_count_filter,
            variant_filter=search_spec.variant_filter,
            quality_filter=search_spec.quality_filter,
        ))

    if mode == 'all_variants':
        return list(get_variants_family(
            get_datastore(xfamily.project_id),
            xfamily,
            variant_filter=search_spec.variant_filter,
            quality_filter=search_spec.quality_filter,
            indivs_to_consider=xfamily.indiv_id_list(),
        ))

    # no known search mode matched
    return None
def get_variants_for_inheritance_for_project(project, inheritance_mode):
    """
    Generate the variants for this project / inheritance combo.

    Yields one (family, list of variants) pair per family in the project.
    With inheritance_mode "all_variants" the plain variant search is used;
    any other mode goes through the inheritance-mode search. A ValueError
    while handling a family is reported and that family is skipped.
    """
    # build the search specification
    # this could theoretically differ by project, if there are different reference populations
    variant_filter = get_default_variant_filter('moderate_impact')
    for ref_freq in (
        ('1kg_wgs_phase3', g1k_freq_threshold),
        ('1kg_wgs_phase3_popmax', g1k_popmax_freq_threshold),
        ('exac_v3', exac_freq_threshold),
        ('exac_v3_popmax', exac_popmax_threshold),
        ('merck-wgs-3793', merck_wgs_3793_threshold),
    ):
        variant_filter.ref_freqs.append(ref_freq)

    # no 'vcf_filter' entry: the VCF FILTER column is not restricted here
    quality_filter = {'min_gq': GQ_threshold, 'min_ab': AB_threshold}

    # run the search family by family
    families = project.get_families()
    for position, family in enumerate(families, 1):
        print("Processing %s - family %s (%d / %d)" % (inheritance_mode, family.family_id, position, len(families)))
        try:
            if inheritance_mode == "all_variants":
                found = list(get_variants(
                    get_datastore(project.project_id),
                    family.xfamily(),
                    variant_filter=variant_filter,
                    quality_filter=quality_filter,
                    indivs_to_consider=family.indiv_id_list()))
            else:
                found = list(get_variants_with_inheritance_mode(
                    get_mall(project.project_id),
                    family.xfamily(),
                    inheritance_mode,
                    variant_filter=variant_filter,
                    quality_filter=quality_filter,
                ))
            yield family, found
        except ValueError as e:
            print("Error: %s. Skipping family %s" % (str(e), str(family)))
def load_cohorts(project_id):
    """
    Load the project's cohorts whose status is not yet 'loaded'.

    Works VCF by VCF, passing the remaining cohorts to
    load_variants_for_cohort_list in slices of
    settings.FAMILY_LOAD_BATCH_SIZE.
    """
    project = Project.objects.get(project_id=project_id)
    cohorts_by_vcf = project.cohorts_by_vcf()
    for vcf_file, cohorts in cohorts_by_vcf.items():
        not_loaded = []
        for cohort in cohorts:
            status = get_mall(project).variant_store.get_family_status(project_id, cohort.cohort_id)
            if status != 'loaded':
                not_loaded.append(cohort)

        step = settings.FAMILY_LOAD_BATCH_SIZE
        for begin in xrange(0, len(not_loaded), step):
            chunk = not_loaded[begin:begin + step]
            print("Loading project %s - cohorts: %s" % (project_id, chunk))
            load_variants_for_cohort_list(project, chunk)
def inheritance_matrix_for_gene(project, gene_id):
    """
    Run get_family_matrix_for_gene over the active families of this project,
    using the default 'moderate_impact' variant filter and 'high_quality'
    quality filter.
    """
    variant_slugs = mall.get_annotator().reference_population_slugs
    variant_filter = get_default_variant_filter('moderate_impact', variant_slugs)

    quality_slugs = mall.get_annotator().reference_population_slugs
    quality_filter = get_default_quality_filter('high_quality', quality_slugs)

    family_mall = get_mall()
    active_xfamilies = [family.xfamily() for family in project.get_active_families()]
    return get_family_matrix_for_gene(
        family_mall,
        active_xfamilies,
        gene_id,
        variant_filter,
        quality_filter
    )
def inheritance_matrix_for_gene(project, gene_id):
    """
    Run get_family_matrix_for_gene over the active families of this project,
    using the default 'moderate_impact' variant filter and 'high_quality'
    quality filter.
    """
    variant_slugs = mall.get_annotator().reference_population_slugs
    variant_filter = get_default_variant_filter('moderate_impact', variant_slugs)

    quality_slugs = mall.get_annotator().reference_population_slugs
    quality_filter = get_default_quality_filter('high_quality', quality_slugs)

    project_mall = get_mall(project.project_id)
    active_xfamilies = [family.xfamily() for family in project.get_active_families()]
    return get_family_matrix_for_gene(
        project_mall,
        active_xfamilies,
        gene_id,
        variant_filter,
        quality_filter)
def load_project_variants(project_id, force_annotations=False):
    """
    Load any families and cohorts in this project that aren't loaded already.

    Every VCF file of the project is first added to the annotator
    (`force_annotations` is passed as `force_all`). Families, then cohorts,
    whose status is not 'loaded' are loaded per VCF in batches of
    settings.FAMILY_LOAD_BATCH_SIZE.
    """
    print("Loading project %s" % project_id)
    project = Project.objects.get(project_id=project_id)

    for vcf_obj in project.get_all_vcf_files():
        annotator = mall.get_annotator()
        annotator.add_vcf_file_to_annotator(vcf_obj.path(), force_all=force_annotations)

    # batch load families by VCF file
    for vcf_file, families in project.families_by_vcf().items():
        pending_families = [
            family for family in families
            if get_mall().variant_store.get_family_status(project_id, family.family_id) != 'loaded'
        ]
        batch_size = settings.FAMILY_LOAD_BATCH_SIZE
        for offset in xrange(0, len(pending_families), batch_size):
            load_variants_for_family_list(project, pending_families[offset:offset + batch_size], vcf_file)

    # now load cohorts
    # TODO: load cohorts and families together
    for vcf_file, cohorts in project.cohorts_by_vcf().items():
        pending_cohorts = [
            cohort for cohort in cohorts
            if get_mall().variant_store.get_family_status(project_id, cohort.cohort_id) != 'loaded'
        ]
        batch_size = settings.FAMILY_LOAD_BATCH_SIZE
        for offset in xrange(0, len(pending_cohorts), batch_size):
            load_variants_for_cohort_list(project, pending_cohorts[offset:offset + batch_size], vcf_file)

    print("Finished loading project %s!" % project_id)
def get_variants_for_inheritance_for_project(project, inheritance_mode):
    """
    Generate the variants for this project / inheritance combo.

    Yields one (family, list of variants) pair per family in the project.
    With inheritance_mode "all_variants" the plain variant search is used;
    any other mode goes through the inheritance-mode search. A ValueError
    while handling a family is reported and that family is skipped.
    """
    # build the search specification
    # this could theoretically differ by project, if there are different reference populations
    variant_filter = get_default_variant_filter('moderate_impact')
    frequency_limits = (
        ('1kg_wgs_phase3', g1k_freq_threshold),
        ('1kg_wgs_phase3_popmax', g1k_popmax_freq_threshold),
        ('exac_v3', exac_freq_threshold),
        ('exac_v3_popmax', exac_popmax_threshold),
        ('merck-wgs-3793', merck_wgs_3793_threshold),
    )
    for limit in frequency_limits:
        variant_filter.ref_freqs.append(limit)

    # no 'vcf_filter' entry: the VCF FILTER column is not restricted here
    quality_filter = {
        'min_gq': GQ_threshold,
        'min_ab': AB_threshold,
    }

    # run the search family by family
    families = project.get_families()
    for index, family in enumerate(families):
        print("Processing %s - family %s (%d / %d)" % (inheritance_mode, family.family_id, index + 1, len(families)))
        try:
            if inheritance_mode == "all_variants":
                matches = get_variants(
                    get_datastore(project.project_id),
                    family.xfamily(),
                    variant_filter=variant_filter,
                    quality_filter=quality_filter,
                    indivs_to_consider=family.indiv_id_list()
                )
            else:
                matches = get_variants_with_inheritance_mode(
                    get_mall(project.project_id),
                    family.xfamily(),
                    inheritance_mode,
                    variant_filter=variant_filter,
                    quality_filter=quality_filter,
                )
            yield family, list(matches)
        except ValueError as e:
            print("Error: %s. Skipping family %s" % (str(e), str(family)))
def get_gene_diangostic_info(family, gene_id, variant_filter=None):
    """
    Build a GeneDiagnosticInfo for one gene in one family, filling in its
    phenotype summary, sequencing summary, variants and CNVs.
    """
    info = GeneDiagnosticInfo(gene_id)

    reference = get_reference()
    info._gene_phenotype_summary = get_gene_phenotype_summary(reference, gene_id)

    coverage_store = get_coverage_store()
    info._gene_sequencing_summary = get_gene_sequencing_summary(coverage_store, family, gene_id)

    variant_store = get_mall(family.project).variant_store
    info._variants = get_diagnostic_search_variants_in_family(
        variant_store, family, gene_id, variant_filter)

    cnv_store = get_cnv_store()
    info._cnvs = get_diagnostic_search_cnvs_in_family(cnv_store, family, gene_id)

    return info
def get_gene_diangostic_info(family, gene_id, variant_filter=None):
    """
    Collect the diagnostic data for a gene within a family.

    Returns a GeneDiagnosticInfo whose phenotype summary, sequencing
    summary, variant list and CNV list have been populated.
    """
    result = GeneDiagnosticInfo(gene_id)

    # gather the four pieces in the same order they are stored
    phenotype_summary = get_gene_phenotype_summary(get_reference(), gene_id)
    sequencing_summary = get_gene_sequencing_summary(get_coverage_store(), family, gene_id)
    variants = get_diagnostic_search_variants_in_family(
        get_mall(family.project).variant_store,
        family,
        gene_id,
        variant_filter,
    )
    cnvs = get_diagnostic_search_cnvs_in_family(get_cnv_store(), family, gene_id)

    result._gene_phenotype_summary = phenotype_summary
    result._gene_sequencing_summary = sequencing_summary
    result._variants = variants
    result._cnvs = cnvs
    return result
def calculate_combine_mendelian_families(family_group, search_spec, user=None):
    """
    Compute combine-mendelian-families results for the given search_spec.
    Should be called after the cache is checked - this does all the computation.

    Returns a list of dicts, one per gene known to the reference, holding
    the gene info, its id and symbol, its location (None values when the
    bounds lookup raises KeyError) and the list of matching families.
    """
    xfamilygroup = family_group.xfamilygroup()
    families_by_gene = get_families_by_gene(
        get_mall(family_group.project),
        xfamilygroup,
        search_spec.inheritance_mode,
        search_spec.variant_filter,
        search_spec.quality_filter,
        user=user,
    )

    results = []
    for gene_id, family_id_list in families_by_gene:
        xgene = get_reference().get_gene(gene_id)
        if xgene is None:
            # gene is not in the reference; leave it out of the results
            continue

        chrom = start = end = None
        try:
            start_pos, end_pos = get_reference().get_gene_bounds(gene_id)
            chrom, start = genomeloc.get_chr_pos(start_pos)
            end = genomeloc.get_chr_pos(end_pos)[1]
        except KeyError:
            chrom = start = end = None

        results.append({
            'gene_info': xgene,
            'gene_id': gene_id,
            'gene_name': xgene['symbol'],
            'chr': chrom,
            'start': start,
            'end': end,
            'family_id_list': family_id_list,
        })

    return results
def get_variants_in_gene(family_group, gene_id, variant_filter=None, quality_filter=None):
    """
    For each family in the group, fetch the variants in gene_id, filter
    them with variant_filter and annotate them with extra family info.

    Returns a list with one dict per family: the variants as JSON plus the
    family id, project id and family name. quality_filter is accepted but
    not used here.
    """
    results = []
    for family in family_group.get_families():
        project_id = family.project.project_id
        variant_store = get_mall(project_id).variant_store
        in_gene = list(variant_store.get_variants_in_gene(
            family.project.project_id,
            family.family_id,
            gene_id,
            variant_filter=variant_filter,
        ))
        in_gene = search_utils.filter_gene_variants_by_variant_filter(in_gene, gene_id, variant_filter)
        add_extra_info_to_variants_family(get_reference(), family, in_gene)

        entry = {
            'variants': [v.toJSON() for v in in_gene],
            'family_id': family.family_id,
            'project_id': family.project.project_id,
            'family_name': str(family),
        }
        results.append(entry)
    return results
def handle(self, *args, **options):
    """
    Generate a report for individuals in a project.

    args[0] is the project id; any further args are individual ids. With no
    individual ids, every individual in the project is processed.
    Individuals whose family has no collection in the variant datastore are
    skipped with a warning. Exits the process on an unknown project or
    individual id.
    """
    if not args:
        print("Please provide the project_id. The individual_id(s) are optional")
        return

    project_id = args[0]
    requested_ids = args[1:]

    try:
        project = Project.objects.get(project_id=project_id)
    except ObjectDoesNotExist:
        sys.exit("Invalid project id: " + project_id)

    try:
        if requested_ids:
            individuals = []
            for requested_id in requested_ids:
                individuals.append(Individual.objects.get(project=project, indiv_id=requested_id))
        else:
            individuals = list(Individual.objects.filter(project=project))
    except ObjectDoesNotExist:
        sys.exit("Invalid individual ids: " + str(requested_ids))

    for individual in individuals:
        variant_store = get_mall(project_id).variant_store
        family_collection = variant_store._get_family_collection(project_id, individual.family.family_id)
        if family_collection is not None:
            self.handle_individual(project, individual)
        else:
            print("WARNING: Family %s data not loaded in variant datastore. Skipping individual %s." % (individual.family.family_id, individual))

    print("Finished generating report")
def load_cohorts(project_id):
    """
    Load every cohort of the project whose status in the variant store is
    not already 'loaded', in slices of settings.FAMILY_LOAD_BATCH_SIZE.

    Progress lines are printed with a "%m/%d/%Y %H:%M:%S" timestamp prefix.
    """
    timestamp_format = "%m/%d/%Y %H:%M:%S"

    # Only the timestamp goes through strftime; the project id is appended
    # afterwards so that a '%' inside it is never read as a format directive.
    print(datetime.now().strftime(timestamp_format) + " -- loading project: " + project_id + " - cohorts")

    project = Project.objects.get(project_id=project_id)
    batch_size = settings.FAMILY_LOAD_BATCH_SIZE

    for _vcf_file, cohorts in project.cohorts_by_vcf().items():
        # skip cohorts the variant store already reports as loaded
        cohorts = [
            c for c in cohorts
            if get_mall(project.project_id).variant_store.get_family_status(project_id, c.cohort_id) != 'loaded'
        ]
        for i in xrange(0, len(cohorts), batch_size):
            batch = cohorts[i:i + batch_size]
            print("Loading project %s - cohorts: %s" % (project_id, batch))
            load_variants_for_cohort_list(project, batch)

    print(datetime.now().strftime(timestamp_format) + " -- finished loading project: " + project_id)
def load_project_variants(project_id, force_annotations=False, ignore_csq_in_vcf=False):
    """
    Load any families and cohorts in this project that aren't loaded already.

    Each VCF is first registered with the annotator: as a pre-annotated file
    when its header declares a CSQ INFO field (unless ignore_csq_in_vcf is
    set), otherwise as a plain VCF. Families and then cohorts are loaded in
    slices of settings.FAMILY_LOAD_BATCH_SIZE. `du /mongo/mongodb` is run
    before the annotation and cohort phases to report disk usage.
    """
    def log(message):
        # Only the timestamp goes through strftime, so a '%' in the project
        # id (part of `message`) is never read as a format directive.
        print(datetime.now().strftime("%m/%d/%Y %H:%M:%S") + " -- " + message)

    print("Loading project %s" % project_id)
    log("loading project: " + project_id + " - db.variants cache")
    os.system("du /mongo/mongodb")

    project = Project.objects.get(project_id=project_id)
    batch_size = settings.FAMILY_LOAD_BATCH_SIZE

    for vcf_obj in project.get_all_vcf_files():
        r = vcf.VCFReader(filename=vcf_obj.path())
        if not ignore_csq_in_vcf and "CSQ" in r.infos:
            mall.get_annotator().add_preannotated_vcf_file(vcf_obj.path(), force=force_annotations)
        else:
            mall.get_annotator().add_vcf_file_to_annotator(vcf_obj.path(), force_all=force_annotations)

    # batch load families by VCF file
    for vcf_file, families in project.families_by_vcf().items():
        families = [
            f for f in families
            if get_mall(project.project_id).variant_store.get_family_status(project_id, f.family_id) != 'loaded'
        ]
        for i in xrange(0, len(families), batch_size):
            batch = families[i:i + batch_size]
            log("loading project: " + project_id + " - families batch %d - %d families" % (i, len(batch)))
            load_variants_for_family_list(project, batch, vcf_file)

    # now load cohorts
    # TODO: load cohorts and families together
    # (this message used to be printed twice in a row; once is enough)
    log("loading project: " + project_id + " - cohorts")
    os.system("du /mongo/mongodb")

    for vcf_file, cohorts in project.cohorts_by_vcf().items():
        cohorts = [
            c for c in cohorts
            if get_mall(project.project_id).variant_store.get_family_status(project_id, c.cohort_id) != 'loaded'
        ]
        for i in xrange(0, len(cohorts), batch_size):
            batch = cohorts[i:i + batch_size]
            print("Loading project %s - cohorts: %s" % (project_id, batch))
            load_variants_for_cohort_list(project, batch)

    log("finished loading project: " + project_id)
def combine_mendelian_families_spec(request):
    """
    Return the results of a cached combine-mendelian-families search.

    GET parameters read here: `search_hash`, `return_type` and
    `group_by_variants`.

    - return_type other than 'csv': JSON response with the gene list.
    - return_type 'csv' without group_by_variants: CSV with one row per gene.
    - return_type 'csv' with group_by_variants: CSV with one row per coding
      variant, followed by genotype / GQ / DP columns for every individual
      with variant data in the family group.

    Raises PermissionDenied when the user cannot view the project.
    """
    project, family_group = utils.get_project_and_family_group_for_user(request.user, request.GET)
    if not project.can_view(request.user):
        raise PermissionDenied

    search_hash = request.GET.get('search_hash')
    # search_spec is the cached JSON form; search_spec_obj is the parsed object
    search_spec, genes = cache_utils.get_cached_results(project.project_id, search_hash)
    search_spec_obj = MendelianVariantSearchSpec.fromJSON(search_spec)

    return_type = request.GET.get('return_type')

    if return_type != 'csv' or not request.GET.get('group_by_variants'):
        if genes is None:
            # The calculation reads attributes such as .inheritance_mode, so it
            # needs the parsed spec object rather than the raw JSON.
            genes = api_utils.calculate_combine_mendelian_families(family_group, search_spec_obj)
        api_utils.add_extra_info_to_genes(project, get_reference(), genes)

        if return_type != 'csv':
            return JSONResponse({
                'is_error': False,
                'genes': genes,
                'search_spec': search_spec,
            })

        response = HttpResponse(content_type='text/csv')
        response['Content-Disposition'] = 'attachment; filename="family_group_results_{}.csv"'.format(search_hash)
        writer = csv.writer(response)
        writer.writerow(["gene", "# families", "family list", "chrom", "start", "end"])
        for gene in genes:
            family_id_list = [family_id for (project_id, family_id) in gene["family_id_list"]]
            writer.writerow([str(value) for value in [
                gene["gene_name"],
                len(family_id_list),
                " ".join(family_id_list),
                gene["chr"],
                gene["start"],
                gene["end"],
                "",
            ]])
        return response

    # download results grouped by variant
    indiv_id_list = []
    for family in family_group.get_families():
        indiv_id_list.extend(family.indiv_ids_with_variant_data())

    response = HttpResponse(content_type='text/csv')
    response['Content-Disposition'] = 'attachment; filename="results_{}.csv"'.format(search_hash)
    writer = csv.writer(response)

    # fetched once so that the header and every row use the same columns
    ref_population_slugs = project.get_reference_population_slugs()

    headers = ['genes', 'chr', 'pos', 'ref', 'alt', 'worst_annotation']
    headers.extend(ref_population_slugs)
    headers.extend(['polyphen', 'sift', 'muttaster', 'fathmm'])
    for indiv_id in indiv_id_list:
        headers.append(indiv_id)
        headers.append(indiv_id + '_gq')
        headers.append(indiv_id + '_dp')
    writer.writerow(headers)

    mall = get_mall(project.project_id)
    variant_key_to_individual_id_to_variant = defaultdict(dict)
    variant_key_to_variant = {}
    for family in family_group.get_families():
        for variant in get_variants_with_inheritance_mode(
            mall,
            family.xfamily(),
            search_spec_obj.inheritance_mode,
            search_spec_obj.variant_filter,
            search_spec_obj.quality_filter,
        ):
            if len(variant.coding_gene_ids) == 0:
                continue
            variant_key = (variant.xpos, variant.ref, variant.alt)
            variant_key_to_variant[variant_key] = variant
            for indiv_id in family.indiv_ids_with_variant_data():
                variant_key_to_individual_id_to_variant[variant_key][indiv_id] = variant

    for variant_key in sorted(variant_key_to_individual_id_to_variant.keys()):
        variant = variant_key_to_variant[variant_key]
        individual_id_to_variant = variant_key_to_individual_id_to_variant[variant_key]

        gene_symbols = [mall.reference.get_gene_symbol(gene_id) for gene_id in variant.coding_gene_ids]

        fields = [','.join(gene_symbols)]
        fields.extend([
            variant.chr,
            str(variant.pos),
            variant.ref,
            variant.alt,
            variant.annotation.get('vep_group', '.'),
        ])
        for ref_population_slug in ref_population_slugs:
            fields.append(variant.annotation['freqs'][ref_population_slug])
        for field_key in ['polyphen', 'sift', 'muttaster', 'fathmm']:
            fields.append(variant.annotation[field_key])

        for indiv_id in indiv_id_list:
            # use a separate name so the row-level `variant` is not clobbered
            indiv_variant = individual_id_to_variant.get(indiv_id)
            genotype = None
            if indiv_variant is not None:
                genotype = indiv_variant.get_genotype(indiv_id)

            if genotype is None:
                fields.extend(['.', '.', '.'])
                continue

            if genotype.num_alt == 0:
                fields.append("%s/%s" % (indiv_variant.ref, indiv_variant.ref))
            elif genotype.num_alt == 1:
                fields.append("%s/%s" % (indiv_variant.ref, indiv_variant.alt))
            elif genotype.num_alt == 2:
                fields.append("%s/%s" % (indiv_variant.alt, indiv_variant.alt))
            else:
                fields.append("./.")

            fields.append(str(genotype.gq) if genotype.gq is not None else '.')
            dp = genotype.extras.get('dp')
            fields.append(dp if dp is not None else '.')
        writer.writerow(fields)

    return response
def handle(self, *args, **options):
    """
    Run an inheritance-mode search over a list of families and write three
    tab-separated summaries into the current directory.

    Positional args: project id, inheritance mode, and the path of a text
    file with one family id per line (blank lines are ignored).

    Output files:
      family_variants.tsv - one row per (family, variant)
      variants.tsv        - one row per variant, with a 0/1 column per family
      genes.tsv           - one row per gene, with a 0/1 column per family
    """
    project_id = args[0]
    inheritance_mode = args[1]
    fam_list_file_path = args[2]

    project = Project.objects.get(project_id=project_id)

    families = []
    with open(fam_list_file_path) as fam_list_file:
        for line in fam_list_file:
            family_id = line.strip('\n')
            if not family_id:
                # a trailing blank line is not a family id
                continue
            families.append(Family.objects.get(project=project, family_id=family_id))

    # create search spec
    variant_filter = next(
        f for f in project.get_default_variant_filters() if f['slug'] == 'moderate_impact'
    )['variant_filter']
    quality_filter = {
        'min_gq': 20,
        'min_ab': 25,
    }

    # run MendelianVariantSearch for each family, collect results
    family_results = {}
    for family in families:
        family_results[family] = list(get_variants_with_inheritance_mode(
            get_mall(project_id),
            family.xfamily(),
            inheritance_mode,
            variant_filter=variant_filter,
            quality_filter=quality_filter,
        ))

    family_id_columns = [fam.family_id for fam in families]

    # create family_variants.tsv
    with open('family_variants.tsv', 'w') as f:
        writer = csv.writer(f, dialect='excel', delimiter='\t')
        writer.writerow([
            '#family_id',
            'gene',
            'chrom',
            'ref',
            'alt',
            'rsid',
            'annotation',
        ])
        for family in families:
            for variant in family_results[family]:
                writer.writerow([
                    family.family_id,
                    get_gene_symbol(variant),
                    variant.chr,
                    variant.ref,
                    variant.alt,
                    variant.vcf_id,
                    variant.annotation['vep_group'],
                ])

    # create variants.tsv
    by_variant = {}
    variant_info = {}
    for family in families:
        for variant in family_results[family]:
            key = variant.unique_tuple()
            if key not in by_variant:
                by_variant[key] = set()
                variant_info[key] = variant
            by_variant[key].add(family.family_id)

    with open('variants.tsv', 'w') as f:
        writer = csv.writer(f, dialect='excel', delimiter='\t')
        # 'gene' and 'annotation' are separate columns; a missing comma used
        # to fuse them into a single "geneannotation" header.
        headers = [
            '#chrom',
            'ref',
            'alt',
            'rsid',
            'gene',
            'annotation',
            'num_families',
        ]
        headers.extend(family_id_columns)
        writer.writerow(headers)
        for variant_t in sorted(variant_info.keys()):
            variant = variant_info[variant_t]
            fields = [
                variant.chr,
                variant.ref,
                variant.alt,
                variant.vcf_id,
                get_gene_symbol(variant),
                variant.annotation['vep_group'],
                str(len(by_variant[variant_t])),
            ]
            for family in families:
                fields.append('1' if family.family_id in by_variant[variant_t] else '0')
            writer.writerow(fields)

    # create genes.tsv
    by_gene = {}
    for family in families:
        for variant in family_results[family]:
            by_gene.setdefault(get_gene_symbol(variant), set()).add(family.family_id)

    with open('genes.tsv', 'w') as f:
        writer = csv.writer(f, dialect='excel', delimiter='\t')
        headers = [
            '#gene',
            'num_families',
        ]
        headers.extend(family_id_columns)
        writer.writerow(headers)
        for gene_symbol in sorted(by_gene.keys()):
            fields = [
                gene_symbol,
                str(len(by_gene[gene_symbol])),
            ]
            for family in families:
                fields.append('1' if family.family_id in by_gene[gene_symbol] else '0')
            writer.writerow(fields)
def handle(self, *args, **options):
    """
    Search a list of families with one inheritance mode (min GQ 30, min AB
    25, 'moderate_impact' variant filter) and write the results as three
    tab-separated files in the current directory.

    Positional args: project id, inheritance mode, path to a file listing
    one family id per line (blank lines are ignored).

    Writes family_variants.tsv (row per family/variant), variants.tsv (row
    per variant with a 0/1 flag per family) and genes.tsv (row per gene with
    a 0/1 flag per family).
    """
    project_id = args[0]
    inheritance_mode = args[1]
    fam_list_file_path = args[2]

    project = Project.objects.get(project_id=project_id)

    # read the family list; `with` guarantees the file is closed
    families = []
    with open(fam_list_file_path) as fam_list_file:
        for line in fam_list_file:
            family_id = line.strip('\n')
            if family_id:
                families.append(
                    Family.objects.get(project=project, family_id=family_id))

    # create search spec
    variant_filter = next(
        f for f in project.get_default_variant_filters()
        if f['slug'] == 'moderate_impact')['variant_filter']
    quality_filter = {
        'min_gq': 30,
        'min_ab': 25,
    }

    # run MendelianVariantSearch for each family, collect results
    family_results = {}
    for family in families:
        family_results[family] = list(
            get_variants_with_inheritance_mode(
                get_mall(project_id),
                family.xfamily(),
                inheritance_mode,
                variant_filter=variant_filter,
                quality_filter=quality_filter,
            ))

    # create family_variants.tsv
    with open('family_variants.tsv', 'w') as f:
        writer = csv.writer(f, dialect='excel', delimiter='\t')
        writer.writerow([
            '#family_id', 'gene', 'chrom', 'ref', 'alt', 'rsid', 'annotation',
        ])
        for family in families:
            for variant in family_results[family]:
                writer.writerow([
                    family.family_id,
                    get_gene_symbol(variant),
                    variant.chr,
                    variant.ref,
                    variant.alt,
                    variant.vcf_id,
                    variant.annotation['vep_group'],
                ])

    # group families by variant
    by_variant = {}
    variant_info = {}
    for family in families:
        for variant in family_results[family]:
            variant_t = variant.unique_tuple()
            if variant_t not in by_variant:
                by_variant[variant_t] = set()
                variant_info[variant_t] = variant
            by_variant[variant_t].add(family.family_id)

    # create variants.tsv
    with open('variants.tsv', 'w') as f:
        writer = csv.writer(f, dialect='excel', delimiter='\t')
        # Seven fixed columns to match the seven fixed fields below. The
        # comma after 'gene' was missing, which silently produced a single
        # "geneannotation" column and shifted every header after it.
        headers = [
            '#chrom', 'ref', 'alt', 'rsid', 'gene', 'annotation',
            'num_families',
        ]
        headers.extend([fam.family_id for fam in families])
        writer.writerow(headers)
        for variant_t in sorted(variant_info.keys()):
            variant = variant_info[variant_t]
            families_with_variant = by_variant[variant_t]
            fields = [
                variant.chr,
                variant.ref,
                variant.alt,
                variant.vcf_id,
                get_gene_symbol(variant),
                variant.annotation['vep_group'],
                str(len(families_with_variant)),
            ]
            for family in families:
                fields.append(
                    '1' if family.family_id in families_with_variant else '0')
            writer.writerow(fields)

    # group families by gene
    by_gene = {}
    for family in families:
        for variant in family_results[family]:
            gene_symbol = get_gene_symbol(variant)
            if gene_symbol not in by_gene:
                by_gene[gene_symbol] = set()
            by_gene[gene_symbol].add(family.family_id)

    # create genes.tsv
    with open('genes.tsv', 'w') as f:
        writer = csv.writer(f, dialect='excel', delimiter='\t')
        headers = ['#gene', 'num_families']
        headers.extend([fam.family_id for fam in families])
        writer.writerow(headers)
        for gene_symbol in sorted(by_gene.keys()):
            families_with_gene = by_gene[gene_symbol]
            fields = [gene_symbol, str(len(families_with_gene))]
            for family in families:
                fields.append(
                    '1' if family.family_id in families_with_gene else '0')
            writer.writerow(fields)
def handle(self, *args, **options):
    """
    Attach CNV calls from a tab-separated file to variants already stored
    for the project.

    Reads options['project_id'], options['cnv_filename'] and
    options['bed_files_directory']. The CNV file must have a header row;
    the columns read here are 'chr', 'start', 'end' and 'sample'.

    For every CNV row, the matching individual's cnv_bed_file is updated
    if needed, and every stored variant whose xpos lies within
    [start, end] gets the whole row saved under
    genotypes.<indiv_id>.extras.cnvs.

    Raises ValueError when the CNV file does not exist.
    """
    project_id = options['project_id']
    print("Loading data into project: " + project_id)
    project = Project.objects.get(project_id=project_id)

    cnv_filename = options['cnv_filename']
    bed_files_directory = options['bed_files_directory']

    if not os.path.isfile(cnv_filename):
        raise ValueError("CNV file %s doesn't exist" % options['cnv_filename'])

    with open(cnv_filename) as f:
        # first line is the header; every later line is one CNV call
        header_fields = f.readline().rstrip('\n').split('\t')
        for line in f:
            fields = line.rstrip('\n').split('\t')
            row_dict = dict(zip(header_fields, fields))

            # the file's 'chr' column has no "chr" prefix
            chrom = "chr" + row_dict['chr']
            start = int(row_dict['start'])
            end = int(row_dict['end'])
            #left_overhang = int(row_dict['left_overhang_start'])
            #right_overhang = int(row_dict['right_overhang_end'])

            sample_id = row_dict['sample']
            try:
                # case-insensitive prefix match of the sample id on indiv_id
                i = Individual.objects.get(project=project, indiv_id__istartswith=sample_id)
            except Exception as e:
                # NOTE(review): this also hides "multiple individuals matched"
                # and any database error, not just "not found".
                print("WARNING: %s: %s not found in %s" % (e, sample_id, project))
                continue

            bed_file_path = os.path.join(bed_files_directory, "%s.bed" % sample_id)
            if not os.path.isfile(bed_file_path):
                # NOTE(review): only a warning - as written, the path appears
                # to be recorded below even when the file is missing; confirm.
                print("WARNING: .bed file not found: " + bed_file_path)

            if i.cnv_bed_file != bed_file_path:
                print("Setting cnv_bed_file path to %s" % bed_file_path)
                i.cnv_bed_file = bed_file_path
                i.save()

            project_collection = get_project_datastore(
                project)._get_project_collection(project_id)
            family_collection = get_mall(
                project).variant_store._get_family_collection(
                    project_id, i.family.family_id)

            # filter(None, ...) drops whichever collection is falsy
            for collection in filter(
                    None, [project_collection, family_collection]):
                # tag every variant located inside the CNV interval
                collection.update_many(
                    {
                        '$and': [{
                            'xpos': {
                                '$gte':
                                genomeloc.get_single_location(
                                    chrom, start)
                            }
                        }, {
                            'xpos': {
                                '$lte':
                                genomeloc.get_single_location(chrom, end)
                            }
                        }]
                    }, {
                        '$set': {
                            'genotypes.%s.extras.cnvs' % i.indiv_id:
                            row_dict
                        }
                    })

                # debugging aid: read back the updated documents
                #result = list(collection.find({'$and' : [
                #    {'xpos': {'$gte': genomeloc.get_single_location(chrom, start)}},
                #    {'xpos' :{'$lte': genomeloc.get_single_location(chrom, end)}}]},
                #    {'genotypes.%s.extras.cnvs' % i.indiv_id :1 }))
                #print(chrom, start, end, len(result), result[0] if result else None)

    print("Done")
def handle_individual(self, project, individual):
    """
    Write the report files for one individual into the current directory.

    report_for_<project>_<individual>.flagged.txt is written only when the
    family has variants tagged "REPORT" or with a note starting with
    "REPORT"; it has one row per such variant, with all notes left on it
    joined in the comments column.

    report_for_<project>_<individual>.genes.txt has one row per family
    variant found inside each region of the module-level gene_loc mapping.

    Raises ValueError when a flagged variant is missing from the variant store.
    """
    project_id = project.project_id
    individual_id = individual.indiv_id
    print("Processing individual %s" % individual_id)

    # variant key (xpos, ref, alt) -> accumulated note text
    variants_in_report_and_notes = defaultdict(str)

    tagged = VariantTag.objects.filter(
        project_tag__project=project, project_tag__tag="REPORT", family=individual.family)
    for vt in tagged:
        variants_in_report_and_notes[(vt.xpos, vt.ref, vt.alt)] = ""

    for vn in VariantNote.objects.filter(project=project, family=individual.family):
        if vn.note and vn.note.strip().startswith("REPORT"):
            variants_in_report_and_notes[(vn.xpos, vn.ref, vn.alt)] = ""

    header = [
        "gene_name", "genotype", "variant", "functional_class", "hgvs_c",
        "hgvs_p", "rsid", "exac_global_af", "exac_pop_max_af",
        "exac_pop_max_population", "clinvar_clinsig", "clinvar_clnrevstat",
        "number_of_stars", "clinvar_url", "comments"
    ]

    if len(variants_in_report_and_notes) != 0:
        flagged_path = "report_for_%s_%s.flagged.txt" % (project_id, individual_id)
        with open(flagged_path, "w") as out:
            out.write("\t".join(header) + "\n")

            # retrieve text of all notes that were left for any of these variants
            for vn in VariantNote.objects.filter(project=project, family=individual.family):
                key = (vn.xpos, vn.ref, vn.alt)
                if vn.note and key in variants_in_report_and_notes:
                    other_notes = variants_in_report_and_notes[key]
                    if len(other_notes) > 0:
                        other_notes += "||"
                    this_note = "%s|%s|%s" % (vn.date_saved, vn.user.email, vn.note.strip())
                    variants_in_report_and_notes[key] = other_notes + this_note

            for (xpos, ref, alt), notes in variants_in_report_and_notes.items():
                variant_store = get_mall(project_id).variant_store
                v = variant_store.get_single_variant(
                    project_id, individual.family.family_id, xpos, ref, alt)
                if v is None:
                    raise ValueError(
                        "Couldn't find variant in variant store for: %s, %s, %s %s %s" %
                        (project_id, individual.family.family_id, xpos, ref, alt))

                row = self.get_output_row(
                    v, xpos, ref, alt, individual_id, individual.family,
                    all_fields=True, comments=notes)
                if row is not None:
                    out.write("\t".join(row) + "\n")

    genes_path = "report_for_%s_%s.genes.txt" % (project_id, individual_id)
    with open(genes_path, "w") as out:
        header = ["gene_chrom", "gene_start", "gene_end"] + header + ["json_dump"]
        out.write("\t".join(header) + "\n")

        for gene_id, (chrom, start, end) in gene_loc.items():
            xpos_start = genomeloc.get_single_location("chr" + chrom, start)
            xpos_end = genomeloc.get_single_location("chr" + chrom, end)
            variant_store = get_mall(project_id).variant_store
            for v in variant_store.get_variants_in_range(
                    project_id, individual.family.family_id, xpos_start, xpos_end):
                json_dump = str(v.genotypes)
                try:
                    notes = variants_in_report_and_notes[(v.xpos, v.ref, v.alt)]
                except KeyError:
                    notes = ""

                row = self.get_output_row(
                    v, v.xpos, v.ref, v.alt, individual_id, individual.family,
                    comments=notes, gene_id=gene_id)
                if row is None:
                    continue

                row = [str(value) for value in [chrom, start, end] + row + [json_dump]]
                out.write("\t".join(row) + "\n")
def calculate_mendelian_variant_search(search_spec, family, user=None):
    """
    Run the search described by search_spec on one family.

    search_spec.search_mode selects the search: 'standard_inheritance',
    'custom_inheritance', 'gene_burden', 'allele_count' or 'all_variants'.

    Returns the matching variants as a list; each variant has its
    'family_id' extra set to family.family_id.

    Raises ValueError for any other search_mode (this used to surface as an
    opaque TypeError when iterating over None).
    """
    xfamily = family.xfamily()
    project = family.project
    search_mode = search_spec.search_mode

    if search_mode == 'standard_inheritance':
        variants = list(
            get_variants_with_inheritance_mode(
                get_mall(project),
                xfamily,
                search_spec.inheritance_mode,
                variant_filter=search_spec.variant_filter,
                quality_filter=search_spec.quality_filter,
                user=user,
            ))
    elif search_mode == 'custom_inheritance':
        variants = list(
            get_variants_family(
                get_datastore(project),
                xfamily,
                genotype_filter=search_spec.genotype_inheritance_filter,
                variant_filter=search_spec.variant_filter,
                quality_filter=search_spec.quality_filter,
                user=user,
            ))
    elif search_mode == 'gene_burden':
        gene_stream = get_genes_family(
            get_datastore(project),
            get_reference(),
            xfamily,
            burden_filter=search_spec.gene_burden_filter,
            variant_filter=search_spec.variant_filter,
            quality_filter=search_spec.quality_filter,
            user=user,
        )
        # flatten the per-gene results into a plain list of variants
        variants = list(
            stream_utils.gene_stream_to_variant_stream(gene_stream, get_reference()))
    elif search_mode == 'allele_count':
        variants = list(
            get_variants_allele_count(
                get_datastore(project),
                xfamily,
                search_spec.allele_count_filter,
                variant_filter=search_spec.variant_filter,
                quality_filter=search_spec.quality_filter,
                user=user,
            ))
    elif search_mode == 'all_variants':
        variants = list(
            get_variants_family(
                get_datastore(project),
                xfamily,
                variant_filter=search_spec.variant_filter,
                quality_filter=search_spec.quality_filter,
                indivs_to_consider=xfamily.indiv_id_list(),
                user=user,
            ))
    else:
        raise ValueError("Unsupported search mode: %s" % search_mode)

    for variant in variants:
        variant.set_extra('family_id', family.family_id)

    return variants
def calculate_mendelian_variant_search(search_spec, xfamily):
    """
    Run the search described by search_spec on xfamily and return the
    matching variants as a list.

    The search parameters are logged to stderr first. Returns None when
    search_spec.search_mode is not one of the handled modes.
    """
    log_template = (
        "mendelian_variant_search for %s - search mode: %s \n"
        "variant_filter: %s \ninheritance_mode: %s \nallele_count_filter: %s \nquality_filter: %s \ngenotype_inheritance_filter: %s \n"
    )
    log_values = (
        xfamily.project_id,
        search_spec.search_mode,
        search_spec.variant_filter.toJSON() if search_spec.variant_filter else '',
        search_spec.inheritance_mode,
        search_spec.allele_count_filter,
        search_spec.quality_filter,
        search_spec.genotype_inheritance_filter,
    )
    sys.stderr.write(log_template % log_values)

    if search_spec.search_mode == 'standard_inheritance':
        return list(get_variants_with_inheritance_mode(
            get_mall(xfamily.project_id),
            xfamily,
            search_spec.inheritance_mode,
            variant_filter=search_spec.variant_filter,
            quality_filter=search_spec.quality_filter,
        ))

    if search_spec.search_mode == 'custom_inheritance':
        return list(get_variants_family(
            get_datastore(xfamily.project_id),
            xfamily,
            genotype_filter=search_spec.genotype_inheritance_filter,
            variant_filter=search_spec.variant_filter,
            quality_filter=search_spec.quality_filter,
        ))

    if search_spec.search_mode == 'gene_burden':
        gene_stream = get_genes_family(
            get_datastore(xfamily.project_id),
            get_reference(),
            xfamily,
            burden_filter=search_spec.gene_burden_filter,
            variant_filter=search_spec.variant_filter,
            quality_filter=search_spec.quality_filter,
        )
        return list(stream_utils.gene_stream_to_variant_stream(gene_stream, get_reference()))

    if search_spec.search_mode == 'allele_count':
        return list(get_variants_allele_count(
            get_datastore(xfamily.project_id),
            xfamily,
            search_spec.allele_count_filter,
            variant_filter=search_spec.variant_filter,
            quality_filter=search_spec.quality_filter,
        ))

    if search_spec.search_mode == 'all_variants':
        return list(get_variants_family(
            get_datastore(xfamily.project_id),
            xfamily,
            variant_filter=search_spec.variant_filter,
            quality_filter=search_spec.quality_filter,
            indivs_to_consider=xfamily.indiv_id_list(),
        ))

    # no handled mode matched
    return None
def handle(self, *args, **options):
    """
    Spot-check that project variants are present in the annotator cache.

    args are project ids; with no args, every project is checked, in the
    reverse of the order Project.objects.all() returns them.
    options["number_of_variants_to_check"] caps how many VCF records are
    looked up (default 20000).

    For each project, variants are read from its VCF files and looked up
    in the annotator. Once the cap is reached, an ERROR with example ExAC
    URLs is printed if more than 10 lookups raised ValueError.
    """
    number_of_variants_to_check = int(
        options.get("number_of_variants_to_check") or 20000)

    if not args:
        args = [p.project_id for p in Project.objects.all()]
        args.reverse()

    for project_id in args:
        try:
            project = Project.objects.get(project_id=project_id)
        except:
            # NOTE(review): bare except - any failure, not only a missing
            # project, is reported as "not found".
            print("ERROR: Project not found. Skipping..")
            continue

        # counters are per project and shared across all of its VCF files
        all_counter = 0
        #found_counter = 0
        not_found_counter = 0
        not_found_variants = []
        for vcf_file in project.get_all_vcf_files():
            path = vcf_file.file_path
            #print("Processing %s - %s" % (project.project_id, path))
            # fall back to the gzipped file when the plain .vcf is missing
            if not os.path.isfile(path) and path.endswith(".vcf"):
                path = path + ".gz"
            if path.endswith(".gz"):
                f = gzip.open(path)
            else:
                f = open(path)
            # NOTE(review): f is never closed in this method.
            if f:
                for variant in vcf_stuff.iterate_vcf(f):
                    all_counter += 1
                    try:
                        get_mall(project).annotator.get_annotation(
                            variant.xpos, variant.ref, variant.alt)
                    except ValueError, e:
                        # ValueError is treated as "missing from the annotator"
                        not_found_counter += 1
                        # keep at most 30 examples for the error report
                        if len(not_found_variants) < 30:
                            chrom, pos = genomeloc.get_chr_pos(
                                variant.xpos)
                            chrom = chrom.replace("chr", "")
                            ref, alt = variant.ref, variant.alt
                            # the template is filled from the local variables
                            # above, so their names must not change
                            not_found_variants.append(
                                "%(chrom)s-%(pos)s-%(ref)s-%(alt)s" %
                                locals())
                        #print("WARNING: variant not found in annotator cache: " + str(e))
                        #if not_found_counter > 5:
                        #    print("---- ERROR: 5 variants not found. Project %s should be reloaded." % project_id)
                        #    break
                        found_counter = 0
                    #else:
                    #    found_counter += 1
                    #    if found_counter > 15000:
                    #        #print("---- Found 5000 variants in a row. Project %s looks ok." % project_id)
                    #        break
                    if all_counter >= number_of_variants_to_check:
                        # NOTE(review): this is a 0-1 fraction but it is
                        # printed with a '%' sign and no scaling by 100.
                        fraction_missing = float(
                            not_found_counter) / all_counter
                        if not_found_counter > 10:
                            print(
                                "---- ERROR: (%(fraction_missing)0.2f%%) %(not_found_counter)s / %(all_counter)s variants not found. Project %(project_id)s should be reloaded. Examples: "
                                % locals())
                            for v in not_found_variants:
                                print(
                                    "http://exac.broadinstitute.org/variant/"
                                    + v)
                        # stops reading this VCF only; the loop over VCF
                        # files continues with the counters unchanged
                        break
def handle_individual(self, project, individual):
    """
    Produce the per-individual report files in the working directory.

    * report_for_<project>_<individual>.flagged.txt - written only if the
      family has variants tagged "REPORT" or carrying a note that starts
      with "REPORT". One row per such variant; the comments column holds
      every note on that variant, joined with "||".
    * report_for_<project>_<individual>.genes.txt - always written. One row
      per family variant inside each gene region listed in gene_loc.

    Raises ValueError if a flagged variant cannot be found in the variant store.
    """
    project_id = project.project_id
    individual_id = individual.indiv_id
    print("Processing individual %s" % individual_id)

    def _key(record):
        # (xpos, ref, alt) identifies a variant throughout this method
        return (record.xpos, record.ref, record.alt)

    # get variants that have been tagged or that have a note that starts with "REPORT"
    variants_in_report_and_notes = defaultdict(str)
    for vt in VariantTag.objects.filter(project_tag__project=project, project_tag__tag="REPORT", family=individual.family):
        variants_in_report_and_notes[_key(vt)] = ""
    for vn in VariantNote.objects.filter(project=project, family=individual.family):
        if not vn.note:
            continue
        if vn.note.strip().startswith("REPORT"):
            variants_in_report_and_notes[_key(vn)] = ""

    header = ["gene_name", "genotype", "variant", "functional_class", "hgvs_c", "hgvs_p", "rsid",
              "exac_global_af", "exac_pop_max_af", "exac_pop_max_population",
              "clinvar_clinsig", "clinvar_clnrevstat", "number_of_stars", "clinvar_url", "comments"]

    if len(variants_in_report_and_notes) != 0:
        with open("report_for_%s_%s.flagged.txt" % (project_id, individual_id), "w") as out:
            out.write("\t".join(header) + "\n")

            # collect the text of every note left on a flagged variant
            for vn in VariantNote.objects.filter(project=project, family=individual.family):
                if not (vn.note and _key(vn) in variants_in_report_and_notes):
                    continue
                collected = variants_in_report_and_notes[_key(vn)]
                if len(collected) > 0:
                    collected += "||"
                variants_in_report_and_notes[_key(vn)] = collected + "%s|%s|%s" % (vn.date_saved, vn.user.email, vn.note.strip())

            for (xpos, ref, alt), notes in variants_in_report_and_notes.items():
                v = get_mall(project_id).variant_store.get_single_variant(project_id, individual.family.family_id, xpos, ref, alt)
                if v is None:
                    raise ValueError("Couldn't find variant in variant store for: %s, %s, %s %s %s" % (project_id, individual.family.family_id, xpos, ref, alt))

                row = self.get_output_row(v, xpos, ref, alt, individual_id, individual.family, all_fields=True, comments=notes)
                if row is None:
                    continue
                out.write("\t".join(row) + "\n")

    with open("report_for_%s_%s.genes.txt" % (project_id, individual_id), "w") as out:
        # the gene report adds the region columns in front and a genotype dump at the end
        header = ["gene_chrom", "gene_start", "gene_end"] + header + ["json_dump"]
        out.write("\t".join(header) + "\n")

        for gene_id, (chrom, start, end) in gene_loc.items():
            xpos_start = genomeloc.get_single_location("chr" + chrom, start)
            xpos_end = genomeloc.get_single_location("chr" + chrom, end)
            in_range = get_mall(project_id).variant_store.get_variants_in_range(project_id, individual.family.family_id, xpos_start, xpos_end)
            for v in in_range:
                json_dump = str(v.genotypes)
                try:
                    notes = variants_in_report_and_notes[(v.xpos, v.ref, v.alt)]
                except KeyError:
                    notes = ""

                row = self.get_output_row(v, v.xpos, v.ref, v.alt, individual_id, individual.family, comments=notes, gene_id=gene_id)
                if row is None:
                    continue

                cells = [chrom, start, end] + row + [json_dump]
                out.write("\t".join([str(cell) for cell in cells]) + "\n")
def calculate_mendelian_variant_search(search_spec, family, user=None):
    """
    Compute the variants for a family according to search_spec.

    Supported values of search_spec.search_mode are 'standard_inheritance',
    'custom_inheritance', 'gene_burden', 'allele_count' and 'all_variants'.

    Returns a list of variants, each with its 'family_id' extra set to
    family.family_id.

    Raises ValueError when search_mode is none of the supported values.
    Previously that case left the result as None and failed with a
    TypeError in the loop below.
    """
    xfamily = family.xfamily()
    project = family.project

    mode = search_spec.search_mode
    if mode == 'standard_inheritance':
        variants = list(get_variants_with_inheritance_mode(
            get_mall(project),
            xfamily,
            search_spec.inheritance_mode,
            variant_filter=search_spec.variant_filter,
            quality_filter=search_spec.quality_filter,
            user=user,
        ))
    elif mode == 'custom_inheritance':
        variants = list(get_variants_family(
            get_datastore(project),
            xfamily,
            genotype_filter=search_spec.genotype_inheritance_filter,
            variant_filter=search_spec.variant_filter,
            quality_filter=search_spec.quality_filter,
            user=user,
        ))
    elif mode == 'gene_burden':
        gene_stream = get_genes_family(
            get_datastore(project),
            get_reference(),
            xfamily,
            burden_filter=search_spec.gene_burden_filter,
            variant_filter=search_spec.variant_filter,
            quality_filter=search_spec.quality_filter,
            user=user,
        )
        # genes come back as a stream; convert it to a flat variant list
        variants = list(stream_utils.gene_stream_to_variant_stream(gene_stream, get_reference()))
    elif mode == 'allele_count':
        variants = list(get_variants_allele_count(
            get_datastore(project),
            xfamily,
            search_spec.allele_count_filter,
            variant_filter=search_spec.variant_filter,
            quality_filter=search_spec.quality_filter,
            user=user,
        ))
    elif mode == 'all_variants':
        variants = list(get_variants_family(
            get_datastore(project),
            xfamily,
            variant_filter=search_spec.variant_filter,
            quality_filter=search_spec.quality_filter,
            indivs_to_consider=xfamily.indiv_id_list(),
            user=user,
        ))
    else:
        raise ValueError("Unsupported search mode: %s" % mode)

    for variant in variants:
        variant.set_extra('family_id', family.family_id)

    return variants
def calculate_mendelian_variant_search(search_spec, xfamily):
    """
    Execute the search described by search_spec for the given xfamily.

    Writes a one-line trace (without a trailing newline) containing
    search_spec.inheritance_mode to stderr, then dispatches on
    search_spec.search_mode.

    Returns a list of variants for the modes 'standard_inheritance',
    'custom_inheritance', 'gene_burden', 'allele_count' and 'all_variants';
    returns None when search_mode matches none of them.
    """
    sys.stderr.write(" cohort_variant_search - inheritance_mode: %s" % search_spec.inheritance_mode)

    mode = search_spec.search_mode

    if mode == 'standard_inheritance':
        matches = get_variants_with_inheritance_mode(
            get_mall(xfamily.project_id),
            xfamily,
            search_spec.inheritance_mode,
            variant_filter=search_spec.variant_filter,
            quality_filter=search_spec.genotype_quality_filter,
        )
        return list(matches)

    if mode == 'custom_inheritance':
        matches = get_variants_family(
            get_datastore(xfamily.project_id),
            xfamily,
            genotype_filter=search_spec.genotype_inheritance_filter,
            variant_filter=search_spec.variant_filter,
            quality_filter=search_spec.genotype_quality_filter,
        )
        return list(matches)

    if mode == 'gene_burden':
        genes = get_genes_family(
            get_datastore(xfamily.project_id),
            get_reference(),
            xfamily,
            burden_filter=search_spec.gene_burden_filter,
            variant_filter=search_spec.variant_filter,
            quality_filter=search_spec.genotype_quality_filter,
        )
        # Expand the per-gene stream into a flat list of variants.
        return list(stream_utils.gene_stream_to_variant_stream(genes, get_reference()))

    if mode == 'allele_count':
        matches = get_variants_allele_count(
            get_datastore(xfamily.project_id),
            xfamily,
            search_spec.allele_count_filter,
            variant_filter=search_spec.variant_filter,
            quality_filter=search_spec.genotype_quality_filter,
        )
        return list(matches)

    if mode == 'all_variants':
        matches = get_variants_family(
            get_datastore(xfamily.project_id),
            xfamily,
            variant_filter=search_spec.variant_filter,
            quality_filter=search_spec.genotype_quality_filter,
        )
        return list(matches)

    # No recognised search mode: nothing was computed.
    return None
def write_snp_fileset(family, output_dir_path):
    """
    Write a set of files for a family that can be passed to linkage engine

    Creates the following files in output_dir_path:
        [family_id].fam     one tab-separated row per individual: family id,
                            individual id, paternal id, maternal id, sex code,
                            affected code
        markers.txt         copy of settings.COMMON_SNP_FILE
        disease_model.txt   fixed three-line disease model
        variants.txt        one row per line of settings.COMMON_SNP_FILE with
                            one genotype column per individual

    Args:
        family: family object; must provide get_individuals(), family_id and
            project.project_id.
        output_dir_path: directory the files are written into.
    """
    individuals = family.get_individuals()

    # Codes written to the .fam file; anything not listed is written as '0'.
    sex_codes = {'F': '2', 'M': '1'}
    affected_codes = {'A': '2', 'N': '1'}

    # fam file
    fam_file_path = os.path.join(output_dir_path, family.family_id + '.fam')
    with open(fam_file_path, 'w') as fam_file:
        for indiv in individuals:
            fields = [
                family.family_id,
                indiv.indiv_id,
                indiv.paternal_id if indiv.paternal_id else '.',
                indiv.maternal_id if indiv.maternal_id else '.',
                # The original tested gender == 'F' twice, so males were
                # always written as '0' instead of '1'.
                sex_codes.get(indiv.gender, '0'),
                affected_codes.get(indiv.affected, '0'),
            ]
            fam_file.write('\t'.join(fields) + '\n')

    # markers.txt
    markers_path = os.path.join(output_dir_path, 'markers.txt')
    shutil.copy(settings.COMMON_SNP_FILE, markers_path)

    # disease model
    disease_model_path = os.path.join(output_dir_path, 'disease_model.txt')
    with open(disease_model_path, 'w') as disease_model_file:
        disease_model_file.writelines([
            "DD\t.001\n",
            "Dd\t.001\n",
            "dd\t.999\n",
        ])

    # variants.txt
    variants_file_path = os.path.join(output_dir_path, 'variants.txt')
    with open(variants_file_path, 'w') as variants_file:
        variants_file.write('#CHR\tPOS\tREF\tALT')
        for indiv in individuals:
            variants_file.write('\t' + indiv.indiv_id)
        variants_file.write('\n')

        with open(settings.COMMON_SNP_FILE) as snp_file:
            for snp_line in snp_file:
                # Columns used here: chromosome, position, ref allele, alt allele.
                snp_fields = snp_line.strip('\n').split('\t')
                xpos = genomeloc.get_single_location('chr' + snp_fields[0], int(snp_fields[1]))
                ref = snp_fields[2]
                alt = snp_fields[3]
                # NOTE(review): get_mall() is called without a project id here,
                # unlike other call sites in this file -- confirm this is intended.
                variant = get_mall().variant_store.get_single_variant(
                    family.project.project_id,
                    family.family_id,
                    xpos,
                    ref,
                    alt,
                )
                row = [snp_fields[0], snp_fields[1], snp_fields[2], snp_fields[3]]
                for indiv in individuals:
                    if variant:
                        genotype = variant.get_genotype(indiv.indiv_id)
                        # '.' marks a genotype whose alt-allele count is unset.
                        row.append(str(genotype.num_alt) if genotype.num_alt is not None else '.')
                    else:
                        # No variant record for this SNP: written as 0 alt alleles.
                        row.append('0')
                variants_file.write('\t'.join(row) + '\n')