示例#1
0
def load_variants_for_cohort_list(project, cohorts):
    """
    Register each cohort in the variant store and load its VCF files.

    Every cohort is handled as a one-entry "family set": the cohort id is
    used as the family id and the cohort's individual ids as its members.

    Args:
        project: project whose ``project_id`` selects the mall and whose
            reference population slugs are handed to the loader.
        cohorts: iterable of cohort objects to load; an empty iterable is a
            no-op.
    """
    for cohort in cohorts:
        # single-argument print(...) behaves the same on Python 2 and 3
        print("Adding {}".format(cohort.cohort_id))
        family_list = [{
            'project_id': cohort.project.project_id,
            'family_id': cohort.cohort_id,
            'individuals': cohort.indiv_id_list(),
        }]

        # add the cohort to the datastore before loading any variants
        get_mall(project.project_id).variant_store.add_family_set(family_list)

        vcf_files = cohort.get_vcf_files()

        # create the VCF ID map (VCF sample id -> individual id);
        # individuals without a vcf_id are left out
        vcf_id_map = {}
        for individual in cohort.get_individuals():
            if individual.vcf_id:
                vcf_id_map[individual.vcf_id] = individual.indiv_id

        # the (project_id, family_id) pairs do not depend on the VCF file,
        # so build them once instead of once per file
        family_tuple_list = [(f['project_id'], f['family_id'])
                             for f in family_list]

        # load them all into the datastore
        for vcf_file in vcf_files:
            get_mall(project.project_id).variant_store.load_family_set(
                vcf_file.path(),
                family_tuple_list,
                reference_populations=project.get_reference_population_slugs(),
                vcf_id_map=vcf_id_map,
            )
示例#2
0
文件: views.py 项目: rpete/seqr
def mendelian_variant_search_spec(request):
    """
    Return the variants of a stored Mendelian variant search.

    The project and family are resolved from the request, the search spec
    (and any cached variants) is fetched with the ``search_hash`` GET
    parameter, and the variants are computed when none were cached.

    The ``return_type`` GET parameter selects the output: JSON when it is
    ``'json'``, missing or empty; a CSV attachment when it is ``'csv'``.
    Any other value yields a JSON error response instead of falling through
    and returning ``None``.
    """
    project, family = get_project_and_family_for_user(request.user, request.GET)

    search_hash = request.GET.get('search_hash')
    search_spec_dict, variants = cache_utils.get_cached_results(project.project_id, search_hash)
    search_spec = MendelianVariantSearchSpec.fromJSON(search_spec_dict)
    if variants is None:
        # nothing cached for this hash: run the search now
        variants = api_utils.calculate_mendelian_variant_search(search_spec, family.xfamily())
    else:
        variants = [Variant.fromJSON(v) for v in variants]
    add_extra_info_to_variants_family(get_reference(), family, variants)

    return_type = request.GET.get('return_type')
    if return_type == 'json' or not return_type:
        return JSONResponse({
            'is_error': False,
            'variants': [v.toJSON() for v in variants],
            'search_spec': search_spec_dict,
        })

    if return_type == 'csv':
        response = HttpResponse(content_type='text/csv')
        response['Content-Disposition'] = 'attachment; filename="results_{}.csv"'.format(search_hash)
        writer = csv.writer(response)
        indiv_ids = family.indiv_ids_with_variant_data()
        # look the mall up once rather than once per CSV row
        project_mall = get_mall(project.project_id)
        headers = xbrowse_displays.get_variant_display_headers(project_mall, project, indiv_ids)
        writer.writerow(headers)
        for variant in variants:
            fields = xbrowse_displays.get_display_fields_for_variant(project_mall, project, variant, indiv_ids)
            writer.writerow(fields)
        return response

    # a view must always return a response object
    return JSONResponse({
        'is_error': True,
        'error': 'Invalid return_type: %s' % return_type,
    })
示例#3
0
 def handle(self, *args, **options):
     """
     For every project id given in ``args``, delete that project's data from
     the variant store and from the project datastore.
     """
     for project_id in args:
         print("Deleting data from mongodb for project: " + project_id)
         project = Project.objects.get(project_id=project_id)

         # variant store first, then the project-level datastore
         variant_store = get_mall(project).variant_store
         variant_store.delete_project(project_id)

         project_datastore = get_project_datastore(project)
         project_datastore.delete_project_store(project_id)
         print("Done")
示例#4
0
文件: views.py 项目: frichter/seqr
def mendelian_variant_search_spec(request):
    """
    Serve the results of a Mendelian variant search identified by the
    ``search_hash`` GET parameter.

    Cached variants are reused when present; otherwise the search described
    by the cached search spec is run for the family.

    Output depends on the ``return_type`` GET parameter:
      * ``'json'``, missing or empty - JSON with the variants and search spec
      * ``'csv'`` - a CSV attachment, one row per variant
      * anything else - a JSON error response (the view never returns
        ``None``)
    """
    project, family = get_project_and_family_for_user(request.user, request.GET)

    # TODO: use form

    search_hash = request.GET.get('search_hash')
    search_spec_dict, variants = cache_utils.get_cached_results(project.project_id, search_hash)
    search_spec = MendelianVariantSearchSpec.fromJSON(search_spec_dict)
    if variants is None:
        variants = api_utils.calculate_mendelian_variant_search(search_spec, family.xfamily())
    else:
        variants = [Variant.fromJSON(v) for v in variants]
    add_extra_info_to_variants_family(get_reference(), family, variants)

    return_type = request.GET.get('return_type')
    if return_type == 'json' or not return_type:
        return JSONResponse({
            'is_error': False,
            'variants': [v.toJSON() for v in variants],
            'search_spec': search_spec_dict,
        })
    elif return_type == 'csv':
        response = HttpResponse(content_type='text/csv')
        response['Content-Disposition'] = 'attachment; filename="results_{}.csv"'.format(search_hash)
        writer = csv.writer(response)
        indiv_ids = family.indiv_ids_with_variant_data()
        # the mall does not change between rows, so fetch it a single time
        project_mall = get_mall(project.project_id)
        writer.writerow(xbrowse_displays.get_variant_display_headers(project_mall, project, indiv_ids))
        for variant in variants:
            writer.writerow(xbrowse_displays.get_display_fields_for_variant(project_mall, project, variant, indiv_ids))
        return response
    else:
        # unsupported return_type: report it rather than returning None
        return JSONResponse({
            'is_error': True,
            'error': 'Invalid return_type: %s' % return_type,
        })
示例#5
0
 def handle(self, *args, **options):
     """
     Delete stored data for each project id passed as a positional argument:
     first from the variant store, then from the project datastore.
     """
     for project_id in args:
         print("Deleting data from mongodb for project: " + project_id)
         target = Project.objects.get(project_id=project_id)
         get_mall(target).variant_store.delete_project(project_id)
         get_project_datastore(target).delete_project_store(project_id)
         print("Done")
示例#6
0
def load_variants_for_cohort_list(project, cohorts):
    """
    Load the variants of every cohort in ``cohorts`` into the variant store.

    A cohort is stored like a family: its ``cohort_id`` serves as the family
    id and ``indiv_id_list()`` provides the members. Each of the cohort's VCF
    files is then loaded for that single entry.

    Args:
        project: project object; supplies ``project_id`` for the mall lookup
            and the reference population slugs for loading.
        cohorts: iterable of cohorts; nothing happens when it is empty.
    """
    for cohort in cohorts:
        # print(...) with one argument is valid on Python 2 and Python 3
        print("Adding {}".format(cohort.cohort_id))
        cohort_entry = {
            'project_id': cohort.project.project_id,
            'family_id': cohort.cohort_id,
            'individuals': cohort.indiv_id_list(),
        }
        family_list = [cohort_entry]

        # add the cohort to the datastore
        get_mall(project.project_id).variant_store.add_family_set(family_list)

        vcf_files = cohort.get_vcf_files()

        # VCF sample id -> individual id, ignoring individuals with no vcf_id
        vcf_id_map = {}
        for individual in cohort.get_individuals():
            if individual.vcf_id:
                vcf_id_map[individual.vcf_id] = individual.indiv_id

        # identical for every VCF file of this cohort, so computed only once
        family_tuple_list = [(cohort_entry['project_id'], cohort_entry['family_id'])]

        # load them all into the datastore
        for vcf_file in vcf_files:
            get_mall(project.project_id).variant_store.load_family_set(
                vcf_file.path(),
                family_tuple_list,
                reference_populations=project.get_reference_population_slugs(),
                vcf_id_map=vcf_id_map,
            )
示例#7
0
def load_variants_for_family_list(project, families, vcf_file, mark_as_loaded=True):
    """
    (Re)load the variants of several families that all come from one VCF.

    Args:
        project: supplies ``project_id`` for the mall lookup and the reference
            population slugs.
        families: family objects to load; iterated more than once.
        vcf_file: handed unchanged to the variant store loader.
        mark_as_loaded: forwarded unchanged to ``load_family_set``.
    """
    family_specs = [
        {
            'project_id': fam.project.project_id,
            'family_id': fam.family_id,
            'individuals': fam.indiv_ids_with_variant_data(),
        }
        for fam in families
    ]

    # register all families from this vcf with the datastore
    get_mall(project.project_id).variant_store.add_family_set(family_specs)

    # VCF sample id -> individual id; individuals without a vcf_id are skipped
    vcf_id_map = {
        member.vcf_id: member.indiv_id
        for fam in families
        for member in fam.get_individuals()
        if member.vcf_id
    }

    # load everything in a single call
    project_family_pairs = [(spec['project_id'], spec['family_id']) for spec in family_specs]
    variant_store = get_mall(project.project_id).variant_store
    variant_store.load_family_set(
        vcf_file,
        project_family_pairs,
        reference_populations=project.get_reference_population_slugs(),
        vcf_id_map=vcf_id_map,
        mark_as_loaded=mark_as_loaded,
    )

    # per-family follow-up work
    for fam in families:
        _family_postprocessing(fam)
示例#8
0
def delete_project(project_id):
    """
    Delete a project: its variant-store data, then its individuals and
    families, and finally the project record itself.
    """
    project = Project.objects.get(project_id=project_id)

    variant_store = get_mall(project_id).variant_store
    variant_store.delete_project(project_id)

    # dependent records go before the project row
    individuals = project.individual_set.all()
    individuals.delete()
    families = project.family_set.all()
    families.delete()

    project.delete()
示例#9
0
def delete_project(project_id):
    """
    Remove the project identified by ``project_id``.

    The project's data is deleted from the variant store first; afterwards
    its individuals, its families and the project object are deleted.
    """
    target = Project.objects.get(project_id=project_id)
    store = get_mall(project_id).variant_store
    store.delete_project(project_id)

    target.individual_set.all().delete()
    target.family_set.all().delete()
    target.delete()
示例#10
0
    def handle(self, *args, **options):
        """
        Spot-check projects for variants that the annotator does not know.

        For each project id in ``args`` (or every project, in reversed order,
        when no ids are given) the project's VCF files are scanned and every
        variant is looked up in the annotator. A ``ValueError`` from the
        lookup is counted as "variant not found".

        Once ``number_of_variants_to_check`` variants (option, default 20000)
        have been examined, an error report with up to 30 example variants is
        printed if more than 10 were missing, and the scan of that project
        stops. Projects with fewer variants than that limit produce no report.
        """
        number_of_variants_to_check = int(options.get("number_of_variants_to_check") or 20000)

        if not args:
            args = [p.project_id for p in Project.objects.all()]
            args.reverse()

        for project_id in args:
            try:
                project = Project.objects.get(project_id=project_id)
            except Project.DoesNotExist:
                print("ERROR: Project not found. Skipping..")
                continue

            all_counter = 0
            not_found_counter = 0
            not_found_variants = []
            limit_reached = False
            for vcf_file in project.get_all_vcf_files():
                path = vcf_file.file_path
                if not os.path.isfile(path) and path.endswith(".vcf"):
                    # the plain .vcf is missing: fall back to <path>.gz
                    path = path + ".gz"

                f = gzip.open(path) if path.endswith(".gz") else open(path)
                try:
                    for variant in vcf_stuff.iterate_vcf(f):
                        all_counter += 1
                        try:
                            get_mall(project_id).annotator.get_annotation(variant.xpos, variant.ref, variant.alt)
                        except ValueError:
                            not_found_counter += 1
                            # keep only the first 30 examples for the report
                            if len(not_found_variants) < 30:
                                chrom, pos = genomeloc.get_chr_pos(variant.xpos)
                                chrom = chrom.replace("chr", "")
                                not_found_variants.append("%s-%s-%s-%s" % (chrom, pos, variant.ref, variant.alt))

                        if all_counter >= number_of_variants_to_check:
                            if not_found_counter > 10:
                                # report a real percentage, since the message prints a '%' sign
                                percent_missing = 100.0 * not_found_counter / all_counter
                                print("---- ERROR: (%0.2f%%)  %s / %s variants not found. Project %s should be reloaded. Examples: " % (
                                    percent_missing, not_found_counter, all_counter, project_id))

                                for v in not_found_variants:
                                    print("http://exac.broadinstitute.org/variant/" + v)
                            limit_reached = True
                            break
                finally:
                    # always release the file handle, even if parsing fails
                    f.close()

                if limit_reached:
                    # the project has been checked; do not re-report it for
                    # every remaining VCF file
                    break
示例#11
0
def delete_family(project_id, family_id):
    """
    Delete one family of a project.

    Every individual of the family is first updated with ``family=None``,
    then the family's data is removed from the variant store, and finally the
    family model is deleted.
    """
    family = Family.objects.get(project__project_id=project_id, family_id=family_id)

    # detach the members before the family itself goes away
    for member in family.get_individuals():
        update_xbrowse_model(member, family=None)

    variant_store = get_mall(project_id).variant_store
    variant_store.delete_family(project_id, family_id)

    delete_xbrowse_model(family)
示例#12
0
def delete_family(project_id, family_id):
    """
    Remove the family ``family_id`` of project ``project_id``.

    Steps, in order: clear the ``family`` reference of each of its
    individuals, delete the family from the variant store, delete the family
    model.
    """
    doomed_family = Family.objects.get(project__project_id=project_id, family_id=family_id)
    for person in doomed_family.get_individuals():
        update_xbrowse_model(person, family=None)

    store = get_mall(project_id).variant_store
    store.delete_family(project_id, family_id)
    delete_xbrowse_model(doomed_family)
示例#13
0
def delete_family(project_id, family_id):
    """
    Delete one family of a project.

    The family's individuals are kept but saved with ``family = None``; the
    family's data is then deleted from the variant store and the family
    record is deleted.
    """
    family = Family.objects.get(project__project_id=project_id, family_id=family_id)

    # unlink every member from the family and persist the change
    for member in family.get_individuals():
        member.family = None
        member.save()

    variant_store = get_mall(project_id).variant_store
    variant_store.delete_family(project_id, family_id)

    family.delete()
示例#14
0
def delete_family(project_id, family_id):
    """
    Remove the family ``family_id`` from project ``project_id``.

    Each individual of the family has its ``family`` attribute set to None
    and is saved, the variant store entry of the family is deleted, and then
    the family object is deleted.
    """
    lookup = {'project__project_id': project_id, 'family_id': family_id}
    doomed_family = Family.objects.get(**lookup)

    for person in doomed_family.get_individuals():
        person.family = None
        person.save()

    store = get_mall(project_id).variant_store
    store.delete_family(project_id, family_id)
    doomed_family.delete()
示例#15
0
def delete_project(project_id, delete_data=False):
    """
    Delete a project together with its individuals and families.

    Args:
        project_id: id of the project to delete.
        delete_data: when true, the project's data is also removed from the
            project datastore and from the variant store before the database
            records are deleted.
    """
    print("Deleting %s" % project_id)
    project = Project.objects.get(project_id=project_id)

    if delete_data:
        project_datastore = get_project_datastore(project_id)
        project_datastore.delete_project_store(project_id)
        variant_store = get_mall(project_id).variant_store
        variant_store.delete_project(project_id)

    # dependent records first, the project itself last
    individuals = project.individual_set.all()
    individuals.delete()
    families = project.family_set.all()
    families.delete()
    project.delete()

    print("Successfully deleted %s" % project_id)
示例#16
0
def load_project_variants(project_id, force_load_annotations=False, force_load_variants=False, ignore_csq_in_vcf=False, start_from_chrom=None, end_with_chrom=None):
    """
    Load any families and cohorts in this project that aren't loaded already

    Args:
        project_id: id of the project to load.
        force_load_annotations: passed as ``force`` to the annotator when each
            VCF file is added.
        force_load_variants: when true, families are loaded even if the
            variant store already reports them as 'loaded'.
        ignore_csq_in_vcf: when true, a VCF without a "CSQ" INFO field is
            accepted instead of raising.
        start_from_chrom: forwarded to the annotator and the family loader.
        end_with_chrom: forwarded to the annotator and the family loader.

    Raises:
        ValueError: if an existing VCF file has no "CSQ" INFO field and
            ``ignore_csq_in_vcf`` is false.
    """
    # single-argument print(...) is valid on Python 2 and Python 3
    print("Loading project %s" % project_id)
    # only the timestamp goes through strftime; the project id is appended
    # afterwards so a '%' inside it cannot be read as a format directive
    print(datetime.now().strftime("%m/%d/%Y %H:%M:%S") + "  -- loading project: " + project_id + " - db.variants cache")
    project = Project.objects.get(project_id=project_id)

    for vcf_obj in sorted(project.get_all_vcf_files(), key=lambda v: v.path()):
        vcf_path = vcf_obj.path()
        if not os.path.isfile(vcf_path):
            print("Skipping " + vcf_path)
            continue

        r = vcf.VCFReader(filename=vcf_path)
        if not ignore_csq_in_vcf and "CSQ" not in r.infos:
            raise ValueError("VEP annotations not found in VCF: " + vcf_path)

        mall.get_annotator().add_preannotated_vcf_file(vcf_path, force=force_load_annotations, start_from_chrom=start_from_chrom, end_with_chrom=end_with_chrom)

    # batch load families by VCF file
    batch_size = settings.FAMILY_LOAD_BATCH_SIZE
    for vcf_file, families in project.families_by_vcf().items():
        if not force_load_variants:
            # filter out families that have already finished loading
            families = [f for f in families if get_mall(project).variant_store.get_family_status(project_id, f.family_id) != 'loaded']

        # range() rather than the Python 2-only xrange(); the sequence of
        # batch offsets is short
        for i in range(0, len(families), batch_size):
            load_variants_for_family_list(project, families[i:i + batch_size], vcf_file, start_from_chrom=start_from_chrom, end_with_chrom=end_with_chrom)

    # now load cohorts
    load_cohorts(project_id)
示例#17
0
def load_project_variants(project_id, force_annotations=False, ignore_csq_in_vcf=False):
    """
    Load any families and cohorts in this project that aren't loaded already

    Args:
        project_id: id of the project to load.
        force_annotations: passed as ``force`` to the annotator when each VCF
            file is added.
        ignore_csq_in_vcf: when true, a VCF without a "CSQ" INFO field is
            accepted instead of raising.

    Raises:
        ValueError: if a VCF file has no "CSQ" INFO field and
            ``ignore_csq_in_vcf`` is false.
    """
    # single-argument print(...) is valid on Python 2 and Python 3
    print("Loading project %s" % project_id)
    # only the timestamp goes through strftime; the project id is appended
    # afterwards so a '%' inside it cannot be read as a format directive
    timestamp_format = "%m/%d/%Y %H:%M:%S"
    print(datetime.now().strftime(timestamp_format) + "  -- loading project: " + project_id + " - db.variants cache")
    project = Project.objects.get(project_id=project_id)

    for vcf_obj in project.get_all_vcf_files():
        vcf_path = vcf_obj.path()
        r = vcf.VCFReader(filename=vcf_path)
        if not ignore_csq_in_vcf and "CSQ" not in r.infos:
            # name the offending file via vcf_obj; no other VCF variable is
            # defined at this point
            raise ValueError("VEP annotations not found in VCF: " + vcf_path)

        mall.get_annotator().add_preannotated_vcf_file(vcf_path, force=force_annotations)

    # batch load families by VCF file
    batch_size = settings.FAMILY_LOAD_BATCH_SIZE
    for vcf_file, families in project.families_by_vcf().items():
        # skip families the variant store already reports as loaded
        families = [f for f in families if get_mall(project.project_id).variant_store.get_family_status(project_id, f.family_id) != 'loaded']
        # range() rather than the Python 2-only xrange()
        for i in range(0, len(families), batch_size):
            batch = families[i:i + batch_size]
            print(datetime.now().strftime(timestamp_format) + "  -- loading project: " + project_id + " - families batch %d - %d families" % (i, len(batch)))
            load_variants_for_family_list(project, batch, vcf_file)

    # now load cohorts
    load_cohorts(project_id)
示例#18
0
文件: views.py 项目: rpete/seqr
def combine_mendelian_families_variants(request):
    """
    Return, per selected family, the variants found in one gene.

    The GET parameters are validated with
    ``CombineMendelianFamiliesVariantsForm``. An invalid form produces a JSON
    error response; otherwise the response lists, for each family of the
    form, its project id, family id, display name and variants.
    """
    project, family_group = utils.get_project_and_family_group_for_user(request.user, request.GET)

    form = api_forms.CombineMendelianFamiliesVariantsForm(request.GET)
    if not form.is_valid():
        return JSONResponse({
            'is_error': True,
            'error': server_utils.form_error_string(form)
        })

    params = form.cleaned_data
    selected_families = params['families']
    variants_grouped = get_variants_by_family_for_gene(
        get_mall(project.project_id),
        [fam.xfamily() for fam in selected_families],
        params['inheritance_mode'],
        params['gene_id'],
        variant_filter=params['variant_filter'],
        quality_filter=params['quality_filter']
    )

    variants_by_family = []
    for fam in selected_families:
        owner_project_id = fam.project.project_id
        # results are keyed by (project_id, family_id)
        family_variants = variants_grouped[(owner_project_id, fam.family_id)]
        add_extra_info_to_variants_family(get_reference(), fam, family_variants)
        variants_by_family.append({
            'project_id': owner_project_id,
            'family_id': fam.family_id,
            'family_name': str(fam),
            'variants': [v.toJSON() for v in family_variants],
        })

    return JSONResponse({
        'is_error': False,
        'variants_by_family': variants_by_family,
    })
def get_variants_for_inheritance_for_project(project, inheritance_mode):
    """
    Generate the variants for this project / inheritance combo.

    This is a generator: for every family of the project it yields a
    ``(family, variant_list)`` pair, where ``variant_list`` holds the variants
    found for ``inheritance_mode`` with the default "moderate_impact" variant
    filter plus the module's frequency and quality thresholds. Progress is
    written to stdout as each family is processed.
    """
    # build the search specification
    # (this could theoretically differ by project, if there are different
    # reference populations)
    variant_filter = get_default_variant_filter("moderate_impact")
    frequency_limits = (
        ("1kg_wgs_phase3", g1k_freq_threshold),
        ("1kg_wgs_phase3_popmax", g1k_popmax_freq_threshold),
        ("exac_v3", exac_freq_threshold),
        ("exac_v3_popmax", exac_popmax_threshold),
    )
    for population_and_limit in frequency_limits:
        variant_filter.ref_freqs.append(population_and_limit)

    quality_filter = {
        "vcf_filter": "pass",
        "min_gq": GQ_threshold,
        "min_ab": AB_threshold,
    }

    # one search per family; results are handed out as they are produced
    families = project.get_families()
    for position, family in enumerate(families, 1):
        progress = "Processing %s - family %s  (%d / %d) .." % (
            inheritance_mode, family.family_id, position, len(families))
        sys.stdout.write(progress)

        matches = get_variants_with_inheritance_mode(
            get_mall(project.project_id),
            family.xfamily(),
            inheritance_mode,
            variant_filter=variant_filter,
            quality_filter=quality_filter,
        )
        variant_list = list(matches)
        yield family, variant_list
        # printed only once the consumer asks for the next family
        print(" got %d variants" % len(variant_list))
示例#20
0
def get_variants_for_inheritance_for_project(project, inheritance_mode):
    """
    Get the variants for this project / inheritance combo.

    Yields one ``(family, variant_list)`` tuple per family of the project
    (the function is a generator, it does not return a dict). Each family is
    searched with the default 'moderate_impact' variant filter extended by
    the module-level reference frequency thresholds, and with the module-level
    genotype quality thresholds.
    """
    # search specification; could theoretically differ by project if there
    # are different reference populations
    variant_filter = get_default_variant_filter('moderate_impact')
    ref_freqs = variant_filter.ref_freqs
    ref_freqs.append(('1kg_wgs_phase3', g1k_freq_threshold))
    ref_freqs.append(('1kg_wgs_phase3_popmax', g1k_popmax_freq_threshold))
    ref_freqs.append(('exac_v3', exac_freq_threshold))
    ref_freqs.append(('exac_v3_popmax', exac_popmax_threshold))
    quality_filter = dict(
        vcf_filter='pass',
        min_gq=GQ_threshold,
        min_ab=AB_threshold,
    )

    # run the search family by family and hand back each result
    families = project.get_families()
    for index, family in enumerate(families):
        status_line = "Processing %s - family %s  (%d / %d) .." % (
            inheritance_mode, family.family_id, index + 1, len(families))
        sys.stdout.write(status_line)

        variant_stream = get_variants_with_inheritance_mode(
            get_mall(project.project_id),
            family.xfamily(),
            inheritance_mode,
            variant_filter=variant_filter,
            quality_filter=quality_filter,
        )
        variant_list = list(variant_stream)
        yield family, variant_list
        # runs when the caller resumes the generator
        print(" got %d variants" % len(variant_list))
示例#21
0
文件: views.py 项目: frichter/seqr
def combine_mendelian_families_variants(request):
    """
    Look up the variants of one gene across several families.

    Input comes from the GET parameters, validated by
    ``CombineMendelianFamiliesVariantsForm``. On success the JSON response
    carries ``variants_by_family``: one entry per family with its project id,
    family id, name and serialized variants. On validation failure the JSON
    response carries ``is_error: True`` and the form's error string.
    """
    project, family_group = utils.get_project_and_family_group_for_user(request.user, request.GET)
    form = api_forms.CombineMendelianFamiliesVariantsForm(request.GET)

    if form.is_valid():
        cleaned = form.cleaned_data
        xfamilies = [f.xfamily() for f in cleaned['families']]
        variants_grouped = get_variants_by_family_for_gene(
            get_mall(project.project_id),
            xfamilies,
            cleaned['inheritance_mode'],
            cleaned['gene_id'],
            variant_filter=cleaned['variant_filter'],
            quality_filter=cleaned['quality_filter']
        )

        variants_by_family = []
        for family in cleaned['families']:
            group_key = (family.project.project_id, family.family_id)
            variants = variants_grouped[group_key]
            add_extra_info_to_variants_family(get_reference(), family, variants)
            entry = {
                'project_id': family.project.project_id,
                'family_id': family.family_id,
                'family_name': str(family),
                'variants': [v.toJSON() for v in variants],
            }
            variants_by_family.append(entry)

        payload = {
            'is_error': False,
            'variants_by_family': variants_by_family,
        }
    else:
        payload = {
            'is_error': True,
            'error': server_utils.form_error_string(form)
        }

    return JSONResponse(payload)
示例#22
0
def get_variants_in_gene(family_group,
                         gene_id,
                         variant_filter=None,
                         quality_filter=None):
    """
    Collect the variants in one gene for every family of a family group.

    For each family the variant store is queried for the gene, the result is
    filtered again with ``variant_filter`` and extended with extra family
    information.

    ``quality_filter`` is accepted but not used by this function.

    Returns:
        A list with one dict per family, holding the keys ``variants``
        (serialized variants), ``family_id``, ``project_id`` and
        ``family_name``.
    """
    results = []
    for family in family_group.get_families():
        project_id = family.project.project_id
        variant_store = get_mall(project_id).variant_store

        in_gene = list(
            variant_store.get_variants_in_gene(
                project_id,
                family.family_id,
                gene_id,
                variant_filter=variant_filter))
        in_gene = search_utils.filter_gene_variants_by_variant_filter(
            in_gene, gene_id, variant_filter)
        add_extra_info_to_variants_family(get_reference(), family, in_gene)

        results.append({
            'variants': [variant.toJSON() for variant in in_gene],
            'family_id': family.family_id,
            'project_id': project_id,
            'family_name': str(family),
        })
    return results
示例#23
0
    def handle(self, *args, **options):
        """
        Generate a report for individuals of one project.

        ``args[0]`` is the project id; any further arguments are individual
        ids. Without individual ids, every individual of the project is used.
        Individuals whose family has no collection in the variant datastore
        are skipped with a warning; the rest are passed to
        ``handle_individual``. Exits the process when the project or one of
        the given individuals does not exist.
        """
        if not args:
            print("Please provide the project_id. The individual_id(s) are optional")
            return

        project_id = args[0]
        requested_ids = args[1:]

        try:
            project = Project.objects.get(project_id=project_id)
        except ObjectDoesNotExist:
            sys.exit("Invalid project id: " + project_id)

        try:
            if requested_ids:
                individuals = [
                    Individual.objects.get(project=project, indiv_id=indiv_id)
                    for indiv_id in requested_ids
                ]
            else:
                individuals = list(Individual.objects.filter(project=project))
        except ObjectDoesNotExist:
            sys.exit("Invalid individual ids: " + str(requested_ids))

        for individual in individuals:
            family_id = individual.family.family_id
            variant_store = get_mall(project_id).variant_store
            family_collection = variant_store._get_family_collection(project_id, family_id)
            if family_collection is None:
                print("WARNING: Family %s data not loaded in variant datastore. Skipping individual %s." % (family_id, individual))
                continue
            self.handle_individual(project, individual)

        print("Finished generating report")
示例#24
0
def load_cohorts(project_id):
    """
    Load the cohorts of a project that are not loaded yet.

    Cohorts are grouped by VCF file; those whose status in the variant store
    is already 'loaded' are skipped, and the rest are loaded in batches of
    ``settings.FAMILY_LOAD_BATCH_SIZE``.
    """
    project = Project.objects.get(project_id=project_id)
    batch_size = settings.FAMILY_LOAD_BATCH_SIZE
    for vcf_file, cohorts in project.cohorts_by_vcf().items():
        # keep only the cohorts that still need loading
        cohorts = [c for c in cohorts if get_mall(project).variant_store.get_family_status(project_id, c.cohort_id) != 'loaded']
        # range() rather than the Python 2-only xrange(); the sequence of
        # batch offsets is short
        for i in range(0, len(cohorts), batch_size):
            batch = cohorts[i:i + batch_size]
            print("Loading project %s - cohorts: %s" % (project_id, batch))
            load_variants_for_cohort_list(project, batch)
示例#25
0
    def handle(self, *args, **options):
        """
        Attach CNV calls from a tab-separated file to stored variants.

        Options read: ``project_id``, ``cnv_filename`` and
        ``bed_files_directory``. The first line of the CNV file names the
        columns; the columns ``chr``, ``start``, ``end`` and ``sample`` are
        used. For every row, the variants whose ``xpos`` lies within the
        row's interval get the whole row stored under
        ``genotypes.<indiv_id>.extras.cnvs``, in the project collection and
        in the family collection (whichever of them exist).

        Raises:
            ValueError: if the CNV file does not exist.
        """
        project_id = options['project_id']
        print("Loading data into project: " + project_id)
        project = Project.objects.get(project_id=project_id)

        cnv_filename = options['cnv_filename']
        bed_files_directory = options['bed_files_directory']

        if not os.path.isfile(cnv_filename):
            raise ValueError("CNV file %s doesn't exist" % options['cnv_filename'])

        with open(cnv_filename) as cnv_file:
            column_names = cnv_file.readline().rstrip('\n').split('\t')
            for line in cnv_file:
                values = line.rstrip('\n').split('\t')
                row_dict = dict(zip(column_names, values))

                chrom = "chr" + row_dict['chr']
                start = int(row_dict['start'])
                end = int(row_dict['end'])

                sample_id = row_dict['sample']
                try:
                    individual = Individual.objects.get(project=project, indiv_id__istartswith=sample_id)
                except Exception as e:
                    # best effort: rows without a unique matching individual are skipped
                    print("WARNING: %s: %s not found in %s" % (e, sample_id, project))
                    continue

                bed_file_path = os.path.join(bed_files_directory, "%s.bed" % sample_id)
                if not os.path.isfile(bed_file_path):
                    print("WARNING: .bed file not found: " + bed_file_path)

                    # NOTE(review): the path is only recorded when the .bed
                    # file is missing - confirm whether this update was meant
                    # to sit outside the "not found" branch
                    if individual.cnv_bed_file != bed_file_path:
                        print("Setting cnv_bed_file path to %s" % bed_file_path)
                        individual.cnv_bed_file = bed_file_path
                        individual.save()

                project_collection = get_project_datastore(project)._get_project_collection(project_id)
                family_collection = get_mall(project).variant_store._get_family_collection(project_id, individual.family.family_id)

                # falsy entries are dropped before updating
                for collection in filter(None, [project_collection, family_collection]):
                    lower_bound = {'xpos': {'$gte': genomeloc.get_single_location(chrom, start)}}
                    upper_bound = {'xpos': {'$lte': genomeloc.get_single_location(chrom, end)}}
                    cnv_field = 'genotypes.%s.extras.cnvs' % individual.indiv_id
                    collection.update_many(
                        {'$and': [lower_bound, upper_bound]},
                        {'$set': {cnv_field: row_dict}})

        print("Done")
示例#26
0
def load_project_variants(project_id,
                          force_load_annotations=False,
                          force_load_variants=False,
                          ignore_csq_in_vcf=False,
                          start_from_chrom=None,
                          end_with_chrom=None):
    """
    Load any families and cohorts in this project that aren't loaded already

    Args:
        project_id: id of the project to load.
        force_load_annotations: passed as ``force`` to the annotator when each
            VCF file is added.
        force_load_variants: when true, families are loaded even if the
            variant store already reports them as 'loaded'.
        ignore_csq_in_vcf: when true, a VCF without a "CSQ" INFO field is
            accepted instead of raising.
        start_from_chrom: forwarded to the annotator and the family loader.
        end_with_chrom: forwarded to the annotator and the family loader.

    Raises:
        ValueError: if an existing VCF file has no "CSQ" INFO field and
            ``ignore_csq_in_vcf`` is false.
    """
    # single-argument print(...) is valid on Python 2 and Python 3
    print("Loading project %s" % project_id)
    # only the timestamp goes through strftime; the project id is appended
    # afterwards so a '%' inside it cannot be read as a format directive
    timestamp_format = "%m/%d/%Y %H:%M:%S"
    print(
        datetime.now().strftime(timestamp_format) +
        "  -- loading project: " + project_id + " - db.variants cache")
    project = Project.objects.get(project_id=project_id)

    for vcf_obj in sorted(project.get_all_vcf_files(), key=lambda v: v.path()):
        vcf_path = vcf_obj.path()
        if not os.path.isfile(vcf_path):
            print("Skipping " + vcf_path)
            continue

        r = vcf.VCFReader(filename=vcf_path)
        if not ignore_csq_in_vcf and "CSQ" not in r.infos:
            raise ValueError("VEP annotations not found in VCF: " + vcf_path)

        mall.get_annotator().add_preannotated_vcf_file(
            vcf_path,
            force=force_load_annotations,
            start_from_chrom=start_from_chrom,
            end_with_chrom=end_with_chrom)

    # batch load families by VCF file
    batch_size = settings.FAMILY_LOAD_BATCH_SIZE
    for vcf_file, families in project.families_by_vcf().items():
        if not force_load_variants:
            # filter out families that have already finished loading
            families = [
                f for f in families
                if get_mall(project.project_id).variant_store.
                get_family_status(project_id, f.family_id) != 'loaded'
            ]

        # range() rather than the Python 2-only xrange(); the sequence of
        # batch offsets is short
        for i in range(0, len(families), batch_size):
            batch = families[i:i + batch_size]
            print(
                datetime.now().strftime(timestamp_format) +
                "  -- loading project: " + project_id +
                " - families batch %d - %d families" % (i, len(batch)))
            load_variants_for_family_list(
                project,
                batch,
                vcf_file,
                start_from_chrom=start_from_chrom,
                end_with_chrom=end_with_chrom)

    # now load cohorts
    load_cohorts(project_id)
示例#27
0
def calculate_mendelian_variant_search(search_spec, xfamily):
    """
    Run the variant search described by search_spec against one family.

    The search is selected by search_spec.search_mode. The matching variants
    are returned as a list; None is returned when the mode is not one of the
    modes handled below.
    """
    mode = search_spec.search_mode

    if mode == 'standard_inheritance':
        return list(get_variants_with_inheritance_mode(
            get_mall(),
            xfamily,
            search_spec.inheritance_mode,
            variant_filter=search_spec.variant_filter,
            quality_filter=search_spec.genotype_quality_filter,
        ))

    if mode == 'custom_inheritance':
        return list(get_variants_family(
            get_datastore(),
            xfamily,
            genotype_filter=search_spec.genotype_inheritance_filter,
            variant_filter=search_spec.variant_filter,
            quality_filter=search_spec.genotype_quality_filter,
        ))

    if mode == 'gene_burden':
        # genes are searched first, then flattened into a variant stream
        matching_genes = get_genes_family(
            get_datastore(),
            get_reference(),
            xfamily,
            burden_filter=search_spec.gene_burden_filter,
            variant_filter=search_spec.variant_filter,
            quality_filter=search_spec.genotype_quality_filter,
        )
        return list(stream_utils.gene_stream_to_variant_stream(
            matching_genes, get_reference()))

    if mode == 'allele_count':
        return list(get_variants_allele_count(
            get_datastore(),
            xfamily,
            search_spec.allele_count_filter,
            variant_filter=search_spec.variant_filter,
            quality_filter=search_spec.genotype_quality_filter,
        ))

    if mode == 'all_variants':
        return list(get_variants_family(
            get_datastore(),
            xfamily,
            variant_filter=search_spec.variant_filter,
            quality_filter=search_spec.genotype_quality_filter,
        ))

    # unrecognized search mode
    return None
示例#28
0
def load_variants_for_family_list(project,
                                  families,
                                  vcf_file,
                                  mark_as_loaded=True,
                                  start_from_chrom=None,
                                  end_with_chrom=None):
    """
    (Re)load variants for a batch of families that all share one vcf.

    The families are registered with the project's variant store, vcf_file is
    loaded for them, and _family_postprocessing is then run on every family.
    mark_as_loaded, start_from_chrom and end_with_chrom are passed through
    unchanged to the store's load_family_set call.
    """
    # one descriptor per family, in the shape the datastore expects
    family_records = [
        {
            'project_id': fam.project.project_id,
            'family_id': fam.family_id,
            'individuals': fam.indiv_ids_with_variant_data(),
        }
        for fam in families
    ]

    # register every family from this vcf with the datastore
    get_mall(project.project_id).variant_store.add_family_set(family_records)

    # map vcf sample ids to individual ids; individuals without a vcf_id are left out
    vcf_id_map = {}
    for fam in families:
        for person in fam.get_individuals():
            if not person.vcf_id:
                continue
            vcf_id_map[person.vcf_id] = person.indiv_id

    # load the whole batch into the datastore
    family_keys = [(rec['project_id'], rec['family_id'])
                   for rec in family_records]
    get_mall(project.project_id).variant_store.load_family_set(
        vcf_file,
        family_keys,
        reference_populations=project.get_reference_population_slugs(),
        vcf_id_map=vcf_id_map,
        mark_as_loaded=mark_as_loaded,
        start_from_chrom=start_from_chrom,
        end_with_chrom=end_with_chrom,
    )

    # per-family wrap-up
    for fam in families:
        _family_postprocessing(fam)
示例#29
0
def load_cohorts(project_id):
    """
    Load variants for every cohort of the project that is not loaded yet.

    Cohorts are grouped by vcf file; cohorts whose status in the variant
    store is 'loaded' are skipped and the rest are loaded in batches of
    settings.FAMILY_LOAD_BATCH_SIZE. A timestamped message is printed before
    and after the whole run.
    """
    print(date.strftime(
        datetime.now(),
        "%m/%d/%Y %H:%M:%S  -- loading project: " + project_id + " - cohorts"))

    project = Project.objects.get(project_id=project_id)
    for _vcf_file, vcf_cohorts in project.cohorts_by_vcf().items():
        # keep only the cohorts that have not finished loading
        pending = []
        for cohort in vcf_cohorts:
            status = get_mall(project.project_id).variant_store.get_family_status(
                project_id, cohort.cohort_id)
            if status != 'loaded':
                pending.append(cohort)

        batch_size = settings.FAMILY_LOAD_BATCH_SIZE
        for offset in xrange(0, len(pending), batch_size):
            batch = pending[offset:offset + batch_size]
            print("Loading project %s - cohorts: %s" % (project_id, batch))
            load_variants_for_cohort_list(project, batch)

    print(date.strftime(
        datetime.now(),
        "%m/%d/%Y %H:%M:%S  -- finished loading project: " + project_id))
示例#30
0
def clean_project(project_id):
    """
    Wipe this project's data out of the xbrowse resources, in this order:
     - variant datastore
     - coverage store
     - cnv store
    The project's data links are not removed - so no data is lost, but
    everything must be rebuilt afterwards.
    """
    members = Project.objects.get(project_id=project_id).get_individuals()

    # variant datastore: drop the whole project at once
    get_mall(project_id).variant_store.delete_project(project_id)

    # coverage store: one sample per individual
    for member in members:
        coverage_store = get_coverage_store()
        coverage_store.remove_sample(member.get_coverage_store_id())

    # cnv store: one sample per individual
    # NOTE(review): samples are removed by their coverage-store id here too -
    # confirm the cnv store really uses the same ids.
    for member in members:
        cnv_store = get_cnv_store()
        cnv_store.remove_sample(member.get_coverage_store_id())
示例#31
0
文件: utils.py 项目: mattsolo1/seqr
def calculate_mendelian_variant_search(search_spec, xfamily):
    """
    Run the variant search described by search_spec against one family.

    A trace line with the project id, search mode and the spec's attributes
    is written to stderr first. The search itself is chosen by
    search_spec.search_mode and runs against the mall / datastore of
    xfamily.project_id. Returns the matching variants as a list, or None for
    a search mode that is not handled here.
    """
    sys.stderr.write(
        "     mendelian_variant_search for %s - search mode: %s  %s\n"
        % (xfamily.project_id, search_spec.search_mode, search_spec.__dict__))

    mode = search_spec.search_mode

    if mode == 'standard_inheritance':
        matches = get_variants_with_inheritance_mode(
            get_mall(xfamily.project_id),
            xfamily,
            search_spec.inheritance_mode,
            variant_filter=search_spec.variant_filter,
            quality_filter=search_spec.quality_filter,
        )
        return list(matches)

    if mode == 'custom_inheritance':
        matches = get_variants_family(
            get_datastore(xfamily.project_id),
            xfamily,
            genotype_filter=search_spec.genotype_inheritance_filter,
            variant_filter=search_spec.variant_filter,
            quality_filter=search_spec.quality_filter,
        )
        return list(matches)

    if mode == 'gene_burden':
        # search at the gene level, then expand the genes into variants
        matching_genes = get_genes_family(
            get_datastore(xfamily.project_id),
            get_reference(),
            xfamily,
            burden_filter=search_spec.gene_burden_filter,
            variant_filter=search_spec.variant_filter,
            quality_filter=search_spec.quality_filter,
        )
        return list(stream_utils.gene_stream_to_variant_stream(
            matching_genes, get_reference()))

    if mode == 'allele_count':
        matches = get_variants_allele_count(
            get_datastore(xfamily.project_id),
            xfamily,
            search_spec.allele_count_filter,
            variant_filter=search_spec.variant_filter,
            quality_filter=search_spec.quality_filter,
        )
        return list(matches)

    if mode == 'all_variants':
        matches = get_variants_family(
            get_datastore(xfamily.project_id),
            xfamily,
            variant_filter=search_spec.variant_filter,
            quality_filter=search_spec.quality_filter,
            indivs_to_consider=xfamily.indiv_id_list(),
        )
        return list(matches)

    # unrecognized search mode
    return None
示例#32
0
def clean_project(project_id):
    """
    Clear everything stored for this project in the xbrowse resources.

    The variant datastore is cleared first, then the coverage store, then the
    cnv store. None of the project's data links are removed - so no data is
    lost, but everything must be rebuilt.
    """
    project = Project.objects.get(project_id=project_id)
    project_individuals = project.get_individuals()

    # datastore
    variant_store = get_mall(project_id).variant_store
    variant_store.delete_project(project_id)

    # coverage store
    for person in project_individuals:
        get_coverage_store().remove_sample(
            person.get_coverage_store_id())

    # cnv store
    # NOTE(review): the id passed here comes from get_coverage_store_id();
    # verify that the cnv store is keyed the same way.
    for person in project_individuals:
        get_cnv_store().remove_sample(
            person.get_coverage_store_id())
示例#33
0
def get_variants_for_inheritance_for_project(project, inheritance_mode):
    """
    Generate the variants for this project / inheritance combo, family by family.

    This is a generator: for each family from project.get_families() it
    yields a (family, list of variants) tuple. With inheritance_mode
    "all_variants" the variants come from get_variants; any other mode is
    handed to get_variants_with_inheritance_mode. If the search for a family
    raises ValueError, the error is printed and that family is skipped.
    """

    # create search specification
    # this could theoretically differ by project, if there are different reference populations
    #variant_filter = VariantFilter(so_annotations=SO_SEVERITY_ORDER, ref_freqs=[])
    variant_filter = get_default_variant_filter('moderate_impact')
    frequency_cutoffs = (
        ('1kg_wgs_phase3', g1k_freq_threshold),
        ('1kg_wgs_phase3_popmax', g1k_popmax_freq_threshold),
        ('exac_v3', exac_freq_threshold),
        ('exac_v3_popmax', exac_popmax_threshold),
        ('merck-wgs-3793', merck_wgs_3793_threshold),
        #('merck-pcr-free-wgs-144', merck_wgs_144_threshold),
    )
    for cutoff in frequency_cutoffs:
        variant_filter.ref_freqs.append(cutoff)

    quality_filter = {
        #        'vcf_filter': 'pass',
        'min_gq': GQ_threshold,
        'min_ab': AB_threshold,
    }

    # run the search for each family, handing results back one family at a time
    families = project.get_families()

    for position, family in enumerate(families, 1):
        print("Processing %s - family %s  (%d / %d)" %
              (inheritance_mode, family.family_id, position, len(families)))
        try:
            if inheritance_mode == "all_variants":
                variant_stream = get_variants(
                    get_datastore(project.project_id),
                    family.xfamily(),
                    variant_filter=variant_filter,
                    quality_filter=quality_filter,
                    indivs_to_consider=family.indiv_id_list())
            else:
                variant_stream = get_variants_with_inheritance_mode(
                    get_mall(project.project_id),
                    family.xfamily(),
                    inheritance_mode,
                    variant_filter=variant_filter,
                    quality_filter=quality_filter,
                )
            yield family, list(variant_stream)
        except ValueError as err:
            print("Error: %s. Skipping family %s" % (str(err), str(family)))
示例#34
0
def load_cohorts(project_id):
    """
    Load variants for the project's cohorts that are not loaded yet.

    Cohorts are taken per vcf file from project.cohorts_by_vcf(); those whose
    status in the variant store is already 'loaded' are dropped, and the
    remaining ones are loaded settings.FAMILY_LOAD_BATCH_SIZE at a time.
    """
    project = Project.objects.get(project_id=project_id)

    def _needs_loading(cohort):
        # True unless the variant store reports this cohort as 'loaded'
        status = get_mall(project).variant_store.get_family_status(
            project_id, cohort.cohort_id)
        return status != 'loaded'

    for _vcf_file, vcf_cohorts in project.cohorts_by_vcf().items():
        pending = [cohort for cohort in vcf_cohorts if _needs_loading(cohort)]
        step = settings.FAMILY_LOAD_BATCH_SIZE
        for first in xrange(0, len(pending), step):
            batch = pending[first:first + step]
            print("Loading project %s - cohorts: %s" % (project_id, batch))
            load_variants_for_cohort_list(project, batch)
示例#35
0
def inheritance_matrix_for_gene(project, gene_id):
    """
    Compute get_family_matrix_for_gene over the active families of this project.

    The search uses the default 'moderate_impact' variant filter and the
    default 'high_quality' quality filter, both built with the annotator's
    reference population slugs.
    """
    variant_filter = get_default_variant_filter(
        'moderate_impact',
        mall.get_annotator().reference_population_slugs)
    quality_filter = get_default_quality_filter(
        'high_quality',
        mall.get_annotator().reference_population_slugs)

    search_mall = get_mall()
    active_xfamilies = [family.xfamily()
                        for family in project.get_active_families()]
    return get_family_matrix_for_gene(
        search_mall,
        active_xfamilies,
        gene_id,
        variant_filter,
        quality_filter,
    )
示例#36
0
文件: project.py 项目: dmyung/xbrowse
def inheritance_matrix_for_gene(project, gene_id):
    """
    Build the family matrix for gene_id across the project's active families.

    Delegates to get_family_matrix_for_gene with the project's mall, the
    default 'moderate_impact' variant filter and the default 'high_quality'
    quality filter (each created with the annotator's reference population
    slugs).
    """
    moderate_impact = get_default_variant_filter(
        'moderate_impact', mall.get_annotator().reference_population_slugs)
    high_quality = get_default_quality_filter(
        'high_quality', mall.get_annotator().reference_population_slugs)

    project_mall = get_mall(project.project_id)
    xfamilies = [fam.xfamily() for fam in project.get_active_families()]
    return get_family_matrix_for_gene(
        project_mall, xfamilies, gene_id, moderate_impact, high_quality)
示例#37
0
def load_project_variants(project_id, force_annotations=False):
    """
    Load any families and cohorts in this project that aren't loaded already.

    Every vcf file of the project is first added to the annotator
    (force_annotations is forwarded as force_all). Families and then cohorts
    whose status in the variant store is not 'loaded' are loaded per vcf file
    in batches of settings.FAMILY_LOAD_BATCH_SIZE.
    """
    print("Loading project %s" % project_id)
    project = Project.objects.get(project_id=project_id)

    for vcf_obj in project.get_all_vcf_files():
        mall.get_annotator().add_vcf_file_to_annotator(
            vcf_obj.path(), force_all=force_annotations)

    # batch load families by VCF file
    for vcf_file, vcf_families in project.families_by_vcf().items():
        pending_families = []
        for fam in vcf_families:
            status = get_mall().variant_store.get_family_status(
                project_id, fam.family_id)
            if status != 'loaded':
                pending_families.append(fam)

        batch_size = settings.FAMILY_LOAD_BATCH_SIZE
        for offset in xrange(0, len(pending_families), batch_size):
            load_variants_for_family_list(
                project, pending_families[offset:offset + batch_size], vcf_file)

    # now load cohorts
    # TODO: load cohorts and families together
    for vcf_file, vcf_cohorts in project.cohorts_by_vcf().items():
        pending_cohorts = []
        for cohort in vcf_cohorts:
            status = get_mall().variant_store.get_family_status(
                project_id, cohort.cohort_id)
            if status != 'loaded':
                pending_cohorts.append(cohort)

        batch_size = settings.FAMILY_LOAD_BATCH_SIZE
        for offset in xrange(0, len(pending_cohorts), batch_size):
            load_variants_for_cohort_list(
                project, pending_cohorts[offset:offset + batch_size], vcf_file)

    print("Finished loading project %s!" % project_id)
def get_variants_for_inheritance_for_project(project, inheritance_mode):
    """
    Yield the variants for this project / inheritance combo, one family at a time.

    Generator of (family, list of variants) tuples, one per family returned
    by project.get_families(). "all_variants" searches with get_variants;
    every other inheritance_mode goes through
    get_variants_with_inheritance_mode. A family whose search raises
    ValueError is reported with print and skipped.
    """

    # create search specification
    # this could theoretically differ by project, if there are different reference populations
    #variant_filter = VariantFilter(so_annotations=SO_SEVERITY_ORDER, ref_freqs=[])
    variant_filter = get_default_variant_filter('moderate_impact')
    for population, threshold in (
            ('1kg_wgs_phase3', g1k_freq_threshold),
            ('1kg_wgs_phase3_popmax', g1k_popmax_freq_threshold),
            ('exac_v3', exac_freq_threshold),
            ('exac_v3_popmax', exac_popmax_threshold),
            ('merck-wgs-3793', merck_wgs_3793_threshold),
            #('merck-pcr-free-wgs-144', merck_wgs_144_threshold),
    ):
        variant_filter.ref_freqs.append((population, threshold))

    quality_filter = {
#        'vcf_filter': 'pass',
        'min_gq': GQ_threshold,
        'min_ab': AB_threshold,
    }

    def _search_family(family):
        # run the search that matches inheritance_mode and materialize the results
        if inheritance_mode == "all_variants":
            return list(get_variants(
                get_datastore(project.project_id),
                family.xfamily(),
                variant_filter=variant_filter,
                quality_filter=quality_filter,
                indivs_to_consider=family.indiv_id_list()
            ))
        return list(get_variants_with_inheritance_mode(
            get_mall(project.project_id),
            family.xfamily(),
            inheritance_mode,
            variant_filter=variant_filter,
            quality_filter=quality_filter,
        ))

    # run the search for each family, handing back results as they are computed
    families = project.get_families()

    for number, family in enumerate(families, 1):
        print("Processing %s - family %s  (%d / %d)" % (inheritance_mode, family.family_id, number, len(families)))
        try:
            yield family, _search_family(family)
        except ValueError as err:
            print("Error: %s. Skipping family %s" % (str(err), str(family)))
示例#39
0
def get_gene_diangostic_info(family, gene_id, variant_filter=None):
    """
    Assemble a GeneDiagnosticInfo for one gene in one family.

    Fills in, in this order: the gene phenotype summary (from the reference),
    the gene sequencing summary (from the coverage store), the diagnostic
    search variants (from the variant store of the family's project, narrowed
    by variant_filter) and the diagnostic search CNVs (from the cnv store).
    """
    info = GeneDiagnosticInfo(gene_id)

    phenotype_summary = get_gene_phenotype_summary(get_reference(), gene_id)
    info._gene_phenotype_summary = phenotype_summary

    sequencing_summary = get_gene_sequencing_summary(
        get_coverage_store(), family, gene_id)
    info._gene_sequencing_summary = sequencing_summary

    variant_store = get_mall(family.project).variant_store
    info._variants = get_diagnostic_search_variants_in_family(
        variant_store, family, gene_id, variant_filter)

    info._cnvs = get_diagnostic_search_cnvs_in_family(
        get_cnv_store(), family, gene_id)

    return info
示例#40
0
def get_gene_diangostic_info(family, gene_id, variant_filter=None):
    """
    Collect the diagnostic information for gene_id in this family.

    Returns a GeneDiagnosticInfo whose phenotype summary, sequencing summary,
    variants and cnvs have been populated from the reference, the coverage
    store, the variant store of the family's project and the cnv store.
    variant_filter is forwarded to the variant lookup only.
    """
    result = GeneDiagnosticInfo(gene_id)

    # gene-level summaries
    result._gene_phenotype_summary = get_gene_phenotype_summary(
        get_reference(),
        gene_id,
    )
    result._gene_sequencing_summary = get_gene_sequencing_summary(
        get_coverage_store(),
        family,
        gene_id,
    )

    # family-level findings in this gene
    family_variants = get_diagnostic_search_variants_in_family(
        get_mall(family.project).variant_store,
        family,
        gene_id,
        variant_filter,
    )
    result._variants = family_variants

    family_cnvs = get_diagnostic_search_cnvs_in_family(
        get_cnv_store(),
        family,
        gene_id,
    )
    result._cnvs = family_cnvs

    return result
示例#41
0
def calculate_combine_mendelian_families(family_group, search_spec, user=None):
    """
    Calculate search results from the params in search_spec.
    Should be called after cache is checked - this does all the computation.

    Returns a list with one dict per gene reported by get_families_by_gene,
    with the keys gene_info, gene_id, gene_name, chr, start, end and
    family_id_list. Genes the reference does not know are skipped; chr, start
    and end are None when looking up the gene's position raises KeyError.
    """
    xfamilygroup = family_group.xfamilygroup()

    families_by_gene = get_families_by_gene(
        get_mall(family_group.project),
        xfamilygroup,
        search_spec.inheritance_mode,
        search_spec.variant_filter,
        search_spec.quality_filter,
        user=user,
    )

    results = []
    for gene_id, family_id_list in families_by_gene:
        xgene = get_reference().get_gene(gene_id)
        if xgene is None:
            continue

        # translate the gene bounds into chromosome / start / end
        try:
            start_pos, end_pos = get_reference().get_gene_bounds(gene_id)
            chrom, start = genomeloc.get_chr_pos(start_pos)
            end = genomeloc.get_chr_pos(end_pos)[1]
        except KeyError:
            chrom = start = end = None

        results.append({
            'gene_info': xgene,
            'gene_id': gene_id,
            'gene_name': xgene['symbol'],
            'chr': chrom,
            'start': start,
            'end': end,
            'family_id_list': family_id_list,
        })

    return results
示例#42
0
def calculate_combine_mendelian_families(family_group, search_spec, user=None):
    """
    Run the combined-families search described by search_spec.
    Should be called after cache is checked - this does all the computation.

    Returns a list of gene dicts (gene_info, gene_id, gene_name, chr, start,
    end, family_id_list), one for each gene yielded by get_families_by_gene
    that the reference can resolve. The position fields are None when the
    gene bounds lookup raises KeyError.
    """
    xfamilygroup = family_group.xfamilygroup()
    gene_hits = get_families_by_gene(
        get_mall(family_group.project),
        xfamilygroup,
        search_spec.inheritance_mode,
        search_spec.variant_filter,
        search_spec.quality_filter,
        user=user,
    )

    genes = []
    for gene_id, family_id_list in gene_hits:
        xgene = get_reference().get_gene(gene_id)
        if xgene is not None:
            try:
                bounds = get_reference().get_gene_bounds(gene_id)
                start_pos, end_pos = bounds
                chrom, start = genomeloc.get_chr_pos(start_pos)
                end = genomeloc.get_chr_pos(end_pos)[1]
            except KeyError:
                # position could not be resolved for this gene
                chrom, start, end = None, None, None

            gene_record = dict(
                gene_info=xgene,
                gene_id=gene_id,
                gene_name=xgene['symbol'],
                chr=chrom,
                start=start,
                end=end,
                family_id_list=family_id_list,
            )
            genes.append(gene_record)

    return genes
示例#43
0
def get_variants_in_gene(family_group, gene_id, variant_filter=None, quality_filter=None):
    """
    Collect the variants in gene_id for each family of family_group.

    Returns a list with one dict per family, holding the family's variants in
    their toJSON() form plus family_id, project_id and family_name.
    quality_filter is accepted but is not used by this function.
    """
    results = []
    for family in family_group.get_families():
        project_id = family.project.project_id
        variant_store = get_mall(project_id).variant_store

        # fetch from the store, then narrow again with the gene-level filter
        gene_variants = list(variant_store.get_variants_in_gene(
            project_id,
            family.family_id,
            gene_id,
            variant_filter=variant_filter
        ))
        gene_variants = search_utils.filter_gene_variants_by_variant_filter(
            gene_variants, gene_id, variant_filter)
        add_extra_info_to_variants_family(get_reference(), family, gene_variants)

        results.append({
            'variants': [variant.toJSON() for variant in gene_variants],
            'family_id': family.family_id,
            'project_id': project_id,
            'family_name': str(family),
        })
    return results
示例#44
0
    def handle(self, *args, **options):
        """
        Generate the report for individuals of one project.

        args[0] is the project_id; any further args are individual ids within
        that project. With no individual ids, every individual of the project
        is processed. Exits via sys.exit when the project or a requested
        individual does not exist. Individuals whose family has no collection
        in the variant datastore are skipped with a warning; the rest are
        passed to self.handle_individual.
        """
        if not args:
            print(
                "Please provide the project_id. The individual_id(s) are optional"
            )
            return

        project_id = args[0]
        requested_ids = args[1:]

        try:
            project = Project.objects.get(project_id=project_id)
        except ObjectDoesNotExist:
            sys.exit("Invalid project id: " + project_id)

        # resolve the requested ids, or fall back to everyone in the project
        try:
            if requested_ids:
                individuals = [
                    Individual.objects.get(project=project, indiv_id=indiv_id)
                    for indiv_id in requested_ids
                ]
            else:
                individuals = list(Individual.objects.filter(project=project))
        except ObjectDoesNotExist:
            sys.exit("Invalid individual ids: " + str(requested_ids))

        for individual in individuals:
            variant_store = get_mall(project_id).variant_store
            family_id = individual.family.family_id
            if variant_store._get_family_collection(project_id, family_id) is None:
                print(
                    "WARNING: Family %s data not loaded in variant datastore. Skipping individual %s."
                    % (family_id, individual))
                continue
            self.handle_individual(project, individual)
        print("Finished generating report")
示例#45
0
def load_cohorts(project_id):
    """
    Load the variants of every not-yet-loaded cohort in the project.

    Prints a timestamped start message, then walks the project's cohorts per
    vcf file, skips cohorts the variant store reports as 'loaded', and loads
    the others in slices of settings.FAMILY_LOAD_BATCH_SIZE. A timestamped
    finish message is printed at the end.
    """
    started = ("%m/%d/%Y %H:%M:%S  -- loading project: " + project_id +
               " - cohorts")
    print(date.strftime(datetime.now(), started))

    project = Project.objects.get(project_id=project_id)
    for _vcf_file, vcf_cohorts in project.cohorts_by_vcf().items():
        # drop cohorts that already finished loading
        remaining = []
        for cohort in vcf_cohorts:
            store = get_mall(project.project_id).variant_store
            if store.get_family_status(project_id, cohort.cohort_id) != 'loaded':
                remaining.append(cohort)

        chunk_size = settings.FAMILY_LOAD_BATCH_SIZE
        for begin in xrange(0, len(remaining), chunk_size):
            chunk = remaining[begin:begin + chunk_size]
            print("Loading project %s - cohorts: %s" % (project_id, chunk))
            load_variants_for_cohort_list(project, chunk)

    finished = "%m/%d/%Y %H:%M:%S  -- finished loading project: " + project_id
    print(date.strftime(datetime.now(), finished))
示例#46
0
def load_project_variants(project_id,
                          force_annotations=False,
                          ignore_csq_in_vcf=False):
    """
    Load any families and cohorts in this project that aren't loaded already.

    Steps:
     1. Each vcf file of the project is added to the annotator. A file whose
        header infos contain "CSQ" is added as a pre-annotated vcf, unless
        ignore_csq_in_vcf is set; every other file goes through
        add_vcf_file_to_annotator. force_annotations is forwarded to either call.
     2. Families whose status in the variant store is not 'loaded' are loaded
        per vcf file in batches of settings.FAMILY_LOAD_BATCH_SIZE.
     3. Cohorts are loaded the same way.
    Timestamped progress messages are printed along the way, and
    "du /mongo/mongodb" is run through os.system before steps 1 and 3.
    """
    print("Loading project %s" % project_id)
    print(date.strftime(
        datetime.now(),
        "%m/%d/%Y %H:%M:%S  -- loading project: " + project_id +
        " - db.variants cache"))
    os.system("du /mongo/mongodb")
    project = Project.objects.get(project_id=project_id)

    for vcf_obj in project.get_all_vcf_files():
        reader = vcf.VCFReader(filename=vcf_obj.path())
        if ignore_csq_in_vcf or "CSQ" not in reader.infos:
            mall.get_annotator().add_vcf_file_to_annotator(
                vcf_obj.path(), force_all=force_annotations)
        else:
            mall.get_annotator().add_preannotated_vcf_file(
                vcf_obj.path(), force=force_annotations)

    # batch load families by VCF file
    for vcf_file, vcf_families in project.families_by_vcf().items():
        pending_families = []
        for fam in vcf_families:
            status = get_mall(project.project_id).variant_store.get_family_status(
                project_id, fam.family_id)
            if status != 'loaded':
                pending_families.append(fam)

        batch_size = settings.FAMILY_LOAD_BATCH_SIZE
        for offset in xrange(0, len(pending_families), batch_size):
            batch = pending_families[offset:offset + batch_size]
            batch_note = " - families batch %d - %d families" % (
                offset, len(batch))
            print(date.strftime(
                datetime.now(),
                "%m/%d/%Y %H:%M:%S  -- loading project: " + project_id +
                batch_note))
            load_variants_for_family_list(project, batch, vcf_file)

    # now load cohorts
    cohorts_banner = ("%m/%d/%Y %H:%M:%S  -- loading project: " + project_id +
                      " - cohorts")
    print(date.strftime(datetime.now(), cohorts_banner))
    # TODO: load cohorts and families together
    # NOTE(review): this banner is emitted twice in a row - looks like a
    # copy/paste leftover; confirm before removing.
    print(date.strftime(datetime.now(), cohorts_banner))
    os.system("du /mongo/mongodb")
    for _vcf_file, vcf_cohorts in project.cohorts_by_vcf().items():
        pending_cohorts = []
        for cohort in vcf_cohorts:
            status = get_mall(project.project_id).variant_store.get_family_status(
                project_id, cohort.cohort_id)
            if status != 'loaded':
                pending_cohorts.append(cohort)

        batch_size = settings.FAMILY_LOAD_BATCH_SIZE
        for offset in xrange(0, len(pending_cohorts), batch_size):
            batch = pending_cohorts[offset:offset + batch_size]
            print("Loading project %s - cohorts: %s" % (project_id, batch))
            load_variants_for_cohort_list(project, batch)

    print(date.strftime(
        datetime.now(),
        "%m/%d/%Y %H:%M:%S  -- finished loading project: " + project_id))
示例#47
0
文件: views.py 项目: rpete/seqr
def combine_mendelian_families_spec(request):
    """Serve combine-mendelian-families results as JSON or as a CSV download.

    Reads ``search_hash``, ``return_type`` and ``group_by_variants`` from
    ``request.GET``:

    * ``return_type`` other than ``'csv'``: gene-level results as JSON.
    * ``return_type == 'csv'`` without ``group_by_variants``: one CSV row
      per gene.
    * ``return_type == 'csv'`` with a truthy ``group_by_variants``: the
      inheritance search is run for every family in the group and one CSV
      row per variant is written, followed by genotype / GQ / DP columns
      for each individual that has variant data.

    Raises:
        PermissionDenied: when ``project.can_view(request.user)`` is falsy.
    """
    project, family_group = utils.get_project_and_family_group_for_user(request.user, request.GET)
    if not project.can_view(request.user):
        raise PermissionDenied

    search_hash = request.GET.get('search_hash')
    search_spec, genes = cache_utils.get_cached_results(project.project_id, search_hash)
    search_spec_obj = MendelianVariantSearchSpec.fromJSON(search_spec)

    wants_csv = request.GET.get('return_type') == 'csv'

    if not wants_csv or not request.GET.get('group_by_variants'):
        # Gene-level results: reuse the cached genes when present.
        if genes is None:
            genes = api_utils.calculate_combine_mendelian_families(family_group, search_spec)
        api_utils.add_extra_info_to_genes(project, get_reference(), genes)

        if not wants_csv:
            return JSONResponse({
                'is_error': False,
                'genes': genes,
                'search_spec': search_spec,
            })

        response = HttpResponse(content_type='text/csv')
        response['Content-Disposition'] = 'attachment; filename="family_group_results_{}.csv"'.format(search_hash)
        writer = csv.writer(response)
        writer.writerow(["gene", "# families", "family list", "chrom", "start", "end"])
        for gene in genes:
            family_ids = [fam_id for (_, fam_id) in gene["family_id_list"]]
            columns = [
                gene["gene_name"],
                len(family_ids),
                " ".join(family_ids),
                gene["chr"],
                gene["start"],
                gene["end"],
                "",
            ]
            writer.writerow([str(column) for column in columns])
        return response

    # Variant-level CSV: one genotype/GQ/DP column triple per individual.
    indiv_id_list = [
        indiv_id
        for family in family_group.get_families()
        for indiv_id in family.indiv_ids_with_variant_data()
    ]

    response = HttpResponse(content_type='text/csv')
    response['Content-Disposition'] = 'attachment; filename="results_{}.csv"'.format(search_hash)
    writer = csv.writer(response)

    headers = ['genes', 'chr', 'pos', 'ref', 'alt', 'worst_annotation']
    headers += project.get_reference_population_slugs()
    headers += ['polyphen', 'sift', 'muttaster', 'fathmm']
    for indiv_id in indiv_id_list:
        headers += [indiv_id, indiv_id + '_gq', indiv_id + '_dp']
    writer.writerow(headers)

    mall = get_mall(project.project_id)
    variants_by_key = {}
    indiv_variants_by_key = defaultdict(dict)
    for family in family_group.get_families():
        family_hits = get_variants_with_inheritance_mode(
            mall,
            family.xfamily(),
            search_spec_obj.inheritance_mode,
            search_spec_obj.variant_filter,
            search_spec_obj.quality_filter,
        )
        for hit in family_hits:
            # Only variants that fall in at least one coding gene are reported.
            if len(hit.coding_gene_ids) == 0:
                continue

            key = (hit.xpos, hit.ref, hit.alt)
            variants_by_key[key] = hit
            for indiv_id in family.indiv_ids_with_variant_data():
                indiv_variants_by_key[key][indiv_id] = hit

    for key in sorted(indiv_variants_by_key.keys()):
        variant = variants_by_key[key]
        variant_for_indiv = indiv_variants_by_key[key]

        gene_symbols = [mall.reference.get_gene_symbol(gene_id) for gene_id in variant.coding_gene_ids]
        row = [
            ','.join(gene_symbols),
            variant.chr,
            str(variant.pos),
            variant.ref,
            variant.alt,
            variant.annotation.get('vep_group', '.'),
        ]
        for slug in project.get_reference_population_slugs():
            row.append(variant.annotation['freqs'][slug])
        for predictor in ['polyphen', 'sift', 'muttaster', 'fathmm']:
            row.append(variant.annotation[predictor])

        for indiv_id in indiv_id_list:
            indiv_variant = variant_for_indiv.get(indiv_id)
            genotype = None
            if indiv_variant is not None:
                genotype = indiv_variant.get_genotype(indiv_id)

            if genotype is None:
                # No call for this individual: placeholder genotype, GQ and DP.
                row.extend(['.', '.', '.'])
                continue

            ref, alt = indiv_variant.ref, indiv_variant.alt
            alleles_by_num_alt = {0: (ref, ref), 1: (ref, alt), 2: (alt, alt)}
            alleles = alleles_by_num_alt.get(genotype.num_alt)
            row.append("%s/%s" % alleles if alleles is not None else "./.")

            row.append('.' if genotype.gq is None else str(genotype.gq))
            depth = genotype.extras.get('dp')
            row.append('.' if depth is None else depth)
        writer.writerow(row)
    return response
示例#48
0
    def handle(self, *args, **options):
        """Search each listed family and write three TSV summaries.

        Positional arguments (taken from ``args``):
            args[0]: project_id of the Project to search.
            args[1]: inheritance mode passed to
                get_variants_with_inheritance_mode.
            args[2]: path of a text file with one family_id per line.

        Files written to the current working directory:
            family_variants.tsv: one row per (family, variant) hit.
            variants.tsv: one row per distinct ``variant.unique_tuple()``
                with the number of families and a 0/1 column per family.
            genes.tsv: one row per gene symbol with the number of families
                and a 0/1 column per family.
        """
        project_id = args[0]
        inheritance_mode = args[1]
        fam_list_file_path = args[2]

        project = Project.objects.get(project_id=project_id)
        families = []
        with open(fam_list_file_path) as fam_list_file:
            for line in fam_list_file:
                family_id = line.strip('\n')
                if not family_id:
                    # A blank line (e.g. a trailing newline) names no family.
                    continue
                families.append(Family.objects.get(project=project, family_id=family_id))

        # create search spec
        variant_filter = next(
            spec for spec in project.get_default_variant_filters()
            if spec['slug'] == 'moderate_impact')['variant_filter']
        quality_filter = {
            'min_gq': 20,
            'min_ab': 25,
        }

        # run MendelianVariantSearch for each family, collect results
        family_results = {}
        for family in families:
            family_results[family] = list(get_variants_with_inheritance_mode(
                get_mall(project_id),
                family.xfamily(),
                inheritance_mode,
                variant_filter=variant_filter,
                quality_filter=quality_filter,
            ))

        family_columns = [fam.family_id for fam in families]

        # create family_variants.tsv
        with open('family_variants.tsv', 'w') as out:
            writer = csv.writer(out, dialect='excel', delimiter='\t')
            writer.writerow([
                '#family_id',
                'gene',
                'chrom',
                'ref',
                'alt',
                'rsid',
                'annotation',
            ])
            for family in families:
                for variant in family_results[family]:
                    writer.writerow([
                        family.family_id,
                        get_gene_symbol(variant),
                        variant.chr,
                        variant.ref,
                        variant.alt,
                        variant.vcf_id,
                        variant.annotation['vep_group'],
                    ])

        # create variants.tsv
        by_variant = {}
        variant_info = {}
        for family in families:
            for variant in family_results[family]:
                variant_t = variant.unique_tuple()
                if variant_t not in by_variant:
                    by_variant[variant_t] = set()
                    variant_info[variant_t] = variant
                by_variant[variant_t].add(family.family_id)

        with open('variants.tsv', 'w') as out:
            writer = csv.writer(out, dialect='excel', delimiter='\t')
            # 'gene' and 'annotation' must be separate columns: a missing
            # comma used to fuse them into 'geneannotation', leaving the
            # header one column short of the data rows.
            headers = [
                '#chrom',
                'ref',
                'alt',
                'rsid',
                'gene',
                'annotation',
                'num_families',
            ]
            headers.extend(family_columns)
            writer.writerow(headers)
            for variant_t in sorted(variant_info.keys()):
                variant = variant_info[variant_t]
                fields = [
                    variant.chr,
                    variant.ref,
                    variant.alt,
                    variant.vcf_id,
                    get_gene_symbol(variant),
                    variant.annotation['vep_group'],
                    str(len(by_variant[variant_t])),
                ]
                for family in families:
                    fields.append('1' if family.family_id in by_variant[variant_t] else '0')
                writer.writerow(fields)

        # create genes.tsv
        by_gene = {}
        for family in families:
            for variant in family_results[family]:
                by_gene.setdefault(get_gene_symbol(variant), set()).add(family.family_id)

        with open('genes.tsv', 'w') as out:
            writer = csv.writer(out, dialect='excel', delimiter='\t')
            headers = [
                '#gene',
                'num_families',
            ]
            headers.extend(family_columns)
            writer.writerow(headers)
            for gene_symbol in sorted(by_gene.keys()):
                fields = [
                    gene_symbol,
                    str(len(by_gene[gene_symbol])),
                ]
                for family in families:
                    fields.append('1' if family.family_id in by_gene[gene_symbol] else '0')
                writer.writerow(fields)
示例#49
0
    def handle(self, *args, **options):
        """Search each listed family and write three TSV summaries.

        Positional arguments (taken from ``args``):
            args[0]: project_id of the Project to search.
            args[1]: inheritance mode passed to
                get_variants_with_inheritance_mode.
            args[2]: path of a text file with one family_id per line.

        Files written to the current working directory:
            family_variants.tsv: one row per (family, variant) hit.
            variants.tsv: one row per distinct ``variant.unique_tuple()``
                with the number of families and a 0/1 column per family.
            genes.tsv: one row per gene symbol with the number of families
                and a 0/1 column per family.
        """
        project_id = args[0]
        inheritance_mode = args[1]
        fam_list_file_path = args[2]

        project = Project.objects.get(project_id=project_id)
        families = []
        with open(fam_list_file_path) as fam_list_file:
            for line in fam_list_file:
                family_id = line.strip('\n')
                if not family_id:
                    # A blank line (e.g. a trailing newline) names no family.
                    continue
                families.append(
                    Family.objects.get(project=project, family_id=family_id))

        # create search spec
        variant_filter = next(
            spec for spec in project.get_default_variant_filters()
            if spec['slug'] == 'moderate_impact')['variant_filter']
        quality_filter = {
            'min_gq': 30,
            'min_ab': 25,
        }

        # run MendelianVariantSearch for each family, collect results
        family_results = {}
        for family in families:
            family_results[family] = list(
                get_variants_with_inheritance_mode(
                    get_mall(project_id),
                    family.xfamily(),
                    inheritance_mode,
                    variant_filter=variant_filter,
                    quality_filter=quality_filter,
                ))

        family_columns = [fam.family_id for fam in families]

        # create family_variants.tsv
        with open('family_variants.tsv', 'w') as out:
            writer = csv.writer(out, dialect='excel', delimiter='\t')
            writer.writerow([
                '#family_id',
                'gene',
                'chrom',
                'ref',
                'alt',
                'rsid',
                'annotation',
            ])
            for family in families:
                for variant in family_results[family]:
                    writer.writerow([
                        family.family_id,
                        get_gene_symbol(variant),
                        variant.chr,
                        variant.ref,
                        variant.alt,
                        variant.vcf_id,
                        variant.annotation['vep_group'],
                    ])

        # create variants.tsv
        by_variant = {}
        variant_info = {}
        for family in families:
            for variant in family_results[family]:
                variant_t = variant.unique_tuple()
                if variant_t not in by_variant:
                    by_variant[variant_t] = set()
                    variant_info[variant_t] = variant
                by_variant[variant_t].add(family.family_id)

        with open('variants.tsv', 'w') as out:
            writer = csv.writer(out, dialect='excel', delimiter='\t')
            # 'gene' and 'annotation' must be separate columns: a missing
            # comma used to fuse them into 'geneannotation', leaving the
            # header one column short of the data rows.
            headers = [
                '#chrom',
                'ref',
                'alt',
                'rsid',
                'gene',
                'annotation',
                'num_families',
            ]
            headers.extend(family_columns)
            writer.writerow(headers)
            for variant_t in sorted(variant_info.keys()):
                variant = variant_info[variant_t]
                fields = [
                    variant.chr,
                    variant.ref,
                    variant.alt,
                    variant.vcf_id,
                    get_gene_symbol(variant),
                    variant.annotation['vep_group'],
                    str(len(by_variant[variant_t])),
                ]
                for family in families:
                    fields.append('1' if family.family_id in
                                  by_variant[variant_t] else '0')
                writer.writerow(fields)

        # create genes.tsv
        by_gene = {}
        for family in families:
            for variant in family_results[family]:
                by_gene.setdefault(get_gene_symbol(variant),
                                   set()).add(family.family_id)

        with open('genes.tsv', 'w') as out:
            writer = csv.writer(out, dialect='excel', delimiter='\t')
            headers = [
                '#gene',
                'num_families',
            ]
            headers.extend(family_columns)
            writer.writerow(headers)
            for gene_symbol in sorted(by_gene.keys()):
                fields = [
                    gene_symbol,
                    str(len(by_gene[gene_symbol])),
                ]
                for family in families:
                    fields.append('1' if family.family_id in
                                  by_gene[gene_symbol] else '0')
                writer.writerow(fields)
示例#50
0
    def handle(self, *args, **options):
        """Attach CNV rows from a tab-separated file to stored variants.

        Options read: ``project_id``, ``cnv_filename`` and
        ``bed_files_directory``.  The first line of the CNV file is used as
        the column header; every following line must provide ``chr``,
        ``start``, ``end`` and ``sample``.  For each row, every variant whose
        ``xpos`` lies within [start, end] in the project collection and in
        the individual's family collection gets the whole row stored under
        ``genotypes.<indiv_id>.extras.cnvs``.

        Raises:
            ValueError: if ``cnv_filename`` is not an existing file.
        """
        project_id = options['project_id']
        print("Loading data into project: " + project_id)
        project = Project.objects.get(project_id=project_id)

        cnv_filename = options['cnv_filename']
        bed_files_directory = options['bed_files_directory']

        if not os.path.isfile(cnv_filename):
            raise ValueError("CNV file %s doesn't exist" % cnv_filename)

        with open(cnv_filename) as cnv_file:
            column_names = cnv_file.readline().rstrip('\n').split('\t')
            for raw_line in cnv_file:
                cnv_row = dict(zip(column_names, raw_line.rstrip('\n').split('\t')))

                chrom = "chr" + cnv_row['chr']
                start = int(cnv_row['start'])
                end = int(cnv_row['end'])
                sample_id = cnv_row['sample']

                # Any lookup failure (no match, several matches, ...) skips the row.
                try:
                    indiv = Individual.objects.get(
                        project=project, indiv_id__istartswith=sample_id)
                except Exception as e:
                    print("WARNING: %s: %s not found in %s" %
                          (e, sample_id, project))
                    continue

                bed_file_path = os.path.join(bed_files_directory,
                                             "%s.bed" % sample_id)
                if not os.path.isfile(bed_file_path):
                    print("WARNING: .bed file not found: " + bed_file_path)

                    # NOTE(review): the path is only recorded when the .bed
                    # file is missing; this may have been meant to run
                    # regardless of the check above - confirm intent.
                    if i_path_differs(indiv, bed_file_path) if False else indiv.cnv_bed_file != bed_file_path:
                        print("Setting cnv_bed_file path to %s" %
                              bed_file_path)
                        indiv.cnv_bed_file = bed_file_path
                        indiv.save()

                project_collection = get_project_datastore(
                    project)._get_project_collection(project_id)
                family_collection = get_mall(
                    project).variant_store._get_family_collection(
                        project_id, indiv.family.family_id)

                # Falsy collections are skipped.
                for collection in filter(
                        None, [project_collection, family_collection]):
                    xpos_lower = genomeloc.get_single_location(chrom, start)
                    xpos_upper = genomeloc.get_single_location(chrom, end)
                    within_cnv = {
                        '$and': [
                            {'xpos': {'$gte': xpos_lower}},
                            {'xpos': {'$lte': xpos_upper}},
                        ]
                    }
                    cnv_field = 'genotypes.%s.extras.cnvs' % indiv.indiv_id
                    collection.update_many(within_cnv,
                                           {'$set': {cnv_field: cnv_row}})

        print("Done")
示例#51
0
    def handle_individual(self, project, individual):
        """Write the flagged-variant and per-gene report files for one individual.

        ``report_for_<project_id>_<indiv_id>.flagged.txt`` is written only
        when the individual's family has at least one variant tagged
        "REPORT" or carrying a note that starts with "REPORT"; it has one
        row per such variant, with all of the family's notes for that
        variant joined into the comments column.

        ``report_for_<project_id>_<indiv_id>.genes.txt`` is always written
        and has one row per variant found in each range listed in
        ``gene_loc``.

        Rows for which ``self.get_output_row`` returns None are skipped.

        Raises:
            ValueError: if a flagged variant is missing from the variant store.
        """
        project_id = project.project_id
        individual_id = individual.indiv_id

        print("Processing individual %s" % individual_id)
        family = individual.family
        family_id = family.family_id

        # (xpos, ref, alt) -> concatenated note text, for variants that have
        # been tagged or that have a note that starts with "REPORT".
        # A plain dict is used so that lookups never insert new keys.
        variants_in_report_and_notes = {}
        for vt in VariantTag.objects.filter(project_tag__project=project,
                                            project_tag__tag="REPORT",
                                            family=family):
            variants_in_report_and_notes[(vt.xpos, vt.ref, vt.alt)] = ""

        for vn in VariantNote.objects.filter(project=project, family=family):
            if vn.note and vn.note.strip().startswith("REPORT"):
                variants_in_report_and_notes[(vn.xpos, vn.ref, vn.alt)] = ""

        header = [
            "gene_name", "genotype", "variant", "functional_class", "hgvs_c",
            "hgvs_p", "rsid", "exac_global_af", "exac_pop_max_af",
            "exac_pop_max_population", "clinvar_clinsig", "clinvar_clnrevstat",
            "number_of_stars", "clinvar_url", "comments"
        ]

        if len(variants_in_report_and_notes) != 0:
            flagged_path = "report_for_%s_%s.flagged.txt" % (project_id,
                                                             individual_id)
            with open(flagged_path, "w") as out:
                out.write("\t".join(header) + "\n")

                # retrieve text of all notes that were left for any of these variants
                for vn in VariantNote.objects.filter(project=project,
                                                     family=family):
                    key = (vn.xpos, vn.ref, vn.alt)
                    if vn.note and key in variants_in_report_and_notes:
                        other_notes = variants_in_report_and_notes[key]
                        if len(other_notes) > 0:
                            other_notes += "||"
                        variants_in_report_and_notes[key] = (
                            other_notes + "%s|%s|%s" %
                            (vn.date_saved, vn.user.email, vn.note.strip()))

                for (xpos, ref,
                     alt), notes in variants_in_report_and_notes.items():
                    v = get_mall(project_id).variant_store.get_single_variant(
                        project_id, family_id, xpos, ref, alt)
                    if v is None:
                        raise ValueError(
                            "Couldn't find variant in variant store for: %s, %s, %s %s %s"
                            % (project_id, family_id, xpos, ref, alt))

                    row = self.get_output_row(v,
                                              xpos,
                                              ref,
                                              alt,
                                              individual_id,
                                              family,
                                              all_fields=True,
                                              comments=notes)
                    if row is None:
                        continue
                    out.write("\t".join(row) + "\n")

        genes_path = "report_for_%s_%s.genes.txt" % (project_id, individual_id)
        with open(genes_path, "w") as out:
            gene_header = (["gene_chrom", "gene_start", "gene_end"] + header +
                           ["json_dump"])
            out.write("\t".join(gene_header) + "\n")
            for gene_id, (chrom, start, end) in gene_loc.items():
                xpos_start = genomeloc.get_single_location(
                    "chr" + chrom, start)
                xpos_end = genomeloc.get_single_location("chr" + chrom, end)
                for v in get_mall(
                        project_id).variant_store.get_variants_in_range(
                            project_id, family_id, xpos_start, xpos_end):

                    json_dump = str(v.genotypes)
                    # .get() instead of indexing: the old defaultdict lookup
                    # could never raise KeyError and silently added an empty
                    # entry for every variant scanned here.
                    notes = variants_in_report_and_notes.get(
                        (v.xpos, v.ref, v.alt), "")
                    row = self.get_output_row(v,
                                              v.xpos,
                                              v.ref,
                                              v.alt,
                                              individual_id,
                                              family,
                                              comments=notes,
                                              gene_id=gene_id)
                    if row is None:
                        continue
                    row = [
                        str(value)
                        for value in [chrom, start, end] + row + [json_dump]
                    ]
                    out.write("\t".join(row) + "\n")
示例#52
0
def calculate_mendelian_variant_search(search_spec, family, user=None):
    """Run the variant search described by ``search_spec`` for one family.

    The search function is chosen by ``search_spec.search_mode``, which must
    be one of ``'standard_inheritance'``, ``'custom_inheritance'``,
    ``'gene_burden'``, ``'allele_count'`` or ``'all_variants'``.  Every
    returned variant is tagged with the family's id through
    ``variant.set_extra('family_id', ...)``.

    Args:
        search_spec: search specification; ``variant_filter`` and
            ``quality_filter`` are always forwarded, plus the mode-specific
            filter attribute.
        family: object providing ``xfamily()``, ``project`` and
            ``family_id``.
        user: forwarded unchanged to the underlying search function.

    Returns:
        list of matching variants.

    Raises:
        ValueError: if ``search_spec.search_mode`` is not a supported mode.
    """
    xfamily = family.xfamily()
    project = family.project
    search_mode = search_spec.search_mode

    if search_mode == 'standard_inheritance':
        variants = list(
            get_variants_with_inheritance_mode(
                get_mall(project),
                xfamily,
                search_spec.inheritance_mode,
                variant_filter=search_spec.variant_filter,
                quality_filter=search_spec.quality_filter,
                user=user,
            ))

    elif search_mode == 'custom_inheritance':
        variants = list(
            get_variants_family(
                get_datastore(project),
                xfamily,
                genotype_filter=search_spec.genotype_inheritance_filter,
                variant_filter=search_spec.variant_filter,
                quality_filter=search_spec.quality_filter,
                user=user,
            ))

    elif search_mode == 'gene_burden':
        gene_stream = get_genes_family(
            get_datastore(project),
            get_reference(),
            xfamily,
            burden_filter=search_spec.gene_burden_filter,
            variant_filter=search_spec.variant_filter,
            quality_filter=search_spec.quality_filter,
            user=user,
        )

        variants = list(
            stream_utils.gene_stream_to_variant_stream(gene_stream,
                                                       get_reference()))

    elif search_mode == 'allele_count':
        variants = list(
            get_variants_allele_count(
                get_datastore(project),
                xfamily,
                search_spec.allele_count_filter,
                variant_filter=search_spec.variant_filter,
                quality_filter=search_spec.quality_filter,
                user=user,
            ))

    elif search_mode == 'all_variants':
        variants = list(
            get_variants_family(
                get_datastore(project),
                xfamily,
                variant_filter=search_spec.variant_filter,
                quality_filter=search_spec.quality_filter,
                indivs_to_consider=xfamily.indiv_id_list(),
                user=user,
            ))

    else:
        # An unrecognised mode used to leave ``variants`` as None and fail
        # below with "'NoneType' object is not iterable"; name the mode.
        raise ValueError("Unknown search mode: %s" % search_mode)

    for variant in variants:
        variant.set_extra('family_id', family.family_id)

    return variants
示例#53
0
def calculate_mendelian_variant_search(search_spec, xfamily):
    """Run the variant search described by ``search_spec`` for ``xfamily``.

    A summary of the search parameters is written to stderr first.  The
    search function is then chosen by ``search_spec.search_mode``
    (``'standard_inheritance'``, ``'custom_inheritance'``, ``'gene_burden'``,
    ``'allele_count'`` or ``'all_variants'``).

    Returns:
        list of matching variants, or None when the search mode is not one
        of the modes listed above.
    """
    project_id = xfamily.project_id
    mode = search_spec.search_mode
    variant_filter = search_spec.variant_filter
    variant_filter_json = variant_filter.toJSON() if variant_filter else ''
    inheritance_mode = search_spec.inheritance_mode
    allele_count_filter = search_spec.allele_count_filter
    quality_filter = search_spec.quality_filter
    genotype_inheritance_filter = search_spec.genotype_inheritance_filter

    log_template = (
        "mendelian_variant_search for %s - search mode: %s \n"
        "variant_filter: %s \ninheritance_mode: %s \nallele_count_filter: %s \nquality_filter: %s \ngenotype_inheritance_filter: %s \n"
    )
    sys.stderr.write(log_template % (
        project_id,
        mode,
        variant_filter_json,
        inheritance_mode,
        allele_count_filter,
        quality_filter,
        genotype_inheritance_filter,
    ))

    if mode == 'standard_inheritance':
        return list(
            get_variants_with_inheritance_mode(
                get_mall(project_id),
                xfamily,
                inheritance_mode,
                variant_filter=variant_filter,
                quality_filter=quality_filter,
            ))

    if mode == 'custom_inheritance':
        return list(
            get_variants_family(
                get_datastore(project_id),
                xfamily,
                genotype_filter=genotype_inheritance_filter,
                variant_filter=variant_filter,
                quality_filter=quality_filter,
            ))

    if mode == 'gene_burden':
        gene_stream = get_genes_family(
            get_datastore(project_id),
            get_reference(),
            xfamily,
            burden_filter=search_spec.gene_burden_filter,
            variant_filter=variant_filter,
            quality_filter=quality_filter,
        )
        return list(
            stream_utils.gene_stream_to_variant_stream(gene_stream,
                                                       get_reference()))

    if mode == 'allele_count':
        return list(
            get_variants_allele_count(
                get_datastore(project_id),
                xfamily,
                allele_count_filter,
                variant_filter=variant_filter,
                quality_filter=quality_filter,
            ))

    if mode == 'all_variants':
        return list(
            get_variants_family(
                get_datastore(project_id),
                xfamily,
                variant_filter=variant_filter,
                quality_filter=quality_filter,
                indivs_to_consider=xfamily.indiv_id_list(),
            ))

    # Unrecognised search mode: no search is run.
    return None
示例#54
0
    def handle(self, *args, **options):
        """Spot-check that VCF variants have annotations in the annotator.

        ``args`` are the project ids to check; when none are given, every
        Project is checked, in reverse order of ``Project.objects.all()``.
        ``options['number_of_variants_to_check']`` (default 20000) is the
        number of variants to read before reporting.

        For each project, variants are read from its VCF files and looked up
        with ``annotator.get_annotation``; a ValueError from that call counts
        the variant as not found.  Counters are kept per project and are not
        reset between VCF files.  A file's scan stops once the running total
        reaches the threshold, and at that point an error plus up to 30
        example variant links are printed if more than 10 variants were not
        found.  Nothing is printed for a file that ends before the threshold
        is reached.
        """
        number_of_variants_to_check = int(
            options.get("number_of_variants_to_check") or 20000)

        if not args:
            args = [p.project_id for p in Project.objects.all()]
            args.reverse()

        for project_id in args:
            try:
                project = Project.objects.get(project_id=project_id)
            except Project.DoesNotExist:
                # Only a missing project is skipped; other failures (and
                # Ctrl-C) are no longer swallowed by a bare ``except``.
                print("ERROR: Project not found. Skipping..")
                continue

            all_counter = 0
            not_found_counter = 0
            not_found_variants = []
            for vcf_file in project.get_all_vcf_files():
                path = vcf_file.file_path
                # Fall back to the gzipped file when the plain .vcf is absent.
                if not os.path.isfile(path) and path.endswith(".vcf"):
                    path = path + ".gz"
                open_vcf = gzip.open if path.endswith(".gz") else open

                # ``with`` closes the handle even when the scan stops early.
                with open_vcf(path) as f:
                    for variant in vcf_stuff.iterate_vcf(f):
                        all_counter += 1
                        try:
                            get_mall(project).annotator.get_annotation(
                                variant.xpos, variant.ref, variant.alt)
                        except ValueError:
                            not_found_counter += 1
                            if len(not_found_variants) < 30:
                                chrom, pos = genomeloc.get_chr_pos(
                                    variant.xpos)
                                not_found_variants.append(
                                    "%(chrom)s-%(pos)s-%(ref)s-%(alt)s" % {
                                        'chrom': chrom.replace("chr", ""),
                                        'pos': pos,
                                        'ref': variant.ref,
                                        'alt': variant.alt,
                                    })

                        if all_counter >= number_of_variants_to_check:
                            if not_found_counter > 10:
                                # The message prints a '%' sign, so supply a
                                # percentage rather than a 0-1 fraction.
                                print(
                                    "---- ERROR: (%(fraction_missing)0.2f%%)  %(not_found_counter)s / %(all_counter)s variants not found. Project %(project_id)s should be reloaded. Examples: "
                                    % {
                                        'fraction_missing':
                                        100.0 * not_found_counter / all_counter,
                                        'not_found_counter': not_found_counter,
                                        'all_counter': all_counter,
                                        'project_id': project_id,
                                    })

                                for v in not_found_variants:
                                    print(
                                        "http://exac.broadinstitute.org/variant/"
                                        + v)
                            break
示例#55
0
文件: views.py 项目: frichter/seqr
def combine_mendelian_families_spec(request):
    """Return combine-mendelian-families results as JSON or as a CSV download.

    The project and family group are resolved from ``request.GET``; the search
    spec and gene list are read from the results cache under the
    ``search_hash`` query parameter.  Two more query parameters choose the
    output:

    * ``return_type`` other than ``csv``: JSON response with the genes and
      the search spec.
    * ``return_type=csv`` without ``group_by_variants``: CSV attachment with
      one row per gene.
    * ``return_type=csv`` with ``group_by_variants``: CSV attachment with one
      row per variant, followed by genotype / GQ / DP columns for every
      individual that has variant data.

    Raises:
        PermissionDenied: if the requesting user may not view the project.
    """
    project, family_group = utils.get_project_and_family_group_for_user(request.user, request.GET)
    if not project.can_view(request.user):
        raise PermissionDenied

    search_hash = request.GET.get('search_hash')
    search_spec, genes = cache_utils.get_cached_results(project.project_id, search_hash)
    search_spec_obj = MendelianVariantSearchSpec.fromJSON(search_spec)

    wants_csv = request.GET.get('return_type') == 'csv'

    if not wants_csv or not request.GET.get('group_by_variants'):
        # gene-level output: recompute on a cache miss, then annotate
        if genes is None:
            genes = api_utils.calculate_combine_mendelian_families(family_group, search_spec)
        api_utils.add_extra_info_to_genes(project, get_reference(), genes)

        if not wants_csv:
            return JSONResponse({
                'is_error': False,
                'genes': genes,
                'search_spec': search_spec,
            })

        response = HttpResponse(content_type='text/csv')
        response['Content-Disposition'] = 'attachment; filename="family_group_results_{}.csv"'.format(search_hash)
        writer = csv.writer(response)
        writer.writerow(["gene", "# families", "family list", "chrom", "start", "end"])
        for gene in genes:
            # entries of gene["family_id_list"] are (project_id, family_id) pairs
            family_ids = [family_id for (_project_id, family_id) in gene["family_id_list"]]
            cells = [
                gene["gene_name"],
                len(family_ids),
                " ".join(family_ids),
                gene["chr"],
                gene["start"],
                gene["end"],
                "",
            ]
            writer.writerow([str(cell) for cell in cells])
        return response

    # download results grouped by variant
    indiv_id_list = [
        indiv_id
        for family in family_group.get_families()
        for indiv_id in family.indiv_ids_with_variant_data()
    ]

    response = HttpResponse(content_type='text/csv')
    response['Content-Disposition'] = 'attachment; filename="results_{}.csv"'.format(search_hash)
    writer = csv.writer(response)

    predictor_keys = ['polyphen', 'sift', 'muttaster', 'fathmm']
    headers = ['genes', 'chr', 'pos', 'ref', 'alt', 'worst_annotation']
    headers += project.get_reference_population_slugs()
    headers += predictor_keys
    for indiv_id in indiv_id_list:
        headers += [indiv_id, indiv_id + '_gq', indiv_id + '_dp']
    writer.writerow(headers)

    mall = get_mall(project.project_id)
    # (xpos, ref, alt) -> variant object used for the shared columns of a row
    variants_by_key = {}
    # (xpos, ref, alt) -> {indiv_id: variant object carrying that individual's genotype}
    carriers_by_key = defaultdict(dict)
    for family in family_group.get_families():
        family_variants = get_variants_with_inheritance_mode(
            mall,
            family.xfamily(),
            search_spec_obj.inheritance_mode,
            search_spec_obj.variant_filter,
            search_spec_obj.quality_filter,
        )
        for variant in family_variants:
            # variants without any coding gene are left out of the download
            if len(variant.coding_gene_ids) == 0:
                continue

            key = (variant.xpos, variant.ref, variant.alt)
            variants_by_key[key] = variant
            for indiv_id in family.indiv_ids_with_variant_data():
                carriers_by_key[key][indiv_id] = variant

    for key in sorted(carriers_by_key):
        variant = variants_by_key[key]
        variant_by_indiv = carriers_by_key[key]

        gene_symbols = [mall.reference.get_gene_symbol(gene_id) for gene_id in variant.coding_gene_ids]
        fields = [
            ','.join(gene_symbols),
            variant.chr,
            str(variant.pos),
            variant.ref,
            variant.alt,
            variant.annotation.get('vep_group', '.'),
        ]
        fields.extend(
            variant.annotation['freqs'][slug]
            for slug in project.get_reference_population_slugs()
        )
        fields.extend(variant.annotation[predictor] for predictor in predictor_keys)

        for indiv_id in indiv_id_list:
            indiv_variant = variant_by_indiv.get(indiv_id)
            genotype = indiv_variant.get_genotype(indiv_id) if indiv_variant is not None else None

            if genotype is None:
                # no call for this individual: genotype, GQ and DP all missing
                fields += ['.', '.', '.']
                continue

            if genotype.num_alt == 0:
                call = "%s/%s" % (indiv_variant.ref, indiv_variant.ref)
            elif genotype.num_alt == 1:
                call = "%s/%s" % (indiv_variant.ref, indiv_variant.alt)
            elif genotype.num_alt == 2:
                call = "%s/%s" % (indiv_variant.alt, indiv_variant.alt)
            else:
                call = "./."
            fields.append(call)

            fields.append(str(genotype.gq) if genotype.gq is not None else '.')
            depth = genotype.extras.get('dp')
            fields.append(depth if depth is not None else '.')
        writer.writerow(fields)
    return response
示例#56
0
    def handle_individual(self, project, individual):
        """Write the flagged-variant and per-gene report files for one individual.

        Two tab-separated files are written, using paths relative to the
        current working directory:

        * ``report_for_<project_id>_<individual_id>.flagged.txt`` -- one row per
          variant in the individual's family that carries the "REPORT" tag or
          has a note starting with "REPORT".  The file is only created when at
          least one such variant exists.
        * ``report_for_<project_id>_<individual_id>.genes.txt`` -- one row per
          variant found inside each interval of ``gene_loc`` (a mapping of
          gene_id -> (chrom, start, end) defined outside this method).

        Rows are produced by ``self.get_output_row``; a ``None`` row is skipped.

        Args:
            project: project whose tags, notes and variant store are queried.
            individual: individual to report on; its family scopes all lookups.

        Raises:
            ValueError: if a flagged variant cannot be found in the variant store.
        """
        project_id = project.project_id
        individual_id = individual.indiv_id
        family = individual.family

        print("Processing individual %s" % individual_id)

        # (xpos, ref, alt) -> accumulated note text for every variant that has
        # been tagged "REPORT" or has a note that starts with "REPORT".
        # A plain dict (not a defaultdict) so that later lookups of unflagged
        # variants cannot insert new keys by accident.
        notes_by_variant = {}
        for vt in VariantTag.objects.filter(project_tag__project=project,
                                            project_tag__tag="REPORT",
                                            family=family):
            notes_by_variant[(vt.xpos, vt.ref, vt.alt)] = ""

        for vn in VariantNote.objects.filter(project=project, family=family):
            if vn.note and vn.note.strip().startswith("REPORT"):
                notes_by_variant[(vn.xpos, vn.ref, vn.alt)] = ""

        header = ["gene_name", "genotype", "variant", "functional_class",
                  "hgvs_c", "hgvs_p", "rsid",
                  "exac_global_af", "exac_pop_max_af", "exac_pop_max_population",
                  "clinvar_clinsig", "clinvar_clnrevstat", "number_of_stars",
                  "clinvar_url", "comments"]

        if notes_by_variant:
            with open("report_for_%s_%s.flagged.txt" % (project_id, individual_id), "w") as out:
                out.write("\t".join(header) + "\n")

                # collect the text of all notes left on any flagged variant;
                # multiple notes on one variant are joined with "||"
                for vn in VariantNote.objects.filter(project=project, family=family):
                    key = (vn.xpos, vn.ref, vn.alt)
                    if vn.note and key in notes_by_variant:
                        entry = "%s|%s|%s" % (vn.date_saved, vn.user.email, vn.note.strip())
                        previous = notes_by_variant[key]
                        notes_by_variant[key] = (previous + "||" + entry) if previous else entry

                # resolve the store once instead of once per flagged variant
                variant_store = get_mall(project_id).variant_store
                for (xpos, ref, alt), notes in notes_by_variant.items():
                    v = variant_store.get_single_variant(project_id, family.family_id, xpos, ref, alt)
                    if v is None:
                        raise ValueError("Couldn't find variant in variant store for: %s, %s, %s %s %s" % (project_id, family.family_id, xpos, ref, alt))

                    row = self.get_output_row(v, xpos, ref, alt, individual_id, family, all_fields=True, comments=notes)
                    if row is None:
                        continue
                    out.write("\t".join(row) + "\n")

        gene_header = ["gene_chrom", "gene_start", "gene_end"] + header + ["json_dump"]
        with open("report_for_%s_%s.genes.txt" % (project_id, individual_id), "w") as out:
            out.write("\t".join(gene_header) + "\n")

            # resolve the store once instead of once per gene
            variant_store = get_mall(project_id).variant_store
            for gene_id, (chrom, start, end) in gene_loc.items():
                xpos_start = genomeloc.get_single_location("chr" + chrom, start)
                xpos_end = genomeloc.get_single_location("chr" + chrom, end)
                for v in variant_store.get_variants_in_range(project_id, family.family_id, xpos_start, xpos_end):
                    # the "json_dump" column holds str() of the genotypes object
                    json_dump = str(v.genotypes)
                    # most variants in a gene are not flagged: fall back to
                    # empty comments without touching the mapping
                    notes = notes_by_variant.get((v.xpos, v.ref, v.alt), "")
                    row = self.get_output_row(v, v.xpos, v.ref, v.alt, individual_id, family, comments=notes, gene_id=gene_id)
                    if row is None:
                        continue
                    cells = [chrom, start, end] + row + [json_dump]
                    out.write("\t".join([str(cell) for cell in cells]) + "\n")
示例#57
0
def calculate_mendelian_variant_search(search_spec, family, user=None):
    """Run the search described by ``search_spec`` against a single family.

    ``search_spec.search_mode`` selects the query that is executed:

    * ``standard_inheritance`` -- ``get_variants_with_inheritance_mode`` with
      ``search_spec.inheritance_mode``
    * ``custom_inheritance`` -- ``get_variants_family`` with
      ``search_spec.genotype_inheritance_filter``
    * ``gene_burden`` -- ``get_genes_family`` with
      ``search_spec.gene_burden_filter``, flattened to a variant stream
    * ``allele_count`` -- ``get_variants_allele_count`` with
      ``search_spec.allele_count_filter``
    * ``all_variants`` -- ``get_variants_family`` over all individuals in the
      family

    Every mode also applies ``search_spec.variant_filter`` and
    ``search_spec.quality_filter``.  Each returned variant has its
    ``family_id`` extra set to ``family.family_id``.

    Args:
        search_spec: search specification; only the attributes needed by the
            selected mode are read.
        family: family to search; supplies ``xfamily()``, ``project`` and
            ``family_id``.
        user: forwarded unchanged to the underlying query function.

    Returns:
        list of matching variants.

    Raises:
        ValueError: if ``search_spec.search_mode`` is not one of the modes
            listed above.
    """
    xfamily = family.xfamily()
    project = family.project
    search_mode = search_spec.search_mode

    if search_mode == 'standard_inheritance':
        variants = list(get_variants_with_inheritance_mode(
            get_mall(project),
            xfamily,
            search_spec.inheritance_mode,
            variant_filter=search_spec.variant_filter,
            quality_filter=search_spec.quality_filter,
            user=user,
        ))

    elif search_mode == 'custom_inheritance':
        variants = list(get_variants_family(
            get_datastore(project),
            xfamily,
            genotype_filter=search_spec.genotype_inheritance_filter,
            variant_filter=search_spec.variant_filter,
            quality_filter=search_spec.quality_filter,
            user=user,
        ))

    elif search_mode == 'gene_burden':
        gene_stream = get_genes_family(
            get_datastore(project),
            get_reference(),
            xfamily,
            burden_filter=search_spec.gene_burden_filter,
            variant_filter=search_spec.variant_filter,
            quality_filter=search_spec.quality_filter,
            user=user,
        )

        variants = list(stream_utils.gene_stream_to_variant_stream(gene_stream, get_reference()))

    elif search_mode == 'allele_count':
        variants = list(get_variants_allele_count(
            get_datastore(project),
            xfamily,
            search_spec.allele_count_filter,
            variant_filter=search_spec.variant_filter,
            quality_filter=search_spec.quality_filter,
            user=user,
        ))

    elif search_mode == 'all_variants':
        variants = list(get_variants_family(
            get_datastore(project),
            xfamily,
            variant_filter=search_spec.variant_filter,
            quality_filter=search_spec.quality_filter,
            indivs_to_consider=xfamily.indiv_id_list(),
            user=user,
        ))

    else:
        # Fail with a message that names the bad mode instead of letting
        # ``variants = None`` blow up in the loop below.
        raise ValueError("Unknown search_mode: %s" % (search_mode,))

    for variant in variants:
        variant.set_extra('family_id', family.family_id)

    return variants
示例#58
0
def calculate_mendelian_variant_search(search_spec, xfamily):
    """Run the search described by ``search_spec`` against ``xfamily``.

    A line naming ``search_spec.inheritance_mode`` is written to stderr first.
    ``search_spec.search_mode`` then picks the query; every query applies
    ``search_spec.variant_filter`` and ``search_spec.genotype_quality_filter``.

    Returns:
        list of variants for a recognised search mode, ``None`` otherwise.
    """
    sys.stderr.write("     cohort_variant_search - inheritance_mode: %s" %
                     search_spec.inheritance_mode)

    mode = search_spec.search_mode

    if mode == 'standard_inheritance':
        stream = get_variants_with_inheritance_mode(
            get_mall(xfamily.project_id),
            xfamily,
            search_spec.inheritance_mode,
            variant_filter=search_spec.variant_filter,
            quality_filter=search_spec.genotype_quality_filter,
        )
        return list(stream)

    if mode == 'custom_inheritance':
        stream = get_variants_family(
            get_datastore(xfamily.project_id),
            xfamily,
            genotype_filter=search_spec.genotype_inheritance_filter,
            variant_filter=search_spec.variant_filter,
            quality_filter=search_spec.genotype_quality_filter,
        )
        return list(stream)

    if mode == 'gene_burden':
        # genes passing the burden filter are flattened back into variants
        gene_stream = get_genes_family(
            get_datastore(xfamily.project_id),
            get_reference(),
            xfamily,
            burden_filter=search_spec.gene_burden_filter,
            variant_filter=search_spec.variant_filter,
            quality_filter=search_spec.genotype_quality_filter,
        )
        stream = stream_utils.gene_stream_to_variant_stream(gene_stream,
                                                            get_reference())
        return list(stream)

    if mode == 'allele_count':
        stream = get_variants_allele_count(
            get_datastore(xfamily.project_id),
            xfamily,
            search_spec.allele_count_filter,
            variant_filter=search_spec.variant_filter,
            quality_filter=search_spec.genotype_quality_filter,
        )
        return list(stream)

    if mode == 'all_variants':
        stream = get_variants_family(
            get_datastore(xfamily.project_id),
            xfamily,
            variant_filter=search_spec.variant_filter,
            quality_filter=search_spec.genotype_quality_filter,
        )
        return list(stream)

    # unrecognised search mode: nothing was searched
    return None
示例#59
0
def write_snp_fileset(family, output_dir_path):
    """
    Write a set of files for a family that can be passed to linkage engine
    Creates the following files in output_dir_path:
        [family_id].fam
        markers.txt
        disease_model.txt
        variants.txt

    Args:
        family: family to export; supplies ``family_id``, ``project`` and
            ``get_individuals()``.
        output_dir_path: existing directory the files are written into.
    """

    individuals = family.get_individuals()

    # fam file: family, individual, father, mother, sex, affected status.
    # Sex is written as '1' for 'M', '2' for 'F' and '0' for anything else;
    # affected status as '1' for 'N', '2' for 'A' and '0' for anything else.
    sex_codes = {'M': '1', 'F': '2'}
    affected_codes = {'N': '1', 'A': '2'}
    fam_file_path = os.path.join(output_dir_path, family.family_id + '.fam')
    with open(fam_file_path, 'w') as f:
        for indiv in individuals:
            fields = [
                family.family_id,
                indiv.indiv_id,
                indiv.paternal_id if indiv.paternal_id else '.',
                indiv.maternal_id if indiv.maternal_id else '.',
                sex_codes.get(indiv.gender, '0'),
                affected_codes.get(indiv.affected, '0'),
            ]
            f.write('\t'.join(fields) + '\n')

    # markers.txt: verbatim copy of the common SNP list
    markers_path = os.path.join(output_dir_path, 'markers.txt')
    shutil.copy(settings.COMMON_SNP_FILE, markers_path)

    # disease model
    disease_model_path = os.path.join(output_dir_path, 'disease_model.txt')
    with open(disease_model_path, 'w') as f:
        f.writelines([
            "DD\t.001\n",
            "Dd\t.001\n",
            "dd\t.999\n",
        ])

    # variants.txt: one row per common SNP, one alt-allele count per individual
    variants_file_path = os.path.join(output_dir_path, 'variants.txt')
    # loop-invariant lookups, resolved once instead of once per SNP line
    variant_store = get_mall().variant_store
    project_id = family.project.project_id
    with open(variants_file_path, 'w') as f:
        header = ['#CHR', 'POS', 'REF', 'ALT'] + [indiv.indiv_id for indiv in individuals]
        f.write('\t'.join(header) + '\n')
        with open(settings.COMMON_SNP_FILE) as snp_file:
            for _line in snp_file:
                # only the first four tab-separated columns of each line are used
                columns = _line.strip('\n').split('\t')
                chrom, pos, ref, alt = columns[0], columns[1], columns[2], columns[3]
                xpos = genomeloc.get_single_location('chr' + chrom, int(pos))
                variant = variant_store.get_single_variant(project_id, family.family_id, xpos, ref, alt)
                fields = [chrom, pos, ref, alt]
                for indiv in individuals:
                    if not variant:
                        # SNP absent from the variant store: written as zero alt alleles
                        fields.append('0')
                        continue
                    genotype = variant.get_genotype(indiv.indiv_id)
                    if genotype is None or genotype.num_alt is None:
                        # no call for this individual at this site
                        fields.append('.')
                    else:
                        fields.append(str(genotype.num_alt))
                f.write('\t'.join(fields) + '\n')