def get_variants_by_family_for_gene(mall, family_list, inheritance_mode, gene_id, variant_filter=None, quality_filter=None, user=None):
    """
    Run an inheritance-mode search restricted to one gene in each family.

    Returns a dict keyed by (project_id, family_id) whose values are the
    lists of variants found for that family. The gene is added to
    `variant_filter` in place; a fresh VariantFilter is created when the
    caller does not supply one.
    """
    if variant_filter is None:
        variant_filter = VariantFilter()
    variant_filter.add_gene(gene_id)

    results = {}
    for fam in family_list:
        key = (fam.project_id, fam.family_id)
        matches = get_variants_with_inheritance_mode(
            mall,
            fam,
            inheritance_mode,
            variant_filter,
            quality_filter,
            user=user,
        )
        results[key] = list(matches)
    return results
def get_variants_for_inheritance_for_project(project, inheritance_mode):
    """
    Search every family of the project with the given inheritance mode.

    This is a generator: it yields one (family, variant_list) tuple per
    family and reports progress on stdout.
    """
    # Search specification. This could theoretically differ by project,
    # if there are different reference populations.
    variant_filter = get_default_variant_filter('moderate_impact')
    frequency_cutoffs = (
        ('1kg_wgs_phase3', g1k_freq_threshold),
        ('1kg_wgs_phase3_popmax', g1k_popmax_freq_threshold),
        ('exac_v3', exac_freq_threshold),
        ('exac_v3_popmax', exac_popmax_threshold),
    )
    for cutoff in frequency_cutoffs:
        variant_filter.ref_freqs.append(cutoff)

    quality_filter = dict(vcf_filter='pass', min_gq=GQ_threshold, min_ab=AB_threshold)

    # Run the search family by family, handing each result to the caller.
    families = project.get_families()
    for position, family in enumerate(families, 1):
        progress = "Processing %s - family %s (%d / %d) .." % (
            inheritance_mode, family.family_id, position, len(families))
        sys.stdout.write(progress)
        search = get_variants_with_inheritance_mode(
            get_mall(project.project_id),
            family.xfamily(),
            inheritance_mode,
            variant_filter=variant_filter,
            quality_filter=quality_filter,
        )
        variant_list = list(search)
        yield family, variant_list
        # Printed only once the consumer asks for the next family.
        print(" got %d variants" % len(variant_list))
def get_variants_for_inheritance_for_project(project, inheritance_mode):
    """
    Yield (family, variant_list) for each family in the project.

    Each family is searched with `inheritance_mode` using the
    'moderate_impact' default filter extended with population-frequency
    thresholds and a genotype quality filter. Progress goes to stdout.
    """
    # Build the search specification (in theory this could vary by project
    # if different reference populations were in use).
    variant_filter = get_default_variant_filter("moderate_impact")
    ref_freqs = variant_filter.ref_freqs
    ref_freqs.append(("1kg_wgs_phase3", g1k_freq_threshold))
    ref_freqs.append(("1kg_wgs_phase3_popmax", g1k_popmax_freq_threshold))
    ref_freqs.append(("exac_v3", exac_freq_threshold))
    ref_freqs.append(("exac_v3_popmax", exac_popmax_threshold))

    quality_filter = {
        "vcf_filter": "pass",
        "min_gq": GQ_threshold,
        "min_ab": AB_threshold,
    }

    families = project.get_families()
    for i, family in enumerate(families):
        counter = (inheritance_mode, family.family_id, i + 1, len(families))
        sys.stdout.write("Processing %s - family %s (%d / %d) .." % counter)

        mall = get_mall(project.project_id)
        variant_list = list(
            get_variants_with_inheritance_mode(
                mall,
                family.xfamily(),
                inheritance_mode,
                variant_filter=variant_filter,
                quality_filter=quality_filter,
            )
        )
        yield family, variant_list

        # Runs when the generator is resumed for the next family.
        print(" got %d variants" % len(variant_list))
def calculate_mendelian_variant_search(search_spec, xfamily):
    """
    Dispatch a family variant search according to search_spec.search_mode.

    Returns the matching variants as a list, or None when the search mode
    is not one of the handled values.
    """
    mode = search_spec.search_mode

    if mode == 'standard_inheritance':
        stream = get_variants_with_inheritance_mode(
            get_mall(),
            xfamily,
            search_spec.inheritance_mode,
            variant_filter=search_spec.variant_filter,
            quality_filter=search_spec.genotype_quality_filter,
        )
        return list(stream)

    if mode == 'custom_inheritance':
        stream = get_variants_family(
            get_datastore(),
            xfamily,
            genotype_filter=search_spec.genotype_inheritance_filter,
            variant_filter=search_spec.variant_filter,
            quality_filter=search_spec.genotype_quality_filter,
        )
        return list(stream)

    if mode == 'gene_burden':
        gene_stream = get_genes_family(
            get_datastore(),
            get_reference(),
            xfamily,
            burden_filter=search_spec.gene_burden_filter,
            variant_filter=search_spec.variant_filter,
            quality_filter=search_spec.genotype_quality_filter,
        )
        # Flatten the per-gene results back into a variant list.
        return list(stream_utils.gene_stream_to_variant_stream(gene_stream, get_reference()))

    if mode == 'allele_count':
        stream = get_variants_allele_count(
            get_datastore(),
            xfamily,
            search_spec.allele_count_filter,
            variant_filter=search_spec.variant_filter,
            quality_filter=search_spec.genotype_quality_filter,
        )
        return list(stream)

    if mode == 'all_variants':
        stream = get_variants_family(
            get_datastore(),
            xfamily,
            variant_filter=search_spec.variant_filter,
            quality_filter=search_spec.genotype_quality_filter,
        )
        return list(stream)

    # Unrecognised search mode.
    return None
def calculate_mendelian_variant_search(search_spec, xfamily):
    """
    Run the family search selected by search_spec.search_mode.

    Logs the project id, search mode and the spec's attributes to stderr,
    then returns the resulting variants as a list (None if the search mode
    is not handled).
    """
    project_id = xfamily.project_id
    mode = search_spec.search_mode
    sys.stderr.write(
        " mendelian_variant_search for %s - search mode: %s %s\n"
        % (project_id, mode, search_spec.__dict__)
    )

    if mode == 'standard_inheritance':
        return list(get_variants_with_inheritance_mode(
            get_mall(project_id),
            xfamily,
            search_spec.inheritance_mode,
            variant_filter=search_spec.variant_filter,
            quality_filter=search_spec.quality_filter,
        ))

    if mode == 'custom_inheritance':
        return list(get_variants_family(
            get_datastore(project_id),
            xfamily,
            genotype_filter=search_spec.genotype_inheritance_filter,
            variant_filter=search_spec.variant_filter,
            quality_filter=search_spec.quality_filter,
        ))

    if mode == 'gene_burden':
        gene_stream = get_genes_family(
            get_datastore(project_id),
            get_reference(),
            xfamily,
            burden_filter=search_spec.gene_burden_filter,
            variant_filter=search_spec.variant_filter,
            quality_filter=search_spec.quality_filter,
        )
        return list(stream_utils.gene_stream_to_variant_stream(gene_stream, get_reference()))

    if mode == 'allele_count':
        return list(get_variants_allele_count(
            get_datastore(project_id),
            xfamily,
            search_spec.allele_count_filter,
            variant_filter=search_spec.variant_filter,
            quality_filter=search_spec.quality_filter,
        ))

    if mode == 'all_variants':
        return list(get_variants_family(
            get_datastore(project_id),
            xfamily,
            variant_filter=search_spec.variant_filter,
            quality_filter=search_spec.quality_filter,
            indivs_to_consider=xfamily.indiv_id_list(),
        ))

    # No branch matched the search mode.
    return None
def get_variants_for_inheritance_for_project(project, inheritance_mode):
    """
    Yield (family, variant_list) for every family in the project.

    "all_variants" runs a plain variant query over the family's
    individuals; any other value is passed on as an inheritance mode.
    A family whose search raises ValueError is reported on stdout and
    skipped.
    """
    # create search specification
    # this could theoretically differ by project, if there are different reference populations
    #variant_filter = VariantFilter(so_annotations=SO_SEVERITY_ORDER, ref_freqs=[])
    variant_filter = get_default_variant_filter('moderate_impact')
    ref_freqs = variant_filter.ref_freqs
    ref_freqs.append(('1kg_wgs_phase3', g1k_freq_threshold))
    ref_freqs.append(('1kg_wgs_phase3_popmax', g1k_popmax_freq_threshold))
    ref_freqs.append(('exac_v3', exac_freq_threshold))
    ref_freqs.append(('exac_v3_popmax', exac_popmax_threshold))
    ref_freqs.append(('merck-wgs-3793', merck_wgs_3793_threshold))
    #variant_filter.ref_freqs.append(('merck-pcr-free-wgs-144', merck_wgs_144_threshold))

    quality_filter = {
        # 'vcf_filter': 'pass',
        'min_gq': GQ_threshold,
        'min_ab': AB_threshold,
    }

    # run the search for each family, handing results to the caller one at a time
    families = project.get_families()
    for position, family in enumerate(families, 1):
        print("Processing %s - family %s (%d / %d)" % (inheritance_mode, family.family_id, position, len(families)))
        try:
            if inheritance_mode == "all_variants":
                variant_stream = get_variants(
                    get_datastore(project.project_id),
                    family.xfamily(),
                    variant_filter=variant_filter,
                    quality_filter=quality_filter,
                    indivs_to_consider=family.indiv_id_list(),
                )
            else:
                variant_stream = get_variants_with_inheritance_mode(
                    get_mall(project.project_id),
                    family.xfamily(),
                    inheritance_mode,
                    variant_filter=variant_filter,
                    quality_filter=quality_filter,
                )
            yield family, list(variant_stream)
        except ValueError as e:
            print("Error: %s. Skipping family %s" % (str(e), str(family)))
def get_variants_for_inheritance_for_project(project, inheritance_mode):
    """
    Generate (family, variant_list) pairs for the project's families.

    With inheritance_mode == "all_variants" each family is queried through
    get_variants on the project datastore; otherwise the inheritance-mode
    search is used. ValueError during a family's search is printed and that
    family is skipped.
    """
    # create search specification
    # this could theoretically differ by project, if there are different reference populations
    #variant_filter = VariantFilter(so_annotations=SO_SEVERITY_ORDER, ref_freqs=[])
    variant_filter = get_default_variant_filter('moderate_impact')
    for population_cutoff in (
        ('1kg_wgs_phase3', g1k_freq_threshold),
        ('1kg_wgs_phase3_popmax', g1k_popmax_freq_threshold),
        ('exac_v3', exac_freq_threshold),
        ('exac_v3_popmax', exac_popmax_threshold),
        ('merck-wgs-3793', merck_wgs_3793_threshold),
        #('merck-pcr-free-wgs-144', merck_wgs_144_threshold),
    ):
        variant_filter.ref_freqs.append(population_cutoff)

    quality_filter = {
        # 'vcf_filter': 'pass',
        'min_gq': GQ_threshold,
        'min_ab': AB_threshold,
    }
    # Keyword arguments common to both kinds of search.
    filters = {'variant_filter': variant_filter, 'quality_filter': quality_filter}

    families = project.get_families()
    for i, family in enumerate(families):
        progress = (inheritance_mode, family.family_id, i + 1, len(families))
        print("Processing %s - family %s (%d / %d)" % progress)
        try:
            if inheritance_mode != "all_variants":
                yield family, list(get_variants_with_inheritance_mode(
                    get_mall(project.project_id),
                    family.xfamily(),
                    inheritance_mode,
                    **filters
                ))
            else:
                yield family, list(get_variants(
                    get_datastore(project.project_id),
                    family.xfamily(),
                    indivs_to_consider=family.indiv_id_list(),
                    **filters
                ))
        except ValueError as e:
            print("Error: %s. Skipping family %s" % (str(e), str(family)))
def get_families_by_gene(mall, family_group, inheritance_mode, variant_filter=None, quality_filter=None):
    """
    Group the families of a family group by the genes their variants hit.

    Generator yielding (gene_id, family_list) pairs, where family_list is
    the sorted list of (project_id, family_id) tuples of families that have
    at least one matching variant with that gene among its coding_gene_ids.
    """
    gene_to_families = defaultdict(set)

    for fam in family_group.get_families():
        hits = get_variants_with_inheritance_mode(
            mall, fam, inheritance_mode, variant_filter, quality_filter
        )
        for hit in hits:
            for gene_id in hit.coding_gene_ids:
                gene_to_families[gene_id].add((fam.project_id, fam.family_id))

    for gene_id, fam_keys in gene_to_families.items():
        yield gene_id, sorted(fam_keys)
def get_variants_by_family_for_gene(mall, family_list, inheritance_mode, gene_id, variant_filter=None, quality_filter=None):
    """
    Collect, per family, the variants in one gene that fit an inheritance mode.

    The gene is added to `variant_filter` (a new VariantFilter is used when
    none is passed), and each family in `family_list` is searched with it.
    Returns a dict mapping (project_id, family_id) to that family's list
    of variants.
    """
    gene_filter = VariantFilter() if variant_filter is None else variant_filter
    gene_filter.add_gene(gene_id)

    variants_by_family = {}
    for fam in family_list:
        fam_key = (fam.project_id, fam.family_id)
        variants_by_family[fam_key] = list(
            get_variants_with_inheritance_mode(
                mall, fam, inheritance_mode, gene_filter, quality_filter
            )
        )
    return variants_by_family
def combine_mendelian_families_spec(request):
    """
    Return the cached results of a combine-mendelian-families search.

    Reads 'search_hash', 'return_type' and 'group_by_variants' from
    request.GET. Depending on those parameters the response is:
      * a JSONResponse with the genes and the search spec
        (return_type is not 'csv');
      * a CSV with one row per gene
        (return_type is 'csv', group_by_variants not set);
      * a CSV with one row per variant and three columns per individual
        (return_type is 'csv' and group_by_variants set).

    Raises PermissionDenied when the user cannot view the project.
    """
    project, family_group = utils.get_project_and_family_group_for_user(request.user, request.GET)
    if not project.can_view(request.user):
        raise PermissionDenied

    search_hash = request.GET.get('search_hash')
    search_spec, genes = cache_utils.get_cached_results(project.project_id, search_hash)
    search_spec_obj = MendelianVariantSearchSpec.fromJSON(search_spec)

    if request.GET.get('return_type') != 'csv' or not request.GET.get('group_by_variants'):
        # gene-level results: recompute them when the cache returned no genes
        if genes is None:
            genes = api_utils.calculate_combine_mendelian_families(family_group, search_spec)
        api_utils.add_extra_info_to_genes(project, get_reference(), genes)

        if request.GET.get('return_type') != 'csv':
            return JSONResponse({
                'is_error': False,
                'genes': genes,
                'search_spec': search_spec,
            })
        else:
            response = HttpResponse(content_type='text/csv')
            response['Content-Disposition'] = 'attachment; filename="family_group_results_{}.csv"'.format(search_hash)
            writer = csv.writer(response)
            writer.writerow(["gene", "# families", "family list", "chrom", "start", "end"])
            for gene in genes:
                # gene["family_id_list"] holds (project_id, family_id) pairs; only the family id is written
                family_id_list = [family_id for (project_id, family_id) in gene["family_id_list"]]
                # NOTE(review): each row carries a trailing "" cell, i.e. 7 values against 6 header columns
                writer.writerow(map(str, [gene["gene_name"], len(family_id_list), " ".join(family_id_list), gene["chr"], gene["start"], gene["end"], ""]))
            return response
    else:
        # download results grouped by variant
        indiv_id_list = []
        for family in family_group.get_families():
            indiv_id_list.extend(family.indiv_ids_with_variant_data())

        response = HttpResponse(content_type='text/csv')
        response['Content-Disposition'] = 'attachment; filename="results_{}.csv"'.format(search_hash)
        writer = csv.writer(response)

        # header: fixed variant columns, one column per reference population,
        # predictor columns, then genotype / gq / dp columns per individual
        headers = ['genes','chr','pos','ref','alt','worst_annotation' ]
        headers.extend(project.get_reference_population_slugs())
        headers.extend([ 'polyphen','sift','muttaster','fathmm'])
        for indiv_id in indiv_id_list:
            headers.append(indiv_id)
            headers.append(indiv_id+'_gq')
            headers.append(indiv_id+'_dp')
        writer.writerow(headers)

        mall = get_mall(project.project_id)

        # variant key -> {individual id -> variant object returned for that individual's family}
        variant_key_to_individual_id_to_variant = defaultdict(dict)
        # variant key -> most recently seen variant object with that key
        variant_key_to_variant = {}
        for family in family_group.get_families():
            for variant in get_variants_with_inheritance_mode(
                mall,
                family.xfamily(),
                search_spec_obj.inheritance_mode,
                search_spec_obj.variant_filter,
                search_spec_obj.quality_filter,
            ):
                # variants without any coding gene id are left out of the export
                if len(variant.coding_gene_ids) == 0:
                    continue

                variant_key = (variant.xpos, variant.ref, variant.alt)
                variant_key_to_variant[variant_key] = variant
                for indiv_id in family.indiv_ids_with_variant_data():
                    variant_key_to_individual_id_to_variant[variant_key][indiv_id] = variant

        # one output row per distinct variant key, in sorted key order
        for variant_key in sorted(variant_key_to_individual_id_to_variant.keys()):
            variant = variant_key_to_variant[variant_key]
            individual_id_to_variant = variant_key_to_individual_id_to_variant[variant_key]

            genes = [mall.reference.get_gene_symbol(gene_id) for gene_id in variant.coding_gene_ids]
            fields = []
            fields.append(','.join(genes))
            fields.extend([
                variant.chr,
                str(variant.pos),
                variant.ref,
                variant.alt,
                variant.annotation.get('vep_group', '.'),
            ])
            for ref_population_slug in project.get_reference_population_slugs():
                fields.append(variant.annotation['freqs'][ref_population_slug])
            for field_key in ['polyphen', 'sift', 'muttaster', 'fathmm']:
                fields.append(variant.annotation[field_key])

            for indiv_id in indiv_id_list:
                # `variant` is rebound here to the object recorded for this
                # individual (None when the individual's family did not return this variant)
                variant = individual_id_to_variant.get(indiv_id)
                genotype = None
                if variant is not None:
                    genotype = variant.get_genotype(indiv_id)

                if genotype is None:
                    # no genotype available: placeholders for genotype, gq and dp
                    fields.extend(['.', '.', '.'])
                else:
                    # spell the genotype out as alleles based on the alt allele count
                    if genotype.num_alt == 0:
                        fields.append("%s/%s" % (variant.ref, variant.ref))
                    elif genotype.num_alt == 1:
                        fields.append("%s/%s" % (variant.ref, variant.alt))
                    elif genotype.num_alt == 2:
                        fields.append("%s/%s" % (variant.alt, variant.alt))
                    else:
                        fields.append("./.")

                    fields.append(str(genotype.gq) if genotype.gq is not None else '.')
                    fields.append(genotype.extras['dp'] if genotype.extras.get('dp') is not None else '.')
            writer.writerow(fields)
        return response
def calculate_mendelian_variant_search(search_spec, family, user=None):
    """
    Run the family search selected by search_spec.search_mode.

    Every returned variant gets the extra 'family_id' set to the family's id.

    Args:
        search_spec: search specification; `search_mode` selects the branch
            and the matching filter attributes are read from it.
        family: family to search; its xfamily() and project are used.
        user: passed through to the underlying search functions.

    Returns:
        A list of variants, or None when search_mode is not one of the
        handled values (previously this case raised TypeError while trying
        to iterate over None).
    """
    xfamily = family.xfamily()
    project = family.project

    variants = None
    if search_spec.search_mode == 'standard_inheritance':
        variants = list(get_variants_with_inheritance_mode(
            get_mall(project),
            xfamily,
            search_spec.inheritance_mode,
            variant_filter=search_spec.variant_filter,
            quality_filter=search_spec.quality_filter,
            user=user,
        ))
    elif search_spec.search_mode == 'custom_inheritance':
        variants = list(get_variants_family(
            get_datastore(project),
            xfamily,
            genotype_filter=search_spec.genotype_inheritance_filter,
            variant_filter=search_spec.variant_filter,
            quality_filter=search_spec.quality_filter,
            user=user,
        ))
    elif search_spec.search_mode == 'gene_burden':
        gene_stream = get_genes_family(
            get_datastore(project),
            get_reference(),
            xfamily,
            burden_filter=search_spec.gene_burden_filter,
            variant_filter=search_spec.variant_filter,
            quality_filter=search_spec.quality_filter,
            user=user,
        )
        variants = list(stream_utils.gene_stream_to_variant_stream(gene_stream, get_reference()))
    elif search_spec.search_mode == 'allele_count':
        variants = list(get_variants_allele_count(
            get_datastore(project),
            xfamily,
            search_spec.allele_count_filter,
            variant_filter=search_spec.variant_filter,
            quality_filter=search_spec.quality_filter,
            user=user,
        ))
    elif search_spec.search_mode == 'all_variants':
        variants = list(get_variants_family(
            get_datastore(project),
            xfamily,
            variant_filter=search_spec.variant_filter,
            quality_filter=search_spec.quality_filter,
            indivs_to_consider=xfamily.indiv_id_list(),
            user=user,
        ))

    # Unhandled search mode: nothing was searched, so there is nothing to tag.
    if variants is None:
        return None

    for variant in variants:
        variant.set_extra('family_id', family.family_id)

    return variants
def calculate_mendelian_variant_search(search_spec, xfamily):
    """
    Log the search specification to stderr, then run the matching search.

    The branch is chosen by search_spec.search_mode. Returns the variants
    as a list, or None when the search mode is not handled.
    """
    log_template = (
        "mendelian_variant_search for %s - search mode: %s \n"
        "variant_filter: %s \n"
        "inheritance_mode: %s \n"
        "allele_count_filter: %s \n"
        "quality_filter: %s \n"
        "genotype_inheritance_filter: %s \n"
    )
    log_values = (
        xfamily.project_id,
        search_spec.search_mode,
        search_spec.variant_filter.toJSON() if search_spec.variant_filter else '',
        search_spec.inheritance_mode,
        search_spec.allele_count_filter,
        search_spec.quality_filter,
        search_spec.genotype_inheritance_filter,
    )
    sys.stderr.write(log_template % log_values)

    mode = search_spec.search_mode

    if mode == 'standard_inheritance':
        found = get_variants_with_inheritance_mode(
            get_mall(xfamily.project_id),
            xfamily,
            search_spec.inheritance_mode,
            variant_filter=search_spec.variant_filter,
            quality_filter=search_spec.quality_filter,
        )
    elif mode == 'custom_inheritance':
        found = get_variants_family(
            get_datastore(xfamily.project_id),
            xfamily,
            genotype_filter=search_spec.genotype_inheritance_filter,
            variant_filter=search_spec.variant_filter,
            quality_filter=search_spec.quality_filter,
        )
    elif mode == 'gene_burden':
        gene_stream = get_genes_family(
            get_datastore(xfamily.project_id),
            get_reference(),
            xfamily,
            burden_filter=search_spec.gene_burden_filter,
            variant_filter=search_spec.variant_filter,
            quality_filter=search_spec.quality_filter,
        )
        found = stream_utils.gene_stream_to_variant_stream(gene_stream, get_reference())
    elif mode == 'allele_count':
        found = get_variants_allele_count(
            get_datastore(xfamily.project_id),
            xfamily,
            search_spec.allele_count_filter,
            variant_filter=search_spec.variant_filter,
            quality_filter=search_spec.quality_filter,
        )
    elif mode == 'all_variants':
        found = get_variants_family(
            get_datastore(xfamily.project_id),
            xfamily,
            variant_filter=search_spec.variant_filter,
            quality_filter=search_spec.quality_filter,
            indivs_to_consider=xfamily.indiv_id_list(),
        )
    else:
        # Search mode not handled here.
        return None

    return list(found)
def calculate_mendelian_variant_search(search_spec, xfamily):
    """
    Run the family search selected by search_spec.search_mode.

    A one-line summary (search mode and inheritance mode) is written to
    stderr first. The log line used to be labelled "cohort_variant_search"
    and had no trailing newline, so it was misattributed and ran into
    whatever was written to stderr next; both are corrected here.

    Args:
        search_spec: search specification; `search_mode` selects the branch
            and the matching filter attributes are read from it.
        xfamily: family object handed to the search functions; its
            project_id selects the mall / datastore.

    Returns:
        A list of variants, or None when search_mode is not handled.
    """
    sys.stderr.write(
        " mendelian_variant_search - search mode: %s - inheritance_mode: %s\n"
        % (search_spec.search_mode, search_spec.inheritance_mode)
    )

    variants = None
    if search_spec.search_mode == 'standard_inheritance':
        variants = list(
            get_variants_with_inheritance_mode(
                get_mall(xfamily.project_id),
                xfamily,
                search_spec.inheritance_mode,
                variant_filter=search_spec.variant_filter,
                quality_filter=search_spec.genotype_quality_filter,
            ))
    elif search_spec.search_mode == 'custom_inheritance':
        variants = list(
            get_variants_family(
                get_datastore(xfamily.project_id),
                xfamily,
                genotype_filter=search_spec.genotype_inheritance_filter,
                variant_filter=search_spec.variant_filter,
                quality_filter=search_spec.genotype_quality_filter,
            ))
    elif search_spec.search_mode == 'gene_burden':
        gene_stream = get_genes_family(
            get_datastore(xfamily.project_id),
            get_reference(),
            xfamily,
            burden_filter=search_spec.gene_burden_filter,
            variant_filter=search_spec.variant_filter,
            quality_filter=search_spec.genotype_quality_filter,
        )
        # Flatten the per-gene results into a plain variant list.
        variants = list(
            stream_utils.gene_stream_to_variant_stream(gene_stream, get_reference()))
    elif search_spec.search_mode == 'allele_count':
        variants = list(
            get_variants_allele_count(
                get_datastore(xfamily.project_id),
                xfamily,
                search_spec.allele_count_filter,
                variant_filter=search_spec.variant_filter,
                quality_filter=search_spec.genotype_quality_filter,
            ))
    elif search_spec.search_mode == 'all_variants':
        variants = list(
            get_variants_family(
                get_datastore(xfamily.project_id),
                xfamily,
                variant_filter=search_spec.variant_filter,
                quality_filter=search_spec.genotype_quality_filter,
            ))

    return variants
def calculate_mendelian_variant_search(search_spec, family, user=None):
    """
    Search one family according to search_spec.search_mode.

    Each variant found is tagged with the extra 'family_id' (the family's
    id) before the list is returned.

    Args:
        search_spec: search specification; `search_mode` picks the kind of
            search and the relevant filters are read from it.
        family: family to search; family.xfamily() and family.project are used.
        user: forwarded to the underlying search functions.

    Returns:
        A list of variants, or None if search_mode is not a handled value.
        (Before this guard an unhandled mode raised TypeError, because the
        tagging loop iterated over None.)
    """
    xfamily = family.xfamily()
    project = family.project
    mode = search_spec.search_mode

    if mode == 'standard_inheritance':
        variants = list(
            get_variants_with_inheritance_mode(
                get_mall(project),
                xfamily,
                search_spec.inheritance_mode,
                variant_filter=search_spec.variant_filter,
                quality_filter=search_spec.quality_filter,
                user=user,
            ))
    elif mode == 'custom_inheritance':
        variants = list(
            get_variants_family(
                get_datastore(project),
                xfamily,
                genotype_filter=search_spec.genotype_inheritance_filter,
                variant_filter=search_spec.variant_filter,
                quality_filter=search_spec.quality_filter,
                user=user,
            ))
    elif mode == 'gene_burden':
        gene_stream = get_genes_family(
            get_datastore(project),
            get_reference(),
            xfamily,
            burden_filter=search_spec.gene_burden_filter,
            variant_filter=search_spec.variant_filter,
            quality_filter=search_spec.quality_filter,
            user=user,
        )
        variants = list(
            stream_utils.gene_stream_to_variant_stream(gene_stream, get_reference()))
    elif mode == 'allele_count':
        variants = list(
            get_variants_allele_count(
                get_datastore(project),
                xfamily,
                search_spec.allele_count_filter,
                variant_filter=search_spec.variant_filter,
                quality_filter=search_spec.quality_filter,
                user=user,
            ))
    elif mode == 'all_variants':
        variants = list(
            get_variants_family(
                get_datastore(project),
                xfamily,
                variant_filter=search_spec.variant_filter,
                quality_filter=search_spec.quality_filter,
                indivs_to_consider=xfamily.indiv_id_list(),
                user=user,
            ))
    else:
        # Unhandled search mode: no search was run, so there is nothing to tag.
        return None

    for variant in variants:
        variant.set_extra('family_id', family.family_id)

    return variants
def handle(self, *args, **options):
    """
    Search a list of families with one inheritance mode and write TSV reports.

    Positional arguments (read from *args):
        args[0]: project_id
        args[1]: inheritance_mode
        args[2]: path to a text file with one family_id per line

    Three files are written using relative paths:
        family_variants.tsv - one row per (family, variant)
        variants.tsv        - one row per distinct variant with a 1/0 column per family
        genes.tsv           - one row per gene symbol with a 1/0 column per family

    Fixes over the previous version: the variants.tsv header was missing a
    comma between 'gene' and 'annotation', which fused them into a single
    'geneannotation' cell and shifted every following header column; all
    files are now closed through context managers.
    """
    project_id = args[0]
    inheritance_mode = args[1]
    fam_list_file_path = args[2]

    project = Project.objects.get(project_id=project_id)
    families = []
    with open(fam_list_file_path) as fam_list_file:
        for line in fam_list_file:
            family_id = line.strip('\n')
            families.append(Family.objects.get(project=project, family_id=family_id))

    # create search spec
    variant_filter = next(
        f for f in project.get_default_variant_filters() if f['slug'] == 'moderate_impact'
    )['variant_filter']
    quality_filter = {
        'min_gq': 20,
        'min_ab': 25,
    }

    # run MendelianVariantSearch for each family, collect results
    family_results = {}
    for family in families:
        family_results[family] = list(get_variants_with_inheritance_mode(
            get_mall(project_id),
            family.xfamily(),
            inheritance_mode,
            variant_filter=variant_filter,
            quality_filter=quality_filter,
        ))

    # per-family columns shared by the variants.tsv and genes.tsv headers
    family_ids = [fam.family_id for fam in families]

    # create family_variants.tsv
    with open('family_variants.tsv', 'w') as f:
        writer = csv.writer(f, dialect='excel', delimiter='\t')
        writer.writerow([
            '#family_id',
            'gene',
            'chrom',
            'ref',
            'alt',
            'rsid',
            'annotation',
        ])
        for family in families:
            for variant in family_results[family]:
                writer.writerow([
                    family.family_id,
                    get_gene_symbol(variant),
                    variant.chr,
                    variant.ref,
                    variant.alt,
                    variant.vcf_id,
                    variant.annotation['vep_group'],
                ])

    # create variants.tsv
    by_variant = {}    # unique tuple -> set of family ids carrying the variant
    variant_info = {}  # unique tuple -> first variant object seen with that tuple
    for family in families:
        for variant in family_results[family]:
            variant_t = variant.unique_tuple()
            if variant_t not in by_variant:
                by_variant[variant_t] = set()
                variant_info[variant_t] = variant
            by_variant[variant_t].add(family.family_id)

    with open('variants.tsv', 'w') as f:
        writer = csv.writer(f, dialect='excel', delimiter='\t')
        headers = [
            '#chrom',
            'ref',
            'alt',
            'rsid',
            'gene',
            'annotation',
            'num_families',
        ]
        headers.extend(family_ids)
        writer.writerow(headers)
        for variant_t in sorted(variant_info.keys()):
            variant = variant_info[variant_t]
            carriers = by_variant[variant_t]
            fields = [
                variant.chr,
                variant.ref,
                variant.alt,
                variant.vcf_id,
                get_gene_symbol(variant),
                variant.annotation['vep_group'],
                str(len(carriers)),
            ]
            for family in families:
                fields.append('1' if family.family_id in carriers else '0')
            writer.writerow(fields)

    # create genes.tsv
    by_gene = {}  # gene symbol -> set of family ids with a variant in that gene
    for family in families:
        for variant in family_results[family]:
            gene_symbol = get_gene_symbol(variant)
            if gene_symbol not in by_gene:
                by_gene[gene_symbol] = set()
            by_gene[gene_symbol].add(family.family_id)

    with open('genes.tsv', 'w') as f:
        writer = csv.writer(f, dialect='excel', delimiter='\t')
        headers = [
            '#gene',
            'num_families',
        ]
        headers.extend(family_ids)
        writer.writerow(headers)
        for gene_symbol in sorted(by_gene.keys()):
            carriers = by_gene[gene_symbol]
            fields = [
                gene_symbol,
                str(len(carriers)),
            ]
            for family in families:
                fields.append('1' if family.family_id in carriers else '0')
            writer.writerow(fields)
def handle(self, *args, **options):
    """
    Run an inheritance search over listed families and export TSV summaries.

    Positional arguments (read from *args):
        args[0]: project_id
        args[1]: inheritance_mode
        args[2]: path to a text file listing one family_id per line

    Output files (relative paths):
        family_variants.tsv - one row per (family, variant)
        variants.tsv        - one row per distinct variant, 1/0 flag per family
        genes.tsv           - one row per gene symbol, 1/0 flag per family

    Corrected here: the variants.tsv header list lacked a comma between
    'gene' and 'annotation', so Python concatenated them into
    'geneannotation' and the header had one column fewer than the data
    rows. File handles are now managed with `with` so they are always closed.
    """
    project_id = args[0]
    inheritance_mode = args[1]
    fam_list_file_path = args[2]

    project = Project.objects.get(project_id=project_id)
    with open(fam_list_file_path) as fam_list_file:
        families = [
            Family.objects.get(project=project, family_id=line.strip('\n'))
            for line in fam_list_file
        ]

    # create search spec
    variant_filter = next(
        f for f in project.get_default_variant_filters()
        if f['slug'] == 'moderate_impact')['variant_filter']
    quality_filter = {
        'min_gq': 30,
        'min_ab': 25,
    }

    # run MendelianVariantSearch for each family, collect results
    family_results = {}
    for family in families:
        family_results[family] = list(
            get_variants_with_inheritance_mode(
                get_mall(project_id),
                family.xfamily(),
                inheritance_mode,
                variant_filter=variant_filter,
                quality_filter=quality_filter,
            ))

    # create family_variants.tsv
    with open('family_variants.tsv', 'w') as out:
        writer = csv.writer(out, dialect='excel', delimiter='\t')
        writer.writerow([
            '#family_id',
            'gene',
            'chrom',
            'ref',
            'alt',
            'rsid',
            'annotation',
        ])
        for family in families:
            for variant in family_results[family]:
                writer.writerow([
                    family.family_id,
                    get_gene_symbol(variant),
                    variant.chr,
                    variant.ref,
                    variant.alt,
                    variant.vcf_id,
                    variant.annotation['vep_group'],
                ])

    # create variants.tsv
    by_variant = {}    # unique tuple -> ids of the families carrying the variant
    variant_info = {}  # unique tuple -> first variant object seen for it
    for family in families:
        for variant in family_results[family]:
            key = variant.unique_tuple()
            if key not in by_variant:
                by_variant[key] = set()
                variant_info[key] = variant
            by_variant[key].add(family.family_id)

    with open('variants.tsv', 'w') as out:
        writer = csv.writer(out, dialect='excel', delimiter='\t')
        headers = [
            '#chrom',
            'ref',
            'alt',
            'rsid',
            'gene',
            'annotation',
            'num_families',
        ]
        headers.extend([fam.family_id for fam in families])
        writer.writerow(headers)
        for variant_t in sorted(variant_info.keys()):
            variant = variant_info[variant_t]
            fields = [
                variant.chr,
                variant.ref,
                variant.alt,
                variant.vcf_id,
                get_gene_symbol(variant),
                variant.annotation['vep_group'],
                str(len(by_variant[variant_t])),
            ]
            fields.extend(
                '1' if family.family_id in by_variant[variant_t] else '0'
                for family in families
            )
            writer.writerow(fields)

    # create genes.tsv
    by_gene = {}  # gene symbol -> ids of the families with a variant in that gene
    for family in families:
        for variant in family_results[family]:
            by_gene.setdefault(get_gene_symbol(variant), set()).add(family.family_id)

    with open('genes.tsv', 'w') as out:
        writer = csv.writer(out, dialect='excel', delimiter='\t')
        headers = [
            '#gene',
            'num_families',
        ]
        headers.extend([fam.family_id for fam in families])
        writer.writerow(headers)
        for gene_symbol in sorted(by_gene.keys()):
            fields = [
                gene_symbol,
                str(len(by_gene[gene_symbol])),
            ]
            fields.extend(
                '1' if family.family_id in by_gene[gene_symbol] else '0'
                for family in families
            )
            writer.writerow(fields)