def add_sample(self, sample_id, cnv_file):
    """Load the CNV calls for one sample from a CSV file, replacing any existing data."""
    print "Adding CNVs for %s" % sample_id
    self._db.samples.remove({"sample_id": sample_id})
    self._db.cnvs.remove({"sample_id": sample_id})
    self._db.cnvs.ensure_index([("sample_id", 1), ("genes", 1)])
    self._db.samples.insert({"sample_id": str(sample_id), "status": "loading"})
    reader = csv.reader(cnv_file)
    reader.next()  # skip the header row
    for row in reader:
        chrom = "chr" + row[7]
        start = int(row[5])
        stop = int(row[6])
        xstart = genomeloc.get_single_location(chrom, start)
        xstop = genomeloc.get_single_location(chrom, stop)
        cnv = {
            "sample_id": sample_id,
            "type": row[3],
            "nexons": int(row[4]),
            "xstart": xstart,
            "xstop": xstop,
            "genes": self.reference.get_genes_in_region(xstart, xstop),
            "reads": [int(row[10]), int(row[11])],
            "read_ratio": float(row[12]),
        }
        self._db.cnvs.insert(cnv)
def add_sample(self, sample_id, cnv_file):
    """Load the CNV calls for one sample from a CSV file, replacing any existing data."""
    print "Adding CNVs for %s" % sample_id
    self.remove_sample(sample_id)
    self._db.cnvs.ensure_index([('sample_id', 1), ('genes', 1)])  # silly to have this here
    self._db.samples.insert({
        'sample_id': str(sample_id),
        'status': 'loading',
    })
    reader = csv.reader(cnv_file)
    reader.next()  # skip the header row
    for row in reader:
        chrom = 'chr' + row[7]
        start = int(row[5])
        stop = int(row[6])
        xstart = genomeloc.get_single_location(chrom, start)
        xstop = genomeloc.get_single_location(chrom, stop)
        cnv = {
            'sample_id': sample_id,
            'type': row[3],
            'nexons': int(row[4]),
            'xstart': xstart,
            'xstop': xstop,
            'genes': self.reference.get_genes_in_region(chrom, start, stop),
            'reads': [int(row[10]), int(row[11])],
            'read_ratio': float(row[12]),
        }
        self._db.cnvs.insert(cnv)
def add_sample(self, sample_id, cnv_file):
    """Load the CNV calls for one sample from a CSV file, replacing any existing data."""
    print "Adding CNVs for %s" % sample_id
    self.remove_sample(sample_id)
    self._db.cnvs.ensure_index([('sample_id', 1), ('genes', 1)])  # silly to have this here
    self._db.samples.insert({
        'sample_id': str(sample_id),
        'status': 'loading',
    })
    reader = csv.reader(cnv_file)
    reader.next()  # skip the header row
    for row in reader:
        chrom = 'chr' + row[7]
        start = int(row[5])
        stop = int(row[6])
        xstart = genomeloc.get_single_location(chrom, start)
        xstop = genomeloc.get_single_location(chrom, stop)
        cnv = {
            'sample_id': sample_id,
            'type': row[3],
            'nexons': int(row[4]),
            'xstart': xstart,
            'xstop': xstop,
            'genes': self.reference.get_genes_in_region(xstart, xstop),
            'reads': [int(row[10]), int(row[11])],
            'read_ratio': float(row[12]),
        }
        self._db.cnvs.insert(cnv)
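# A minimal usage sketch for the add_sample variants above. `store` and the
# CSV path are hypothetical; the file is assumed to have a header row plus
# the column layout the parser indexes into (type, nexons, start, stop,
# chrom, read counts, and read ratio at columns 3-12):
#
#     with open('NA12878.cnvs.csv') as cnv_file:
#         store.add_sample('NA12878', cnv_file)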
def create_genome_subset_from_interval_list(interval_list_file):
    """
    Creates a genome subset from an interval list file.
    This is a tab-separated file with columns chr, start, stop, strand, name.
    Strand and name are ignored, and extra columns are tolerated without complaint.
    Coordinates are 1-indexed and inclusive.
    """
    intervals = []
    for line in interval_list_file:
        fields = line.strip('\n').split('\t')
        chrom = 'chr' + fields[0]
        start = int(fields[1])
        end = int(fields[2])
        intervals.append((genomeloc.get_single_location(chrom, start),
                          genomeloc.get_single_location(chrom, end)))
    return GenomeSubsetFilter(intervals)
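# Example input for create_genome_subset_from_interval_list (coordinates are
# illustrative; tab-separated chr, start, stop, strand, name; 1-indexed and
# inclusive per the docstring), followed by a hypothetical usage sketch:
#
#     1	10000	20000	+	regionA
#     X	500	900	-	regionB
#
#     with open('targets.interval_list') as f:
#         subset_filter = create_genome_subset_from_interval_list(f)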
def load_dbnsfp():
    polyphen_map = {
        'D': 'probably_damaging',
        'P': 'possibly_damaging',
        'B': 'benign',
    }
    sift_map = {
        'D': 'damaging',
        'T': 'tolerated',
    }
    fathmm_map = {
        'D': 'damaging',
        'T': 'tolerated',
    }
    muttaster_map = {
        'A': 'disease_causing',
        'D': 'disease_causing',
        'N': 'polymorphism',
        'P': 'polymorphism',
    }
    nsfp_file = open(settings.INTERMEDIATE_FILE_DIR + 'dbnsfp.tsv', 'w')
    for chrom in CHROMOSOMES:
        print "Reading dbNSFP data for {}".format(chrom)
        single_chrom_file = open(settings.DBNSFP_DIR + 'dbNSFP2.1_variant.' + chrom)
        for i, line in enumerate(single_chrom_file):
            if i == 0:
                continue  # skip the header line
            fields = line.strip('\n').split('\t')
            chrom, pos, ref, alt = fields[:4]
            chrom = 'chr' + chrom
            pos = int(pos)
            xpos = genomeloc.get_single_location(chrom, pos)
            if not xpos:
                continue
            polyphen = polyphen_map.get(fields[25], '.')
            sift = sift_map.get(fields[23], '.')
            fathmm = fathmm_map.get(fields[39], '.')
            muttaster = muttaster_map.get(fields[33], '.')
            fields = [str(xpos), ref, alt, polyphen, sift, fathmm, muttaster]
            nsfp_file.write('\t'.join(fields) + '\n')
    nsfp_file.close()  # flush the intermediate file
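# Sketch of reading back the intermediate file that load_dbnsfp writes; the
# column order matches the `fields` list in the writer, with '.' marking
# predictions that had no mapping:
#
#     for line in open(settings.INTERMEDIATE_FILE_DIR + 'dbnsfp.tsv'):
#         xpos, ref, alt, polyphen, sift, fathmm, muttaster = \
#             line.rstrip('\n').split('\t')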
def load_population(self, population):
    """
    Takes a population and a data source; extracts and loads it into the annotator.
    The data source can be a VCF file, a VCF counts file, or a counts dir (in the case of ESP data).
    """
    if population['file_type'] == 'vcf':
        if population['file_path'].endswith('.gz'):
            vcf_file = gzip.open(population['file_path'])
            size = os.path.getsize(population['file_path'])
            progress_file = vcf_file.fileobj
        else:
            vcf_file = open(population['file_path'])
            size = os.path.getsize(population['file_path'])
            progress_file = vcf_file
        progress = get_progressbar(size, 'Loading vcf: {}'.format(population['slug']))
        for variant in vcf_stuff.iterate_vcf(vcf_file, genotypes=True, genotype_meta=False):
            progress.update(progress_file.tell())
            freq = get_aaf(variant)
            self._add_population_frequency(variant.xpos, variant.ref, variant.alt, population['slug'], freq)
        vcf_file.close()

    elif population['file_type'] == 'sites_vcf':
        if population['file_path'].endswith('.gz'):
            vcf_file = gzip.open(population['file_path'])
            size = os.path.getsize(population['file_path'])
            progress_file = vcf_file.fileobj
        else:
            vcf_file = open(population['file_path'])
            size = os.path.getsize(population['file_path'])
            progress_file = vcf_file
        meta_key = population.get('vcf_info_key', 'AF')
        progress = get_progressbar(size, 'Loading sites vcf: {}'.format(population['slug']))
        is_1kg_popmax = "popmax" in meta_key.lower() and ("1000 Genomes" in population["name"])
        if is_1kg_popmax:
            # per-population AF fields from the 1000 Genomes sites VCF header:
            ##INFO=<ID=EAS_AF,Number=A,Type=Float,Description="Allele frequency in the EAS populations calculated from AC and AN, in the range (0,1)">
            ##INFO=<ID=EUR_AF,Number=A,Type=Float,Description="Allele frequency in the EUR populations calculated from AC and AN, in the range (0,1)">
            ##INFO=<ID=AFR_AF,Number=A,Type=Float,Description="Allele frequency in the AFR populations calculated from AC and AN, in the range (0,1)">
            ##INFO=<ID=AMR_AF,Number=A,Type=Float,Description="Allele frequency in the AMR populations calculated from AC and AN, in the range (0,1)">
            ##INFO=<ID=SAS_AF,Number=A,Type=Float,Description="Allele frequency in the SAS populations calculated from AC and AN, in the range (0,1)">
            meta_fields = ["EAS_AF", "EUR_AF", "AFR_AF", "AMR_AF", "SAS_AF"]
        else:
            meta_fields = [meta_key]
        for variant in vcf_stuff.iterate_vcf(vcf_file, meta_fields=meta_fields):
            progress.update(progress_file.tell())
            if is_1kg_popmax:
                # popmax: take the highest per-population frequency for this alt allele
                allele_idx = variant.extras['alt_allele_pos']
                freq = 0
                for field in meta_fields:
                    # default '0' keeps .split() safe when the INFO field is absent
                    freq = max(freq, float(variant.extras.get(field, '0').split(',')[allele_idx]))
            else:
                freq = float(variant.extras.get(meta_key, '0').split(',')[variant.extras['alt_allele_pos']])
            self._add_population_frequency(variant.xpos, variant.ref, variant.alt, population['slug'], freq)
        vcf_file.close()

    #
    # Directory of per-chromosome VCFs that ESP publishes
    #
    elif population['file_type'] == 'esp_vcf_dir':
        for filename in os.listdir(population['dir_path']):
            file_path = os.path.abspath(os.path.join(population['dir_path'], filename))
            f = open(file_path)
            file_size = os.path.getsize(file_path)
            progress = get_progressbar(file_size, 'Loading ESP file: {}'.format(filename))
            for variant in get_variants_from_esp_file(f):
                progress.update(f.tell())
                self._add_population_frequency(
                    variant['xpos'],
                    variant['ref'],
                    variant['alt'],
                    population['slug'],
                    variant[population['counts_key']]
                )
            f.close()

    #
    # Text file of allele counts, as Monkol has been using for the joint calling data
    #
    elif population['file_type'] == 'counts_file':
        if population['file_path'].endswith('.gz'):
            counts_file = gzip.open(population['file_path'])
            size = os.path.getsize(population['file_path'])
            progress_file = counts_file.fileobj
        else:
            counts_file = open(population['file_path'])
            size = os.path.getsize(population['file_path'])
            progress_file = counts_file
        progress = get_progressbar(size, 'Loading population: {}'.format(population['slug']))
        for line in counts_file:
            progress.update(progress_file.tell())
            fields = line.strip('\n').split('\t')
            chrom = 'chr' + fields[0]
            pos = int(fields[1])
            xpos = genomeloc.get_single_location(chrom, pos)
            ref = fields[2]
            alt = fields[3]
            if int(fields[5]) == 0:
                continue  # no alleles called here; avoid dividing by zero
            freq = float(fields[4]) / float(fields[5])
            self._add_population_frequency(xpos, ref, alt, population['slug'], freq)
        counts_file.close()

    # This is now the canonical allele frequency file -
    # a tab-separated file with xpos / ref / alt / freq
    elif population['file_type'] == 'xbrowse_freq_file':
        if population['file_path'].endswith('.gz'):
            counts_file = gzip.open(population['file_path'])
            progress_file = counts_file.fileobj
        else:
            counts_file = open(population['file_path'])
            progress_file = counts_file
        size = os.path.getsize(population['file_path'])
        progress = get_progressbar(size, 'Loading population: {}'.format(population['slug']))
        for line in counts_file:
            progress.update(progress_file.tell())
            fields = line.strip('\n').split('\t')
            xpos = int(fields[0])
            ref = fields[1]
            alt = fields[2]
            freq = float(fields[3])
            self._add_population_frequency(xpos, ref, alt, population['slug'], freq)
        counts_file.close()

    elif population['file_type'] == 'tsv_file':
        if population['file_path'].endswith('.gz'):
            freq_file = gzip.open(population['file_path'])
            progress_file = freq_file.fileobj
        else:
            freq_file = open(population['file_path'])
            progress_file = freq_file
        size = os.path.getsize(population['file_path'])
        progress = get_progressbar(size, 'Loading population: {}'.format(population['slug']))
        header = next(freq_file)
        print("Header: " + header)
        for line in freq_file:
            progress.update(progress_file.tell())
            fields = line.strip('\n').split('\t')
            chrom = fields[0]
            pos = int(fields[1])
            ref = fields[2]
            alt = fields[3]
            freq = float(fields[4])
            xpos = genomeloc.get_single_location(chrom, pos)
            self._add_population_frequency(xpos, ref, alt, population['slug'], freq)
        freq_file.close()

    elif population['file_type'] == 'sites_vcf_with_counts':
        if population['file_path'].endswith('.gz') or population['file_path'].endswith('.bgz'):
            vcf_file = gzip.open(population['file_path'])
            size = os.path.getsize(population['file_path'])
            progress_file = vcf_file.fileobj
        else:
            vcf_file = open(population['file_path'])
            size = os.path.getsize(population['file_path'])
            progress_file = vcf_file
        ac_info_key = population['ac_info_key']
        an_info_key = population['an_info_key']
        progress = get_progressbar(size, 'Loading sites vcf: {}'.format(population['slug']))
        for variant in vcf_stuff.iterate_vcf(vcf_file, meta_fields=[ac_info_key, an_info_key]):
            progress.update(progress_file.tell())
            alt_allele_pos = variant.extras['alt_allele_pos']
            try:
                ac = int(variant.extras.get(ac_info_key).split(',')[alt_allele_pos].replace("NA", "0"))
            except Exception, e:
                print("Couldn't parse AC value %s from %s: %s" % (alt_allele_pos, ac_info_key, variant.extras), e)
                continue
            try:
                if "popmax" in ac_info_key.lower():
                    # each allele may have a different AN value from a different population
                    AN_index = alt_allele_pos
                else:
                    AN_index = 0
                an = int(variant.extras.get(an_info_key).split(',')[AN_index].replace("NA", "0"))
            except Exception, e:
                print("Couldn't parse AN value %s from %s: %s" % (alt_allele_pos, an_info_key, variant.extras), e)
                continue
            if an == 0:
                freq = 0.0
            else:
                freq = float(ac) / an
            self._add_population_frequency(variant.xpos, variant.ref, variant.alt, population['slug'], freq)
def load(self):
    self._db.drop_collection('variants')
    self._db.variants.ensure_index([('xpos', 1), ('ref', 1), ('alt', 1)])

    # load dbnsfp info
    polyphen_map = {
        'D': 'probably_damaging',
        'P': 'possibly_damaging',
        'B': 'benign',
        '.': None,
    }
    sift_map = {'D': 'damaging', 'T': 'tolerated', '.': None}
    fathmm_map = {'D': 'damaging', 'T': 'tolerated', '.': None}
    muttaster_map = {
        'A': 'disease_causing',
        'D': 'disease_causing',
        'N': 'polymorphism',
        'P': 'polymorphism',
        '.': None,
    }

    #interesting_fields = "rs_dbSNP141 Ancestral_allele SIFT_score SIFT_converted_rankscore SIFT_pred Polyphen2_HDIV_pred Polyphen2_HVAR_pred MutationTaster_pred MutationAssessor_pred FATHMM_pred MetaSVM_pred CADD_phred"
    # LRT_pred MetaLR_pred VEST3_rankscore PROVEAN_converted_rankscore PROVEAN_pred CADD_raw CADD_raw_rankscore GERP++_NR GERP++_RS GERP++_RS_rankscore ESP6500_AA_AF ESP6500_EA_AF ARIC5606_AA_AC ARIC5606_AA_AF ARIC5606_EA_AC ARIC5606_EA_AF ExAC_AC ExAC_AF ExAC_Adj_AC ExAC_Adj_AF ExAC_AFR_AC ExAC_AFR_AF ExAC_AMR_AC ExAC_AMR_AF ExAC_EAS_AC ExAC_EAS_AF ExAC_FIN_AC ExAC_FIN_AF ExAC_NFE_AC ExAC_NFE_AF ExAC_SAS_AC ExAC_SAS_AF clinvar_rs clinvar_clnsig clinvar_trait"
    #interesting_fields = interesting_fields.split()

    def collapse(scores):
        s = set(scores.split(";"))
        if len(s) > 1:
            raise ValueError("Couldn't collapse %s" % str(scores))
        return list(s)[0]

    pred_rank = ['D', 'A', 'T', 'N', 'P', 'B', '.']

    def select_worst(pred_value):
        i = len(pred_rank) - 1
        for pred in pred_value.split(";"):
            r = pred_rank.index(pred)
            if r < i:
                i = r
        return pred_rank[i]

    for chrom in CHROMOSOMES:
        if chrom == "chrM":
            continue  # no dbNSFP data for chrM
        print "Reading dbNSFP data for {}".format(chrom)
        single_chrom_file = open(self._settings.dbnsfp_dir + 'dbNSFP2.9_variant.' + chrom)
        header = single_chrom_file.readline()  # the header is consumed here, so the loop below sees only data rows
        header_fields = header.strip("\n").split()
        field_index = {name: header_fields.index(name) for name in header_fields}
        for i, line in enumerate(single_chrom_file):
            if not i % 100000:
                print i
            fields = line.strip('\n').split('\t')
            chrom, pos, ref, alt = fields[:4]
            chrom = 'chr' + chrom
            pos = int(pos)
            xpos = genomeloc.get_single_location(chrom, pos)
            if not xpos:
                raise ValueError("Unexpected chr, pos: %s, %s" % (chrom, pos))
            rsid = fields[field_index["rs_dbSNP141"]]
            annotations_dict = {
                'rsid': rsid if rsid != '.' else None,
                'polyphen': polyphen_map[select_worst(fields[field_index["Polyphen2_HVAR_pred"]])],
                'sift': sift_map[select_worst(fields[field_index["SIFT_pred"]])],
                'fathmm': fathmm_map[select_worst(fields[field_index["FATHMM_pred"]])],
                'muttaster': muttaster_map[select_worst(fields[field_index["MutationTaster_pred"]])],
                'metasvm': collapse(fields[field_index["MetaSVM_pred"]]),
                #'cadd_phred': collapse(fields[field_index["CADD_phred"]]),
            }
            #extras_to_add_now = ["clinvar_rs", "clinvar_clnsig", "clinvar_trait"]
            #for name in extras_to_add_now:
            #    annotations_dict[name] = fields[field_index[name]]
            self._db.variants.update(
                {'xpos': xpos, 'ref': ref, 'alt': alt},
                {'$set': annotations_dict},
                upsert=True,
            )
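# Standalone sketch of the ranking logic select_worst implements above:
# dbNSFP packs one single-letter prediction per transcript, ';'-delimited,
# and the most damaging code (leftmost in pred_rank) wins. The demo names
# are hypothetical and not part of the loader.
pred_rank_demo = ['D', 'A', 'T', 'N', 'P', 'B', '.']

def select_worst_demo(pred_value):
    # equivalent to the loop above: pick the prediction with the lowest rank index
    return min(pred_value.split(';'), key=pred_rank_demo.index)

assert select_worst_demo('T;D;N') == 'D'  # 'D' (damaging) outranks 'T' and 'N'
assert select_worst_demo('B;.') == 'B'    # a real prediction beats a missing '.'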
def load(self):
    self._db.drop_collection('variants')
    self._db.variants.ensure_index([('xpos', 1), ('ref', 1), ('alt', 1)])

    # load dbsnp info
    for i, variant in enumerate(vcf_stuff.iterate_vcf(open(self._settings.dbsnp_vcf_file))):
        if not i % 100000:
            print i
        self._db.variants.update(
            {'xpos': variant.xpos, 'ref': variant.ref, 'alt': variant.alt},
            {'$set': {'rsid': variant.vcf_id}},
            upsert=True
        )

    # load dbnsfp info
    polyphen_map = {
        'D': 'probably_damaging',
        'P': 'possibly_damaging',
        'B': 'benign',
    }
    sift_map = {
        'D': 'damaging',
        'T': 'tolerated',
    }
    fathmm_map = {
        'D': 'damaging',
        'T': 'tolerated',
    }
    muttaster_map = {
        'A': 'disease_causing',
        'D': 'disease_causing',
        'N': 'polymorphism',
        'P': 'polymorphism',
    }
    for chrom in CHROMOSOMES:
        print "Reading dbNSFP data for {}".format(chrom)
        single_chrom_file = open(self._settings.dbnsfp_dir + 'dbNSFP2.1_variant.' + chrom)
        for i, line in enumerate(single_chrom_file):
            if i == 0:
                continue
            if not i % 100000:
                print i
            fields = line.strip('\n').split('\t')
            chrom, pos, ref, alt = fields[:4]
            chrom = 'chr' + chrom
            pos = int(pos)
            xpos = genomeloc.get_single_location(chrom, pos)
            if not xpos:
                continue
            polyphen = polyphen_map.get(fields[25])
            sift = sift_map.get(fields[23])
            fathmm = fathmm_map.get(fields[39])
            muttaster = muttaster_map.get(fields[33])
            self._db.variants.update(
                {'xpos': xpos, 'ref': ref, 'alt': alt},
                {'$set': {
                    'polyphen': polyphen,
                    'sift': sift,
                    'fathmm': fathmm,
                    'muttaster': muttaster,
                }},
                upsert=True
            )
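# Lookup sketch against the collection populated above, keyed exactly as the
# ensure_index call: (xpos, ref, alt). The db handle and the values are
# illustrative:
#
#     doc = db.variants.find_one({'xpos': 23000012345, 'ref': 'A', 'alt': 'G'})
#     if doc:
#         print doc.get('rsid'), doc.get('polyphen')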
def load_population_to_annotator(self, population):
    """
    Takes a population and a data source; extracts and loads it into the annotator.
    The data source can be a VCF file, a VCF counts file, or a counts dir (in the case of ESP data).
    """
    if population['file_type'] == 'vcf':
        if population['file_path'].endswith('.gz'):
            vcf_file = gzip.open(population['file_path'])
        else:
            vcf_file = open(population['file_path'])
        for i, variant in enumerate(vcf_stuff.iterate_vcf(vcf_file, genotypes=True, genotype_meta=False)):
            if i % 10000 == 0:
                print i
            freq = get_aaf(variant)
            self._add_population_frequency(variant.xpos, variant.ref, variant.alt, population['slug'], freq)

    elif population['file_type'] == 'sites_vcf':
        if population['file_path'].endswith('.gz'):
            vcf_file = gzip.open(population['file_path'])
        else:
            vcf_file = open(population['file_path'])
        meta_key = population['vcf_info_key']
        for i, variant in enumerate(vcf_stuff.iterate_vcf(vcf_file, meta_fields=[meta_key])):
            if i % 10000 == 0:
                print i
            freq = float(variant.extras.get(meta_key, 0))
            self._add_population_frequency(variant.xpos, variant.ref, variant.alt, population['slug'], freq)

    #
    # Directory of per-chromosome VCFs that ESP publishes
    #
    elif population['file_type'] == 'esp_vcf_dir':
        for filename in os.listdir(population['dir_path']):
            print "Adding %s" % filename
            file_path = os.path.abspath(os.path.join(population['dir_path'], filename))
            f = open(file_path)
            for i, variant in enumerate(get_variants_from_esp_file(f)):
                if i % 10000 == 0:
                    print i
                self._add_population_frequency(
                    variant['xpos'],
                    variant['ref'],
                    variant['alt'],
                    population['slug'],
                    variant[population['counts_key']]
                )

    #
    # Text file of allele counts, as Monkol has been using for the joint calling data
    #
    elif population['file_type'] == 'counts_file':
        if population['file_path'].endswith('.gz'):
            counts_file = gzip.open(population['file_path'])
        else:
            counts_file = open(population['file_path'])
        for i, line in enumerate(counts_file):
            if i % 10000 == 0:
                print i
            # tab-separated fields: chrom (no 'chr' prefix), pos, ref, alt,
            # then the two counts whose ratio gives the frequency
            fields = line.strip('\n').split('\t')
            chrom = 'chr' + fields[0]
            pos = int(fields[1])
            xpos = genomeloc.get_single_location(chrom, pos)
            ref = fields[2]
            alt = fields[3]
            if int(fields[5]) == 0:
                continue  # no alleles called here; avoid dividing by zero
            freq = float(fields[4]) / float(fields[5])
            self._add_population_frequency(xpos, ref, alt, population['slug'], freq)

    # This is now the canonical allele frequency file -
    # a tab-separated file with xpos / ref / alt / freq
    elif population['file_type'] == 'xbrowse_freq_file':
        if population['file_path'].endswith('.gz'):
            counts_file = gzip.open(population['file_path'])
        else:
            counts_file = open(population['file_path'])
        for i, line in enumerate(counts_file):
            if i % 10000 == 0:
                print i
            fields = line.strip('\n').split('\t')
            xpos = int(fields[0])
            ref = fields[1]
            alt = fields[2]
            freq = float(fields[3])
            self._add_population_frequency(xpos, ref, alt, population['slug'], freq)
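# Hypothetical config for the 'esp_vcf_dir' branch above; 'dir_path' and
# 'counts_key' are the keys that branch reads, and the values shown are
# illustrative:
#
#     population = {
#         'slug': 'esp_ea',
#         'file_type': 'esp_vcf_dir',
#         'dir_path': '/data/esp_vcfs/',
#         'counts_key': 'esp_ea',
#     }
#     annotator.load_population_to_annotator(population)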
def load_dbnsfp(self):
    self._db.drop_collection('variants')
    self._db.variants.ensure_index([('xpos', 1), ('ref', 1), ('alt', 1)])

    # load dbnsfp info
    polyphen_map = {
        'D': 'probably_damaging',
        'P': 'possibly_damaging',
        'B': 'benign',
        '.': None,
    }
    sift_map = {'D': 'damaging', 'T': 'tolerated', '.': None}
    fathmm_map = {'D': 'damaging', 'T': 'tolerated', '.': None}
    muttaster_map = {
        'A': 'disease_causing',
        'D': 'disease_causing',
        'N': 'polymorphism',
        'P': 'polymorphism',
        '.': None,
    }

    def collapse(scores):
        s = set(scores.split(";"))
        if len(s) > 1:
            raise ValueError("Couldn't collapse %s" % str(scores))
        return list(s)[0]

    pred_rank = ['D', 'A', 'T', 'N', 'P', 'B', '.']

    def select_worst(pred_value):
        i = len(pred_rank) - 1
        for pred in pred_value.split(";"):
            r = pred_rank.index(pred)
            if r < i:
                i = r
        return pred_rank[i]

    for chrom in CHROMOSOMES:
        if chrom == "chrM":
            continue  # no dbNSFP data for chrM
        print "Reading dbNSFP data for {}".format(chrom)
        single_chrom_file = open(
            self._settings.dbnsfp_dir[self._genome_version] + 'dbNSFP2.9_variant.' + chrom)
        header = single_chrom_file.readline()  # the header is consumed here, so the loop below sees only data rows
        header_fields = header.strip("\n").split()
        field_index = {name: header_fields.index(name) for name in header_fields}
        for i, line in tqdm(enumerate(single_chrom_file)):
            fields = line.strip('\n').split('\t')
            chrom, pos, ref, alt = fields[:4]
            chrom = 'chr' + chrom
            pos = int(pos)
            xpos = genomeloc.get_single_location(chrom, pos)
            if not xpos:
                raise ValueError("Unexpected chr, pos: %s, %s" % (chrom, pos))
            rsid = fields[field_index["rs_dbSNP141"]]
            annotations_dict = {
                'rsid': rsid if rsid != '.' else None,
                'polyphen': polyphen_map[select_worst(fields[field_index["Polyphen2_HVAR_pred"]])],
                'sift': sift_map[select_worst(fields[field_index["SIFT_pred"]])],
                'fathmm': fathmm_map[select_worst(fields[field_index["FATHMM_pred"]])],
                'muttaster': muttaster_map[select_worst(fields[field_index["MutationTaster_pred"]])],
                'metasvm': collapse(fields[field_index["MetaSVM_pred"]]),
                #'cadd_phred': collapse(fields[field_index["CADD_phred"]]),
            }
            self._db.variants.update(
                {'xpos': xpos, 'ref': ref, 'alt': alt},
                {'$set': annotations_dict},
                upsert=True,
            )
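# Tiny runnable illustration of the field_index construction above, which
# lets rows be indexed by dbNSFP column name (the header names here are
# illustrative, not the real dbNSFP header):
header_fields_demo = ['chr', 'pos', 'ref', 'alt', 'rs_dbSNP141']
field_index_demo = {name: header_fields_demo.index(name) for name in header_fields_demo}
assert field_index_demo['rs_dbSNP141'] == 4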
def load_dbnsfp(self):
    # alias the genome-version-specific database handle used throughout
    db = settings.CUSTOM_ANNOTATOR_SETTINGS.db[self._genome_version]
    db.drop_collection('variants')
    db.variants.ensure_index([('xpos', 1), ('ref', 1), ('alt', 1)])

    # load dbnsfp info
    polyphen_map = {
        'D': 'probably_damaging',
        'P': 'possibly_damaging',
        'B': 'benign',
        '.': None,
    }
    sift_map = {
        'D': 'damaging',
        'T': 'tolerated',
        '.': None,
    }
    fathmm_map = {
        'D': 'damaging',
        'T': 'tolerated',
        '.': None,
    }
    muttaster_map = {
        'A': 'disease_causing',
        'D': 'disease_causing',
        'N': 'polymorphism',
        'P': 'polymorphism',
        '.': None,
    }

    def collapse(scores):
        s = set(scores.split(";"))
        if len(s) > 1:
            raise ValueError("Couldn't collapse %s" % str(scores))
        return list(s)[0]

    pred_rank = ['D', 'A', 'T', 'N', 'P', 'B', '.']

    def select_worst(pred_value):
        i = len(pred_rank) - 1
        for pred in pred_value.split(";"):
            r = pred_rank.index(pred)
            if r < i:
                i = r
        return pred_rank[i]

    for chrom in CHROMOSOMES:
        if chrom == "chrM":
            continue  # no dbNSFP data for chrM
        print "Reading dbNSFP data for {}".format(chrom)
        single_chrom_file = open(
            settings.CUSTOM_ANNOTATOR_SETTINGS.dbnsfp_dir[self._genome_version] + 'dbNSFP2.9_variant.' + chrom)
        header = single_chrom_file.readline()  # the header is consumed here, so the loop below sees only data rows
        header_fields = header.strip("\n").split()
        field_index = {name: header_fields.index(name) for name in header_fields}
        for i, line in tqdm(enumerate(single_chrom_file)):
            fields = line.strip('\n').split('\t')
            chrom, pos, ref, alt = fields[:4]
            chrom = 'chr' + chrom
            pos = int(pos)
            xpos = genomeloc.get_single_location(chrom, pos)
            if not xpos:
                raise ValueError("Unexpected chr, pos: %s, %s" % (chrom, pos))
            rsid = fields[field_index["rs_dbSNP141"]]
            annotations_dict = {
                'rsid': rsid if rsid != '.' else None,
                'polyphen': polyphen_map[select_worst(fields[field_index["Polyphen2_HVAR_pred"]])],
                'sift': sift_map[select_worst(fields[field_index["SIFT_pred"]])],
                'fathmm': fathmm_map[select_worst(fields[field_index["FATHMM_pred"]])],
                'muttaster': muttaster_map[select_worst(fields[field_index["MutationTaster_pred"]])],
                'metasvm': collapse(fields[field_index["MetaSVM_pred"]]),
                #'cadd_phred': collapse(fields[field_index["CADD_phred"]]),
            }
            db.variants.update(
                {'xpos': xpos, 'ref': ref, 'alt': alt},
                {'$set': annotations_dict},
                upsert=True,
            )
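# Sketch of the genome-version-keyed settings this variant reads; the
# attribute names come from the code above, but the wiring shown here is an
# assumption, not the project's actual settings module:
#
#     from pymongo import MongoClient
#     CUSTOM_ANNOTATOR_SETTINGS.db = {
#         '37': MongoClient()['custom_annotator_37'],
#     }
#     CUSTOM_ANNOTATOR_SETTINGS.dbnsfp_dir = {
#         '37': '/data/dbnsfp/GRCh37/',
#     }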