def handle(self, *args, **options):
    """Spot-check that project variants are present in the annotator cache.

    Positional ``args`` are project ids; when none are given, every project
    returned by ``Project.objects.all()`` is checked, in reverse order.

    For each project the variants of its VCF files are looked up through
    ``get_mall(project_id).annotator.get_annotation``. A lookup that raises
    ``ValueError`` counts as "not found". Once
    ``options["number_of_variants_to_check"]`` variants (default 20000) have
    been examined, scanning of that project stops; if more than 10 lookups
    failed, an error line plus up to 30 example ExAC URLs is printed.

    Projects with fewer variants than the threshold produce no report.
    """
    number_of_variants_to_check = int(options.get("number_of_variants_to_check") or 20000)
    if not args:
        args = [p.project_id for p in Project.objects.all()]
        args.reverse()

    for project_id in args:
        try:
            project = Project.objects.get(project_id=project_id)
        except Exception:
            # Not a bare ``except`` so Ctrl-C / SystemExit still propagate.
            print("ERROR: Project not found. Skipping..")
            continue

        all_counter = 0
        not_found_counter = 0
        not_found_variants = []

        for vcf_file in project.get_all_vcf_files():
            path = vcf_file.file_path
            # Fall back to the compressed file when the plain .vcf is absent.
            if not os.path.isfile(path) and path.endswith(".vcf"):
                path = path + ".gz"

            f = gzip.open(path) if path.endswith(".gz") else open(path)
            try:
                for variant in vcf_stuff.iterate_vcf(f):
                    all_counter += 1
                    try:
                        get_mall(project_id).annotator.get_annotation(variant.xpos, variant.ref, variant.alt)
                    except ValueError:
                        not_found_counter += 1
                        # Keep only a handful of examples for the report.
                        if len(not_found_variants) < 30:
                            chrom, pos = genomeloc.get_chr_pos(variant.xpos)
                            chrom = chrom.replace("chr", "")
                            not_found_variants.append(
                                "%s-%s-%s-%s" % (chrom, pos, variant.ref, variant.alt))

                    if all_counter >= number_of_variants_to_check:
                        break
            finally:
                f.close()

            # Stop at the threshold instead of re-reporting once per remaining file.
            if all_counter >= number_of_variants_to_check:
                break

        if all_counter >= number_of_variants_to_check and not_found_counter > 10:
            # The message carries a '%' sign, so report a real percentage.
            percent_missing = 100.0 * not_found_counter / all_counter
            print("---- ERROR: (%0.2f%%) %s / %s variants not found. Project %s should be reloaded. Examples: "
                  % (percent_missing, not_found_counter, all_counter, project_id))
            for v in not_found_variants:
                print("http://exac.broadinstitute.org/variant/" + v)
def get_vep_annotations_from_vcf(vcf_file_obj):
    """Yield ``(variant, vep_annotation)`` pairs from a VEP-annotated VCF.

    The CSQ field names are parsed once, from ``header_info['CSQ'].desc``,
    when the first variant arrives (presumably ``iterate_vcf`` fills
    ``header_info`` while reading the header - confirm in vcf_stuff). Each
    variant's ``extras['CSQ']`` value is then decoded with those names.
    """
    header = {}
    field_names = None
    variants = vcf_stuff.iterate_vcf(vcf_file_obj, meta_fields=['CSQ'], header_info=header)
    for variant in variants:
        if field_names is None:
            csq_description = header['CSQ'].desc
            field_names = get_csq_fields_from_vcf_desc(csq_description)
        csq_info = variant.extras['CSQ']
        yield variant, get_vep_annotation_from_csq_info(csq_info, field_names)
def add_variants_to_project_from_vcf(self, vcf_file, project_id, indiv_id_list=None, start_from_chrom=None, end_with_chrom=None):
    """Load the variants of ``vcf_file`` into the project's variant collection.

    Args:
        vcf_file: file-like object handed to ``vcf_stuff.iterate_vcf``.
        project_id: selects the collection and the custom populations.
        indiv_id_list: forwarded to ``iterate_vcf`` to restrict genotypes.
        start_from_chrom: optional first chromosome to load ("chr" prefix
            and case are ignored); must be one of 1-22, X, Y.
        end_with_chrom: optional last chromosome to load, inclusive.

    Raises:
        ValueError: if a chromosome bound is not one of 1-22, X, Y.

    A variant already stored (same xpos/ref/alt) only has its non-reference
    genotypes merged in. A new variant is annotated first, and is skipped
    with a warning on stderr if the annotator raises ``ValueError``.
    """
    chrom_list = [str(c) for c in range(1, 23)] + ['X', 'Y']

    first_index = 0
    if start_from_chrom:
        first_index = chrom_list.index(start_from_chrom.replace("chr", "").upper())
    last_index = len(chrom_list) - 1
    if end_with_chrom:
        last_index = chrom_list.index(end_with_chrom.replace("chr", "").upper())
    # +1 so that end_with_chrom itself is loaded, as the name promises.
    chromosomes_to_include = set(chrom_list[first_index:last_index + 1])
    restrict_chroms = bool(start_from_chrom or end_with_chrom)

    project_collection = self._get_project_collection(project_id)
    # A project without custom populations must not break the concatenation.
    custom_populations = self._custom_populations_map.get(project_id) or []
    reference_populations = self._annotator.reference_population_slugs + custom_populations

    for counter, variant in enumerate(vcf_stuff.iterate_vcf(vcf_file, genotypes=True, indiv_id_list=indiv_id_list)):
        chrom = variant.chr.replace("chr", "")
        if restrict_chroms and chrom not in chromosomes_to_include:
            continue

        if variant.alt == "*":
            # GATK 3.4 '*' alt allele: nothing to load.
            continue

        if counter % 2000 == 0:
            # 100.0 keeps the percentage fractional under Python 2 division.
            percent_done = 100.0 * variant.pos / CHROMOSOME_SIZES[chrom]
            print(date.strftime(datetime.now(), "%m/%d/%Y %H:%M:%S")
                  + "-- inserting variant %d %s:%s-%s-%s (%0.1f%% done with %s) " % (
                      counter, variant.chr, variant.pos, variant.ref, variant.alt,
                      percent_done, variant.chr))

        variant_dict = project_collection.find_one({'xpos': variant.xpos, 'ref': variant.ref, 'alt': variant.alt})
        if not variant_dict:
            variant_dict = variant.toJSON()
            try:
                annotation = self._annotator.get_annotation(variant.xpos, variant.ref, variant.alt, populations=reference_populations)
            except ValueError as e:
                sys.stderr.write("WARNING: " + str(e) + "\n")
                continue
            _add_index_fields_to_variant(variant_dict, annotation)
        else:
            # Variant already stored: only merge in the non-reference genotypes.
            for indiv_id, genotype in variant.get_genotypes():
                if genotype.num_alt != 0:
                    variant_dict['genotypes'][indiv_id] = genotype._asdict()

        project_collection.save(variant_dict)
def get_variants_from_esp_file(file_like_object):
    """Stream allele frequencies out of an ESP-style VCF.

    ``file_like_object`` is an ESP-style VCF file. This could be called 23
    times, one for each file, or just once after running vcf-merge.

    Yields one dict per variant::

        {
            'xpos': long int,
            'ref': str,
            'alt': str,
            'esp_ea': float - aaf in european americans,
            'esp_aa': float - aaf in african americans,
        }

    Note that ref and alt are *not* currently reduced by xbrowse.
    A population whose allele counts sum to zero gets a frequency of 0.0.
    """
    def allele_frequency(raw_counts, allele_pos):
        # Note that allele counts in the ESP VCF are ordered alt1,alt2,ref.
        counts = [int(c) for c in raw_counts.split(',')]
        total = sum(counts)
        if total == 0:
            # No observations at all: avoid ZeroDivisionError.
            return 0.0
        return float(counts[allele_pos]) / total

    ac_meta_fields = ['EA_AC', 'AA_AC']
    for variant in vcf_stuff.iterate_vcf(file_like_object, meta_fields=ac_meta_fields, vcf_row_info=True):
        allele_pos = variant.extras['vcf_row_info']['alt_allele_pos']
        yield {
            'xpos': variant.xpos,
            'ref': variant.ref,
            'alt': variant.alt,
            'esp_ea': allele_frequency(variant.extras['EA_AC'], allele_pos),
            'esp_aa': allele_frequency(variant.extras['AA_AC'], allele_pos),
        }
def add_variants_to_project_from_vcf(self, vcf_file, project_id, indiv_id_list=None):
    """Load every variant of ``vcf_file`` into the project's collection.

    A variant not yet stored (matched on xpos/ref/alt) is annotated in place
    with the project's reference populations, and its ``vep_consequence``
    and ``freqs`` annotations are copied onto the stored document. A variant
    that is already stored only has its non-reference genotypes merged in.
    The running row index is printed every 1000 variants.
    """
    collection = self._get_project_collection(project_id)
    populations = self._get_project_reference_populations(project_id)
    variants = vcf_stuff.iterate_vcf(vcf_file, genotypes=True, indiv_id_list=indiv_id_list)

    for index, variant in enumerate(variants):
        if index % 1000 == 0:
            print(index)

        key = {'xpos': variant.xpos, 'ref': variant.ref, 'alt': variant.alt}
        record = collection.find_one(key)
        if record:
            # Known variant: add the genotypes that carry an alt allele.
            for indiv_id, genotype in variant.get_genotypes():
                if genotype.num_alt == 0:
                    continue
                record['genotypes'][indiv_id] = genotype._asdict()
        else:
            # New variant: annotate, serialise, then copy the indexed fields.
            self._annotator.annotate_variant(variant, populations)
            record = variant.toJSON()
            record['vep_consequence'] = variant.annotation['vep_consequence']
            record['freqs'] = variant.annotation['freqs']

        collection.save(record)
def _add_vcf_file_for_family_set(self, family_info_list, vcf_file_path, reference_populations=None, vcf_id_map=None):
    """Load one VCF into the variant collections of a set of families.

    Args:
        family_info_list: dicts with at least ``family_id``, ``coll_name``
            and ``individuals`` keys.
        vcf_file_path: path opened through ``compressed_file``.
        reference_populations: forwarded to the annotator.
        vcf_id_map: forwarded to ``vcf_stuff.iterate_vcf``.

    Indexes are dropped on every family collection before loading. Each
    variant is annotated once, then copied per family (restricted to that
    family's individuals) and inserted only when
    ``xbrowse_utils.is_variant_relevant_for_individuals`` accepts it.
    An annotator ``ValueError`` is not caught here and aborts the load.
    """
    collections = {f['family_id']: self._db[f['coll_name']] for f in family_info_list}
    for collection in collections.values():
        collection.drop_indexes()

    indiv_id_list = [i for f in family_info_list for i in f['individuals']]

    vcf_file = compressed_file(vcf_file_path)
    try:
        size = os.path.getsize(vcf_file_path)
        progress = get_progressbar(size, 'Loading VCF: {}'.format(vcf_file_path))

        for variant in vcf_stuff.iterate_vcf(vcf_file, genotypes=True, indiv_id_list=indiv_id_list, vcf_id_map=vcf_id_map):
            progress.update(vcf_file.tell_progress())
            annotation = self._annotator.get_annotation(variant.xpos, variant.ref, variant.alt, populations=reference_populations)

            for family in family_info_list:
                # TODO: can we move this inside the if relevant clause below?
                family_variant = variant.make_copy(restrict_to_genotypes=family['individuals'])
                family_variant_dict = family_variant.toJSON()
                _add_index_fields_to_variant(family_variant_dict, annotation)
                if xbrowse_utils.is_variant_relevant_for_individuals(family_variant, family['individuals']):
                    collections[family['family_id']].insert(family_variant_dict)
    finally:
        # Release the file handle even when annotation or insertion fails.
        vcf_file.close()
def add_variants_to_project_from_vcf(self, vcf_file, project_id, indiv_id_list=None):
    """Load the variants of ``vcf_file`` into the project's variant collection.

    Args:
        vcf_file: file-like object handed to ``vcf_stuff.iterate_vcf``.
        project_id: selects the collection and the custom populations.
        indiv_id_list: forwarded to ``iterate_vcf`` to restrict genotypes.

    A variant not yet stored (matched on xpos/ref/alt) is annotated against
    the annotator's reference populations plus the project's custom ones.
    If the annotator raises ``ValueError`` a warning goes to stderr and the
    variant is skipped. A variant already stored only has its non-reference
    genotypes merged in.
    """
    project_collection = self._get_project_collection(project_id)
    # ``.get`` yields None for projects without custom populations;
    # list + None would raise TypeError, so fall back to an empty list.
    custom_populations = self._custom_populations_map.get(project_id) or []
    reference_populations = self._annotator.reference_population_slugs + custom_populations

    for variant in vcf_stuff.iterate_vcf(vcf_file, genotypes=True, indiv_id_list=indiv_id_list):
        variant_dict = project_collection.find_one({'xpos': variant.xpos, 'ref': variant.ref, 'alt': variant.alt})
        if not variant_dict:
            variant_dict = variant.toJSON()
            try:
                annotation = self._annotator.get_annotation(variant.xpos, variant.ref, variant.alt, populations=reference_populations)
            except ValueError as e:
                sys.stderr.write("WARNING: " + str(e) + "\n")
                continue
            _add_index_fields_to_variant(variant_dict, annotation)
        else:
            for indiv_id, genotype in variant.get_genotypes():
                if genotype.num_alt != 0:
                    variant_dict['genotypes'][indiv_id] = genotype._asdict()

        project_collection.save(variant_dict)
def get_variants_from_esp_file(file_like_object):
    """Return a stream of variants read from an ESP-style VCF file.

    This could be called 23 times, one for each file, or just once after
    running vcf-merge. Each yielded dict has the structure::

        {
            'xpos': long int,
            'ref': str,
            'alt': str,
            'esp_ea': float - aaf in european americans,
            'esp_aa': float - aaf in african americans,
        }

    Note that ref and alt are *not* currently reduced by xbrowse.
    When the allele counts of a population add up to zero its frequency is
    reported as 0.0 rather than raising ``ZeroDivisionError``.
    """
    # (output key, INFO field holding the comma-separated allele counts)
    population_fields = (('esp_ea', 'EA_AC'), ('esp_aa', 'AA_AC'))
    meta_fields = [info_key for _, info_key in population_fields]

    for variant in vcf_stuff.iterate_vcf(file_like_object, meta_fields=meta_fields, vcf_row_info=True):
        record = {
            'xpos': variant.xpos,
            'ref': variant.ref,
            'alt': variant.alt,
        }
        allele_pos = variant.extras['vcf_row_info']['alt_allele_pos']
        for out_key, info_key in population_fields:
            # Note that allele counts in the ESP VCF are alt1,alt2,ref.
            counts = [int(c) for c in variant.extras[info_key].split(',')]
            total = sum(counts)
            record[out_key] = float(counts[allele_pos]) / total if total else 0.0
        yield record
def handle(self, *args, **options):
    """Report projects whose variants are missing from the annotator cache.

    ``args`` are project ids (all projects, reversed, when empty). For each
    project, variants from its VCF files are looked up with
    ``get_mall(project).annotator.get_annotation`` until
    ``options["number_of_variants_to_check"]`` (default 20000) have been
    seen. A ``ValueError`` from the lookup counts as a miss. When the
    threshold was reached and more than 10 variants were missed, a single
    error line and up to 30 example ExAC URLs are printed for the project.
    """
    variants_to_check = int(options.get("number_of_variants_to_check") or 20000)
    max_examples = 30

    def open_vcf(path):
        # Use the gzipped sibling when the plain .vcf does not exist.
        if not os.path.isfile(path) and path.endswith(".vcf"):
            path = path + ".gz"
        return gzip.open(path) if path.endswith(".gz") else open(path)

    project_ids = args
    if not project_ids:
        project_ids = [p.project_id for p in Project.objects.all()]
        project_ids.reverse()

    for project_id in project_ids:
        try:
            project = Project.objects.get(project_id=project_id)
        except Exception:
            # Deliberately broad, but unlike a bare except it lets
            # KeyboardInterrupt / SystemExit through.
            print("ERROR: Project not found. Skipping..")
            continue

        checked = 0
        missing = 0
        examples = []

        for vcf_file in project.get_all_vcf_files():
            # ``with`` closes the handle even if the scan raises.
            with open_vcf(vcf_file.file_path) as f:
                for variant in vcf_stuff.iterate_vcf(f):
                    checked += 1
                    try:
                        get_mall(project).annotator.get_annotation(
                            variant.xpos, variant.ref, variant.alt)
                    except ValueError:
                        missing += 1
                        if len(examples) < max_examples:
                            chrom, pos = genomeloc.get_chr_pos(variant.xpos)
                            examples.append("%s-%s-%s-%s" % (
                                chrom.replace("chr", ""), pos, variant.ref, variant.alt))
                    if checked >= variants_to_check:
                        break
            # Leave the file loop too, so the report is emitted only once.
            if checked >= variants_to_check:
                break

        if checked >= variants_to_check and missing > 10:
            # Scale to a percentage to match the '%' in the message.
            percent_missing = 100.0 * missing / checked
            print(
                "---- ERROR: (%0.2f%%) %s / %s variants not found. Project %s should be reloaded. Examples: "
                % (percent_missing, missing, checked, project_id))
            for example in examples:
                print("http://exac.broadinstitute.org/variant/" + example)
def _add_vcf_file_for_family_set(self, family_info_list, vcf_file_path, reference_populations=None, vcf_id_map=None):
    """Load one VCF into the per-family variant collections, resuming if possible.

    Args:
        family_info_list: dicts with at least ``family_id``, ``coll_name``
            and ``individuals`` keys.
        vcf_file_path: path of the VCF. When it contains "_chr" or ".chr"
            it is treated as a single-chromosome, tabix-indexed file.
        reference_populations: forwarded to the annotator.
        vcf_id_map: forwarded to ``vcf_stuff.iterate_vcf``.

    For per-chromosome files, loading resumes from the smallest
    "last loaded position" across all families on that chromosome.
    Variants the annotator rejects with ``ValueError`` are skipped with a
    warning. Relevant family variants not yet in the collection are
    buffered, then inserted in batches, interleaved randomly across
    families.
    """
    flush_threshold = 10000  # same batch size the sibling loader uses

    collections = {f['family_id']: self._db[f['coll_name']] for f in family_info_list}
    indiv_id_list = [i for f in family_info_list for i in f['individuals']]
    number_of_families = len(family_info_list)
    sys.stderr.write("Loading variants for %d families %s from %s\n" % (
        number_of_families, family_info_list, vcf_file_path))

    for family in family_info_list:
        print("Indexing family: " + str(family))
        collections[family['family_id']].ensure_index([('xpos', 1), ('ref', 1), ('alt', 1)])

    # Check whether some of the variants for this chromosome have been loaded
    # already; if so, start from the last loaded variant, not the beginning.
    if "_chr" in vcf_file_path or ".chr" in vcf_file_path:
        # Peek at the first record to learn which chromosome the file holds.
        probe_file = compressed_file(vcf_file_path)
        try:
            variant = next(vcf_stuff.iterate_vcf(probe_file, genotypes=False, indiv_id_list=indiv_id_list, vcf_id_map=vcf_id_map))
        finally:
            probe_file.close()
        print(vcf_file_path + " - chromsome: " + str(variant.chr))

        # Only this file's chromosome matters, so query just that xpos window.
        chr_idx = int(variant.xpos / 1e9)
        chrom_start = chr_idx * 1e9
        last_loaded_positions = []
        for family in family_info_list:
            newest = list(collections[family['family_id']].find(
                {'$and': [{'xpos': {'$gte': chrom_start}}, {'xpos': {'$lt': chrom_start + 1e9}}]}
            ).sort([('xpos', -1)]).limit(1))
            last_loaded_positions.append(newest[0]['xpos'] - chrom_start if newest else 0)

        # Smallest last-loaded position across all families (0 if none).
        start_from_pos = int(min(last_loaded_positions)) if last_loaded_positions else 0
        chrom_name = variant.chr.replace("chr", "")
        print("Start from: %s - %s (%0.1f%% done)" % (
            chr_idx, start_from_pos, 100. * start_from_pos / CHROMOSOME_SIZES[chrom_name]))

        source_handle = pysam.TabixFile(vcf_file_path)
        vcf_iter = itertools.chain(source_handle.header, source_handle.fetch(chrom_name, start_from_pos, int(2.5e8)))
    else:
        # TODO handle case where it's one vcf file, not split by chromosome
        vcf_iter = source_handle = compressed_file(vcf_file_path)

    size = os.path.getsize(vcf_file_path)
    # NOTE(review): the progress bar is created but never updated in this variant.
    progress = get_progressbar(size, 'Loading VCF: {}'.format(vcf_file_path))

    def insert_all_variants_in_buffer(buff, collections_dict):
        """Drain ``buff`` into the collections, picking families at random."""
        for family_id in buff:
            if len(buff[family_id]) == 0:  # defensive programming
                raise ValueError("%s has zero variants to insert. Should not be in buff." % family_id)

        while len(buff) > 0:
            # Choose a random family among those that still have variants to
            # insert; list() keeps random.choice working on Python 3 too.
            family_id = random.choice(list(buff))
            family_variant_dict_to_insert = buff[family_id].pop()
            collections_dict[family_id].insert(family_variant_dict_to_insert)
            if len(buff[family_id]) == 0:
                del buff[family_id]  # no more variants for this family

    vcf_rows_counter = 0
    variants_buffered_counter = 0
    family_id_to_variant_list = defaultdict(list)  # variants to be inserted all at once
    try:
        for variant in vcf_stuff.iterate_vcf(vcf_iter, genotypes=True, indiv_id_list=indiv_id_list, vcf_id_map=vcf_id_map):
            if variant.alt == "*":
                # GATK 3.4 '*' alt allele: nothing to load.
                continue
            try:
                annotation = self._annotator.get_annotation(variant.xpos, variant.ref, variant.alt, populations=reference_populations)
            except ValueError as e:
                sys.stderr.write("WARNING: " + str(e) + "\n")
                continue

            vcf_rows_counter += 1
            for family in family_info_list:
                # TODO: can we move this inside the if relevant clause below?
                try:
                    family_variant = variant.make_copy(restrict_to_genotypes=family['individuals'])
                    family_variant_dict = family_variant.toJSON()
                    _add_index_fields_to_variant(family_variant_dict, annotation)
                    if xbrowse_utils.is_variant_relevant_for_individuals(family_variant, family['individuals']):
                        collection = collections[family['family_id']]
                        if not collection.find_one({'xpos': family_variant.xpos, 'ref': family_variant.ref, 'alt': family_variant.alt}):
                            family_id_to_variant_list[family['family_id']].append(family_variant_dict)
                            variants_buffered_counter += 1
                except Exception as e:
                    sys.stderr.write("ERROR: on variant %s, family: %s - %s\n" % (variant.toJSON(), family, e))

            # Without a periodic flush nothing would ever be written and the
            # buffer would grow for the whole file.
            if variants_buffered_counter > flush_threshold:
                print(date.strftime(datetime.now(), "%m/%d/%Y %H:%M:%S")
                      + "-- inserting %d family-variants from %d vcf rows into %s families" % (
                          variants_buffered_counter, vcf_rows_counter, len(family_id_to_variant_list)))
                insert_all_variants_in_buffer(family_id_to_variant_list, collections)
                vcf_rows_counter = 0
                variants_buffered_counter = 0

        # Write whatever is still buffered once the file is exhausted.
        if family_id_to_variant_list:
            insert_all_variants_in_buffer(family_id_to_variant_list, collections)
    finally:
        source_handle.close()
def load(self):
    """Rebuild the ``variants`` collection from dbSNP and dbNSFP files.

    Drops the existing collection and recreates the (xpos, ref, alt) index.
    Every record of ``self._settings.dbsnp_vcf_file`` is then upserted with
    its ``rsid``. Finally, for each entry of ``CHROMOSOMES`` the file
    ``dbnsfp_dir + 'dbNSFP2.1_variant.' + chrom`` is read (header line
    skipped) and the polyphen / sift / fathmm / muttaster predictions are
    upserted. Codes missing from the lookup tables are stored as ``None``.
    """
    self._db.drop_collection('variants')
    self._db.variants.ensure_index([('xpos', 1), ('ref', 1), ('alt', 1)])

    # load dbsnp info
    with open(self._settings.dbsnp_vcf_file) as dbsnp_file:
        for i, variant in enumerate(vcf_stuff.iterate_vcf(dbsnp_file)):
            if not i % 100000:
                print(i)
            self._db.variants.update(
                {'xpos': variant.xpos, 'ref': variant.ref, 'alt': variant.alt},
                {'$set': {'rsid': variant.vcf_id}},
                upsert=True
            )

    # load dbnsfp info: single-letter codes -> stored labels
    polyphen_map = {
        'D': 'probably_damaging',
        'P': 'possibly_damaging',
        'B': 'benign',
    }
    sift_map = {
        'D': 'damaging',
        'T': 'tolerated',
    }
    fathmm_map = {
        'D': 'damaging',
        'T': 'tolerated',
    }
    muttaster_map = {
        'A': 'disease_causing',
        'D': 'disease_causing',
        'N': 'polymorphism',
        'P': 'polymorphism',
    }

    for chrom in CHROMOSOMES:
        print("Reading dbNSFP data for {}".format(chrom))
        with open(self._settings.dbnsfp_dir + 'dbNSFP2.1_variant.' + chrom) as single_chrom_file:
            for i, line in enumerate(single_chrom_file):
                if i == 0:
                    continue  # header row
                if not i % 100000:
                    print(i)
                fields = line.strip('\n').split('\t')
                # Separate name so the outer loop variable is not rebound.
                row_chrom, pos, ref, alt = fields[:4]
                xpos = genomeloc.get_single_location('chr' + row_chrom, int(pos))
                if not xpos:
                    continue
                self._db.variants.update(
                    {'xpos': xpos, 'ref': ref, 'alt': alt},
                    {'$set': {
                        'polyphen': polyphen_map.get(fields[25]),
                        'sift': sift_map.get(fields[23]),
                        'fathmm': fathmm_map.get(fields[39]),
                        'muttaster': muttaster_map.get(fields[33]),
                    }},
                    upsert=True
                )
def load_population_to_annotator(self, population):
    """Extract allele frequencies from a population's data source and store them.

    ``population`` is a dict; ``population['file_type']`` selects the parser:

    * ``'vcf'`` - genotype VCF at ``file_path``; frequency from ``get_aaf``.
    * ``'sites_vcf'`` - sites VCF at ``file_path``; frequency is the INFO
      field named by ``vcf_info_key`` (0 when absent).
    * ``'esp_vcf_dir'`` - directory ``dir_path`` of per-chromosome VCFs that
      ESP publishes; frequency is the ``counts_key`` entry of each record.
    * ``'counts_file'`` - tab-separated chrom / pos / ref / alt / count /
      total; rows with a zero total are skipped.
    * ``'xbrowse_freq_file'`` - the canonical allele frequency file:
      tab-separated xpos / ref / alt / freq.

    Paths ending in ``.gz`` are read through gzip. Every frequency is handed
    to ``self._add_population_frequency`` under ``population['slug']``.
    Any other ``file_type`` is silently ignored. A row counter is printed
    every 10000 records.
    """
    def open_data_file(path):
        # Transparently read gzip-compressed inputs.
        return gzip.open(path) if path.endswith('.gz') else open(path)

    file_type = population['file_type']

    if file_type == 'vcf':
        with open_data_file(population['file_path']) as vcf_file:
            for i, variant in enumerate(vcf_stuff.iterate_vcf(vcf_file, genotypes=True, genotype_meta=False)):
                if i % 10000 == 0:
                    print(i)
                freq = get_aaf(variant)
                self._add_population_frequency(variant.xpos, variant.ref, variant.alt, population['slug'], freq)

    elif file_type == 'sites_vcf':
        with open_data_file(population['file_path']) as vcf_file:
            meta_key = population['vcf_info_key']
            for i, variant in enumerate(vcf_stuff.iterate_vcf(vcf_file, meta_fields=[meta_key])):
                if i % 10000 == 0:
                    print(i)
                freq = float(variant.extras.get(meta_key, 0))
                self._add_population_frequency(variant.xpos, variant.ref, variant.alt, population['slug'], freq)

    #
    # Directory of per-chromosome VCFs that ESP publishes
    #
    elif file_type == 'esp_vcf_dir':
        for filename in os.listdir(population['dir_path']):
            print("Adding %s" % filename)
            file_path = os.path.abspath(os.path.join(population['dir_path'], filename))
            with open(file_path) as f:
                for i, variant in enumerate(get_variants_from_esp_file(f)):
                    if i % 10000 == 0:
                        print(i)
                    self._add_population_frequency(
                        variant['xpos'],
                        variant['ref'],
                        variant['alt'],
                        population['slug'],
                        variant[population['counts_key']]
                    )

    #
    # text file of allele counts, as Monkol has been using for the joint calling data
    #
    elif file_type == 'counts_file':
        with open_data_file(population['file_path']) as counts_file:
            for i, line in enumerate(counts_file):
                if i % 10000 == 0:
                    print(i)
                fields = line.strip('\n').split('\t')
                chrom = 'chr' + fields[0]
                pos = int(fields[1])
                xpos = genomeloc.get_single_location(chrom, pos)
                ref = fields[2]
                alt = fields[3]
                if int(fields[5]) == 0:
                    continue  # no observations: frequency undefined
                freq = float(fields[4]) / float(fields[5])
                self._add_population_frequency(xpos, ref, alt, population['slug'], freq)

    # this is now the canonical allele frequency file -
    # tab separated file with xpos / ref / alt / freq
    elif file_type == 'xbrowse_freq_file':
        with open_data_file(population['file_path']) as counts_file:
            for i, line in enumerate(counts_file):
                if i % 10000 == 0:
                    print(i)
                fields = line.strip('\n').split('\t')
                xpos = int(fields[0])
                ref = fields[1]
                alt = fields[2]
                freq = float(fields[3])
                self._add_population_frequency(xpos, ref, alt, population['slug'], freq)
def _add_vcf_file_for_family_set(self, family_info_list, vcf_file_path, reference_populations=None, vcf_id_map=None):
    """Load one VCF into the per-family variant collections in batches.

    Args:
        family_info_list: dicts with at least ``family_id``, ``coll_name``
            and ``individuals`` keys.
        vcf_file_path: path opened through ``compressed_file``.
        reference_populations: forwarded to the annotator.
        vcf_id_map: forwarded to ``vcf_stuff.iterate_vcf``.

    Each VCF row is annotated once; rows the annotator rejects with
    ``ValueError`` are skipped with a warning on stderr. For every family
    the variant is restricted to that family's individuals and, when
    relevant and not already in the family collection, buffered. The buffer
    is written out whenever it exceeds 10000 entries and once more at the
    end of the file.
    """
    collections = {f['family_id']: self._db[f['coll_name']] for f in family_info_list}
    indiv_id_list = [i for f in family_info_list for i in f['individuals']]
    number_of_families = len(family_info_list)
    sys.stderr.write("Loading variants for %d families %s from %s\n" % (
        number_of_families, family_info_list, vcf_file_path))

    def insert_all_variants_in_buffer(buff, collections_dict):
        """Drain ``buff`` into the collections, picking families at random."""
        for family_id in buff:
            if len(buff[family_id]) == 0:  # defensive programming
                raise ValueError("%s has zero variants to insert. Should not be in buff." % family_id)

        while len(buff) > 0:
            # Choose a random family among those that still have variants to
            # insert; list() keeps random.choice working on Python 3 too.
            family_id = random.choice(list(buff))
            # Pop a variant off the list for this family, and insert it.
            family_variant_dict_to_insert = buff[family_id].pop()
            collections_dict[family_id].insert(family_variant_dict_to_insert)
            if len(buff[family_id]) == 0:
                del buff[family_id]  # no more variants for this family

    vcf_rows_counter = 0
    variants_buffered_counter = 0
    family_id_to_variant_list = defaultdict(list)  # variants to be inserted all at once

    vcf_file = compressed_file(vcf_file_path)
    try:
        size = os.path.getsize(vcf_file_path)
        progress = get_progressbar(size, 'Loading VCF: {}'.format(vcf_file_path))

        for variant in vcf_stuff.iterate_vcf(vcf_file, genotypes=True, indiv_id_list=indiv_id_list, vcf_id_map=vcf_id_map):
            progress.update(vcf_file.tell_progress())
            try:
                annotation = self._annotator.get_annotation(variant.xpos, variant.ref, variant.alt, populations=reference_populations)
            except ValueError as e:
                sys.stderr.write("WARNING: " + str(e) + "\n")
                continue

            vcf_rows_counter += 1
            for family in family_info_list:
                # TODO: can we move this inside the if relevant clause below?
                family_variant = variant.make_copy(restrict_to_genotypes=family['individuals'])
                family_variant_dict = family_variant.toJSON()
                _add_index_fields_to_variant(family_variant_dict, annotation)
                if xbrowse_utils.is_variant_relevant_for_individuals(family_variant, family['individuals']):
                    collection = collections[family['family_id']]
                    if not collection.find_one({'xpos': family_variant.xpos, 'ref': family_variant.ref, 'alt': family_variant.alt}):
                        family_id_to_variant_list[family['family_id']].append(family_variant_dict)
                        variants_buffered_counter += 1

            if variants_buffered_counter > 10000:
                print(date.strftime(datetime.now(), "%m/%d/%Y %H:%M:%S")
                      + "-- inserting %d family-variants from %d vcf rows into %s families" % (
                          variants_buffered_counter, vcf_rows_counter, len(family_id_to_variant_list)))
                insert_all_variants_in_buffer(family_id_to_variant_list, collections)
                assert len(family_id_to_variant_list) == 0
                vcf_rows_counter = 0
                variants_buffered_counter = 0

        # Flush the tail: without this the last <= 10000 buffered variants
        # would never reach the database.
        if family_id_to_variant_list:
            insert_all_variants_in_buffer(family_id_to_variant_list, collections)
    finally:
        vcf_file.close()
def _add_vcf_file_for_family_set(self, family_info_list, vcf_file_path, reference_populations=None, vcf_id_map=None, start_from_chrom=None, end_with_chrom=None):
    """Load one VCF into the per-family variant collections.

    Args:
        family_info_list: dicts with at least ``family_id``, ``coll_name``
            and ``individuals`` keys.
        vcf_file_path: path of the VCF. When it contains "_chr" or ".chr"
            it is treated as a single-chromosome, tabix-indexed file and
            loading resumes after the last position already stored.
        reference_populations: forwarded to the annotator.
        vcf_id_map: forwarded to ``vcf_stuff.iterate_vcf``.
        start_from_chrom: optional first chromosome (1-22, X, Y) to load
            from a tabix-indexed multi-chromosome file.
        end_with_chrom: optional last chromosome to load, inclusive.

    Variants the annotator rejects with ``ValueError`` are skipped with a
    warning; per-family failures are logged to stderr and do not stop the
    load. Relevant family variants not yet stored are buffered and written
    in batches, interleaved randomly across families.
    """
    flush_threshold = 10000  # same batch size the sibling loader uses

    collections = {
        f['family_id']: self._db[f['coll_name']]
        for f in family_info_list
    }
    indiv_id_list = [i for f in family_info_list for i in f['individuals']]
    sys.stderr.write(
        "Loading variants for %d families %s from %s\n"
        % (len(family_info_list), family_info_list, vcf_file_path))

    for family in family_info_list:
        print("Indexing family: " + str(family))
        collections[family['family_id']].ensure_index([('xpos', 1), ('ref', 1), ('alt', 1)])

    def strip_chr(name):
        return name.replace("chr", "")

    def last_loaded_position(chr_idx):
        """Smallest last-loaded position on ``chr_idx`` across all families."""
        chrom_start = chr_idx * 1e9
        positions = []
        for family in family_info_list:
            newest = list(collections[family['family_id']].find({
                '$and': [{'xpos': {'$gte': chrom_start}},
                         {'xpos': {'$lt': chrom_start + 1e9}}]
            }).sort([('xpos', -1)]).limit(1))
            positions.append(newest[0]['xpos'] - chrom_start if newest else 0)
        return int(min(positions)) if positions else 0

    def flush(buff):
        """Drain ``buff`` into the collections, picking families at random."""
        for family_id in buff:
            if len(buff[family_id]) == 0:  # defensive programming
                raise ValueError(
                    "%s has zero variants to insert. Should not be in buff."
                    % family_id)
        while len(buff) > 0:
            # list() keeps random.choice working on Python 3 dict views.
            family_id = random.choice(list(buff))
            collections[family_id].insert(buff[family_id].pop())
            if len(buff[family_id]) == 0:
                del buff[family_id]  # no more variants for this family

    # Check whether some of the variants for this chromosome have been loaded
    # already; if so, start from the last loaded variant, not the beginning.
    if "_chr" in vcf_file_path or ".chr" in vcf_file_path:
        # Files split by chromosome (eg. WGS projects): peek at the first
        # record to learn which chromosome this file holds.
        probe_file = compressed_file(vcf_file_path)
        try:
            variant = next(
                vcf_stuff.iterate_vcf(probe_file, genotypes=False, indiv_id_list=indiv_id_list, vcf_id_map=vcf_id_map))
        finally:
            probe_file.close()
        print(vcf_file_path + " - chromsome: " + str(variant.chr))

        chr_idx = int(variant.xpos / 1e9)
        start_from_pos = last_loaded_position(chr_idx)
        chrom_name = strip_chr(variant.chr)
        print("Start from: %s - %s (%0.1f%% done)" % (
            chr_idx, start_from_pos,
            100. * start_from_pos / CHROMOSOME_SIZES[chrom_name]))

        source_handle = pysam.TabixFile(vcf_file_path)
        vcf_iter = itertools.chain(
            source_handle.header,
            source_handle.fetch(chrom_name, start_from_pos, int(2.5e8)))
    elif start_from_chrom or end_with_chrom:
        if start_from_chrom:
            print("Start chrom: chr%s" % start_from_chrom)
        if end_with_chrom:
            print("End chrom: chr%s" % end_with_chrom)

        chrom_list = [str(c) for c in range(1, 23)] + ['X', 'Y']
        first_index = 0
        if start_from_chrom:
            first_index = chrom_list.index(strip_chr(start_from_chrom).upper())
        last_index = len(chrom_list) - 1
        if end_with_chrom:
            last_index = chrom_list.index(strip_chr(end_with_chrom).upper())

        source_handle = pysam.TabixFile(vcf_file_path)
        vcf_iter = source_handle.header
        for chrom in chrom_list[first_index:last_index + 1]:
            print("Will load chrom: " + chrom)
            try:
                vcf_iter = itertools.chain(vcf_iter, source_handle.fetch(chrom))
            except ValueError as e:
                # fetch() raised for this chromosome: warn and move on.
                print("WARNING: " + str(e))
    else:
        # TODO handle case where it's one vcf file, not split by chromosome
        vcf_iter = source_handle = compressed_file(vcf_file_path)

    vcf_rows_counter = 0
    variants_buffered_counter = 0
    family_id_to_variant_list = defaultdict(list)  # variants to be inserted all at once
    try:
        for variant in vcf_stuff.iterate_vcf(vcf_iter, genotypes=True, indiv_id_list=indiv_id_list, vcf_id_map=vcf_id_map):
            if variant.alt == "*":
                # GATK 3.4 '*' alt allele: nothing to load.
                continue
            try:
                annotation = self._annotator.get_annotation(
                    variant.xpos, variant.ref, variant.alt,
                    populations=reference_populations)
            except ValueError as e:
                sys.stderr.write("WARNING: " + str(e) + "\n")
                continue

            vcf_rows_counter += 1
            for family in family_info_list:
                # TODO: can we move this inside the if relevant clause below?
                try:
                    family_variant = variant.make_copy(restrict_to_genotypes=family['individuals'])
                    family_variant_dict = family_variant.toJSON()
                    _add_index_fields_to_variant(family_variant_dict, annotation)
                    if xbrowse_utils.is_variant_relevant_for_individuals(family_variant, family['individuals']):
                        collection = collections[family['family_id']]
                        if not collection.find_one({
                                'xpos': family_variant.xpos,
                                'ref': family_variant.ref,
                                'alt': family_variant.alt}):
                            family_id_to_variant_list[family['family_id']].append(family_variant_dict)
                            variants_buffered_counter += 1
                except Exception as e:
                    sys.stderr.write(
                        "ERROR: on variant %s, family: %s - %s\n"
                        % (variant.toJSON(), family, e))

            # Without a periodic flush nothing would ever be written and the
            # buffer would grow for the whole file.
            if variants_buffered_counter > flush_threshold:
                print(date.strftime(datetime.now(), "%m/%d/%Y %H:%M:%S")
                      + "-- inserting %d family-variants from %d vcf rows into %s families" % (
                          variants_buffered_counter, vcf_rows_counter, len(family_id_to_variant_list)))
                flush(family_id_to_variant_list)
                vcf_rows_counter = 0
                variants_buffered_counter = 0

        # Write whatever is still buffered once the input is exhausted.
        if family_id_to_variant_list:
            flush(family_id_to_variant_list)
    finally:
        source_handle.close()
def load_population(self, population):
    """
    Take a population and a data source; extract and load it into annotator.

    Every (xpos, ref, alt, frequency) found in the data source is passed to
    self._add_population_frequency() under population['slug'].

    The data source is selected by population['file_type']:

      'vcf'                   - VCF with genotypes; frequency is get_aaf(variant)
      'sites_vcf'             - VCF whose INFO key population['vcf_info_key']
                                (default 'AF') holds the frequency. If that key
                                contains "popmax" and population['name'] contains
                                "1000 Genomes", the frequency is the max over the
                                EAS/EUR/AFR/AMR/SAS *_AF INFO fields instead.
      'esp_vcf_dir'           - every file in population['dir_path'], parsed with
                                get_variants_from_esp_file(); the frequency is
                                variant[population['counts_key']]
      'counts_file'           - tab separated: chrom / pos / ref / alt / count /
                                total; frequency is count / total, and rows
                                with total == 0 are skipped
      'xbrowse_freq_file'     - tab separated: xpos / ref / alt / freq
      'tsv_file'              - one header line, then tab separated:
                                chrom / pos / ref / alt / freq
      'sites_vcf_with_counts' - VCF where INFO keys population['ac_info_key'] and
                                population['an_info_key'] give the counts;
                                frequency is AC / AN (0.0 when AN is 0)

    Paths ending in '.gz' (and '.bgz' for 'sites_vcf_with_counts') are read
    through gzip. Any other file_type is silently ignored.
    """

    def open_data_file(path, gzip_suffixes=('.gz',)):
        # Returns (file to read records from, file whose tell() drives the
        # progress bar). For gzip input the second one is the underlying
        # compressed file, so its offset is comparable to os.path.getsize().
        if path.endswith(gzip_suffixes):
            data_file = gzip.open(path)
            return data_file, data_file.fileobj
        data_file = open(path)
        return data_file, data_file

    def info_frequency(extras, info_key, allele_idx):
        # INFO values hold one comma separated entry per alt allele.
        # A missing key counts as frequency 0 rather than crashing.
        raw_value = extras.get(info_key)
        if raw_value is None:
            return 0.0
        return float(raw_value.split(',')[allele_idx])

    file_type = population['file_type']

    if file_type == 'vcf':
        path = population['file_path']
        vcf_file, progress_file = open_data_file(path)
        try:
            progress = get_progressbar(
                os.path.getsize(path),
                'Loading vcf: {}'.format(population['slug']))
            for variant in vcf_stuff.iterate_vcf(vcf_file, genotypes=True, genotype_meta=False):
                progress.update(progress_file.tell())
                freq = get_aaf(variant)
                self._add_population_frequency(
                    variant.xpos, variant.ref, variant.alt, population['slug'], freq)
        finally:
            vcf_file.close()

    elif file_type == 'sites_vcf':
        path = population['file_path']
        meta_key = population.get('vcf_info_key', 'AF')
        is_1kg_popmax = "popmax" in meta_key.lower() and ("1000 Genomes" in population["name"])
        if is_1kg_popmax:
            # Per-population allele frequency INFO fields; the VCF header
            # declares each of them as Number=A, Type=Float, i.e. one value
            # per alt allele, "calculated from AC and AN, in the range (0,1)".
            meta_fields = ["EAS_AF", "EUR_AF", "AFR_AF", "AMR_AF", "SAS_AF"]
        else:
            meta_fields = [meta_key]

        vcf_file, progress_file = open_data_file(path)
        try:
            progress = get_progressbar(
                os.path.getsize(path),
                'Loading sites vcf: {}'.format(population['slug']))
            for variant in vcf_stuff.iterate_vcf(vcf_file, meta_fields=meta_fields):
                progress.update(progress_file.tell())
                allele_idx = variant.extras['alt_allele_pos']
                if is_1kg_popmax:
                    # The loop variable must not be called meta_key: rebinding
                    # it would make every variant after the first fall into
                    # the single-key branch with the last population only.
                    freq = 0
                    for population_field in meta_fields:
                        freq = max(freq, info_frequency(variant.extras, population_field, allele_idx))
                else:
                    freq = info_frequency(variant.extras, meta_key, allele_idx)
                self._add_population_frequency(
                    variant.xpos, variant.ref, variant.alt, population['slug'], freq)
        finally:
            vcf_file.close()

    #
    # Directory of per-chromosome VCFs that ESP publishes
    #
    elif file_type == 'esp_vcf_dir':
        for filename in os.listdir(population['dir_path']):
            file_path = os.path.abspath(os.path.join(population['dir_path'], filename))
            f = open(file_path)
            try:
                progress = get_progressbar(
                    os.path.getsize(file_path),
                    'Loading ESP file: {}'.format(filename))
                for variant in get_variants_from_esp_file(f):
                    progress.update(f.tell())
                    self._add_population_frequency(
                        variant['xpos'],
                        variant['ref'],
                        variant['alt'],
                        population['slug'],
                        variant[population['counts_key']])
            finally:
                f.close()

    #
    # text file of allele counts, as Monkol has been using for the joint calling data
    #
    elif file_type == 'counts_file':
        path = population['file_path']
        counts_file, progress_file = open_data_file(path)
        try:
            progress = get_progressbar(
                os.path.getsize(path),
                'Loading population: {}'.format(population['slug']))
            for line in counts_file:
                progress.update(progress_file.tell())
                fields = line.strip('\n').split('\t')
                chrom = 'chr' + fields[0]
                pos = int(fields[1])
                xpos = genomeloc.get_single_location(chrom, pos)
                ref = fields[2]
                alt = fields[3]
                if int(fields[5]) == 0:
                    # zero denominator: no frequency can be computed
                    continue
                freq = float(fields[4]) / float(fields[5])
                self._add_population_frequency(xpos, ref, alt, population['slug'], freq)
        finally:
            counts_file.close()

    # this is now the canonical allele frequency file -
    # tab separated file with xpos / ref / alt / freq
    elif file_type == 'xbrowse_freq_file':
        path = population['file_path']
        counts_file, progress_file = open_data_file(path)
        try:
            progress = get_progressbar(
                os.path.getsize(path),
                'Loading population: {}'.format(population['slug']))
            for line in counts_file:
                progress.update(progress_file.tell())
                fields = line.strip('\n').split('\t')
                xpos = int(fields[0])
                ref = fields[1]
                alt = fields[2]
                freq = float(fields[3])
                self._add_population_frequency(xpos, ref, alt, population['slug'], freq)
        finally:
            counts_file.close()

    elif file_type == 'tsv_file':
        path = population['file_path']
        freq_file, progress_file = open_data_file(path)
        try:
            progress = get_progressbar(
                os.path.getsize(path),
                'Loading population: {}'.format(population['slug']))
            # first line is a header, not data
            header = next(freq_file)
            print("Header: " + header)
            for line in freq_file:
                progress.update(progress_file.tell())
                fields = line.strip('\n').split('\t')
                chrom = fields[0]
                pos = int(fields[1])
                ref = fields[2]
                alt = fields[3]
                freq = float(fields[4])
                xpos = genomeloc.get_single_location(chrom, pos)
                self._add_population_frequency(xpos, ref, alt, population['slug'], freq)
        finally:
            freq_file.close()

    elif file_type == 'sites_vcf_with_counts':
        path = population['file_path']
        ac_info_key = population['ac_info_key']
        an_info_key = population['an_info_key']
        vcf_file, progress_file = open_data_file(path, gzip_suffixes=('.gz', '.bgz'))
        try:
            progress = get_progressbar(
                os.path.getsize(path),
                'Loading sites vcf: {}'.format(population['slug']))
            for variant in vcf_stuff.iterate_vcf(vcf_file, meta_fields=[ac_info_key, an_info_key]):
                progress.update(progress_file.tell())

                alt_allele_pos = variant.extras['alt_allele_pos']
                # Parsing is deliberately best-effort: a variant whose counts
                # can't be read is reported and skipped, not fatal.
                try:
                    ac = int(variant.extras.get(ac_info_key).split(',')[alt_allele_pos].replace("NA", "0"))
                except Exception as e:
                    print("Couldn't parse AC value %s from %s: %s" % (alt_allele_pos, ac_info_key, variant.extras), e)
                    continue

                try:
                    if "popmax" in ac_info_key.lower():
                        AN_index = alt_allele_pos  # each allele may have a different AN value from a different population
                    else:
                        AN_index = 0
                    an = int(variant.extras.get(an_info_key).split(',')[AN_index].replace("NA", "0"))
                except Exception as e:
                    print("Couldn't parse AN value %s from %s: %s" % (alt_allele_pos, an_info_key, variant.extras), e)
                    continue

                if an == 0:
                    freq = 0.0
                else:
                    freq = float(ac) / an
                self._add_population_frequency(
                    variant.xpos, variant.ref, variant.alt, population['slug'], freq)
        finally:
            # the original never closed the file in this branch
            vcf_file.close()
def load_population(self, population):
    """
    Take a population and a data source; extract and load it into annotator.

    Every (xpos, ref, alt, frequency) found in the data source is passed to
    self._add_population_frequency() under population['slug'].

    The data source is selected by population['file_type']:

      'vcf'                   - VCF with genotypes; frequency is get_aaf(variant)
      'sites_vcf'             - VCF whose INFO key population['vcf_info_key']
                                (default 'AF') holds the frequency. If that key
                                contains "popmax" and population['name'] contains
                                "1000 Genomes", the frequency is the max over the
                                EAS/EUR/AFR/AMR/SAS *_AF INFO fields instead.
      'esp_vcf_dir'           - every file in population['dir_path'], parsed with
                                get_variants_from_esp_file(); the frequency is
                                variant[population['counts_key']]
      'counts_file'           - tab separated: chrom / pos / ref / alt / count /
                                total; frequency is count / total, and rows
                                with total == 0 are skipped
      'xbrowse_freq_file'     - tab separated: xpos / ref / alt / freq
      'tsv_file'              - one header line, then tab separated:
                                chrom / pos / ref / alt / freq
      'sites_vcf_with_counts' - VCF where INFO keys population['ac_info_key'] and
                                population['an_info_key'] give the counts;
                                frequency is AC / AN (0.0 when AN is 0)

    Paths ending in '.gz' (and '.bgz' for 'sites_vcf_with_counts') are read
    through gzip. Any other file_type is silently ignored.
    """

    def open_data_file(path, gzip_suffixes=('.gz',)):
        # Returns (file to read records from, file whose tell() drives the
        # progress bar). For gzip input the second one is the underlying
        # compressed file, so its offset is comparable to os.path.getsize().
        if path.endswith(gzip_suffixes):
            data_file = gzip.open(path)
            return data_file, data_file.fileobj
        data_file = open(path)
        return data_file, data_file

    def info_frequency(extras, info_key, allele_idx):
        # INFO values hold one comma separated entry per alt allele.
        # A missing key counts as frequency 0 rather than crashing.
        raw_value = extras.get(info_key)
        if raw_value is None:
            return 0.0
        return float(raw_value.split(',')[allele_idx])

    file_type = population['file_type']

    if file_type == 'vcf':
        path = population['file_path']
        vcf_file, progress_file = open_data_file(path)
        try:
            progress = get_progressbar(
                os.path.getsize(path),
                'Loading vcf: {}'.format(population['slug']))
            for variant in vcf_stuff.iterate_vcf(vcf_file, genotypes=True, genotype_meta=False):
                progress.update(progress_file.tell())
                freq = get_aaf(variant)
                self._add_population_frequency(
                    variant.xpos, variant.ref, variant.alt, population['slug'], freq)
        finally:
            vcf_file.close()

    elif file_type == 'sites_vcf':
        path = population['file_path']
        meta_key = population.get('vcf_info_key', 'AF')
        is_1kg_popmax = "popmax" in meta_key.lower() and ("1000 Genomes" in population["name"])
        if is_1kg_popmax:
            # Per-population allele frequency INFO fields; the VCF header
            # declares each of them as Number=A, Type=Float, i.e. one value
            # per alt allele, "calculated from AC and AN, in the range (0,1)".
            meta_fields = ["EAS_AF", "EUR_AF", "AFR_AF", "AMR_AF", "SAS_AF"]
        else:
            meta_fields = [meta_key]

        vcf_file, progress_file = open_data_file(path)
        try:
            progress = get_progressbar(
                os.path.getsize(path),
                'Loading sites vcf: {}'.format(population['slug']))
            for variant in vcf_stuff.iterate_vcf(vcf_file, meta_fields=meta_fields):
                progress.update(progress_file.tell())
                allele_idx = variant.extras['alt_allele_pos']
                if is_1kg_popmax:
                    # The loop variable must not be called meta_key: rebinding
                    # it would make every variant after the first fall into
                    # the single-key branch with the last population only.
                    freq = 0
                    for population_field in meta_fields:
                        freq = max(freq, info_frequency(variant.extras, population_field, allele_idx))
                else:
                    freq = info_frequency(variant.extras, meta_key, allele_idx)
                self._add_population_frequency(
                    variant.xpos, variant.ref, variant.alt, population['slug'], freq)
        finally:
            vcf_file.close()

    #
    # Directory of per-chromosome VCFs that ESP publishes
    #
    elif file_type == 'esp_vcf_dir':
        for filename in os.listdir(population['dir_path']):
            file_path = os.path.abspath(os.path.join(population['dir_path'], filename))
            f = open(file_path)
            try:
                progress = get_progressbar(
                    os.path.getsize(file_path),
                    'Loading ESP file: {}'.format(filename))
                for variant in get_variants_from_esp_file(f):
                    progress.update(f.tell())
                    self._add_population_frequency(
                        variant['xpos'],
                        variant['ref'],
                        variant['alt'],
                        population['slug'],
                        variant[population['counts_key']])
            finally:
                f.close()

    #
    # text file of allele counts, as Monkol has been using for the joint calling data
    #
    elif file_type == 'counts_file':
        path = population['file_path']
        counts_file, progress_file = open_data_file(path)
        try:
            progress = get_progressbar(
                os.path.getsize(path),
                'Loading population: {}'.format(population['slug']))
            for line in counts_file:
                progress.update(progress_file.tell())
                fields = line.strip('\n').split('\t')
                chrom = 'chr' + fields[0]
                pos = int(fields[1])
                xpos = genomeloc.get_single_location(chrom, pos)
                ref = fields[2]
                alt = fields[3]
                if int(fields[5]) == 0:
                    # zero denominator: no frequency can be computed
                    continue
                freq = float(fields[4]) / float(fields[5])
                self._add_population_frequency(xpos, ref, alt, population['slug'], freq)
        finally:
            counts_file.close()

    # this is now the canonical allele frequency file -
    # tab separated file with xpos / ref / alt / freq
    elif file_type == 'xbrowse_freq_file':
        path = population['file_path']
        counts_file, progress_file = open_data_file(path)
        try:
            progress = get_progressbar(
                os.path.getsize(path),
                'Loading population: {}'.format(population['slug']))
            for line in counts_file:
                progress.update(progress_file.tell())
                fields = line.strip('\n').split('\t')
                xpos = int(fields[0])
                ref = fields[1]
                alt = fields[2]
                freq = float(fields[3])
                self._add_population_frequency(xpos, ref, alt, population['slug'], freq)
        finally:
            counts_file.close()

    elif file_type == 'tsv_file':
        path = population['file_path']
        freq_file, progress_file = open_data_file(path)
        try:
            progress = get_progressbar(
                os.path.getsize(path),
                'Loading population: {}'.format(population['slug']))
            # first line is a header, not data
            header = next(freq_file)
            print("Header: " + header)
            for line in freq_file:
                progress.update(progress_file.tell())
                fields = line.strip('\n').split('\t')
                chrom = fields[0]
                pos = int(fields[1])
                ref = fields[2]
                alt = fields[3]
                freq = float(fields[4])
                xpos = genomeloc.get_single_location(chrom, pos)
                self._add_population_frequency(xpos, ref, alt, population['slug'], freq)
        finally:
            freq_file.close()

    elif file_type == 'sites_vcf_with_counts':
        path = population['file_path']
        ac_info_key = population['ac_info_key']
        an_info_key = population['an_info_key']
        vcf_file, progress_file = open_data_file(path, gzip_suffixes=('.gz', '.bgz'))
        try:
            progress = get_progressbar(
                os.path.getsize(path),
                'Loading sites vcf: {}'.format(population['slug']))
            for variant in vcf_stuff.iterate_vcf(vcf_file, meta_fields=[ac_info_key, an_info_key]):
                progress.update(progress_file.tell())

                alt_allele_pos = variant.extras['alt_allele_pos']
                # Parsing is deliberately best-effort: a variant whose counts
                # can't be read is reported and skipped, not fatal.
                try:
                    ac = int(variant.extras.get(ac_info_key).split(',')[alt_allele_pos].replace("NA", "0"))
                except Exception as e:
                    print("Couldn't parse AC value %s from %s: %s" % (alt_allele_pos, ac_info_key, variant.extras), e)
                    continue

                try:
                    if "popmax" in ac_info_key.lower():
                        AN_index = alt_allele_pos  # each allele may have a different AN value from a different population
                    else:
                        AN_index = 0
                    an = int(variant.extras.get(an_info_key).split(',')[AN_index].replace("NA", "0"))
                except Exception as e:
                    print("Couldn't parse AN value %s from %s: %s" % (alt_allele_pos, an_info_key, variant.extras), e)
                    continue

                if an == 0:
                    freq = 0.0
                else:
                    freq = float(ac) / an
                self._add_population_frequency(
                    variant.xpos, variant.ref, variant.alt, population['slug'], freq)
        finally:
            # the original never closed the file in this branch
            vcf_file.close()