def filterAndAnnotation(inputFilePath, outputFilePath, genome_id, is_grc):

    hIN = open(inputFilePath, 'r')
    hOUT = open(outputFilePath, 'w')

    # annotation_dir = config.param_conf.get("annotation", "annotation_dir")
    # filter_same_gene = config.param_conf.getboolean("filter_condition", "filter_same_gene")
    # annotation_dir = param_conf.resource_dir
    filter_same_gene = param_conf.filter_same_gene

    """
    # old procedure
    # ref_gene_bed = annotation_dir + "/refGene.bed.gz"
    ref_exon_bed = annotation_dir + "/refExon.bed.gz"
    ens_gene_bed = annotation_dir + "/ensGene.bed.gz"
    ens_exon_bed = annotation_dir + "/ensExon.bed.gz"
    grch2ucsc_file = annotation_dir + "/grch2ucsc.txt"

    # relationship between GRCh and UCSC chromosome names
    grch2ucsc = {}
    with open(grch2ucsc_file, 'r') as hin:
        for line in hin:
            F = line.rstrip('\n').split('\t')
            grch2ucsc[F[0]] = F[1]

    ref_gene_tb = pysam.TabixFile(ref_gene_bed)
    ref_exon_tb = pysam.TabixFile(ref_exon_bed)
    ens_gene_tb = pysam.TabixFile(ens_gene_bed)
    ens_exon_tb = pysam.TabixFile(ens_exon_bed)
    """

    annot_utils.gene.make_gene_info(outputFilePath + ".tmp.refGene.bed.gz", "refseq", genome_id, is_grc, False)
    annot_utils.gene.make_gene_info(outputFilePath + ".tmp.ensGene.bed.gz", "gencode", genome_id, is_grc, False)
    annot_utils.exon.make_exon_info(outputFilePath + ".tmp.refExon.bed.gz", "refseq", genome_id, is_grc, False)
    annot_utils.exon.make_exon_info(outputFilePath + ".tmp.ensExon.bed.gz", "gencode", genome_id, is_grc, False)

    ref_gene_tb = pysam.TabixFile(outputFilePath + ".tmp.refGene.bed.gz")
    ens_gene_tb = pysam.TabixFile(outputFilePath + ".tmp.ensGene.bed.gz")
    ref_exon_tb = pysam.TabixFile(outputFilePath + ".tmp.refExon.bed.gz")
    ens_exon_tb = pysam.TabixFile(outputFilePath + ".tmp.ensExon.bed.gz")

    for line in hIN:
        F = line.rstrip('\n').split('\t')

        # check gene annotation for side 1
        gene1 = get_gene_info(F[0], F[1], ref_gene_tb, ens_gene_tb)

        # check gene annotation for side 2
        gene2 = get_gene_info(F[3], F[4], ref_gene_tb, ens_gene_tb)

        # check exon-intron junction annotation for side 1
        junction1 = get_junc_info(F[0], F[1], ref_exon_tb, ens_exon_tb, junction_margin)

        # check exon-intron junction annotation for side 2
        junction2 = get_junc_info(F[3], F[4], ref_exon_tb, ens_exon_tb, junction_margin)

        sameGeneFlag = 0
        for g1 in gene1:
            for g2 in gene2:
                if g1 == g2 and g1 != "---":
                    sameGeneFlag = 1

        if filter_same_gene == True and sameGeneFlag == 1:
            continue

        print >> hOUT, '\t'.join(F[0:8]) + '\t' + ';'.join(gene1) + '\t' + ';'.join(junction1) + '\t' + \
            ';'.join(gene2) + '\t' + ';'.join(junction2) + '\t' + \
            F[11] + '\t' + F[12] + '\t' + F[16] + '\t' + F[17]

    hIN.close()
    hOUT.close()

    subprocess.check_call(["rm", "-rf", outputFilePath + ".tmp.refGene.bed.gz"])
    subprocess.check_call(["rm", "-rf", outputFilePath + ".tmp.ensGene.bed.gz"])
    subprocess.check_call(["rm", "-rf", outputFilePath + ".tmp.refExon.bed.gz"])
    subprocess.check_call(["rm", "-rf", outputFilePath + ".tmp.ensExon.bed.gz"])
    subprocess.check_call(["rm", "-rf", outputFilePath + ".tmp.refGene.bed.gz.tbi"])
    subprocess.check_call(["rm", "-rf", outputFilePath + ".tmp.ensGene.bed.gz.tbi"])
    subprocess.check_call(["rm", "-rf", outputFilePath + ".tmp.refExon.bed.gz.tbi"])
    subprocess.check_call(["rm", "-rf", outputFilePath + ".tmp.ensExon.bed.gz.tbi"])
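# Hypothetical invocation of filterAndAnnotation() above, shown only as a usage
# sketch: the file names are placeholders, and "hg38"/False stand in for the
# genome_id / is_grc arguments that are passed through to annot_utils.
# filterAndAnnotation("fusion.candidates.txt", "fusion.candidates.annotated.txt", "hg38", False)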
def setUp(self):
    self.tabix = pysam.TabixFile(self.filename)
    self.compare = load_and_convert(self.filename)
def testManager(self):
    with pysam.TabixFile(self.filename) as tabixfile:
        tabixfile.fetch()
    self.assertEqual(tabixfile.closed, True)
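# A minimal standalone sketch of the same context-manager pattern used in the test
# above, assuming a bgzip-compressed, tabix-indexed file "example.bed.gz" (with its
# .tbi index) exists locally; the handle is closed automatically when the "with"
# block exits.
import pysam

with pysam.TabixFile("example.bed.gz") as tabixfile:
    for row in tabixfile.fetch("chr1", 1000, 2000):
        print(row)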
def getPairCoverRegionFromBam(inputBam, outputFilePath, inputTabixFile):
    """
    script for obtaining pair read information (mainly the end position, because it cannot be recovered from bam files)
    """
    ####################
    bamfile = pysam.Samfile(inputBam, "rb")
    tabixfile = pysam.TabixFile(inputTabixFile)
    hOUT = open(outputFilePath + ".tmp", "w")

    ID2info = {}
    tempChr = ""
    tempPos = 0
    checkPositionMargin = 10000000

    tabixErrorMsg = ""
    for read in bamfile.fetch():

        # when moving into a new region, fetch the keys from the tabix-indexed file
        if bamfile.getrname(read.tid) != tempChr or int(read.pos + 1) > tempPos + checkPositionMargin:

            tempChr = bamfile.getrname(read.tid)
            tempPos = int(read.pos + 1) - 1

            ID2info = {}
            tabixErrorFlag = 0
            try:
                records = tabixfile.fetch(tempChr, tempPos, tempPos + checkPositionMargin)
            except Exception as inst:
                # print >> sys.stderr, "%s: %s" % (type(inst), inst.args)
                tabixErrorMsg = str(inst.args)
                tabixErrorFlag = 1

            if tabixErrorFlag == 0:
                for record in records:
                    splt_record = record.split('\t')
                    ID2info[splt_record[3]] = record

        flags = format(int(read.flag), '#014b')[:1:-1]

        # skip supplementary alignments
        if flags[8] == "1" or flags[11] == "1":
            continue

        # skip if one of the pair is unmapped
        if flags[2] == "1" or flags[3] == "1":
            continue

        seqID = (read.qname + "/1" if flags[6] == "1" else read.qname + "/2")

        if seqID in ID2info:
            print(ID2info[seqID] + "\t" + bamfile.getrname(read.tid) + ":" + str(read.pos + 1) +
                  "-" + str(read.aend) + "\t" + str(read.mapq), file=hOUT)

    if tabixErrorMsg != "":
        utils.warningMessage("One or more errors occurred in tabix file fetch, e.g.: " + tabixErrorMsg)

    bamfile.close()
    tabixfile.close()
    hOUT.close()
    ####################

    ####################
    hOUT = open(outputFilePath, 'w')
    subprocess.call(["sort", "-k5n", outputFilePath + ".tmp"], stdout=hOUT)
    hOUT.close()
    ####################

    ####################
    subprocess.call(["rm", outputFilePath + ".tmp"])
def setUp(self):
    if not pysam.config.HAVE_LIBCURL or not checkURL(self.url):
        self.remote_file = None
    else:
        self.remote_file = pysam.TabixFile(self.url, "r")
    self.local_file = pysam.TabixFile(self.local, "r")
def _add_vcf_file_for_family_set(self, family_info_list, vcf_file_path, reference_populations=None, vcf_id_map=None, start_from_chrom=None, end_with_chrom=None): collections = { f['family_id']: self._db[f['coll_name']] for f in family_info_list } #for collection in collections.values(): # collection.drop_indexes() indiv_id_list = [i for f in family_info_list for i in f['individuals']] number_of_families = len(family_info_list) sys.stderr.write( "Loading variants for %(number_of_families)d families %(family_info_list)s from %(vcf_file_path)s\n" % locals()) for family in family_info_list: print("Indexing family: " + str(family)) collection = collections[family['family_id']] collection.ensure_index([('xpos', 1), ('ref', 1), ('alt', 1)]) # check whether some of the variants for this chromosome has been loaded already # if yes, start from the last loaded variant, and not from the beginning if "_chr" in vcf_file_path or ".chr" in vcf_file_path: # if the VCF files are split by chromosome (eg. for WGS projects), check within the chromosome vcf_file = compressed_file(vcf_file_path) variant = next( vcf_stuff.iterate_vcf(vcf_file, genotypes=False, indiv_id_list=indiv_id_list, vcf_id_map=vcf_id_map)) print(vcf_file_path + " - chromsome: " + str(variant.chr)) vcf_file.close() position_per_chrom = {} for chrom in range(1, 24): position_per_chrom[chrom] = defaultdict(int) for family in family_info_list: #variants = collections[family['family_id']].find().sort([('xpos',-1)]).limit(1) variants = list(collections[family['family_id']].find({ '$and': [{ 'xpos': { '$gte': chrom * 1e9 } }, { 'xpos': { '$lt': (chrom + 1) * 1e9 } }] }).sort([('xpos', -1)]).limit(1)) if len(variants) > 0: position_per_chrom[chrom][family[ 'family_id']] = variants[0]['xpos'] - chrom * 1e9 else: position_per_chrom[chrom][family['family_id']] = 0 for chrom in range(1, 24): position_per_chrom[chrom] = min( position_per_chrom[chrom].values() ) # get the smallest last-loaded variant position for this chromosome across all families chr_idx = int(variant.xpos / 1e9) start_from_pos = int(position_per_chrom[chr_idx]) print("Start from: %s - %s (%0.1f%% done)" % (chr_idx, start_from_pos, 100. 
* start_from_pos / CHROMOSOME_SIZES[variant.chr.replace("chr", "")])) tabix_file = pysam.TabixFile(vcf_file_path) vcf_iter = itertools.chain( tabix_file.header, tabix_file.fetch(variant.chr.replace("chr", ""), start_from_pos, int(2.5e8))) elif start_from_chrom or end_with_chrom: if start_from_chrom: print("Start chrom: chr%s" % start_from_chrom) if end_with_chrom: print("End chrom: chr%s" % end_with_chrom) chrom_list = list(map(str, range(1, 23))) + ['X', 'Y'] chrom_list_start_index = 0 if start_from_chrom: chrom_list_start_index = chrom_list.index( start_from_chrom.replace("chr", "").upper()) chrom_list_end_index = len(chrom_list) if end_with_chrom: chrom_list_end_index = chrom_list.index( end_with_chrom.replace("chr", "").upper()) tabix_file = pysam.TabixFile(vcf_file_path) vcf_iter = tabix_file.header for chrom in chrom_list[ chrom_list_start_index:chrom_list_end_index + 1]: print("Will load chrom: " + chrom) try: vcf_iter = itertools.chain(vcf_iter, tabix_file.fetch(chrom)) except ValueError as e: print("WARNING: " + str(e)) else: vcf_iter = vcf_file = compressed_file(vcf_file_path) # TODO handle case where it's one vcf file, not split by chromosome size = os.path.getsize(vcf_file_path) #progress = get_progressbar(size, 'Loading VCF: {}'.format(vcf_file_path)) def insert_all_variants_in_buffer(buff, collections_dict): for family_id in buff: if len(buff[family_id]) == 0: # defensive programming raise ValueError( "%s has zero variants to insert. Should not be in buff." % family_id) while len(buff) > 0: # choose a random family for which to insert a variant from among families that still have variants to insert family_id = random.choice(buff.keys()) # pop a variant off the list for this family, and insert it family_variant_dict_to_insert = buff[family_id].pop() c = collections_dict[family_id] c.insert(family_variant_dict_to_insert) if len(buff[family_id]) == 0: del buff[ family_id] # if no more variants for this family, delete it vcf_rows_counter = 0 variants_buffered_counter = 0 family_id_to_variant_list = defaultdict( list) # will accumulate variants to be inserted all at once for variant in vcf_stuff.iterate_vcf(vcf_iter, genotypes=True, indiv_id_list=indiv_id_list, vcf_id_map=vcf_id_map): if variant.alt == "*": #print("Skipping GATK 3.4 * alt allele: " + str(variant.unique_tuple())) continue try: annotation = self._annotator.get_annotation( variant.xpos, variant.ref, variant.alt, populations=reference_populations) except ValueError, e: sys.stderr.write("WARNING: " + str(e) + "\n") continue vcf_rows_counter += 1 for family in family_info_list: # TODO: can we move this inside the if relevant clause below? try: family_variant = variant.make_copy( restrict_to_genotypes=family['individuals']) family_variant_dict = family_variant.toJSON() _add_index_fields_to_variant(family_variant_dict, annotation) if xbrowse_utils.is_variant_relevant_for_individuals( family_variant, family['individuals']): collection = collections[family['family_id']] if not collection.find_one({ 'xpos': family_variant.xpos, 'ref': family_variant.ref, 'alt': family_variant.alt }): family_id_to_variant_list[family[ 'family_id']].append(family_variant_dict) variants_buffered_counter += 1 except Exception, e: sys.stderr.write( "ERROR: on variant %s, family: %s - %s\n" % (variant.toJSON(), family, e))
def torsid(variantlist, regiontext, build): """ Parameters ---------- variantlist : list List of variants in either rs id or other chr_pos, chr_pos_ref, chr_pos_ref_alt, chr_pos_ref_alt_build format. Returns ------- rsidlist : list Corresponding rs id in the region if found. Otherwise returns '.' """ if all(x=='.' for x in variantlist): raise InvalidUsage('No variants provided') variantlist = cleanSNPs(variantlist, regiontext, build) chrom, startbp, endbp = parseRegionText(regiontext, build) chrom = str(chrom).replace('23',"X") # Load dbSNP151 SNP names from region indicated dbsnp_filepath = '' suffix = 'b37' if build.lower() in ["hg38", "grch38"]: suffix = 'b38' dbsnp_filepath = os.path.join(MYDIR, 'data', 'dbSNP151', 'GRCh38p7', 'All_20180418.vcf.gz') else: suffix = 'b37' dbsnp_filepath = os.path.join(MYDIR, 'data', 'dbSNP151', 'GRCh37p13', 'All_20180423.vcf.gz') # Load dbSNP file tbx = pysam.TabixFile(dbsnp_filepath) print('Compiling list of known variants in the region from dbSNP151') chromcol = [] poscol = [] idcol = [] refcol = [] altcol = [] rsid = dict({}) # chr_pos_ref_alt_build (keys) for rsid output (values) for row in tbx.fetch(str(chrom), startbp, endbp): rowlist = str(row).split('\t') chromi = rowlist[0].replace('chr','') posi = rowlist[1] idi = rowlist[2] refi = rowlist[3] alti = rowlist[4] varstr = '_'.join([chromi, posi, refi, alti, suffix]) chromcol.append(chromi) poscol.append(posi) idcol.append(idi) refcol.append(refi) altcol.append(alti) rsid[varstr] = idi altalleles = alti.split(',') # could have more than one alt allele (multi-allelic) if len(altalleles)>1: varstr = '_'.join([chromi, posi, refi, altalleles[0], suffix]) rsid[varstr] = idi for i in np.arange(len(altalleles)-1): varstr = '_'.join([chromi, posi, refi, altalleles[i+1], suffix]) rsid[varstr] = idi finalvarlist = [] for variant in variantlist: if not variant.startswith('rs'): try: finalvarlist.append(rsid[variant]) except: finalvarlist.append('.') else: finalvarlist.append(variant) return finalvarlist
def main(args, pass_through_args): if cram_input(args.bams): if "-r" not in pass_through_args and not "--reference" in pass_through_args: sys.exit("ERROR: missing reference file required for CRAM. " + "Use -r option. (Run `samplot.py -h` for more help)") global HTML global HERE vcf = pysam.VariantFile(args.vcf) vcf_samples = vcf.header.samples vcf_samples_set = set(vcf_samples) vcf_samples_list = list(vcf_samples) annotations = None if args.gff: annotations = pysam.TabixFile(args.gff) filters = [to_exprs(f) for f in args.filter] ped_samples = parse_ped(args.ped, vcf_samples) # this is empty unless we have a sample with both parents defined. dn_row = get_dn_row(ped_samples) if not os.path.exists(args.out_dir): os.makedirs(args.out_dir) names_to_bams = get_names_to_bams(args.bams, args.sample_ids) important_regions = None if args.important_regions: important_regions = read_important_regions(args.important_regions) tabledata = [] # user requested FORMAT fields to add to plot title format_field_ids = None if args.format: format_field_ids = args.format.split(",") out_file = sys.stdout if args.command_file: out_file = open(args.command_file, "w") for variant in vcf: svtype = variant.info.get("SVTYPE", "SV") if args.important_regions: if not var_in_important_regions(important_regions, variant.chrom, variant.start, variant.stop): continue if svtype in ("BND", "INS"): continue if variant.stop - variant.start > args.max_mb * 1000000: continue if variant.stop - variant.start > args.min_bp: continue gts = [s.get("GT", (None, None)) for s in variant.samples.values()] if sum(None in g for g in gts) >= args.min_call_rate * len(vcf_samples): continue if args.max_hets: # requisite hets/hom-alts if sum(sum(x) >= 1 for x in gts if not None in x) > args.max_hets: continue if not any(sum(x) > 0 for x in gts if not None in x): continue test_idxs = [ i for i, gt in enumerate(gts) if not None in gt and sum(gt) > 0 ] test_samples = [ s for i, s in enumerate(variant.samples.values()) if i in test_idxs ] if len(filters) == 0: idxs = test_idxs else: idxs = [] odict = make_single(dict(variant.info.items())) for i, ts in enumerate(test_samples): vdict = odict.copy() vdict.update(make_single(dict(ts.items()))) if any(check_expr(vdict, fs) for fs in filters): idxs.append(test_idxs[i]) if len(idxs) == 0: continue is_dn = [] # we call it a de novo if the sample passed the filters but the mom and # dad had homref genotypes before filtering. # so stringent filtering on the kid and lenient on parents. variant_samples = [] for i in idxs: if vcf_samples[i] in names_to_bams: variant_samples.append(vcf_samples[i]) if len(variant_samples) <= 0: continue bams = [names_to_bams[s] for s in variant_samples] if dn_row != "": test_sample_names = {s.name for s in test_samples} for variant_sample in variant_samples: sample = ped_samples[variant_sample] if sample.mom is None or sample.dad is None: continue if not sample.mom.id in test_sample_names and not sample.dad.id in test_sample_names: is_dn.append(sample.id) if len(is_dn) <= 0 and args.dn_only: continue # save these for the html. n_samples = len(variant_samples) # semi-colon delimited eases CSV export from HTML sample_str = ";".join(variant_samples) # dict holding sample to FORMAT title string plot_titles = dict() if format_field_ids: format_attrs = get_format_title(vcf_samples_list, format_field_ids, variant) plot_titles = make_plot_titles(variant_samples, format_attrs) # try to get family members if args.ped is not None: # do DN samples first so we can see parents. 
for variant_sample in is_dn + [ x for x in variant_samples if not x in is_dn ]: s = ped_samples.get(variant_sample) if s is None: continue if s.mom is not None and not s.mom.id in variant_samples and s.mom.id in vcf_samples_set: variant_samples.append("mom-of-%s[%s]" % (variant_sample, s.mom.id)) bams.append(names_to_bams[s.mom.id]) if s.dad is not None and not s.dad.id in variant_samples and s.dad.id in vcf_samples_set: variant_samples.append("dad-of-%s[%s]" % (variant_sample, s.dad.id)) bams.append(names_to_bams[s.dad.id]) for kid in s.kids: if not kid.id in variant_samples and kid.id in vcf_samples_set: variant_samples.append("kid-of-%s[%s]" % (variant_sample, kid.id)) bams.append(names_to_bams[kid.id]) if args.max_hets: if len(bams) > 1.5 * args.max_hets: break if args.max_hets: if len(bams) > 1.5 * args.max_hets: break elif args.min_entries and len(bams) < args.min_entries: # extend with some controls: hom_ref_idxs = [ i for i, gt in enumerate(gts) if len(gt) == 2 and gt[0] == 0 and gt[1] == 0 ] if len(hom_ref_idxs) > 3: random.shuffle(hom_ref_idxs) hom_ref_idxs = hom_ref_idxs[:3] hom_ref_samples = [] for i in hom_ref_idxs: if vcf_samples[i] in names_to_bams: hom_ref_samples.append(vcf_samples[i]) to_add_count = args.min_entries - len(bams) bams.extend(names_to_bams[s] for s in hom_ref_samples[:to_add_count]) variant_samples += [ "control-sample:" + s for s in hom_ref_samples[:to_add_count] ] data_dict = { "chrom": variant.chrom, "start": variant.start, "end": variant.stop, "svtype": svtype, "svlength": variant.stop - variant.start, "samples": sample_str, "nsamples": n_samples, } if annotations: data_dict["overlaps"] = get_overlap(annotations, variant.chrom, variant.start, variant.stop) if dn_row != "": data_dict["dn"] = ",".join(is_dn) fig_path = os.path.join( args.out_dir, "{svtype}_{chrom}_{start}_{end}.{itype}".format( itype=args.output_type, **data_dict)) tabledata.append(data_dict) if "CIPOS" in variant.info: v = variant.info["CIPOS"] cipos = "--start_ci '%s,%s'" % (abs(v[0]), abs(v[1])) else: cipos = "" if "CIEND" in variant.info: v = variant.info["CIEND"] ciend = "--end_ci '%s,%s'" % (abs(v[0]), abs(v[1])) else: ciend = "" # dynamically set Z to speed drawing and remove noise for larger events z = 3 if variant.stop - variant.start > 2000: z = 4 if variant.stop - variant.start > 10000: z = 6 if variant.stop - variant.start > 20000: z = 9 if args.max_entries: bams = bams[:args.max_entries] variant_samples = variant_samples[:args.max_entries] # update titles based on FORMAT fields requested title_list = list() for variant_sample in variant_samples: if variant_sample in plot_titles: title_list.append(plot_titles[variant_sample]) else: title_list.append(variant_sample) out_file.write( "python {here}/samplot.py {extra_args} -z {z} --minq 0 -n {titles} {cipos} {ciend} {svtype} -c {chrom} -s {start} -e {end} -o {fig_path} -d 1 -b {bams}\n" .format( here=HERE, extra_args=" ".join(pass_through_args), bams=" ".join(bams), titles=" ".join(title_list), z=z, cipos=cipos, ciend=ciend, svtype="-t " + svtype if svtype != "SV" else "", fig_path=fig_path, chrom=variant.chrom, start=variant.start, end=variant.stop, )) if args.command_file: out_file.close() # update the javascript HTML = HTML.replace("[DATA]", json.dumps(tabledata)) HTML = HTML.replace("[PLOT_TYPE]", args.output_type) HTML = HTML.replace("[GFF]", "true" if annotations else "false") HTML = HTML.replace("[DENOVO]", "true" if dn_row else "false") with open("{out_dir}/index.html".format(out_dir=args.out_dir), "w") as fh: print(HTML, 
file=fh)
#! /usr/bin/env python

import sys, pysam

input_file1 = sys.argv[1]
input_file2 = sys.argv[2]
output_file = sys.argv[3]
id_file = sys.argv[4]

id_tb = pysam.TabixFile(id_file)

"""
key2id = {}
with open(id_file, 'r') as hin:
    for line in hin:
        F = line.rstrip('\n').split('\t')
        FF = F[3].split('|')
        for i in [-2, 2, -1, 1, 0]:
            for j in [-2, 2, -1, 1, 0]:
                key = F[0] + '\t' + str(int(F[1]) + i) + '\t' + str(int(F[2]) + j)
                if key not in key_list: continue
                key2id[F[0] + '\t' + F[1] + '\t' + F[2]] = FF[0]
"""

def check_id(chr, start, end):

    tabixErrorFlag = 0
    try:
        records = id_tb.fetch(chr, start - 5, end + 5)
    except Exception as inst:
def handle( self, file: str, organism: str, doi: str = None, ignore: str = None, cpu: int = 1, verbosity: int = 1, **options ): """Execute the main function.""" # retrieve only the file name filename = os.path.basename(file) if verbosity > 0: self.stdout.write("Processing file: {}".format(filename)) try: FileValidator().validate(file) except ImportingError as e: raise CommandError(e) try: index_file = "{}.tbi".format(file) FileValidator().validate(index_file) except ImportingError: try: index_file = "{}.csi".format(file) FileValidator().validate(index_file) except ImportingError: raise CommandError("No index found (.tbi/.csi)") try: feature_file = FeatureLoader( filename=filename, source="GFF_SOURCE", doi=doi ) except ImportingError as e: raise CommandError(e) pool = ThreadPoolExecutor(max_workers=cpu) tasks = list() chunk_size = cpu * 2 # Load the GFF3 file with open(file) as tbx_file: tbx = pysam.TabixFile(filename=tbx_file.name, index=index_file) for row in tqdm(tbx.fetch(parser=pysam.asGTF()), total=get_num_lines(file)): if ignore is not None and row.feature in ignore: continue tasks.append( pool.submit(feature_file.store_tabix_GFF_feature, row, organism) ) if len(tasks) >= chunk_size: for task in as_completed(tasks): try: task.result() except ImportingError as e: raise CommandError(e) tasks.clear() else: for task in as_completed(tasks): try: task.result() except ImportingError as e: raise CommandError(e) tasks.clear() pool.shutdown() if verbosity > 0: self.stdout.write("Loading relationships") pool = ThreadPoolExecutor(max_workers=cpu) tasks = list() for item in feature_file.relationships: tasks.append( pool.submit( feature_file.store_relationship, organism, item["subject_id"], item["object_id"], ) ) for task in tqdm(as_completed(tasks), total=len(tasks)): try: task.result() except ImportingError as e: raise CommandError(e) pool.shutdown() if feature_file.ignored_attrs is not None: self.stdout.write( self.style.WARNING( "Ignored attrs: {}".format(feature_file.ignored_attrs) ) ) if verbosity > 0: self.stdout.write(self.style.SUCCESS("Done with {}".format(filename)))
def allc_to_bigwig(allc_path, output_prefix, bin_size, mc_contexts, chrom_size_path, strandness): """\ Generate BigWig files from one ALLC file. Parameters ---------- allc_path {allc_path_doc} output_prefix Path prefix of the output BigWig file. bin_size {bw_bin_sizes_doc} mc_contexts {mc_contexts_doc} strandness {strandness_doc} chrom_size_path {chrom_size_path_doc} If chrom_size_path provided, will use it to extract ALLC with chrom order, but if region provided, will ignore this. """ if strandness not in {"split", "both"}: raise ValueError( f'strandness need to be "split" or "both", got "{strandness}"') chrom_sizes = parse_chrom_size(chrom_size_path) chrom_sizes_list = [(k, v) for k, v in chrom_sizes.items()] # create bigwig file handles for each case # context_handle: key is mC context pattern like CHN, CAN, CGN, value is the output handle context_handle = {} output_path_collect = {} for bw_type in ["frac", "cov"]: out_suffix = f"{bw_type}.bw" for mc_context in mc_contexts: if strandness == "split": file_path = output_prefix + f".{mc_context}-Watson.{out_suffix}" output_path_collect[(mc_context, "Watson", out_suffix)] = file_path # handle for Watson/+ strand w_handle = pyBigWig.open(file_path, "w") w_handle.addHeader(chrom_sizes_list) context_handle[(mc_context, "+", bw_type)] = w_handle file_path = output_prefix + f".{mc_context}-Crick.{out_suffix}" output_path_collect[(mc_context, "Crick", out_suffix)] = file_path # handle for Crick/- strand c_handle = pyBigWig.open(file_path, "w") c_handle.addHeader(chrom_sizes_list) context_handle[(mc_context, "-", bw_type)] = c_handle else: # handle for both strand file_path = output_prefix + f".{mc_context}-{strandness}.{out_suffix}" output_path_collect[(mc_context, strandness, out_suffix)] = file_path _handle = pyBigWig.open(file_path, "w") _handle.addHeader(chrom_sizes_list) context_handle[mc_context, bw_type] = _handle def _init_counter(_contexts, _strandness): if _strandness == "split": # a counter for +/- strand separately _counter = StrandContextCounter(_contexts) else: # a counter for both +/- strands _counter = ContextCounter(_contexts) return _counter with pysam.TabixFile(allc_path) as allc: allc_chroms = set(allc.contigs) for chrom, chrom_size in chrom_sizes.items(): if chrom not in allc_chroms: continue counter = _init_counter(mc_contexts, strandness) cur_bin = 0 for line in allc.fetch(chrom): _, pos, strand, context, mc, cov, _ = line.split("\t") pos = int(pos) mc = float(mc) cov = float(cov) this_bin = (pos - 1) // bin_size if this_bin != cur_bin: # dump cur_bin counts bin_start = int(cur_bin * bin_size) write_entry( counter=counter, context_handle=context_handle, mc_contexts=mc_contexts, strandness=strandness, chrom=chrom, bin_start=bin_start, bin_size=bin_size, ) # initiate next bin cur_bin = this_bin counter = _init_counter(mc_contexts, strandness) # add counts if strandness == "split": counter.add(context, strand, mc, cov) else: counter.add(context, mc, cov) # final bin of the chrom bin_start = int(cur_bin * bin_size) write_entry( counter=counter, context_handle=context_handle, mc_contexts=mc_contexts, strandness=strandness, chrom=chrom, bin_start=bin_start, bin_size=bin_size, ) print(chrom, "finished") for handle in context_handle.values(): handle.close() return output_path_collect
def annotate(in_vcf_gz_path: str, out_vcf_path: str, annot_bed_path: str):
    chroms = [f"chr{n}" for n in range(1, 23)]

    with pysam.TabixFile(in_vcf_gz_path) as in_vcf_file, pysam.TabixFile(
            annot_bed_path) as annot_bed_file, open(out_vcf_path, "w") as out_vcf_file:
        # Make and write headers
        vcf_headers = in_vcf_file.header
        annot_key_str = annot_bed_file.header[0].split("=")[1]
        annot_info_header = f"##INFO=<ID=ANNOT,Key={annot_key_str}>"
        vcf_headers.append(annot_info_header)
        vcf_headers[-1], vcf_headers[-2] = (
            vcf_headers[-2],
            vcf_headers[-1],
        )  # Swap

        for vcf_header in vcf_headers:
            print(vcf_header, file=out_vcf_file)

        # Annotate by the input BED file
        for chrom in chroms:
            var_iter = in_vcf_file.fetch(chrom, parser=pysam.asTuple())
            bed_iter = annot_bed_file.fetch(chrom, parser=pysam.asTuple())
            bed_memory = deque()
            variant = next(var_iter, None)

            while variant is not None:
                var_pos = int(variant[1]) - 1  # 1-based -> 0-based
                var_ref = variant[3]
                var_alt = variant[4]

                # Determine a search region for BED coordinates
                if len(var_ref) == 1:  # Insertion or substitution
                    region_start = var_pos
                    region_end = (var_pos + 2 if len(var_alt) > 1 else var_pos + 1)
                else:  # Deletion
                    region_start = var_pos + 1
                    region_end = region_start + len(var_ref) - 1

                # Get an annotation integer
                annot_int = 0
                stop_bed_iter = False

                # 1. Check the memory of previously checked BED coordinates
                while (len(bed_memory) > 0
                       and int(bed_memory[0][2]) <= region_start):
                    # Remove non-overlapped coordinates
                    bed_memory.popleft()

                for bed in bed_memory:
                    if int(bed[1]) < region_end:  # Overlap
                        annot_int |= int(bed[3])
                    else:
                        stop_bed_iter = True
                        break

                # 2. Continuously iterate over the BED coordinates and check
                if not stop_bed_iter:
                    bed = next(bed_iter, None)

                    while bed is not None:
                        bed_start = int(bed[1])
                        bed_end = int(bed[2])

                        if region_start < bed_end:
                            bed_memory.append(bed)
                            if bed_start < region_end:  # Overlap
                                annot_int |= int(bed[3])
                            else:
                                break

                        bed = next(bed_iter, None)

                print(str(variant) + f";ANNOT={annot_int}", file=out_vcf_file)
                variant = next(var_iter, None)
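# Hypothetical invocation of annotate() above; the paths are placeholders, and both
# inputs are assumed to be bgzip-compressed and tabix-indexed (.tbi present), with the
# BED header carrying the "Key=..." description the function expects.
# annotate("sample.vcf.gz", "sample.annotated.vcf", "annotations.bed.gz")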
def run(args): import snpCaller, indelCaller pool = mp.Pool(processes=args.cpu) if not args.output: args.output = os.getcwd() os.makedirs(args.output, exist_ok=True) end = None if not args.end: try: with open(args.ref + '.fai', 'r') as file: for line in file: if line.split('\t')[0] == args.chrom: end = int(line.split('\t')[1]) if end == None: print('%s: contig %s not found in reference.' % (str(datetime.datetime.now()), args.chrom), flush=True) sys.exit(2) except FileNotFoundError: print('%s: Index file .fai required for reference genome file' % (str(datetime.datetime.now())), flush=True) sys.exit(2) else: end = args.end if not args.start: start = 1 else: start = args.start threshold = [ float(args.neighbor_threshold.split(',')[0]), float(args.neighbor_threshold.split(',')[1]) ] dirname = os.path.dirname(__file__) if args.exclude_bed in ['hg38', 'hg19', 'mm10', 'mm39']: args.exclude_bed = os.path.join( dirname, 'release_data/bed_files/%s_centro_telo.bed.gz' % args.exclude_bed) if args.include_bed: tbx = pysam.TabixFile(args.include_bed) include_intervals = IntervalTree( Interval(int(row[1]), int(row[2]), "%s" % (row[1])) for row in tbx.fetch(args.chrom, parser=pysam.asBed())) include_intervals = IntervalTree(include_intervals.overlap(start, end)) if include_intervals: start = max(start, min(x[0] for x in include_intervals)) end = min(end, max(x[1] for x in include_intervals)) else: print( '%s: No overlap between include_bed file and start/end coordinates' % (str(datetime.datetime.now())), flush=True) return in_dict={'chrom':args.chrom, 'start':start, 'end':end, 'sam_path':args.bam, 'fasta_path':args.ref, \ 'mincov':args.mincov, 'maxcov':args.maxcov, 'min_allele_freq':args.min_allele_freq, 'min_nbr_sites':args.min_nbr_sites, \ 'threshold':threshold, 'snp_model':args.snp_model, 'cpu':args.cpu, 'vcf_path':args.output,'prefix':args.prefix,'sample':args.sample, \ 'seq':args.sequencing, 'supplementary':args.supplementary, 'include_bed':args.include_bed, 'exclude_bed':args.exclude_bed} snp_vcf = '' if args.mode in ['snps', 'snps_unphased', 'both']: snp_time = time.time() snp_vcf = snpCaller.test_model(in_dict, pool) print('\n%s: SNP calling completed for contig %s. 
Time taken= %.4f\n' % (str(datetime.datetime.now()), in_dict['chrom'], time.time() - snp_time), flush=True) if snp_vcf and args.mode in ['snps', 'both']: enable_whatshap = '--distrust-genotypes --include-homozygous' if args.enable_whatshap else '' print('\n%s: ------WhatsHap SNP phasing log------\n' % (str(datetime.datetime.now())), flush=True) run_cmd( "whatshap phase %s.vcf.gz %s -o %s.phased.preclean.vcf -r %s --ignore-read-groups --chromosome %s %s" % (snp_vcf, in_dict['sam_path'], snp_vcf, in_dict['fasta_path'], in_dict['chrom'], enable_whatshap), verbose=True) run_cmd( "bcftools view -e 'GT=\"0\\0\"' %s.phased.preclean.vcf|bgziptabix %s.phased.vcf.gz" % (snp_vcf, snp_vcf)) print('\n%s: ------SNP phasing completed------\n' % (str(datetime.datetime.now())), flush=True) if args.mode == 'both' or args.phase_bam: print('\n%s: ------WhatsHap BAM phasing log------\n' % (str(datetime.datetime.now())), flush=True) run_cmd( "whatshap haplotag --ignore-read-groups --ignore-linked-read -o %s.phased.bam --reference %s %s.phased.vcf.gz %s --regions %s:%d:%d --tag-supplementary" % (snp_vcf, in_dict['fasta_path'], snp_vcf, in_dict['sam_path'], args.chrom, start, end), verbose=True) run_cmd('samtools index %s.phased.bam' % snp_vcf) print('\n%s: ------BAM phasing completed-----\n' % (str(datetime.datetime.now())), flush=True) else: return if args.mode in ['indels', 'both']: sam_path = '%s.phased.bam' % snp_vcf if args.mode == 'both' else args.bam in_dict={'chrom':args.chrom, 'start':start, 'end':end, 'sam_path':sam_path, 'fasta_path':args.ref, \ 'mincov':args.mincov, 'maxcov':args.maxcov, 'min_allele_freq':args.min_allele_freq, 'min_nbr_sites':args.min_nbr_sites, \ 'threshold':threshold, 'snp_model':args.snp_model,'indel_model':args.indel_model, 'cpu':args.cpu, 'vcf_path':args.output,'prefix':args.prefix,'sample':args.sample, 'seq':args.sequencing, \ 'del_t':args.del_threshold,'ins_t':args.ins_threshold,'supplementary':args.supplementary, 'include_bed':args.include_bed\ , 'exclude_bed':args.exclude_bed,'win_size':args.win_size,'small_win_size':args.small_win_size} ind_time = time.time() indel_vcf = indelCaller.test_model(in_dict, pool) print('%s: Post processing' % (str(datetime.datetime.now())), flush=True) run_cmd('samtools faidx %s %s > %s/%s.fa' % (args.ref, args.chrom, args.output, args.chrom)) remove_path('%s/ref.sdf' % args.output) run_cmd('rtg RTG_MEM=4G format -f fasta %s/%s.fa -o %s/ref.sdf' % (args.output, args.chrom, args.output)) remove_path('%s.vcf.gz' % indel_vcf) run_cmd( 'rtg RTG_MEM=4G vcfdecompose -i %s.raw.vcf.gz --break-mnps -o - -t %s/ref.sdf|rtg RTG_MEM=4G vcffilter -i - --non-snps-only -o %s.vcf.gz' % (indel_vcf, args.output, indel_vcf)) print('%s: Indel calling completed for contig %s. Time taken= %.4f' % (str(datetime.datetime.now()), in_dict['chrom'], time.time() - ind_time), flush=True) if args.mode == 'both': if not args.keep_bam: os.remove('%s.phased.bam' % snp_vcf) final_path = os.path.join(args.output, '%s.final.vcf.gz' % args.prefix) run_cmd( 'bcftools concat %s.phased.vcf.gz %s.vcf.gz -a -d all |bgziptabix %s' % (snp_vcf, indel_vcf, final_path)) pool.close() pool.join()
def main(argv=sys.argv): parser = E.OptionParser(version="%prog version: $Id$", usage=globals()["__doc__"]) parser.add_option( "-i", "--input-vcf", dest="input_vcf_file", type="string", help="input vcf file") parser.add_option( "-f", "--input-fasta", dest="input_fasta_file", type="string", help="input fasta file. faidx indexed reference sequence file to " "determine INDEL context [%default]") parser.add_option( "-e", "--input-bed", dest="input_bed_file", type="string", help="input file with intervals. Tab-delimited file of intervals " "in bed format to restrict analysis to. [%default]") parser.add_option( "-r", "--region", dest="region", type="string", help="Region string to restrict analysis to. Takes precedence " "over --input-bed. [%default]") parser.add_option( "-m", "--method", dest="methods", action="append", type="choice", choices=("mutational-signature", "mutational-signature-profile", "kinship", "format-distribution", "gc-context", "gc-depth-profile"), help="methods to apply [%default]") parser.add_option( "--format-distribution", dest="format_distributions", action="append", type="string", help="format to compute histograms on. Option can specified multiple times. " "At the moment, only integer metrics are supported [%default]") parser.add_option( "--format-distribution-nbins", dest="format_distributions_nbins", type="int", help="number of bins to use for histograms [%default]") parser.add_option( "--only-variant-positions", dest="only_variant_positions", action="store_true", help="only use variant positions [%default]") parser.add_option( "--gc-window-size", dest="gc_window_size", type="int", help="(half) window size to use for G+C computation. A size " "of 50 means that 50 bases on either side of the variant are " "used to compute the G+C content [%default]") parser.set_defaults( methods=[], input_vcf_file=None, input_bed_file=None, region=None, input_fasta_file=None, format_distributions=[], format_distribution_nbins=1000, gc_window_size=50, report_step=1000000, ) (options, args) = E.start(parser, argv, add_output_options=True) if len(args) == 1: options.input_vcf_file = args[0] if options.input_vcf_file is None: raise ValueError("please supply a VCF file") if options.input_fasta_file is None: raise ValueError("please supply a FASTA file") if "format-distribution" in options.methods and not options.format_distributions: raise ValueError("please supply at least one FORMAT field (DP, GQ) " "when --method=format-distribution has been selected") if not os.path.exists(options.input_vcf_file): raise OSError("input vcf file {} does not exist".format( options.input_vcf_file)) if not os.path.exists(options.input_vcf_file + ".tbi") and not \ os.path.exists(options.input_vcf_file + ".csi"): raise OSError("input vcf file {} needs to be indexed".format( options.input_vcf_file)) if not os.path.exists(options.input_fasta_file): raise OSError("input fasta file {} does not exist".format( options.input_fasta_file)) if not os.path.exists(options.input_fasta_file + ".fai"): raise OSError("input fasta file {} needs to be indexed".format( options.input_fasta_file)) # update paths to absolute options.input_fasta_file = os.path.abspath(options.input_fasta_file) options.input_vcf_file = os.path.abspath(options.input_vcf_file) # catch issue with empty variant files try: vcf_in = pysam.VariantFile(options.input_vcf_file) except (OSError, ValueError): E.warn("could not open variant file - likely to be empty") E.stop() return 0 fasta_in = pysam.FastaFile(options.input_fasta_file) if 
options.input_bed_file: if not os.path.exists(options.input_bed_file): raise OSError("input bed file {} does not exist".format( options.input_bed_file)) bed_in = pysam.TabixFile(options.input_bed_file) else: bed_in = None vcf2stats_count( vcf_in, fasta_in, bed_in, options) E.stop()
def main(argv=None): if argv is None: argv = sys.argv parser = E.OptionParser(version="%prog version: $Id$", usage=globals()["__doc__"]) parser.add_option("-i", "--input-fastq-file", dest="input_fastq_file", type="string", help="input fastq file. " "[%default]") parser.add_option("-m", "--method", dest="method", type="choice", choices=("read-variant", "depth-vcf", "read-list", "coverage-vcf"), help="method to apply [%default]") parser.add_option( "-e", "--input-bed", dest="input_bed_file", type="string", help="input file with intervals. Tab-delimited file of intervals " "in bed format to restrict analysis to. [%default]") parser.add_option( "-r", "--region-string", dest="region_string", type="string", help="region string. Only apply method in specified region. " "[%default]") parser.add_option("-f", "--reference-fasta-file", dest="reference_fasta_file", help="reference genomic sequence in fasta format. " "[%default]") parser.add_option("-s", "--stepper", dest="stepper", type="choice", choices=("nofilter", "samtools", "all")) parser.set_defaults(method="read-variant", reference_fasta_file=None, input_bed_file=None, regex_sample_name="([^/]+).bam", stepper="nofilter", region_string=None) # add common options (-h/--help, ...) and parse command line (options, args) = E.start(parser, argv=argv, add_output_options=True) pysam_in = pysam.AlignmentFile(args[0], "rb") if options.input_bed_file: if not os.path.exists(options.input_bed_file): raise OSError("input bed file {} does not exist".format( options.input_bed_file)) bed_in = pysam.TabixFile(options.input_bed_file) else: bed_in = None if options.region_string is not None: itr = generate_from_region(pysam_in, options.region, stepper=options.stepper) elif bed_in is not None: itr = generate_from_bed(pysam_in, bed_in, stepper=options.stepper) else: itr = generate_from_bam(pysam_in, stepper=options.stepper) reference_fasta = pysam.FastaFile(options.reference_fasta_file) outf = options.stdout c = E.Counter() if options.method == "read-variant": outf.write("chromosome\tposition\tref\ttypes\n") for pileupcolumn in itr: c.positions_pileup += 1 reference_base = reference_fasta.fetch( pileupcolumn.reference_name, pileupcolumn.reference_pos, pileupcolumn.reference_pos + 1) matches = [] bases = set() for read in pileupcolumn.pileups: qpos = read.query_position if qpos is not None: base = read.alignment.query_sequence[qpos] else: base = "-" matches.append((base, read.alignment.query_name)) bases.add(base) bases = list(bases) if len(bases) == 1: c.position_noninformative += 1 if bases[0] == reference_base: c.position_reference += 1 continue c.position_informative += 1 d = {} for base in bases: d[base] = ",".join([x[1] for x in matches if x[0] == base]) outf.write("{}\t{}\t{}\t{}\n".format(pileupcolumn.reference_name, pileupcolumn.reference_pos, reference_base, json.dumps(d))) elif options.method in ("depth-vcf", "coverage-vcf"): if options.regex_sample_name: sample_name = re.search(options.regex_sample_name, args[0]).groups()[0] else: sample_name = "unknown" outf.write("##fileformat=VCFv4.1\n") outf.write("##FORMAT=<ID=GT,Number=1,Type=String," "Description=\"Genotype\">\n") outf.write("##FORMAT=<ID=DP,Number=1,Type=Integer," "Description=\"Genotype\">\n") outf.write("#CHROM\tPOS\tID\tREF\tALT\tQUAL\t" "FILTER\tINFO\tFORMAT\t{}\n".format(sample_name)) is_depth = options.method == "depth-vcf" for idx, pileupcolumn in enumerate(itr): if idx % 1000 == 0: E.info("processed {} positions".format(idx)) reference_base = reference_fasta.fetch( 
pileupcolumn.reference_name, pileupcolumn.reference_pos, pileupcolumn.reference_pos + 1).upper() if reference_base == 'A': alt_base = 'C' else: alt_base = 'A' if is_depth: n = sum([ 1 for x in pileupcolumn.pileups if not (x.is_del or x.is_refskip) ]) else: n = pileupcolumn.n outf.write("{}\t{}\t.\t{}\t{}\t.\tPASS\t.\tGT:DP\t0/1:{}\n".format( pileupcolumn.reference_name, pileupcolumn.reference_pos, reference_base, alt_base, n)) elif options.method == "read-list": outf.write( "chromosome\tposition\treference_base\tbase\tquality\tquery_name\n" ) for pileupcolumn in itr: reference_base = reference_fasta.fetch( pileupcolumn.reference_name, pileupcolumn.reference_pos, pileupcolumn.reference_pos + 1) matches = [] for read in pileupcolumn.pileups: qpos = read.query_position if qpos is not None: base = read.alignment.query_sequence[qpos] quality = read.alignment.query_qualities[qpos] else: base = "-" quality = "" outf.write("{}\t{}\t{}\t{}\t{}\t{}\n".format( pileupcolumn.reference_name, pileupcolumn.reference_pos, reference_base, base, quality, read.alignment.query_name)) E.info(c) # write footer and output benchmark information. E.stop()
def __init__(self, path, binned):
    self._tabixfile = pysam.TabixFile(path)
    self._binned = binned
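# A minimal self-contained sketch of a wrapper class built around the __init__ above;
# the class name and the fetch()/close() helpers are assumptions, not part of the
# original source.
import pysam

class TabixReader:
    def __init__(self, path, binned):
        self._tabixfile = pysam.TabixFile(path)
        self._binned = binned

    def fetch(self, chrom, start, end):
        # Yield raw tab-delimited records overlapping the query interval.
        yield from self._tabixfile.fetch(chrom, start, end)

    def close(self):
        self._tabixfile.close()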
def gm_main(path, region_file):
    simple_file_name = ''
    simple_file_name = os.path.basename(region_file)
    simple_file_name = os.path.splitext(simple_file_name)[0]
    regions = get_chrm_start_end(region_file)
    #print(regions[0]

    # for each file in path
    for bed_file in os.listdir(path):
        if bed_file.endswith('bed.gz'):
            # file outputs
            # intermediate quality
            intermediates_path = '/scratch/Shares/layer/nextflow/kristen/fastq_to_vcf/mpileup/chco-exome-analysis/intermediates'
            intermediate_Q_name = bed_file + simple_file_name + 'quality_intermediate.txt'
            intermediate_Q_txt = open(os.path.join(intermediates_path, intermediate_Q_name), 'a')
            intermediate_Q_txt.truncate(0)

            intermediate_SB_name = bed_file + simple_file_name + 'strand_bias_intermediate.txt'
            intermediate_SB_txt = open(os.path.join(intermediates_path, intermediate_SB_name), 'a')
            intermediate_SB_txt.truncate(0)

            # final quality
            finals_path = '/scratch/Shares/layer/nextflow/kristen/fastq_to_vcf/mpileup/chco-exome-analysis/final_metrics'
            final_Q_name = bed_file + simple_file_name + 'quality_final.txt'
            final_Q_txt = open(os.path.join(finals_path, final_Q_name), 'a')
            final_Q_txt.truncate(0)

            final_SB_name = bed_file + simple_file_name + 'strand_bias_final.txt'
            final_SB_txt = open(os.path.join(finals_path, final_SB_name), 'a')
            final_SB_txt.truncate(0)

            for r in range(len(regions)):
                chrm = regions[r][0]
                start = int(regions[r][1])
                end = int(regions[r][2])
                tbx = pysam.TabixFile(path + '/' + bed_file)

                # list to hold mpileup quality counts
                quality = get_quality(tbx, chrm, start, end)
                for q in quality:
                    print(q[0], '\t', q[1], '\t', q[2], '\t', np.average(q[3]), file=intermediate_Q_txt)

                reads = get_reads(tbx, chrm, start, end)
                counts = get_counts(reads)
                strand_bias = get_strandbias(counts)
                if strand_bias == -1:
                    pass
                else:
                    for sb in strand_bias:
                        print(sb[0][0], '\t', sb[0][1], '\t', sb[0][2], '\t', sb[1], file=intermediate_SB_txt)

            intermediate_Q_txt.close()
            intermediate_SB_txt.close()

            geno_to_exo_main(region_file,
                             os.path.join(intermediates_path, intermediate_Q_name),
                             os.path.join(finals_path, final_Q_name))
            geno_to_exo_main(region_file,
                             os.path.join(intermediates_path, intermediate_SB_name),
                             os.path.join(finals_path, final_SB_name))
        else:
            continue
def filterNonMatchControl(inputFilePath, outputFilePath, controlFile, matchedNormal, controlPanel_num_thres, controlPanel_check_margin): """ script for removing candidate in which non-matched normals have the junction reads """ hIN = open(inputFilePath, 'r') hOUT = open(outputFilePath, 'w') use_control = True if controlFile != "" else False if use_control == True: tabixfile = pysam.TabixFile(controlFile) tabixErrorMsg = "" for line in hIN: F = line.rstrip('\n').split('\t') controlFlag = 0 max_control_sample = "---" max_control_num = 0 if use_control == True: inseqSize = (0 if F[7] == "---" else len(F[7])) #################### # get the records for control junction data for the current position tabixErrorFlag = 0 try: records = tabixfile.fetch( F[0], int(F[1]) - controlPanel_check_margin, int(F[2]) + controlPanel_check_margin) except Exception as inst: # print >> sys.stderr, "%s: %s" % (type(inst), inst.args) tabixErrorMsg = str(inst.args) tabixErrorFlag = 1 #################### #################### # for each record in control junction extracted, check the consistency with the current junction # max_control_sample = "---" # max_control_num = 0 if tabixErrorFlag == 0: for record_line in records: record = record_line.split('\t') if F[0] == record[0] and F[3] == record[3] and F[ 8] == record[8] and F[9] == record[9]: flag = 0 # detailed check on the junction position considering inserted sequences if F[8] == "+": expectedDiffSize = (int(F[2]) - int(record[2])) + ( inseqSize - int(record[7])) if (F[9] == "+" and int(F[5]) == int(record[5]) - int(expectedDiffSize)) or ( F[9] == "-" and int(F[5]) == int(record[5]) + int(expectedDiffSize)): flag = 1 else: expectedDiffSize = (int(F[2]) - int(record[2])) + ( int(record[7]) - inseqSize) if (F[9] == "+" and int(F[5]) == int(record[5]) + int(expectedDiffSize)) or ( F[9] == "-" and int(F[5]) == int(record[5]) - int(expectedDiffSize)): flag = 1 # if position relationship including inserted sequences matches if flag == 1: controlSamples = record[10].split(';') controlNums = record[11].split(';') for i in range(0, len(controlSamples)): if controlSamples[i] == matchedNormal: continue if int(controlNums[i]) > max_control_num: max_control_sample = controlSamples[i] max_control_num = int(controlNums[i]) if int(controlNums[i]) >= int( controlPanel_num_thres): controlFlag = 1 """ # if controlSamples[i] != matchedNormal is not None and int(controlNums[i]) >= int(controlPanel_num_thres): # if controlSamples[i] != matchedNormal and int(controlNums[i]) >= int(supportReadThres): controlFlag = 1 if int(controlNums[i]) > max_control_num: max_control_sample = controlSamples[i] max_control_num = int(controlNums[i]) """ #################### if controlFlag == 0: print >> hOUT, "\t".join( F) + '\t' + max_control_sample + '\t' + str(max_control_num) if tabixErrorMsg != "": utils.warningMessage( "One or more error occured in tabix file fetch, e.g.: " + tabixErrorMsg) hIN.close() hOUT.close() if use_control == True: tabixfile.close()
def standardizeSNPsV2(variantlist, regiontxt, build): """ Input: Variant names in any of these formats: rsid, chrom_pos_ref_alt, chrom:pos_ref_alt, chrom:pos_ref_alt_b37/b38 Output: chrom_pos_ref_alt_b37/b38 variant ID format, but looks at GTEx variant lookup table first. In the case of multi-allelic variants (e.g. rs2211330(T/A,C)), formats such as 1_205001063_T_A,C_b37 are accepted If variant ID format is chr:pos, and the chr:pos has a unique biallelic SNV, then it will be assigned that variant """ if all(x=='.' for x in variantlist): raise InvalidUsage('No variants provided') if np.nan in variantlist: raise InvalidUsage('Missing variant IDs detected in row(s): ' + str([ i+1 for i,x in enumerate(variantlist) if str(x) == 'nan' ])) # Ensure valid region: chrom, startbp, endbp = parseRegionText(regiontxt, build) chrom = str(chrom).replace('23',"X") # Load GTEx variant lookup table for region indicated db = client.GTEx_V7 rsid_colname = 'rs_id_dbSNP147_GRCh37p13' if build.lower() in ["hg38", "grch38"]: db = client.GTEx_V8 rsid_colname = 'rs_id_dbSNP151_GRCh38p7' collection = db['variant_table'] variants_query = collection.find( { '$and': [ { 'chr': int(chrom.replace('X','23')) }, { 'variant_pos': { '$gte': int(startbp), '$lte': int(endbp) } } ]} ) variants_list = list(variants_query) variants_df = pd.DataFrame(variants_list) variants_df = variants_df.drop(['_id'], axis=1) # Load dbSNP151 SNP names from region indicated dbsnp_filepath = '' suffix = 'b37' if build.lower() in ["hg38", "grch38"]: suffix = 'b38' dbsnp_filepath = os.path.join(MYDIR, 'data', 'dbSNP151', 'GRCh38p7', 'All_20180418.vcf.gz') else: suffix = 'b37' dbsnp_filepath = os.path.join(MYDIR, 'data', 'dbSNP151', 'GRCh37p13', 'All_20180423.vcf.gz') # Load dbSNP file #delayeddf = delayed(pd.read_csv)(dbsnp_filepath,skiprows=getNumHeaderLines(dbsnp_filepath),sep='\t') #dbsnp = dd.from_delayed(delayeddf) tbx = pysam.TabixFile(dbsnp_filepath) print('Compiling list of known variants in the region from dbSNP151') chromcol = [] poscol = [] idcol = [] refcol = [] altcol = [] variantid = [] # in chr_pos_ref_alt_build format rsids = dict({}) # a multi-allelic variant rsid (key) can be represented in several variantid formats (values) for row in tbx.fetch(str(chrom), startbp, endbp): rowlist = str(row).split('\t') chromi = rowlist[0].replace('chr','') posi = rowlist[1] idi = rowlist[2] refi = rowlist[3] alti = rowlist[4] varstr = '_'.join([chromi, posi, refi, alti, suffix]) chromcol.append(chromi) poscol.append(posi) idcol.append(idi) refcol.append(refi) altcol.append(alti) variantid.append(varstr) rsids[idi] = [varstr] altalleles = alti.split(',') # could have more than one alt allele (multi-allelic) if len(altalleles)>1: varstr = '_'.join([chromi, posi, refi, altalleles[0], suffix]) rsids[idi].append(varstr) for i in np.arange(len(altalleles)-1): varstr = '_'.join([chromi, posi, refi, altalleles[i+1], suffix]) rsids[idi].append(varstr) print('Cleaning and mapping list of variants') variantlist = [asnp.split(';')[0].replace(':','_').replace('.','') for asnp in variantlist] # cleaning up the SNP names a bit stdvariantlist = [] for variant in variantlist: if variant == '': stdvariantlist.append('.') continue variantstr = variant.replace('chr','') if re.search("^23_",variantstr): variantstr = variantstr.replace('23_','X_',1) if variantstr.startswith('rs'): try: # Here's the difference from the first function version (we look at GTEx first) if variant in list(variants_df[rsid_colname]): stdvar = variants_df['variant_id'].loc[ 
variants_df[rsid_colname] == variant].to_list()[0] stdvariantlist.append(stdvar) else: stdvariantlist.append(rsids[variantstr][0]) except: stdvariantlist.append('.') elif re.search("^\d+_\d+_[A,T,G,C]+_[A,T,C,G]+,*", variantstr.replace('X','23')): strlist = variantstr.split('_') strlist = list(filter(None, strlist)) # remove empty strings try: achr, astart, aend = parseRegionText(strlist[0]+":"+strlist[1]+"-"+str(int(strlist[1])+1), build) achr = str(achr).replace('23','X') if achr == str(chrom) and astart >= startbp and astart <= endbp: variantstr = variantstr.replace("_"+str(suffix),"") + "_"+str(suffix) if len(variantstr.split('_')) == 5: stdvariantlist.append(variantstr) else: raise InvalidUsage(f'Variant format not recognizable: {variant}. Is it from another coordinate build system?', status_code=410) else: stdvariantlist.append('.') except: raise InvalidUsage(f'Problem with variant {variant}', status_code=410) elif re.search("^\d+_\d+_*[A,T,G,C]*", variantstr.replace('X','23')): strlist = variantstr.split('_') strlist = list(filter(None, strlist)) # remove empty strings try: achr, astart, aend = parseRegionText(strlist[0]+":"+strlist[1]+"-"+str(int(strlist[1])+1), build) achr = str(achr).replace('23','X') if achr == str(chrom) and astart >= startbp and astart <= endbp: if len(strlist)==3: aref=strlist[2] else: aref='' stdvariantlist.append(fetchSNV(achr, astart, aref, build)) else: stdvariantlist.append('.') except: raise InvalidUsage(f'Problem with variant {variant}', status_code=410) else: raise InvalidUsage(f'Variant format not recognized: {variant}', status_code=410) return stdvariantlist
def get_part_from_gtf(annotation, reference=None, feature="CDS"):
    tabixfile = pysam.TabixFile(annotation, parser=pysam.asGTF())
    return [gtf for gtf in tabixfile.fetch(reference=reference)
            if gtf.feature == feature]
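# Example use of get_part_from_gtf() above, assuming "annotation.gtf.gz" is a sorted,
# bgzip-compressed GTF with a tabix index, that its contigs are named "chr1" etc., and
# that its records carry a gene_id attribute; the file name is a placeholder.
import pysam

cds_records = get_part_from_gtf("annotation.gtf.gz", reference="chr1", feature="CDS")
gene_ids = {rec.gene_id for rec in cds_records}
print(len(cds_records), "CDS records across", len(gene_ids), "genes")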
def main(argv):
    prog = "paleomix vcf_to_fasta"
    usage = "%s [options] --genotype in.vcf --intervals in.bed" % (prog, )

    parser = argparse.ArgumentParser(prog=prog, usage=usage)
    parser.add_argument(
        "--genotype",
        required=True,
        metavar="VCF",
        help="Tabix indexed VCF file; by default the first "
        "sample is used in multi-sample VCFs. Use "
        "--nth-sample option to select another sample.",
    )
    parser.add_argument(
        "--nth-sample",
        default=1,
        type=int,
        metavar="NTH",
        help="Use Nth sample from the VCF, with the first "
        "sample numbered '1' [default: %(default)s].",
    )
    parser.add_argument(
        "--intervals",
        metavar="BED",
        help="Six column BED file; sequences on the same "
        "contig with the same name are assumed to "
        "represent the same gene, and are merged into a "
        "single contiguous FASTA sequence.",
    )
    parser.add_argument(
        "--padding",
        type=int,
        default=10,
        help="Number of bases to expand intervals, when "
        "checking for adjacent indels [%(default)s]",
    )
    parser.add_argument(
        "--whole-codon-indels-only",
        action="store_true",
        default=False,
        help="If true, only indels where (length %% 3) == 0 "
        "are retained [%(default)s]",
    )
    parser.add_argument(
        "--ignore-indels",
        action="store_true",
        default=False,
        help="Do not include indels in the generated FASTA "
        "sequence [%(default)s].",
    )

    opts = parser.parse_args(argv)

    print("Running vcf_to_fasta", end="", file=sys.stderr)
    if opts.whole_codon_indels_only:
        print(", assuming sequences represent CDS", end="", file=sys.stderr)
    print(file=sys.stderr)

    if not os.path.exists(opts.genotype):
        sys.stderr.write("ERROR: VCF file does not exist.\n")
        return 1
    elif not os.path.exists(opts.genotype + ".tbi"):
        sys.stderr.write("ERROR: VCF file not tabix indexed.\n")
        sys.stderr.write('       To index, run "tabix -p vcf <filename>".\n')
        return 1
    elif opts.nth_sample < 1:
        sys.stderr.write("ERROR: --nth-sample uses 1-based offsets, zero and\n")
        sys.stderr.write("       negative values are not allowed!\n")
        return 1

    # Relevant VCF functions use zero-based offsets
    opts.nth_sample -= 1

    genotype = pysam.TabixFile(opts.genotype)

    if opts.intervals is None:
        intervals = parse_intervals(genotype)
    else:
        intervals = read_intervals(opts.intervals)
        if intervals is None:
            return 1

    if not check_nth_sample(opts, genotype):
        return 1

    return genotype_genes(opts, intervals, genotype)
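# Hypothetical command line for the vcf_to_fasta entry point above (file names are
# placeholders); the VCF must be bgzip-compressed and tabix-indexed:
#
#   paleomix vcf_to_fasta --genotype sample.vcf.gz --intervals genes.bed --nth-sample 1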
def annotate_peaks(peaks, gtf_gz, gtf_index, cfg_dict, q, idx, attributes, logger_options):
    """
    Input:
        peaks (list): List of dictionaries containing information on peaks to annotate (see function 'annotate_single_peak')
        gtf_gz (str): Path to gtf.gz file
        gtf_index (str): Path to gtf.gz index
        cfg_dict (dict): The loaded config containing queries
        q (Queue): The queue to put annotations into
        idx (int): The order in which the annotations should be written to output
        attributes (list): A list of attribute columns to write to output
        logger_options (dict): A dict for initializing UROPALogger
    """

    logger = UROPALogger(**logger_options)

    # Open tabix file
    tabix_obj = pysam.TabixFile(gtf_gz, index=gtf_index)

    # For each peak in input peaks, collect all_valid_annotations
    logger.debug("Annotating peaks in chunk {0}".format(idx))
    all_valid_annotations = []
    for peak in peaks:

        # Annotate single peak
        valid_annotations = annotate_single_peak(peak, tabix_obj, cfg_dict, logger=logger)
        all_valid_annotations.extend(valid_annotations)

    tabix_obj.close()

    # Write annotations to best hits and final hits
    logger.debug("Annotated all peaks in chunk {0}. Now adding contents to queue...".format(idx))

    content = "\n".join([annopeak_to_string(peak, attributes=attributes)
                         for peak in all_valid_annotations]) + "\n"
    q.put(("allhits.bed", idx, content))
    q.put(("allhits.txt", idx, content))
    content = ""

    finalhits_content = "\n".join([annopeak_to_string(peak, attributes=attributes)
                                   for peak in all_valid_annotations
                                   if peak.get("best_hit", 0) == 1]) + "\n"
    q.put(("finalhits.bed", idx, finalhits_content))
    q.put(("finalhits.txt", idx, finalhits_content))
    finalhits_content = ""

    ## Hits per query if chosen
    if cfg_dict["output_by_query"] == True:
        query_names = [query["name"] for query in cfg_dict["queries"]]
        for name in query_names:
            query_str = "\n".join([annopeak_to_string(peak, attributes=attributes)
                                   for peak in all_valid_annotations
                                   if peak.get("query_name", "") == name]) + "\n"
            q.put((name + ".bed", idx, query_str))
            q.put((name + ".txt", idx, query_str))

    logger.debug("Job finished for chunk {0}".format(idx))

    return (0)  # success
def main():
    # arguments
    parser = argparse.ArgumentParser()
    parser.add_argument('-vcf',
                        help='Allsites vcf to apply filters to and get callable sites',
                        required=True)
    parser.add_argument('-bed', '--bed_repeats',
                        help='BED file with repeat regions listed',
                        required=True)
    parser.add_argument('-ar_bed', '--ar_bed',
                        help='BED file of ancestral repeats',
                        default='None')
    parser.add_argument('-DF', '--DepthFilter',
                        help='Defines abnormal depth eg) 2 means abnormal depth is twice and half the mean depth',
                        default=2.0, type=float)
    parser.add_argument('-mean_depth', '--mean_depth',
                        help='Mean coverage depth of samples',
                        default=44.0)
    parser.add_argument('-N', '--no_individuals',
                        help='Number of individuals in allsites VCF',
                        type=float, default=10.0)
    parser.add_argument('-chr',
                        help='Specifies chromosome to extract callable sites for, if ALL will run a job for each, '
                             '-chr ALL can only be specified in conjunction with -sub',
                        default='ALL')
    parser.add_argument('-pol',
                        help='If specified will check if site can be polarised, takes a wga bed file',
                        default='None')
    parser.add_argument('-out', help='Output directory and prefix', required=True)
    parser.add_argument('-evolgen', help='If specified will run on lab queue',
                        action='store_true', default=False)
    parser.add_argument('-sub', help='If specified will submit itself to cluster',
                        action='store_true', default=False)
    args = parser.parse_args()

    # variables
    all_sites = args.vcf
    repeat_bed = args.bed_repeats
    line_bed = args.ar_bed
    filter_factor = args.DepthFilter
    all_data_mean_depth = float(args.mean_depth)
    no_indiv = args.no_individuals
    chromosome = args.chr
    pol = args.pol
    out = args.out
    fasta_out = out + '.' + chromosome + '.fa'
    evolgen = args.evolgen

    # submission loop
    if args.sub is True:
        if chromosome == 'ALL':
            # gen chromo list and submit job for each
            grep_cmd = ('zcat ' + all_sites + ' | head -n 20000 | grep ^##contig | '
                        'cut -d "," -f 1 | cut -d "=" -f 3 | grep -v ^NODE')
            chromo_list = subprocess.Popen(grep_cmd, stdout=subprocess.PIPE, shell=True)\
                .communicate()[0].split('\n')[:-1]
            output_fasta_list = []
            jid_list = []
            for chromo in chromo_list:
                output_fasta_list.append(out + '.' + chromo + '.fa')
                jid = 'callsites_' + chromo + '.sh'
                jid_list.append(jid)
                command_line = ('callable_sites_from_vcf.py '
                                '-vcf ' + all_sites + ' '
                                '-bed ' + repeat_bed + ' '
                                '-ar_bed ' + line_bed + ' '
                                '-DF ' + str(filter_factor) + ' '
                                '-mean_depth ' + str(all_data_mean_depth) + ' '
                                '-N ' + str(no_indiv) + ' '
                                '-chr ' + chromo + ' '
                                '-pol ' + pol + ' '
                                '-out ' + out)
                q_sub([command_line], out + '.' + chromo, jid=jid, evolgen=evolgen, t=48)

            # cat job for final output
            cat_cmd = 'cat ' + ' '.join(output_fasta_list) + ' > ' + fasta_out
            q_sub([cat_cmd], out + 'cat', evolgen=evolgen, hold=jid_list)
            sys.exit()

        else:
            # submit script for chromosome
            command_line = ('callable_sites_from_vcf.py '
                            '-vcf ' + all_sites + ' '
                            '-bed ' + repeat_bed + ' '
                            '-ar_bed ' + line_bed + ' '
                            '-DF ' + str(filter_factor) + ' '
                            '-mean_depth ' + str(all_data_mean_depth) + ' '
                            '-N ' + str(no_indiv) + ' '
                            '-chr ' + chromosome + ' '
                            '-pol ' + pol + ' '
                            '-out ' + out)
            q_sub([command_line], out, evolgen=evolgen, t=48)
            sys.exit()

    # catch -chr ALL specified without -sub
    if args.chr == 'ALL' and args.sub is False:
        sys.exit('"-chr ALL" can only be run in conjunction with "-sub"')

    # calculate depth cutoffs
    lower_depth_limit = all_data_mean_depth / filter_factor
    upper_depth_limit = all_data_mean_depth * filter_factor

    # get bed regions per chromo
    repeats = set()
    for x in open(repeat_bed):
        if x.split()[0] == chromosome:
            repeats |= {y for y in range(int(x.split()[1]), int(x.split()[2]))}

    # get ancestral repeat regions per chromo
    lines = set()
    if line_bed != 'None':
        for x in gzip.open(line_bed):
            if x.split()[0] == chromosome:
                lines |= {y for y in range(int(x.split()[1]), int(x.split()[2]))}

    # loop through allsites for chromosome
    counter = 0
    fasta_string = '>' + chromosome + '\n'
    if pol != 'None':
        wga_bed = pysam.TabixFile(pol)
    else:
        wga_bed = None

    with open(fasta_out, 'w') as out_fa:
        out_fa.write(fasta_string)
        fasta_string = ''
        prev_position = 0
        for line in VariantFile(all_sites).fetch(chromosome):

            # catch missing sites in allsites (new gatk3.7 feature)
            position = int(line.pos)
            diff = position - prev_position
            if diff != 1:
                missed_bases = ''.join(['1' for i in range(0, diff - 1)])
                fasta_string += missed_bases
            prev_position = position

            # add line break every 60 bases
            if len(fasta_string) >= 60:
                if len(fasta_string) == 60:
                    out_fa.write(fasta_string + '\n')
                    fasta_string = ''
                else:
                    out_fa.write(fasta_string[:60] + '\n')
                    fasta_string = fasta_string[60:]

            counter += 1

            # check for ns
            if line.ref == 'N':
                fasta_string += '0'
                continue

            # depth filter
            try:
                cumulative_depth = line.info["DP"]
            except KeyError:
                fasta_string += '1'
                continue
            locus_mean_depth = cumulative_depth / no_indiv
            if lower_depth_limit <= locus_mean_depth <= upper_depth_limit:

                # repeat filter
                if line.pos not in repeats:
                    # check if polarisable
                    if pol != 'None':
                        can_polarise = polarisable(line, wga_bed)[0]
                        if can_polarise is False:
                            fasta_string += 'k'
                            continue
                        else:
                            fasta_string += 'K'
                            continue
                    else:
                        fasta_string += 'k'
                        continue
                else:
                    if line.pos in lines:
                        # check if polarisable
                        if pol != 'None':
                            can_polarise = polarisable(line, wga_bed)[0]
                            if can_polarise is False:
                                fasta_string += 'r'
                                continue
                            else:
                                fasta_string += 'R'
                                continue
                        else:
                            fasta_string += 'r'
                            continue
                    else:
                        fasta_string += '1'
                        continue
            else:
                fasta_string += '1'
                continue

        out_fa.write(fasta_string + '\n')

    print counter
hg38 = Genome(assembly="hg38")

import os
import argparse

import pandas as pd
import pysam

parser = argparse.ArgumentParser(
    description='Process histograms, scatter plots and metaplots')
parser.add_argument('cell_type')
parser.add_argument('tabix_file')
parser.add_argument('fragments')
args = parser.parse_args()

cell_type = args.cell_type
tabix_file = pysam.TabixFile(args.tabix_file)

os.system(
    'gunzip -c {} | bedtools intersect -sorted -c -a /home/John/JohnProject/reference/DHS_adjusted_6mer_bias_adjustedby_30_sorted_no_blacklist.unique.bed -b - > {}/index_cuts_{}_intersect.bed'
    .format(args.fragments, cell_type, cell_type))

from reference.tools import exp_profile, tabix_profile

dhs = pd.read_table(
    "{}/index_cuts_{}_intersect.bed".format(cell_type, cell_type),
    names='dhs_chr adjusted_dhs_start adjusted_dhs_end index_cuts'.split(),
    header=None,
    low_memory=False)

dhs_bias = pd.read_table(
    '/home/John/JohnProject/reference/DHS_with_footprints_and_biases_6mer_adjustedby30.txt.gz',
def func1():
    # opens any tabix file
    with pysam.TabixFile(self.filename) as inf:
        pass
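# The snippet above only opens and closes the handle. For context, this is a
# minimal, hedged sketch of the fetch pattern used throughout these examples;
# the file name and region below are hypothetical, and any bgzip-compressed,
# tabix-indexed BED-like file would work the same way.
import pysam

example_bed = "example.bed.gz"  # hypothetical tabix-indexed file

with pysam.TabixFile(example_bed) as tbx:
    # 'contigs' lists the reference names present in the tabix index.
    for contig in tbx.contigs:
        # fetch() takes 0-based, half-open coordinates and yields one row per line;
        # asTuple() splits each row into fields instead of returning the raw string.
        for row in tbx.fetch(contig, 0, 1000, parser=pysam.asTuple()):
            print(contig, row[1], row[2])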
def main():
    # The results in a dictionary to be printed at the end of the script.
    output_table = OrderedDict({i: {'Ref': 0, 'Mod': 0, 'Oth': 0,
                                    'Ref_SF': 0, 'Mod_SF': 0, 'Oth_SF': 0}
                                for i in range(minLength, maxLength + 1)})

    # Max divergence allowed in bwa using the ancient parameters '-n 0.01 -o 2 -l 16500'.
    # This will be used to correct the estimates of spurious alignments.
    MaxDivBWA = {'20': 2, '21': 2, '22': 3, '23': 3, '24': 3, '25': 3, '26': 3,
                 '27': 3, '28': 3, '29': 3, '30': 3, '31': 3, '32': 3, '33': 3,
                 '34': 3, '35': 3, '36': 3, '37': 3, '38': 3, '39': 3, '40': 3,
                 '41': 3, '42': 4, '43': 4, '44': 4, '45': 4, '46': 4, '47': 4,
                 '48': 4, '49': 4, '50': 4, '51': 4, '52': 4, '53': 4, '54': 4,
                 '55': 4, '56': 4, '57': 4, '58': 4, '59': 4, '60': 4}

    start = time.time()
    r = list(range(minLength, maxLength + 1))

    with pysam.AlignmentFile(input_file, "rb", check_sq=False) as samfile, \
            pysam.TabixFile(infosites) as tabixfile:
        for chrom in [str(k) for k in range(1, 23)] + ['X']:
            # until_eof=True prevents pysam from complaining if there is no index file.
            for read in samfile.fetch(chrom, until_eof=True):
                Cigar = read.cigarstring
                if rm_Indels:
                    if 'I' in Cigar or 'D' in Cigar:
                        continue
                # Filter out softclip, hardclip and for MapQuality cutoff
                if 'S' not in Cigar and 'H' not in Cigar and read.mapping_quality >= MQ_cutoff:
                    pos = read.get_reference_positions(full_length=False)
                    site_position = 0
                    passTvFilter = True
                    try:
                        for s in tabixfile.fetch(chrom, pos[0], pos[-1], parser=pysam.asBed()):
                            site_position = int(s[1])
                            reference = s[3]
                            modified = s[4]
                            if Transversions == True:
                                passTvFilter = (reference + modified) not in ['CT', 'TC', 'GA', 'AG']
                    except ValueError:
                        break
                    if site_position != 0 and passTvFilter:
                        refseq = read.get_reference_sequence()
                        myseq = read.query_sequence
                        bq = read.query_qualities
                        L = min(len(myseq), maxLength)
                        if site_position in pos and L >= minLength:
                            p = site_position
                            # Sequences have different length, we need to align them
                            # and update the variables accordingly
                            if len(myseq) != len(refseq):
                                (refseq, myseq, pos, bq) = alnseq(
                                    refseq, myseq, CIGAR=read.cigartuples,
                                    basequalities=bq, start=pos[0])
                            # increment counters
                            if deam_filter_skip or is_deaminated(refseq, myseq, terminal_deam,
                                                                 read.is_reverse,
                                                                 isDoubleStranded=DoubleStrand):
                                Allele = myseq[pos.index(p)]
                                BQ = bq[pos.index(p)]
                                if Allele in ['A', 'C', 'G', 'T']:
                                    if BQ >= BQ_cutoff:
                                        ret_type, pass_SF = count(
                                            Allele, Reference=reference,
                                            Modified=modified, isReverse=read.is_reverse)
                                        output_table[L][ret_type] += 1
                                        if pass_SF:
                                            output_table[L][ret_type + '_SF'] += 1

    print('bp\tRef\tMod\tOth\tRef_SF\tMod_SF\tOth_SF\tSpuriousAln(95%CI)\tSpuriousAln_SF(95%CI)')
    SpAl = []
    TrAl = []
    for i, elem in sorted(output_table.items()):
        print(i, end='')
        d = MaxDivBWA[str(i)] / i
        for j in ['Ref', 'Mod', 'Oth', 'Ref_SF', 'Mod_SF', 'Oth_SF']:
            print('\t' + str(elem[j]), end='')
        for j in ['', '_SF']:
            TrueAln = float(elem['Ref' + j])
            SpuriousAln = float(elem['Mod' + j] + elem['Oth' + j])
            if TrueAln + SpuriousAln > 0:
                TrueAln1 = max(TrueAln - SpuriousAln * d / (3 - d), 0)
                SpuriousAln1 = SpuriousAln / (1 - d / 3)
                SpAl.append(SpuriousAln1)
                TrAl.append(TrueAln1)
                spu = round(SpuriousAln1 / (SpuriousAln1 + TrueAln1), 4)
                ci = binom_interval(SpuriousAln1, SpuriousAln1 + TrueAln1)
            else:
                spu = 0
                ci = [0, 0]
            print('\t' + str(spu) + ' (' + str(round(ci[0], 4)) + ',' + str(round(ci[1], 4)) + ')',
                  end='')
        print()

    # Split SpAl and TrAl per filter and then print the cutoffs using the cumulative estimates.
    spal = SpAl[0::2]
    spal_sf = SpAl[1::2]
    tral = TrAl[0::2]
    tral_sf = TrAl[1::2]
    j001 = j01 = j1 = j001sf = j01sf = j1sf = True
    for i in range(0, len(spal)):
        cum_spal = sum(spal[i:]) / (sum(spal[i:]) + sum(tral[i:]))
        cum_spal_sf = sum(spal_sf[i:]) / (sum(spal_sf[i:]) + sum(tral_sf[i:]))
        if cum_spal < 0.001 and j001:
            print('# 0.1% cutoff is', r[i], 'bp')
            j001 = False
        if cum_spal < 0.01 and j01:
            print('# 1% cutoff is', r[i], 'bp')
            j01 = False
        if cum_spal < 0.1 and j1:
            print('# 10% cutoff is', r[i], 'bp')
            j1 = False
        if cum_spal_sf < 0.001 and j001sf:
            print('# 0.1% cutoff with SF is', r[i], 'bp')
            j001sf = False
        if cum_spal_sf < 0.01 and j01sf:
            print('# 1% cutoff with SF is', r[i], 'bp')
            j01sf = False
        if cum_spal_sf < 0.1 and j1sf:
            print('# 10% cutoff with SF is', r[i], 'bp')
            j1sf = False

    end = time.time()
    print("#...done in", round((end - start) / 60, 3), "minute(s)!")
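# binom_interval() is called above but not defined in this excerpt. As a hedged
# stand-in (not necessarily the author's implementation), such a helper is
# commonly written as a Clopper-Pearson (exact) binomial confidence interval
# using scipy.stats.beta:
from scipy.stats import beta


def binom_interval(success, total, confidence=0.95):
    """Return (lower, upper) bounds of an exact binomial confidence interval
    for the proportion success / total."""
    alpha = 1.0 - confidence
    # The lower bound is 0 when there are no successes and the upper bound is 1
    # when every trial succeeds.
    lower = beta.ppf(alpha / 2, success, total - success + 1) if success > 0 else 0.0
    upper = beta.ppf(1 - alpha / 2, success + 1, total - success) if success < total else 1.0
    return lower, upper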
def setUp(self):
    TestVCF.setUp(self)

    self.tabix = pysam.TabixFile(self.tmpfilename + ".gz")
    self.compare = load_and_convert(self.filename)
    filter_value = gnomad_row_fields[6]
    info_fields = [('Filter', filter_value)] + \
        [tuple(kv.split('=')) for kv in gnomad_row_fields[7].split(';')]
    info_fields = filter(lambda kv: kv[0] in NEEDED_GNOMAD_FIELDS_SET, info_fields)
    info_fields = dict(info_fields)
    gnomad_column_values = [info_fields.get(k, '') for k in NEEDED_GNOMAD_FIELDS]

    # check that the clinvar alt allele matches (one of the) gnomAD alt allele(s)
    #if len(alt_alleles) > 1:
    #    # select the AC/AN numbers corresponding to the specific alt allele
    #    alt_allele_index = alt_alleles.index(alt)
    #    gnomad_column_values = map(lambda x: x.split(",")[alt_allele_index] if "," in x else x,
    #                               gnomad_column_values)

    return gnomad_column_values


gnomad_f = pysam.TabixFile(args.gnomad_sites_vcf)
clinvar_f = gzip.open(args.clinvar_table) if args.clinvar_table.endswith('.gz') else open(args.clinvar_table)

clinvar_header = next(clinvar_f).rstrip('\n').split('\t')
clinvar_with_gnomad_header = clinvar_header + NEEDED_GNOMAD_FIELDS
print("\t".join(clinvar_with_gnomad_header))

for i, clinvar_row in enumerate(clinvar_f):
    clinvar_fields = clinvar_row.rstrip('\n').split('\t')
    clinvar_dict = dict(zip(clinvar_header, clinvar_fields))
    chrom = clinvar_dict['chrom']
    pos = int(clinvar_dict['pos'])
    ref = clinvar_dict['ref']
    alt = clinvar_dict['alt']

    gnomad_column_values = get_gnomad_column_values(gnomad_f, chrom, pos, ref, alt)
    print("\t".join(clinvar_fields + gnomad_column_values))
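# The excerpt above begins after 'gnomad_row_fields' has already been looked up,
# so the actual tabix query is not shown. The helper below is only a hedged
# sketch of how such a lookup might be done with pysam.TabixFile.fetch(); the
# function name and the matching rules are assumptions, not the original code.
def fetch_gnomad_row_fields(gnomad_f, chrom, pos, ref, alt):
    """Return the tab-separated fields of the gnomAD record matching the given
    chrom/pos/ref/alt, or None if no record matches.

    gnomad_f is an open pysam.TabixFile over the gnomAD sites VCF. fetch() uses
    0-based, half-open coordinates, so the 1-based VCF position maps to the
    interval [pos - 1, pos).
    """
    try:
        records = gnomad_f.fetch(chrom, pos - 1, pos)
    except ValueError:
        # contig not present in the tabix index
        return None

    for record in records:
        fields = record.split('\t')
        # VCF columns: CHROM POS ID REF ALT QUAL FILTER INFO ...
        if int(fields[1]) == pos and fields[3] == ref and alt in fields[4].split(','):
            return fields
    return None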
def setUp(self):
    IterationTest.setUp(self)

    self.tabix = pysam.TabixFile(self.filename)
import sys

import my_utils.seq
import pysam

input_file = sys.argv[1]
output_file = sys.argv[2]
reference = sys.argv[3]
# hgmd_file = sys.argv[4]
spidex_file = sys.argv[4]

key2exists = {}
header2ind = {}
seq_margin = 100

# hgmd_db = pysam.TabixFile(hgmd_file)
spidex_db = pysam.TabixFile(spidex_file)

hout = open(output_file, 'w')
print >> hout, '\t'.join([
    "Cancer_Type", "Sample_Name", "Gene_Symbol", "Mutation_Key", "Motif_Pos",
    "Motif_Seq", "Rel_Pos", "Ref_Base", "Alt_Base", "Mutation_Type",
    "Is_Canonical", "SPIDEX"
])

with open(input_file, 'r') as hin:
    header = hin.readline().rstrip('\n').split('\t')
    for i in range(len(header)):
        header2ind[header[i]] = i

    for line in hin: