'''Take start and stop from coordinates as specified: full BED coordinates.'''
# correct BED coordinates
patch_start = start
patch_new_end = end

# CONTINUE COMBINED ============================================
# check start and end of range
# set start_diff if the sequence to query runs over the chromosome start --> ready to pad
start_diff = 0
seq_start = patch_start
if patch_start < 0:
    start_diff = abs(patch_start)
    seq_start = 0  # cover the border cases for sequence retrieval

# extract reference sequence -------------------------------------------
with pysam.Fastafile(FLAGS.genome) as fa:
    seq = fa.fetch(reference=chrom, start=seq_start, end=patch_new_end)

# pad if specified and the window runs past the chromosome start
if start_diff > 0:
    if FLAGS.padd_ends in ['left', 'both']:
        # pad with N's
        print('padding with N\'s leftwards')
        seq = 'N' * start_diff + seq
    else:
        print('%s:%s-%s is smaller than bp_context and no padding specified ... skipping'
              % (chrom, start, end))
        continue
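# A minimal sketch (not part of the original script) of the same padding idea,
# generalized to both chromosome ends: clip the requested window to the valid
# range, fetch with pysam.FastaFile, and pad with N's so the returned sequence
# always has the requested length. The helper name `fetch_padded` and its
# arguments are assumptions, not names from the original code.
import pysam

def fetch_padded(genome_fa, chrom, start, end):
    with pysam.FastaFile(genome_fa) as fa:
        chrom_len = fa.get_reference_length(chrom)
        left_pad = max(0, -start)              # bases missing before position 0
        right_pad = max(0, end - chrom_len)    # bases missing past the chromosome end
        seq = fa.fetch(reference=chrom, start=max(0, start), end=min(end, chrom_len))
    return 'N' * left_pad + seq + 'N' * right_pad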
def __init__(self, seqFn):
    import pysam
    self.genome = pysam.Fastafile(seqFn)
    print("seqClass: input 0-based coordinate -- [start, end)")
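# A hedged sketch of how such a wrapper class is typically completed; the method
# name `get_seq` is an assumption, not taken from the original source. pysam's
# fetch() already uses 0-based, half-open [start, end) coordinates, which is
# exactly what the printed note advertises.
import pysam

class seqClass:
    def __init__(self, seqFn):
        self.genome = pysam.Fastafile(seqFn)
        print("seqClass: input 0-based coordinate -- [start, end)")

    def get_seq(self, chrom, start, end):
        return self.genome.fetch(reference=chrom, start=start, end=end).upper()

    def close(self):
        self.genome.close()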
def main(): usage = 'usage: %prog [options] <params_file> <model_file> <vcf_file>' parser = OptionParser(usage) parser.add_option( '-c', dest='center_pct', default=0.25, type='float', help='Require clustered SNPs lie in center region [Default: %default]') parser.add_option('-f', dest='genome_fasta', default='%s/assembly/hg19.fa' % os.environ['HG19'], help='Genome FASTA for sequences [Default: %default]') parser.add_option('-g', dest='genome_file', default='%s/assembly/human.hg19.genome' % os.environ['HG19'], help='Chromosome lengths file [Default: %default]') parser.add_option('--h5', dest='out_h5', default=False, action='store_true', help='Output stats to sad.h5 [Default: %default]') parser.add_option('--local', dest='local', default=1024, type='int', help='Local SAD score [Default: %default]') parser.add_option('-n', dest='norm_file', default=None, help='Normalize SAD scores') parser.add_option( '-o', dest='out_dir', default='sad', help='Output directory for tables and plots [Default: %default]') parser.add_option('-p', dest='processes', default=None, type='int', help='Number of processes, passed by multi script') parser.add_option('--pseudo', dest='log_pseudo', default=1, type='float', help='Log2 pseudocount [Default: %default]') parser.add_option( '--rc', dest='rc', default=False, action='store_true', help= 'Average forward and reverse complement predictions [Default: %default]' ) parser.add_option('--shifts', dest='shifts', default='0', type='str', help='Ensemble prediction shifts [Default: %default]') parser.add_option( '--stats', dest='sad_stats', default='SAD', help='Comma-separated list of stats to save. [Default: %default]') parser.add_option( '-t', dest='targets_file', default=None, type='str', help='File specifying target indexes and labels in table format') parser.add_option( '--ti', dest='track_indexes', default=None, type='str', help='Comma-separated list of target indexes to output BigWig tracks') parser.add_option( '-u', dest='penultimate', default=False, action='store_true', help='Compute SED in the penultimate layer [Default: %default]') parser.add_option('-z', dest='out_zarr', default=False, action='store_true', help='Output stats to sad.zarr [Default: %default]') (options, args) = parser.parse_args() if len(args) == 3: # single worker params_file = args[0] model_file = args[1] vcf_file = args[2] elif len(args) == 5: # multi worker options_pkl_file = args[0] params_file = args[1] model_file = args[2] vcf_file = args[3] worker_index = int(args[4]) # load options options_pkl = open(options_pkl_file, 'rb') options = pickle.load(options_pkl) options_pkl.close() # update output directory options.out_dir = '%s/job%d' % (options.out_dir, worker_index) else: parser.error( 'Must provide parameters and model files and QTL VCF file') if not os.path.isdir(options.out_dir): os.mkdir(options.out_dir) if options.track_indexes is None: options.track_indexes = [] else: options.track_indexes = [ int(ti) for ti in options.track_indexes.split(',') ] if not os.path.isdir('%s/tracks' % options.out_dir): os.mkdir('%s/tracks' % options.out_dir) options.shifts = [int(shift) for shift in options.shifts.split(',')] options.sad_stats = options.sad_stats.split(',') ################################################################# # read parameters and collet target information job = params.read_job_params(params_file, require=['seq_length', 'num_targets']) if options.targets_file is None: target_ids = ['t%d' % ti for ti in range(job['num_targets'])] target_labels = [''] * len(target_ids) 
target_subset = None else: targets_df = pd.read_table(options.targets_file) target_ids = targets_df.identifier target_labels = targets_df.description target_subset = targets_df.index if len(target_subset) == job['num_targets']: target_subset = None ################################################################# # load SNPs # read sorted SNPs from VCF snps = bvcf.vcf_snps(vcf_file, require_sorted=True, validate_ref_fasta=options.genome_fasta, flip_ref=True) # filter for worker SNPs if options.processes is not None: worker_bounds = np.linspace(0, len(snps), options.processes + 1, dtype='int') snps = snps[worker_bounds[worker_index]:worker_bounds[worker_index + 1]] num_snps = len(snps) # cluster SNPs by position snp_clusters = cluster_snps(snps, job['seq_length'], options.center_pct) # delimit sequence boundaries [sc.delimit(job['seq_length']) for sc in snp_clusters] # open genome FASTA genome_open = pysam.Fastafile(options.genome_fasta) # make SNP sequence generator def snp_gen(): for sc in snp_clusters: snp_1hot_list = sc.get_1hots(genome_open) for snp_1hot in snp_1hot_list: yield {'sequence': snp_1hot} snp_types = {'sequence': tf.float32} snp_shapes = { 'sequence': tf.TensorShape([tf.Dimension(job['seq_length']), tf.Dimension(4)]) } dataset = tf.data.Dataset().from_generator(snp_gen, output_types=snp_types, output_shapes=snp_shapes) dataset = dataset.batch(job['batch_size']) dataset = dataset.prefetch(2 * job['batch_size']) # dataset = dataset.apply(tf.contrib.data.prefetch_to_device('/device:GPU:0')) iterator = dataset.make_one_shot_iterator() data_ops = iterator.get_next() ################################################################# # setup model # build model t0 = time.time() model = seqnn.SeqNN() model.build_sad(job, data_ops, ensemble_rc=options.rc, ensemble_shifts=options.shifts, embed_penultimate=options.penultimate, target_subset=target_subset) print('Model building time %f' % (time.time() - t0), flush=True) if options.penultimate: # labels become inappropriate target_ids = [''] * model.hp.cnn_filters[-1] target_labels = target_ids # read target normalization factors target_norms = np.ones(len(target_labels)) if options.norm_file is not None: ti = 0 for line in open(options.norm_file): target_norms[ti] = float(line.strip()) ti += 1 num_targets = len(target_ids) ################################################################# # setup output sad_out = initialize_output_h5(options.out_dir, options.sad_stats, snps, target_ids, target_labels) snp_threads = [] snp_queue = Queue() for i in range(1): sw = SNPWorker(snp_queue, sad_out) sw.start() snp_threads.append(sw) ################################################################# # predict SNP scores, write output # initialize saver saver = tf.train.Saver() with tf.Session() as sess: # coordinator coord = tf.train.Coordinator() tf.train.start_queue_runners(coord=coord) # load variables into session saver.restore(sess, model_file) # initialize predictions stream preds_stream = PredStream(sess, model, 32) # predictions index pi = 0 # SNP index si = 0 for snp_cluster in snp_clusters: ref_preds = preds_stream[pi] pi += 1 for snp in snp_cluster.snps: # print(snp, flush=True) alt_preds = preds_stream[pi] pi += 1 # queue SNP snp_queue.put((ref_preds, alt_preds, si)) # update SNP index si += 1 # finish queue print('Waiting for threads to finish.', flush=True) snp_queue.join() # close genome genome_open.close() ################################################### # compute SAD distributions across variants # define percentiles d_fine = 
0.001
d_coarse = 0.01
percentiles_neg = np.arange(d_fine, 0.1, d_fine)
percentiles_base = np.arange(0.1, 0.9, d_coarse)
percentiles_pos = np.arange(0.9, 1, d_fine)
percentiles = np.concatenate([percentiles_neg, percentiles_base, percentiles_pos])
sad_out.create_dataset('percentiles', data=percentiles)
pct_len = len(percentiles)

for sad_stat in options.sad_stats:
    sad_stat_pct = '%s_pct' % sad_stat

    # compute
    sad_pct = np.percentile(sad_out[sad_stat], 100 * percentiles, axis=0).T
    sad_pct = sad_pct.astype('float16')

    # save
    sad_out.create_dataset(sad_stat_pct, data=sad_pct, dtype='float16')

sad_out.close()
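# Illustrative sketch of the percentile step above, using an in-memory array in
# place of the 'SAD' HDF5 dataset (shape: variants x targets). np.percentile with
# axis=0 yields one percentile curve per target; the transpose gives
# (targets, percentiles). The toy array and its shape are assumptions.
import numpy as np

sad = np.random.randn(1000, 4).astype('float32')           # toy SAD scores: 1000 variants, 4 targets
percentiles = np.concatenate([np.arange(0.001, 0.1, 0.001),
                              np.arange(0.1, 0.9, 0.01),
                              np.arange(0.9, 1, 0.001)])
sad_pct = np.percentile(sad, 100 * percentiles, axis=0).T   # shape: (4, len(percentiles))
print(sad_pct.shape)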
def filter_candidates( (candidates_vcf, filtered_candidates_vcf, reference, dbsnp, min_dp, good_ao, min_ao, snp_min_af, snp_min_bq, snp_min_ao, ins_min_af, del_min_af, del_merge_min_af, ins_merge_min_af, merge_r)): thread_logger = logging.getLogger("{} ({})".format( filter_candidates.__name__, multiprocessing.current_process().name)) try: thread_logger.info( "---------------------Filter Candidates---------------------") records = {} with open(candidates_vcf) as v_f: for line in v_f: if line[0] == "#": continue if len(line.strip().split()) != 10: raise RuntimeError( "Bad VCF line (<10 fields): {}".format(line)) chrom, pos, _, ref, alt, _, _, info_, _, info = line.strip( ).split() pos = int(pos) loc = "{}.{}".format(chrom, pos) dp, ro, ao = map(int, info.split(":")[1:4]) info_dict = dict( map(lambda x: x.split("="), filter(None, info_.split(";")))) mq_ = safe_read_info_dict(info_dict, "MQ", int, -100) bq_ = safe_read_info_dict(info_dict, "BQ", int, -100) nm_ = safe_read_info_dict(info_dict, "NM", int, -100) as_ = safe_read_info_dict(info_dict, "AS", int, -100) xs_ = safe_read_info_dict(info_dict, "XS", int, -100) pr_ = safe_read_info_dict(info_dict, "PR", int, -100) cl_ = safe_read_info_dict(info_dict, "CL", int, -100) st_ = safe_read_info_dict(info_dict, "ST", str, "-100,-100") ls_ = safe_read_info_dict(info_dict, "LS", int, -100) rs_ = safe_read_info_dict(info_dict, "RS", int, -100) if ao < min(ro, min_ao): continue if loc not in records: records[loc] = [] if ref == "N" or "\t".join(line.split()[0:5]) \ not in map(lambda x: "\t".join(x[-1].split()[0:5]), records[loc]): records[loc].append([ chrom, pos, ref, alt, dp, ro, ao, mq_, bq_, st_, ls_, rs_, nm_, as_, xs_, pr_, cl_, line ]) elif "\t".join(line.split()[0:5]) \ in map(lambda x: "\t".join(x[-1].split()[0:5]), records[loc]): for i, x in enumerate(records[loc]): if "\t".join(line.split()[0:5]) == "\t".join(x[-1].split()[0:5]) \ and ao / float(ro + 0.0001) > x[6] / float(x[5] + 0.0001): records[loc][i] = [ chrom, pos, ref, alt, dp, ro, ao, mq_, bq_, st_, ls_, rs_, nm_, as_, xs_, pr_, cl_, line ] break fasta_file = pysam.Fastafile(reference) good_records = [] dels = [] for loc, rs in sorted(records.iteritems(), key=lambda x: x[1][0:2]) + \ [["", [["", 0, "", "", 0, 0, 0, ""]]]]: ins = filter(lambda x: x[2] == "N", rs) if len(ins) > 1: # emit ins afs = map(lambda x: x[6] / float(x[5] + x[6]), ins) max_af = max(afs) ins = filter( lambda x: x[6] / float(x[5] + x[6]) >= (max_af * merge_r), ins) chrom, pos, ref = ins[0][0:3] dp = max(map(lambda x: x[4], ins)) ro = max(map(lambda x: x[5], ins)) ao = max(map(lambda x: x[6], ins)) mq_ = max(map(lambda x: x[7], ins)) bq_ = max(map(lambda x: x[8], ins)) st_ = "{},{}".format( max(map(lambda x: int(x[9].split(",")[0]), ins)), max(map(lambda x: int(x[9].split(",")[1]), ins))) ls_ = max(map(lambda x: x[10], ins)) rs_ = max(map(lambda x: x[11], ins)) nm_ = max(map(lambda x: x[12], ins)) as_ = max(map(lambda x: x[13], ins)) xs_ = max(map(lambda x: x[14], ins)) pr_ = max(map(lambda x: x[15], ins)) cl_ = max(map(lambda x: x[16], ins)) alt = "".join(map(lambda x: x[3], ins)) if (max_af >= ins_merge_min_af) or (ao >= good_ao): ins = [[ chrom, pos, ref, alt, dp, ro, ao, mq_, bq_, st_, ls_, rs_, nm_, as_, xs_, pr_, cl_ ]] else: ins = [] elif len(ins) == 1: # emit 1-base ins dp, ro, ao = ins[0][4:7] if (ao / float(ro + ao) < (ins_min_af) and ao < good_ao) or dp <= 5: ins = [] else: ins = [ins[0][:-1]] good_records.extend(ins) if dels and (ins or filter(lambda x: x[3] != "N" and x[2] != "N", rs)): # emit del if 
len(dels) == 1: ro = dels[0][5] ao = dels[0][6] chrom, pos, ref = dels[0][0:3] if ao / float(ro + ao) >= ((del_min_af)) or ao >= good_ao: good_records.extend(dels) else: afs = map(lambda x: x[6] / float(x[5] + x[6]), dels) max_af = max(afs) merge_r_thr = merge_r * max_af dels = filter( lambda x: x[6] / float(x[5] + x[6]) >= merge_r_thr, dels) chrom, pos = dels[0][0:2] dp = max(map(lambda x: x[4], dels)) ro = max(map(lambda x: x[5], dels)) ao = max(map(lambda x: x[6], dels)) mq_ = max(map(lambda x: x[7], dels)) bq_ = max(map(lambda x: x[8], dels)) st_ = "{},{}".format( max(map(lambda x: int(x[9].split(",")[0]), dels)), max(map(lambda x: int(x[9].split(",")[1]), dels))) ls_ = max(map(lambda x: x[10], dels)) rs_ = max(map(lambda x: x[11], dels)) nm_ = max(map(lambda x: x[12], dels)) as_ = max(map(lambda x: x[13], dels)) xs_ = max(map(lambda x: x[14], dels)) pr_ = max(map(lambda x: x[15], dels)) cl_ = max(map(lambda x: x[16], dels)) ref = "".join(map(lambda x: x[2], dels)) alt = "N" good_records.append([ chrom, pos, ref, alt, dp, ro, ao, mq_, bq_, st_, ls_, rs_, nm_, as_, xs_, pr_, cl_ ]) dels = [] if not loc: continue for record in rs: dp = record[4] if dp <= min_dp: continue ro, ao = record[5:7] if record[2] != "N" and record[3] != "N" and record[ 2] != record[3]: bq = record[8] if (ao / float(ro + ao) >= (snp_min_af) or ao >= snp_min_ao) and bq >= snp_min_bq: # emit SNP good_records.append(record[:-1]) elif record[2] != "N" and record[3] == "N": if ao / float(ro + ao) >= ( del_merge_min_af) or ao >= good_ao: chrom, pos = record[0:2] if dels and pos - dels[-1][1] != 1: # emit del if len(dels) == 1: ro = dels[0][5] ao = dels[0][6] chrom, pos, ref = dels[0][0:3] pos = int(pos) if ao / float(ro + ao) >= ((del_min_af)): good_records.extend(dels) else: afs = map(lambda x: x[6] / float(x[5] + x[6]), dels) max_af = max(afs) merge_r_thr = merge_r * max_af dels = filter( lambda x: x[6] / float(x[5] + x[6]) >= merge_r_thr, dels) chrom, pos = dels[0][0:2] dp = max(map(lambda x: x[4], dels)) ro = max(map(lambda x: x[5], dels)) ao = max(map(lambda x: x[6], dels)) mq_ = max(map(lambda x: x[7], dels)) bq_ = max(map(lambda x: x[8], dels)) st_ = "{},{}".format( max( map(lambda x: int(x[9].split(",")[0]), dels)), max( map(lambda x: int(x[9].split(",")[1]), dels))) ls_ = max(map(lambda x: x[10], dels)) rs_ = max(map(lambda x: x[11], dels)) nm_ = max(map(lambda x: x[12], dels)) as_ = max(map(lambda x: x[13], dels)) xs_ = max(map(lambda x: x[14], dels)) pr_ = max(map(lambda x: x[15], dels)) cl_ = max(map(lambda x: x[16], dels)) ref = "".join(map(lambda x: x[2], dels)) alt = "N" good_records.append([ chrom, pos, ref, alt, dp, ro, ao, mq_, bq_, st_, ls_, rs_, nm_, as_, xs_, pr_, cl_ ]) dels = [] # accumulate dels dels.append(record[:-1]) final_records = [] dels = [] for i, record in enumerate(good_records): chrom, pos, ref, alt, dp, ro, ao, mq_, bq_, st_, ls_, rs_, nm_, as_, xs_, pr_, cl_ = record ref = ref.upper() alt = alt.upper() info_str = "" if st_ != "-100,-100": info_str += ";ST={}".format(st_) if ls_ != -100: info_str += ";LS={}".format(ls_) if rs_ != -100: info_str += ";RS={}".format(rs_) if nm_ != -100: info_str += ";NM={}".format(nm_) if as_ != -100: info_str += ";AS={}".format(as_) if xs_ != -100: info_str += ";XS={}".format(xs_) if pr_ != -100: info_str += ";PR={}".format(pr_) if cl_ != -100: info_str += ";CL={}".format(cl_) if mq_ != -100: info_str += ";MQ={}".format(mq_) if bq_ != -100: info_str += ";BQ={}".format(bq_) af = np.round(ao / float(ao + ro), 4) info_str += ";AF={}".format(af) if ref != 
"N" and alt != "N": line = "\t".join([ chrom, str(pos), ".", ref, alt, "100", ".", "DP={};RO={};AO={}".format(dp, ro, ao) + info_str, "GT:DP:RO:AO:AF", "0/1:{}:{}:{}:{}".format(dp, ro, ao, af) ]) final_records.append([chrom, pos, ref, alt, line]) elif alt == "N": ref = fasta_file.fetch(chrom, pos - 2, pos + len(ref) - 1).upper() alt = fasta_file.fetch(chrom, pos - 2, pos - 1).upper() line = "\t".join([ chrom, str(pos - 1), ".", ref, alt, "100", ".", "DP={};RO={};AO={}".format(dp, ro, ao) + info_str, "GT:DP:RO:AO:AF", "0/1:{}:{}:{}:{}".format(dp, ro, ao, af) ]) final_records.append([chrom, pos - 1, ref, alt, line]) elif ref == "N": ref = fasta_file.fetch(chrom, pos - 2, pos - 1).upper() alt = ref + alt line = "\t".join([ chrom, str(pos - 1), ".", ref, alt, "100", ".", "DP={};RO={};AO={}".format(dp, ro, ao) + info_str, "GT:DP:RO:AO:AF", "0/1:{}:{}:{}:{}".format(dp, ro, ao, af) ]) final_records.append([chrom, pos - 1, ref, alt, line]) final_records = sorted(final_records, key=lambda x: x[0:2]) if dbsnp: filtered_bed = pybedtools.BedTool( map( lambda x: pybedtools.Interval(x[1][0], int(x[1][1]), int(x[1][1]) + 1, x[1][2], x[ 1][3], str(x[0])), enumerate(final_records))).sort() dbsnp = pybedtools.BedTool(dbsnp).each( lambda x: pybedtools.Interval(x[0], int(x[1]), int(x[1]) + 1, x[3], x[4])).sort( ) non_in_dbsnp_1 = filtered_bed.window(dbsnp, w=0, v=True) non_in_dbsnp_2 = filtered_bed.window( dbsnp, w=0).filter(lambda x: x[1] != x[7] or x[3] != x[9] or x[ 4] != x[10]).sort() non_in_dbsnp_ids = [] for x in non_in_dbsnp_1: non_in_dbsnp_ids.append(int(x[5])) for x in non_in_dbsnp_2: non_in_dbsnp_ids.append(int(x[5])) final_records = map( lambda x: x[1], filter(lambda x: x[0] in non_in_dbsnp_ids, enumerate(final_records))) with open(filtered_candidates_vcf, "w") as o_f: o_f.write("##fileformat=VCFv4.2\n") o_f.write( "#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT\tSAMPLE\n" ) for record in final_records: o_f.write(record[-1] + "\n") return filtered_candidates_vcf except Exception as ex: thread_logger.error(traceback.format_exc()) thread_logger.error(ex) return None
def crossmap_maf_file(mapping, infile, outfile, liftoverfile, refgenome, ref_name, cstyle='a'):
    '''
    Convert genome coordinates in MAF (mutation annotation format) files.

    Parameters
    ----------
    mapping : dict
        Dictionary with source chrom name as key, IntervalTree object as value.

    infile : file
        Input file in MAF format. Can be a regular or compressed (*.gz, *.Z, *.z,
        *.bz, *.bz2, *.bzip2) file, a local file, or a URL (http://, https://,
        ftp://) pointing to a remote file.

    outfile : str
        Prefix of output files.

    liftoverfile : file
        Chain (https://genome.ucsc.edu/goldenPath/help/chain.html) format file.
        Can be a regular or compressed (*.gz, *.Z, *.z, *.bz, *.bz2, *.bzip2)
        file, a local file, or a URL (http://, https://, ftp://) pointing to a
        remote file.

    refgenome : file
        The genome sequence file of the 'target' assembly in FASTA format.

    ref_name : str
        The NCBI build name of the target assembly, for example "GRCh37", "GRCh38".

    cstyle : str, optional
        Chromosome ID style. Must be one of ['a', 's', 'l'], where
        'a' : as-is. The chromosome ID of the output file is in the same style as the input file.
        's' : short ID, such as "1", "2", "X".
        'l' : long ID, such as "chr1", "chr2", "chrX".
    '''

    # index the reference genome file if it hasn't been done
    if not os.path.exists(refgenome + '.fai'):
        logging.info("Creating index for: %s" % refgenome)
        pysam.faidx(refgenome)
    if os.path.getmtime(refgenome + '.fai') < os.path.getmtime(refgenome):
        logging.info("Index file is older than reference genome. Re-creating index for: %s" % refgenome)
        pysam.faidx(refgenome)

    refFasta = pysam.Fastafile(refgenome)

    FILE_OUT = open(outfile, 'w')
    UNMAP = open(outfile + '.unmap', 'w')

    total = 0
    fail = 0

    for line in ireader.reader(infile):
        if not line.strip():
            continue
        line = line.strip()

        # meta-information lines needed in both mapped and unmapped files
        if line.startswith('#'):
            print(line, file=FILE_OUT)
            print(line, file=UNMAP)
            continue
        elif line.startswith('Hugo_Symbol'):
            print("#liftOver: Program=%sv%s, Time=%s, ChainFile=%s, NewRefGenome=%s" %
                  ("CrossMap", __version__, datetime.date.today().strftime("%B%d,%Y"),
                   liftoverfile, refgenome),
                  file=FILE_OUT)
            print(line, file=FILE_OUT)
            print(line, file=UNMAP)
            logging.info("Lifting over ... ")
        else:
            fields = str.split(line, sep='\t')
            total += 1

            fields[3] = ref_name
            chrom = fields[4]
            start = int(fields[5]) - 1  # 0-based
            end = int(fields[6])
            # strand = fields[7]

            a = map_coordinates(mapping, chrom, start, end, '+', chrom_style=cstyle)

            if a is None:
                print(line, file=UNMAP)
                fail += 1
                continue

            if len(a) == 2:
                target_chr = str(a[1][0])  # target_chr is from the chain file, could be 'chr1' or '1'
                target_start = a[1][1]
                target_end = a[1][2]

                # update chrom
                fields[4] = target_chr

                # update start coordinate
                fields[5] = target_start + 1

                # update end
                fields[6] = target_end

                # update ref allele
                try:
                    target_chr = update_chromID(refFasta.references[0], target_chr)
                    fields[10] = refFasta.fetch(target_chr, target_start, target_end).upper()
                except:
                    print(line, file=UNMAP)
                    fail += 1
                    continue

                if a[1][3] == '-':
                    fields[10] = revcomp_DNA(fields[10], True)

                print('\t'.join(map(str, fields)), file=FILE_OUT)
            else:
                print(line, file=UNMAP)
                fail += 1
                continue

    FILE_OUT.close()
    UNMAP.close()
    logging.info("Total entries: %d", total)
    logging.info("Failed to map: %d", fail)
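# Hedged sketch of what a revcomp_DNA-style helper does for the '-' strand branch
# above (the real CrossMap helper may differ, e.g. in IUPAC-code handling): reverse
# the allele and complement each base so it matches the plus strand of the target
# assembly. The function name here is hypothetical.
COMP = str.maketrans("ACGTNacgtn", "TGCANtgcan")

def revcomp_dna_sketch(seq, reverse=True):
    seq = seq.translate(COMP)
    return seq[::-1] if reverse else seq

# usage: revcomp_dna_sketch("ACCT") -> "AGGT"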
def crossmap_vcf_file(mapping, infile, outfile, liftoverfile, refgenome): ''' Convert genome coordinates in VCF format. Parameters ---------- mapping : dict Dictionary with source chrom name as key, IntervalTree object as value. infile : file Input file in VCF format. Can be a regular or compressed (*.gz, *.Z,*.z, *.bz, *.bz2, *.bzip2) file, local file or URL (http://, https://, ftp://) pointing to remote file. outfile : str prefix of output files. liftoverfile : file Chain (https://genome.ucsc.edu/goldenPath/help/chain.html) format file. Can be a regular or compressed (*.gz, *.Z,*.z, *.bz, *.bz2, *.bzip2) file, local file or URL (http://, https://, ftp://) pointing to remote file. refgenome : file The genome sequence file of 'target' assembly in FASTA format. ''' #index refegenome file if it hasn't been done if not os.path.exists(refgenome + '.fai'): printlog(["Creating index for", refgenome]) pysam.faidx(refgenome) refFasta = pysam.Fastafile(refgenome) FILE_OUT = open(outfile, 'w') UNMAP = open(outfile + '.unmap', 'w') total = 0 fail = 0 withChr = False # check if the VCF data lines use 'chr1' or '1' for line in ireader.reader(infile): if not line.strip(): continue line = line.strip() #deal with meta-information lines. #meta-information lines needed in both mapped and unmapped files if line.startswith('##fileformat'): print(line, file=FILE_OUT) print(line, file=UNMAP) elif line.startswith('##INFO'): print(line, file=FILE_OUT) print(line, file=UNMAP) elif line.startswith('##FILTER'): print(line, file=FILE_OUT) print(line, file=UNMAP) elif line.startswith('##FORMAT'): print(line, file=FILE_OUT) print(line, file=UNMAP) elif line.startswith('##ALT'): print(line, file=FILE_OUT) print(line, file=UNMAP) elif line.startswith('##SAMPLE'): print(line, file=FILE_OUT) print(line, file=UNMAP) elif line.startswith('##PEDIGREE'): print(line, file=FILE_OUT) print(line, file=UNMAP) #meta-information lines needed in unmapped files elif line.startswith('##assembly'): print(line, file=UNMAP) elif line.startswith('##contig'): print(line, file=UNMAP) if 'ID=chr' in line: withChr = True #update contig information elif line.startswith('#CHROM'): printlog(["Updating contig field ... "]) target_gsize = dict( list(zip(refFasta.references, refFasta.lengths))) for chr_id in sorted(target_gsize): if chr_id.startswith('chr'): if withChr is True: print("##contig=<ID=%s,length=%d,assembly=%s>" % (chr_id, target_gsize[chr_id], os.path.basename(refgenome)), file=FILE_OUT) else: print("##contig=<ID=%s,length=%d,assembly=%s>" % (chr_id.replace('chr', ''), target_gsize[chr_id], os.path.basename(refgenome)), file=FILE_OUT) else: if withChr is True: print("##contig=<ID=%s,length=%d,assembly=%s>" % ('chr' + chr_id, target_gsize[chr_id], os.path.basename(refgenome)), file=FILE_OUT) else: print("##contig=<ID=%s,length=%d,assembly=%s>" % (chr_id, target_gsize[chr_id], os.path.basename(refgenome)), file=FILE_OUT) print( "##liftOverProgram=<CrossMap,version=%s,website=https://sourceforge.net/projects/crossmap>" % __version__, file=FILE_OUT) print("##liftOverChainFile=<%s>" % liftoverfile, file=FILE_OUT) print("##originalFile=<%s>" % infile, file=FILE_OUT) print("##targetRefGenome=<%s>" % refgenome, file=FILE_OUT) print("##liftOverDate=<%s>" % datetime.date.today().strftime("%B%d,%Y"), file=FILE_OUT) print(line, file=FILE_OUT) print(line, file=UNMAP) printlog(["Lifting over ... 
"]) else: if line.startswith('#'): continue fields = str.split(line, maxsplit=7) total += 1 chrom = fields[0] start = int(fields[1]) - 1 # 0 based end = start + len(fields[3]) a = map_coordinates(mapping, chrom, start, end, '+') if a is None: print(line + "\tFail(Unmap)", file=UNMAP) fail += 1 continue if len(a) == 2: # update chrom target_chr = str( a[1][0] ) #target_chr is from chain file, could be 'chr1' or '1' target_start = a[1][1] target_end = a[1][2] fields[0] = target_chr # update start coordinate fields[1] = target_start + 1 # update ref allele target_chr = update_chromID(refFasta.references[0], target_chr) try: fields[3] = refFasta.fetch(target_chr, target_start, target_end).upper() except: print(line + "\tFail(KeyError)", file=UNMAP) fail += 1 continue # update END if any fields[7] = re.sub('END\=\d+', 'END=' + str(target_end), fields[7]) if a[1][3] == '-': fields[4] = revcomp_DNA(fields[4], True) if fields[3] != fields[4]: print('\t'.join(map(str, fields)), file=FILE_OUT) else: print(line + "\tFail(REF==ALT)", file=UNMAP) fail += 1 else: print(line + "\tFail(Multiple_hits)", file=UNMAP) fail += 1 continue FILE_OUT.close() UNMAP.close() printlog(["Total entries:", str(total)]) printlog(["Failed to map:", str(fail)])
def makemut(args, chrom, start, end, vaf, ins, avoid, alignopts): ''' is ins is a sequence, it will is inserted at start, otherwise delete from start to end''' if args.seed is not None: random.seed(int(args.seed) + int(start)) mutid = chrom + '_' + str(start) + '_' + str(end) + '_' + str(vaf) if ins is None: mutid += ':DEL' else: mutid += ':INS:' + ins bamfile = pysam.AlignmentFile(args.bamFileName, 'rb') bammate = pysam.AlignmentFile( args.bamFileName, 'rb') # use for mates to avoid iterator problems reffile = pysam.Fastafile(args.refFasta) vcffile = pysam.VariantFile(args.germline, 'r') if args.germline is not None else None tmpbams = [] is_insertion = ins is not None is_deletion = ins is None snvfrac = float(args.snvfrac) mutstr = get_mutstr(chrom, start, end, ins, reffile) del_ln = 0 if is_deletion: del_ln = end - start mutpos = start mutpos_list = [start] # optional CNV file cnv = None if (args.cnvfile): cnv = pysam.Tabixfile(args.cnvfile, 'r') log = open( 'addindel_logs_' + os.path.basename(args.outBamFile) + '/' + os.path.basename(args.outBamFile) + "." + "_".join( (chrom, str(start), str(end))) + ".log", 'w') tmpoutbamname = args.tmpdir + "/" + mutid + ".tmpbam." + str( uuid4()) + ".bam" logger.info("%s creating tmp bam: %s" % (mutid, tmpoutbamname)) outbam_muts = pysam.AlignmentFile(tmpoutbamname, 'wb', template=bamfile) mutfail, hasSNP, maxfrac, outreads, mutreads, mutmates = mutation.mutate( args, log, bamfile, bammate, chrom, mutpos, mutpos + del_ln + 1, mutpos_list, avoid=avoid, mutid_list=[mutid], is_insertion=is_insertion, is_deletion=is_deletion, ins_seq=ins, reffile=reffile, indel_start=start, indel_end=end, vcffile=vcffile) if mutfail: outbam_muts.close() os.remove(tmpoutbamname) return None # pick reads to change readlist = [] for extqname, read in outreads.items(): if read.seq != mutreads[extqname]: readlist.append(extqname) logger.info("%s len(readlist): %d" % (mutid, len(readlist))) readlist.sort() random.shuffle(readlist) if len(readlist) < int(args.mindepth): logger.warning("%s skipped, too few reads in region: %d" % (mutid, len(readlist))) outbam_muts.close() os.remove(tmpoutbamname) return None if vaf is None: vaf = float(args.mutfrac ) # default minor allele freq if not otherwise specified if cnv: # cnv file is present if chrom in cnv.contigs: for cnregion in cnv.fetch(chrom, start, end): cn = float( cnregion.strip().split()[3]) # expect chrom,start,end,CN logger.info(mutid + "\t" + ' '.join(("copy number in snp region:", chrom, str(start), str(end), "=", str(cn)))) if float(cn) > 0.0: vaf = vaf / float(cn) else: vaf = 0.0 logger.info("%s adjusted VAF: %f" % (mutid, vaf)) else: logger.info("%s selected VAF: %f" % (mutid, vaf)) lastread = int(len(readlist) * vaf) # pick at least args.minmutreads if possible if lastread < int(args.minmutreads): if len(readlist) > int(args.minmutreads): lastread = int(args.minmutreads) logger.warning("%s forced %d reads" % (mutid, lastread)) else: logger.warning( "%s dropped site with fewer reads than --minmutreads" % mutid) os.remove(tmpoutbamname) return None readtrack = dd(list) for readname in readlist: orig_name, readpos, pairend = readname.split(',') readtrack[orig_name].append('%s,%s' % (readpos, pairend)) usedreads = 0 newreadlist = [] for orig_name in readtrack: for read_instance in readtrack[orig_name]: newreadlist.append(orig_name + ',' + read_instance) usedreads += 1 if usedreads >= lastread: break readlist = newreadlist logger.info("%s picked: %d reads" % (mutid, len(readlist))) wrote = 0 nmut = 0 mut_out = {} # change 
reads from .bam to mutated sequences for extqname, read in outreads.items(): if read.seq != mutreads[extqname]: if not args.nomut and extqname in readlist: qual = read.qual # changing seq resets qual (see pysam API docs) read.seq = mutreads[extqname] # make mutation read.qual = qual nmut += 1 if not hasSNP or args.force: wrote += 1 mut_out[extqname] = read muts_written = {} for extqname in mut_out: if extqname not in muts_written: outbam_muts.write(mut_out[extqname]) muts_written[extqname] = True if mutmates[extqname] is not None: # is mate also in mutated list? mate_read = mutmates[extqname] pairname = 'F' # read is first in pair if mate_read.is_read2: pairname = 'S' # read is second in pair if not mate_read.is_paired: pairname = 'U' # read is unpaired mateqname = ','.join( (mate_read.qname, str(mate_read.pos), pairname)) if mateqname in mut_out: # yes: output mutated mate outbam_muts.write(mut_out[mateqname]) muts_written[mateqname] = True else: # no: output original mate outbam_muts.write(mate_read) logger.info("%s wrote: %d, mutated: %d" % (mutid, wrote, nmut)) if not hasSNP or args.force: outbam_muts.close() aligners.remap_bam(args.aligner, tmpoutbamname, args.refFasta, alignopts, threads=int(args.alignerthreads), mutid=mutid, paired=(not args.single), insane=args.insane) outbam_muts = pysam.AlignmentFile(tmpoutbamname, 'rb') coverwindow = 1 incover = countReadCoverage(bamfile, chrom, mutpos - coverwindow, mutpos + del_ln + coverwindow) outcover = countReadCoverage(outbam_muts, chrom, mutpos - coverwindow, mutpos + del_ln + coverwindow) avgincover = float(sum(incover)) / float(len(incover)) avgoutcover = float(sum(outcover)) / float(len(outcover)) spikein_frac = 0.0 if wrote > 0: spikein_frac = float(nmut) / float(wrote) # qc cutoff for final snv depth if (avgoutcover > 0 and avgincover > 0 and avgoutcover / avgincover >= float(args.coverdiff)) or args.force: tmpbams.append(tmpoutbamname) indelstr = '' if is_insertion: indelstr = ':'.join(('INS', chrom, str(start), ins)) else: indelstr = ':'.join(('DEL', chrom, str(start), str(end))) snvstr = chrom + ":" + str(start) + "-" + str( end) + " (VAF=" + str(vaf) + ")" log.write("\t".join(("indel", indelstr, str(mutpos), mutstr, str(avgincover), str(avgoutcover), str(spikein_frac), str(maxfrac))) + "\n") else: outbam_muts.close() os.remove(tmpoutbamname) if os.path.exists(tmpoutbamname + '.bai'): os.remove(tmpoutbamname + '.bai') logger.warning("%s dropped for outcover/incover < %s" % (mutid, str(args.coverdiff))) return None outbam_muts.close() bamfile.close() bammate.close() log.close() return sorted(tmpbams)
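# Hedged sketch of a countReadCoverage-style helper (bamsurgeon's implementation is
# not shown here and may count reads differently): per-base depth over a window of
# an indexed BAM, using pysam's count_coverage, which returns four arrays
# (A, C, G, T counts) that are summed per position.
import pysam

def read_coverage_sketch(bam, chrom, start, end):
    cov_acgt = bam.count_coverage(chrom, start, end)     # 4 arrays, each of length end - start
    return [sum(col) for col in zip(*cov_acgt)]

# usage (hypothetical file name):
# with pysam.AlignmentFile("sample.bam", "rb") as bam:
#     depths = read_coverage_sketch(bam, "chr1", 10000, 10100)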
def main(args): logger.info("starting %s called with args: %s" % (sys.argv[0], ' '.join(sys.argv))) bedfile = open(args.varFileName, 'r') reffile = pysam.Fastafile(args.refFasta) if not os.path.exists(args.bamFileName + '.bai'): logger.error("input bam must be indexed, not .bai file found for %s" % args.bamFileName) sys.exit(1) alignopts = {} if args.alignopts is not None: alignopts = dict([o.split(':') for o in args.alignopts.split(',')]) aligners.checkoptions(args.aligner, alignopts) # load readlist to avoid, if specified avoid = None if args.avoidreads is not None: avoid = dictlist(args.avoidreads) # make a temporary file to hold mutated reads outbam_mutsfile = "addindel." + str(uuid4()) + ".muts.bam" bamfile = pysam.AlignmentFile(args.bamFileName, 'rb') outbam_muts = pysam.AlignmentFile(outbam_mutsfile, 'wb', template=bamfile) outbam_muts.close() bamfile.close() tmpbams = [] if not os.path.exists(args.tmpdir): os.mkdir(args.tmpdir) logger.info("created tmp directory: %s" % args.tmpdir) if not os.path.exists('addindel_logs_' + os.path.basename(args.outBamFile)): os.mkdir('addindel_logs_' + os.path.basename(args.outBamFile)) logger.info("created directory: addindel_logs_%s" % os.path.basename(args.outBamFile)) assert os.path.exists('addindel_logs_' + os.path.basename(args.outBamFile) ), "could not create output directory!" assert os.path.exists(args.tmpdir), "could not create temporary directory!" pool = Pool(processes=int(args.procs)) results = [] ntried = 0 for bedline in bedfile: if ntried < int(args.numsnvs) or int(args.numsnvs) == 0: c = bedline.strip().split() chrom = c[0] start = int(c[1]) end = int(c[2]) vaf = float(c[3]) type = c[4] ins = None assert type in ('INS', 'DEL') if type == 'INS': ins = c[5] # make mutation (submit job to thread pool) result = pool.apply_async( makemut, [args, chrom, start, end, vaf, ins, avoid, alignopts]) results.append(result) ntried += 1 for result in results: tmpbamlist = result.get() if tmpbamlist is not None: for tmpbam in tmpbamlist: if os.path.exists(tmpbam): tmpbams.append(tmpbam) if len(tmpbams) == 0: logger.error("no succesful mutations") sys.exit() tmpbams.sort() # merge tmp bams if len(tmpbams) == 1: os.rename(tmpbams[0], outbam_mutsfile) elif len(tmpbams) > 1: mergebams(tmpbams, outbam_mutsfile, maxopen=int(args.maxopen)) bedfile.close() # cleanup for bam in tmpbams: if os.path.exists(bam): os.remove(bam) if os.path.exists(bam + '.bai'): os.remove(bam + '.bai') if os.listdir(args.tmpdir) == []: os.rmdir(args.tmpdir) if args.skipmerge: logger.info("skipping merge, plase merge reads from %s manually." % outbam_mutsfile) else: if args.tagreads: from bamsurgeon.markreads import markreads tmp_tag_bam = 'tag.%s.bam' % str(uuid4()) markreads(outbam_mutsfile, tmp_tag_bam) move(tmp_tag_bam, outbam_mutsfile) logger.info("tagged reads.") logger.info("done making mutations, merging mutations into %s --> %s" % (args.bamFileName, args.outBamFile)) replace(args.bamFileName, outbam_mutsfile, args.outBamFile, seed=args.seed) #cleanup os.remove(outbam_mutsfile) var_basename = '.'.join(os.path.basename(args.varFileName).split('.')[:-1]) bam_basename = '.'.join(os.path.basename(args.outBamFile).split('.')[:-1]) vcf_fn = bam_basename + '.addindel.' + var_basename + '.vcf' makevcf.write_vcf_indel( 'addindel_logs_' + os.path.basename(args.outBamFile), args.refFasta, vcf_fn) logger.info('vcf output written to ' + vcf_fn)
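# A minimal sketch of what a mergebams()-style step can look like with pysam's
# samtools wrappers (the real bamsurgeon helper batches open file handles and may
# differ): merge the per-mutation temporary BAMs, then sort and index the result.
# The function and file names here are hypothetical.
import os
import pysam

def merge_tmp_bams_sketch(tmpbams, out_bam):
    if len(tmpbams) == 1:
        pysam.sort("-o", out_bam, tmpbams[0])
    else:
        merged = out_bam + ".unsorted.bam"
        pysam.merge("-f", merged, *tmpbams)   # -f: overwrite existing output
        pysam.sort("-o", out_bam, merged)
        os.remove(merged)
    pysam.index(out_bam)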
if __name__ == '__main__':
    # script to move all relevant files (env samples: BAM, FQ, CNS_5) from NGS_runa to data3/sewer
    # user input
    bam_dir = argv[1]
    min_depth = int(argv[2])  # currently 5, may change later
    refseq_path = argv[3]

    # preparations
    bam_dir = bam_dir + '/' if not bam_dir.endswith('/') else bam_dir  # make sure path ends with '/'
    # get reference name
    refseq_name = os.path.basename(refseq_path).strip('.fasta')
    # index refseq with samtools (via pysam)
    pysam.faidx(refseq_path)
    refseq_series = pd.Series(
        [x for x in pysam.Fastafile(refseq_path).fetch(reference=refseq_name)])
    excel_mutTable = pd.read_excel(
        "/data/projects/Dana/scripts/covid19/mutationsTable.xlsx",
        sheet_name=None, engine='openpyxl')
    for name in excel_mutTable:
        frame = excel_mutTable[name]
        excel_mutTable[name] = frame[frame['Mutation type'].str.lower() != 'insertion']
        excel_mutTable[name]['lineage'] = name  # add a lineage column to every variant's table
    # uniq_lineages = [lin.rsplit('_', 1)[0] for lin in excel_mutTable]
    uniq_lineages = excel_mutTable.keys()
    muttable_by_lineage = excel_mutTable
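# Illustrative sketch (not from the original script) of how min_depth could be
# applied per position: compute per-base depth from an indexed BAM with pysam and
# report reference positions whose coverage falls below the threshold. The BAM
# path and contig name are hypothetical.
import numpy as np
import pysam

def low_coverage_positions(bam_path, chrom, chrom_len, min_depth=5):
    with pysam.AlignmentFile(bam_path, "rb") as bam:
        depth = np.array(bam.count_coverage(chrom, 0, chrom_len)).sum(axis=0)
    return np.where(depth < min_depth)[0]  # 0-based positions that could be masked as 'N'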
def testFTPView(self):
    if not check_url(self.url):
        return
    with pysam.Fastafile(self.url) as f:
        self.assertEqual(len(f.fetch("chr1", 0, 1000)), 1000)
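# Hedged sketch of a check_url()-style guard (the test suite's own helper is not
# shown and may differ): skip the remote-FASTA test when the FTP/HTTP resource
# cannot be reached, so a network outage does not fail the suite.
import urllib.error
import urllib.request

def check_url_sketch(url, timeout=10):
    try:
        with urllib.request.urlopen(url, timeout=timeout):
            return True
    except (urllib.error.URLError, OSError):
        return False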
sam_fh_in.getrname(r.tid)))
        # NOTE pysam's set_tag inferred value_type 'd', which is
        # undefined according to
        # https://samtools.github.io/hts-specs/SAMv1.pdf
        r.set_tag(IDENT_TAG, round(ident * 100, 1),
                  value_type='f', replace=REPLACE_TAG)
        sam_fh_out.write(r)


if __name__ == "__main__":
    REPLACE_TAG = False

    try:
        sam_in, sam_out, ref_fa = sys.argv[1:]
    except ValueError:
        sys.stderr.write(
            "FATAL: Need input and output BAM (stdout supported) as well as the"
            " (indexed) reference as (only) arguments (but got {})\n".format(
                ' '.join(sys.argv[1:])))
        sys.exit(1)
    assert not os.path.exists(sam_out)

    fasta_fh = pysam.Fastafile(ref_fa)
    sam_fh_in = pysam.Samfile(sam_in)  # mode automatically inferred

    out_mode = 'w'
    if sam_out.endswith(".bam"):
        out_mode += "b"
    sam_fh_out = pysam.Samfile(sam_out, out_mode, template=sam_fh_in)

    main(sam_fh_in, sam_fh_out, fasta_fh)
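# A hedged sketch of how the `ident` value tagged above could be derived (the
# original script's computation is not shown and may use a different definition):
# walk the read's aligned pairs against the reference and treat columns where
# either side is missing (indels) as non-identical. The helper name is hypothetical.
def percent_identity_sketch(read, fasta_fh):
    matches = 0
    columns = 0
    ref_name = read.reference_name
    for qpos, rpos in read.get_aligned_pairs():
        columns += 1
        if qpos is None or rpos is None:  # indel column
            continue
        ref_base = fasta_fh.fetch(ref_name, rpos, rpos + 1).upper()
        if read.query_sequence[qpos].upper() == ref_base:
            matches += 1
    return matches / columns if columns else 0.0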