def crossmap_vcf_file(mapping, infile, outfile, liftoverfile, refgenome, noCompAllele=False, compress=False):
    '''
    Convert genome coordinates in VCF format.

    Lifted variants are written to `outfile`. Variants that cannot be lifted
    are written to `outfile` + '.unmap' with one extra trailing column that
    names the reason: Fail(Unmap), Fail(Multiple_hits), Fail(KeyError) or
    Fail(REF==ALT).

    Parameters
    ----------
    mapping : dict
        Dictionary with source chrom name as key, IntervalTree object as value.

    infile : file
        Input file in VCF format. Can be a regular or compressed (*.gz, *.Z,*.z, *.bz,
        *.bz2, *.bzip2) file, local file or URL (http://, https://, ftp://) pointing to
        remote file.

    outfile : str
        Name of the output file. Unmapped variants go to `outfile` + '.unmap'.

    liftoverfile : file
        Chain (https://genome.ucsc.edu/goldenPath/help/chain.html) format file. Can be a
        regular or compressed (*.gz, *.Z,*.z, *.bz, *.bz2, *.bzip2) file, local file or
        URL (http://, https://, ftp://) pointing to remote file. Only recorded in the
        output header here; the parsed chains are passed in through `mapping`.

    refgenome : file
        The genome sequence file of 'target' assembly in FASTA format.

    noCompAllele : bool
        If True, do NOT compare ref_allele to alt_allele after liftover: every
        uniquely lifted variant is kept. If False (default), a variant whose
        ref_allele == alt_allele after liftover is written to the unmap file.

    compress : bool
        If True, compress `outfile` with the external `gzip` program once all
        variants have been written.
    '''
    if noCompAllele:
        printlog(["Keep variants [reference_allele == alternative_allele] ..."])
    else:
        printlog(["Filter out variants [reference_allele == alternative_allele] ..."])

    # index reference genome file if it hasn't been done
    if not os.path.exists(refgenome + '.fai'):
        printlog(["Creating index for", refgenome])
        pysam.faidx(refgenome)

    refFasta = pysam.Fastafile(refgenome)

    # meta-information lines copied verbatim to both mapped and unmapped files
    shared_meta = ('##fileformat', '##INFO', '##FILTER', '##FORMAT', '##ALT',
                   '##SAMPLE', '##PEDIGREE')

    # Anchored at a word boundary so that only the END key is rewritten; the
    # unanchored pattern also matched the tail of keys such as CIEND=0,5.
    end_pattern = re.compile(r'\bEND=\d+')

    total = 0
    fail = 0
    withChr = False  # check if the VCF data lines use 'chr1' or '1'

    # `with` guarantees both outputs are flushed and closed even on error.
    with open(outfile, 'w') as FILE_OUT, open(outfile + '.unmap', 'w') as UNMAP:
        for line in ireader.reader(infile):
            line = line.strip()
            if not line:
                continue

            if line.startswith(shared_meta):
                print(line, file=FILE_OUT)
                print(line, file=UNMAP)

            # meta-information lines needed in unmapped files only
            elif line.startswith('##assembly'):
                print(line, file=UNMAP)
            elif line.startswith('##contig'):
                print(line, file=UNMAP)
                if 'ID=chr' in line:
                    withChr = True

            # column header: emit the target assembly's contigs and the
            # liftover provenance just before it
            elif line.startswith('#CHROM'):
                printlog(["Updating contig field ... "])
                target_gsize = dict(zip(refFasta.references, refFasta.lengths))
                assembly = os.path.basename(refgenome)
                for chr_id in sorted(target_gsize):
                    # write contig IDs in the same style ('chr1' vs '1') as the input
                    if chr_id.startswith('chr'):
                        contig_id = chr_id if withChr else chr_id.replace('chr', '')
                    else:
                        contig_id = 'chr' + chr_id if withChr else chr_id
                    print("##contig=<ID=%s,length=%d,assembly=%s>" % (contig_id, target_gsize[chr_id], assembly), file=FILE_OUT)

                print("##liftOverProgram=<CrossMap,version=%s,website=https://sourceforge.net/projects/crossmap>" % __version__, file=FILE_OUT)
                print("##liftOverChainFile=<%s>" % liftoverfile, file=FILE_OUT)
                print("##originalFile=<%s>" % infile, file=FILE_OUT)
                print("##targetRefGenome=<%s>" % refgenome, file=FILE_OUT)
                print("##liftOverDate=<%s>" % datetime.date.today().strftime("%B%d,%Y"), file=FILE_OUT)
                print(line, file=FILE_OUT)
                print(line, file=UNMAP)
                printlog(["Lifting over ... "])

            else:
                # any other header/comment line is dropped
                if line.startswith('#'):
                    continue

                # keep everything from INFO onwards in one field (fields[7])
                fields = str.split(line, maxsplit=7)
                total += 1

                chrom = fields[0]
                start = int(fields[1]) - 1  # 0 based
                end = start + len(fields[3])

                a = map_coordinates(mapping, chrom, start, end, '+')
                if a is None:
                    print(line + "\tFail(Unmap)", file=UNMAP)
                    fail += 1
                    continue

                if len(a) != 2:
                    print(line + "\tFail(Multiple_hits)", file=UNMAP)
                    fail += 1
                    continue

                # unique hit: a[1] is indexed as (chrom, start, end, strand)
                target_chr = str(a[1][0])  # target_chr is from chain file, could be 'chr1' or '1'
                target_start = a[1][1]
                target_end = a[1][2]

                # update chrom and (1-based) start coordinate
                fields[0] = target_chr
                fields[1] = target_start + 1

                # update ref allele from the target genome; the lookup uses the
                # chromosome ID style of the FASTA file
                target_chr = update_chromID(refFasta.references[0], target_chr)
                try:
                    fields[3] = refFasta.fetch(target_chr, target_start, target_end).upper()
                except Exception:
                    # `Exception` instead of a bare except so Ctrl-C is not
                    # silently turned into a failed variant
                    print(line + "\tFail(KeyError)", file=UNMAP)
                    fail += 1
                    continue

                # update END if any
                fields[7] = end_pattern.sub('END=' + str(target_end), fields[7])

                # the alt allele follows the strand of the target interval
                if a[1][3] == '-':
                    fields[4] = revcomp_DNA(fields[4], True)

                # check if ref_allele is the same as alt_allele
                if noCompAllele or fields[3] != fields[4]:
                    print('\t'.join(map(str, fields)), file=FILE_OUT)
                else:
                    print(line + "\tFail(REF==ALT)", file=UNMAP)
                    fail += 1

    printlog(["Total entries:", str(total)])
    printlog(["Failed to map:", str(fail)])

    if compress:
        printlog(["Compressing \"%s\" ..." % outfile])
        try:
            # argument list (no shell): file names with spaces or shell
            # metacharacters are passed to gzip untouched
            subprocess.call(["gzip", outfile])
        except OSError as err:
            # compression stays best-effort, but the failure is reported
            printlog(["Warning: ", "failed to compress \"%s\": %s" % (outfile, err)])
# (input file suffix, format label used in messages, pysam open mode)
_BAM_INPUT_FORMATS = (
    ('.bam', 'BAM', 'rb'),
    ('.cram', 'CRAM', 'rc'),
    ('.sam', 'SAM', 'r'),
)

# every liftover outcome that is counted (and optionally added as a tag)
_BAM_OUTCOME_TAGS = ('QF', 'NN', 'NU', 'NM', 'UN', 'UU', 'UM', 'MN', 'MU', 'MM',
                     'SN', 'SM', 'SU')


def _bam_new_alignment(old_alignment):
    """Create the output record, carrying over read name, bases, qualities and tags."""
    new_alignment = pysam.AlignedRead()  # create AlignedRead object
    new_alignment.query_name = old_alignment.query_name  # 1st column. read name.
    new_alignment.query_sequence = old_alignment.query_sequence  # 10th column. read sequence. all bases.
    new_alignment.query_qualities = old_alignment.query_qualities  # 11th column. read sequence quality. all bases.
    new_alignment.set_tags(old_alignment.get_tags())  # 12 - columns

    # by default pysam will change RG:Z to RG:A, which can cause downstream failures with GATK and freebayes
    # Thanks Wolfgang Resch <*****@*****.**> identified this bug and provided solution.
    try:
        rg, rgt = old_alignment.get_tag("RG", with_value_type=True)
    except KeyError:
        pass
    else:
        new_alignment.set_tag("RG", str(rg), rgt)
    return new_alignment


def _bam_clear_position(new_alignment, old_alignment):
    """Blank the read's own coordinates (columns 3-6) while keeping its CIGAR."""
    new_alignment.reference_id = -1
    new_alignment.reference_start = 0
    new_alignment.mapping_quality = 255
    new_alignment.cigartuples = old_alignment.cigartuples


def _bam_clear_mate(new_alignment):
    """Blank the mate coordinates and template length (columns 7-9)."""
    new_alignment.next_reference_id = -1
    new_alignment.next_reference_start = 0
    new_alignment.template_length = 0


def _bam_set_mate(new_alignment, mate_hit, name_to_id):
    """Point the mate fields (flag 0x20, columns 7-8) at the lifted mate interval."""
    if mate_hit[3] == '-':
        new_alignment.flag = new_alignment.flag | 0x20
    new_alignment.next_reference_id = name_to_id[mate_hit[0]]
    new_alignment.next_reference_start = mate_hit[1]


def _bam_follow_mate(new_alignment, old_alignment, mate_hit, name_to_id, mapq):
    """Place a read that has no position of its own at its mate's lifted position."""
    if mate_hit[3] == '-':
        new_alignment.flag = new_alignment.flag | 0x20
    # recommend to set the RNAME and POS of unmapped read to its mate's
    new_alignment.reference_id = name_to_id[mate_hit[0]]
    new_alignment.reference_start = mate_hit[1]
    new_alignment.mapping_quality = mapq
    new_alignment.cigartuples = old_alignment.cigartuples
    new_alignment.next_reference_id = name_to_id[mate_hit[0]]
    new_alignment.next_reference_start = mate_hit[1]
    new_alignment.template_length = 0


def _bam_copy_oriented(new_alignment, old_alignment, strand_flipped):
    """Copy the CIGAR; if the strand changed, also reverse it and the bases/qualities."""
    if not strand_flipped:
        new_alignment.cigartuples = old_alignment.cigartuples  # 6
        return
    new_alignment.cigartuples = old_alignment.cigartuples[::-1]  # 6, reverse cigar tuple
    new_alignment.query_sequence = revcomp_DNA(old_alignment.query_sequence)  # 10, reverse complement read sequence
    # 11, reverse quality string. Reads without qualities (None) are passed
    # through instead of crashing on None[::-1]; the original guarded this in
    # only one of its four copies of this code.
    qualities = old_alignment.query_qualities
    new_alignment.query_qualities = qualities[::-1] if qualities is not None else None


def _bam_lift_read(mapping, samfile, old_alignment):
    """Lift the read's own aligned interval; may raise if the record is inconsistent."""
    read_chr = samfile.get_reference_name(old_alignment.reference_id)
    read_strand = '-' if old_alignment.is_reverse else '+'
    return map_coordinates(mapping, read_chr, old_alignment.reference_start,
                           old_alignment.reference_end, read_strand)


def _bam_lift_mate(mapping, samfile, old_alignment):
    """Lift the mate's start position (as a 1-bp interval); None when that fails."""
    try:
        read2_chr = samfile.get_reference_name(old_alignment.next_reference_id)
        read2_strand = '-' if old_alignment.mate_is_reverse else '+'
        read2_start = old_alignment.next_reference_start
        return map_coordinates(mapping, read2_chr, read2_start, read2_start + 1, read2_strand)
    except Exception:
        # any lookup/mapping error is treated as "mate failed to liftover"
        return None


def _bam_convert_paired(mapping, samfile, old_alignment, new_alignment, name_to_id, IS_size, IS_std, fold):
    """Fill `new_alignment` for a paired-end read; return its outcome tag, or None to drop it."""
    new_alignment.flag = 0x1  # pair-end in sequencing
    if old_alignment.is_read1:
        new_alignment.flag = new_alignment.flag | 0x40
    elif old_alignment.is_read2:
        new_alignment.flag = new_alignment.flag | 0x80

    # QC failed reads are written unplaced
    if old_alignment.is_qcfail:
        new_alignment.flag = new_alignment.flag | 0x200
        _bam_clear_position(new_alignment, old_alignment)
        _bam_clear_mate(new_alignment)
        return 'QF'

    #==================================
    # R1 originally unmapped
    #==================================
    if old_alignment.is_unmapped:
        new_alignment.flag = new_alignment.flag | 0x4
        _bam_clear_position(new_alignment, old_alignment)

        read2_maps = None
        if not old_alignment.mate_is_unmapped:
            read2_maps = _bam_lift_mate(mapping, samfile, old_alignment)

        # R2 originally unmapped, or failed to liftover
        if read2_maps is None:
            _bam_clear_mate(new_alignment)
            return 'NN'

        _bam_follow_mate(new_alignment, old_alignment, read2_maps[1], name_to_id,
                         old_alignment.mapping_quality)
        if len(read2_maps) == 2:  # R2 unique
            return 'NU'
        new_alignment.flag = new_alignment.flag | 0x100  # R2 multiple
        return 'NM'

    #==================================
    # R1 is originally mapped
    #==================================
    try:
        read1_maps = _bam_lift_read(mapping, samfile, old_alignment)
    except Exception:
        read1_maps = None

    read2_maps = None
    if not old_alignment.mate_is_unmapped:
        read2_maps = _bam_lift_mate(mapping, samfile, old_alignment)

    #------------------------------------
    # R1 failed to liftover
    #------------------------------------
    if read1_maps is None:
        # The read has no position in the target assembly, so it must carry
        # the unmapped flag in every sub-case. The original set 0x4 only when
        # the mate was missing too, leaving NU/NM records that looked mapped
        # at the mate's position.
        new_alignment.flag = new_alignment.flag | 0x4

        # read2 is unmapped or failed to convert
        if read2_maps is None:
            _bam_clear_position(new_alignment, old_alignment)
            _bam_clear_mate(new_alignment)
            return 'NN'

        # read2 is unique mapped
        if len(read2_maps) == 2:
            _bam_follow_mate(new_alignment, old_alignment, read2_maps[1], name_to_id,
                             old_alignment.mapping_quality)
            return 'NU'

        # read2 is multiple mapped
        new_alignment.flag = new_alignment.flag | 0x100
        _bam_follow_mate(new_alignment, old_alignment, read2_maps[1], name_to_id,
                         255)  # mapq not available
        return 'NM'

    #------------------------------------
    # R1 uniquely mapped
    #------------------------------------
    if len(read1_maps) == 2:
        read1_hit = read1_maps[1]
        # col2 - col5
        if read1_hit[3] == '-':
            new_alignment.flag = new_alignment.flag | 0x10
        new_alignment.reference_id = name_to_id[read1_hit[0]]
        new_alignment.reference_start = read1_hit[1]
        new_alignment.mapping_quality = old_alignment.mapping_quality
        _bam_copy_oriented(new_alignment, old_alignment, read1_maps[0][3] != read1_hit[3])

        # R2 unmapped before or after conversion
        if read2_maps is None:
            new_alignment.flag = new_alignment.flag | 0x8
            new_alignment.next_reference_id = name_to_id[read1_hit[0]]
            new_alignment.next_reference_start = read1_hit[1]
            new_alignment.template_length = 0
            return 'UN'

        _bam_set_mate(new_alignment, read2_maps[1], name_to_id)

        # R2 is unique mapped
        if len(read2_maps) == 2:
            new_alignment.template_length = abs(
                new_alignment.reference_start - new_alignment.next_reference_start
            ) + old_alignment.reference_length
            # proper pair: opposite strands and insert size within
            # fold * IS_std of IS_size
            if (read2_maps[1][3] != read1_hit[3]) and (
                    IS_size - fold * IS_std <= new_alignment.template_length <= IS_size + fold * IS_std):
                new_alignment.flag = new_alignment.flag | 0x2
            return 'UU'

        # R2 is multiple mapped
        new_alignment.flag = new_alignment.flag | 0x100
        new_alignment.template_length = 0
        return 'UM'

    #------------------------------------
    # R1 multiple mapped
    #------------------------------------
    if len(read1_maps) > 2 and len(read1_maps) % 2 == 0:
        read1_hit = read1_maps[1]
        new_alignment.flag = new_alignment.flag | 0x100
        if read1_hit[3] == '-':
            new_alignment.flag = new_alignment.flag | 0x10
        # 3-5 (modern attribute names instead of the deprecated tid/pos/mapq)
        new_alignment.reference_id = name_to_id[read1_hit[0]]
        new_alignment.reference_start = read1_hit[1]
        new_alignment.mapping_quality = 255
        _bam_copy_oriented(new_alignment, old_alignment, read1_maps[0][3] != read1_hit[3])

        # (1) R2 is unmapped
        if read2_maps is None:
            new_alignment.flag = new_alignment.flag | 0x8
            new_alignment.next_reference_id = name_to_id[read1_hit[0]]
            new_alignment.next_reference_start = read1_hit[1]
            new_alignment.template_length = 0
            return 'MN'

        _bam_set_mate(new_alignment, read2_maps[1], name_to_id)
        new_alignment.template_length = 0

        # (2) read2 is unique mapped
        if len(read2_maps) == 2:
            return 'MU'

        # (3) R2 is multiple mapped
        new_alignment.flag = new_alignment.flag | 0x100
        return 'MM'

    # unexpected shape of the liftover result: nothing is written
    return None


def _bam_convert_single(mapping, samfile, old_alignment, new_alignment, name_to_id):
    """Fill `new_alignment` for a single-end read; return its outcome tag, or None to drop it."""
    # 7-9
    _bam_clear_mate(new_alignment)

    # (1) originally unmapped
    if old_alignment.is_unmapped:
        new_alignment.flag = new_alignment.flag | 0x4
        _bam_clear_position(new_alignment, old_alignment)
        return 'SN'

    new_alignment.flag = 0x0
    read_maps = _bam_lift_read(mapping, samfile, old_alignment)

    # (2) unmapped after liftover
    if read_maps is None:
        new_alignment.flag = new_alignment.flag | 0x4
        new_alignment.reference_id = -1
        new_alignment.reference_start = 0
        new_alignment.mapping_quality = 255
        return 'SN'

    is_unique = len(read_maps) == 2
    is_multiple = len(read_maps) > 2 and len(read_maps) % 2 == 0
    if not (is_unique or is_multiple):
        # unexpected shape of the liftover result: nothing is written
        return None

    # (3) unique mapped / (4) multiple mapped
    read_hit = read_maps[1]
    if is_multiple:
        new_alignment.flag = new_alignment.flag | 0x100
    if read_hit[3] == '-':
        new_alignment.flag = new_alignment.flag | 0x10
    _bam_copy_oriented(new_alignment, old_alignment, read_maps[0][3] != read_hit[3])
    # 3-5
    new_alignment.reference_id = name_to_id[read_hit[0]]
    new_alignment.reference_start = read_hit[1]
    new_alignment.mapping_quality = old_alignment.mapping_quality
    return 'SU' if is_unique else 'SM'


def crossmap_bam_file(mapping, chainfile, infile, outfile_prefix, chrom_size, IS_size=200, IS_std=30.0, fold=3, addtag=True):
    '''
    Description
    -----------
    Convert genome coordinates (in BAM/SAM format) between assemblies.
    BAM/SAM format: http://samtools.sourceforge.net/
    chrom_size is target chromosome size

    Every input alignment is rebuilt as a new record: read name, bases,
    qualities and tags are copied, and flag, position, CIGAR orientation and
    mate fields are recomputed from the liftover result. For multiple hits the
    first target interval is used. BAM/CRAM output written to a file is then
    sorted and indexed. A summary of the outcome counts is printed to stdout.

    Parameters
    ----------
    mapping : dict
        Dictionary with source chrom name as key, IntervalTree object as value.

    chainfile : file
        Input chain format file. Only recorded in the output header here.

    infile : file
        Input BAM, SAM or CRAM format file. The format is chosen from the file
        suffix ('.bam', '.cram' or '.sam', case-insensitive); the program exits
        with status 1 for any other suffix or for a file without header.

    outfile_prefix : str
        Output prefix. '.bam' (for BAM and CRAM input) or '.sam' is appended.
        If None, alignments are written to stdout and no sorting/indexing is done.

    chrom_size : dict
        Chromosome size of the *target* assembly, used to build bam header.

    IS_size : int
        Average insert size of pair-end sequencing.

    IS_std : float
        Standard deviation of insert size.

    fold : float
        A mapped pair is considered as "proper pair" if both ends mapped to
        different strand and the distance between them is less than fold * stdev
        from the mean.

    addtag : bool
        if addtag is set to True, will add tags to each alignment:
            Q = QC (QC failed)
            N = unmapped (originally unmapped or originally mapped but failed
                to liftover to new assembly)
            M = multiple mapped (alignment can be liftover to multiple places)
            U = unique mapped (alignment can be liftover to only 1 place)

        tags for pair-end sequencing include:
            QF: QC failed
            NN: both read1 and read2 unmapped
            NU: read1 unmapped, read2 unique mapped
            NM: read1 unmapped, read2 multiple mapped
            UN: read1 uniquely mapped, read2 unmap
            UU: both read1 and read2 uniquely mapped
            UM: read1 uniquely mapped, read2 multiple mapped
            MN: read1 multiple mapped, read2 unmapped
            MU: read1 multiple mapped, read2 unique mapped
            MM: both read1 and read2 multiple mapped

        tags for single-end sequencing include:
            SN: unmapped
            SM: multiple mapped
            SU: uniquely mapped

        NOTE(review): "NM" is also the predefined SAM tag for edit distance,
        so tagging a read NM replaces an existing edit-distance value.
    '''
    # determine the input file format (BAM, CRAM or SAM) from the file suffix
    infile_lower = infile.lower()
    for suffix, file_type, read_mode in _BAM_INPUT_FORMATS:
        if infile_lower.endswith(suffix):
            break
    else:
        print("Unknown file type! Input file must have suffix '.bam','.cram', or '.sam'.", file=sys.stderr)
        sys.exit(1)

    comments = ['ORIGINAL_%s_FILE=%s' % (file_type, infile)]
    samfile = pysam.Samfile(infile, read_mode)
    if len(samfile.header) == 0:
        print("%s file has no header section. Exit!" % file_type, file=sys.stderr)
        sys.exit(1)
    comments.append('CHAIN_FILE=' + chainfile)

    sam_ori_header = samfile.header.to_dict()

    # chromosome ID style of the original BAM file
    chrom_style = sam_ori_header['SQ'][0]['SN']  # either 'chr1' or '1'

    # update chrom_size of target genome so its IDs follow the input's style
    target_chrom_sizes = {}
    for n, l in chrom_size.items():
        target_chrom_sizes[update_chromID(chrom_style, n)] = l

    (new_header, name_to_id) = sam_header.bam_header_generator(
        orig_header=sam_ori_header, chrom_size=target_chrom_sizes,
        prog_name="CrossMap", prog_ver=__version__, format_ver=1.0,
        sort_type='coordinate', co=comments)

    is_sam = file_type == 'SAM'
    if outfile_prefix is not None:
        # write to file (CRAM input is written back as BAM)
        out_name = outfile_prefix + ('.sam' if is_sam else '.bam')
        OUT_FILE = pysam.Samfile(out_name, "wh" if is_sam else "wb", header=new_header)
        printlog(["Liftover %s file:" % file_type, infile, '==>', out_name])
    else:
        # write to screen
        OUT_FILE = pysam.Samfile('-', "w" if is_sam else "wb", header=new_header)
        printlog(["Liftover %s file:" % file_type, infile])

    counts = dict.fromkeys(_BAM_OUTCOME_TAGS, 0)
    total_item = 0

    try:
        for old_alignment in samfile:
            total_item += 1
            new_alignment = _bam_new_alignment(old_alignment)

            if old_alignment.is_paired:
                tag = _bam_convert_paired(mapping, samfile, old_alignment, new_alignment,
                                          name_to_id, IS_size, IS_std, fold)
            else:
                tag = _bam_convert_single(mapping, samfile, old_alignment, new_alignment,
                                          name_to_id)

            if tag is None:
                # liftover returned an unexpected number of intervals; such
                # reads are skipped (counted in the total only)
                continue

            counts[tag] += 1
            if addtag:
                new_alignment.set_tag(tag=tag, value=0)
            OUT_FILE.write(new_alignment)
        printlog(["Done!"])
    finally:
        # release both handles even if a record could not be converted
        OUT_FILE.close()
        samfile.close()

    if outfile_prefix is not None and file_type in ("BAM", "CRAM"):
        unsorted_bam = outfile_prefix + '.bam'
        sorted_bam = outfile_prefix + '.sorted.bam'
        # sorting and indexing are best-effort: failures only produce a warning
        try:
            printlog(['Sort "%s" and save as "%s"' % (unsorted_bam, sorted_bam)])
            pysam.sort("-o", sorted_bam, unsorted_bam)
        except Exception:
            printlog(["Warning: ", "output BAM file was NOT sorted"])
        try:
            printlog(['Index "%s" ...' % sorted_bam])
            pysam.index(sorted_bam, sorted_bam + '.bai')
        except Exception:
            printlog(["Warning: ", "output BAM file was NOT indexed."])

    paired_report = (
        ("\tR1 unique, R2 unique (UU): ", 'UU'),
        ("\tR1 unique, R2 unmapp (UN): ", 'UN'),
        ("\tR1 unique, R2 multiple (UM): ", 'UM'),
        ("\tR1 multiple, R2 multiple (MM): ", 'MM'),
        ("\tR1 multiple, R2 unique (MU): ", 'MU'),
        ("\tR1 multiple, R2 unmapped (MN): ", 'MN'),
        ("\tR1 unmap, R2 unmap (NN): ", 'NN'),
        ("\tR1 unmap, R2 unique (NU): ", 'NU'),
        ("\tR1 unmap, R2 multiple (NM): ", 'NM'),
    )
    single_report = (
        ("\tUniquley mapped (SU): ", 'SU'),
        ("\tMultiple mapped (SM): ", 'SM'),
        ("\tUnmapped (SN): ", 'SN'),
    )

    print("Total alignments:" + str(total_item))
    print(" QC failed: " + str(counts['QF']))
    if any(counts[tag] > 0 for _, tag in paired_report):
        print(" Paired-end reads:")
        for label, tag in paired_report:
            print(label + str(counts[tag]))
    if any(counts[tag] > 0 for _, tag in single_report):
        print(" Single-end reads:")
        for label, tag in single_report:
            print(label + str(counts[tag]))
def crossmap_maf_file(mapping, infile, outfile, liftoverfile, refgenome, ref_name):
    '''
    Convert genome coordinates in MAF (mutation annotation format) format.

    Lifted records are written to `outfile`; records that cannot be lifted
    (no hit, multiple hits, or reference sequence not retrievable) are written
    unchanged to `outfile` + '.unmap'.

    Parameters
    ----------
    mapping : dict
        Dictionary with source chrom name as key, IntervalTree object as value.

    infile : file
        Input file in MAF format. Can be a regular or compressed (*.gz, *.Z,*.z, *.bz,
        *.bz2, *.bzip2) file, local file or URL (http://, https://, ftp://) pointing to
        remote file.

    outfile : str
        Name of the output file. Unmapped records go to `outfile` + '.unmap'.

    liftoverfile : file
        Chain (https://genome.ucsc.edu/goldenPath/help/chain.html) format file. Can be a
        regular or compressed (*.gz, *.Z,*.z, *.bz, *.bz2, *.bzip2) file, local file or
        URL (http://, https://, ftp://) pointing to remote file. Only recorded in the
        output header here; the parsed chains are passed in through `mapping`.

    refgenome : file
        The genome sequence file of 'target' assembly in FASTA format.

    ref_name : str
        The NCBI build name of the target assembly, for example, "GRCh37", "GRCh38".
        Written into the 4th column of every lifted record.
    '''
    # index reference genome file if it hasn't been done
    if not os.path.exists(refgenome + '.fai'):
        logging.info("Creating index for: %s", refgenome)
        pysam.faidx(refgenome)
    # Compare modification times (as crossmap_gvcf_file does): ctime also
    # changes on chmod/rename and so does not tell whether the content is newer.
    if os.path.getmtime(refgenome + '.fai') < os.path.getmtime(refgenome):
        logging.info("Index file is older than reference genome. Re-creating index for: %s", refgenome)
        pysam.faidx(refgenome)

    refFasta = pysam.Fastafile(refgenome)

    total = 0
    fail = 0

    # `with` guarantees both outputs are flushed and closed even on error.
    with open(outfile, 'w') as FILE_OUT, open(outfile + '.unmap', 'w') as UNMAP:
        for line in ireader.reader(infile):
            line = line.strip()
            if not line:
                continue

            # meta-information lines needed in both mapped and unmapped files
            if line.startswith('#'):
                print(line, file=FILE_OUT)
                print(line, file=UNMAP)
                continue

            # column header: record the liftover provenance just before it
            if line.startswith('Hugo_Symbol'):
                print("#liftOver: Program=%sv%s, Time=%s, ChainFile=%s, NewRefGenome=%s" % ("CrossMap", __version__, datetime.date.today().strftime("%B%d,%Y"), liftoverfile, refgenome), file=FILE_OUT)
                print(line, file=FILE_OUT)
                print(line, file=UNMAP)
                logging.info("Lifting over ... ")
                continue

            # Data record. Columns used (0-based): 3 = build name,
            # 4 = chromosome, 5 = start (1-based), 6 = end, 10 = reference allele.
            fields = str.split(line, sep='\t')
            total += 1

            fields[3] = ref_name
            chrom = fields[4]
            start = int(fields[5]) - 1  # 0 based
            end = int(fields[6])
            # strand = fields[7]

            a = map_coordinates(mapping, chrom, start, end, '+')

            # no hit, or more than one hit: the record is reported as unmapped
            if a is None or len(a) != 2:
                print(line, file=UNMAP)
                fail += 1
                continue

            # unique hit: a[1] is indexed as (chrom, start, end, strand)
            target_chr = str(a[1][0])  # target_chr is from chain file, could be 'chr1' or '1'
            target_start = a[1][1]
            target_end = a[1][2]

            # update chrom, start coordinate (1-based) and end
            fields[4] = target_chr
            fields[5] = target_start + 1
            fields[6] = target_end

            # update ref allele from the target genome; the lookup uses the
            # chromosome ID style of the FASTA file
            try:
                target_chr = update_chromID(refFasta.references[0], target_chr)
                fields[10] = refFasta.fetch(target_chr, target_start, target_end).upper()
            except Exception:
                # `Exception` instead of a bare except so Ctrl-C is not
                # silently turned into a failed record
                print(line, file=UNMAP)
                fail += 1
                continue

            # NOTE(review): this reverse-complements the reference allele that
            # was just fetched from the target genome, whereas the VCF routine
            # reverse-complements the ALT allele instead - confirm intent.
            if a[1][3] == '-':
                fields[10] = revcomp_DNA(fields[10], True)
            print('\t'.join(map(str, fields)), file=FILE_OUT)

    logging.info("Total entries: %d", total)
    logging.info("Failed to map: %d", fail)
def crossmap_gvcf_file(mapping, infile, outfile, liftoverfile, refgenome, noCompAllele=False, compress=False, cstyle='a'):
    '''
    Convert genome coordinates in GVCF format.

    Header lines are filtered and re-emitted, '##contig' lines are
    regenerated from `refgenome`, and each data line is lifted over either
    as a non-variant region (line contains 'END=') or as a variant.
    Records that fail are written to outfile + '.unmap' with a trailing
    'Fail(...)' column.

    Parameters
    ----------
    mapping : dict
        Dictionary with source chrom name as key, IntervalTree object as value.

    infile : file
        Input file in GVCF format. Can be a regular or compressed (*.gz, *.Z,
        *.z, *.bz, *.bz2, *.bzip2) file, local file or URL (http://, https://,
        ftp://) pointing to remote file.

    outfile : str
        Name of the output file. Unmapped records go to outfile + '.unmap'.

    liftoverfile : file
        Chain (https://genome.ucsc.edu/goldenPath/help/chain.html) format
        file. Only its name is used here (written to the header).

    refgenome : file
        The genome sequence file of 'target' assembly in FASTA format.

    noCompAllele : bool
        If False (default), a lifted variant whose new reference allele
        equals its first alternative allele is written to the unmap file as
        'Fail(REF==ALT)'. If True, no comparison is made and such variants
        are kept.

    compress : bool
        If True, run 'gzip' on `outfile` after it has been written.

    cstyle : str, optional
        Chromosome ID style. Must be one of ['a', 's', 'l'], where
        'a' : as-is. The chromosome ID of the output file is in the same
            style of the input file.
        's' : short ID, such as "1", "2", "X".
        'l' : long ID, such as "chr1", "chr2", "chrX".
    '''
    # NOTE(review): a second crossmap_gvcf_file with a shorter signature is
    # defined later in this file; being defined later, it replaces this one
    # when the module is imported. One of the two should be removed.

    if noCompAllele:
        logging.info("Keep variants [reference_allele == alternative_allele] ...")
    else:
        logging.info("Filter out variants [reference_allele == alternative_allele] ...")

    # Index the reference genome if it has not been done, and rebuild the
    # index when the FASTA file is newer than the index.
    index_path = refgenome + '.fai'
    if not os.path.exists(index_path):
        logging.info("Creating index for: %s", refgenome)
        pysam.faidx(refgenome)
    if os.path.getmtime(index_path) < os.path.getmtime(refgenome):
        logging.info("Index file is older than reference genome. Re-creating index for: %s", refgenome)
        pysam.faidx(refgenome)

    refFasta = pysam.Fastafile(refgenome)

    # meta-information lines needed in both mapped and unmapped files
    shared_meta = ('##fileformat', '##INFO', '##FILTER', '##FORMAT', '##ALT',
                   '##SAMPLE', '##PEDIGREE', '##GVCFBlock',
                   '##GATKCommandLine', '##source')

    total_var = 0
    failed_var = 0
    total_region = 0
    failed_region = 0

    # Style template handed to update_chromID for the regenerated contig
    # lines. It is refined by '##contig' lines; defaulting to the short
    # style keeps inputs without any '##contig' line from raising
    # UnboundLocalError at the '#CHROM' line.
    chr_template = '1'

    with open(outfile, 'w') as FILE_OUT, open(outfile + '.unmap', 'w') as UNMAP:
        for line in ireader.reader(infile):
            line = line.strip()
            if not line:
                continue

            if line.startswith(shared_meta):
                print(line, file=FILE_OUT)
                print(line, file=UNMAP)
                continue

            # meta-information lines needed in unmapped files
            if line.startswith('##assembly'):
                print(line, file=UNMAP)
                continue
            if line.startswith('##contig'):
                print(line, file=UNMAP)
                chr_template = 'chr1' if 'ID=chr' in line else '1'
                continue

            # update contig information
            if line.startswith('#CHROM'):
                logging.info("Updating contig field ... ")
                target_gsize = dict(zip(refFasta.references, refFasta.lengths))
                assembly = os.path.basename(refgenome)
                # Every reference sequence gets a contig line; the ID style
                # is delegated to update_chromID, so sequences not named
                # 'chr*' must not be skipped.
                for chr_id in sorted(target_gsize):
                    print("##contig=<ID=%s,length=%d,assembly=%s>"
                          % (update_chromID(chr_template, chr_id, cstyle),
                             target_gsize[chr_id], assembly),
                          file=FILE_OUT)

                print("##liftOverProgram=<CrossMap,version=%s,website=https://sourceforge.net/projects/crossmap>" % __version__, file=FILE_OUT)
                print("##liftOverChainFile=<%s>" % liftoverfile, file=FILE_OUT)
                print("##originalFile=<%s>" % infile, file=FILE_OUT)
                print("##targetRefGenome=<%s>" % refgenome, file=FILE_OUT)
                print("##liftOverDate=<%s>" % datetime.date.today().strftime("%B%d,%Y"), file=FILE_OUT)
                print(line, file=FILE_OUT)
                print(line, file=UNMAP)
                logging.info("Lifting over ... ")
                continue

            # any other header line is dropped
            if line.startswith('#'):
                continue

            # process non-variant region
            if 'END=' in line:
                fields = line.split(maxsplit=8)
                total_region += 1
                chrom = fields[0]
                start = int(fields[1]) - 1  # 0 based

                m = re.search(r"END\=(\d+)", line)
                if m is None:
                    # 'END=' present but not followed by digits
                    print(line + "\tFail(Unmap)", file=UNMAP)
                    failed_region += 1
                    continue
                end = int(m.group(1))

                a = map_coordinates(mapping, chrom, start, end, '+', chrom_style=cstyle)
                if a is None:
                    print(line + "\tFail(Unmap)", file=UNMAP)
                    failed_region += 1
                    continue
                if len(a) != 2:
                    # Report ambiguous regions instead of dropping them
                    # silently, mirroring the variant branch below.
                    print(line + "\tFail(Multiple_hits)", file=UNMAP)
                    failed_region += 1
                    continue

                hit = a[1]
                # target chrom is from chain file, could be 'chr1' or '1'
                fields[0] = str(hit[0])
                # update start coordinate (1 based)
                fields[1] = hit[1] + 1
                # update END
                fields[7] = fields[7].replace('END=' + str(end), 'END=' + str(hit[2]))
                print('\t'.join(map(str, fields)), file=FILE_OUT)
                continue

            # process variant line
            fields = line.split(maxsplit=7)
            total_var += 1
            chrom = fields[0]
            start = int(fields[1]) - 1      # 0 based, ref_allele start
            end = start + len(fields[3])    # ref_allele end
            # first ALT allele, e.g. 'A' from:
            # 20 10000598 . T A,<NON_REF> 1754.77 . DP=54;
            alt_allele = fields[4].replace(' ', '').split(',')[0]

            a = map_coordinates(mapping, chrom, start, end, '+', chrom_style=cstyle)
            if a is None:
                print(line + "\tFail(Unmap)", file=UNMAP)
                failed_var += 1
                continue
            if len(a) != 2:
                print(line + "\tFail(Multiple_hits)", file=UNMAP)
                failed_var += 1
                continue

            hit = a[1]
            # target_chr is from chain file, could be 'chr1' or '1'
            target_chr = str(hit[0])
            target_start = hit[1]
            target_end = hit[2]
            fields[0] = target_chr
            # update start coordinate (1 based)
            fields[1] = target_start + 1

            # update ref allele; the chromosome ID is first converted to the
            # naming style of the FASTA file's first sequence.
            try:
                fasta_chr = update_chromID(refFasta.references[0], target_chr)
                fields[3] = refFasta.fetch(fasta_chr, target_start, target_end).upper()
            except Exception:
                print(line + "\tFail(No_targetRef)", file=UNMAP)
                failed_var += 1
                # Without this the record would also be written to the
                # mapped file with its stale reference allele.
                continue

            if hit[3] == '-':
                # NOTE(review): only the first ALT allele is carried over on
                # a minus-strand hit; further alleles are replaced by
                # '<NON_REF>'. Kept as in the original implementation.
                alt_allele = revcomp_DNA(alt_allele, True)
                fields[4] = alt_allele + ',<NON_REF>'

            # Compare REF with the first ALT allele itself: the whole ALT
            # column ('A,<NON_REF>') can never equal a reference allele.
            if noCompAllele or fields[3] != alt_allele:
                print('\t'.join(map(str, fields)), file=FILE_OUT)
            else:
                print(line + "\tFail(REF==ALT)", file=UNMAP)
                failed_var += 1

    logging.info("Total variants: %d" % total_var)
    logging.info("Variants failed to map: %d" % failed_var)
    logging.info("Total non-variant regions: %d" % total_region)
    logging.info("Non-variant regions failed to map: %d" % failed_region)

    if compress:
        logging.info("Compressing \"%s\" ..." % outfile)
        try:
            # Argument list, no shell: file names containing spaces or shell
            # metacharacters are passed to gzip verbatim.
            subprocess.call(["gzip", outfile])
        except OSError as err:
            # Compression is best effort (e.g. gzip is not installed).
            logging.warning("Failed to compress \"%s\": %s", outfile, err)
def crossmap_gvcf_file(mapping, infile, outfile, liftoverfile, refgenome):
    '''
    Convert genome coordinates in GVCF format.

    Header lines are filtered and re-emitted, '##contig' lines are
    regenerated from `refgenome`, and each data line is lifted over either
    as a non-variant region (line contains 'END=') or as a variant.
    Records that fail are written to outfile + '.unmap' with a trailing
    'Fail(...)' column.

    Parameters
    ----------
    mapping : dict
        Dictionary with source chrom name as key, IntervalTree object as value.

    infile : file
        Input file in GVCF format. Can be a regular or compressed (*.gz, *.Z,
        *.z, *.bz, *.bz2, *.bzip2) file, local file or URL (http://, https://,
        ftp://) pointing to remote file.

    outfile : str
        Name of the output file. Unmapped records go to outfile + '.unmap'.

    liftoverfile : file
        Chain (https://genome.ucsc.edu/goldenPath/help/chain.html) format
        file. Only its name is used here (written to the header).

    refgenome : file
        The genome sequence file of 'target' assembly in FASTA format.
    '''
    # NOTE(review): an earlier crossmap_gvcf_file (taking noCompAllele,
    # compress and cstyle) is defined above in this file; this later
    # definition replaces it when the module is imported. One of the two
    # should be removed.

    # index reference genome file if it hasn't been done
    if not os.path.exists(refgenome + '.fai'):
        printlog(["Creating index for", refgenome])
        pysam.faidx(refgenome)

    refFasta = pysam.Fastafile(refgenome)

    # meta-information lines needed in both mapped and unmapped files
    shared_meta = ('##fileformat', '##INFO', '##FILTER', '##FORMAT', '##ALT',
                   '##SAMPLE', '##PEDIGREE', '##GVCFBlock',
                   '##GATKCommandLine', '##source')

    total_var = 0
    failed_var = 0
    total_region = 0
    failed_region = 0
    withChr = False  # set when a '##contig' line uses 'chr'-prefixed IDs

    # 'with' guarantees both outputs are flushed and closed even when a
    # malformed record raises part-way through the input.
    with open(outfile, 'w') as FILE_OUT, open(outfile + '.unmap', 'w') as UNMAP:
        for line in ireader.reader(infile):
            line = line.strip()
            if not line:
                continue

            if line.startswith(shared_meta):
                print(line, file=FILE_OUT)
                print(line, file=UNMAP)
                continue

            # meta-information lines needed in unmapped files
            if line.startswith('##assembly'):
                print(line, file=UNMAP)
                continue
            if line.startswith('##contig'):
                print(line, file=UNMAP)
                if 'ID=chr' in line:
                    withChr = True
                continue

            # update contig information
            if line.startswith('#CHROM'):
                printlog(["Updating contig field ... "])
                target_gsize = dict(zip(refFasta.references, refFasta.lengths))
                assembly = os.path.basename(refgenome)
                for chr_id in sorted(target_gsize):
                    # Emit the ID in the style seen in the input header.
                    if chr_id.startswith('chr'):
                        contig_id = chr_id if withChr else chr_id.replace('chr', '')
                    else:
                        contig_id = 'chr' + chr_id if withChr else chr_id
                    print("##contig=<ID=%s,length=%d,assembly=%s>"
                          % (contig_id, target_gsize[chr_id], assembly),
                          file=FILE_OUT)

                print("##liftOverProgram=<CrossMap,version=%s,website=https://sourceforge.net/projects/crossmap>" % __version__, file=FILE_OUT)
                print("##liftOverChainFile=<%s>" % liftoverfile, file=FILE_OUT)
                print("##originalFile=<%s>" % infile, file=FILE_OUT)
                print("##targetRefGenome=<%s>" % refgenome, file=FILE_OUT)
                print("##liftOverDate=<%s>" % datetime.date.today().strftime("%B%d,%Y"), file=FILE_OUT)
                print(line, file=FILE_OUT)
                print(line, file=UNMAP)
                printlog(["Lifting over ... "])
                continue

            # any other header line is dropped
            if line.startswith('#'):
                continue

            # process non-variant region
            if 'END=' in line:
                fields = line.split(maxsplit=8)
                total_region += 1
                chrom = fields[0]
                start = int(fields[1]) - 1  # 0 based

                m = re.search(r"END\=(\d+)", line)
                if m is None:
                    # 'END=' present but not followed by digits
                    print(line + "\tFail(Unmap)", file=UNMAP)
                    failed_region += 1
                    continue
                end = int(m.group(1))

                a = map_coordinates(mapping, chrom, start, end, '+')
                if a is None:
                    print(line + "\tFail(Unmap)", file=UNMAP)
                    failed_region += 1
                    continue
                if len(a) != 2:
                    # Report ambiguous regions instead of dropping them
                    # silently, mirroring the variant branch below.
                    print(line + "\tFail(Multiple_hits)", file=UNMAP)
                    failed_region += 1
                    continue

                hit = a[1]
                # target chrom is from chain file, could be 'chr1' or '1'
                fields[0] = str(hit[0])
                # update start coordinate (1 based)
                fields[1] = hit[1] + 1
                # update END
                fields[7] = fields[7].replace('END=' + str(end), 'END=' + str(hit[2]))
                print('\t'.join(map(str, fields)), file=FILE_OUT)
                continue

            # process variant line
            fields = line.split(maxsplit=7)
            total_var += 1
            chrom = fields[0]
            start = int(fields[1]) - 1      # 0 based, ref_allele start
            end = start + len(fields[3])    # ref_allele end
            # first ALT allele, e.g. 'A' from:
            # 20 10000598 . T A,<NON_REF> 1754.77 . DP=54;
            alt_allele = fields[4].replace(' ', '').split(',')[0]

            a = map_coordinates(mapping, chrom, start, end, '+')
            if a is None:
                print(line + "\tFail(Unmap)", file=UNMAP)
                failed_var += 1
                continue
            if len(a) != 2:
                print(line + "\tFail(Multiple_hits)", file=UNMAP)
                failed_var += 1
                continue

            hit = a[1]
            # target_chr is from chain file, could be 'chr1' or '1'
            target_chr = str(hit[0])
            target_start = hit[1]
            target_end = hit[2]
            fields[0] = target_chr
            # update start coordinate (1 based)
            fields[1] = target_start + 1

            # update ref allele; the chromosome ID is first converted to the
            # naming style of the FASTA file's first sequence. A failed
            # lookup fails this record only, not the whole run.
            try:
                fasta_chr = update_chromID(refFasta.references[0], target_chr)
                fields[3] = refFasta.fetch(fasta_chr, target_start, target_end).upper()
            except Exception:
                print(line + "\tFail(No_targetRef)", file=UNMAP)
                failed_var += 1
                continue

            if hit[3] == '-':
                # NOTE(review): only the first ALT allele is carried over on
                # a minus-strand hit; further alleles are replaced by
                # '<NON_REF>'. Kept as in the original implementation.
                alt_allele = revcomp_DNA(alt_allele, True)
                fields[4] = alt_allele + ',<NON_REF>'

            # REF is compared with the first ALT allele as it is written
            # out, i.e. after reverse-complementing on a minus-strand hit.
            if fields[3] != alt_allele:
                print('\t'.join(map(str, fields)), file=FILE_OUT)
            else:
                print(line + "\tFail(REF==ALT)", file=UNMAP)
                failed_var += 1

    printlog(["Total variants:", str(total_var)])
    printlog(["Variants failed to map:", str(failed_var)])
    printlog(["Total non-variant regions:", str(total_region)])
    printlog(["Non-variant regions failed to map:", str(failed_region)])
def _wig_liftover_to_bedgraph(mapping, records, bgr_path, skip_errors=False):
    '''Lift (chrom, start, end, score) records over and write the uniquely
    mapped ones to `bgr_path` as bedGraph lines. Returns the chromosome ID of
    the last input record, or 'chr1' when there were no records.'''
    chrom_style = 'chr1'
    with open(bgr_path, 'w') as bgr:
        for chrom, start, end, score in records:
            chrom_style = chrom
            maps = map_coordinates(mapping, chrom, start, end, '+')
            try:
                # keep only intervals that map to exactly one target location
                if maps is None or len(maps) != 2:
                    continue
                hit = maps[1]
                print('\t'.join(str(i) for i in (hit[0], hit[1], hit[2], score)),
                      file=bgr)
            except Exception:
                if skip_errors:
                    continue
                raise
    return chrom_style


def _wig_merge_bedgraph(bgr_path, sorted_bgr_path):
    '''Write the entries produced by bgrMerge.merge(bgr_path) to
    `sorted_bgr_path`, then delete `bgr_path`.'''
    with open(sorted_bgr_path, 'w') as sorted_bgr:
        for chrom, start, end, score in bgrMerge.merge(bgr_path):
            print('\t'.join(str(i) for i in (chrom, start, end, score)),
                  file=sorted_bgr)
    os.remove(bgr_path)  # remove .bgr, keep .sorted.bgr


def _wig_write_bigwig(sorted_bgr_path, bw_path, header):
    '''Create the bigWig file `bw_path` with `header` (list of (chrom, size)
    tuples) and one entry per line of `sorted_bgr_path`.'''
    bigwig = pyBigWig.open(bw_path, "w")
    try:
        logging.info("Writing header to \"%s\" ..." % bw_path)
        bigwig.addHeader(header)

        logging.info("Writing entries to \"%s\" ..." % bw_path)
        for line in ireader.reader(sorted_bgr_path):
            r_chr, r_st, r_end, r_value = line.split()
            bigwig.addEntries([r_chr], [int(r_st)], ends=[int(r_end)],
                              values=[float(r_value)])
    finally:
        bigwig.close()


def crossmap_wig_file(mapping, in_file, out_prefix, taget_chrom_size, in_format, binSize=100000):
    '''
    Description
    -----------
    Convert genome coordinates (in wiggle/bigwig format) between assemblies.
    wiggle format: http://genome.ucsc.edu/goldenPath/help/wiggle.html
    bigwig format: http://genome.ucsc.edu/goldenPath/help/bigWig.html

    The input is lifted over into a temporary bedGraph file
    (out_prefix + '.bgr', deleted afterwards), merged into
    out_prefix + '.sorted.bgr', and finally written to out_prefix + '.bw'.
    Intervals that do not map to exactly one target location are skipped.

    Parameters
    ----------
    mapping : dict
        Dictionary with source chrom name as key, IntervalTree object as value.

    in_file : file
        Input file in wig or bigwig format. Both "variableStep" and
        "fixedStep" wiggle lines are supported.

    out_prefix : str
        Prefix of output files.

    taget_chrom_size : dict
        Chromosome size of the target genome assembly. Key is chromosome ID,
        value is the length of the chromosome. Note, the chromosome ID and
        length information were extracted from the chain file, therefore, the
        chrom_IDs can be with or without the leading "chr".

    in_format : str
        Either "wiggle" or "bigwig" (case-insensitive).

    binSize : int
        The chunk size when reading bigwig file in each iteration.
        NOTE(review): accepted for compatibility but not used by this
        function; it is not forwarded to bigwigReader.

    Raises
    ------
    Exception
        If `in_format` is neither "wiggle" nor "bigwig". The check runs
        before any output file is created.
    '''
    fmt = in_format.upper()
    # Validate first so that a bad format does not leave empty, still-open
    # .bgr/.sorted.bgr/.bw files behind.
    if fmt not in ("WIGGLE", "BIGWIG"):
        raise Exception("Unknown foramt. Must be 'wiggle' or 'bigwig'")

    bgr_path = out_prefix + '.bgr'                # original bgr file
    sorted_bgr_path = out_prefix + '.sorted.bgr'  # sorted bgr file
    bw_path = out_prefix + '.bw'                  # bigwig file

    if fmt == "WIGGLE":
        logging.info("Liftover wiggle file \"%s\" to bedGraph file \"%s\"" % (in_file, bgr_path))
        # wiggleReader also yields a strand, which is not used for liftover
        records = ((chrom, start, end, score)
                   for chrom, start, end, _strand, score in wiggleReader(in_file))
    else:
        logging.info("Liftover bigwig file %s to bedGraph file %s:" % (in_file, bgr_path))
        records = bigwigReader(in_file)

    # Errors while writing a lifted interval are skipped for bigwig input
    # only, as in the original implementation.
    chrom_style = _wig_liftover_to_bedgraph(
        mapping, records, bgr_path, skip_errors=(fmt == "BIGWIG"))

    logging.info("Merging overlapped entries in bedGraph file")
    _wig_merge_bedgraph(bgr_path, sorted_bgr_path)

    # make bigwig header; chromosome IDs are restyled after the last
    # chromosome ID seen in the input
    target_chroms_sorted = [(update_chromID(chrom_style, k), taget_chrom_size[k])
                            for k in sorted(taget_chrom_size)]

    _wig_write_bigwig(sorted_bgr_path, bw_path, target_chroms_sorted)