def wiggleReader(f):
    '''
    Read wiggle (http://genome.ucsc.edu/goldenPath/help/wiggle) file of
    different styles.

    Parameters
    ----------
    f : file
        file in wiggle format. Can be fixedStep, variableStep, or bed4

    Yields
    ------
    chrom, start, end, strand, score
        ``start`` is 0-based for variableStep/fixedStep records (the
        position read from the file minus one); ``end`` is ``start`` plus
        the declared span (1 when the header has no ``span``). For bed-style
        records the coordinates are taken from the line unchanged and the
        strand comes from column 6 when the line has more than five columns.

    Raises
    ------
    ValueError
        If a data line is met while the reader is in an unknown mode.
    '''
    current_chrom = None
    current_pos = None
    current_step = None
    # Defined up front so a data line can never meet an unbound span.
    current_span = 1

    # always for wiggle data
    strand = '+'

    # Lines are interpreted as bed4 until a step declaration is seen.
    mode = "bed"

    for line in ireader.reader(f):
        if line.isspace() or line.startswith(("track", "#", "browser")):
            continue
        elif line.startswith("variableStep"):
            header = parse_header(line)
            current_chrom = header['chrom']
            current_pos = None
            current_step = None
            current_span = int(header['span']) if 'span' in header else 1
            mode = "variableStep"
        elif line.startswith("fixedStep"):
            header = parse_header(line)
            current_chrom = header['chrom']
            # wiggle positions are 1-based; convert to 0-based
            current_pos = int(header['start']) - 1
            current_step = int(header['step'])
            current_span = int(header['span']) if 'span' in header else 1
            mode = "fixedStep"
        elif mode == "bed":
            fields = line.split()
            # Lines with three or fewer columns carry no score and are skipped.
            if len(fields) > 3:
                if len(fields) > 5:
                    yield fields[0], int(fields[1]), int(
                        fields[2]), fields[5], float(fields[3])
                else:
                    yield fields[0], int(fields[1]), int(
                        fields[2]), strand, float(fields[3])
        elif mode == "variableStep":
            fields = line.split()
            pos = int(fields[0]) - 1
            yield current_chrom, pos, pos + current_span, strand, float(
                fields[1])
        elif mode == "fixedStep":
            yield current_chrom, current_pos, current_pos + current_span, strand, float(
                line.split()[0])
            current_pos += current_step
        else:
            # Raising a plain string is itself a TypeError in Python 3;
            # raise a real exception carrying the same message.
            raise ValueError("Unexpected input line: %s" % line.strip())
def read_chain_file(chain_file, print_table=False):
    '''
    Read chain file.

    Parameters
    ----------
    chain_file : file
        Chain format file. Input chain_file could be either plain text,
        compressed file (".gz",".Z", ".z", ".bz", ".bz2", ".bzip2"), or a
        URL pointing to the chain file ("http://","https://", "ftp://"). If
        url was used, chain file must be plain text.

    print_table : bool, optional
        Print mappings in human readable table.

    Returns
    -------
    maps : dict
        Dictionary with source chrom name as key, IntervalTree object as
        value. An IntervalTree contains many intervals. An interval is a
        start and end position and a value. eg. Interval(11, 12,
        strand="-", value = "abc")

    target_chromSize : dict
        Chromosome sizes of target genome

    source_chromSize : dict
        Chromosome sizes of source genome

    Raises
    ------
    Exception
        If the source strand is not '+', the target strand is neither '+'
        nor '-', a line is neither a chain header nor a 1- or 3-column
        alignment line, or an alignment line appears before any chain
        header.
    '''
    logging.info("Read the chain file \"%s\" " % chain_file)
    maps = {}
    target_chromSize = {}
    source_chromSize = {}
    blocks = []
    # Alignment lines are only meaningful after a 'chain' header was parsed.
    header_seen = False

    for line in ireader.reader(chain_file):
        # Example: chain 4900 chrY 58368225 + 25985403 25985638 chr5 151006098 - 43257292 43257528 1
        line = line.strip()
        if not line or line.startswith('#'):
            continue
        fields = line.split()

        if fields[0] == 'chain' and len(fields) in [12, 13]:
            source_name = fields[2]  # E.g. chrY
            source_size = int(fields[3])  # Full length of the chromosome
            source_strand = fields[4]  # Must be +
            if source_strand != '+':
                raise Exception(
                    "Source strand in a chain file must be +. (%s)" % line)
            source_start = int(fields[5])  # Start of source region
            target_name = fields[7]  # E.g. chr5
            target_size = int(fields[8])  # Full length of the chromosome
            target_strand = fields[9]  # + or -
            target_start = int(fields[10])
            target_chromSize[target_name] = target_size
            source_chromSize[source_name] = source_size

            if target_strand not in ['+', '-']:
                raise Exception("Target strand must be - or +. (%s)" % line)
            if source_name not in maps:
                maps[source_name] = Intersecter()

            # Running offsets of the next ungapped block in each genome.
            sfrom, tfrom = source_start, target_start
            header_seen = True
            continue

        # Everything else must be an alignment line: "size dt dq" (3 columns)
        # or the closing "size" (1 column) of the current chain.
        if fields[0] == 'chain' or len(fields) not in (1, 3) or not header_seen:
            raise Exception("Invalid chain format. (%s)" % line)

        size = int(fields[0])
        if len(fields) == 3:
            sgap, tgap = int(fields[1]), int(fields[2])

        if target_strand == '+':
            target_span = (target_name, tfrom, tfrom + size, target_strand)
        else:
            # '-' strand offsets are converted using the chromosome size
            # taken from the chain header.
            target_span = (target_name, target_size - (tfrom + size),
                           target_size - tfrom, target_strand)

        if print_table:
            blocks.append(
                (source_name, sfrom, sfrom + size, source_strand) + target_span)
        maps[source_name].add_interval(
            Interval(sfrom, sfrom + size, target_span))

        # Only 3-column lines carry gaps that advance the offsets.
        if len(fields) == 3:
            sfrom += size + sgap
            tfrom += size + tgap

    if print_table:
        for i in blocks:
            print('\t'.join([str(n) for n in i]))

    return (maps, target_chromSize, source_chromSize)
def crossmap_vcf_file(mapping, infile, outfile, liftoverfile, refgenome, noCompAllele=False, compress=False):
    '''
    Convert genome coordinates in VCF format.

    Parameters
    ----------
    mapping : dict
        Dictionary with source chrom name as key, IntervalTree object as
        value.

    infile : file
        Input file in VCF format. Can be a regular or compressed (*.gz,
        *.Z,*.z, *.bz, *.bz2, *.bzip2) file, local file or URL (http://,
        https://, ftp://) pointing to remote file.

    outfile : str
        prefix of output files. Unmapped records go to ``outfile + '.unmap'``.

    liftoverfile : file
        Chain (https://genome.ucsc.edu/goldenPath/help/chain.html) format
        file. Only its name is used here, to annotate the output header.

    refgenome : file
        The genome sequence file of 'target' assembly in FASTA format.

    noCompAllele : bool
        If True, ref_allele is NOT compared to alt_allele after liftover and
        every mapped variant is kept. If False, a variant whose lifted
        ref_allele equals alt_allele is written to the unmap file.

    compress : bool
        If True, run ``gzip`` on the output file after writing it.
    '''
    if noCompAllele:
        printlog(
            ["Keep variants [reference_allele == alternative_allele] ..."])
    else:
        printlog([
            "Filter out variants [reference_allele == alternative_allele] ..."
        ])

    #index refegenome file if it hasn't been done
    if not os.path.exists(refgenome + '.fai'):
        printlog(["Creating index for", refgenome])
        pysam.faidx(refgenome)

    refFasta = pysam.Fastafile(refgenome)

    # Meta-information lines copied verbatim to both output files.
    shared_meta = ('##fileformat', '##INFO', '##FILTER', '##FORMAT', '##ALT',
                   '##SAMPLE', '##PEDIGREE')
    end_tag = re.compile(r'END=\d+')

    total = 0
    fail = 0
    withChr = False  # check if the VCF data lines use 'chr1' or '1'

    with open(outfile, 'w') as FILE_OUT, open(outfile + '.unmap', 'w') as UNMAP:
        for line in ireader.reader(infile):
            line = line.strip()
            if not line:
                continue

            if line.startswith(shared_meta):
                print(line, file=FILE_OUT)
                print(line, file=UNMAP)

            #meta-information lines needed in unmapped files
            elif line.startswith('##assembly'):
                print(line, file=UNMAP)
            elif line.startswith('##contig'):
                print(line, file=UNMAP)
                if 'ID=chr' in line:
                    withChr = True

            #update contig information
            elif line.startswith('#CHROM'):
                printlog(["Updating contig field ... "])
                target_gsize = dict(
                    list(zip(refFasta.references, refFasta.lengths)))
                for chr_id in sorted(target_gsize):
                    # Write contig IDs in the style of the input header.
                    if chr_id.startswith('chr'):
                        out_id = chr_id if withChr else chr_id.replace('chr', '')
                    else:
                        out_id = 'chr' + chr_id if withChr else chr_id
                    print("##contig=<ID=%s,length=%d,assembly=%s>" %
                          (out_id, target_gsize[chr_id],
                           os.path.basename(refgenome)),
                          file=FILE_OUT)

                print(
                    "##liftOverProgram=<CrossMap,version=%s,website=https://sourceforge.net/projects/crossmap>"
                    % __version__,
                    file=FILE_OUT)
                print("##liftOverChainFile=<%s>" % liftoverfile, file=FILE_OUT)
                print("##originalFile=<%s>" % infile, file=FILE_OUT)
                print("##targetRefGenome=<%s>" % refgenome, file=FILE_OUT)
                print("##liftOverDate=<%s>" %
                      datetime.date.today().strftime("%B%d,%Y"),
                      file=FILE_OUT)
                print(line, file=FILE_OUT)
                print(line, file=UNMAP)
                printlog(["Lifting over ... "])

            else:
                # any other header line is dropped
                if line.startswith('#'):
                    continue
                fields = line.split(maxsplit=7)
                total += 1

                chrom = fields[0]
                start = int(fields[1]) - 1  # 0 based
                end = start + len(fields[3])
                a = map_coordinates(mapping, chrom, start, end, '+')

                if a is None:
                    print(line + "\tFail(Unmap)", file=UNMAP)
                    fail += 1
                    continue

                if len(a) != 2:
                    print(line + "\tFail(Multiple_hits)", file=UNMAP)
                    fail += 1
                    continue

                #target_chr is from chain file, could be 'chr1' or '1'
                target_chr = str(a[1][0])
                target_start = a[1][1]
                target_end = a[1][2]

                # update chrom
                fields[0] = target_chr

                # update start coordinate
                fields[1] = target_start + 1

                # update ref allele; the name is converted to the style of
                # the reference FASTA before fetching
                target_chr = update_chromID(refFasta.references[0], target_chr)
                try:
                    fields[3] = refFasta.fetch(target_chr, target_start,
                                               target_end).upper()
                except Exception:
                    print(line + "\tFail(KeyError)", file=UNMAP)
                    fail += 1
                    continue

                # update END if any
                fields[7] = end_tag.sub('END=' + str(target_end), fields[7])

                if a[1][3] == '-':
                    fields[4] = revcomp_DNA(fields[4], True)

                # check if ref_allele is the same as alt_allele
                if noCompAllele or fields[3] != fields[4]:
                    print('\t'.join(map(str, fields)), file=FILE_OUT)
                else:
                    print(line + "\tFail(REF==ALT)", file=UNMAP)
                    fail += 1

    printlog(["Total entries:", str(total)])
    printlog(["Failed to map:", str(fail)])

    if compress:
        printlog(["Compressing \"%s\" ..." % outfile])
        try:
            # Argument list instead of a shell string: file names containing
            # spaces or shell metacharacters are passed through untouched.
            subprocess.call(["gzip", outfile])
        except OSError as err:
            # Compression is best effort; report instead of failing the run.
            printlog(["Failed to compress \"%s\": %s" % (outfile, err)])
def crossmap_bed_file(mapping, inbed, outfile=None, unmapfile=None, cstyle='a'):
    '''
    Convert genome coordinates (in bed format) between assemblies.
    BED format: http://genome.ucsc.edu/FAQ/FAQformat.html#format1

    Lines with fewer than 12 columns are lifted as a single interval (and
    may be split into several target intervals). Lines with exactly 12 or
    20 columns are lifted exon by exon and only reported as mapped when
    every exon maps uniquely to the same chromosome and strand. Lines with
    any other column count produce no output.

    Parameters
    ----------
    mapping : dict
        Dictionary with source chrom name as key, IntervalTree object as
        value.

    inbed : file
        Input BED file.

    outfile : str, optional
        Prefix of output files. When None, results are printed to screen.

    unmapfile: str, optional
        Name of file to save unmapped entries. This option will be ignored
        if outfile is None. Defaults to ``outfile + '.unmap'``.

    cstyle : str, optional
        Chromosome ID style. Must be one of ['a', 's', 'l'], where
        'a' : as-is. The chromosome ID of the output file is in the same
            style of the input file.
        's' : short ID, such as "1", "2", "X.
        'l' : long ID, such as "chr1", "chr2", "chrX.

    Raises
    ------
    Exception
        If a 12- or 20-column line has a strand other than '+' or '-'.
    '''
    FILE_OUT = None
    UNMAP = None
    try:
        # check if 'outfile' was set. If not set, print to screen, if set, print to file
        if outfile is not None:
            FILE_OUT = open(outfile, 'w')
            if unmapfile is not None:
                UNMAP = open(unmapfile, 'w')
            else:
                UNMAP = open(outfile + '.unmap', 'w')

        for line in ireader.reader(inbed):
            if line.startswith(('#', 'track', 'browser')):
                continue
            if not line.strip():
                continue
            line = line.strip()
            fields = line.split()
            strand = '+'

            # filter out line less than 3 columns
            if len(fields) < 3:
                print("Less than 3 fields. skip " + line, file=sys.stderr)
                if UNMAP is not None:
                    print(line + '\tInvalidBedFormat', file=UNMAP)
                continue
            try:
                int(fields[1])
            except ValueError:
                print("Start coordinate is not an integer. skip " + line,
                      file=sys.stderr)
                if UNMAP is not None:
                    print(line + '\tInvalidStartPosition', file=UNMAP)
                continue
            try:
                int(fields[2])
            except ValueError:
                print("End coordinate is not an integer. skip " + line,
                      file=sys.stderr)
                if UNMAP is not None:
                    print(line + '\tInvalidEndPosition', file=UNMAP)
                continue
            if int(fields[1]) > int(fields[2]):
                print(
                    "\"Start\" coordinate is larger than \"End\" coordinate. skip "
                    + line,
                    file=sys.stderr)
                if UNMAP is not None:
                    print(line + '\tStart>End', file=UNMAP)
                continue

            # deal with bed less than 12 columns
            if len(fields) < 12:

                # try to reset strand: the last '+'/'-' column wins
                for f in fields:
                    if f in ['+', '-']:
                        strand = f

                chrom = fields[0]
                start = int(fields[1])
                end = int(fields[2])

                a = map_coordinates(mapping,
                                    chrom,
                                    start,
                                    end,
                                    strand,
                                    chrom_style=cstyle)

                try:
                    # 'a' holds (query, target) pairs, so a valid result has
                    # an even number of elements.
                    if (a is None) or (len(a) % 2 != 0):
                        if outfile is None:
                            print(line + '\tUnmap')
                        else:
                            print(line + '\tUnmap', file=UNMAP)
                        continue

                    if len(a) == 2:
                        #reset fields
                        fields[0] = a[1][0]
                        fields[1] = a[1][1]
                        fields[2] = a[1][2]
                        #update the strand information
                        for i in range(len(fields)):
                            if fields[i] in ['+', '-']:
                                fields[i] = a[1][3]

                        if outfile is None:
                            print(line + '\t->\t' +
                                  '\t'.join([str(i) for i in fields]))
                        else:
                            print('\t'.join([str(i) for i in fields]),
                                  file=FILE_OUT)
                    if len(a) > 2:
                        # one output line per target segment (odd indices)
                        for count, j in enumerate(range(1, len(a), 2), start=1):
                            fields[0] = a[j][0]
                            fields[1] = a[j][1]
                            fields[2] = a[j][2]
                            #update the strand information
                            for i in range(len(fields)):
                                if fields[i] in ['+', '-']:
                                    fields[i] = a[j][3]

                            if outfile is None:
                                print(line + '\t' + '(split.' + str(count) +
                                      ':' +
                                      ':'.join([str(i) for i in a[j - 1]]) +
                                      ')\t' +
                                      '\t'.join([str(i) for i in fields]))
                            else:
                                print('\t'.join([str(i) for i in fields]),
                                      file=FILE_OUT)
                except Exception:
                    if outfile is None:
                        print(line + '\tFail')
                    else:
                        print(line + '\tFail', file=UNMAP)
                    continue

            # deal with bed12 and bed12+8 (genePred format)
            if len(fields) == 12 or len(fields) == 20:
                strand = fields[5]
                if strand not in ['+', '-']:
                    raise Exception(
                        "Unknown strand: %s. Can only be '+' or '-'." % strand)
                fail_flag = False
                exons_old_pos = annoGene.getExonFromLine(
                    line)  #[[chr,st,end],[chr,st,end],...]
                exons_new_pos = []
                for e_chr, e_start, e_end in exons_old_pos:
                    # a has two elements, first is query, 2nd is target.
                    # [('chr1', 246974830, 246974833,'+'), ('chr1', 248908207, 248908210,'+')]
                    a = map_coordinates(mapping,
                                        e_chr,
                                        e_start,
                                        e_end,
                                        strand,
                                        chrom_style=cstyle)
                    # every exon must map to exactly one target interval
                    if a is None or len(a) != 2:
                        fail_flag = True
                        break
                    exons_new_pos.append(a[1])

                if not fail_flag:
                    # check if all exons were mapped to the same chromosome and the same strand
                    chr_id = {e[0] for e in exons_new_pos}
                    exon_strand = {e[3] for e in exons_new_pos}
                    if len(chr_id) != 1 or len(exon_strand) != 1:
                        fail_flag = True

                if not fail_flag:
                    # build new bed; the thick (CDS) boundaries keep their
                    # original distance from the feature ends
                    cds_start_offset = int(fields[6]) - int(fields[1])
                    cds_end_offset = int(fields[2]) - int(fields[7])
                    new_chrom = exons_new_pos[0][0]
                    new_chrom_st = exons_new_pos[0][1]
                    new_chrom_end = exons_new_pos[-1][2]
                    new_name = fields[3]
                    new_score = fields[4]
                    new_strand = exons_new_pos[0][3]
                    new_thickStart = new_chrom_st + cds_start_offset
                    new_thickEnd = new_chrom_end - cds_end_offset
                    new_ittemRgb = fields[8]
                    new_blockCount = len(exons_new_pos)
                    new_blockSizes = ','.join(
                        [str(o - n) for m, n, o, p in exons_new_pos])
                    new_blockStarts = ','.join([
                        str(n - new_chrom_st) for m, n, o, p in exons_new_pos
                    ])
                    new_bedline = '\t'.join(
                        str(i) for i in (new_chrom, new_chrom_st,
                                         new_chrom_end, new_name, new_score,
                                         new_strand, new_thickStart,
                                         new_thickEnd, new_ittemRgb,
                                         new_blockCount, new_blockSizes,
                                         new_blockStarts))
                    if check_bed12(new_bedline) is False:
                        fail_flag = True
                    else:
                        if outfile is None:
                            print(line + '\t->\t' + new_bedline)
                        else:
                            print(new_bedline, file=FILE_OUT)

                if fail_flag:
                    if outfile is None:
                        print(line + '\tFail')
                    else:
                        print(line, file=UNMAP)
    finally:
        # The output handles were previously never closed.
        if FILE_OUT is not None:
            FILE_OUT.close()
        if UNMAP is not None:
            UNMAP.close()
def crossmap_gff_file(mapping, ingff, outfile=None, cstyle='a'):
    '''
    Description
    -----------
    Convert genome coordinates (in GFF/GTF format) between assemblies.
    GFF (General Feature Format) lines have nine required fields that must
    be Tab-separated:

    1. seqname - The name of the sequence. Must be a chromosome or scaffold.
    2. source - The program that generated this feature.
    3. feature - The name of this type of feature. Some examples of standard
       feature types are "CDS", "start_codon", "stop_codon", and "exon".
    4. start - The starting position of the feature in the sequence. The
       first base is numbered 1.
    5. end - The ending position of the feature (inclusive).
    6. score - A score between 0 and 1000. If the track line useScore
       attribute is set to 1 for this annotation data set, the score value
       will determine the level of gray in which this feature is displayed
       (higher numbers = darker gray). If there is no score value, enter ".".
    7. strand - Valid entries include '+', '-', or '.' (for don't
       know/don't care).
    8. frame - If the feature is a coding exon, frame should be a number
       between 0-2 that represents the reading frame of the first base. If
       the feature is not a coding exon, the value should be '.'.
    9. group - All lines with the same group are linked together into a
       single item.

    GFF format: http://genome.ucsc.edu/FAQ/FAQformat.html#format3

    GTF (Gene Transfer Format) is a refinement to GFF that tightens the
    specification. The first eight GTF fields are the same as GFF. The
    group field has been expanded into a list of attributes. Each attribute
    consists of a type/value pair. Attributes must end in a semi-colon, and
    be separated from any following attribute by exactly one space.

    GTF format: http://genome.ucsc.edu/FAQ/FAQformat.html#format4

    We do NOT check if features (exon, CDS, etc) originally belonging to
    the same gene were converted into the same chromosome/strand.

    A feature is reported as mapped only when it maps to exactly one target
    interval of the same length; otherwise it is reported as failed.

    Parameters
    ----------
    mapping : dict
        Dictionary with source chrom name as key, IntervalTree object as
        value.

    ingff : file
        Input GFF/GTF file.

    outfile : str, optional
        Prefix of output files. When None, results are printed to screen.
        Unmapped lines go to ``outfile + '.<8 random chars>.unmap'``.

    cstyle : str, optional
        Chromosome ID style. Must be one of ['a', 's', 'l'], where
        'a' : as-is. The chromosome ID of the output file is in the same
            style of the input file.
        's' : short ID, such as "1", "2", "X.
        'l' : long ID, such as "chr1", "chr2", "chrX.
    '''
    FILE_OUT = None
    UNMAP = None
    try:
        if outfile is not None:
            rand_str = ''.join(
                random.choices(string.ascii_uppercase + string.digits, k=8))
            FILE_OUT = open(outfile, 'w')
            UNMAP = open(outfile + '.' + rand_str + '.unmap', 'w')

        for line in ireader.reader(ingff):
            if line.startswith(('#', 'track', 'browser', 'visibility')):
                continue
            if not line.strip():
                continue

            line = line.strip()
            fields = line.split('\t')
            try:
                start = int(fields[3]) - 1  #0-based
                # Kept as an int: the former "/ 1" made this a float under
                # Python 3 true division.
                end = int(fields[4])
                feature_size = end - start
            except (ValueError, IndexError):
                print('Cannot recognize \"start\" and \"end\" coordinates. Skip ' + line,
                      file=sys.stderr)
                if UNMAP is not None:
                    print(line, file=UNMAP)
                continue

            # A line too short to have a strand column is treated like an
            # unrecognised strand instead of raising IndexError.
            if len(fields) < 7 or fields[6] not in ['+', '-', '.']:
                print('Cannot recognize \"strand\". Skip ' + line,
                      file=sys.stderr)
                if UNMAP is not None:
                    print(line, file=UNMAP)
                continue

            # '.' (unknown strand) is lifted as '+'
            strand = '-' if fields[6] == '-' else '+'

            chrom = fields[0]
            a = map_coordinates(mapping,
                                chrom,
                                start,
                                end,
                                strand,
                                chrom_style=cstyle)

            if a is None:
                if outfile is None:
                    print(line + '\tfail (no match to target assembly)')
                else:
                    print(line, file=UNMAP)
                continue

            if len(a) != 2:
                if outfile is None:
                    print(line + '\tfail (multpile match to target assembly)')
                else:
                    print(line, file=UNMAP)
                continue

            # check if it is exact match
            if (int(a[1][2]) - int(a[1][1])) != feature_size:
                if outfile is None:
                    print(line + '\tfail (not exact match)')
                else:
                    print(line, file=UNMAP)
                # Without this the failed feature was also emitted as mapped.
                continue

            fields[0] = a[1][0]  # chrom
            fields[3] = int(a[1][1]) + 1  # start, 1-based
            fields[4] = int(a[1][2])
            fields[6] = a[1][3]

            if outfile is None:
                print(line + '\t->\t' + '\t'.join([str(i) for i in fields]))
            else:
                print('\t'.join([str(i) for i in fields]), file=FILE_OUT)
    finally:
        # The output handles were previously never closed.
        if FILE_OUT is not None:
            FILE_OUT.close()
        if UNMAP is not None:
            UNMAP.close()
def crossmap_maf_file(mapping, infile, outfile, liftoverfile, refgenome, ref_name):
    '''
    Convert genome coordinates in MAF (mutation annotation format) format.

    Parameters
    ----------
    mapping : dict
        Dictionary with source chrom name as key, IntervalTree object as
        value.

    infile : file
        Input file in MAF format. Can be a regular or compressed (*.gz,
        *.Z,*.z, *.bz, *.bz2, *.bzip2) file, local file or URL (http://,
        https://, ftp://) pointing to remote file.

    outfile : str
        prefix of output files. Unmapped records go to ``outfile + '.unmap'``.

    liftoverfile : file
        Chain (https://genome.ucsc.edu/goldenPath/help/chain.html) format
        file. Only its name is used here, to annotate the output header.

    refgenome : file
        The genome sequence file of 'target' assembly in FASTA format.

    ref_name : str
        The NCBI build name of the target assembly, for example, "GRCh37",
        "GRCh38". Written into column 4 of every data line.
    '''
    #index refegenome file if it hasn't been done
    if not os.path.exists(refgenome + '.fai'):
        logging.info("Creating index for: %s" % refgenome)
        pysam.faidx(refgenome)
    # Modification time (not ctime, which also changes on chmod/rename on
    # POSIX) decides whether the index is stale.
    if os.path.getmtime(refgenome + '.fai') < os.path.getmtime(refgenome):
        logging.info(
            "Index file is older than reference genome. Re-creating index for: %s"
            % refgenome)
        pysam.faidx(refgenome)

    refFasta = pysam.Fastafile(refgenome)

    total = 0
    fail = 0

    with open(outfile, 'w') as FILE_OUT, open(outfile + '.unmap', 'w') as UNMAP:
        for line in ireader.reader(infile):
            line = line.strip()
            if not line:
                continue

            #meta-information lines needed in both mapped and unmapped files
            if line.startswith('#'):
                print(line, file=FILE_OUT)
                print(line, file=UNMAP)
                continue
            elif line.startswith('Hugo_Symbol'):
                # column header: record the liftover provenance just above it
                print(
                    "#liftOver: Program=%sv%s, Time=%s, ChainFile=%s, NewRefGenome=%s"
                    % ("CrossMap", __version__,
                       datetime.date.today().strftime("%B%d,%Y"), liftoverfile,
                       refgenome),
                    file=FILE_OUT)
                print(line, file=FILE_OUT)
                print(line, file=UNMAP)
                logging.info("Lifting over ... ")
            else:
                fields = line.split('\t')
                total += 1

                fields[3] = ref_name
                chrom = fields[4]
                start = int(fields[5]) - 1  # 0 based
                end = int(fields[6])

                a = map_coordinates(mapping, chrom, start, end, '+')

                # unmapped or mapped to more than one target interval
                if a is None or len(a) != 2:
                    print(line, file=UNMAP)
                    fail += 1
                    continue

                #target_chr is from chain file, could be 'chr1' or '1'
                target_chr = str(a[1][0])
                target_start = a[1][1]
                target_end = a[1][2]

                # update chrom
                fields[4] = target_chr

                # update start coordinate
                fields[5] = target_start + 1

                # update end
                fields[6] = target_end

                # update ref allele
                try:
                    target_chr = update_chromID(refFasta.references[0],
                                                target_chr)
                    fields[10] = refFasta.fetch(target_chr, target_start,
                                                target_end).upper()
                except Exception:
                    print(line, file=UNMAP)
                    fail += 1
                    continue

                if a[1][3] == '-':
                    fields[10] = revcomp_DNA(fields[10], True)
                print('\t'.join(map(str, fields)), file=FILE_OUT)

    logging.info("Total entries: %d", total)
    logging.info("Failed to map: %d", fail)
def crossmap_gvcf_file(mapping, infile, outfile, liftoverfile, refgenome, noCompAllele = False, compress = False, cstyle = 'a'):
    '''
    Convert genome coordinates in GVCF format.

    Lines containing ``END=`` are treated as non-variant regions and lifted
    from POS to END; all other data lines are treated as variants and lifted
    over the span of the REF allele.

    Parameters
    ----------
    mapping : dict
        Dictionary with source chrom name as key, IntervalTree object as
        value.

    infile : file
        Input file in GVCF format. Can be a regular or compressed (*.gz,
        *.Z,*.z, *.bz, *.bz2, *.bzip2) file, local file or URL (http://,
        https://, ftp://) pointing to remote file.

    outfile : str
        prefix of output files. Unmapped records go to ``outfile + '.unmap'``.

    liftoverfile : file
        Chain (https://genome.ucsc.edu/goldenPath/help/chain.html) format
        file. Only its name is used here, to annotate the output header.

    refgenome : file
        The genome sequence file of 'target' assembly in FASTA format.

    noCompAllele : bool
        If True, ref_allele is NOT compared to alt_allele after liftover and
        every mapped variant is kept. If False, a variant whose lifted REF
        equals its ALT column is written to the unmap file.

    compress : bool
        If True, run ``gzip`` on the output file after writing it.

    cstyle : str, optional
        Chromosome ID style. Must be one of ['a', 's', 'l'], where
        'a' : as-is. The chromosome ID of the output file is in the same
            style of the input file.
        's' : short ID, such as "1", "2", "X.
        'l' : long ID, such as "chr1", "chr2", "chrX.
    '''
    if noCompAllele:
        logging.info("Keep variants [reference_allele == alternative_allele] ...")
    else:
        logging.info("Filter out variants [reference_allele == alternative_allele] ...")

    #index refegenome file if it hasn't been done
    if not os.path.exists(refgenome + '.fai'):
        logging.info("Creating index for: %s" % refgenome)
        pysam.faidx(refgenome)
    if os.path.getmtime(refgenome + '.fai') < os.path.getmtime(refgenome):
        logging.info("Index file is older than reference genome. Re-creating index for: %s" % refgenome)
        pysam.faidx(refgenome)

    refFasta = pysam.Fastafile(refgenome)

    # Meta-information lines copied verbatim to both output files.
    shared_meta = ('##fileformat', '##INFO', '##FILTER', '##FORMAT', '##ALT',
                   '##SAMPLE', '##PEDIGREE', '##GVCFBlock',
                   '##GATKCommandLine', '##source')
    end_tag = re.compile(r"END=(\d+)")

    total_var = 0
    failed_var = 0
    total_region = 0
    failed_region = 0
    # Style template for contig IDs in the output header. Defaults to the
    # short style so inputs without any ##contig line no longer hit an
    # unbound name. NOTE(review): confirm '1' is the desired default.
    chr_template = '1'

    with open(outfile, 'w') as FILE_OUT, open(outfile + '.unmap', 'w') as UNMAP:
        for line in ireader.reader(infile):
            line = line.strip()
            if not line:
                continue

            if line.startswith(shared_meta):
                print(line, file=FILE_OUT)
                print(line, file=UNMAP)

            #meta-information lines needed in unmapped files
            elif line.startswith('##assembly'):
                print(line, file=UNMAP)
            elif line.startswith('##contig'):
                print(line, file=UNMAP)
                if 'ID=chr' in line:
                    chr_template = 'chr1'
                else:
                    chr_template = '1'

            #update contig information
            elif line.startswith('#CHROM'):
                logging.info("Updating contig field ... ")
                target_gsize = dict(list(zip(refFasta.references, refFasta.lengths)))
                for chr_id in sorted(target_gsize):
                    # NOTE(review): only reference sequences whose name
                    # starts with 'chr' get a ##contig line — confirm that
                    # this filter is intended.
                    if chr_id.startswith('chr'):
                        print("##contig=<ID=%s,length=%d,assembly=%s>" % (update_chromID(chr_template, chr_id, cstyle), target_gsize[chr_id], os.path.basename(refgenome)), file=FILE_OUT)

                print("##liftOverProgram=<CrossMap,version=%s,website=https://sourceforge.net/projects/crossmap>" % __version__, file=FILE_OUT)
                print("##liftOverChainFile=<%s>" % liftoverfile, file=FILE_OUT)
                print("##originalFile=<%s>" % infile, file=FILE_OUT)
                print("##targetRefGenome=<%s>" % refgenome, file=FILE_OUT)
                print("##liftOverDate=<%s>" % datetime.date.today().strftime("%B%d,%Y"), file=FILE_OUT)
                print(line, file=FILE_OUT)
                print(line, file=UNMAP)
                logging.info("Lifting over ... ")

            else:
                # any other header line is dropped
                if line.startswith('#'):
                    continue

                # process non-variant region
                if 'END=' in line:
                    fields = line.split(maxsplit=8)
                    total_region += 1
                    chrom = fields[0]
                    start = int(fields[1]) - 1  # 0 based
                    m = end_tag.search(line)
                    if m is None:
                        # 'END=' present but not followed by digits
                        print(line + "\tFail(Unmap)", file=UNMAP)
                        failed_region += 1
                        continue
                    end = int(m.group(1))

                    a = map_coordinates(mapping, chrom, start, end, '+', chrom_style = cstyle)
                    if a is None:
                        print(line + "\tFail(Unmap)", file=UNMAP)
                        failed_region += 1
                        continue
                    if len(a) != 2:
                        # Previously such regions vanished from both outputs
                        # and were not counted as failures.
                        print(line + "\tFail(Multiple_hits)", file=UNMAP)
                        failed_region += 1
                        continue

                    #target_chr is from chain file, could be 'chr1' or '1'
                    target_chr = str(a[1][0])
                    target_start = a[1][1]
                    target_end = a[1][2]

                    # update chrom
                    fields[0] = target_chr

                    # update start coordinate
                    fields[1] = target_start + 1

                    # update END
                    fields[7] = fields[7].replace(('END=' + str(end)), ('END=' + str(target_end)))
                    print('\t'.join(map(str, fields)), file=FILE_OUT)

                # process variant line
                else:
                    fields = line.split(maxsplit=7)
                    total_var += 1
                    chrom = fields[0]
                    start = int(fields[1]) - 1  # 0 based, ref_allele start
                    end = start + len(fields[3])  # ref_allele end
                    # first ALT entry, e.g. 'A' from:
                    # 20 10000598 . T A,<NON_REF> 1754.77 . DP=54;
                    alt_allele = fields[4].replace(' ', '').split(',')[0]

                    a = map_coordinates(mapping, chrom, start, end, '+', chrom_style = cstyle)
                    if a is None:
                        print(line + "\tFail(Unmap)", file=UNMAP)
                        failed_var += 1
                        continue
                    if len(a) != 2:
                        print(line + "\tFail(Multiple_hits)", file=UNMAP)
                        failed_var += 1
                        continue

                    #target_chr is from chain file, could be 'chr1' or '1'
                    target_chr = str(a[1][0])
                    target_start = a[1][1]
                    target_end = a[1][2]

                    # update chrom
                    fields[0] = target_chr

                    # update start coordinate
                    fields[1] = target_start + 1

                    # update ref allele
                    try:
                        target_chr = update_chromID(refFasta.references[0], target_chr)
                        fields[3] = refFasta.fetch(target_chr, target_start, target_end).upper()
                    except Exception:
                        print(line + "\tFail(No_targetRef)", file=UNMAP)
                        failed_var += 1
                        # Without this the failed variant was also written
                        # to the mapped output with a stale REF allele.
                        continue

                    if a[1][3] == '-':
                        fields[4] = revcomp_DNA(alt_allele, True) + ',<NON_REF>'

                    # check if ref_allele is the same as alt_allele
                    # NOTE(review): fields[4] is the whole ALT column and may
                    # still carry ',<NON_REF>', in which case it can never
                    # equal REF — confirm whether the first ALT allele
                    # should be compared instead.
                    if noCompAllele or fields[3] != fields[4]:
                        print('\t'.join(map(str, fields)), file=FILE_OUT)
                    else:
                        print(line + "\tFail(REF==ALT)", file=UNMAP)
                        failed_var += 1

    logging.info("Total variants: %d" % total_var)
    logging.info("Variants failed to map: %d" % failed_var)
    logging.info("Total non-variant regions: %d" % total_region)
    logging.info("Non-variant regions failed to map: %d" % failed_region)

    if compress:
        logging.info("Compressing \"%s\" ..." % outfile)
        try:
            # Argument list instead of a shell string: file names containing
            # spaces or shell metacharacters are passed through untouched.
            subprocess.call(["gzip", outfile])
        except OSError as err:
            # Compression is best effort; report instead of failing the run.
            logging.warning("Failed to compress \"%s\": %s", outfile, err)
def crossmap_region_file(mapping, inbed, outfile=None, min_ratio=0.85, cstyle='a'):
    '''
    Convert large genomic regions (in bed format) between assemblies.
    BED format: http://genome.ucsc.edu/FAQ/FAQformat.html#format1

    A region that maps as one piece is reported with ``map_ratio=1.0000``.
    A region that maps in several pieces is reported as one interval
    spanning all target pieces, provided the mapped fraction of the query
    is at least *min_ratio* and all pieces land on one chromosome.

    Parameters
    ----------
    mapping : dict
        Dictionary with source chrom name as key, IntervalTree object as
        value.

    inbed : file
        Input BED file.

    outfile : str, optional
        Prefix of output files. When None, results are printed to screen.
        Failed regions go to ``outfile + '.unmap'``.

    min_ratio : float, optional
        Minimum ratio of query bases that must remap

    cstyle : str, optional
        Chromosome ID style. Must be one of ['a', 's', 'l'], where
        'a' : as-is. The chromosome ID of the output file is in the same
            style of the input file.
        's' : short ID, such as "1", "2", "X.
        'l' : long ID, such as "chr1", "chr2", "chrX.
    '''
    FILE_OUT = None
    UNMAP = None
    try:
        # check if 'outfile' was set. If not set, print to screen, if set, print to file
        if outfile is not None:
            FILE_OUT = open(outfile, 'w')
            UNMAP = open(outfile + '.unmap', 'w')

        for line in ireader.reader(inbed):
            if line.startswith(('#', 'track', 'browser')):
                continue
            if not line.strip():
                continue
            line = line.strip()
            fields = line.split()
            strand = '+'

            # filter out line less than 3 columns
            if len(fields) < 3:
                print("Less than 3 fields. skip " + line, file=sys.stderr)
                if UNMAP is not None:
                    print(line + '\tInvalidBedFormat', file=UNMAP)
                continue
            try:
                int(fields[1])
            except ValueError:
                print("Start coordinate is not an integer. skip " + line,
                      file=sys.stderr)
                if UNMAP is not None:
                    print(line + '\tInvalidStartPosition', file=UNMAP)
                continue
            try:
                int(fields[2])
            except ValueError:
                print("End coordinate is not an integer. skip " + line,
                      file=sys.stderr)
                if UNMAP is not None:
                    print(line + '\tInvalidEndPosition', file=UNMAP)
                continue
            if int(fields[1]) > int(fields[2]):
                print(
                    "\"Start\" coordinate is larger than \"End\" coordinate. skip "
                    + line,
                    file=sys.stderr)
                if UNMAP is not None:
                    print(line + '\tStart>End', file=UNMAP)
                continue

            # try to reset strand: the last '+'/'-' column wins
            for f in fields:
                if f in ['+', '-']:
                    strand = f

            chrom = fields[0]
            start = int(fields[1])
            end = int(fields[2])
            total_query_length = end - start  #used to calculate q_map_ratio

            a = map_coordinates(mapping,
                                chrom,
                                start,
                                end,
                                strand,
                                chrom_style=cstyle)
            # input: 'chr1',246974830,247024835
            # output: [('chr1', 246974830, 246974833, '+' ), ('chr1', 248908207, 248908210, '+' ), ('chr1', 247024833, 247024835, '+'), ('chr1', 249058210, 249058212,'+')]
            # i.e. alternating (query, target) segments

            if (a is None) or (len(a) % 2 != 0):
                if outfile is None:
                    print(line + '\tFail\tUnmap')
                else:
                    print(line + '\tFail\tUnmap', file=UNMAP)
                continue

            #when a == 2, there is one-to-one match (i.e. 100% match)
            if len(a) == 2:
                #reset fields to target assembly
                fields[0] = a[1][0]
                fields[1] = a[1][1]
                fields[2] = a[1][2]
                #update the strand information
                for i in range(len(fields)):
                    if fields[i] in ['+', '-']:
                        fields[i] = a[1][3]

                if outfile is None:
                    print(line + '\t->\t' +
                          '\t'.join([str(i) for i in fields]) +
                          "\tmap_ratio=1.0000")
                else:
                    print('\t'.join([str(i) for i in fields]) +
                          "\tmap_ratio=1.0000",
                          file=FILE_OUT)

            #when a is an even number but bigger than 2, each segment is 100% match,
            # but the whole region is not. In this case, check *min_ratio* of the query
            if len(a) > 2:
                a_query = a[::2]  #EVEN: [('chr1', 246974830, 246974833, '+'), ('chr1', 247024833, 247024835, '+')]
                a_query_mapped_nt = sum(i[2] - i[1] for i in a_query)  #sum([3,2])
                a_target = a[1::2]  #ODDS: [('chr1', 248908207, 248908210, '+'), ('chr1', 249058210, 249058212, '+')]
                a_target_chroms = {i[0] for i in a_target}
                a_target_starts = [i[1] for i in a_target]
                a_target_ends = [i[2] for i in a_target]
                map_ratio = a_query_mapped_nt / total_query_length

                #map_ratio > cutoff
                if map_ratio >= min_ratio:
                    if len(a_target_chroms) == 1:
                        # report the envelope of all target segments
                        t_chrom = a_target_chroms.pop()
                        fields[0] = t_chrom
                        fields[1] = min(a_target_starts)
                        fields[2] = max(a_target_ends)
                        if outfile is None:
                            print(line + '\t->\t' +
                                  '\t'.join([str(i) for i in fields]) +
                                  ("\tmap_ratio=%.4f" % map_ratio))
                        else:
                            print('\t'.join([str(i) for i in fields]) +
                                  ("\tmap_ratio=%.4f" % map_ratio),
                                  file=FILE_OUT)
                    else:
                        if outfile is None:
                            print(line + '\tFail\tCrossChroms')
                        else:
                            print(line + '\tFail\tCrossChroms', file=UNMAP)
                # map_ratio > 0 but < cutoff
                elif 0 < map_ratio < min_ratio:
                    if outfile is None:
                        print(line + '\tFail' +
                              ("\tmap_ratio=%.4f" % map_ratio))
                    else:
                        print(line + '\tFail' +
                              ("\tmap_ratio=%.4f" % map_ratio),
                              file=UNMAP)
    finally:
        # The output handles were previously never closed.
        if FILE_OUT is not None:
            FILE_OUT.close()
        if UNMAP is not None:
            UNMAP.close()
def crossmap_gvcf_file(mapping, infile, outfile, liftoverfile, refgenome):
    '''
    Convert genome coordinates in GVCF format.

    Mapped records are written to `outfile`; records that cannot be lifted
    over are written to `outfile` + '.unmap' with a trailing "Fail(...)"
    column. A summary of the counts is reported through printlog().

    Parameters
    ----------
    mapping : dict
        Dictionary with source chrom name as key, IntervalTree object as value.

    infile : file
        Input file in GVCF format. Can be a regular or compressed (*.gz, *.Z,*.z, *.bz,
        *.bz2, *.bzip2) file, local file or URL (http://, https://, ftp://) pointing to
        remote file.

    outfile : str
        Name of the output file. Unmapped records go to `outfile` + '.unmap'.

    liftoverfile : file
        Chain (https://genome.ucsc.edu/goldenPath/help/chain.html) format file. Can be a
        regular or compressed (*.gz, *.Z,*.z, *.bz, *.bz2, *.bzip2) file, local file or
        URL (http://, https://, ftp://) pointing to remote file. Only its name
        is used here (recorded in the output header).

    refgenome : file
        The genome sequence file of 'target' assembly in FASTA format. A
        '.fai' index is created next to it if it does not exist yet.
    '''
    # Meta-information lines copied verbatim into both the mapped and the
    # unmapped output.
    shared_meta_prefixes = (
        '##fileformat',
        '##INFO',
        '##FILTER',
        '##FORMAT',
        '##ALT',
        '##SAMPLE',
        '##PEDIGREE',
        '##GVCFBlock',
        '##GATKCommandLine',
        '##source',
    )

    #index refegenome file if it hasn't been done
    if not os.path.exists(refgenome + '.fai'):
        printlog(["Creating index for", refgenome])
        pysam.faidx(refgenome)

    refFasta = pysam.Fastafile(refgenome)

    total_var = 0
    failed_var = 0
    total_region = 0
    failed_region = 0
    withChr = False  # check if the VCF data lines use 'chr1' or '1'

    try:
        with open(outfile, 'w') as FILE_OUT, \
                open(outfile + '.unmap', 'w') as UNMAP:
            for line in ireader.reader(infile):
                line = line.strip()
                if not line:
                    continue

                #meta-information lines needed in both mapped and unmapped files
                if line.startswith(shared_meta_prefixes):
                    print(line, file=FILE_OUT)
                    print(line, file=UNMAP)
                    continue

                #meta-information lines needed in unmapped files
                if line.startswith('##assembly'):
                    print(line, file=UNMAP)
                    continue
                if line.startswith('##contig'):
                    print(line, file=UNMAP)
                    if 'ID=chr' in line:
                        withChr = True
                    continue

                #update contig information
                if line.startswith('#CHROM'):
                    printlog(["Updating contig field ... "])
                    target_gsize = dict(
                        zip(refFasta.references, refFasta.lengths))
                    assembly = os.path.basename(refgenome)
                    for chr_id in sorted(target_gsize):
                        # Write contig IDs in the same style ('chr1' vs '1')
                        # as the ##contig lines of the input.
                        if chr_id.startswith('chr'):
                            contig_id = chr_id if withChr else chr_id.replace('chr', '')
                        else:
                            contig_id = 'chr' + chr_id if withChr else chr_id
                        print("##contig=<ID=%s,length=%d,assembly=%s>" %
                              (contig_id, target_gsize[chr_id], assembly),
                              file=FILE_OUT)

                    print(
                        "##liftOverProgram=<CrossMap,version=%s,website=https://sourceforge.net/projects/crossmap>"
                        % __version__,
                        file=FILE_OUT)
                    print("##liftOverChainFile=<%s>" % liftoverfile, file=FILE_OUT)
                    print("##originalFile=<%s>" % infile, file=FILE_OUT)
                    print("##targetRefGenome=<%s>" % refgenome, file=FILE_OUT)
                    print("##liftOverDate=<%s>" %
                          datetime.date.today().strftime("%B%d,%Y"),
                          file=FILE_OUT)
                    print(line, file=FILE_OUT)
                    print(line, file=UNMAP)
                    printlog(["Lifting over ... "])
                    continue

                # any other header line is dropped
                if line.startswith('#'):
                    continue

                # process non-variant region
                if 'END=' in line:
                    fields = line.split(maxsplit=8)
                    total_region += 1
                    chrom = fields[0]
                    start = int(fields[1]) - 1  # 0 based

                    end_match = re.search(r"END\=(\d+)", line)
                    if end_match is None:
                        # 'END=' is present but is not followed by digits
                        print(line + "\tFail(Unmap)", file=UNMAP)
                        failed_region += 1
                        continue
                    end = int(end_match.group(1))

                    a = map_coordinates(mapping, chrom, start, end, '+')
                    if a is None:
                        print(line + "\tFail(Unmap)", file=UNMAP)
                        failed_region += 1
                        continue

                    if len(a) != 2:
                        # Region maps to several target segments: report it
                        # instead of dropping it silently.
                        print(line + "\tFail(Multiple_hits)", file=UNMAP)
                        failed_region += 1
                        continue

                    #target chrom is from chain file, could be 'chr1' or '1'
                    target_chr = str(a[1][0])
                    target_start = a[1][1]
                    target_end = a[1][2]

                    # update chrom
                    fields[0] = target_chr
                    # update start coordinate (back to 1 based)
                    fields[1] = target_start + 1
                    # update END
                    fields[7] = fields[7].replace(('END=' + str(end)),
                                                  ('END=' + str(target_end)))
                    print('\t'.join(map(str, fields)), file=FILE_OUT)
                    continue

                # process variant line
                fields = line.split(maxsplit=7)
                total_var += 1
                chrom = fields[0]
                start = int(fields[1]) - 1  # 0 based, ref_allele start
                end = start + len(fields[3])  # ref_allele end
                # first ALT allele, e.g. 'A' from:
                # 20 10000598 . T A,<NON_REF> 1754.77 . DP=54;
                alt_allele = fields[4].replace(' ', '').split(',')[0]

                a = map_coordinates(mapping, chrom, start, end, '+')
                if a is None:
                    print(line + "\tFail(Unmap)", file=UNMAP)
                    failed_var += 1
                    continue

                if len(a) != 2:
                    print(line + "\tFail(Multiple_hits)", file=UNMAP)
                    failed_var += 1
                    continue

                #target chrom is from chain file, could be 'chr1' or '1'
                target_chr = str(a[1][0])
                target_start = a[1][1]
                target_end = a[1][2]

                # update chrom
                fields[0] = target_chr
                # update start coordinate (back to 1 based)
                fields[1] = target_start + 1

                # update ref allele from the target genome; the chrom ID is
                # first converted to the naming style used by the FASTA file
                target_chr = update_chromID(refFasta.references[0], target_chr)
                fields[3] = refFasta.fetch(target_chr, target_start,
                                           target_end).upper()

                if a[1][3] == '-':
                    # The ALT column is rewritten on the '-' strand, so the
                    # REF/ALT comparison below must use the rewritten allele.
                    alt_allele = revcomp_DNA(alt_allele, True)
                    fields[4] = alt_allele + ',<NON_REF>'

                #ref_allele and alt_alele are different
                if fields[3] != alt_allele:
                    print('\t'.join(map(str, fields)), file=FILE_OUT)
                else:
                    print(line + "\tFail(REF==ALT)", file=UNMAP)
                    failed_var += 1
    finally:
        refFasta.close()

    printlog(["Total variants:", str(total_var)])
    printlog(["Variants failed to map:", str(failed_var)])
    printlog(["Total non-variant regions:", str(total_region)])
    printlog(["Non-variant regions failed to map:", str(failed_region)])
def _wig_records_to_bedgraph(records, mapping, bgr_path):
    '''Lift (chrom, start, end, score) records over and write the uniquely
    mapped ones to `bgr_path`; return the last source chrom ID seen
    ('chr1' when there were no records).'''
    chrom_style = 'chr1'
    with open(bgr_path, 'w') as bgr:
        for chrom, start, end, score in records:
            chrom_style = chrom
            maps = map_coordinates(mapping, chrom, start, end, '+')
            # keep one-to-one hits only: [query_segment, target_segment]
            if maps is None or len(maps) != 2:
                continue
            target = maps[1]
            print('\t'.join(
                str(i) for i in (target[0], target[1], target[2], score)),
                  file=bgr)
    return chrom_style


def _wig_merge_bedgraph(bgr_path, sorted_bgr_path):
    '''Write the entries produced by bgrMerge.merge(bgr_path) to
    `sorted_bgr_path`, then delete `bgr_path`.'''
    with open(sorted_bgr_path, 'w') as sorted_bgr:
        for (chrom, start, end, score) in bgrMerge.merge(bgr_path):
            print('\t'.join(str(i) for i in (chrom, start, end, score)),
                  file=sorted_bgr)
    os.remove(bgr_path)  #remove .bgr, keep .sorted.bgr


def crossmap_wig_file(mapping,
                      in_file,
                      out_prefix,
                      taget_chrom_size,
                      in_format,
                      binSize=100000):
    '''
    Description
    -----------
    Convert genome coordinates (in wiggle/bigwig format) between assemblies.
    wiggle format: http://genome.ucsc.edu/goldenPath/help/wiggle.html
    bigwig format: http://genome.ucsc.edu/goldenPath/help/bigWig.html

    Three files are involved: `out_prefix`.bgr (intermediate bedGraph,
    removed once merged), `out_prefix`.sorted.bgr (merged bedGraph) and
    `out_prefix`.bw (bigwig).

    Parameters
    ----------
    mapping : dict
        Dictionary with source chrom name as key, IntervalTree object as value.

    in_file : file
        Input file in wig or bigwig format. Both "variableStep" and "fixedStep" wiggle
        lines are supported.

    out_prefix : str
        Prefix of output files.

    taget_chrom_size : dict
        Chromosome size of the target genome assembly. Key is chromosome ID, value is the
        length of the chromosome. Note, the chromosome ID and length information were
        extracted from the chain file, therefore, the chrom_IDs can be with or without
        the leading "chr".

    in_format : str
        Either "wiggle" or "bigwig" (case-insensitive).

    binSize : int
        The chunk size when reading bigwig file in each iteration.
        NOTE(review): this value is not forwarded to bigwigReader() here.

    Raises
    ------
    Exception
        If `in_format` is neither "wiggle" nor "bigwig". The check is done
        before any output file is created.
    '''
    fmt = in_format.upper()
    if fmt not in ("WIGGLE", "BIGWIG"):
        raise Exception("Unknown foramt. Must be 'wiggle' or 'bigwig'")

    bgr_path = out_prefix + '.bgr'  # original bgr file
    sorted_bgr_path = out_prefix + '.sorted.bgr'  # sorted bgr file
    bw_path = out_prefix + '.bw'  # bigwig file

    if fmt == "WIGGLE":
        logging.info("Liftover wiggle file \"%s\" to bedGraph file \"%s\"" %
                     (in_file, bgr_path))
        # the strand reported by wiggleReader is not used
        records = ((chrom, start, end, score)
                   for chrom, start, end, _strand, score in wiggleReader(in_file))
    else:
        logging.info("Liftover bigwig file %s to bedGraph file %s:" %
                     (in_file, bgr_path))
        records = bigwigReader(in_file)

    chrom_style = _wig_records_to_bedgraph(records, mapping, bgr_path)

    logging.info("Merging overlapped entries in bedGraph file")
    _wig_merge_bedgraph(bgr_path, sorted_bgr_path)

    # make bigwig header; chrom IDs follow the style of the input chrom IDs
    target_chroms_sorted = [(update_chromID(chrom_style, k), taget_chrom_size[k])
                            for k in sorted(taget_chrom_size)]

    bigwig_out = pyBigWig.open(bw_path, "w")
    try:
        # add bigwig header
        logging.info("Writing header to \"%s\" ..." % bw_path)
        bigwig_out.addHeader(target_chroms_sorted)

        # add entries to bigwig file
        logging.info("Writing entries to \"%s\" ..." % bw_path)
        for line in ireader.reader(sorted_bgr_path):
            r_chr, r_st, r_end, r_value = line.split()
            bigwig_out.addEntries([r_chr], [int(r_st)],
                                  ends=[int(r_end)],
                                  values=[float(r_value)])
    finally:
        bigwig_out.close()