def try_find_build(rs, pos):
    """Print each SNP's position in several builds to help guess a source build.

    For every SNP returned by ``fetch_snps(rs)`` the hg38 position is lifted
    to hg19, and from hg19 to hg18 and hg17.  One line is printed per SNP; a
    ``*`` is appended to every build whose (0-based) position equals the
    0-based form of the corresponding entry of ``pos``.

    :param rs: SNP identifiers, passed unchanged to ``fetch_snps``.
    :param pos: 1-based source positions, paired positionally with the SNPs.
    """
    snps_info = fetch_snps(rs)
    # Expected shape of ``snps_info``, e.g.
    # [('rs3737728', 'GRCh38.p2', '1', '1086035'), ...]
    logging.info("Loading liftover chain files...")
    lift38_19 = LiftOver('pyliftover/hg38ToHg19.over.chain.gz')
    lift19_18 = LiftOver('pyliftover/hg19ToHg18.over.chain.gz')
    lift19_17 = LiftOver('pyliftover/hg19ToHg17.over.chain.gz')
    logging.info("Done")
    for (rsId, build, true_chr, pos_hg38), source_pos in zip(snps_info, pos):
        # NOTE: the build reported by fetch_snps is not checked; the position
        # is assumed to be hg38.
        try:
            source_pos -= 1
            chrom = 'chr{}'.format(true_chr)
            # pos_hg38 arrives as a string; compare its 0-based integer form,
            # otherwise the hg38 marker can never match.
            pos_hg38_0 = int(pos_hg38) - 1
            pos_hg19 = lift38_19.convert_coordinate(chrom, pos_hg38_0)[0][1]
            pos_hg18 = lift19_18.convert_coordinate(chrom, pos_hg19)[0][1]
            pos_hg17 = lift19_17.convert_coordinate(chrom, pos_hg19)[0][1]
            print(
                "build={} {} chr{} source={} hg38={}{} hg19={}{} hg18={}{} hg17={}{}"
                .format(build, rsId, true_chr, source_pos,
                        pos_hg38, '*' if pos_hg38_0 == source_pos else '',
                        pos_hg19, '*' if pos_hg19 == source_pos else '',
                        pos_hg18, '*' if pos_hg18 == source_pos else '',
                        pos_hg17, '*' if pos_hg17 == source_pos else ''))
        except Exception as exc:
            # Best effort: a SNP that cannot be lifted (None or empty result)
            # is skipped, but no longer silently.
            logging.warning("Skipping %s: %s", rsId, exc)
def _info_end_position(info):
    """Return the integer value of the ``END`` key of a VCF INFO string, or None."""
    for field in info.split(';'):
        key, sep, value = field.partition('=')
        # Match the key exactly so that e.g. "CIEND=" is not mistaken for END.
        if sep and key == 'END':
            try:
                return int(value)
            except ValueError:
                return None
    return None


def main(args):
    """Add hg19 liftover coordinates to the INFO field of every variant.

    Reads ``args['inputfile']``, lifts CHROM/POS (and INFO ``END`` when
    present) with the chain file ``args['chainfile']``, writes every variant
    to ``args['outputfile']`` and finally runs ``bgzip`` and ``tabix`` on it.

    :param args: mapping with the keys ``inputfile``, ``chainfile`` and
        ``outputfile``.
    """
    # open input vcf
    vcf = vcf_parser.Vcf(args['inputfile'])

    # add 3 new tag definitions - for hg19 liftover: chr, pos, and end
    hg19CHROM_definition = '##INFO=<ID=hg19_chr,Number=1,Type=String,Description="CHROM in hg19 using LiftOver from pyliftover">'
    hg19POS_definition = '##INFO=<ID=hg19_pos,Number=1,Type=Integer,Description="POS in hg19 using LiftOver from pyliftover (converted back to 1-based)">'
    hg19END_definition = '##INFO=<ID=hg19_end,Number=1,Type=Integer,Description="END in hg19 using LiftOver from pyliftover (converted back to 1-based)">'
    vcf.header.add_tag_definition(hg19END_definition)
    vcf.header.add_tag_definition(hg19POS_definition)
    vcf.header.add_tag_definition(hg19CHROM_definition)

    # get chain file for liftover
    lo = LiftOver(args['chainfile'])

    # write header, then loop over variants, adding liftover coordinates to
    # the INFO field when available. All variants are written.
    with open(args['outputfile'], 'w') as fo:
        vcf.write_header(fo)
        for vnt_obj in vcf.parse_variants():
            # pyliftover works on 0-based positions; convert back on output.
            hits = lo.convert_coordinate(vnt_obj.CHROM, vnt_obj.POS - 1)
            # ``hits`` is None when the chromosome is not in the chain file.
            if hits:
                vnt_obj.add_tag_info('hg19_chr=' + hits[0][0].split('chr')[1])
                vnt_obj.add_tag_info('hg19_pos=' + str(hits[0][1] + 1))

            # SVs and CNVs carry an END position in INFO; lift it as well.
            end = _info_end_position(vnt_obj.INFO)
            if end is not None:
                hits_end = lo.convert_coordinate(vnt_obj.CHROM, end - 1)
                if hits_end:
                    try:
                        # if hg19_chr is already defined, don't add it again
                        vnt_obj.get_tag_value("hg19_chr")
                    except Exception:
                        # get_tag_value signals a missing tag by raising;
                        # the exact exception type is not visible here.
                        vnt_obj.add_tag_info(
                            'hg19_chr=' + hits_end[0][0].split('chr')[1])
                    vnt_obj.add_tag_info('hg19_end=' + str(hits_end[0][1] + 1))

            vcf.write_variant(fo, vnt_obj)

    subprocess.run(["bgzip", args['outputfile']])
    subprocess.run(["tabix", args['outputfile'] + ".gz"])
def main():
    """Lift the intron coordinates of a TSV file and print the result.

    Command line: ``<introns.tsv> <chain file>``.  The first column of every
    data row holds ``chrom:start-end`` (1-based); both ends are lifted with
    the given chain file and the rewritten row is printed to stdout.  The
    header line is echoed unchanged.  Exits with status 1 on bad usage and 0
    on completion.
    """
    usage = "\n\n\tusage: {} cancer_introns.b38.annot_ready.tsv hg38ToHg19.over.chain.gz > cancer_introns.b37.annot_ready.tsv\n\n".format(
        sys.argv[0])
    if len(sys.argv) < 3:
        print(usage, file=sys.stderr)
        sys.exit(1)

    cancer_introns_file = sys.argv[1]
    hg_chain_file = sys.argv[2]

    # Use the chain file given on the command line (it used to be ignored in
    # favour of a hard-coded relative path).
    lo = LiftOver(hg_chain_file)

    with open(cancer_introns_file, 'rt') as fh:
        header = next(fh).rstrip()
        print(header)
        for line in fh:
            line = line.rstrip()
            vals = line.split("\t")
            chrom, coordset = vals[0].split(":")
            lend, rend = (int(coord) for coord in coordset.split("-"))

            # pyliftover is 0-based; results are shifted back to 1-based.
            new_lend = lo.convert_coordinate(chrom, lend - 1)
            new_rend = lo.convert_coordinate(chrom, rend - 1)

            # NOTE(review): rows whose ends do not both lift over are not
            # printed here - confirm this matches the intended output.
            if new_lend and new_rend:
                new_lend_chr = new_lend[0][0]
                new_lend_coord = new_lend[0][1] + 1
                new_rend_chr = new_rend[0][0]
                new_rend_coord = new_rend[0][1] + 1

                if new_lend_chr != new_rend_chr or new_lend_chr != chrom:
                    sys.stderr.write(
                        "-failed conversion of {}".format(line) +
                        " --> {} {}, {} {}\n".format(
                            new_lend_chr, new_lend_coord,
                            new_rend_chr, new_rend_coord))
                    continue

                # A strand flip can reverse the order of the two ends.
                if new_lend_coord > new_rend_coord:
                    new_lend_coord, new_rend_coord = (new_rend_coord,
                                                      new_lend_coord)

                vals[0] = "{}:{}-{}".format(chrom, new_lend_coord,
                                            new_rend_coord)
                print("\t".join(vals))

    sys.exit(0)
def setup(self):
    """Load CIViC variants page by page into ``self.civicdata``.

    Each variant's chromosome/start is lifted with the chain file registered
    under ``constants.liftover_chain_paths['hg19']`` and stored under the key
    ``"<chrom>:<pos>:<ref>:<alt>"``.  Variants lacking coordinates, alleles
    or a liftover hit are skipped.  If a page cannot be fetched the load
    stops early and an error is printed and logged.
    """
    self.civicdata = {}
    lifter = LiftOver(constants.liftover_chain_paths['hg19'])
    page_url = 'https://civicdb.org/api/variants?count=500&page=1'
    while page_url is not None:
        try:
            r = requests.get(page_url, timeout=5)
        except requests.exceptions.RequestException:
            # RequestException also covers read timeouts, which are not
            # ConnectionErrors and previously escaped this handler.
            msg = 'ERROR: Incomplete CIVIC data load'
            print(msg)
            self.logger.error(msg)
            break
        d = json.loads(r.text)
        records = d['records']
        page_url = d['_meta']['links']['next']
        for variant in records:
            coordinates = variant['coordinates']
            chrom_37 = coordinates['chromosome']
            pos_37 = coordinates['start']
            if chrom_37 is None or pos_37 is None:
                continue
            # NOTE(review): the position is passed to pyliftover without a
            # 0-based adjustment - confirm this is intended.
            new_coords = lifter.convert_coordinate("chr" + chrom_37,
                                                   int(pos_37))
            # None (unknown chromosome) and [] (no hit) are both skipped.
            if not new_coords:
                continue
            chrom_38 = new_coords[0][0].replace('chr', '')
            pos_38 = new_coords[0][1]
            toks = [chrom_38, pos_38,
                    coordinates['reference_bases'],
                    coordinates['variant_bases']]
            if None in toks:
                continue
            self.civicdata[':'.join(map(str, toks))] = variant
def setup(self):
    """Fetch CIViC variants in one request and index them by lifted position.

    ``self.civicdata`` is set to a dict keyed by
    ``"<chrom>:<pos>:<ref>:<alt>"`` where chrom/pos come from lifting the
    variant's chromosome/start with the chain file registered under
    ``constants.liftover_chain_paths['hg19']``.  Variants lacking
    coordinates, alleles or a liftover hit are skipped.
    """
    r = requests.get('https://civicdb.org/api/variants?count=5000&page=1')
    variants = json.loads(r.text)['records']
    lifter = LiftOver(constants.liftover_chain_paths['hg19'])
    vdict = {}
    for variant in variants:
        coordinates = variant['coordinates']
        chrom_37 = coordinates['chromosome']
        pos_37 = coordinates['start']
        if chrom_37 is None or pos_37 is None:
            continue
        # NOTE(review): the position is passed to pyliftover without a
        # 0-based adjustment - confirm this is intended.
        new_coords = lifter.convert_coordinate("chr" + chrom_37, int(pos_37))
        # convert_coordinate returns None for a chromosome missing from the
        # chain file; treat that like "no hit" instead of crashing on len().
        if not new_coords:
            continue
        chrom_38 = new_coords[0][0].replace('chr', '')
        pos_38 = new_coords[0][1]
        toks = [chrom_38, pos_38,
                coordinates['reference_bases'],
                coordinates['variant_bases']]
        if None in toks:
            continue
        vdict[':'.join(map(str, toks))] = variant
    self.civicdata = vdict
def lift_pos(posvec, chrvec, chainFile):
    """Lift 1-based SNP positions through a chain file.

    :param posvec: array of 1-based positions (must support ``posvec - 1``).
    :param chrvec: integer chromosome numbers, same length as ``posvec``.
    :param chainFile: path of the chain file handed to ``LiftOver``.
    :return: tuple ``(positions, status, chromosomes)``: 1-based int32
        positions, a bytes array holding ``miss``, ``multi``, ``unchanged``
        or ``lifted`` per SNP, and int32 chromosome numbers.  For ``miss``
        the original position and chromosome are kept; for ``multi`` the
        first hit is used.
    """
    logging.info("Lifting genomic positions...")
    nsnps = len(posvec)
    posvec = posvec - 1  # pyliftover works with 0-based coordinates
    pos_lifted = np.empty((nsnps, ), dtype='int32')
    chr_lifted = np.empty((nsnps, ), dtype='int32')
    pos_indi = np.empty((nsnps, ), dtype='|S10')
    lift = LiftOver(chainFile)
    for i in range(nsnps):
        if (i + 1) % 200000 == 0:
            logging.info("{} SNPs done".format(i + 1))
        pos = posvec[i]
        hits = lift.convert_coordinate('chr%d' % (chrvec[i], ), pos)

        new_chr = None
        if hits:
            try:
                new_chr = int(hits[0][0].replace('chr', ''))
            except ValueError:
                # Target such as chrX or an unplaced contig cannot be stored
                # in the integer chromosome array; report it as a miss
                # instead of aborting the whole run.
                new_chr = None

        if new_chr is None:
            pos_lifted[i] = pos
            chr_lifted[i] = chrvec[i]
            pos_indi[i] = 'miss'
            continue

        pos_lifted[i] = hits[0][1]
        chr_lifted[i] = new_chr
        if len(hits) > 1:
            pos_indi[i] = 'multi'
        elif pos == hits[0][1]:
            pos_indi[i] = 'unchanged'
        else:
            pos_indi[i] = 'lifted'
    return pos_lifted + 1, pos_indi, chr_lifted
class Converter:
    """Thin wrapper around a ``LiftOver('hg19', 'hg38')`` instance."""

    def __init__(self):
        ## lo = LiftOver("/opt/data/misc/hg38ToHg19.over.chain.gz")
        self.lo = LiftOver('hg19', 'hg38')

    def hg38(self, ch, pos):
        """Lift a 1-based position and return it 1-based.

        :param ch: chromosome, e.g. ``1``, ``'X'`` or ``'chr1'``.
        :param pos: 1-based position.
        :return: the lifted position (int) for a unique hit; a tuple
            ``(position_of_first_hit, all_hits)`` when several hits exist;
            ``None`` when nothing could be lifted or the lookup failed.
        """
        ch = str(ch)
        bare = ch[3:] if ch.lower().startswith('chr') else ch
        bare = bare.upper()
        if bare.isdigit() or bare == 'X' or bare == 'Y':
            # Also accepts already-prefixed names, which used to be
            # upper-cased to "CHR1" and then never matched the chain file.
            ch = "chr{}".format(bare)
        else:
            ch = ch.upper()
        try:
            coord = self.lo.convert_coordinate(ch, pos - 1)
        except Exception:
            print("WARNING: HG38 conversion at {}:{}".format(ch, pos))
            coord = None
        # Covers both None (unknown chromosome / failure) and [] (no hit).
        if not coord:
            return None
        r = coord[0][1] + 1
        if len(coord) == 1:
            return r
        return r, coord

    def close(self):
        """Do nothing; kept so callers can treat this like a resource."""
        return
def liftover_to_19(loc, build):
    """Lift a ``"chrom:pos"`` location using the chain registered for ``build``.

    :param loc: location string; the first two ``:``-separated fields are
        used as chromosome and position.
    :param build: key into ``chains`` selecting the chain file name.
    :return: position of the first liftover hit as int, or ``NaN`` when
        there is no hit.
    """
    fields = loc.split(':')
    chrom, pos = fields[0], fields[1]
    lo = LiftOver(os.path.join(chainpath, chains.get(build)))
    # pyliftover needs an integer position; the raw string slice of ``loc``
    # was previously passed through unchanged.
    # NOTE(review): no 0-/1-based adjustment is applied - confirm intent.
    con_pos = lo.convert_coordinate(chrom, int(pos))
    if con_pos:
        return int(con_pos[0][1])
    return NaN
class UniqueLiftover(object):
    def __init__(self, chainfile):
        """
        This object will perform unique single positional liftovers - it will only lift over chromosome positions that
        map uniquely to the new genome and if the strand hasn't changed.

        Note: You should run a VCF Normalization sweep on all lifted over CPRAs to check for variants that need to be
        re-normalized, and to remove variants where the REF now doesn't match after a liftover.

        The combination of these steps will ensure high quality liftovers. However, it should be noted that this won't
        prevent the situation where multiple positions in the old genome pile up uniquely in the new genome, so one
        needs to check for this.

        It's organised as an object rather than a collection of functions so that the LiftOver chainfile
        only gets opened/passed once and not for every position to be lifted over.

        :param chainfile: A string containing the path to the local UCSC .gzipped chainfile
        :return:
        """
        self.liftover = LiftOver(chainfile)

    def liftover_cpra(self, chromosome, position, verbose=False):
        """
        Given chromosome, position in 1-based co-ordinates, use pyliftover to lift the position over.

        :param chromosome: string with the chromosome as it's represented in the from_genome
        :param position: position on chromosome (will be cast to int)
        :param verbose: log the reason whenever no liftover is returned
        :return: ((str) chromosome, (int) position) in 1-based co-ordinates, or (None, None) if there is no unique,
                 strand-maintaining liftover
        """
        chromosome = str(chromosome)
        position = int(position)

        # Perform the liftover lookup, shift the position by 1 as pyliftover deals in 0-based co-ords
        new = self.liftover.convert_coordinate(chromosome, position - 1)

        if new is None:
            # new is None when the chromosome doesn't exist in the chainfile
            exception_string = "Chromosome '{}' provided not in chain file".format(chromosome)
        elif not new:
            # An empty list used to leave the message undefined and crashed verbose mode
            exception_string = "{},{} does not lift over to any position".format(chromosome, position)
        elif len(new) > 1:
            exception_string = "{},{} lifts over to multiple positions: {}".format(chromosome, position, new)
        elif new[0][2] != "+":
            exception_string = "{},{} has a flipped strand in liftover: {}".format(chromosome, position, new)
        else:
            # Unique hit on the same strand: shift the position forward by one to get back to 1-based co-ords
            return str(new[0][0]), int(new[0][1]) + 1

        if verbose:
            logging.error(exception_string)
        return None, None
def ancestral_fasta(args):
    """subroutine for ancestor subcommand

    Copies ``args.reference`` to ``args.output`` and edits the copy in place:
    regions outside the optional BED mask and variants that are not
    biallelic SNPs become ``N``; at biallelic SNPs the outgroup base found
    through the chain file is written when it equals the ALT allele, and
    ``N`` when it matches neither allele or the site does not lift uniquely.

    Raises ValueError when a variant's REF differs from the reference base.
    """
    # single chromosome fasta file for reference genome
    ref = pyfaidx.Fasta(args.reference, read_ahead=10000)
    # make a copy to build our ancestor for this chromosome
    copyfile(args.reference, args.output)
    anc = pyfaidx.Fasta(args.output, read_ahead=10000, mutable=True)
    # reference genome for outgroup species (all chromosomes)
    out = pyfaidx.Fasta(args.outgroup, read_ahead=10000)
    # outgroup to reference alignment chain file
    lo = LiftOver(args.chain)
    # snps database for the same chromosome
    vcf = cyvcf2.VCF(args.vcf)

    # change regions outside of callability mask to all N bases
    if args.bed:
        bed = sys.stdin if args.bed == '-' else open(args.bed, 'r')
        try:
            last_end = 0
            # NOTE(review): ``chrom`` is taken from the last BED line, so an
            # empty BED input fails below - confirm inputs are never empty.
            for line in bed:
                chrom, start, end = line.rstrip().split('\t')[:3]
                start = int(start)
                anc[chrom][last_end:start] = 'N' * (start - last_end)
                last_end = int(end)
            anc[chrom][last_end:len(anc[chrom])] = (
                'N' * (len(anc[chrom]) - last_end))
        finally:
            # Close the file we opened ourselves; leave stdin alone.
            if bed is not sys.stdin:
                bed.close()

    for variant in vcf:
        # change variants that are not biallelic SNPs to N bases
        if not (variant.is_snp and len(variant.ALT) == 1):
            anc[variant.CHROM][variant.start:variant.end] = 'N' * (
                variant.end - variant.start)
            continue

        out_coords = lo.convert_coordinate(variant.CHROM, variant.start)
        # change ambiguously aligning sites to N bases
        if out_coords is None or len(out_coords) != 1:
            anc[variant.CHROM][variant.start] = 'N'
            continue

        if variant.REF != ref[variant.CHROM][variant.start].seq.upper():
            raise ValueError(f'variant reference allele {variant.REF} '
                             f'mismatches reference sequence '
                             f'{ref[variant.CHROM][variant.start]}')
        out_chromosome, out_position, out_strand = out_coords[0][:3]
        out_allele = out[out_chromosome][out_position].seq
        # if negative strand, take reverse complement base
        if out_strand == '-':
            out_allele = reverse_complement(out_allele)
        # and finally, polarize
        if out_allele.upper() == variant.ALT[0]:
            anc[variant.CHROM][variant.start] = out_allele
        elif out_allele.upper() != variant.REF:
            # triallelic
            anc[variant.CHROM][variant.start] = 'N'
class UniqueLiftover(object):
    def __init__(self, chainfile):
        """
        This object will perform unique single positional liftovers - it will only lift over chromosome positions that
        map uniquely to the new genome and if the strand hasn't changed.

        Note: You should run a VCF Normalization sweep on all lifted over CPRAs to check for variants that need to be
        re-normalized, and to remove variants where the REF now doesn't match after a liftover.

        The combination of these steps will ensure high quality liftovers. However, it should be noted that this won't
        prevent the situation where multiple positions in the old genome pile up uniquely in the new genome, so one
        needs to check for this.

        It's organised as an object rather than a collection of functions so that the LiftOver chainfile
        only gets opened/passed once and not for every position to be lifted over.

        :param chainfile: A string containing the path to the local UCSC .gzipped chainfile
        :return:
        """
        self.liftover = LiftOver(chainfile)

    def liftover_cpra(self, chromosome, position, verbose=False):
        """
        Lift a 1-based chromosome position over with pyliftover.

        :param chromosome: string with the chromosome as it's represented in the from_genome
        :param position: position on chromosome (will be cast to int)
        :param verbose: log the reason whenever no liftover is returned
        :return: ((str) chromosome, (int) position) in 1-based co-ordinates, or (None, None) if there is no unique,
                 strand-maintaining liftover
        """
        chromosome = str(chromosome)
        position = int(position)

        # pyliftover deals in 0-based co-ords, hence the shift by one
        new = self.liftover.convert_coordinate(chromosome, position - 1)

        failure = None
        if new is None:
            # None means the chromosome doesn't exist in the chainfile
            failure = "Chromosome '{}' provided not in chain file".format(chromosome)
        elif len(new) == 0:
            # Previously no message was set for this case, which crashed verbose mode
            failure = "{},{} does not lift over to any position".format(chromosome, position)
        elif len(new) != 1:
            failure = "{},{} lifts over to multiple positions: {}".format(chromosome, position, new)
        elif new[0][2] != "+":
            failure = "{},{} has a flipped strand in liftover: {}".format(chromosome, position, new)

        if failure is None:
            # Shift the position forward by one to convert back to 1-based co-ords
            return str(new[0][0]), int(new[0][1]) + 1

        if verbose:
            logging.error(failure)
        return None, None
def PCGP_mut_df_genome_build_check(df, pos_col=4):
    """Make sure ``df`` carries a ``Position_hg19`` column.

    If exactly one column name contains ``hg18`` or ``hg38`` (case
    insensitive), every row's ``Chr`` and that column are lifted to hg19 and
    the result is stored in a new ``Position_hg19`` column (``0`` where the
    liftover finds nothing).  If no such column exists, the column at index
    ``pos_col`` is renamed to ``Position_hg19``.  ``df`` is modified in
    place and returned.

    :param df: mutation table; needs a ``Chr`` column when a liftover runs.
    :param pos_col: index of the position column to rename when no build
        specific column is present.
    :raises SystemExit: when more than one hg18/hg38 column is present.
    """
    col_check_hg18 = [col for col in df.columns if 'hg18' in col.lower()]
    col_check_hg38 = [col for col in df.columns if 'hg38' in col.lower()]
    flagged = col_check_hg18 + col_check_hg38

    if not flagged:
        # Nothing to convert: just label the position column.
        cols = list(df.columns)
        cols[pos_col] = 'Position_hg19'
        df.columns = cols
        return df

    if len(flagged) != 1:
        # The message used to reference a variable that is unbound on this
        # path, so the intended error was replaced by a crash.
        print("[Error] only one column allowed for conversion: %s ... quit" %
              flagged)
        raise SystemExit(1)

    fd = flagged[0]
    if col_check_hg18:
        print("[Warning] following columns from hg18 genome build: %s" % fd)
        lo = LiftOver('hg18', 'hg19')
    else:
        print("[Warning] following columns from hg38 genome build: %s" % fd)
        lo = LiftOver('hg38', 'hg19')

    print(fd)
    pos = []
    for idx, row in df.iterrows():
        # Lift once per row (the lookup used to be repeated for each hit).
        conversion = lo.convert_coordinate(row['Chr'], row[fd])
        pos.append(conversion[0][1] if conversion else 0)
    df['Position_hg19'] = pos
    return df
class liftover:
    """Lift positions between two genome builds using a UCSC chain."""

    def __init__(self, build_from, build_to):
        """Resolve both build names through ``map_release`` and load the chain.

        A name is accepted as-is when it is one of the values of
        ``map_release``; otherwise it is looked up as a key.  No chain is
        loaded (``self.chain`` is None) when both builds resolve to the same
        name.

        :raises Exception: when either build name is unknown.
        """
        self.build_from = self._resolve_build(build_from, 'SOURCE')
        # The DESTINATION message used to report the source build.
        self.build_to = self._resolve_build(build_to, 'DESTINATION')

        # Download/Source the Chain from UCSC
        if self.build_from != self.build_to:
            self.GetChain()
        else:
            self.chain = None

    @staticmethod
    def _resolve_build(build, role):
        """Return the canonical name of ``build`` or raise for unknown names."""
        if build in map_release.values():
            return build
        build_mapped = map_release.get(build)
        if build_mapped is None:
            raise Exception(
                'Unknown {} genome build. The value was: {}'.format(
                    role, build))
        return build_mapped

    def GetChain(self):
        '''Downloads the chain from UCSC
        '''
        self.chain_name = 'UCSC: {} to {}'.format(self.build_from,
                                                  self.build_to)
        self.chain = LiftOver(self.build_from, self.build_to)

    def lift(self, chr, pos):
        """Lift one position.

        :param chr: chromosome without the ``chr`` prefix.
        :param pos: position (cast to int).
        :return: ``(chromosome, position, multiple)`` using the first hit,
            where ``multiple`` tells whether more than one hit existed, or
            ``(None, None, None)`` when nothing could be lifted.  When source
            and destination builds are identical the input is returned
            unchanged as ``(str(chr), int(pos), False)``.
        """
        if self.chain is None:
            # Same build on both sides: no chain was loaded, nothing to do.
            return str(chr), int(pos), False
        lifted = self.chain.convert_coordinate(
            'chr{}'.format(str(chr)), int(pos)
        )  # ToDo figure out whether this step should be adjusted for 0/1 indexing?
        if not lifted:
            return None, None, None
        # Strip the leading "chr"; flag whether the first hit was not unique.
        return lifted[0][0][3:], int(lifted[0][1]), len(lifted) > 1
class CravatAnnotator(BaseAnnotator):
    """Annotator that reports the hg19 position of an hg38 input variant."""

    def setup(self):
        """Load the hg38-to-hg19 chain file shipped in the ``data`` directory."""
        chain_path = os.path.join(os.path.dirname(os.path.realpath(__file__)),
                                  'data',
                                  'hg38ToHg19.over.chain')
        self.liftover = LiftOver(chain_path)

    def annotate(self, input_data, secondary_data=None):
        """Return ``{'chrom': ..., 'pos': ...}`` for the first liftover hit.

        ``input_data['pos']`` is treated as 1-based and the returned ``pos``
        is 1-based as well.  An empty dict is returned when the position
        cannot be lifted.
        """
        out = {}
        hg19_data = self.liftover.convert_coordinate(
            input_data['chrom'], int(input_data['pos']) - 1)
        # convert_coordinate yields None for a chromosome absent from the
        # chain file, so test truthiness instead of len().
        if hg19_data:
            out['chrom'] = hg19_data[0][0]
            out['pos'] = hg19_data[0][1] + 1
        return out
def liftover(self):
    """Replace ``self.chromosome``/``self.position`` with their lifted values.

    The coordinates are lifted from hg38 to ``self.build`` and the first hit
    is stored back on the instance.
    """
    # todo
    # The failure mode of this tool is unclear; a try/except will probably be
    # needed eventually. Changing chromosome and position also invalidates
    # the key. That could be fixed, but the ref and alt alleles are not at
    # hand here and parsing them out of chromosomeHgvsName is undesirable.
    from pyliftover import LiftOver

    converter = LiftOver('hg38', self.build)
    hits = converter.convert_coordinate(self.chromosome, self.position)
    first_hit = hits[0]
    self.chromosome = first_hit[0]
    self.position = first_hit[1]
def from_hg18_to_hg19(chr, coord):
    """
    Convert a coordinate from hg18 to hg19.

    LiftOver coordinates are 0-based; this function adds 1 to ``coord``
    before the lookup and returns the position of the first hit as given by
    LiftOver.

    :param chr: chromosome name, e.g. 'chr6'
    :param coord: integer, e.g. 10000
    :return: position of the first hg19 hit
    """
    converter = LiftOver('hg18', 'hg19')
    shifted = int(coord) + 1
    hits = converter.convert_coordinate(chr, shifted)
    first_hit = hits[0]
    return first_hit[1]
def liftover_pickering(df):
    """Lift ``Start_position``/``End_position`` of ``df`` from hg19 to hg38.

    Positions are treated as 1-based on input and output.  A position that
    cannot be lifted is replaced by the string ``'NA'`` and reported on
    stdout.  ``df`` is modified in place and returned.
    """
    lo = LiftOver('hg19', 'hg38')

    chroms = ('chr' + df['Chromosome'].astype(str)).tolist()
    startpos = df['Start_position'].tolist()
    endpos = df['End_position'].tolist()

    def lift_one(chrom, pos, start, end):
        """Lift a single 1-based position; 'NA' when there is no hit."""
        hits = lo.convert_coordinate(chrom, pos - 1)
        # None (chromosome not in the chain) is handled like an empty list;
        # len(None) used to raise here.
        if not hits:
            print(f"Didn't find hg38 coordinate for {chrom}:{start}-{end}")
            return 'NA'
        return hits[0][1] + 1

    new_startpos = []
    new_endpos = []
    for chrom, start, end in zip(chroms, startpos, endpos):
        new_startpos.append(lift_one(chrom, start, start, end))
        new_endpos.append(lift_one(chrom, end, start, end))

    df['Start_position'] = new_startpos
    df['End_position'] = new_endpos
    return df
def pyliftover(hg38_chrom, hg38_coord):
    """Return the cached hg19 location of an hg38 coordinate.

    Results are memoised in ``pyliftover_dict`` under ``"chrom:coord"``.

    :param hg38_chrom: chromosome name as used in the chain file.
    :param hg38_coord: position (cast to int for the lookup).
    :return: dict with the keys ``chrom`` and ``coord`` (coord as str),
        taken from the first liftover hit.
    :raises KeyError: when the coordinate cannot be lifted.
    """
    hg38_key = '%s:%s' % (hg38_chrom, hg38_coord)
    if hg38_key not in pyliftover_dict:
        # Load the chain file once and reuse it; it used to be re-read on
        # every cache miss.
        lo = getattr(pyliftover, '_lifter', None)
        if lo is None:
            lo = LiftOver(config.input_dir + 'hg38ToHg19.over.chain.gz')
            pyliftover._lifter = lo
        result = lo.convert_coordinate(hg38_chrom, int(hg38_coord))
        # Both None (unknown chromosome) and [] (no hit) mean "not liftable";
        # the final lookup then raises KeyError for either case.
        if result:
            coords_list = result[0]
            pyliftover_dict[hg38_key] = {
                'chrom': coords_list[0],
                'coord': str(coords_list[1])
            }
    return pyliftover_dict[hg38_key]
def liftover(pos, chro, from_assembly, to_assembly):
    """
    Lift a single coordinate from one assembly to another with pyliftover.

    When both assemblies are equal, ``pos`` is handed back untouched.
    Otherwise the position of the first liftover hit is returned.

    NOTE: pyLiftover uses base 0, whereas coordinate system uses base 1
    therefore position 27107251 is actually 27107250 in pyLiftover
    """
    if from_assembly != to_assembly:
        chrom_name = 'chr' + str(chro)
        position = int(pos)
        converter = LiftOver(from_assembly, to_assembly)
        hits = converter.convert_coordinate(chrom_name, position)
        return hits[0][1]
    return pos
def liftover(self, chromosome, position, build='hg19'):
    """Lift an hg38 location to ``build`` and return the first hit.

    :return: tuple ``(new_chromosome, new_position)``.
    """
    # todo
    # The failure mode of this tool is unclear; a try/except will probably be
    # needed eventually. Changing chromosome and position also invalidates
    # the key. That could be fixed, but the ref and alt alleles are not at
    # hand here and parsing them out of chromosomeHgvsName is undesirable.
    converter = LiftOver('hg38', build)
    hits = converter.convert_coordinate(chromosome, position)
    new_chromosome = hits[0][0]
    new_position = hits[0][1]
    if self.debug:
        print("%s %s -> %s %s" %
              (chromosome, position, new_chromosome, new_position))
    return new_chromosome, new_position
def main(coords, orig_assembly, new_assembly, chainfile, outfh):
    """Lift every ``chrom:pos`` string in ``coords`` and print the results.

    For each coordinate the first liftover hit is appended to the original
    ``(chrom, pos)`` pair; all rows are passed to ``print_results`` together
    with ``outfh``.  On any failure the offending coordinate is written to
    stderr and the error is re-raised.  ``chainfile`` is not used here.
    """
    # Create a LiftOver object with desired mapping.
    converter = LiftOver(orig_assembly, new_assembly)
    results = []
    for coord in coords:
        try:
            chrom, pos = coord.split(':')
            # pyliftover needs the position as an int, not a str.
            hits = converter.convert_coordinate(chrom, int(pos))
            first_hit = hits[0]
            results.append((chrom, pos) + first_hit)
        except BaseException:
            # Unclear which errors can occur. A deleted locus probably gives
            # None as a result (which should be handled eventually); beyond
            # that the possible failures are unknown.
            sys.stderr.write('Offending coord: %s' % coord)
            raise
    print_results(results, outfh)
def main(coords, orig_assembly, new_assembly, chainfile, outfh):
    """Convert ``chrom:pos`` coordinates between assemblies and print them.

    Each output row is the input ``(chrom, pos)`` extended with the first
    liftover hit; the rows are handed to ``print_results`` with ``outfh``.
    A failing coordinate is named on stderr before the error propagates.
    ``chainfile`` is not used here.
    """
    # Create a LiftOver object with desired mapping.
    lifter = LiftOver(orig_assembly, new_assembly)
    rows = []
    for coord in coords:
        try:
            chrom, pos = coord.split(':')
            # The position must be an int rather than a str for pyliftover.
            best = lifter.convert_coordinate(chrom, int(pos))[0]
            row = (chrom, pos)
            rows.append(row + best)
        except BaseException:
            # The possible errors are not well known. A deleted locus likely
            # yields None (which should be handled at some point).
            sys.stderr.write('Offending coord: %s' % coord)
            raise
    print_results(rows, outfh)
Usage: lift_over.py <from-build> <to-build> stdin line format: chrom bp_in_from_build stdout line format: bp_in_to_build, or '-' if not found Created on February 19, 2014 @author: Oren Livne <*****@*****.**> ============================================================ ''' import sys, traceback, util from pyliftover import LiftOver if __name__ == '__main__': try: src, target = sys.argv[1:3] if src == target: for _, bp in (line.strip().split(' ') for line in sys.stdin): print '%d %d' % (int(bp), int(bp)) else: lo = LiftOver(src, target) for chrom, bp in (line.strip().split(' ') for line in sys.stdin): out = lo.convert_coordinate('chr' + chrom, int(bp)) if not out: print '-' else: print '%d' % (out[0][1],) except: traceback.print_exc(file=sys.stdout) sys.exit(util.EXIT_FAILURE)
# Add hg19 coordinates ("hg19chr"/"hg19pos") to every document of the
# ``fasttrack.gwas`` collection by lifting its hg38 ``chr_id``/``chr_pos``.
mongo_client = MongoClient()
db = mongo_client.fasttrack
lo = LiftOver("hg38ToHg19.over.chain.gz")

unmatched = 0
matched = 0
for r in db.gwas.find():
    chrid = r["chr_id"]
    chrpos = r["chr_pos"]
    if not (chrid and chrpos):
        continue
    try:
        _chrpos = int(chrpos)
    except (TypeError, ValueError):
        # Positions that are not plain integers are skipped.
        continue

    # pyliftover is 0-based: shift down for the lookup ...
    lifted = lo.convert_coordinate("chr%s" % chrid, _chrpos - 1)
    if lifted:
        new_chrid = lifted[0][0].split("chr")[1]
        # ... and back up, so hg19pos uses the same 1-based convention as
        # chr_pos (the stored value used to be one too small).
        new_chrpos = lifted[0][1] + 1
        matched += 1
        db.gwas.update_many(
            {"chr_id": chrid, "chr_pos": chrpos},
            {"$set": {"hg19chr": new_chrid, "hg19pos": new_chrpos}}
        )
    else:
        unmatched += 1

print(unmatched, matched)
def read_gwas(args, filename, report=None):
    """Parse a GWAS summary-statistics file and yield one entry per usable row.

    The first line is treated as the header: column positions are resolved
    from ``args`` or auto-detected, the genome build is determined, and a
    liftover to ``args['gen:build']`` is set up when the builds differ.
    Every later line yields ``((chromosome, position), GWASRow)`` with the
    chromosome zero-padded to two characters.  Rows with a wrong column
    count, non-numeric beta/frequency or a failed liftover are skipped and,
    when ``report`` is given, logged through ``log_error``.  A summary is
    printed once the file has been read.

    :param args: option mapping (``gwas:*``, ``gen:build``, ``header_only``).
    :param filename: path passed to ``fopen``.
    :param report: optional error report passed to ``log_error``.
    """
    liftover = None
    wrong_column_count = float_conv_failed = yes = no = 0
    desc = {}
    default_p, default_std = args['gwas:default:p'], args['gwas:default:se']
    default_n, default_chr = args['gwas:default:n'], args['gwas:default:chr']
    default_beta = args['gwas:default:beta']

    def select(name, options, fail=True):
        """Return the header index of column ``name``, or None if optional."""
        option_name = 'gwas:' + name
        if not args[option_name] is None:
            desc[name] = args[option_name]
        if name in desc:
            try:
                return header.index(desc[name])
            except ValueError:
                # list.index signals a missing item with ValueError; the
                # previous IndexError handler could never run.
                print('Specified header (--gwas:' + name, args[option_name] + ') not found.')
                exit(1)
        for option in options:
            header_upper = list(map(str.upper, header))
            if option.upper() in header_upper:
                desc[name] = option
                return header_upper.index(option.upper())
        if fail and not args.get('gwas:default:' + name):
            print('Could not find a header in GWAS for', name)
            print(' specify with --' + option_name)
            print('suggestions:')
            for part in header:
                print(' * --' + option_name, part)
            exit(1)

    try:
        with fopen(filename) as f:
            for lineno, line in enumerate(f, 1):
                if lineno == 1:
                    if not args['gwas:build'] is None:
                        desc['build'] = args['gwas:build']
                    elif any(hint in line for hint in GWAS_HG19_HINTS):
                        desc['build'] = 'hg19'
                    elif any(hint in line for hint in GWAS_HG18_HINTS):
                        desc['build'] = 'hg18'
                    if '\0' in line:
                        # iibdgc-trans-ancestry-filtered-summary-stats.tgz contains zero byte garbage
                        garbage_end = line.rindex('\0')
                        line = line[garbage_end + 1:]
                    if args['gwas:header:remove']:
                        line = line.replace(args['gwas:header:remove'], '')
                    header = line.split(args['gwas:sep'])
                    hpos = select('chr_bp', GWAS_H_CHR_AND_BP_COMB_OPTIONS, fail=False)
                    if hpos is None:
                        postype_combined = False
                        hpos_ch = select('chr', GWAS_H_CHR_OPTIONS)
                        hpos_bp = select('bp', GWAS_H_BP_OPTIONS)
                    else:
                        postype_combined = True
                    href = select('effect', GWAS_H_EFF_OPTIONS)
                    hoth = select('other', GWAS_H_OTH_OPTIONS)
                    hfreq = select('freq', GWAS_H_FREQ_OPTIONS)
                    hse = select('se', GWAS_H_SE_OPTIONS)
                    hp = select('p', GWAS_H_PVALUE_OPTIONS)
                    if args['gwas:beta'] is not None:
                        hb = select('beta', GWAS_H_BETA_OPTIONS)
                    elif args['gwas:or'] is not None:
                        hb = None
                        hor = select('or', [])
                    else:
                        hb = select('beta', GWAS_H_BETA_OPTIONS)  # select default or fail

                    if not args['gwas:n'] is None:
                        hn = [header.index(col) for col in args['gwas:n'].split(',')]
                        desc['n'] = '+'.join(args['gwas:n'].split(','))
                    elif any(col in header for col in GWAS_H_NTOTAL_OPTIONS):
                        ncol = next(col_ for col_ in GWAS_H_NTOTAL_OPTIONS if col_ in header)
                        desc['n'] = ncol
                        hn = [header.index(ncol)]
                    elif (any(col in header for col in GWAS_H_NCASE_OPTIONS) and
                          any(col in header for col in GWAS_H_NCONTROL_OPTIONS)):
                        ncol_a = next(col_ for col_ in GWAS_H_NCASE_OPTIONS if col_ in header)
                        ncol_b = next(col_ for col_ in GWAS_H_NCONTROL_OPTIONS if col_ in header)
                        desc['n'] = ncol_a + '+' + ncol_b
                        hn = [header.index(ncol_a), header.index(ncol_b)]
                    elif not args['gwas:default:n']:
                        print(
                            'Could not find a header in GWAS for the number of samples, or the number of cases and controls.'
                        )
                        exit(1)
                    else:
                        hn = None

                    if 'build' not in desc:
                        print(
                            'Could not determine GWAS genome build; use flag --gwas:build <BUILD>.'
                        )
                        exit(1)
                    if desc['build'] != args['gen:build']:
                        liftover = LiftOver(desc['build'], args['gen:build'])
                        print('converting', desc['build'], '->', args['gen:build'])
                    print('= Detected headers =')
                    for k, v in args.items():
                        if k.startswith('gwas:default') and v:
                            # k[13:] strips the "gwas:default:" prefix
                            desc[k[13:]] = 'DEFAULT ' + v
                    for k, v in desc.items():
                        print(k.ljust(10), v)
                    if args['header_only']:
                        exit(0)
                    print('= Converting =')
                    reporter = ReporterLine('Reading gwas data.')
                    continue

                parts = line.split(args['gwas:sep'])
                if len(parts) != len(header):
                    # MDD switches halfway to a different format for a small number of non-significant SNPs
                    if report:
                        log_error(report, 'wrong_column_count', gwas=parts)
                    wrong_column_count += 1
                    continue

                if postype_combined:
                    ch, bp, *_ = parts[hpos].split(':', 2)  # Some append :<SNP>/:<INDEL>, just ignore
                    if default_chr:
                        print('Default chromosome specified but reading chr:bp column.')
                        exit(1)
                else:
                    ch = default_chr or parts[hpos_ch]
                    bp = parts[hpos_bp]

                try:
                    if default_n:
                        n = default_n
                    else:
                        n = sum(int(float(parts[col]) + 0.5) for col in hn)
                        # some GWASs default to n=-9, which is then picked up
                        # by the header autodetector as valid data..
                        if n < 0:
                            print('Negative N!!!')
                            exit(1)
                except ValueError:
                    n = 'NA'

                gwas_freq = parts[hfreq]
                # Reset per row so that a failed conversion below neither
                # reports the previous row's beta nor hits an unbound name on
                # the first data row.
                gwas_beta = None
                try:
                    if default_beta:
                        gwas_beta = default_beta
                    elif hb is None:
                        or_ = float(parts[hor])
                        if or_ < 0:
                            print('negative ODDS ratio. is this a beta?')
                            exit(1)
                        gwas_beta = math.log(or_)
                    else:
                        gwas_beta = float(parts[hb])
                    gwas_freq = float(gwas_freq)
                except ValueError:
                    row = GWASRow(parts[href].upper(), parts[hoth].upper(),
                                  gwas_freq, gwas_beta,
                                  default_std or parts[hse],
                                  default_p or parts[hp],
                                  lineno, ch, bp, n)
                    if report:
                        log_error(report, 'gwas_float_conv_failed', gwas=row)
                    float_conv_failed += 1
                    continue

                # The row keeps the chromosome/position as read from the file.
                row = GWASRow(parts[href].upper(), parts[hoth].upper(),
                              gwas_freq, gwas_beta,
                              default_std or parts[hse],
                              default_p or parts[hp],
                              lineno, ch, bp, n)

                ch = ch.upper()
                if ch.startswith('CHR'):
                    ch = ch[3:]
                ch = ch.lstrip('0')
                ch = conv_chr_letter(ch)

                if liftover:
                    conv = liftover.convert_coordinate('chr' + ch, int(bp))
                    if conv:
                        ch, bp, s19, _ = conv[0]
                        bp = str(bp)
                        if ch.startswith('chr'):
                            ch = ch[3:]
                        yes += 1
                    else:
                        no += 1
                        if report:
                            log_error(report, 'gwas_build_conv_failed', gwas=row)
                        continue

                ch = ch.zfill(2)
                yield (ch, bp), row

                if lineno % 40000 == 0:
                    reporter.update(lineno, f.fileno())
    except KeyboardInterrupt:
        print('Aborted reading gwas data at line', lineno)
    except UnicodeDecodeError:
        # IBD turns into gibberish after 95%, we can probably discard that
        print('UnicodeDecodeError, aborted reading gwas data at line', lineno)

    if liftover:
        print('Successfully', desc['build'], '->', args['gen:build'], 'converted', yes, 'rows')
        print('Build conversion failed for', no, 'rows (reported as gwas_build_conv_failed).')
    if float_conv_failed:
        print('Numeric conversion failed for', float_conv_failed, 'rows (reported as gwas_float_conv_failed).')
    if wrong_column_count:
        print('Invalid number of columns for', wrong_column_count, 'rows (reported as wrong_column_count).')
    print()
    print()
else: # Sometimes it includes multipe dbSNP/rsID combinations, so always # take the more recent rsID (farmost right). rsid = cols[7].split(":")[-1] phase = [x[0] for x in cols[8:]] allele1 = [x[1] for x in cols[8:]] allele2 = [x[2] for x in cols[8:]] # Only report SNPs if type != "snp": continue log_snps += 1 # Convert hg19 coordinate to hg38 using pyliftover # CGI is 0-based, so feed the start position to pyliftover lo_result = lo.convert_coordinate("chr" + chr, int(start)) # Unable to convert from hg19 to hg38 if len(lo_result) == 0: log_nonconvert += 1 sys.stderr.write("No conversion\t%s\t%s\t%s\n" % (chr, start, rsid)) continue # Multiple coordinates on hg38 if len(lo_result) > 1: log_multi += 1 sys.stderr.write("Multiple coordinates\t%s\t%s\t%s\n" % (chr, start, rsid)) continue lo_chr, lo_pos = lo_result[0][:2]
def _classify_build(build):
    """Return (chr_key, alt_chr_key, ucsc_name, alt_build_name) for a build string.

    ``chr_key``/``alt_chr_key`` are the keys used to read the chromosome from
    the dictionaries returned by ``hgvs_utils.report_hgvs2vcf``.  Unrecognised
    builds yield empty strings for the two names.
    """
    if 'GRC' in build:
        if '37' in build:
            name = 'hg19'
        elif '38' in build:
            name = 'hg38'
        else:
            name = ''
        return 'grc_chr', 'ucsc_chr', name, name
    if '19' in build:
        return 'ucsc_chr', 'grc_chr', 'hg19', 'GRCh37'
    if '38' in build:
        return 'ucsc_chr', 'grc_chr', 'hg38', 'GRCh38'
    return 'ucsc_chr', 'grc_chr', '', ''


def _vcf_entry(hgvs_variant, vcf, chr_key):
    """Build one response record (HGVS description plus VCF fields)."""
    return {
        'hgvs_genomic_description': mystr(hgvs_variant),
        'vcf': {
            'chr': vcf[chr_key],
            'pos': str(vcf['pos']),
            'ref': vcf['ref'],
            'alt': vcf['alt']
        }
    }


def _chr_num_for(accession, build_to, build_from):
    """Look the accession up in seq_data for both builds; None if unknown.

    The checks run in the same order as before, so a build_from lookup
    overrides the build_to result when both prefixes match.
    """
    sfm = None
    if build_to.startswith('GRC'):
        sfm = seq_data.to_chr_num_refseq(accession, build_to)
    if build_to.startswith('hg'):
        sfm = seq_data.to_chr_num_ucsc(accession, build_to)
    if build_from.startswith('GRC'):
        sfm = seq_data.to_chr_num_refseq(accession, build_from)
    if build_from.startswith('hg'):
        sfm = seq_data.to_chr_num_ucsc(accession, build_from)
    return sfm


def _with_chr_prefix(chrom):
    """Return the chromosome name with a leading 'chr'."""
    return chrom if chrom.startswith('chr') else 'chr' + chrom


def liftover(hgvs_genomic, build_from, build_to, hn, reverse_normalizer, evm, validator, specify_tx=False,
             liftover_level=False, g_to_g=False):
    """
    Step 1, attempt to liftover using a common RefSeq transcript
    Step 2, attempt to liftover using PyLiftover.
    Lift position > Check bases > Lift back and confirm the original position

    :param hgvs_genomic: genomic HGVS description (string, or already parsed)
    :param build_from: genome build of the input, e.g. 'GRCh37' or 'hg19'
    :param build_to: genome build to lift to, e.g. 'GRCh38' or 'hg38'
    :param hn: normalizer applied to the pyliftover-derived description
    :param reverse_normalizer: passed through to hgvs_utils.report_hgvs2vcf
    :param evm: optional mapper providing relevant_transcripts(); may be None
    :param validator: Validator obj
    :param specify_tx: when not False, the only transcript tried in step 1
    :param liftover_level: 'primary' restricts step 1 to NC_ accessions;
        otherwise NT_ and NW_ accessions are considered too
    :param g_to_g: when True, skip step 1 and go straight to pyliftover
    :return: dict keyed by lower-cased build name, then by accession, holding
        'hgvs_genomic_description' and 'vcf' entries
    """
    try:
        hgvs_genomic = validator.hp.parse(hgvs_genomic)
    except TypeError as e:
        # Input was not a string (e.g. already parsed); carry on with it as is
        logger.debug("Except passed, %s", e)

    # Create return dictionary
    lifted_response = {}

    # Check genome build type
    from_set, alt_from_set, lo_from, alt_build_from = _classify_build(build_from)
    to_set, alt_to_set, lo_to, alt_build_to = _classify_build(build_to)

    # populate the variant from data
    vcf = hgvs_utils.report_hgvs2vcf(hgvs_genomic, build_from, reverse_normalizer, validator.sf)

    # Create to and from dictionaries
    lifted_response[build_from.lower()] = {}
    lifted_response[build_from.lower()][hgvs_genomic.ac] = _vcf_entry(hgvs_genomic, vcf, from_set)
    lifted_response[alt_build_from.lower()] = {}
    lifted_response[alt_build_from.lower()][hgvs_genomic.ac] = _vcf_entry(hgvs_genomic, vcf, alt_from_set)

    # To dictionaries are currently blank
    lifted_response[build_to.lower()] = {}
    lifted_response[alt_build_to.lower()] = {}

    # Get a list of overlapping RefSeq transcripts
    # Note, due to 0 base positions in UTA (I think) occasionally tx will
    # be identified that cannot be mapped to.
    rts_list = validator.hdp.get_tx_for_region(
        hgvs_genomic.ac, 'splign',
        hgvs_genomic.posedit.pos.start.base - 1,
        hgvs_genomic.posedit.pos.end.base)  # - 1)
    rts_dict = {}
    tx_list = False
    if g_to_g is not True:
        for tx_dat in rts_list:
            rts_dict[tx_dat[0]] = True
        if evm is not None:
            rts_list_2 = evm.relevant_transcripts(hgvs_genomic)
        else:
            rts_list_2 = []
        for tx_dat_2 in rts_list_2:
            rts_dict[tx_dat_2] = True
        if rts_dict != {}:
            tx_list = list(rts_dict.keys())

    # Try to liftover
    if tx_list is not False:
        selected = []
        # Liftover via a specific tx if it can be done!
        if specify_tx is not False:
            tx_list = [specify_tx]
        for tx in tx_list:
            # identify the first transcript if any
            options = validator.hdp.get_tx_mapping_options(tx)
            for op in options:
                accession = op[1]
                if accession.startswith('NC_'):
                    wanted = True
                elif liftover_level == 'primary':
                    wanted = False
                else:
                    wanted = accession.startswith(('NT_', 'NW_'))
                if wanted and _chr_num_for(accession, build_to, build_from) is not None:
                    selected.append([op[0], accession])

        # remove duplicate chroms (the first transcript seen for a chrom wins)
        filtered_1 = {}
        for chroms in selected:
            if chroms[1] not in filtered_1:
                filtered_1[chroms[1]] = chroms[0]

        added_data = False
        for key, val in list(filtered_1.items()):
            try:
                # Note, due to 0 base positions in UTA (I think) occasionally tx will
                # be identified that cannot be mapped to.
                # In this instance, do not mark added data as True
                hgvs_tx = validator.vm.g_to_t(hgvs_genomic, val)
                hgvs_alt_genomic = validator.vm.t_to_g(hgvs_tx, key)
                alt_vcf = hgvs_utils.report_hgvs2vcf(
                    hgvs_alt_genomic, build_to, reverse_normalizer, validator.sf)

                # Add the to build dictionaries
                lifted_response[build_to.lower()][hgvs_alt_genomic.ac] = _vcf_entry(
                    hgvs_alt_genomic, alt_vcf, to_set)
                lifted_response[alt_build_to.lower()][hgvs_alt_genomic.ac] = _vcf_entry(
                    hgvs_alt_genomic, alt_vcf, alt_to_set)

                # Overwrite build from info as PAR may require additional info
                lifted_response[build_from.lower()][hgvs_alt_genomic.ac] = _vcf_entry(
                    hgvs_alt_genomic, alt_vcf, to_set)
                lifted_response[alt_build_from.lower()][hgvs_alt_genomic.ac] = _vcf_entry(
                    hgvs_alt_genomic, alt_vcf, alt_to_set)
                added_data = True
            except vvhgvs.exceptions.HGVSError:
                continue

        if added_data:
            return lifted_response

    # Note: pyliftover uses the UCSC liftOver tool.
    # https://pypi.org/project/pyliftover/
    # Once validated, download the UCSC liftover files from
    # http://hgdownload.cse.ucsc.edu/goldenPath/hg38/liftOver/
    # The structure of the following code comes from VV pymod, so need to create a list
    genome_builds = [build_to]

    # Create liftover vcf
    from_vcf = hgvs_utils.report_hgvs2vcf(hgvs_genomic, lo_from, reverse_normalizer, validator.sf)
    lo = LiftOver(lo_from, lo_to)

    # Fix the GRC CHR
    from_chrom = _with_chr_prefix(from_vcf[from_set])
    # NOTE(review): pyliftover documents 0-based coordinates; the VCF position
    # is passed unchanged here and compared unchanged on the way back - confirm.
    # convert_coordinate can return None for an unknown chromosome
    liftover_list = lo.convert_coordinate(from_chrom, int(from_vcf['pos'])) or []

    # The reverse lifter is created once, on first use, instead of per hit
    lo_back = None

    # Create dictionary
    for lifted in liftover_list:
        chrom = lifted[0]
        pos = lifted[1]
        orientated = lifted[2]

        lifted_ref_bases = from_vcf['ref']
        lifted_alt_bases = from_vcf['alt']

        # Inverted sequence
        if orientated != '+':
            lifted_ref_bases = Seq(lifted_ref_bases).reverse_complement()
            lifted_alt_bases = Seq(lifted_alt_bases).reverse_complement()

        accession = seq_data.to_accession(chrom, lo_to)
        if accession is None:
            wrn = 'Unable to identify an equivalent %s chromosome ID for %s' % (
                str(lo_to), str(chrom))
            logger.info(wrn)
            continue

        not_delins = accession + ':g.' + str(pos) + '_' + str(
            (pos - 1) + len(lifted_ref_bases)) + 'delins' + str(lifted_alt_bases)
        hgvs_not_delins = validator.hp.parse_hgvs_variant(not_delins)
        hgvs_lifted = hn.normalize(hgvs_not_delins)

        # Now try map back
        if lo_back is None:
            lo_back = LiftOver(lo_to, lo_from)

        # Lift back
        liftback_list = lo_back.convert_coordinate(chrom, pos) or []
        for lifted_back in liftback_list:
            # Pull out the good guys!
            # Both names are compared with the 'chr' prefix applied, so the
            # lift-back must land on the chromosome the variant came from
            if _with_chr_prefix(lifted_back[0]) != from_chrom:
                continue
            if lifted_back[1] != int(from_vcf['pos']):
                continue
            for build in genome_builds:
                vcf_dict = hgvs_utils.report_hgvs2vcf(
                    hgvs_lifted, build, reverse_normalizer, validator.sf)
                if build.startswith('GRC'):
                    primary_key, alt_key = 'grc_chr', 'ucsc_chr'
                else:
                    primary_key, alt_key = 'ucsc_chr', 'grc_chr'
                lifted_response[build_to.lower()][hgvs_lifted.ac] = _vcf_entry(
                    hgvs_lifted, vcf_dict, primary_key)
                lifted_response[alt_build_to.lower()][hgvs_lifted.ac] = _vcf_entry(
                    hgvs_lifted, vcf_dict, alt_key)

    return lifted_response
else: # Sometimes it includes multipe dbSNP/rsID combinations, so always # take the more recent rsID (farmost right). rsid = cols[7].split(":")[-1] phase = [x[0] for x in cols[8:]] allele1 = [x[1] for x in cols[8:]] allele2 = [x[2] for x in cols[8:]] # Only report SNPs if type != "snp": continue log_snps += 1 # Convert hg19 coordinate to hg38 using pyliftover # CGI is 0-based, so feed the start position to pyliftover lo_result = lo.convert_coordinate("chr" + chr, int(start)) # Unable to convert from hg19 to hg38 if len(lo_result) == 0: log_nonconvert += 1 sys.stderr.write("No conversion\t%s\t%s\t%s\n"%(chr, start, rsid)) continue # Multiple coordinates on hg38 if len(lo_result) > 1: log_multi += 1 sys.stderr.write("Multiple coordinates\t%s\t%s\t%s\n"%(chr, start, rsid)) continue lo_chr, lo_pos = lo_result[0][:2]
def addTSSInfo(self, vcfInputFile):
    """Annotate each variant of a VCF with a TSSOL (TSS overlap) INFO flag.

    Every record is lifted over with the hg38ToHg19 chain file, the lifted
    region is looked up through the FANTOM5 SPARQL endpoint, and the record is
    written to ``output.vcf`` (in the current directory).  Records that cannot
    be lifted are written without the TSSOL field.

    :param vcfInputFile: path of the VCF file to read
    """
    query = SPARQLQueries.sparqlQueries()
    totalVar = 0
    tssOLVar = 0
    lo = LiftOver('hg38ToHg19.over.chain.gz')

    # Context managers make sure both files are closed (and output flushed)
    with open(vcfInputFile, 'r') as inHandle:
        vcf_reader = vcf.Reader(inHandle)
        vcf_reader.infos['TSSOL'] = VcfInfo(
            'TSSOL', vcf_field_counts['A'], 'String',
            'Info indicates whether the variant overlapping with the'
            ' transcription start site(TSS)')

        with open('output.vcf', 'w') as outHandle:
            vcf_writer = vcf.VCFWriter(outHandle, vcf_reader)

            for record in vcf_reader:
                variantStart = record.start
                variantEnd = record.end
                variantChromosome = record.CHROM
                variantSubType = record.var_subtype
                isOverlapping = False

                # Adding chr prefix to the chromosome
                if "chr" not in variantChromosome:
                    variantChromosome = "chr" + str(record.CHROM)

                # liftover with the hg38ToHg19 chain
                startHits = lo.convert_coordinate(variantChromosome, variantStart)
                print(variantStart)
                print(variantEnd)

                # convert_coordinate gives None or an empty list on failure
                endHits = None
                if startHits:
                    endHits = lo.convert_coordinate(variantChromosome, variantEnd)

                if startHits and endHits:
                    # the last hit is used, as the original pop() did
                    variantChromosomehg19 = startHits[-1][0]
                    variantStarthg19 = startHits[-1][1]
                    variantEndhg19 = endHits[-1][1]

                    # SPARQL query
                    result = query.getTSS('http://ep.dbcls.jp/fantom5/sparql',
                                          variantStarthg19, variantEndhg19,
                                          variantChromosomehg19)
                    for row in result:
                        values = sparql.unpack_row(row)
                        cageStart = values[1]
                        # NOTE(review): variantStart is the un-lifted position
                        # while cageStart comes from the hg19 query - confirm
                        # whether variantStarthg19 was intended here.
                        if variantSubType == 'ins':
                            overlaps = variantStart > cageStart
                        else:
                            overlaps = cageStart > 0
                        if overlaps:
                            isOverlapping = True
                            tssOLVar = tssOLVar + 1
                            break

                    totalVar = totalVar + 1
                    record.add_info('TSSOL', [isOverlapping])
                else:
                    # record.ID may be None, so convert before concatenating
                    print("No liftover found for this pos = " + str(record.ID))

                vcf_writer.write_record(record)

    print("No of variants = " + str(totalVar))
    print("No of tss overlapping variants = " + str(tssOLVar))
class SubmitHiCLiftOver: def __init__(self, args): self.args = args self.doLiftOver = LiftOver('hg19', 'hg38') self.lengths_orig = [] self.lengths_filtered = [] self.oldVsNew = [] def splitStrCoordStr(self, raw): chrom = raw.split(':')[0] start = raw.split(':')[1].split('-')[0] end = raw.split(':')[1].split('-')[1] return "\t".join([chrom, start, end]) def splitStrCoord(self, raw): chrom = raw.split(':')[0] start = raw.split(':')[1].split('-')[0] end = raw.split(':')[1].split('-')[1] return [chrom, int(start), int(end)] def wrapLiftover(self, debug, chrom, start, end, errMsg): lift_start = self.doLiftOver.convert_coordinate(chrom, start) if not lift_start: if debug: print(errMsg + " start", chrom, start) return None lift_start = lift_start[0] lift_end = self.doLiftOver.convert_coordinate(chrom, end) if not lift_end: if debug: print(errMsg + " end", chrom, end) return None lift_end = lift_end[0] if lift_start[0] != lift_end[0]: if debug: print(errMsg + " no longer same chrom", chrom, start, end, lift_start[0], lift_end[0]) return None oldLen = end - start chromLift = lift_start[0] startLift = lift_start[1] endLift = lift_end[1] newLen = endLift - startLift if oldLen < 1: if debug: print(errMsg + " oldLen: negative!", chrom, start, end) return None if newLen < 1: if debug: print(errMsg + " newLen: negative!", chromLift, startLift, endLift) return None absDiff = abs(newLen - oldLen) return [chromLift, startLift, endLift, oldLen, newLen, absDiff] def coordToStr(self, c): return c[0] + ':' + str(c[1]) + '-' + str(c[2]) def parseLine(self, line): # chr10 3240001 4120000 boundary.3|hg19|chr10:3240001-3280000___boundary.4|hg19|chr10:4080001-4120000 1.06090369391 # [0chrom, 1start, 2end, 3mess, 4value] toks = line.split() leftCoord = toks[:3] leftCoord[1] = int(leftCoord[1]) leftCoord[2] = int(leftCoord[2]) mtoks = toks[3].split('|') midBoundaryLeft = mtoks[0] if 3 != len(mtoks): midBoundaryRight = mtoks[2].split('__')[1] midCoordRaw = mtoks[2].split('__')[0] midCoord = 
self.splitStrCoord(midCoordRaw) if 3 != len(mtoks): rightCoord = self.splitStrCoord(mtoks[-1]) leftCoordLift = self.wrapLiftover(False, leftCoord[0], leftCoord[1], leftCoord[2], "left") if not leftCoordLift: return None self.lengths_orig.append([leftCoordLift[3], leftCoordLift[4]]) if leftCoordLift[5] > 5000: if 0: print("skipping b/c of lengths change") return None midCoordLift = self.wrapLiftover(False, midCoord[0], midCoord[1], midCoord[2], "mid") if not midCoordLift: return None if midCoordLift[5] > 5000: return None if 3 != len(mtoks): rightCoordLift = self.wrapLiftover(False, rightCoord[0], rightCoord[1], rightCoord[2], "right") if not rightCoordLift: return None if rightCoordLift[5] > 5000: return None self.lengths_filtered.append([leftCoordLift[3], leftCoordLift[4]]) if 3 != len(mtoks): mid = [midBoundaryLeft, "hg38-liftOver", self.coordToStr(midCoordLift) + '___' + midBoundaryRight, "hg38-liftOver", self.coordToStr(rightCoordLift)] else: mid = [midBoundaryLeft, "hg38-liftOver", self.coordToStr(midCoordLift)] ret = "\t".join([str(x) for x in leftCoordLift[:3] + ['|'.join(mid)] + [toks[4]]]) self.oldVsNew.append([line, ret]) return ret def tmpFile(self, accession, assembly, prefix): return os.path.join("/home/mjp/tadsLiftOverHg19ToHg38", assembly + "_liftOver_" + prefix + '_' + accession + ".bed.gz") def parseOutFile(self, accession, fnp): good = 0 bad = 0 with gzip.open(fnp) as f: with gzip.open(self.tmpFile(accession, 'hg38', 'point'), 'wb') as outF: for line in f: newLine = self.parseLine(line) if newLine: outF.write(newLine + '\n') good += 1 else: bad += 1 print("lifted:", accession, good, bad) def runLiftover(self): mc = MemCacheWrapper() qd = QueryDCC(cache=mc) url = "https://www.encodeproject.org/search/?type=Experiment&assay_title=Hi-C&status=released" for exp in qd.getExps(url): for f in exp.getTADs(): f.download() self.parseOutFile(f.fileID, f.fnp()) fnp = "/home/mjp/tadsLiftOverHg19ToHg38/lengths_orig.tsv" with open(fnp, 'w') as f: for r in 
self.lengths_orig: f.write('\t'.join([str(x) for x in r]) + '\n') print("wrote", fnp) fnp = "/home/mjp/tadsLiftOverHg19ToHg38/lengths_filtered.tsv" with open(fnp, 'w') as f: for r in self.lengths_filtered: f.write('\t'.join([str(x) for x in r]) + '\n') print("wrote", fnp) fnp = "/home/mjp/tadsLiftOverHg19ToHg38/oldVsNew.tsv" with open(fnp, 'w') as f: for r in self.oldVsNew: f.write(r[0]) f.write(r[1] + '\n') print("wrote", fnp) def fileJson(self, exp, f, fnp): return { "dataset": exp.encodeID, "file_format": "bed", "file_format_type": "bed3+", "file_size": os.path.getsize(fnp), "md5sum": Utils.md5(fnp), "output_type": f.output_type, "assembly": "GRCh38", "award": "/awards/U41HG007000/", "lab": "/labs/zhiping-weng/", "derived_from": [f.fileID], "submitted_file_name": fnp, "aliases": ["zhiping-weng:hic-tad-hg38-liftOver-" + f.fileID] } def submitFile(self, exp, f): fileAccession = f.fileID fnp = self.tmpFile(fileAccession, 'hg38', 'point') j = self.fileJson(exp, f, fnp) print(j) submitFile(self.args, j) def runSubmit(self): authenticateEncodeTxt(self.args) mc = MemCacheWrapper() qd = QueryDCC(cache=mc) url = "https://www.encodeproject.org/search/?type=Experiment&assay_title=Hi-C&status=released" for exp in qd.getExps(url): for f in exp.getTADs(): f.download() self.submitFile(exp, f)
# Map both identifier columns (0 and 1) of the mapping file to the name in
# column 2.
mapping_dict = {}
with open(mapping_file, "r") as map_f:
    for line in map_f:
        line_p = line.rstrip("\n").split("\t")
        if len(line_p) < 3:
            # blank or malformed mapping line
            continue
        mapping_dict[line_p[0]] = line_p[2]
        mapping_dict[line_p[1]] = line_p[2]
print(mapping_dict)

enhancer_files = glob.glob(enhancer_dir + "/*.txt")
print(enhancer_files)

for e_file in enhancer_files:
    if e_file.endswith((".py", "mapping")):
        continue
    e_file_name = e_file.split("/")[-1]
    with open(e_file, "r") as e, \
            open(processed_dir + "converted_" + e_file_name, "w") as p:
        for line in e:
            p_line = line.rstrip("\n").split("\t")
            print(p_line)
            if len(p_line) < 7:
                # no candidate column on this line
                continue
            candidate = p_line[6]
            if candidate not in mapping_dict:
                continue

            # convert_coordinate returns a list of (chrom, pos, strand, score)
            # hits, or None/[] when the position cannot be lifted
            start_hits = lo.convert_coordinate(p_line[0], int(p_line[1]))
            end_hits = lo.convert_coordinate(p_line[0], int(p_line[2]))
            if not start_hits or not end_hits:
                continue
            # Column 0 is written unchanged, so drop regions whose ends were
            # lifted onto another chromosome
            if start_hits[0][0] != p_line[0] or end_hits[0][0] != p_line[0]:
                continue

            new_line = p_line[0:6] + [mapping_dict[candidate]]
            # Write the lifted position itself, not the repr of the hit list
            new_line[1] = str(start_hits[0][1])
            new_line[2] = str(end_hits[0][1])
            p.write("\t".join(new_line) + "\n")
#!/usr/bin/env python3
"""Lift the two coordinate columns of a GRCh37 variant table to hg38.

Columns 1 and 2 of the space-separated input are lifted with pyliftover and
stored in the new columns 6 and 7; rows that cannot be lifted are collected in
``id_not_found`` as ``[row index, value of column 5]``.
"""
import pandas as pd
from pyliftover import LiftOver

File = "p-Value_threshold_1_hapmap3_all_variant_effect_non_zero_GRCh37.txt"
Input_file = pd.read_csv(File, index_col=None, header=None, sep=" ")
lo = LiftOver('hg19', 'hg38')
Input_file[6] = ""
Input_file[7] = ""
#hg38 = []
id_not_found = list()
Asd = list()
for var in range(len(Input_file)):
    print(var)
    # str() also copes with non-numeric chromosome names such as "X"
    chrom = "chr" + str(Input_file.at[var, 0])
    Asd = lo.convert_coordinate(chrom, Input_file.at[var, 1])
    Asdf = lo.convert_coordinate(chrom, Input_file.at[var, 2])
    # None (unknown chromosome) and [] (no mapping) both count as not found;
    # a row is only filled in when both coordinates were lifted
    if not Asd or not Asdf:
        id_not_found.append([var, Input_file.at[var, 5]])
        continue
    # .at writes into the frame itself; Input_file[6][var] = ... is a chained
    # assignment that may only modify a temporary copy
    Input_file.at[var, 6] = Asd[0][1]
    Input_file.at[var, 7] = Asdf[0][1]

# NOTE(review): the next two statements are chained assignments on a boolean
# selection; they write to a temporary copy and leave Input_file unchanged.
# Confirm whether a value for rs12728058 was meant to be stored.
Input_file.loc[Input_file[5] == "rs12728058"][6] = 555
Input_file.loc[Input_file[5] == "rs12728058"][7] = ""

# Manual look-up / correction of individual unmapped rows; guarded so the
# script does not fail when fewer rows were unmapped
if len(id_not_found) > 5:
    Input_file.loc[id_not_found[5][0], ]
if len(id_not_found) > 4:
    Input_file.loc[id_not_found[4][0], 6] = 142739784
    Input_file.loc[id_not_found[4][0], 7] = 142739784
def get_info(snp_code, snp_list):
    """Fetch Ensembl VEP information for one SNP and save it to a file.

    Every row of ``snp_list`` whose first element equals ``snp_code`` is
    processed: its hg18 position on chromosome 20 is lifted to hg38, a genomic
    HGVS id is built from the new position and the two alleles, and the VEP
    REST service is queried.  A successful answer is written to
    ``<snp_code>.info`` in the current directory.

    :param snp_code: SNP identifier to look up, e.g. 'snp_90'
    :param snp_list: list of rows; element 0 is the SNP code, element 2 the
        hg18 coordinate, elements 3 and 4 the ref and alt base
    :return: always the empty string
    """
    chr_prefix = "20"  # all SNPs of the dataset are on chromosome 20
    old_build = "hg18"
    new_build = "hg38"

    matches = [row for row in snp_list if row[0] == snp_code]
    if not matches:
        # The user didn't type a snp_id that is present in the dataset
        print(
            '\nSorry, but', snp_code,
            'is not included in the dataset.\nOr maybe you have misspelled the snp_id?.\nPlease try again.\n'
        )
        return ""

    url = 'http://rest.ensembl.org/vep/human/hgvs/'
    headers = {"Content-Type": "application/json"}
    lo = None

    for row in matches:
        sys.stdout.write(
            "\n\nUpdating chromosome coordinates to new assembly..")
        if lo is None:
            # Loading the chain file is slow, so it is done once only
            lo = LiftOver(old_build, new_build)

        # convert_coordinate works on 0-based positions and returns a list of
        # (chromosome, position, strand, score) tuples, e.g.
        # [('chr20', 80456, '+', 5643036713)]; the list is empty (or None)
        # when the position cannot be lifted.
        pylift_tuple = lo.convert_coordinate('chr' + chr_prefix,
                                             int(row[2]) - 1)
        if not pylift_tuple:
            sys.stdout.write(" Failed \n")
            print('Sorry, information for ', snp_code,
                  " currently unavailable.")
            continue
        sys.stdout.write(" Done \n")

        # back to a 1-based coordinate for the HGVS id
        hg38_coordinates = int(pylift_tuple[0][1]) + 1

        # Not all samples have an rs_id, so the genomic HGVS id is rebuilt
        # from the coordinates, e.g. '20:g.22125504G>C?'
        request_id = chr_prefix + ":g." + str(
            hg38_coordinates) + row[3] + '>' + row[4] + '?'

        r = requests.get(url + request_id, headers=headers)
        if r.ok:
            data = r.json()
            print('\nQuery successful!\n\nInfo about', snp_code,
                  "has been saved in the file: ", snp_code + ".info")
            print('\nYour json file contains the following',
                  len(data[0]), 'keys:\n')
            print('\n'.join(list(data[0].keys())), '\n\n')
            # save the decoded json answer
            with open(snp_code + ".info", 'w') as save:
                print(data, file=save)
        else:
            print('Sorry, information for ', snp_code,
                  " currently unavailable.")

    return ""
__author__ = 'rajaram'

#Reference : https://pypi.python.org/pypi/pyliftover
#Liftover data : http://hgdownload.cse.ucsc.edu/gbdb/hg38/liftOver/
from pyliftover import LiftOver

#lo = LiftOver('hg38', 'hg19')
lo = LiftOver('hg38ToHg19.over.chain.gz')

# Lift 100 consecutive chr1 positions and show the result of each conversion
for x in range(0, 100):
    data = lo.convert_coordinate('chr1', 1000000+x)
    print(data)
    # None (unknown chromosome) or [] (no mapping): nothing to pop
    if not data:
        continue
    data2 = data.pop()
    print(data2[0])
import re
import sys

import pandas
from pyliftover import LiftOver

liftover = LiftOver('hg18', 'hg19')

clonal = pandas.read_csv("nature13600-s1-table-s6-clonal.tsv", sep = "\t")
subclonal = pandas.read_csv("nature13600-s1-table-s7-subclonal.tsv", sep = "\t")
all_variants = pandas.concat( [clonal, subclonal] )

# keep only validated variants, i.e. those with a Duplex_P_val smaller 0.01 and not NA
filtered = all_variants[ all_variants['Duplex_P_val'].str.contains('0,0[123456789]|NA|0,[123456789]') == False ]

with open("Wang2014_ground_truth_non_synonymous_variants.hg18_to_hg19.tsv", mode='w') as out:
    print( '\t'.join( filtered.columns.values.tolist() ), file=out, end='\n')
    for index, row in filtered.iterrows():
        print( row['chrom'], row['pos'], row['REF'], row['VAR'] )

        # pyliftover is 0-based: shift down before, and back up after, lifting
        hits = liftover.convert_coordinate(row['chrom'], row['pos'] - 1)
        if not hits:
            # None or [] - report and leave the variant out instead of crashing
            print( 'WARNING: no hg19 position for', row['chrom'], row['pos'], file=sys.stderr )
            continue
        lifted = hits[0]
        row['chrom'] = lifted[0]
        row['pos'] = lifted[1] + 1

        # reclassify clonals' zygosity based on the Duplex_Freq
        if row['class'] == 'clonal':
            freq = float(row['Duplex_Freq'].replace(',', '.'))
            row['zygosity'] = 'hom' if freq >= 0.6 else 'het'

        print( '\t'.join(map(str, row)) )
        print( '\t'.join(map(str, row)), file=out, end='\n' )
import sys import os indir = os.environ['indir'] summarystats = os.environ['summarystats'] converted = os.environ['converted'] # download chain file # hg38 to hg19 lo = LiftOver('hg38', 'hg19') # read in sumstats sumstats = pd.read_csv('{}/{}'.format(indir, summarystats), sep='\t') # convert coordinates chrom = lambda x: lo.convert_coordinate(x.CHR, x.POS)[0][0] if len( lo.convert_coordinate(x.CHR, x.POS)) > 0 else 'chr0' loc = lambda x: lo.convert_coordinate(x.CHR, x.POS)[0][1] if len( lo.convert_coordinate(x.CHR, x.POS)) > 0 else 0 sumstats['Lifted_Chrom'] = sumstats.apply(chrom, axis='columns') sumstats['Lifted_Loc'] = sumstats.apply(loc, axis='columns') # drop unmatched data sumstats = sumstats[sumstats.Lifted_Loc != 0] # reformat and eliminate columns unnecessary for ldsc sumstats.drop(columns=['CHR', 'POS'], inplace=True) sumstats.rename(columns={ 'Lifted_Chrom': 'CHR', 'Lifted_Loc': 'POS',
for line in f: # Read columns for each variant in the MAF file columns = line.split('\t') # Filter empty rows and headers if len(columns)>2 and columns[0] != "Hugo_Symbol": pair_key = columns[15] + ' ' + columns[16] # Filtering variants in TCGA # 1) SNPs # 2) This sample comparison exists in GDC if columns[9] == "SNP" and pair_key in gdc_pairs: start = lo.convert_coordinate('chr' + columns[4], int(columns[5])) end = lo.convert_coordinate('chr' + columns[4], int(columns[6])) total_variants += 1 # Check if reference has been correctly crossed if start is not None and end is not None and len(start)==1 and len(end)==1: refbase = BedTool.seq(start[0][0].replace('chr','') + ':' + str(start[0][1]) + '-' + str(end[0][1]), fastaRef) # Check if reference in TCGA is the same in hg38 ref if refbase == columns[10]: variant_key = ' '.join([start[0][0], str(start[0][1]), str(end[0][1]), start[0][2], columns[15], columns[16]]) # Create pair if it is not created if pair_key in pair_list:
__author__ = 'rajaram'

#Reference : https://pypi.python.org/pypi/pyliftover
#Liftover data : http://hgdownload.cse.ucsc.edu/gbdb/hg38/liftOver/
from pyliftover import LiftOver

#lo = LiftOver('hg38', 'hg19')
lo = LiftOver('hg38ToHg19.over.chain.gz')

# Lift 100 consecutive chr1 positions and show the result of each conversion
for x in range(0, 100):
    data = lo.convert_coordinate('chr1', 1000000 + x)
    print(data)
    # None (unknown chromosome) or [] (no mapping): nothing to pop
    if not data:
        continue
    data2 = data.pop()
    print(data2[0])
shutil.copy(os.path.join(hg19_dir,'%s_desc.xml' %hg19_test), os.path.join(hg18_dir,'%s_desc.xml' %hg18_test)) shutil.copy(os.path.join(hg19_dir,'%s_input.txt' %hg19_test), os.path.join(hg18_dir,'%s_input.txt' %hg18_test)) shutil.copy(os.path.join(hg19_dir,'%s_key.csv' %hg19_test), os.path.join(hg18_dir,'%s_key.csv' %hg18_test)) # Add a <hg18>on</hg18> tag to the desc.xml print 'Changing desc file' desc_path = os.path.join(hg18_dir,'%s_desc.xml' %hg18_test) desc = ET.parse(desc_path) hg18 = ET.Element('hg18') hg18.text = 'on' desc.find('sub_params').append(hg18) desc.write(desc_path) # Shift genomic coordinates to hg18 print 'Lifting over coordinates' input_path = os.path.join(hg18_dir,'%s_input.txt' %hg18_test) input_text = open(input_path,'r').read() lines19 = input_text.split('\n') lines18 = [] for line19 in lines19: elems19 = line19.split('\t') elems18 = elems19 genom18 = lo.convert_coordinate(elems19[1],int(elems19[2]))[0] elems18[1] = genom18[0] elems18[2] = str(genom18[1]) lines18.append('\t'.join(elems18)) with open(input_path,'w') as f: f.write('\n'.join(lines18)) print 'Completed' print 'All completed'
class MasterCravatConverter(object):
    """
    Convert a file of ambiguous format to .crv format.

    Reads in CravatConverter classes in the same directory, selects the
    correct converter, and writes a crv file (plus the companion .crs,
    .crm, .err and, when lifting over, .var files).
    """

    ALREADYCRV = 2

    def __init__(self, args=None):
        """
        Parse the command line and open the log file.

        args: argv-style list; falls back to sys.argv when empty/None.
        Any exception is routed to __handle_exception, which exits.
        """
        try:
            args = args if args else sys.argv
            self.input_path = None
            self.f = None
            self.input_format = None
            self.logger = None
            self.crv_writer = None
            self.crs_writer = None
            self.crm_writer = None
            self.crl_writer = None
            self.err_file = None
            self.primary_converter = None
            self.converters = {}
            self.possible_formats = []
            self.ready_to_convert = False
            self.cmd_args = None
            self.output_dir = None
            self.output_base_fname = None
            # Liftover state; overwritten by _parse_cmd_args.
            self.input_assembly = None
            self.do_liftover = False
            self.lifter = None
            self.vtracker = VTracker()
            self._parse_cmd_args(args)
            self._setup_logger()
        except Exception as e:
            self.__handle_exception(e)

    def _parse_cmd_args(self, args):
        """ Parse the arguments in sys.argv """
        parser = argparse.ArgumentParser()
        parser.add_argument('path',
                            help='Path to this converter\'s python module')
        parser.add_argument('input',
                            help='File to be converted to .crv')
        parser.add_argument('-f',
                            dest='format',
                            help='Specify an input format')
        parser.add_argument('-n', '--name',
                            dest='name',
                            help='Name of job. Default is input file name.')
        parser.add_argument('-d', '--output-dir',
                            dest='output_dir',
                            help='Output directory. '
                                 + 'Default is input file directory.')
        parser.add_argument(
            '-l', '--liftover',
            dest='liftover',
            choices=['hg38'] + list(constants.liftover_chain_paths.keys()),
            default='hg38',
            help='Input gene assembly. Will be lifted over to hg38')
        parsed_args = parser.parse_args(args)
        self.input_path = os.path.abspath(parsed_args.input)
        if parsed_args.format:
            self.input_format = parsed_args.format
        input_dir, input_fname = os.path.split(self.input_path)
        if parsed_args.output_dir:
            self.output_dir = parsed_args.output_dir
        else:
            self.output_dir = input_dir
        if not os.path.exists(self.output_dir):
            os.makedirs(self.output_dir)
        if parsed_args.name:
            self.output_base_fname = parsed_args.name
        else:
            self.output_base_fname = input_fname
        self.input_assembly = parsed_args.liftover
        # hg38 is the target assembly, so only other assemblies need a chain.
        self.do_liftover = self.input_assembly != 'hg38'
        if self.do_liftover:
            self.lifter = LiftOver(
                constants.liftover_chain_paths[self.input_assembly])
        else:
            self.lifter = None

    def setup(self):
        """ Do necessary pre-run tasks (idempotent). """
        if self.ready_to_convert:
            return
        # Open file handle to input path
        self.f = open(self.input_path)
        # Read in the available converters
        self._initialize_converters()
        # Select the converter that matches the input format
        self._select_primary_converter()
        # A correct .crv file is not processed.
        if self.input_format == 'crv' and \
                self.input_path.split('.')[-1] == 'crv':
            # sys.exit rather than the site-provided exit() builtin, which
            # is not guaranteed to exist in every interpreter setup.
            sys.exit(1)
        # Open the output files
        self._open_output_files()
        self.ready_to_convert = True

    def _setup_logger(self):
        """ Open a log file and set up log handler """
        self.log_path = os.path.join(
            self.output_dir,
            self.output_base_fname + '.converter.log')
        self.logger = logging.getLogger('converter_log')
        self.logger.propagate = False
        self.logger.setLevel('INFO')
        handler = logging.FileHandler(self.log_path, mode='w')
        formatter = logging.Formatter()
        handler.setFormatter(formatter)
        self.logger.addHandler(handler)
        self.logger.info('MasterConverter log')
        self.logger.info('Opened %s' % time.asctime())
        self.logger.info('Input file: %s' % self.input_path)
        if self.do_liftover:
            self.logger.info('Liftover from %s' % self.input_assembly)

    def _initialize_converters(self):
        """
        Read in the available converters.

        Loads every module returned by au.get_local_module_infos_of_type
        ('converter'), instantiates its CravatConverter class and stores it
        in self.converters keyed by the converter's format_name.

        Raises Exception if two converters declare the same format.
        """
        for module_info in au.get_local_module_infos_of_type(
                'converter').values():
            # path based import from
            # https://docs.python.org/3/library/importlib.html#importing-a-source-file-directly
            spec = importlib.util.spec_from_file_location(
                module_info.name, module_info.script_path)
            module = importlib.util.module_from_spec(spec)
            spec.loader.exec_module(module)
            converter = module.CravatConverter()
            if converter.format_name not in self.converters:
                self.converters[converter.format_name] = converter
            else:
                err_msg = 'Cannot load two converters for format %s' \
                    % converter.format_name
                raise Exception(err_msg)
        self.possible_formats = list(self.converters.keys())

    def _select_primary_converter(self):
        """
        Choose the converter which matches the input format.

        If an input format was not specified in the cmd args, uses the
        check_format() method of the CravatConverters to identify a
        converter which can parse the input file. Exits the process when
        the format is invalid, undetermined or ambiguous.
        """
        if self.input_format is not None:
            if self.input_format not in self.possible_formats:
                sys.exit('Invalid input format. Please select from [%s]'
                         % ', '.join(self.possible_formats))
        else:
            valid_formats = []
            self.f.seek(0)
            for converter_name, converter in self.converters.items():
                check_success = converter.check_format(self.f)
                # Rewind so the next converter sees the file from the start.
                self.f.seek(0)
                if check_success:
                    valid_formats.append(converter_name)
            if len(valid_formats) == 0:
                sys.exit('Input format could not be determined. '
                         + 'Exiting without conversion.')
            elif len(valid_formats) > 1:
                sys.exit('Input format ambiguous in [%s]. '
                         % ', '.join(valid_formats)
                         + 'Please specify an input format.')
            else:
                self.input_format = valid_formats[0]
        self.primary_converter = self.converters[self.input_format]
        self.logger.info('Input format: %s' % self.input_format)

    def _open_output_files(self):
        """
        Open .crv .crs and .crm output files, plus .err file.

        .crv .crs and .crm files are opened using a CravatWriter.
        .err file will contain all errors which occur during conversion.
        .crm file maps lines in the input to UIDs in the output.
        When lifting over, a .var file holding the pre-liftover
        coordinates is opened as well.
        """
        # Setup CravatWriter
        self.wpath = os.path.join(self.output_dir,
                                  self.output_base_fname + '.crv')
        self.crv_writer = CravatWriter(self.wpath)
        self.crv_writer.add_columns(constants.crv_def)
        self.crv_writer.write_definition()
        for index_columns in constants.crv_idx:
            self.crv_writer.add_index(index_columns)
        self.logger.info('Output file: %s' % self.wpath)
        # Setup err file
        self.err_path = os.path.join(
            self.output_dir,
            self.output_base_fname + '.converter.err')
        self.err_file = open(self.err_path, 'w')
        self.logger.info('Error file: %s' % self.err_path)
        # Setup crm line mappings file
        self.crm_path = os.path.join(self.output_dir,
                                     self.output_base_fname + '.crm')
        self.crm_writer = CravatWriter(self.crm_path)
        self.crm_writer.add_columns(constants.crm_def)
        self.crm_writer.write_definition()
        for index_columns in constants.crm_idx:
            self.crm_writer.add_index(index_columns)
        self.logger.info('Map file: %s' % self.crm_path)
        # Setup crs sample file
        self.crs_path = os.path.join(self.output_dir,
                                     self.output_base_fname + '.crs')
        self.crs_writer = CravatWriter(self.crs_path)
        self.crs_writer.add_columns(constants.crs_def)
        if hasattr(self.primary_converter, 'addl_cols'):
            self.crs_writer.add_columns(self.primary_converter.addl_cols,
                                        append=True)
            # NOTE: this mutates the shared constants.crs_def list in place.
            constants.crs_def.extend(self.primary_converter.addl_cols)
        self.crs_writer.write_definition()
        for index_columns in constants.crs_idx:
            self.crs_writer.add_index(index_columns)
        self.logger.info('Sample crs file: %s' % self.crs_path)
        # Setup liftover var file
        if self.do_liftover:
            self.crl_path = '.'.join(
                [self.wpath, self.input_assembly, 'var'])
            self.crl_writer = CravatWriter(self.crl_path)
            # Deep copy so the per-assembly titles do not leak into the
            # shared constants.crl_def definition.
            assm_crl_def = copy.deepcopy(constants.crl_def)
            assm_crl_def[1]['title'] = '{0} Chrom'.format(
                self.input_assembly.title())
            assm_crl_def[2]['title'] = '{0} Position'.format(
                self.input_assembly.title())
            self.crl_writer.add_columns(assm_crl_def)
            self.crl_writer.write_definition()
            self.crl_writer.write_names(self.input_assembly,
                                        self.input_assembly.title())

    def run(self):
        """ Convert input file to a .crv file using the primary converter."""
        try:
            self.setup()
            start_time = time.time()
            self.logger.info('Conversion start: %s' %
                             time.asctime(time.localtime(start_time)))
            self.primary_converter.setup(self.f)
            self.f.seek(0)
            read_lnum = 0
            write_lnum = 0
            num_errors = 0
            for l in self.f:
                read_lnum += 1
                try:
                    # all_wdicts is a list, since one input line can become
                    # multiple output lines
                    all_wdicts = self.primary_converter.convert_line(l)
                    if all_wdicts is None:
                        continue
                except Exception as e:
                    num_errors += 1
                    self._log_conversion_error(read_lnum, e)
                    continue
                if all_wdicts:
                    UIDMap = []
                    for wdict in all_wdicts:
                        if wdict['ref_base'] == '' \
                                and wdict['alt_base'] not in ['A', 'T', 'C', 'G']:
                            num_errors += 1
                            e = BadFormatError(
                                'Reference base required for non SNV')
                            self._log_conversion_error(read_lnum, e)
                            continue
                        if self.do_liftover:
                            # Keep the original coordinates for the .var file.
                            prelift_wdict = copy.copy(wdict)
                            try:
                                wdict['chrom'], wdict['pos'] = self.liftover(
                                    wdict['chrom'], wdict['pos'])
                            except LiftoverFailure as e:
                                num_errors += 1
                                self._log_conversion_error(read_lnum, e)
                                continue
                        unique, UID = self.vtracker.addVar(
                            wdict['chrom'], int(wdict['pos']),
                            wdict['ref_base'], wdict['alt_base'])
                        wdict['uid'] = UID
                        if unique:
                            write_lnum += 1
                            self.crv_writer.write_data(wdict)
                            if self.do_liftover:
                                prelift_wdict['uid'] = UID
                                self.crl_writer.write_data(prelift_wdict)
                        if UID not in UIDMap:
                            # For this input line, only write to the .crm if
                            # the UID has not yet been written to the map
                            # file.
                            self.crm_writer.write_data({
                                'original_line': read_lnum,
                                'tags': wdict['tags'],
                                'uid': UID,
                            })
                            UIDMap.append(UID)
                        self.crs_writer.write_data(wdict)
            end_time = time.time()
            self.logger.info('Conversion end: %s' %
                             time.asctime(time.localtime(end_time)))
            self.logger.info('Read lines: %d' % read_lnum)
            self.logger.info('Error lines: %d' % num_errors)
            self.logger.info('Wrote lines: %d' % write_lnum)
            runtime = round(end_time - start_time, 3)
            self.logger.info('Conversion runtime: %s' % runtime)
            self._close_files()
        except Exception as e:
            self.__handle_exception(e)

    def liftover(self, old_chrom, old_pos):
        """
        Lift one coordinate over with self.lifter.

        Returns (new_chrom, new_pos) taken from the first hit.
        Raises LiftoverFailure when there is no hit.
        """
        new_coords = self.lifter.convert_coordinate(old_chrom, int(old_pos))
        # pyliftover returns None for a chromosome that is absent from the
        # chain file and an empty list for an unmappable position; both are
        # per-variant failures, not reasons to abort the whole conversion.
        if not new_coords:
            raise LiftoverFailure(old_chrom, old_pos)
        new_chrom = new_coords[0][0]
        new_pos = new_coords[0][1]
        return new_chrom, new_pos

    def __handle_exception(self, e):
        """
        Print the active traceback to stderr and exit.

        Exit code is 2 when the exception could also be written to the log
        file, 1 when no logger is available yet.
        """
        sys.stderr.write(traceback.format_exc())
        if hasattr(self, 'logger'):
            if self.logger is not None:
                self.logger.exception(e)
                sys.exit(2)
        sys.exit(1)

    def _log_conversion_error(self, ln, e):
        """
        Log exceptions thrown by primary converter.

        All exceptions are written to the .err file with the line number,
        exception type and message. Exceptions are also written to the log
        file, with the traceback. Exceptions of type InvalidData do not
        have their traceback logged.
        """
        err_toks = [str(x) for x in [ln, e.__class__.__name__, e]]
        self.err_file.write('\t'.join(err_toks) + '\n')
        if not isinstance(e, InvalidData):
            # Pass the exception explicitly: this method is also called for
            # errors that were constructed but never raised, where there is
            # no "current" exception for logger.exception() to pick up.
            self.logger.error(e, exc_info=e)

    def _close_files(self):
        """ Close the input and output files. """
        self.f.close()
        self.crv_writer.close()
        self.crm_writer.close()
        self.crs_writer.close()
        # The liftover .var writer only exists when a liftover was requested.
        if self.crl_writer is not None:
            self.crl_writer.close()
        self.err_file.close()
def _draw_gwas_catalog_hits(efo_path, y_max, d_pos_init_chrom):
    """Overlay dashed vertical lines for associations listed in efo_path.

    efo_path is read as a tab-separated file with a header line; columns
    7 (trait), 11 (chromosome), 12 (position) and 28 (-log10 p) are used.
    Positions are lifted with LiftOver('hg38', 'hg19') before plotting.
    Rows with a missing chromosome, a non-integer position, a chromosome
    absent from d_pos_init_chrom or no liftover hit are skipped.
    """
    ## Just make some assumptions about builds here for now.
    ## https://en.wikipedia.org/wiki/Reference_genome
    lo = LiftOver('hg38', 'hg19')

    ## First pass: find the most frequently occurring trait.
    with open(efo_path) as f:
        cnt = collections.Counter(line.split('\t')[7] for line in f)
    most_common = cnt.most_common(1)
    ## An empty file has no traits at all; nothing will be drawn then.
    trait_most_common = most_common[0][0] if most_common else None

    with open(efo_path) as f:
        ## Skip header.
        next(f, None)
        for line in f:
            l = line.split('\t')
            CHR_ID = l[11]
            ## Skip if missing data.
            if CHR_ID == '':
                continue
            ## Continue if CHR_POS is not an integer.
            try:
                CHR_POS = int(l[12])
            except ValueError:
                continue
            ## Clip the bar at the top of the plot.
            y = min(y_max, float(l[28]))
            if CHR_ID not in d_pos_init_chrom:
                ## X is expected to be absent; anything else is reported
                ## instead of aborting the whole plot.
                if CHR_ID != 'X':
                    print('Skipping unknown chromosome', CHR_ID, CHR_POS,
                          file=sys.stderr)
                continue
            hits = lo.convert_coordinate('chr{}'.format(CHR_ID), CHR_POS)
            ## None (chromosome not in chain) or [] (position unmappable).
            if not hits:
                print('IndexError', CHR_ID, CHR_POS, hits, file=sys.stderr)
                continue
            x = d_pos_init_chrom[CHR_ID] + hits[0][1]
            if l[7] == trait_most_common:
                ## Colour most frequently occurring trait red.
                colour = '#FF0000'
            else:
                ## Colour less frequently occurring traits orange,
                ## because these might be junk in the GWAS catalog.
                colour = '#FF8000'
            plt.vlines(x, 0, y, colors=colour, linewidth=0.5, linestyle='--')


def plot_manhattan(
        args, annotations, l_x, l_y, l_c, x_ticks, y_max, d_pos_init_chrom):
    """Draw the Manhattan plot and save it as '<args.out>.manhattan.png'.

    args: namespace providing min_y, EFO, threshold_p, title and out.
    annotations: dicts with at least the keys prob, pos, af, rsID,
        gene_names, x and y; those with prob <= args.threshold_p are
        printed to stdout and labelled on the plot.
    l_x, l_y, l_c: x positions, y values and colours of the scatter points.
    x_ticks: sequence of (position, label) pairs for the x axis.
    y_max: largest y value; the axis limit is max(int(y_max + 3), min_y).
    d_pos_init_chrom: chromosome ID -> cumulative x offset of that
        chromosome (used for the optional EFO overlay).
    """
    y_max = max(int(y_max + 3), args.min_y)

    if args.EFO:
        _draw_gwas_catalog_hits(args.EFO, y_max, d_pos_init_chrom)

    plt.ylabel(r'-log$_{10}$($p$)')
    plt.axhline(
        -math.log10(args.threshold_p),
        color='0.2', linewidth=0.5, linestyle='--')
    try:
        plt.ylim((0, y_max))  # todo: make argument
    except Exception as e:
        ## Best effort: keep matplotlib's automatic limits, but say so.
        print('plt.ylim failed:', e, file=sys.stderr)

    print('plt.scatter(manhattan)', file=sys.stderr)
    plt.scatter(l_x, l_y, c=l_c, s=3)
    plt.title(args.title, fontsize='small')

    for annotation in annotations:
        ## Only annotate hits that pass the significance threshold.
        if annotation['prob'] > args.threshold_p:
            continue
        print('\t'.join(
            [str(annotation[k]) for k in sorted(annotation.keys())]))
        plt.annotate(
            '\n'.join((
                'p={:.1E}'.format(annotation['prob']),
                'pos={:,}'.format(annotation['pos']),
                'MAF={:.3f}'.format(
                    min(annotation['af'], 1 - annotation['af'])),
                annotation['rsID'],
                ','.join(annotation['gene_names']),
            )),
            xy=(annotation['x'], annotation['y']),
            fontsize='xx-small',
            horizontalalignment='center',
            verticalalignment='bottom',
            rotation=30,
        )

    ## size and fontsize set the same text property; one is enough.
    plt.xticks(*zip(*x_ticks), rotation=-75, fontsize=6)

    print('plt.savefig( {}.manhattan.png )'.format(args.out), file=sys.stderr)
    plt.savefig('{}.manhattan.png'.format(args.out), dpi=600)
def LiftDown_hg18(_bim, _hg, _out):
    """Lift the base positions of a PLINK-style .bim file down to hg18.

    _bim: path to a whitespace-separated, header-less file with the six
        columns Chr, Label, GD, BP, a1, a2.
    _hg: source build number; 'hg{}'.format(_hg) is passed to LiftOver.
    _out: path of the tab-separated output file.

    Every position is lifted as a 'chr6' coordinate, whatever the Chr
    column says. Positions that cannot be lifted are written with BP = -1.

    Returns _out.
    """
    HG_input = 'hg{}'.format(_hg)

    # Raw string: '\s' in a plain literal is an invalid escape sequence.
    df_bim = pd.read_csv(_bim, sep=r'\s+', header=None, dtype=str,
                         names=['Chr', 'Label', 'GD', 'BP', 'a1', 'a2'])

    def _lift_chr6(lifter, positions):
        """Map each position through lifter on chr6; -1 marks a failure."""
        def _lift_one(bp):
            # A position that already failed an earlier step stays failed.
            if bp < 0:
                return -1
            hits = lifter.convert_coordinate('chr6', bp)
            # None (chromosome not in chain) and [] (unmappable) both fail.
            return hits[0][1] if hits else -1
        return positions.map(_lift_one)

    ### Main Liftover ###
    if HG_input == 'hg38':
        # 'hg38' -> 'hg19' -> 'hg18' is needed.
        # The Liftover tool (by UCSC Genomics Institute) doesn't provide
        # 'hg38' to 'hg18'.
        lo_hg38_to_hg19 = LiftOver(HG_input, 'hg19')
        lo_hg19_to_hg18 = LiftOver('hg19', 'hg18')

        sr_hg19 = _lift_chr6(lo_hg38_to_hg19, df_bim['BP'].astype(int))
        sr_hg18 = _lift_chr6(lo_hg19_to_hg18, sr_hg19)
    else:
        lo = LiftOver(HG_input, 'hg18')  # Liftdown to hg18
        sr_hg18 = _lift_chr6(lo, df_bim['BP'].astype(int))

    df_bim['BP'] = sr_hg18  # Setting new BPs (Liftdown)

    ### Markers that failed the Liftdown. ###
    f_failed = sr_hg18 == -1
    if f_failed.any():
        # NOTE(review): the message says the markers "will be excluded", but
        # this function still writes them (with BP = -1) to _out; any
        # exclusion presumably happens downstream -- confirm with callers.
        print(
            std_WARNING_MAIN_PROCESS_NAME
            + "Next markers of Target('{}') failed to Liftdown to hg18. These markers will be excluded."
            .format(_bim))
        print(df_bim[f_failed])

    df_bim.to_csv(_out, sep='\t', header=False, index=False)
    return _out
interval = intrxn[1].split(":")[1].split("-") if len(interval) == 2: for i in range(int(interval[0]), int(interval[1])): dist[i] += 1 print "RNA size:", len(dist) #Use the following part to liftover mouse coordinates to human liftfiles = {"mm28S": "/Users/lu/Documents/chang/rrna/liftover/mmtohs28S.liftoverchain", \ "mm45S": "/Users/lu/Documents/chang/rrna/liftover/mmtohs45S.liftoverchain", \ "Malat1": "/Users/lu/Documents/chang/psoralen/examples/MALAT1/mmtohg_Malat1.liftoverchain"} if RNAtoplot in liftfiles: newdist = [0 for i in range(0, size)] lo = LiftOver(liftfiles[RNAtoplot]) for i in range(0, size): lifted = lo.convert_coordinate(RNAtoplot, i, '+') if lifted: newdist[lifted[0][1]] += dist[i] dist = newdist figure = plt.figure(figsize=(8,2)) axes = plt.Axes(figure, [.3,.3,.6,.6]) figure.add_axes(axes) plt.bar(range(0, size), dist, color='k') axes.spines['top'].set_visible(False) axes.spines['right'].set_visible(False) axes.yaxis.set_ticks_position('left') axes.xaxis.set_ticks_position('bottom') plt.xlim(0, size) plt.xlabel(xlab)