def setup_conv(in_build):
    """Set the module-level build settings and load the matching chain file.

    Builds 19 and 37 share the ``b37`` settings and differ only in contig
    naming (``chrY``/``chrM`` for 19, ``Y``/``MT`` for 37); both load the
    GRCh37 -> GRCh38 chain into ``lo_37to38``.  Every other value is handled
    as build 38 and loads the GRCh38 -> GRCh37 chain into ``lo_38to37``.

    Only the chain for the selected direction is assigned; the other
    module-level chain variable is left untouched.
    """
    global b3x
    global str_db_file
    global contig
    global contigmt
    global pos_triplet_fn
    global lo_37to38
    global lo_38to37

    print("Loading LiftOver conversion chain file for build %d..." % in_build)

    if in_build in (19, 37):
        # hg19 names contigs with a "chr" prefix, GRCh37 does not.
        ucsc_naming = in_build == 19
        b3x = 'b37'
        str_db_file = 'str_hg19.gff3'
        contig = 'chrY' if ucsc_naming else 'Y'
        contigmt = 'chrM' if ucsc_naming else 'MT'
        pos_triplet_fn = pos_triplet_37
        lo_37to38 = LiftOver('crossmap/GRCh37_to_GRCh38.chain.gz')
        return

    b3x = 'b38'
    str_db_file = 'str_hg38.gff3'
    contig = 'chrY'
    contigmt = 'chrM'
    pos_triplet_fn = pos_triplet_38
    lo_38to37 = LiftOver('crossmap/GRCh38_to_GRCh37.chain.gz')
def try_find_build(rs, pos):
    """Print a table that helps guess which genome build ``pos`` refers to.

    For every SNP returned by ``fetch_snps(rs)`` the hg38 position is lifted
    to hg19, and from hg19 to hg18 and hg17.  One line is printed per SNP and
    a ``*`` is appended to each build whose coordinate equals the source
    position.

    ``fetch_snps`` is expected to yield ``(rsId, build, chromosome, position)``
    tuples; a comment in the original code gives
    ``('rs3737728', 'GRCh38.p2', '1', '1086035')`` as an example.

    All printed coordinates are 0-based: both the source position and the
    hg38 position are decremented by one, and the lifted positions are what
    ``convert_coordinate`` returns.  The hg38 value is converted to ``int``
    before comparing, otherwise the string never equals the integer source
    position and the hg38 marker can never appear.

    :param rs: identifiers handed to ``fetch_snps``.
    :param pos: source positions (1-based), in the same order as ``rs``.
    """
    snps_info = fetch_snps(rs)

    logging.info("Loading liftover chain files...")
    lift38_19 = LiftOver('pyliftover/hg38ToHg19.over.chain.gz')
    lift19_18 = LiftOver('pyliftover/hg19ToHg18.over.chain.gz')
    lift19_17 = LiftOver('pyliftover/hg19ToHg17.over.chain.gz')
    logging.info("Done")

    for (rsId, build, true_chr, pos_hg38), source_pos in zip(snps_info, pos):
        chrom = 'chr{}'.format(true_chr)
        try:
            source_pos -= 1
            pos_hg38 = int(pos_hg38) - 1
            # convert_coordinate gives None for an unknown chromosome and an
            # empty list for an unmapped position; both fail on [0][1].
            pos_hg19 = lift38_19.convert_coordinate(chrom, pos_hg38)[0][1]
            pos_hg18 = lift19_18.convert_coordinate(chrom, pos_hg19)[0][1]
            pos_hg17 = lift19_17.convert_coordinate(chrom, pos_hg19)[0][1]
        except (IndexError, TypeError, ValueError):
            # Best effort: a SNP that cannot be lifted is skipped, but no
            # longer silently.
            logging.warning("Skipping %s: could not lift chr%s:%s",
                            rsId, true_chr, pos_hg38)
            continue

        print(
            "build={} {} chr{} source={} hg38={}{} hg19={}{} hg18={}{} hg17={}{}"
            .format(build, rsId, true_chr, source_pos,
                    pos_hg38, '*' if pos_hg38 == source_pos else '',
                    pos_hg19, '*' if pos_hg19 == source_pos else '',
                    pos_hg18, '*' if pos_hg18 == source_pos else '',
                    pos_hg17, '*' if pos_hg17 == source_pos else ''))
def hgVersionJudge(self, nowVersion):
    """Return a lifter from ``hg<nowVersion>`` to hg19, or 0 when none is needed.

    :param nowVersion: build number of the input data; anything ``int()``
        accepts (e.g. ``19`` or ``"38"``).
    :return: ``0`` if the input is already hg19, otherwise a ``LiftOver``
        object converting ``hg<nowVersion>`` to ``hg19``.
    """
    # Guard clause: hg19 input needs no conversion.
    if int(nowVersion) == 19:
        return 0
    source_build = 'hg' + str(nowVersion)
    return LiftOver(source_build, 'hg19')
def create_lo(input_version, output_version):
    """Build a ``LiftOver`` and return it bundled with the two build names.

    :return: dict with the keys ``input_version``, ``output_version`` and
        ``lo`` (the ``LiftOver`` instance for that conversion).
    """
    return dict(
        input_version=input_version,
        output_version=output_version,
        lo=LiftOver(input_version, output_version),
    )
def get_schic_contacts(filename):
    """Load cis chrX contacts, lift them from mm10 to mm9 and crop to the ROI.

    The file is read as whitespace-separated text.  Rows are kept when columns
    0 and 2 are both ``chrX``; columns 1 and 3 are taken as the two integer
    positions of a contact.

    Both ends of every contact are lifted to mm9.  A contact is dropped when
    either end has no mm9 equivalent (a message is printed for that locus).
    The lifted contacts are then restricted to ``coords_min``/``coords_max``
    (module-level names).

    Previously the lifted coordinates were computed but discarded, so the
    unlifted mm10 contacts were filtered and returned.

    :param filename: path of the contact file.
    :return: integer array of shape ``(n, 2)`` with mm9 positions.
    :raises ValueError: if a locus lifts to more than one mm9 position.
    """
    all_contacts = np.loadtxt(filename, dtype=str)

    # filter for cis chrX contacts
    is_cis_chrx = (all_contacts[:, 0] == 'chrX') & (all_contacts[:, 2] == 'chrX')
    contacts = all_contacts[is_cis_chrx][:, (1, 3)].astype(int)

    # lift over all contacts from mm10 to mm9
    lo = LiftOver('mm10', 'mm9')

    def do_lift(loc):
        lifted_loc = lo.convert_coordinate('chrX', int(loc))
        # None (unknown chromosome) and [] (unmapped locus) both mean "no hit".
        if not lifted_loc:
            print("Locus {} not in mm9 assembly".format(loc))
            return None
        if len(lifted_loc) > 1:
            raise ValueError("Non-unique liftover result")
        return lifted_loc[0][1]

    lifted_pairs = []
    for loc_a, loc_b in contacts:
        lifted_a = do_lift(loc_a)
        lifted_b = do_lift(loc_b)
        if lifted_a is not None and lifted_b is not None:
            lifted_pairs.append((lifted_a, lifted_b))
    # reshape keeps the (n, 2) layout even when nothing could be lifted
    lifted_contacts = np.array(lifted_pairs, dtype=int).reshape(-1, 2)

    # keep only contacts in genomic region of interest
    in_region = ((lifted_contacts[:, 0] >= coords_min)
                 & (lifted_contacts[:, 1] <= coords_max))
    return lifted_contacts[in_region]
def setup(self):
    """Download CIViC variants and index them by lifted-over coordinates.

    Fetches one page of up to 5000 variants from the CIViC API, lifts each
    variant's chromosome/start with the chain registered under ``'hg19'`` in
    ``constants.liftover_chain_paths`` and stores the variant record in
    ``self.civicdata`` under the key ``"<chrom>:<pos>:<ref>:<alt>"`` (the
    ``chr`` prefix is removed from the lifted chromosome).

    Variants are skipped when the chromosome or start is missing, when the
    position cannot be lifted, or when ref/alt bases are missing.
    """
    r = requests.get('https://civicdb.org/api/variants?count=5000&page=1')
    variants = json.loads(r.text)['records']
    lifter = LiftOver(constants.liftover_chain_paths['hg19'])
    vdict = {}
    for variant in variants:
        coords = variant['coordinates']
        chrom_37 = coords['chromosome']
        pos_37 = coords['start']
        if chrom_37 is None or pos_37 is None:
            continue
        new_coords = lifter.convert_coordinate("chr" + chrom_37, int(pos_37))
        # convert_coordinate returns None for a chromosome that is absent from
        # the chain file and [] for an unmapped position; skip both.
        if not new_coords:
            continue
        chrom_38 = new_coords[0][0].replace('chr', '')
        pos_38 = new_coords[0][1]
        toks = [chrom_38, pos_38, coords['reference_bases'],
                coords['variant_bases']]
        if None in toks:
            continue
        vdict[':'.join(map(str, toks))] = variant
    self.civicdata = vdict
def __init__(self, regionsFileName, hg):
    """Load the regions JSON file and prepare a lifter towards hg38.

    :param regionsFileName: path of a JSON file; its parsed content is stored
        in ``self.regionsDict``.
    :param hg: build name of the input data.  ``self.lo`` stays ``None`` for
        ``'hg38'``; for any other value it becomes ``LiftOver(hg, 'hg38')``.
    """
    # The context manager closes the file; no explicit close() is needed.
    with open(regionsFileName, 'r') as handle:
        self.regionsDict = json.load(handle)

    needs_liftover = hg != 'hg38'
    self.lo = None
    if needs_liftover:
        self.lo = LiftOver(hg, 'hg38')
async def live_annotate(input_data, annotators):
    """Map one variant and run the live annotator modules on it.

    If ``input_data['assembly']`` (default ``'hg38'``) has an entry in
    ``cravat.constants.liftover_chain_paths`` the variant is lifted over first
    and ``chrom``/``pos``/``ref``/``alt`` in ``input_data`` are overwritten.
    The variant is then mapped with ``live_mapper`` and each module listed in
    ``modules_to_run_ordered`` is run, restricted to ``annotators`` when that
    is not ``None``.

    A module that raises is reported with a printed traceback and gets
    ``None`` as its result; the remaining modules still run.

    :return: dict of ``module name -> annotation`` plus the mapped variant
        under ``'crx'``.
    """
    from cravat.constants import mapping_parser_name
    from cravat.constants import all_mappings_col_name
    from cravat.inout import AllMappingsParser
    global live_modules
    global live_mapper
    global module_confs
    global modules_to_run_ordered

    response = {}

    assembly = input_data.get('assembly', 'hg38')
    chain_paths = cravat.constants.liftover_chain_paths
    if assembly in chain_paths:
        lifter = LiftOver(chain_paths[assembly])
        chrom, pos, ref, alt = liftover(input_data, lifter)
        input_data.update(chrom=chrom, pos=pos, ref=ref, alt=alt)

    crx_data = live_mapper.live_report_substitute(live_mapper.map(input_data))
    crx_data[mapping_parser_name] = AllMappingsParser(
        crx_data[all_mappings_col_name])

    for module_name in modules_to_run_ordered:
        module = live_modules[module_name]
        if annotators is not None and module_name not in annotators:
            continue
        try:
            conf = module_confs[module_name]
            # Columns flagged as tables hold JSON text that is decoded below.
            json_colnames = [
                col['name'] for col in conf['output_columns']
                if 'table' in col and col['table'] == True
            ]

            annotate_kwargs = {'input_data': crx_data}
            if 'secondary_inputs' in conf:
                # Secondary inputs are the results of modules run earlier.
                annotate_kwargs['secondary_data'] = {
                    sec_mod: [response[sec_mod]]
                    for sec_mod in conf['secondary_inputs']
                }
            annot_data = module.annotate(**annotate_kwargs)
            annot_data = module.live_report_substitute(annot_data)

            if annot_data == '' or annot_data == {}:
                annot_data = None
            elif type(annot_data) is dict:
                annot_data = clean_annot_dict(annot_data)

            if annot_data is not None:
                for colname in json_colnames:
                    json_data = annot_data.get(colname, None)
                    if json_data is not None and type(json_data) == str:
                        json_data = json.loads(json_data)
                    annot_data[colname] = json_data

            response[module_name] = annot_data
        except Exception:
            import traceback
            traceback.print_exc()
            response[module_name] = None

    # The parser object is only needed while the modules run.
    del crx_data[mapping_parser_name]
    set_crx_canonical(crx_data)
    response['crx'] = crx_data
    return response
def liftover_to_19(loc, build):
    """Lift a ``"chrom:pos"`` locus using the chain registered for ``build``.

    The chain file is ``chains.get(build)`` inside ``chainpath`` (both are
    module-level names); judging by the function name it presumably targets
    hg19.

    The position is converted to ``int`` before the lookup: pyliftover
    compares it with integer interval bounds, so a string position cannot
    match.

    :param loc: locus string; only the first two ``:``-separated fields are
        used.
    :param build: key into ``chains``.
    :return: lifted position of the first hit as ``int``, or ``NaN`` when the
        locus cannot be lifted.
    :raises ValueError: if the position field is not an integer.
    """
    parts = loc.split(':')
    chrom, pos = parts[0], int(parts[1])
    lo = LiftOver(os.path.join(chainpath, chains.get(build)))
    con_pos = lo.convert_coordinate(chrom, pos)
    if con_pos:
        return int(con_pos[0][1])
    return NaN
def __init__(self, args):
    """Keep ``args``, create the hg19 -> hg38 lifter and start empty tallies.

    ``lengths_orig``, ``lengths_filtered`` and ``oldVsNew`` are initialised as
    three separate empty lists.
    """
    self.args = args
    self.doLiftOver = LiftOver('hg19', 'hg38')
    self.lengths_orig, self.lengths_filtered, self.oldVsNew = [], [], []
def lift_pos(posvec, chrvec, chainFile):
    """Lift 1-based SNP positions with the given chain file.

    Positions are shifted to 0-based for the lookup and shifted back in the
    returned array.  Chromosomes are formatted with ``'chr%d'``, so
    ``chrvec`` must hold integers.

    Each SNP gets one status flag:

    * ``miss``      - no hit; position and chromosome are kept as given
    * ``multi``     - several hits; the first one is used
    * ``unchanged`` - one hit with the same position
    * ``lifted``    - one hit with a different position

    :return: ``(positions, flags, chromosomes)``: an int32 array of 1-based
        positions, an ``S10`` array of flags and an int32 array of
        chromosomes (``chr`` removed from the lifted name).
    """
    logging.info("Lifting genomic positions...")
    n_snps = len(posvec)
    zero_based = posvec - 1

    pos_lifted = np.empty((n_snps, ), dtype='int32')
    chr_lifted = np.empty((n_snps, ), dtype='int32')
    pos_indi = np.empty((n_snps, ), dtype='|S10')

    lifter = LiftOver(chainFile)
    for i in range(n_snps):
        done = i + 1
        if done % 200000 == 0:
            logging.info("{} SNPs done".format(done))

        pos = zero_based[i]
        hits = lifter.convert_coordinate('chr%d' % (chrvec[i], ), pos)
        if not hits:
            pos_lifted[i] = pos
            pos_indi[i] = 'miss'
            chr_lifted[i] = chrvec[i]
            continue

        # Several hits or one: the first hit supplies the coordinates.
        pos_lifted[i] = hits[0][1]
        chr_lifted[i] = re.sub('chr', '', hits[0][0])
        if len(hits) > 1:
            pos_indi[i] = 'multi'
        elif pos == hits[0][1]:
            pos_indi[i] = 'unchanged'
        else:
            pos_indi[i] = 'lifted'

    return pos_lifted + 1, pos_indi, chr_lifted
def setup(self):
    """Download all CIViC variants page by page and index them.

    Pages of 500 variants are requested until the API reports no ``next``
    link.  Each variant's chromosome/start is lifted with the chain registered
    under ``'hg19'`` in ``constants.liftover_chain_paths`` and the record is
    stored in ``self.civicdata`` under ``"<chrom>:<pos>:<ref>:<alt>"`` (the
    ``chr`` prefix is removed from the lifted chromosome).

    A connection error or timeout stops the download; the variants loaded so
    far are kept and the incomplete load is printed and logged.  Variants with
    missing coordinates, positions that cannot be lifted, or missing ref/alt
    bases are skipped.
    """
    self.civicdata = {}
    lifter = LiftOver(constants.liftover_chain_paths['hg19'])
    page_url = 'https://civicdb.org/api/variants?count=500&page=1'
    while page_url is not None:
        try:
            r = requests.get(page_url, timeout=5)
        except (requests.exceptions.ConnectionError,
                requests.exceptions.Timeout):
            # timeout=5 can also raise a read timeout, which is not a
            # ConnectionError; treat it as the same partial-load condition.
            msg = 'ERROR: Incomplete CIVIC data load'
            print(msg)
            self.logger.error(msg)
            break
        d = json.loads(r.text)
        records = d['records']
        page_url = d['_meta']['links']['next']
        for variant in records:
            coords = variant['coordinates']
            chrom_37 = coords['chromosome']
            pos_37 = coords['start']
            if chrom_37 is None or pos_37 is None:
                continue
            new_coords = lifter.convert_coordinate("chr" + chrom_37,
                                                   int(pos_37))
            # None: chromosome unknown to the chain file; []: unmapped locus.
            if not new_coords:
                continue
            chrom_38 = new_coords[0][0].replace('chr', '')
            pos_38 = new_coords[0][1]
            toks = [chrom_38, pos_38, coords['reference_bases'],
                    coords['variant_bases']]
            if None in toks:
                continue
            self.civicdata[':'.join(map(str, toks))] = variant
def ancestral_fasta(args):
    """subroutine for ancestor subcommand

    Copies the reference FASTA to ``args.output`` and edits the copy in place:

    1. With ``args.bed`` set (``'-'`` reads standard input), everything
       outside the BED intervals is replaced by ``N``.
    2. Every VCF record that is not a biallelic SNP is replaced by ``N``.
    3. Every biallelic SNP is lifted to the outgroup genome.  Sites without a
       unique alignment become ``N``.  Otherwise the outgroup base (reverse
       complemented on the minus strand) is written when it equals the ALT
       allele, ``N`` is written when it matches neither allele, and the
       reference base is kept when it equals REF.

    :raises ValueError: if a variant's REF allele disagrees with the reference
        sequence, or if the BED mask contains no intervals.
    """
    # single chromosome fasta file for reference genome
    ref = pyfaidx.Fasta(args.reference, read_ahead=10000)
    # make a copy to build our ancestor for this chromosome
    copyfile(args.reference, args.output)
    anc = pyfaidx.Fasta(args.output, read_ahead=10000, mutable=True)
    # reference genome for outgroup species (all chromosomes)
    out = pyfaidx.Fasta(args.outgroup, read_ahead=10000)
    # outgroup to reference alignment chain file
    lo = LiftOver(args.chain)
    # snps database for the same chromosome
    vcf = cyvcf2.VCF(args.vcf)

    # change regions outside of callability mask to all N bases
    if args.bed:
        bed = sys.stdin if args.bed == '-' else open(args.bed, 'r')
        chrom = None
        last_end = 0
        try:
            for line in bed:
                chrom, start, end = line.rstrip().split('\t')[:3]
                start = int(start)
                anc[chrom][last_end:start] = 'N' * (start - last_end)
                last_end = int(end)
        finally:
            # Close the mask file we opened; stdin is left to the caller.
            if bed is not sys.stdin:
                bed.close()
        if chrom is None:
            # Without any interval the chromosome to mask is unknown.
            raise ValueError('callability mask contains no intervals')
        # mask the tail after the last callable interval
        anc[chrom][last_end:len(anc[chrom])] = 'N' * (len(anc[chrom])
                                                      - last_end)

    for variant in vcf:
        # change variants that are not biallelic SNPs to N bases
        if not (variant.is_snp and len(variant.ALT) == 1):
            anc[variant.CHROM][variant.start:variant.end] = 'N' * (
                variant.end - variant.start)
            continue

        out_coords = lo.convert_coordinate(variant.CHROM, variant.start)
        # change ambiguously aligning sites to N bases
        if out_coords is None or len(out_coords) != 1:
            anc[variant.CHROM][variant.start] = 'N'
            continue

        if variant.REF != ref[variant.CHROM][variant.start].seq.upper():
            raise ValueError(f'variant reference allele {variant.REF} '
                             f'mismatches reference sequence '
                             f'{ref[variant.CHROM][variant.start]}')
        out_chromosome, out_position, out_strand = out_coords[0][:3]
        out_allele = out[out_chromosome][out_position].seq
        # if negative strand, take reverse complement base
        if out_strand == '-':
            out_allele = reverse_complement(out_allele)
        # and finally, polarize
        if out_allele.upper() == variant.ALT[0]:
            anc[variant.CHROM][variant.start] = out_allele
        elif out_allele.upper() != variant.REF:
            # triallelic
            anc[variant.CHROM][variant.start] = 'N'
def main():
    """Lift intron features in a TSV file with the chain file given on the CLI.

    Usage: ``<script> cancer_introns.b38.annot_ready.tsv <chain.gz>``; the
    converted table is written to standard output.

    The first column of every data row is an intron ``chr:lend-rend`` with
    1-based coordinates.  Both ends are lifted (0-based for the lookup, then
    back to 1-based) and swapped if the order is reversed afterwards.  A row
    is dropped, with a message on standard error, when an end cannot be lifted
    or when the ends land on different chromosomes or on a chromosome other
    than the original one.

    The chain file named on the command line is now used; previously the
    argument was read but a hard-coded ``hg38ToHg19.over.chain.gz`` in the
    working directory was loaded instead.
    """
    usage = "\n\n\tusage: {} cancer_introns.b38.annot_ready.tsv hg38ToHg19.over.chain.gz > cancer_introns.b37.annot_ready.tsv\n\n".format(
        sys.argv[0])
    if len(sys.argv) < 3:
        print(usage, file=sys.stderr)
        sys.exit(1)

    cancer_introns_file = sys.argv[1]
    hg_chain_file = sys.argv[2]

    lo = LiftOver(hg_chain_file)

    with open(cancer_introns_file, 'rt') as fh:
        header = next(fh).rstrip()
        print(header)
        for line in fh:
            line = line.rstrip()
            vals = line.split("\t")
            chrom, coordset = vals[0].split(":")
            lend, rend = (int(coord) for coord in coordset.split("-"))

            # pyliftover works on 0-based positions
            new_lend = lo.convert_coordinate(chrom, lend - 1)
            new_rend = lo.convert_coordinate(chrom, rend - 1)
            if not (new_lend and new_rend):
                # Report rows that disappear because an end has no mapping.
                sys.stderr.write(
                    "-failed conversion of {} --> no liftover\n".format(line))
                continue

            new_lend_chr = new_lend[0][0]
            new_lend_coord = new_lend[0][1] + 1
            new_rend_chr = new_rend[0][0]
            new_rend_coord = new_rend[0][1] + 1

            if new_lend_chr != new_rend_chr or new_lend_chr != chrom:
                sys.stderr.write("-failed conversion of {}".format(line) +
                                 " --> {} {}, {} {}\n".format(
                                     new_lend_chr, new_lend_coord,
                                     new_rend_chr, new_rend_coord))
                continue

            if new_lend_coord > new_rend_coord:
                new_lend_coord, new_rend_coord = new_rend_coord, new_lend_coord

            vals[0] = "{}:{}-{}".format(chrom, new_lend_coord, new_rend_coord)
            print("\t".join(vals))

    sys.exit(0)
def _strip_chr_prefix(chrom):
    """Return ``chrom`` without a leading ``chr``; other names pass through."""
    return chrom[3:] if chrom.startswith('chr') else chrom


def _info_end_position(info):
    """Return the integer value of the ``END`` key of a VCF INFO string.

    Keys are matched exactly, so ``CIEND=...`` is not mistaken for ``END``.
    Returns ``None`` when the key is absent or its value is not an integer.
    """
    for field in info.split(';'):
        key, sep, value = field.partition('=')
        if sep and key == 'END':
            try:
                return int(value)
            except ValueError:
                return None
    return None


def main(args):
    """Annotate a VCF with hg19 liftover coordinates and compress/index it.

    Adds the INFO tags ``hg19_chr``, ``hg19_pos`` and, for records that carry
    an ``END`` key, ``hg19_end``.  Positions are lifted 0-based and written
    back 1-based.  Every variant is written, whether or not it could be
    lifted.  The output is then compressed with ``bgzip`` and indexed with
    ``tabix``.

    :param args: mapping with the keys ``inputfile``, ``chainfile`` and
        ``outputfile``.
    :raises subprocess.CalledProcessError: if ``bgzip`` or ``tabix`` fails.
    """
    # open input vcf
    vcf = vcf_parser.Vcf(args['inputfile'])

    # add 3 new tag definitions - for hg19 liftover: chr, pos, and end
    hg19CHROM_definition = '##INFO=<ID=hg19_chr,Number=1,Type=String,Description="CHROM in hg19 using LiftOver from pyliftover">'
    hg19POS_definition = '##INFO=<ID=hg19_pos,Number=1,Type=Integer,Description="POS in hg19 using LiftOver from pyliftover (converted back to 1-based)">'
    hg19END_definition = '##INFO=<ID=hg19_end,Number=1,Type=Integer,Description="END in hg19 using LiftOver from pyliftover (converted back to 1-based)">'
    vcf.header.add_tag_definition(hg19END_definition)
    vcf.header.add_tag_definition(hg19POS_definition)
    vcf.header.add_tag_definition(hg19CHROM_definition)

    # get chain file for liftover
    lo = LiftOver(args['chainfile'])

    # write header and then loop variants, adding liftover coordinates to
    # INFO fields when appropriate. write all variants.
    with open(args['outputfile'], 'w') as fo:
        vcf.write_header(fo)
        for vnt_obj in vcf.parse_variants():
            # generate hg19 LO coordinates based on CHROM and POS;
            # None (unknown chromosome) and [] (unmapped) both mean no hit
            hits = lo.convert_coordinate(vnt_obj.CHROM, vnt_obj.POS - 1)
            if hits:
                vnt_obj.add_tag_info(
                    'hg19_chr=' + _strip_chr_prefix(hits[0][0]))
                vnt_obj.add_tag_info('hg19_pos=' + str(hits[0][1] + 1))

            # also want to incorporate END position for SV and CNV
            end = _info_end_position(vnt_obj.INFO)
            if end is not None:
                hits_end = lo.convert_coordinate(vnt_obj.CHROM, end - 1)
                if hits_end:
                    try:
                        # if hg19_chr is already defined, don't add it
                        vnt_obj.get_tag_value("hg19_chr")
                    except Exception:
                        # hg19_chr is not defined yet: take it from END
                        vnt_obj.add_tag_info(
                            'hg19_chr=' + _strip_chr_prefix(hits_end[0][0]))
                    vnt_obj.add_tag_info(
                        'hg19_end=' + str(hits_end[0][1] + 1))

            vcf.write_variant(fo, vnt_obj)

    # check=True: do not index a file that failed to compress
    subprocess.run(["bgzip", args['outputfile']], check=True)
    subprocess.run(["tabix", args['outputfile'] + ".gz"], check=True)
def main():
    """Lift gold-standard records to all assemblies and write them as JSON.

    For every record loaded from ``args.input_pattern`` the sentinel variant
    is completed with ``fill_in_assemblies`` and the best confidence among the
    record's evidence entries (``High`` > ``Medium`` > ``Low``) is stored in
    ``gold_standard_info['highest_confidence']``.

    The output directory is created when the output path has one; a bare file
    name is written to the current directory (``os.makedirs('')`` used to
    fail for it).  The file is written as UTF-8 because ``ensure_ascii=False``
    emits non-ASCII characters unescaped.

    :return: ``0`` on success.
    """
    # Parse args
    args = parse_args()
    confidence_orders = ['High', 'Medium', 'Low']  # Used to sort "highest" confidence

    # Load gold-standards
    gold_standards = load_gold_standards(args.input_pattern)

    # Create liftOver instances from chain files
    if args.grch37_to_38:
        args.grch37_to_38 = LiftOver(args.grch37_to_38)
    if args.grch38_to_37:
        args.grch38_to_37 = LiftOver(args.grch38_to_37)

    # Iterate over and process records
    out_data = []
    for record in gold_standards:
        # Lift-over positions to all assemblies
        record['sentinel_variant'] = fill_in_assemblies(
            record['sentinel_variant'],
            args.grch37_to_38,
            args.grch38_to_37
        )

        # Extract highest confidence
        confidences = [entry['confidence']
                       for entry in record['gold_standard_info']['evidence']]
        record['gold_standard_info']['highest_confidence'] = sorted(
            confidences, key=confidence_orders.index)[0]

        out_data.append(record)

    # Write output
    out_dir = os.path.dirname(args.output)
    if out_dir:
        os.makedirs(out_dir, exist_ok=True)
    with open(args.output, 'w', encoding='utf-8') as out_h:
        json.dump(out_data, out_h, ensure_ascii=False, indent=2)

    return 0
def PCGP_mut_df_genome_build_check(df,pos_col=4):
    """Make sure the mutation table has a ``Position_hg19`` column.

    * No column name contains ``hg18`` or ``hg38`` (case-insensitive): the
      column at index ``pos_col`` is renamed to ``Position_hg19``.
    * Exactly one such column: its positions are lifted to hg19 together with
      the ``Chr`` column and stored in a new ``Position_hg19`` column;
      positions that cannot be lifted are stored as ``0``.
    * More than one such column: an error is printed and the program exits
      with status 1.  (This branch used to fail on an undefined variable
      before it reached its exit call.)

    :param df: mutation table; modified in place and returned.
    :param pos_col: index of the position column to rename in the first case.
    :return: ``df``.
    """
    hg18_cols = [col for col in df.columns if 'hg18' in col.lower()]
    hg38_cols = [col for col in df.columns if 'hg38' in col.lower()]

    if not hg18_cols and not hg38_cols:
        # Already hg19: only the column label changes.  A list copy is used
        # because the array behind df.columns must not be edited in place.
        cols = list(df.columns)
        cols[pos_col] = 'Position_hg19'
        df.columns = cols
        return df

    if len(hg18_cols) + len(hg38_cols) != 1:
        print("[Error] only one column allowed for conversion: %s ... quit"
              % (hg18_cols + hg38_cols))
        raise SystemExit(1)

    source_build = 'hg18' if hg18_cols else 'hg38'
    fd = (hg18_cols or hg38_cols)[0]
    print("[Warning] following columns from %s genome build: %s"
          % (source_build, fd))
    lo = LiftOver(source_build, 'hg19')
    print(fd)

    pos = []
    for chrom, source_pos in zip(df['Chr'], df[fd]):
        # One lookup per row; None and [] both mean "not liftable".
        conversion = lo.convert_coordinate(chrom, source_pos)
        pos.append(conversion[0][1] if conversion else 0)
    df['Position_hg19'] = pos
    return df
def liftover_cho(df):
    """Lift the ``Genomic position`` column of ``df`` from hg18 to hg38.

    Positions are treated as 1-based: one is subtracted for the lookup and
    added back to the result.  The chromosome name is ``'chr'`` followed by
    the ``Chromosome`` value.  Rows that cannot be lifted get the string
    ``'NA'`` and a message is printed.

    :param df: table with ``Chromosome`` and ``Genomic position`` columns;
        modified in place and returned.
    """
    lo = LiftOver('hg18', 'hg38')

    def lift_coord(row):
        chrom = 'chr' + str(row['Chromosome'])
        pos = row['Genomic position'] - 1
        result = lo.convert_coordinate(chrom, pos)
        # None: chromosome unknown to the chain; []: position not mapped.
        if not result:
            print(f"Didn't find hg38 coordinate for {row['Chromosome']}:{row['Genomic position']}")
            return 'NA'
        return result[0][1] + 1

    df['Genomic position'] = df.apply(lift_coord, axis=1)
    return df
def get_liftover(frm=19, to=38):
    """Load the local ``hg<frm>ToHg<to>`` chain file as a ``LiftOver``.

    The file is looked up under ``processedDataStorage``.
    Info: http://hgdownload.cse.ucsc.edu/downloads.html

    :raises FileNotFoundError: if the chain file is missing; the message
        names the UCSC URL it can be downloaded from.
    """
    from pyliftover import LiftOver

    liftoverfile = 'hg{}ToHg{}.over.chain.gz'.format(frm, to)
    chain_path = processedDataStorage + liftoverfile
    try:
        lifter = LiftOver(chain_path)
    except FileNotFoundError:
        download_hint = (
            'Source: http://hgdownload.cse.ucsc.edu/gbdb/hg{}/liftOver/{}'.
            format(frm, liftoverfile))
        raise FileNotFoundError(download_hint)
    return lifter
def liftover(self):
    """Replace this object's hg38 chromosome/position with ``self.build`` ones.

    The first liftover hit is used.  Nothing is modified when the locus
    cannot be lifted.

    :raises IndexError: if the locus has no equivalent in ``self.build``
        (unknown chromosome or unmapped position).
    """
    # todo
    # Changing the chromosome and position messes up the key as well. Could
    # probably fix that. But i don't have the ref and alt alleles on hand and
    # I don't want to parse them out of chromosomeHgvsName.
    from pyliftover import LiftOver
    lo = LiftOver('hg38', self.build)
    lifted = lo.convert_coordinate(self.chromosome, self.position)
    # convert_coordinate returns None for an unknown chromosome and [] for an
    # unmapped position; report both with a readable message.
    if not lifted:
        raise IndexError("no %s liftover for hg38 locus %s:%s"
                         % (self.build, self.chromosome, self.position))
    self.chromosome, self.position = lifted[0][0], lifted[0][1]
def from_hg18_to_hg19(chr, coord):
    """
    Convert a single hg18 coordinate to hg19.
    -----------
    REMEMBER that LIFT-OVER coordinates are 0-based!!!
    -----------
    One is added to ``coord`` before the lookup and the position of the first
    hit is returned as it comes back from LiftOver.
    NOTE(review): this shifts the input rather than the result - confirm this
    is the intended 0-based/1-based handling.

    :param chr: chromosome name, e.g. 'chr6'
    :param coord: integer, e.g. 10000
    :return: coord in hg19 coordinates system
    """
    shifted_coord = int(coord) + 1
    hits = LiftOver('hg18', 'hg19').convert_coordinate(chr, shifted_coord)
    return hits[0][1]
def _parse_cmd_args(self, args):
    """ Parse the converter's command line and store the settings on self.

    Sets ``input_paths``, ``input_dir``, ``output_dir`` (created if missing),
    ``output_base_fname``, ``input_assembly``, ``do_liftover``, ``lifter`` and
    ``status_fpath``.  ``input_format`` is only set when ``-f`` is given.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('path',
                        help='Path to this converter\'s python module')
    parser.add_argument('inputs',
                        nargs='+',
                        help='Files to be converted to .crv')
    parser.add_argument('-f',
                        dest='format',
                        help='Specify an input format')
    parser.add_argument('-n', '--name',
                        dest='name',
                        help='Name of job. Default is input file name.')
    parser.add_argument('-d', '--output-dir',
                        dest='output_dir',
                        help='Output directory. '
                             'Default is input file directory.')
    assembly_choices = ['hg38'] + list(constants.liftover_chain_paths.keys())
    parser.add_argument('-l', '--liftover',
                        dest='liftover',
                        choices=assembly_choices,
                        default='hg38',
                        help='Input gene assembly. Will be lifted over to hg38')
    parsed_args = parser.parse_args(args)

    self.input_paths = [os.path.abspath(x) for x in parsed_args.inputs]
    if parsed_args.format:
        self.input_format = parsed_args.format

    # Outputs go next to the first input unless a directory was given.
    first_input = self.input_paths[0]
    self.input_dir = os.path.dirname(first_input)
    self.output_dir = parsed_args.output_dir or self.input_dir
    if not os.path.exists(self.output_dir):
        os.makedirs(self.output_dir)
    self.output_base_fname = parsed_args.name or os.path.basename(first_input)

    # hg38 input needs no liftover.
    self.input_assembly = parsed_args.liftover
    self.do_liftover = self.input_assembly != 'hg38'
    self.lifter = (
        LiftOver(constants.liftover_chain_paths[self.input_assembly])
        if self.do_liftover else None)

    self.status_fpath = os.path.join(
        self.output_dir, self.output_base_fname + '.status.json')
def liftover_loci_in_df(df, chrom_column = 'chromosome', pos_column = 'position', source_ref_genome = 'hg38', \
        target_ref_genome = 'hg19'):
    """Return a copy of ``df`` with its loci lifted to another reference.

    Every ``(chromosome, position)`` pair is passed to ``liftover_locus`` and
    the two columns are replaced by the results; all other columns and the
    column order are kept.

    The two columns are iterated directly instead of row by row, so values
    keep their own dtype.  An empty frame is returned as a copy instead of
    failing while unpacking the (empty) results.

    :param df: input table; not modified.
    :param chrom_column: name of the chromosome column.
    :param pos_column: name of the position column.
    :param source_ref_genome: assembly of the input coordinates.
    :param target_ref_genome: assembly to lift to.
    :return: new DataFrame with the same columns and index.
    """
    from pyliftover import LiftOver

    liftover = LiftOver(source_ref_genome, target_ref_genome)
    new_loci = [liftover_locus(liftover, chrom, pos)
                for chrom, pos in zip(df[chrom_column], df[pos_column])]
    if not new_loci:
        return df.copy()

    new_chroms, new_positions = (pd.Series(list(values), index=df.index)
                                 for values in zip(*new_loci))
    # The chromosome column takes precedence if both names coincide.
    replacements = {pos_column: new_positions.rename(pos_column),
                    chrom_column: new_chroms.rename(chrom_column)}
    return pd.concat(
        [replacements[column] if column in replacements else df[column]
         for column in df.columns], axis=1)
def convertPos(cls, chrom, pos):
    """Convert a 1-based hg38 position to its 1-based hg19 position.

    The ``LiftOver`` handler is created on first use and kept in
    ``cls.sHandler``; normalised chromosome names are cached in
    ``cls.sChromMap``.

    :param chrom: chromosome name as used by the caller.
    :param pos: 1-based hg38 position.
    :return: 1-based hg19 position of the first hit, or ``None`` when the
        lookup raises, the chromosome is unknown to the chain, or the
        position is not mapped.
    """
    if cls.sHandler is None:
        print("Initializing hg38 -> hg19 liftover conversion", file=sys.stderr)
        cls.sHandler = LiftOver("hg38", "hg19")
    if chrom not in cls.sChromMap:
        cls.sChromMap[chrom] = normalizeChromName(chrom)
    try:
        coord = cls.sHandler.convert_coordinate(cls.sChromMap[chrom], pos - 1)
    except Exception:
        # Best effort by design: a failing lookup means "no conversion".
        return None
    # None (unknown chromosome) must not reach len()/indexing.
    if not coord:
        return None
    return coord[0][1] + 1
def __init__(self, chainfile):
    """
    Open the chain file once and keep the resulting LiftOver on the instance.

    The object is meant for unique single-position liftovers: only positions
    that map uniquely to the new genome, with unchanged strand, are to be
    lifted.  It is an object rather than a set of functions so that the chain
    file is opened once instead of once per position.

    Note: run a VCF normalisation sweep over all lifted CPRAs afterwards, to
    re-normalise variants that need it and to remove variants whose REF no
    longer matches.  Together these steps give high quality liftovers, but
    they do not catch several old positions piling up on one new position;
    that has to be checked separately.

    :param chainfile: A string containing the path to the local UCSC .gzipped
        chainfile
    :return:
    """
    self.liftover = LiftOver(chainfile)
def liftover(chr, pos, chainfile):
    """Lift parallel chromosome/position sequences with a chain file.

    :param chr: chromosomes, as numbers or ``chrN`` names.  Only the first
        element decides whether ``'chr'`` is prepended to all of them.
    :param pos: 1-based positions; must support ``pos - 1`` element-wise.
    :param chainfile: path of the chain file.
    :return: DataFrame with the columns ``liftover_chr`` and ``liftover_pos``
        (1-based), one row per input locus as produced by ``_tidy_liftover``.
    """
    lo = LiftOver(chainfile)

    # formatting chromosome
    already_prefixed = isinstance(chr[0], str) and 'chr' in chr[0]
    if not already_prefixed:
        chr = ['chr' + str(name) for name in chr]

    zero_based = pos - 1  # pyliftover uses base-0
    rows = [_tidy_liftover(name, position, lo)
            for name, position in zip(chr, zero_based)]
    out = pd.DataFrame(rows, columns=['liftover_chr', 'liftover_pos'])
    out.iloc[:, 1] = out.iloc[:, 1] + 1  # convert back to base-1
    return out
def liftover(self, chromosome, position, build='hg19'):
    """Lift an hg38 locus to ``build`` and return the first hit.

    :param chromosome: hg38 chromosome name as expected by the chain file.
    :param position: hg38 position.
    :param build: target assembly name.
    :return: ``(new_chromosome, new_position)``.
    :raises IndexError: if the locus has no equivalent in ``build`` (unknown
        chromosome or unmapped position).
    """
    # todo
    # Changing the chromosome and position messes up the key as well. Could
    # probably fix that. But i don't have the ref and alt alleles on hand and
    # I don't want to parse them out of chromosomeHgvsName.
    lo = LiftOver('hg38', build)
    lifted = lo.convert_coordinate(chromosome, position)
    # convert_coordinate returns None for an unknown chromosome and [] for an
    # unmapped position; report both with a readable message.
    if not lifted:
        raise IndexError("no %s liftover for hg38 locus %s:%s"
                         % (build, chromosome, position))
    new_chromosome, new_position = lifted[0][0], lifted[0][1]
    if self.debug:
        print("%s %s -> %s %s" % (chromosome, position, new_chromosome, new_position))
    return new_chromosome, new_position
def liftover(pos, chro, from_assembly, to_assembly):
    """
    Lift one coordinate between assemblies with pyLiftover.

    When both assemblies are equal, ``pos`` is returned unchanged.  Otherwise
    the chromosome is prefixed with ``chr``, ``pos`` is converted to ``int``
    and the position of the first hit is returned.

    NOTE: pyLiftover uses base 0, whereas coordinate system uses base 1
    therefore position 27107251 is actually 27107250 in pyLiftover.
    The position is passed to pyLiftover as given, without adjustment.
    """
    if from_assembly == to_assembly:
        return pos

    chrom_name = 'chr' + str(chro)
    position = int(pos)
    hits = LiftOver(from_assembly, to_assembly).convert_coordinate(
        chrom_name, position)
    return hits[0][1]
def open_file_and_process(file, from_build, to_build):
    """Copy a tab-separated file, remapping base-pair positions between builds.

    The result is written to ``liftover_<name>.tsv`` where ``<name>`` comes
    from ``get_filename(file)``.  When the builds differ, the ``BP_DSET``
    column of every row is replaced by the position mapped via LiftOver,
    falling back to the Ensembl mapping when LiftOver yields ``None``.
    Chromosome codes ``23``/``24`` are translated to ``X``/``Y`` for the
    lookup only.  A running row count is printed every 1000 rows.

    The output file is closed when the function returns; it used to be left
    open, which can lose buffered rows.
    """
    filename = get_filename(file)
    new_filename = 'liftover_' + filename + '.tsv'

    needs_mapping = from_build != to_build
    build_map = None
    if needs_mapping:
        build_map = LiftOver(ucsc_release.get(from_build),
                             ucsc_release.get(to_build))

    with open(file) as csv_file, open(new_filename, "w") as result_file:
        csv_reader = csv.DictReader(csv_file, delimiter='\t')
        writer = csv.DictWriter(result_file,
                                fieldnames=csv_reader.fieldnames,
                                delimiter='\t')
        writer.writeheader()

        for count, row in enumerate(csv_reader, start=1):
            chromosome = row[CHR_DSET].replace('23', 'X').replace('24', 'Y')
            bp = row[BP_DSET]

            # do the bp location mapping if needed
            if needs_mapping:
                mapped_bp = map_bp_to_build_via_liftover(
                    chromosome=chromosome, bp=bp, build_map=build_map)
                if mapped_bp is None:
                    mapped_bp = map_bp_to_build_via_ensembl(
                        chromosome=chromosome, bp=bp,
                        from_build=from_build, to_build=to_build)
                row[BP_DSET] = mapped_bp

            writer.writerow(row)
            if count % 1000 == 0:
                print(count)
def main(coords, orig_assembly, new_assembly, chainfile, outfh):
    """Lift ``chrom:pos`` strings between assemblies and print the results.

    For every coordinate the first LiftOver hit is appended to the original
    ``(chrom, pos)`` pair; all rows are then handed to ``print_results``.
    If anything goes wrong for a coordinate, it is written to standard error
    and the exception is re-raised.

    :param coords: iterable of ``"chrom:pos"`` strings.
    :param orig_assembly: source assembly name.
    :param new_assembly: target assembly name.
    :param chainfile: accepted but not used here.
    :param outfh: output handle passed to ``print_results``.
    """
    # Create a LiftOver object with desired mapping.
    lifter = LiftOver(orig_assembly, new_assembly)

    results = []
    for coord in coords:
        try:
            chrom, pos = coord.split(':')
            # The position has to be an int for the lookup; the original
            # string is what goes into the result row.
            first_hit = lifter.convert_coordinate(chrom, int(pos))[0]
            results.append((chrom, pos) + first_hit)
        except BaseException:
            # Unknown what errors to expect (a missing locus may come back as
            # None); name the coordinate and let the error propagate.
            sys.stderr.write('Offending coord: %s' % coord)
            raise

    print_results(results, outfh)