Пример #1
0
def setup_conv(in_build):
    """Set the module-level build configuration and load a LiftOver chain.

    Builds 19 and 37 share the b37 resources and the GRCh37->GRCh38 chain;
    they differ only in contig naming ('chrY'/'chrM' versus 'Y'/'MT').
    Every other value is handled as build 38 and loads the GRCh38->GRCh37
    chain instead.

    Args:
        in_build: Numeric genome build identifier (formatted with ``%d``).
    """
    global b3x
    global str_db_file
    global contig
    global contigmt
    global pos_triplet_fn
    global lo_37to38
    global lo_38to37
    print("Loading LiftOver conversion chain file for build %d..." % in_build)

    if in_build in (19, 37):
        b3x = 'b37'
        str_db_file = 'str_hg19.gff3'
        # hg19 uses UCSC-style contig names, GRCh37 uses bare names.
        if in_build == 19:
            contig, contigmt = 'chrY', 'chrM'
        else:
            contig, contigmt = 'Y', 'MT'
        pos_triplet_fn = pos_triplet_37
        lo_37to38 = LiftOver('crossmap/GRCh37_to_GRCh38.chain.gz')
        return

    b3x = 'b38'
    str_db_file = 'str_hg38.gff3'
    contig, contigmt = 'chrY', 'chrM'
    pos_triplet_fn = pos_triplet_38
    lo_38to37 = LiftOver('crossmap/GRCh38_to_GRCh37.chain.gz')
Пример #2
0
def try_find_build(rs, pos):
    """Print, per SNP, which genome build the supplied positions match.

    For every SNP returned by ``fetch_snps(rs)`` the hg38 position is lifted
    to hg19, and from hg19 to hg18 and hg17.  Each build's position is
    printed, followed by ``*`` when it equals the caller-supplied position.

    Args:
        rs: SNP identifiers passed through to ``fetch_snps``.
        pos: Iterable of 1-based numeric positions, parallel to the records
            returned by ``fetch_snps``.

    Records are expected to look like
    ``('rs3737728', 'GRCh38.p2', '1', '1086035')`` (id, build, chromosome,
    1-based hg38 position as a string).  SNPs that cannot be lifted are
    skipped and logged at debug level.
    """
    snps_info = fetch_snps(rs)
    logging.info("Loading liftover chain files...")
    lift38_19 = LiftOver('pyliftover/hg38ToHg19.over.chain.gz')
    lift19_18 = LiftOver('pyliftover/hg19ToHg18.over.chain.gz')
    lift19_17 = LiftOver('pyliftover/hg19ToHg17.over.chain.gz')
    logging.info("Done")

    for (rsId, build, true_chr, pos_hg38), source_pos in zip(snps_info, pos):
        chrom = 'chr{}'.format(true_chr)
        try:
            # pyliftover works with 0-based positions.
            source_pos -= 1
            pos_hg38_0 = int(pos_hg38) - 1
            pos_hg19 = lift38_19.convert_coordinate(chrom, pos_hg38_0)[0][1]
            pos_hg18 = lift19_18.convert_coordinate(chrom, pos_hg19)[0][1]
            pos_hg17 = lift19_17.convert_coordinate(chrom, pos_hg19)[0][1]
        except (IndexError, TypeError, ValueError) as err:
            # convert_coordinate gives None for an unknown chromosome and an
            # empty list for an unmapped position; both mean "skip this SNP".
            logging.debug("Skipping %s: %s", rsId, err)
            continue

        # The hg38 marker compares the 0-based integer position; comparing
        # the raw string from the SNP record could never match.
        print(
            "build={} {} chr{} source={} hg38={}{} hg19={}{} hg18={}{} hg17={}{}"
            .format(build, rsId, true_chr, source_pos, pos_hg38,
                    '*' if pos_hg38_0 == source_pos else '', pos_hg19,
                    '*' if pos_hg19 == source_pos else '', pos_hg18,
                    '*' if pos_hg18 == source_pos else '', pos_hg17,
                    '*' if pos_hg17 == source_pos else ''))
Пример #3
0
 def hgVersionJudge(self, nowVersion):
     """Return a LiftOver to hg19 for ``nowVersion``, or 0 if it is already 19.

     ``nowVersion`` is the numeric part of the build name (e.g. 18 or 38);
     it is compared after conversion with ``int``.
     """
     if int(nowVersion) == 19:
         # Already hg19: callers receive 0 instead of a converter.
         return 0
     source_build = 'hg' + str(nowVersion)
     return LiftOver(source_build, 'hg19')
Пример #4
0
def create_lo(input_version, output_version):
    """Build a LiftOver converter and return it with its build names.

    Returns:
        dict with keys ``input_version``, ``output_version`` and ``lo``
        (the ``LiftOver`` instance for that build pair).
    """
    converter = LiftOver(input_version, output_version)
    return dict(
        input_version=input_version,
        output_version=output_version,
        lo=converter,
    )
def get_schic_contacts(filename):
    """Load cis chrX contacts from ``filename`` and keep those in range.

    The file is read as a whitespace-delimited string table; rows whose
    columns 0 and 2 are both ``'chrX'`` are kept and columns 1 and 3 are
    taken as integer positions.  Every position is lifted from mm10 to mm9,
    and the contacts are then restricted to the module-level
    ``coords_min`` / ``coords_max`` window.

    Returns:
        Integer array with one row per retained contact.

    Raises:
        ValueError: if a position lifts to more than one mm9 location.
    """
    all_contacts = np.loadtxt(filename, dtype=str)

    # filter for cis chrX contacts
    contacts = all_contacts[(all_contacts[:, 0] == 'chrX')
                            & (all_contacts[:, 2] == 'chrX')]
    contacts = contacts[:, (1, 3)].astype(int)

    # lift over all contacts from mm10 to mm9
    lo = LiftOver('mm10', 'mm9')

    def do_lift(loc):
        """Return the unique mm9 position for ``loc``, or None if unmapped."""
        lifted_loc = lo.convert_coordinate('chrX', loc)
        if len(lifted_loc) == 1:
            return lifted_loc[0][1]
        elif len(lifted_loc) > 1:
            # A bare string cannot be raised; use a real exception type.
            raise ValueError("Non-unique liftover result")
        else:
            print("Locus {} not in mm9 assembly".format(loc))

    # list(...) forces evaluation on Python 3, where zip/map are lazy.
    # NOTE(review): lifted_contacts is computed but never used below; the
    # filter and the return value still work on the unlifted positions.
    # Confirm whether the lifted coordinates were meant to be returned.
    lifted_contacts = np.array(
        list(zip(map(do_lift, contacts[:, 0]), map(do_lift, contacts[:, 1]))))

    # keep only contacts in genomic region of interest
    contacts = contacts[(contacts[:, 0] >= coords_min)
                        & (contacts[:, 1] <= coords_max)]

    return contacts
Пример #6
0
 def setup(self):
     """Download CIViC variants and index them by lifted-over coordinates.

     Each variant's chromosome/start is lifted with the chain registered
     under ``'hg19'`` in ``constants.liftover_chain_paths``.  Variants are
     stored in ``self.civicdata`` under the key ``chrom:pos:ref:alt`` built
     from the lifted coordinates; variants with missing fields or without
     a liftover result are skipped.
     """
     r = requests.get('https://civicdb.org/api/variants?count=5000&page=1')
     variants = json.loads(r.text)['records']
     lifter = LiftOver(constants.liftover_chain_paths['hg19'])
     vdict = {}
     for variant in variants:
         coordinates = variant['coordinates']
         chrom_37 = coordinates['chromosome']
         pos_37 = coordinates['start']
         if chrom_37 is None or pos_37 is None:
             continue
         new_coords = lifter.convert_coordinate("chr" + chrom_37,
                                                int(pos_37))
         # convert_coordinate returns None for a chromosome missing from
         # the chain file and [] for an unmapped position; skip both.
         if not new_coords:
             continue
         chrom_38 = new_coords[0][0].replace('chr', '')
         pos_38 = new_coords[0][1]
         ref = coordinates['reference_bases']
         alt = coordinates['variant_bases']
         toks = [chrom_38, pos_38, ref, alt]
         if None in toks:
             continue
         vkey = ':'.join(map(str, toks))
         vdict[vkey] = variant
     self.civicdata = vdict
Пример #7
0
 def __init__(self, regionsFileName, hg):
     """Load the regions JSON and prepare a converter to hg38 if needed.

     Args:
         regionsFileName: Path to a JSON file; its content is stored in
             ``self.regionsDict``.
         hg: Source build name.  ``self.lo`` is None when it is 'hg38',
             otherwise a ``LiftOver`` from ``hg`` to hg38.
     """
     # The context manager closes the file on exit.
     with open(regionsFileName, 'r') as handle:
         self.regionsDict = json.load(handle)
     self.lo = LiftOver(hg, 'hg38') if hg != 'hg38' else None
Пример #8
0
async def live_annotate(input_data, annotators):
    """Map one variant and run the loaded annotator modules on it.

    Args:
        input_data: dict describing the variant.  If its ``assembly``
            (default 'hg38') has a chain in
            ``cravat.constants.liftover_chain_paths``, ``chrom``/``pos``/
            ``ref``/``alt`` are replaced in place with lifted values.
        annotators: collection of module names to run, or None for all.

    Returns:
        dict with one entry per executed module (None when the module
        returned nothing or raised) plus the mapper output under ``'crx'``.
    """
    from cravat.constants import mapping_parser_name
    from cravat.constants import all_mappings_col_name
    from cravat.inout import AllMappingsParser
    global live_modules
    global live_mapper
    global module_confs
    global modules_to_run_ordered

    response = {}
    assembly = input_data.get('assembly', 'hg38')
    if assembly in cravat.constants.liftover_chain_paths:
        lifter = LiftOver(cravat.constants.liftover_chain_paths[assembly])
        (input_data['chrom'], input_data['pos'], input_data['ref'],
         input_data['alt']) = liftover(input_data, lifter)

    crx_data = live_mapper.live_report_substitute(
        live_mapper.map(input_data))
    crx_data[mapping_parser_name] = AllMappingsParser(
        crx_data[all_mappings_col_name])

    for module_name in modules_to_run_ordered:
        module = live_modules[module_name]
        if annotators is not None and module_name not in annotators:
            continue
        try:
            conf = module_confs[module_name]
            # Columns flagged as tables carry JSON that is decoded below.
            json_colnames = [
                col['name'] for col in conf['output_columns']
                if 'table' in col and col['table'] == True
            ]
            annotate_kwargs = {'input_data': crx_data}
            if 'secondary_inputs' in conf:
                # Secondary inputs are results of modules that ran earlier.
                annotate_kwargs['secondary_data'] = {
                    sec_mod: [response[sec_mod]]
                    for sec_mod in conf['secondary_inputs']
                }
            annot_data = module.live_report_substitute(
                module.annotate(**annotate_kwargs))
            if annot_data == '' or annot_data == {}:
                annot_data = None
            elif type(annot_data) is dict:
                annot_data = clean_annot_dict(annot_data)
            if annot_data is not None:
                for colname in json_colnames:
                    json_data = annot_data.get(colname, None)
                    if json_data is not None and type(json_data) == str:
                        json_data = json.loads(json_data)
                    annot_data[colname] = json_data
            response[module_name] = annot_data
        except Exception:
            # A failing module must not abort the others: report and
            # record an empty result for it.
            import traceback
            traceback.print_exc()
            response[module_name] = None

    del crx_data[mapping_parser_name]
    set_crx_canonical(crx_data)
    response['crx'] = crx_data
    return response
Пример #9
0
def liftover_to_19(loc, build):
    """Lift a ``chrom:pos`` locus using the chain registered for ``build``.

    Args:
        loc: Locus string whose first two ``:``-separated fields are the
            chromosome and the position.
        build: Key into the module-level ``chains`` mapping; the chain file
            is looked up under ``chainpath``.

    Returns:
        The lifted position as an int, or ``NaN`` when there is no hit.
    """
    fields = loc.split(':')
    # pyliftover does arithmetic on the position, so it must be an int;
    # the raw split() result is a string.
    chrom, position = fields[0], int(fields[1])
    lo = LiftOver(os.path.join(chainpath, chains.get(build)))
    con_pos = lo.convert_coordinate(chrom, position)
    if con_pos:
        return int(con_pos[0][1])
    return NaN
    def __init__(self, args):
        """Store ``args``, create the hg19->hg38 converter, reset the lists."""
        self.args = args
        self.doLiftOver = LiftOver('hg19', 'hg38')

        # Accumulators, all starting empty.
        self.lengths_orig, self.lengths_filtered, self.oldVsNew = [], [], []
Пример #11
0
def lift_pos(posvec, chrvec, chainFile):
    """Lift 1-based SNP positions through the given chain file.

    Args:
        posvec: numpy array of 1-based positions.
        chrvec: Sequence of integer chromosome numbers, parallel to
            ``posvec`` (formatted as ``chr<N>`` for the lookup).
        chainFile: Path to the chain file handed to ``LiftOver``.

    Returns:
        Tuple ``(pos_lifted, pos_indi, chr_lifted)``: 1-based lifted
        positions, a per-SNP status (``miss``, ``multi``, ``unchanged`` or
        ``lifted``) and the lifted chromosome numbers.  Unmapped SNPs keep
        their input position and chromosome; for multiple hits the first
        one is used.
    """
    logging.info("Lifting genomic positions...")
    nsnps = len(posvec)
    posvec = posvec - 1  # pyliftover positions are 0-based
    pos_lifted = np.empty((nsnps, ), dtype='int32')
    chr_lifted = np.empty((nsnps, ), dtype='int32')
    pos_indi = np.empty((nsnps, ), dtype='|S10')
    lift = LiftOver(chainFile)
    for i in range(nsnps):
        if (i + 1) % 200000 == 0:
            logging.info("{} SNPs done".format(i + 1))
        pos = posvec[i]
        chrom = 'chr%d' % (chrvec[i], )
        hits = lift.convert_coordinate(chrom, pos)
        if not hits:
            pos_lifted[i] = pos
            pos_indi[i] = 'miss'
            chr_lifted[i] = chrvec[i]
            continue

        # Single and multiple hits both take the first result; only the
        # status label differs.
        new_chrom, new_pos = hits[0][0], hits[0][1]
        pos_lifted[i] = new_pos
        chr_lifted[i] = re.sub('chr', '', new_chrom)
        if len(hits) > 1:
            pos_indi[i] = 'multi'
        elif pos == new_pos:
            pos_indi[i] = 'unchanged'
        else:
            pos_indi[i] = 'lifted'
    return pos_lifted + 1, pos_indi, chr_lifted
Пример #12
0
 def setup(self):
     """Page through the CIViC variants API and index the lifted variants.

     Pages are fetched until the ``next`` link is None.  Each variant's
     chromosome/start is lifted with the chain registered under ``'hg19'``
     in ``constants.liftover_chain_paths`` and stored in ``self.civicdata``
     under ``chrom:pos:ref:alt``.  A connection failure or timeout stops
     the load early, leaving the data gathered so far in place.
     """
     self.civicdata = {}
     lifter = LiftOver(constants.liftover_chain_paths['hg19'])
     page_url = 'https://civicdb.org/api/variants?count=500&page=1'
     while page_url is not None:
         try:
             r = requests.get(page_url, timeout=5)
         except (requests.exceptions.ConnectionError,
                 requests.exceptions.Timeout):
             # timeout=5 can raise a read timeout, which is not a
             # ConnectionError; treat it as an incomplete load as well.
             msg = 'ERROR: Incomplete CIVIC data load'
             print(msg)
             self.logger.error(msg)
             break
         d = json.loads(r.text)
         records = d['records']
         page_url = d['_meta']['links']['next']
         for variant in records:
             coordinates = variant['coordinates']
             chrom_37 = coordinates['chromosome']
             pos_37 = coordinates['start']
             if chrom_37 is None or pos_37 is None:
                 continue
             new_coords = lifter.convert_coordinate("chr" + chrom_37,
                                                    int(pos_37))
             # None (chromosome not in the chain) and [] (unmapped) are
             # both "no result".
             if not new_coords:
                 continue
             chrom_38 = new_coords[0][0].replace('chr', '')
             pos_38 = new_coords[0][1]
             ref = coordinates['reference_bases']
             alt = coordinates['variant_bases']
             toks = [chrom_38, pos_38, ref, alt]
             if None in toks:
                 continue
             vkey = ':'.join(map(str, toks))
             self.civicdata[vkey] = variant
Пример #13
0
def ancestral_fasta(args):
    """Subroutine for the ancestor subcommand.

    Copies ``args.reference`` to ``args.output`` and edits the copy in
    place: regions outside the optional BED mask (``args.bed``, ``'-'`` for
    stdin) become N, variants that are not biallelic SNPs become N, and
    biallelic SNPs are polarized with the outgroup allele found through the
    ``args.chain`` liftover into ``args.outgroup``.

    Raises:
        ValueError: if the BED input has no lines, or if a variant's REF
            allele disagrees with the reference sequence.
    """
    # single chromosome fasta file for reference genome
    ref = pyfaidx.Fasta(args.reference, read_ahead=10000)
    # make a copy to build our ancestor for this chromosome
    copyfile(args.reference, args.output)
    anc = pyfaidx.Fasta(args.output, read_ahead=10000, mutable=True)
    # reference genome for outgroup species (all chromosomes)
    out = pyfaidx.Fasta(args.outgroup, read_ahead=10000)
    # outgroup to reference alignment chain file
    lo = LiftOver(args.chain)
    # snps database for the same chromosome
    vcf = cyvcf2.VCF(args.vcf)

    # change regions outside of callability mask to all N bases
    if args.bed:
        bed = sys.stdin if args.bed == '-' else open(args.bed, 'r')
        chrom = None
        last_end = 0
        try:
            for line in bed:
                chrom, start, end = line.rstrip().split('\t')[:3]
                start = int(start)
                anc[chrom][last_end:start] = 'N' * (start - last_end)
                last_end = int(end)
        finally:
            # Close the handle we opened; never close stdin.
            if bed is not sys.stdin:
                bed.close()
        if chrom is None:
            # Without any interval the chromosome to mask is unknown.
            raise ValueError('callability mask BED input is empty')
        # mask everything after the last callable interval
        anc[chrom][last_end:len(anc[chrom])] = 'N' * (len(anc[chrom]) -
                                                      last_end)

    for variant in vcf:
        # change variants that are not biallelic SNPs to N bases
        if not (variant.is_snp and len(variant.ALT) == 1):
            anc[variant.CHROM][variant.start:variant.end] = 'N' * (
                variant.end - variant.start)
        else:
            out_coords = lo.convert_coordinate(variant.CHROM, variant.start)
            # change ambiguously aligning sites to N bases
            if out_coords is None or len(out_coords) != 1:
                anc[variant.CHROM][variant.start] = 'N'
            else:
                if variant.REF != ref[variant.CHROM][
                        variant.start].seq.upper():
                    raise ValueError(f'variant reference allele {variant.REF} '
                                     f'mismatches reference sequence '
                                     f'{ref[variant.CHROM][variant.start]}')
                out_chromosome, out_position, out_strand = out_coords[0][:3]
                out_allele = out[out_chromosome][out_position].seq
                # if negative strand, take reverse complement base
                if out_strand == '-':
                    out_allele = reverse_complement(out_allele)
                # and finally, polarize
                if out_allele.upper() == variant.ALT[0]:
                    anc[variant.CHROM][variant.start] = out_allele
                elif out_allele.upper() != variant.REF:
                    # triallelic
                    anc[variant.CHROM][variant.start] = 'N'
def main():
    """Lift intron coordinates in a TSV from hg38 to hg19 and print them.

    Command line: ``<introns.tsv> <chain.gz>``.  The first column of every
    data row is an intron written as ``chrom:lend-rend`` (1-based).  Rows
    whose two ends lift to the same chromosome as the input are printed with
    the new coordinates; rows that land on another chromosome are reported
    on stderr, and rows that do not lift at all are dropped.
    """
    usage = "\n\n\tusage: {} cancer_introns.b38.annot_ready.tsv hg38ToHg19.over.chain.gz > cancer_introns.b37.annot_ready.tsv\n\n".format(
        sys.argv[0])

    if len(sys.argv) < 3:
        print(usage, file=sys.stderr)
        sys.exit(1)

    cancer_introns_file = sys.argv[1]
    hg_chain_file = sys.argv[2]

    # Use the chain file given on the command line rather than a fixed
    # path relative to the working directory.
    lo = LiftOver(hg_chain_file)

    with open(cancer_introns_file, 'rt') as fh:
        header = next(fh)
        header = header.rstrip()
        print(header)
        for line in fh:
            line = line.rstrip()
            vals = line.split("\t")
            intron = vals[0]
            chrom, coordset = intron.split(":")
            (lend, rend) = coordset.split("-")
            lend = int(lend)
            rend = int(rend)

            # pyliftover is 0-based; the file is 1-based.
            new_lend = lo.convert_coordinate(chrom, lend - 1)
            new_rend = lo.convert_coordinate(chrom, rend - 1)
            if not (new_lend and new_rend):
                continue

            new_lend_chr = new_lend[0][0]
            new_lend_coord = new_lend[0][1] + 1

            new_rend_chr = new_rend[0][0]
            new_rend_coord = new_rend[0][1] + 1

            if new_lend_chr != new_rend_chr or new_lend_chr != chrom:
                sys.stderr.write("-failed conversion of {}".format(line) +
                                 "  --> {} {}, {} {}\n".format(
                                     new_lend_chr, new_lend_coord,
                                     new_rend_chr, new_rend_coord))
                continue

            # A strand flip can swap the ends; keep lend <= rend.
            if new_lend_coord > new_rend_coord:
                (new_lend_coord, new_rend_coord) = (new_rend_coord,
                                                    new_lend_coord)

            vals[0] = "{}:{}-{}".format(chrom, new_lend_coord,
                                        new_rend_coord)
            print("\t".join(vals))

    sys.exit(0)
Пример #15
0
def main(args):
    """Annotate a VCF with hg19 liftover coordinates and compress it.

    Reads ``args['inputfile']``, lifts each variant's POS (and END, when
    INFO has one) through ``args['chainfile']`` and adds ``hg19_chr``,
    ``hg19_pos`` and ``hg19_end`` INFO tags where a hit exists.  Every
    variant is written to ``args['outputfile']``, which is then run through
    bgzip and tabix.
    """
    # open input vcf
    vcf = vcf_parser.Vcf(args['inputfile'])
    # add 3 new tag definitions - for hg19 liftover: chr, pos, and end
    hg19CHROM_definition = '##INFO=<ID=hg19_chr,Number=1,Type=String,Description="CHROM in hg19 using LiftOver from pyliftover">'
    hg19POS_definition = '##INFO=<ID=hg19_pos,Number=1,Type=Integer,Description="POS in hg19 using LiftOver from pyliftover (converted back to 1-based)">'
    hg19END_definition = '##INFO=<ID=hg19_end,Number=1,Type=Integer,Description="END in hg19 using LiftOver from pyliftover (converted back to 1-based)">'
    vcf.header.add_tag_definition(hg19END_definition)
    vcf.header.add_tag_definition(hg19POS_definition)
    vcf.header.add_tag_definition(hg19CHROM_definition)

    # get chain file for liftover
    lo = LiftOver(args['chainfile'])

    # write header and then loop variants, adding liftover coordinates to
    # INFO fields when appropriate. write all variants.
    with open(args['outputfile'], 'w') as fo:
        vcf.write_header(fo)
        for vnt_obj in vcf.parse_variants():

            # generate hg19 LO coordinates based on CHROM and POS
            hits = lo.convert_coordinate(vnt_obj.CHROM, vnt_obj.POS-1)
            # None (unknown chromosome) and [] (unmapped) both mean no hit
            if hits:
                vnt_obj.add_tag_info('hg19_chr='+hits[0][0].split('chr')[1])
                vnt_obj.add_tag_info('hg19_pos='+str(hits[0][1]+1))

            # also want to incorporate END position for SV and CNV.
            # Match the END key exactly so that keys such as CIEND, which
            # also contain "END=", are not picked up instead.
            end = None
            for field in vnt_obj.INFO.split(';'):
                if field.startswith('END='):
                    try:
                        end = int(field[len('END='):])
                    except ValueError:
                        end = None
                    break

            if end is not None:
                hits_end = lo.convert_coordinate(vnt_obj.CHROM, end-1)
                if hits_end:
                    try:
                        # if hg19_chr is already defined, don't add it
                        vnt_obj.get_tag_value("hg19_chr")
                    except Exception:
                        # if hg19_chr is not defined, add hg19_chr
                        vnt_obj.add_tag_info(
                            'hg19_chr='+hits_end[0][0].split('chr')[1])
                    vnt_obj.add_tag_info('hg19_end='+str(hits_end[0][1]+1))
            vcf.write_variant(fo, vnt_obj)

    subprocess.run(["bgzip", args['outputfile']])
    subprocess.run(["tabix",args['outputfile']+".gz"])
def main():
    """Lift gold-standard records to all assemblies and write them as JSON.

    Loads the records matching ``args.input_pattern``, fills in the sentinel
    variant's positions with ``fill_in_assemblies`` and records the highest
    evidence confidence (High > Medium > Low) on each record before dumping
    everything to ``args.output``.

    Returns:
        0 on completion.
    """
    # Parse args
    args = parse_args()
    confidence_orders = ['High', 'Medium', 'Low'] # Used to sort "highest" confidence

    # Load gold-standards
    gold_standards = load_gold_standards(args.input_pattern)

    # Create liftOver instances from chain files
    if args.grch37_to_38:
        args.grch37_to_38 = LiftOver(args.grch37_to_38)
    if args.grch38_to_37:
        args.grch38_to_37 = LiftOver(args.grch38_to_37)

    # Iterate over and process records
    out_data = []
    for record in gold_standards:

        # Lift-over positions to all assemblies
        record['sentinel_variant'] = fill_in_assemblies(
            record['sentinel_variant'],
            args.grch37_to_38,
            args.grch38_to_37
        )

        # Extract highest confidence
        confidences = [entry['confidence'] for entry in
                       record['gold_standard_info']['evidence']]
        record['gold_standard_info']['highest_confidence'] = sorted(
            confidences, key=confidence_orders.index
        )[0]

        out_data.append(record)

    # Write output. A bare file name has an empty directory part, and
    # os.makedirs('') raises, so only create a directory when one is named.
    out_dir = os.path.dirname(args.output)
    if out_dir:
        os.makedirs(out_dir, exist_ok=True)
    with open(args.output, 'w') as out_h:
        json.dump(out_data, out_h, ensure_ascii=False, indent=2)

    return 0
Пример #17
0
def PCGP_mut_df_genome_build_check(df,pos_col=4):
    """Make sure ``df`` carries a ``Position_hg19`` column.

    If no column name contains 'hg18' or 'hg38', the column at index
    ``pos_col`` is renamed to ``Position_hg19``.  If exactly one such column
    exists, its positions are lifted to hg19 (using the ``Chr`` column) and
    stored in a new ``Position_hg19`` column; positions without a liftover
    result become 0.

    Args:
        df: pandas DataFrame with string column names.
        pos_col: Index of the position column to rename when no conversion
            is required.

    Returns:
        The same DataFrame, modified in place.

    Raises:
        SystemExit: when more than one hg18/hg38 column is present.
    """
    col_check_hg18 = [col for col in df.columns if 'hg18' in col.lower()]
    col_check_hg38 = [col for col in df.columns if 'hg38' in col.lower()]

    if not col_check_hg18 and not col_check_hg38:
        # Already hg19: only the column name needs fixing.  Work on a list
        # copy instead of mutating the Index's underlying array.
        cols = list(df.columns)
        cols[pos_col] = 'Position_hg19'
        df.columns = cols
        return df

    if len(col_check_hg18) + len(col_check_hg38) != 1:
        print("[Error] only one column allowed for conversion: %s ... quit"
              % (col_check_hg18 + col_check_hg38))
        raise SystemExit(1)

    if col_check_hg18:
        fd = col_check_hg18[0]
        print("[Warning] following columns from hg18 genome build: %s" % fd)
        lo = LiftOver('hg18', 'hg19')
    else:
        fd = col_check_hg38[0]
        print("[Warning] following columns from hg38 genome build: %s" % fd)
        lo = LiftOver('hg38', 'hg19')
    print(fd)

    pos = []
    for _, row in df.iterrows():
        # One lookup per row; 0 marks a position that could not be lifted.
        conversion = lo.convert_coordinate(row['Chr'], row[fd])
        pos.append(conversion[0][1] if conversion else 0)

    df['Position_hg19'] = pos
    return df
Пример #18
0
def liftover_cho(df):
    """Replace ``df['Genomic position']`` with hg38 positions (from hg18).

    The lookup uses ``'chr' + Chromosome`` and converts between the 1-based
    column values and pyliftover's 0-based positions.  Rows without a
    liftover result get the string ``'NA'``.

    Returns:
        The same DataFrame with the position column overwritten.
    """
    lo = LiftOver('hg18', 'hg38')

    def lift_coord(row):
        """Return the 1-based hg38 position for one row, or 'NA'."""
        chrom = 'chr' + str(row['Chromosome'])
        pos = row['Genomic position'] - 1
        result = lo.convert_coordinate(chrom, pos)
        # None (chromosome absent from the chain) and [] (unmapped) are
        # both treated as "not found".
        if not result:
            print(f"Didn't find hg38 coordinate for {row['Chromosome']}:{row['Genomic position']}")
            return 'NA'
        return result[0][1] + 1

    df['Genomic position'] = df.apply(lift_coord, axis=1)
    return df
Пример #19
0
def get_liftover(frm=19, to=38):
    """Return a LiftOver built from the local ``hg<frm>ToHg<to>`` chain file.

    The chain file is looked up under ``processedDataStorage``.  When it is
    missing, a FileNotFoundError naming the UCSC download location is
    raised.

    Info: http://hgdownload.cse.ucsc.edu/downloads.html
    """
    from pyliftover import LiftOver
    liftoverfile = 'hg{}ToHg{}.over.chain.gz'.format(frm, to)
    try:
        converter = LiftOver(processedDataStorage + liftoverfile)
    except FileNotFoundError:
        download_hint = (
            'Source: http://hgdownload.cse.ucsc.edu/gbdb/hg{}/liftOver/{}'.
            format(frm, liftoverfile))
        raise FileNotFoundError(download_hint)
    return converter
Пример #20
0
    def liftover(self):
        """Lift this object's chromosome/position from hg38 to ``self.build``.

        The first liftover hit overwrites ``self.chromosome`` and
        ``self.position``.
        """
        # TODO: the failure mode of the tool is not handled here; a
        # try/except will probably be needed eventually.
        # Changing the chromosome and position also invalidates the key.
        # That could be fixed, but the ref and alt alleles are not at hand
        # and parsing them out of chromosomeHgvsName is undesirable.

        from pyliftover import LiftOver
        converter = LiftOver('hg38', self.build)
        first_hit = converter.convert_coordinate(self.chromosome,
                                                 self.position)[0]

        self.chromosome = first_hit[0]
        self.position = first_hit[1]
Пример #21
0
def from_hg18_to_hg19(chr, coord):
    """
    Convert a single coordinate from hg18 to hg19.
    LiftOver coordinates are 0-based; add 1 to obtain a 1-based value.
    :param chr: chromosome name, e.g. 'chr6'
    :param coord: integer, e.g. 10000
    :return: position of the first liftover hit
    """
    converter = LiftOver('hg18', 'hg19')
    # NOTE(review): the +1 is applied to the query position, not to the
    # returned one -- confirm this is the intended 0-/1-based handling.
    query_position = int(coord) + 1
    hits = converter.convert_coordinate(chr, query_position)
    return hits[0][1]
Пример #22
0
 def _parse_cmd_args(self, args):
     """Parse the converter's command line and store the settings on self.

     Sets ``input_paths``, ``input_dir``, ``output_dir`` (created when
     missing), ``output_base_fname``, ``input_assembly``, ``do_liftover``,
     ``lifter`` and ``status_fpath``; ``input_format`` is set only when
     ``-f`` is given.
     """
     parser = argparse.ArgumentParser()
     parser.add_argument(
         'path', help="Path to this converter's python module")
     parser.add_argument(
         'inputs', nargs='+', help='Files to be converted to .crv')
     parser.add_argument(
         '-f', dest='format', help='Specify an input format')
     parser.add_argument(
         '-n', '--name', dest='name',
         help='Name of job. Default is input file name.')
     parser.add_argument(
         '-d', '--output-dir', dest='output_dir',
         help='Output directory. Default is input file directory.')
     assembly_choices = ['hg38'] + list(
         constants.liftover_chain_paths.keys())
     parser.add_argument(
         '-l', '--liftover', dest='liftover',
         choices=assembly_choices, default='hg38',
         help='Input gene assembly. Will be lifted over to hg38')
     options = parser.parse_args(args)

     self.input_paths = [os.path.abspath(path) for path in options.inputs]
     if options.format:
         self.input_format = options.format

     first_input = self.input_paths[0]
     self.input_dir = os.path.dirname(first_input)
     # Output falls back to the directory of the first input file.
     self.output_dir = (options.output_dir
                        if options.output_dir else self.input_dir)
     if not os.path.exists(self.output_dir):
         os.makedirs(self.output_dir)
     self.output_base_fname = (options.name if options.name
                               else os.path.basename(first_input))

     self.input_assembly = options.liftover
     self.do_liftover = self.input_assembly != 'hg38'
     # A converter is only needed for non-hg38 input.
     self.lifter = (
         LiftOver(constants.liftover_chain_paths[self.input_assembly])
         if self.do_liftover else None)
     status_name = self.output_base_fname + '.status.json'
     self.status_fpath = os.path.join(self.output_dir, status_name)
Пример #23
0
def liftover_loci_in_df(df, chrom_column = 'chromosome', pos_column = 'position', source_ref_genome = 'hg38', \
        target_ref_genome = 'hg19'):
    """Return a copy of ``df`` whose locus columns are lifted over.

    Every (chromosome, position) pair is passed to ``liftover_locus`` with a
    ``source_ref_genome`` -> ``target_ref_genome`` converter.  The result
    keeps the original column order; only ``chrom_column`` and
    ``pos_column`` are replaced.
    """
    from pyliftover import LiftOver

    converter = LiftOver(source_ref_genome, target_ref_genome)
    lifted_loci = [
        liftover_locus(converter, chrom, pos)
        for _, (chrom, pos) in df[[chrom_column, pos_column]].iterrows()
    ]

    # Transpose the (chrom, pos) pairs into two index-aligned Series.
    new_chroms, new_positions = (pd.Series(list(values), index=df.index)
                                 for values in zip(*lifted_loci))

    result_columns = []
    for column in df.columns:
        if column == chrom_column:
            result_columns.append(new_chroms.rename(chrom_column))
        elif column == pos_column:
            result_columns.append(new_positions.rename(pos_column))
        else:
            result_columns.append(df[column])
    return pd.concat(result_columns, axis = 1)
Пример #24
0
 def convertPos(cls, chrom, pos):
     """Lift a 1-based hg38 position to hg19.

     The converter is created on first use and kept on ``cls.sHandler``;
     normalized chromosome names are cached in ``cls.sChromMap``.

     Returns:
         The 1-based hg19 position of the first hit, or None when the
         lookup fails or gives no result.
     """
     if cls.sHandler is None:
         print("Initializing hg38 -> hg19 liftover conversion",
               file=sys.stderr)
         cls.sHandler = LiftOver("hg38", "hg19")
     if chrom not in cls.sChromMap:
         cls.sChromMap[chrom] = normalizeChromName(chrom)
     try:
         coord = cls.sHandler.convert_coordinate(cls.sChromMap[chrom],
                                                 pos - 1)
     except Exception:
         return None
     # convert_coordinate returns None for a chromosome that is not in the
     # chain file; treat it like an empty result instead of calling len().
     if not coord:
         return None
     return coord[0][1] + 1
Пример #25
0
    def __init__(self, chainfile):
        """
        This object performs unique single-position liftovers: it only lifts over chromosome positions that
        map uniquely to the new genome and whose strand has not changed.
        Note: You should run a VCF normalization sweep on all lifted-over CPRAs to check for variants that need to be
        re-normalized, and to remove variants where the REF no longer matches after a liftover.
        The combination of these steps will ensure high quality liftovers. However, it should be noted that this won't
        prevent the situation where multiple positions in the old genome pile up uniquely in the new genome, so one
        needs to check for this.
        It is organised as an object rather than a collection of functions so that the LiftOver chainfile
        only gets opened/passed once and not for every position to be lifted over.
        :param chainfile: A string containing the path to the local UCSC .gzipped chainfile
        :return:
        """

        self.liftover = LiftOver(chainfile)
Пример #26
0
def liftover(chr, pos, chainfile):
    """Lift parallel chromosome/position sequences through ``chainfile``.

    Args:
        chr: Sequence of chromosomes, either numbers or ``chrN`` strings
            (decided from the first element).
        pos: 1-based positions supporting vectorised ``- 1``.
        chainfile: Chain file path handed to ``LiftOver``.

    Returns:
        DataFrame with columns ``liftover_chr`` and ``liftover_pos``
        (1-based).
    """
    converter = LiftOver(chainfile)

    # Add the 'chr' prefix unless the first entry is already a chrN string.
    already_prefixed = isinstance(chr[0], str) and 'chr' in chr[0]
    if not already_prefixed:
        chr = ['chr' + str(name) for name in chr]

    zero_based = pos - 1  # pyliftover uses base-0
    rows = [
        _tidy_liftover(chrom, position, converter)
        for chrom, position in zip(chr, zero_based)
    ]

    out = pd.DataFrame(rows, columns=['liftover_chr', 'liftover_pos'])
    out.iloc[:, 1] = out.iloc[:, 1] + 1  # convert back to base-1
    return out
Пример #27
0
    def liftover(self, chromosome, position, build='hg19'):
        """Lift one hg38 locus to ``build`` and return the first hit.

        Returns:
            Tuple ``(new_chromosome, new_position)``.  The mapping is also
            printed when ``self.debug`` is set.
        """
        # TODO: the failure mode of the tool is not handled here; a
        # try/except will probably be needed eventually.
        # Changing the chromosome and position also invalidates the key.
        # That could be fixed, but the ref and alt alleles are not at hand
        # and parsing them out of chromosomeHgvsName is undesirable.

        converter = LiftOver('hg38', build)
        first_hit = converter.convert_coordinate(chromosome, position)[0]
        new_chromosome = first_hit[0]
        new_position = first_hit[1]

        if self.debug:
            print("%s %s -> %s %s" % (chromosome, position, new_chromosome, new_position))

        return new_chromosome, new_position
def liftover(pos, chro, from_assembly, to_assembly):
    """
        Lift one coordinate between assemblies using the UCSC LiftOver tool.

        ``pos`` is returned untouched when both assemblies are the same;
        otherwise the position of the first hit is returned.

        NOTE:   pyLiftover uses base 0, whereas coordinate system uses base 1
                therefore position 27107251 is actually 27107250 in pyLiftover
        """
    if from_assembly != to_assembly:
        chromosome = 'chr' + str(chro)
        position = int(pos)
        converter = LiftOver(from_assembly, to_assembly)
        hits = converter.convert_coordinate(chromosome, position)
        return hits[0][1]

    return pos
def open_file_and_process(file, from_build, to_build):
    """Write a copy of a TSV file with base-pair positions mapped to a build.

    The output is named ``liftover_<filename>.tsv`` and keeps the input's
    columns.  When the builds differ, each row's position is mapped with
    liftover first and with the Ensembl fallback when liftover gives None.
    Chromosome codes 23/24 are translated to X/Y for the lookup only.  A
    progress count is printed every 1000 rows.
    """
    filename = get_filename(file)
    new_filename = 'liftover_' + filename + '.tsv'
    build_map = None
    if from_build != to_build:
        build_map = LiftOver(ucsc_release.get(from_build),
                             ucsc_release.get(to_build))

    # Both files are managed by the with-statement so the output is flushed
    # and closed even when a row fails.
    with open(file) as csv_file, open(new_filename, "w") as result_file:
        count = 0
        csv_reader = csv.DictReader(csv_file, delimiter='\t')
        fieldnames = csv_reader.fieldnames
        writer = csv.DictWriter(result_file,
                                fieldnames=fieldnames,
                                delimiter='\t')

        writer.writeheader()

        for row in csv_reader:
            chromosome = row[CHR_DSET].replace('23', 'X').replace('24', 'Y')
            bp = row[BP_DSET]

            # do the bp location mapping if needed
            if from_build != to_build:
                mapped_bp = map_bp_to_build_via_liftover(chromosome=chromosome,
                                                         bp=bp,
                                                         build_map=build_map)
                if mapped_bp is None:
                    mapped_bp = map_bp_to_build_via_ensembl(
                        chromosome=chromosome,
                        bp=bp,
                        from_build=from_build,
                        to_build=to_build)
                row[BP_DSET] = mapped_bp

            writer.writerow(row)
            count += 1
            if count % 1000 == 0:
                print(count)
Пример #30
0
def main(coords, orig_assembly, new_assembly, chainfile, outfh):
    """Lift ``chrom:pos`` coordinates between assemblies and print them.

    Each result row is ``(chrom, pos)`` from the input followed by the
    fields of the first liftover hit; all rows are passed to
    ``print_results`` together with ``outfh``.  ``chainfile`` is accepted
    but not used.  On any failure the offending coordinate is written to
    stderr and the exception is re-raised.
    """
    # Create a LiftOver object with desired mapping.
    converter = LiftOver(orig_assembly, new_assembly)

    results = []
    for coord in coords:
        try:
            chrom, pos = coord.split(':')
            # The position has to be passed as an int, not as a str.
            first_hit = converter.convert_coordinate(chrom, int(pos))[0]
            results.append((chrom, pos) + first_hit)
        except BaseException:
            # The possible errors are not well known (a deleted locus is
            # expected to give None), so report the input and re-raise.
            sys.stderr.write('Offending coord: %s' % coord)
            raise

    print_results(results, outfh)