Пример #1
0
def try_find_build(rs, pos):
    """Print each SNP's position in hg38/hg19/hg18/hg17 and flag matches.

    The SNPs are looked up with ``fetch_snps(rs)``; every hg38 position is
    lifted to hg19 and from hg19 on to hg18 and hg17.  One line is printed
    per SNP, and a ``*`` is appended after each build whose 0-based position
    equals the caller-supplied position (after that position has been
    shifted to 0-based as well).

    :param rs: SNP identifiers, handed unchanged to ``fetch_snps``.
    :param pos: source positions, paired with the fetched SNPs by order.
        One is subtracted from each before comparing and printing.
    :return: None.  Results go to stdout; SNPs that cannot be lifted are
        reported through ``logging.warning`` and skipped.
    """
    # Each entry is (rsId, build, chromosome, hg38 position), e.g.
    # ('rs3737728', 'GRCh38.p2', '1', '1086035').
    snps_info = fetch_snps(rs)
    logging.info("Loading liftover chain files...")
    lift38_19 = LiftOver('pyliftover/hg38ToHg19.over.chain.gz')
    lift19_18 = LiftOver('pyliftover/hg19ToHg18.over.chain.gz')
    lift19_17 = LiftOver('pyliftover/hg19ToHg17.over.chain.gz')
    logging.info("Done")

    def star(candidate, reference):
        """Return '*' when the two 0-based positions coincide."""
        return '*' if candidate == reference else ''

    for (rsId, build, true_chr, pos_hg38), source_pos in zip(snps_info, pos):
        try:
            source_pos -= 1
            chrom = 'chr{}'.format(true_chr)
            # Compare like with like: the fetched hg38 position may be a
            # string and is shifted to 0-based exactly as source_pos is.
            pos_hg38_0 = int(pos_hg38) - 1
            pos_hg19 = lift38_19.convert_coordinate(chrom, pos_hg38_0)[0][1]
            pos_hg18 = lift19_18.convert_coordinate(chrom, pos_hg19)[0][1]
            pos_hg17 = lift19_17.convert_coordinate(chrom, pos_hg19)[0][1]
        except Exception as exc:
            # Stay best-effort (one bad SNP must not stop the scan) but
            # leave a trace instead of swallowing the failure silently.
            logging.warning("Could not lift %s: %r", rsId, exc)
            continue
        print(
            "build={} {} chr{} source={} hg38={}{} hg19={}{} hg18={}{} hg17={}{}"
            .format(build, rsId, true_chr, source_pos,
                    pos_hg38, star(pos_hg38_0, source_pos),
                    pos_hg19, star(pos_hg19, source_pos),
                    pos_hg18, star(pos_hg18, source_pos),
                    pos_hg17, star(pos_hg17, source_pos)))
Пример #2
0
def _info_end(info):
    """Return the integer value of the END key of a VCF INFO string, or None."""
    fields = info.split(';') if isinstance(info, str) else []
    for field in fields:
        # Match the key exactly so that e.g. "CIEND=-10,10" is not taken
        # for the END coordinate.
        if field.startswith('END='):
            try:
                return int(field[len('END='):])
            except ValueError:
                return None
    return None


def main(args):
    """Annotate a VCF with hg19 liftover coordinates and compress/index it.

    Every variant is written to the output.  When CHROM/POS can be lifted,
    ``hg19_chr`` and ``hg19_pos`` are added to INFO; when INFO carries an
    ``END`` that can be lifted, ``hg19_end`` is added too (plus ``hg19_chr``
    if it is not already present).  The output is then passed to ``bgzip``
    and ``tabix``.

    :param args: mapping with the keys ``'inputfile'``, ``'chainfile'`` and
        ``'outputfile'``.
    :raises subprocess.CalledProcessError: if ``bgzip`` or ``tabix`` exits
        with a non-zero status.
    """
    # open input vcf
    vcf = vcf_parser.Vcf(args['inputfile'])
    # add 3 new tag definitions - for hg19 liftover: chr, pos, and end
    hg19CHROM_definition = '##INFO=<ID=hg19_chr,Number=1,Type=String,Description="CHROM in hg19 using LiftOver from pyliftover">'
    hg19POS_definition = '##INFO=<ID=hg19_pos,Number=1,Type=Integer,Description="POS in hg19 using LiftOver from pyliftover (converted back to 1-based)">'
    hg19END_definition = '##INFO=<ID=hg19_end,Number=1,Type=Integer,Description="END in hg19 using LiftOver from pyliftover (converted back to 1-based)">'
    vcf.header.add_tag_definition(hg19END_definition)
    vcf.header.add_tag_definition(hg19POS_definition)
    vcf.header.add_tag_definition(hg19CHROM_definition)

    # get chain file for liftover
    lo = LiftOver(args['chainfile'])

    # write header, then every variant, adding liftover coordinates to the
    # INFO field whenever a liftover exists
    with open(args['outputfile'], 'w') as fo:
        vcf.write_header(fo)
        for vnt_obj in vcf.parse_variants():

            # pyliftover works on 0-based positions; it returns None for an
            # unknown chromosome and [] for an unmappable position, so test
            # truthiness rather than len().
            hits = lo.convert_coordinate(vnt_obj.CHROM, vnt_obj.POS-1)
            if hits:
                vnt_obj.add_tag_info('hg19_chr='+hits[0][0].split('chr')[1])
                vnt_obj.add_tag_info('hg19_pos='+str(hits[0][1]+1))

            # END matters for SVs and CNVs: lift it too when present
            END = _info_end(vnt_obj.INFO)
            if END is not None:
                hits_end = lo.convert_coordinate(vnt_obj.CHROM, END-1)
                if hits_end:
                    try:
                        # NOTE(review): get_tag_value is assumed to raise
                        # when the tag is absent; the exception type is not
                        # visible here, hence the broad catch.
                        vnt_obj.get_tag_value("hg19_chr")
                    except Exception:
                        # hg19_chr is not defined yet: add it
                        vnt_obj.add_tag_info(
                            'hg19_chr='+hits_end[0][0].split('chr')[1])
                    vnt_obj.add_tag_info('hg19_end='+str(hits_end[0][1]+1))
            vcf.write_variant(fo, vnt_obj)

    # check=True surfaces a failed compression/indexing step instead of
    # leaving a half-finished result behind unnoticed.
    subprocess.run(["bgzip", args['outputfile']], check=True)
    subprocess.run(["tabix", args['outputfile']+".gz"], check=True)
def main():
    """Lift the intron coordinates of a TSV file using a liftover chain.

    Usage: ``script introns.tsv chain.gz > lifted.tsv``.  The header line is
    echoed unchanged.  For every other line the first column
    (``chr:start-end``, 1-based) is lifted; the rewritten line is printed
    when both ends map to the original chromosome.  Lines whose ends land on
    a different chromosome are reported on stderr and dropped; lines with an
    unmappable end are dropped silently.
    """

    usage = "\n\n\tusage: {} cancer_introns.b38.annot_ready.tsv hg38ToHg19.over.chain.gz > cancer_introns.b37.annot_ready.tsv\n\n".format(
        sys.argv[0])

    if len(sys.argv) < 3:
        print(usage, file=sys.stderr)
        sys.exit(1)

    cancer_introns_file = sys.argv[1]
    hg_chain_file = sys.argv[2]

    # Use the chain file given on the command line (it was previously read
    # but ignored in favour of a hard-coded file name).
    lo = LiftOver(hg_chain_file)

    with open(cancer_introns_file, 'rt') as fh:
        header = next(fh).rstrip()
        print(header)
        for line in fh:
            line = line.rstrip()
            if not line:
                # A blank line carries no intron to convert.
                continue
            vals = line.split("\t")
            chrom, coordset = vals[0].split(":")
            lend, rend = (int(coord) for coord in coordset.split("-"))

            # pyliftover is 0-based: shift in here, shift back out below.
            new_lend = lo.convert_coordinate(chrom, lend - 1)
            new_rend = lo.convert_coordinate(chrom, rend - 1)
            if not (new_lend and new_rend):
                continue

            new_lend_chr, new_lend_coord = new_lend[0][0], new_lend[0][1] + 1
            new_rend_chr, new_rend_coord = new_rend[0][0], new_rend[0][1] + 1

            if new_lend_chr != new_rend_chr or new_lend_chr != chrom:
                sys.stderr.write("-failed conversion of {}".format(line) +
                                 "  --> {} {}, {} {}\n".format(
                                     new_lend_chr, new_lend_coord,
                                     new_rend_chr, new_rend_coord))
                continue

            # The two ends can swap order (e.g. inverted mapping); keep
            # start <= end.
            if new_lend_coord > new_rend_coord:
                new_lend_coord, new_rend_coord = (new_rend_coord,
                                                  new_lend_coord)

            vals[0] = "{}:{}-{}".format(chrom, new_lend_coord, new_rend_coord)
            print("\t".join(vals))

    sys.exit(0)
Пример #4
0
 def setup(self):
     """Download all CIVIC variants and index them by lifted coordinates.

     Pages through the CIVIC variants API, lifts each variant's start
     coordinate with the chain file configured under
     ``constants.liftover_chain_paths['hg19']`` and stores the record in
     ``self.civicdata`` under the key ``"chrom:pos:ref:alt"``.  Variants
     without coordinates, without a liftover, or without ref/alt bases are
     skipped.  A failed request stops the download; whatever was loaded so
     far is kept and the problem is printed and logged.
     """
     self.civicdata = {}
     lifter = LiftOver(constants.liftover_chain_paths['hg19'])
     page_url = 'https://civicdb.org/api/variants?count=500&page=1'
     while page_url is not None:
         try:
             r = requests.get(page_url, timeout=5)
             r.raise_for_status()
         except requests.exceptions.RequestException:
             # RequestException also covers read timeouts and HTTP error
             # statuses, which ConnectionError alone does not.
             msg = 'ERROR: Incomplete CIVIC data load'
             print(msg)
             self.logger.error(msg)
             break
         d = json.loads(r.text)
         page_url = d['_meta']['links']['next']
         for variant in d['records']:
             coordinates = variant['coordinates']
             chrom_37 = coordinates['chromosome']
             pos_37 = coordinates['start']
             if chrom_37 is None or pos_37 is None:
                 continue
             # NOTE(review): the position is passed without a 0/1-based
             # adjustment - confirm against the coordinate convention of
             # the API and of the consumers of self.civicdata.
             new_coords = lifter.convert_coordinate("chr" + chrom_37,
                                                    int(pos_37))
             # None means unknown chromosome, [] means no mapping.
             if not new_coords:
                 continue
             chrom_38 = new_coords[0][0].replace('chr', '')
             pos_38 = new_coords[0][1]
             toks = [chrom_38, pos_38,
                     coordinates['reference_bases'],
                     coordinates['variant_bases']]
             if None in toks:
                 continue
             self.civicdata[':'.join(map(str, toks))] = variant
Пример #5
0
 def setup(self):
     """Download CIVIC variants and index them by lifted coordinates.

     Fetches a single page of up to 5000 variants, lifts each start
     coordinate with the chain file configured under
     ``constants.liftover_chain_paths['hg19']`` and stores the records in
     ``self.civicdata`` keyed by ``"chrom:pos:ref:alt"``.  Variants without
     coordinates, without a liftover, or without ref/alt bases are skipped.
     """
     r = requests.get('https://civicdb.org/api/variants?count=5000&page=1')
     variants = json.loads(r.text)['records']
     lifter = LiftOver(constants.liftover_chain_paths['hg19'])
     vdict = {}
     for variant in variants:
         coordinates = variant['coordinates']
         chrom_37 = coordinates['chromosome']
         pos_37 = coordinates['start']
         if chrom_37 is None or pos_37 is None:
             continue
         # NOTE(review): the position is passed without a 0/1-based
         # adjustment - confirm the intended coordinate convention.
         new_coords = lifter.convert_coordinate("chr" + chrom_37,
                                                int(pos_37))
         # pyliftover returns None for an unknown chromosome and [] when
         # the position has no mapping; skip both.
         if not new_coords:
             continue
         chrom_38 = new_coords[0][0].replace('chr', '')
         pos_38 = new_coords[0][1]
         toks = [chrom_38, pos_38,
                 coordinates['reference_bases'],
                 coordinates['variant_bases']]
         if None in toks:
             continue
         vdict[':'.join(map(str, toks))] = variant
     self.civicdata = vdict
Пример #6
0
def lift_pos(posvec, chrvec, chainFile):
    """Lift 1-based SNP positions through a liftover chain file.

    :param posvec: 1-based positions (array or any integer sequence).
    :param chrvec: integer chromosome numbers, parallel to ``posvec``.
    :param chainFile: path of the chain file handed to ``LiftOver``.
    :return: ``(pos_lifted, pos_indi, chr_lifted)``: the lifted 1-based
        positions, a byte-string status per SNP (``miss``, ``multi``,
        ``unchanged`` or ``lifted``) and the lifted chromosome numbers.
        SNPs without a liftover keep their input position and chromosome;
        for SNPs with several hits the first one is used.
    :raises ValueError: if a hit lands on a chromosome whose name is not
        ``chr`` followed by an integer.
    """
    logging.info("Lifting genomic positions...")
    # pyliftover is 0-based; asarray lets plain lists work as well.
    posvec = np.asarray(posvec) - 1
    nsnps = len(posvec)
    pos_lifted = np.empty((nsnps, ), dtype='int32')
    chr_lifted = np.empty((nsnps, ), dtype='int32')
    pos_indi = np.empty((nsnps, ), dtype='|S10')
    lift = LiftOver(chainFile)
    for i in range(nsnps):
        if (i + 1) % 200000 == 0:
            logging.info("{} SNPs done".format(i + 1))
        pos = posvec[i]
        hits = lift.convert_coordinate('chr%d' % (chrvec[i], ), pos)
        if not hits:
            pos_lifted[i] = pos
            chr_lifted[i] = chrvec[i]
            pos_indi[i] = 'miss'
            continue
        hit_chrom, hit_pos = hits[0][0], hits[0][1]
        pos_lifted[i] = hit_pos
        chr_lifted[i] = int(hit_chrom.replace('chr', ''))
        if len(hits) > 1:
            pos_indi[i] = 'multi'
        elif pos == hit_pos:
            pos_indi[i] = 'unchanged'
        else:
            pos_indi[i] = 'lifted'
    # Shift back to 1-based for the caller.
    return pos_lifted + 1, pos_indi, chr_lifted
Пример #7
0
class Converter:
    """Lift 1-based hg19 positions to hg38 with pyliftover."""

    def __init__(self):
        self.lo = LiftOver('hg19', 'hg38')

    def hg38(self, ch, pos):
        """Return the hg38 position of a 1-based hg19 coordinate.

        :param ch: chromosome, e.g. ``1``, ``'x'`` or ``'chr1'``.  Bare
            numbers and X/Y get a ``chr`` prefix; a name that already
            starts with ``chr`` (any case) keeps the rest of its spelling.
        :param pos: 1-based position.
        :return: the 1-based hg38 position when the liftover is unique; a
            ``(position, all_hits)`` tuple when there are several hits;
            ``None`` when nothing maps, the chromosome is unknown, or the
            lookup fails (a warning is printed in the last case).
        """
        ch = str(ch)
        if ch.lower().startswith('chr'):
            # Upper-casing the whole name would turn 'chr1' into 'CHR1',
            # which never matches the chain file; only fix the prefix.
            ch = 'chr' + ch[3:]
        else:
            ch = ch.upper()
            if ch.isdigit() or ch in ('X', 'Y'):
                ch = "chr{}".format(ch)
        try:
            coord = self.lo.convert_coordinate(ch, pos - 1)
        except Exception:
            # print() with a single argument behaves the same on
            # Python 2 and Python 3.
            print("WARNING: HG38 conversion at {}:{}".format(ch, pos))
            return None
        # None (unknown chromosome) and [] (no mapping) both mean "no
        # result"; callers have always received None for either.
        if not coord:
            return None
        r = coord[0][1] + 1
        if len(coord) == 1:
            return r
        return r, coord

    def close(self):
        """Do nothing; kept so callers can treat this like a resource."""
        return
Пример #8
0
# LiftOver objects keyed by build, so each chain file is parsed only once.
_LIFTOVER_BY_BUILD = {}


def _liftover_for(build):
    """Return the LiftOver for *build*, loading its chain file on first use."""
    if build not in _LIFTOVER_BY_BUILD:
        _LIFTOVER_BY_BUILD[build] = LiftOver(
            os.path.join(chainpath, chains.get(build)))
    return _LIFTOVER_BY_BUILD[build]


def liftover_to_19(loc, build):
    """Lift a ``chrom:position`` locus from *build* and return the new position.

    :param loc: locus string of the form ``'chr1:12345'``.
    :param build: key into ``chains`` selecting the chain file to use.
    :return: the lifted position as an int, or ``NaN`` when the locus has
        no liftover.
    """
    parts = loc.split(':')
    chrom, pos = parts[0], parts[1]
    # pyliftover compares the position numerically, so it must be an int
    # rather than the string sliced out of the locus.
    # NOTE(review): no 0/1-based adjustment is applied - confirm intent.
    con_pos = _liftover_for(build).convert_coordinate(chrom, int(pos))
    if con_pos:
        return int(con_pos[0][1])
    return NaN
Пример #9
0
class UniqueLiftover(object):
    def __init__(self, chainfile):
        """
        This object will perform unique single positional liftovers - it will only lift over chromosome positions that
        map unique to the new genome and if the strand hasn't changed.
        Note: You should run a VCF Normalization sweep on all lifted over CPRAs to check for variants that need to be
        re-normalized, and to remove variants where the REF now doesn't match after a liftover.
        The combination of these steps will ensure high quality liftovers. However, it should be noted that this won't
        prevent the situation where multiple positions in the old genome pile up uniquely in the new genome, so one
        needs to check for this.
        It's organised as an object rather than a collection of functions so that the LiftOver chainfile
        only gets opened/passed once and not for every position to be lifted over.
        :param chainfile: A string containing the path to the local UCSC .gzipped chainfile
        :return:
        """

        self.liftover = LiftOver(chainfile)

    def liftover_cpra(self, chromosome, position, verbose=False):
        """
        Lift over a chromosome/position pair given in 1-based co-ordinates.
        Only a unique, strand-preserving liftover is accepted; every other outcome yields (None, None).
        :param chromosome: string with the chromosome as it's represented in the from_genome
        :param position: position on chromosome (will be cast to int)
        :param verbose: log the reason through logging.error when no liftover is returned
        :return: ((str) chromosome, (int) 1-based position), or (None, None) if there is no acceptable liftover
        """

        chromosome = str(chromosome)
        position = int(position)

        # Perform the liftover lookup, shift the position by 1 as pyliftover deals in 0-based co-ords
        new = self.liftover.convert_coordinate(chromosome, position - 1)

        if new is None:
            # new is None when the chromosome doesn't exist in the chainfile
            failure = "Chromosome '{}' provided not in chain file".format(
                chromosome)
        elif not new:
            # An empty list means the position has no counterpart; without this branch the verbose
            # path used to fail on an unassigned message.
            failure = "{},{} does not lift over to any position".format(
                chromosome, position)
        elif len(new) > 1:
            failure = "{},{} lifts over to multiple positions: {}".format(
                chromosome, position, new)
        elif new[0][2] != "+":
            failure = "{},{} has a flipped strand in liftover: {}".format(
                chromosome, position, new)
        else:
            # Shift the position forward by one to convert back to 1-based co-ords
            return str(new[0][0]), int(new[0][1]) + 1

        if verbose:
            logging.error(failure)
        return None, None
Пример #10
0
def _mask_outside_bed(anc, bed_lines):
    """Overwrite with N every base of *anc* that lies outside the BED intervals."""
    chrom = None
    last_end = 0
    for line in bed_lines:
        if not line.strip():
            continue
        chrom, start, end = line.rstrip().split('\t')[:3]
        start = int(start)
        anc[chrom][last_end:start] = 'N' * (start - last_end)
        last_end = int(end)
    if chrom is None:
        raise ValueError('callability mask contains no intervals')
    # Everything after the last interval is outside the mask as well.
    anc[chrom][last_end:len(anc[chrom])] = 'N' * (len(anc[chrom]) -
                                                  last_end)


def ancestral_fasta(args):
    """Subroutine for the ancestor subcommand.

    Copies the reference FASTA to ``args.output`` and edits the copy in
    place: regions outside the optional BED mask become N, variants that
    are not biallelic SNPs become N, and each biallelic SNP is polarized
    with the outgroup allele found through the liftover chain.

    :param args: namespace providing ``reference``, ``output``,
        ``outgroup``, ``chain``, ``vcf`` and ``bed`` (``'-'`` reads the BED
        from stdin; a false value skips masking).
    :raises ValueError: if a variant's REF disagrees with the reference
        sequence, or if the BED mask contains no intervals.
    """
    # single chromosome fasta file for reference genome
    ref = pyfaidx.Fasta(args.reference, read_ahead=10000)
    # make a copy to build our ancestor for this chromosome
    copyfile(args.reference, args.output)
    anc = pyfaidx.Fasta(args.output, read_ahead=10000, mutable=True)
    # reference genome for outgroup species (all chromosomes)
    out = pyfaidx.Fasta(args.outgroup, read_ahead=10000)
    # outgroup to reference alignment chain file
    lo = LiftOver(args.chain)
    # snps database for the same chromosome
    vcf = cyvcf2.VCF(args.vcf)

    # change regions outside of callability mask to all N bases
    if args.bed:
        bed = sys.stdin if args.bed == '-' else open(args.bed, 'r')
        try:
            _mask_outside_bed(anc, bed)
        finally:
            # Close only what was opened here; stdin belongs to the caller.
            if bed is not sys.stdin:
                bed.close()

    for variant in vcf:
        # change variants that are not biallelic SNPs to N bases
        if not (variant.is_snp and len(variant.ALT) == 1):
            anc[variant.CHROM][variant.start:variant.end] = 'N' * (
                variant.end - variant.start)
            continue

        out_coords = lo.convert_coordinate(variant.CHROM, variant.start)
        # change ambiguously aligning sites to N bases
        if out_coords is None or len(out_coords) != 1:
            anc[variant.CHROM][variant.start] = 'N'
            continue

        if variant.REF != ref[variant.CHROM][variant.start].seq.upper():
            raise ValueError(f'variant reference allele {variant.REF} '
                             f'mismatches reference sequence '
                             f'{ref[variant.CHROM][variant.start]}')
        out_chromosome, out_position, out_strand = out_coords[0][:3]
        out_allele = out[out_chromosome][out_position].seq
        # if negative strand, take reverse complement base
        if out_strand == '-':
            out_allele = reverse_complement(out_allele)
        # and finally, polarize
        if out_allele.upper() == variant.ALT[0]:
            anc[variant.CHROM][variant.start] = out_allele
        elif out_allele.upper() != variant.REF:
            # triallelic
            anc[variant.CHROM][variant.start] = 'N'
Пример #11
0
class UniqueLiftover(object):

    def __init__(self, chainfile):
        """
        This object will perform unique single positional liftovers - it will only lift over chromosome positions that
        map unique to the new genome and if the strand hasn't changed.
        Note: You should run a VCF Normalization sweep on all lifted over CPRAs to check for variants that need to be
        re-normalized, and to remove variants where the REF now doesn't match after a liftover.
        The combination of these steps will ensure high quality liftovers. However, it should be noted that this won't
        prevent the situation where multiple positions in the old genome pile up uniquely in the new genome, so one
        needs to check for this.
        It's organised as an object rather than a collection of functions so that the LiftOver chainfile
        only gets opened/passed once and not for every position to be lifted over.
        :param chainfile: A string containing the path to the local UCSC .gzipped chainfile
        :return:
        """

        self.liftover = LiftOver(chainfile)

    def liftover_cpra(self, chromosome, position, verbose=False):
        """
        Lift over a chromosome/position pair given in 1-based co-ordinates.
        Only a unique, strand-preserving liftover is accepted; every other outcome yields (None, None).
        :param chromosome: string with the chromosome as it's represented in the from_genome
        :param position: position on chromosome (will be cast to int)
        :param verbose: log the reason through logging.error when no liftover is returned
        :return: ((str) chromosome, (int) 1-based position), or (None, None) if there is no acceptable liftover
        """

        chromosome = str(chromosome)
        position = int(position)

        # Perform the liftover lookup, shift the position by 1 as pyliftover deals in 0-based co-ords
        hits = self.liftover.convert_coordinate(chromosome, position - 1)

        reason = None
        if hits is None:
            # None is returned when the chromosome doesn't exist in the chainfile
            reason = "Chromosome '{}' provided not in chain file".format(chromosome)
        elif len(hits) == 0:
            # No counterpart at all; this case used to leave the message unassigned and crash
            # as soon as verbose logging was requested.
            reason = "{},{} does not lift over to any position".format(chromosome, position)
        elif len(hits) != 1:
            reason = "{},{} lifts over to multiple positions: {}".format(chromosome, position, hits)
        elif hits[0][2] != "+":
            reason = "{},{} has a flipped strand in liftover: {}".format(chromosome, position, hits)

        if reason is None:
            # Shift the position forward by one to convert back to 1-based co-ords
            return str(hits[0][0]), int(hits[0][1]) + 1

        if verbose:
            logging.error(reason)
        return None, None
Пример #12
0
def PCGP_mut_df_genome_build_check(df,pos_col=4):
    """Make sure a mutation table carries a ``Position_hg19`` column.

    When exactly one column name mentions ``hg18`` or ``hg38``, each row's
    value in that column is lifted to hg19 (using the row's ``Chr`` value)
    and stored in a new ``Position_hg19`` column; rows without a liftover
    get 0.  When no column mentions either build, the column at index
    ``pos_col`` is renamed to ``Position_hg19``.

    :param df: pandas DataFrame, modified in place and returned.
    :param pos_col: index of the position column to rename when no
        build-specific column exists.
    :return: the same DataFrame.
    :raises SystemExit: when more than one hg18/hg38 column is present.
    """
    col_check_hg18 = [col for col in df.columns if 'hg18' in col.lower()]
    col_check_hg38 = [col for col in df.columns if 'hg38' in col.lower()]
    candidates = col_check_hg18 + col_check_hg38

    if not candidates:
        # Rename through a plain list; writing into columns.values would
        # mutate the index buffer behind pandas' back.
        cols = list(df.columns)
        cols[pos_col] = 'Position_hg19'
        df.columns = cols
        return df

    if len(candidates) != 1:
        # Report every offending column (the name printed here used to be
        # undefined on this path).
        print("[Error] only one column allowed for conversion: %s ... quit" % candidates)
        raise SystemExit()

    fd = candidates[0]
    if col_check_hg18:
        print("[Warning] following columns from hg18 genome build: %s" % fd)
        lo = LiftOver('hg18', 'hg19')
    else:
        print("[Warning] following columns from hg38 genome build: %s" % fd)
        lo = LiftOver('hg38', 'hg19')
    print(fd)

    pos = []
    for idx, row in df.iterrows():
        # One lookup per row; 0 marks positions that could not be lifted.
        conversion = lo.convert_coordinate(row['Chr'], row[fd])
        pos.append(conversion[0][1] if conversion else 0)

    df['Position_hg19'] = pos
    return df
Пример #13
0
class liftover:
    """Lift coordinates between two genome builds using a UCSC chain.

    Build names are normalised through ``map_release``: a name that is one
    of its values is used as is, otherwise it is looked up as a key.
    """

    def __init__(self, build_from, build_to):
        """Resolve both build names and load the chain if they differ.

        :param build_from: source genome build (name or alias).
        :param build_to: destination genome build (name or alias).
        :raises Exception: if either build cannot be resolved.
        """
        self.build_from = self._resolve_build(build_from, 'SOURCE')
        self.build_to = self._resolve_build(build_to, 'DESTINATION')

        # Download/Source the Chain from UCSC
        if self.build_from != self.build_to:
            self.GetChain()
        else:
            self.chain = None

    @staticmethod
    def _resolve_build(build, role):
        """Return the canonical name of *build*; *role* labels the error message."""
        if build in map_release.values():
            return build
        build_mapped = map_release.get(build)
        if build_mapped is None:
            # Report the value that actually failed (the DESTINATION error
            # used to print the source build).
            raise Exception(
                'Unknown {} genome build. The value was: {}'.format(
                    role, build))
        return build_mapped

    def GetChain(self):
        '''Downloads the chain from UCSC '''
        self.chain_name = 'UCSC: {} to {}'.format(self.build_from,
                                                  self.build_to)
        self.chain = LiftOver(self.build_from, self.build_to)

    def lift(self, chr, pos):
        """Lift one position.

        :param chr: chromosome name without the ``chr`` prefix.
        :param pos: position (cast to int).
        :return: ``(chromosome, position, multiple)`` where ``multiple``
            tells whether more than one hit existed (the first is used),
            or ``(None, None, None)`` when nothing maps.  When source and
            destination builds are the same, the input is returned
            unchanged with ``multiple`` False.
        """
        if self.chain is None:
            # Same build on both sides: nothing to convert.
            return str(chr), int(pos), False
        lifted = self.chain.convert_coordinate(
            'chr{}'.format(str(chr)), int(pos)
        )  # ToDo figure out whether this step should be adjusted for 0/1 indexing?
        # None (unknown chromosome) and [] (no mapping) are both misses.
        if not lifted:
            return None, None, None
        return lifted[0][0][3:], int(lifted[0][1]), len(lifted) > 1
class CravatAnnotator(BaseAnnotator):
    """Annotator that reports the hg19 coordinates of an input variant."""

    def setup(self):
        """Load the hg38-to-hg19 chain file shipped in the ``data`` directory."""
        chain_path = os.path.join(os.path.dirname(os.path.realpath(__file__)), 'data', 'hg38ToHg19.over.chain')
        self.liftover = LiftOver(chain_path)

    def annotate(self, input_data, secondary_data=None):
        """Lift the variant position and return it.

        :param input_data: mapping with ``'chrom'`` and a 1-based ``'pos'``.
        :param secondary_data: unused.
        :return: dict with ``'chrom'`` and 1-based ``'pos'`` of the first
            liftover hit, or an empty dict when there is none.
        """
        out = {}
        # pyliftover is 0-based: shift in here and back out below.
        hg19_data = self.liftover.convert_coordinate(input_data['chrom'], int(input_data['pos']) - 1)
        # The lookup yields None for an unknown chromosome and [] for an
        # unmappable position; len() would fail on the former.
        if hg19_data:
            out['chrom'] = hg19_data[0][0]
            out['pos'] = hg19_data[0][1] + 1
        return out
Пример #15
0
    def liftover(self):
        """Replace this object's hg38 chromosome/position with the lifted ones.

        Lifts ``self.chromosome``/``self.position`` from hg38 to
        ``self.build`` and stores the first hit back on the object.

        :raises ValueError: if the position has no liftover.
        """

        # todo
        # Changing the chromosome and position messes up the key as well.  Could probably fix that.  But i don't have
        # the ref and alt alleles on hand and I don't want to parse them out of chromosomeHgvsName.

        from pyliftover import LiftOver
        lo = LiftOver('hg38', self.build)
        lifted = lo.convert_coordinate(self.chromosome, self.position)

        # The lookup gives None for an unknown chromosome and [] when the
        # position does not map; fail with a message that says so instead
        # of an opaque subscript error.
        if not lifted:
            raise ValueError("no %s liftover for %s:%s" %
                             (self.build, self.chromosome, self.position))

        self.chromosome, self.position = lifted[0][0], lifted[0][1]
Пример #16
0
def from_hg18_to_hg19(chr, coord):
    """
    Convert a coordinate from hg18 to hg19.
    ----------- REMEMBER that LIFT-OVER coordinates are 0-based!!!
    ----------- ADD +1 to obtain a values in 1-based coordinate!!
    :param chr: chromosome name, e.g. 'chr6'
    :param coord: integer, e.g. 10000
    :return: coord in hg19 coordinates system (position of the first liftover hit)
    :raises ValueError: if the coordinate has no hg19 liftover
    """
    lo = LiftOver('hg18', 'hg19')
    # NOTE(review): the +1 is applied to the query rather than to the
    # result; kept as is - confirm which convention callers rely on.
    conv = lo.convert_coordinate(chr, int(coord)+1)
    # None (unknown chromosome) or [] (no mapping): report it clearly
    # rather than failing on the subscript below.
    if not conv:
        raise ValueError("no hg19 liftover for %s:%s" % (chr, coord))
    return conv[0][1]
Пример #17
0
def liftover_pickering(df):
    """Lift the start/end columns of a mutation table from hg19 to hg38.

    :param df: pandas DataFrame with ``Chromosome`` (without ``chr``
        prefix), ``Start_position`` and ``End_position`` (1-based).
    :return: the same DataFrame with both position columns replaced by the
        lifted 1-based positions, or ``'NA'`` where no liftover exists (a
        message is printed for each such coordinate).
    """
    lo = LiftOver('hg19', 'hg38')

    def lift(chrom, pos, start, end):
        """Return the lifted 1-based *pos*, or 'NA' after reporting the miss."""
        hits = lo.convert_coordinate(chrom, pos - 1)
        # None (unknown chromosome) and [] (no mapping) are both misses.
        if not hits:
            print(f"Didn't find hg38 coordinate for {chrom}:{start}-{end}")
            return 'NA'
        return hits[0][1] + 1

    chroms = ('chr' + df['Chromosome'].astype(str)).tolist()
    startpos = df['Start_position'].tolist()
    endpos = df['End_position'].tolist()
    new_startpos = []
    new_endpos = []
    for chrom, start, end in zip(chroms, startpos, endpos):
        new_startpos.append(lift(chrom, start, start, end))
        new_endpos.append(lift(chrom, end, start, end))
    df['Start_position'] = new_startpos
    df['End_position'] = new_endpos
    return df
Пример #18
0
# Chain file loaded on first use and then shared by every call.
_HG38_TO_HG19 = None


def pyliftover(hg38_chrom, hg38_coord):
    """Return the hg19 counterpart of an hg38 coordinate, with caching.

    Results are memoised in ``pyliftover_dict`` under ``'chrom:coord'``.

    :param hg38_chrom: chromosome name as used in the chain file.
    :param hg38_coord: position (cast to int for the lookup).
    :return: dict with ``'chrom'`` and ``'coord'`` (a string) of the first
        liftover hit.
    :raises KeyError: if the coordinate has no liftover.
    """
    global _HG38_TO_HG19
    hg38_key = '%s:%s' % (hg38_chrom, hg38_coord)

    if hg38_key not in pyliftover_dict:
        if _HG38_TO_HG19 is None:
            # Parsing the chain file is expensive; do it once, not once
            # per cache miss.
            _HG38_TO_HG19 = LiftOver(config.input_dir +
                                     'hg38ToHg19.over.chain.gz')
        result = _HG38_TO_HG19.convert_coordinate(hg38_chrom,
                                                  int(hg38_coord))

        # None (unknown chromosome) and [] (no mapping) are both misses
        # and surface as the same KeyError.
        if not result:
            raise KeyError(hg38_key)

        chrom, coord = result[0][0], result[0][1]
        pyliftover_dict[hg38_key] = {'chrom': chrom, 'coord': str(coord)}

    return pyliftover_dict[hg38_key]
def liftover(pos, chro, from_assembly, to_assembly):
    """
        LiftOver a specific coordinate between assemblies using the UCSC LiftOver tool

        NOTE:   pyLiftover uses base 0, whereas coordinate system uses base 1
                therefore position 27107251 is actually 27107250 in pyLiftover

        :param pos: position to convert (cast to int for the lookup)
        :param chro: chromosome name without the 'chr' prefix
        :param from_assembly: source assembly name
        :param to_assembly: target assembly name
        :return: `pos` unchanged when both assemblies are equal, otherwise
                 the position of the first liftover hit
        :raises ValueError: if the position has no liftover
        """
    if from_assembly == to_assembly:
        return pos

    chro = 'chr' + str(chro)
    pos = int(pos)

    lo = LiftOver(from_assembly, to_assembly)
    # NOTE(review): despite the base-0 remark above, pos is passed and
    # returned unshifted - confirm what callers expect before changing.
    out = lo.convert_coordinate(chro, pos)

    # None (unknown chromosome) or [] (no mapping): report it clearly
    # rather than failing on the subscript below.
    if not out:
        raise ValueError("no %s liftover for %s:%d" % (to_assembly, chro, pos))

    return out[0][1]
Пример #20
0
    def liftover(self, chromosome, position, build='hg19'):
        """Lift an hg38 coordinate to *build* and return the first hit.

        :param chromosome: chromosome name as used in the chain file.
        :param position: position handed unchanged to the liftover lookup.
        :param build: target genome build.
        :return: ``(new_chromosome, new_position)``.
        :raises ValueError: if the position has no liftover.
        """

        # todo
        # Changing the chromosome and position messes up the key as well.  Could probably fix that.  But i don't have
        # the ref and alt alleles on hand and I don't want to parse them out of chromosomeHgvsName.

        lo = LiftOver('hg38', build)
        lifted = lo.convert_coordinate(chromosome, position)

        # The lookup gives None for an unknown chromosome and [] when the
        # position does not map; fail with a message that says so.
        if not lifted:
            raise ValueError("no %s liftover for %s:%s" %
                             (build, chromosome, position))

        new_chromosome, new_position = lifted[0][0], lifted[0][1]

        if self.debug:
            print("%s %s -> %s %s" % (chromosome, position, new_chromosome, new_position))

        return new_chromosome, new_position
Пример #21
0
def main(coords, orig_assembly, new_assembly, chainfile, outfh):
    """Lift every ``chrom:pos`` locus and hand the rows to ``print_results``.

    Each result row is ``(chrom, pos)`` followed by the fields of the first
    liftover hit.  Any failure writes the offending locus to stderr and is
    re-raised.

    NOTE(review): ``chainfile`` is accepted but not used by this function.
    """
    converter = LiftOver(orig_assembly, new_assembly)

    def lift_locus(locus):
        """Split one locus and prepend its parts to the first hit."""
        name, position = locus.split(':')
        # The lookup needs the position as an int, not as text.
        first_hit = converter.convert_coordinate(name, int(position))[0]
        return (name, position) + first_hit

    lifted_rows = []
    for locus in coords:
        try:
            lifted_rows.append(lift_locus(locus))
        except BaseException:
            # Name the input that failed, then let the error propagate.
            sys.stderr.write('Offending coord: %s' % locus)
            raise

    print_results(lifted_rows, outfh)
Пример #22
0
def main(coords, orig_assembly, new_assembly, chainfile, outfh):
    """Convert ``chrom:pos`` loci between assemblies and print the results.

    For each locus a tuple of the original chromosome, the original
    position text and the fields of the first liftover hit is collected;
    the collection is passed to ``print_results`` together with ``outfh``.
    When a locus cannot be processed it is written to stderr and the
    exception is re-raised.

    NOTE(review): ``chainfile`` is accepted but not used by this function.
    """
    mapper = LiftOver(orig_assembly, new_assembly)
    rows = []

    for item in coords:
        try:
            chrom_name, pos_text = item.split(':')
            # The position must be numeric for the lookup.
            hit = mapper.convert_coordinate(chrom_name, int(pos_text))[0]
            row = (chrom_name, pos_text) + hit
            rows.append(row)
        except BaseException:
            # Report which input broke before propagating the error.
            sys.stderr.write('Offending coord: %s' % item)
            raise

    print_results(rows, outfh)
Пример #23
0
Usage: lift_over.py <from-build> <to-build>

stdin line format: chrom bp_in_from_build
stdout line format: bp_in_to_build, or '-' if not found

Created on February 19, 2014
@author: Oren Livne <*****@*****.**>
============================================================
'''
import sys, traceback, util
from pyliftover import LiftOver

if __name__ == '__main__':
    # Reads "chrom bp" pairs from stdin and prints one result line per pair.
    try:
        src, target = sys.argv[1:3]
        # No chain is needed when source and target build are identical.
        lifter = None if src == target else LiftOver(src, target)
        for raw_line in sys.stdin:
            chrom, bp = raw_line.strip().split(' ')
            position = int(bp)
            if lifter is None:
                # Same build: echo the position twice.
                print('%d %d' % (position, position))
                continue
            hits = lifter.convert_coordinate('chr' + chrom, position)
            if hits:
                print('%d' % (hits[0][1],))
            else:
                print('-')
    except:
        # Any failure: dump the traceback to stdout and exit with the
        # project's failure code.
        traceback.print_exc(file=sys.stdout)
        sys.exit(util.EXIT_FAILURE)
Пример #24
0
# Adds hg19 coordinates (hg19chr / hg19pos) to every record of the
# fasttrack.gwas collection that has a chromosome and a numeric position,
# then prints how many records could not / could be lifted.
mongo_client = MongoClient()
db = mongo_client.fasttrack

lo = LiftOver("hg38ToHg19.over.chain.gz")

unmatched = 0
matched = 0
for r in db.gwas.find():
    chrid = r["chr_id"]
    chrpos = r["chr_pos"]
    if not (chrid and chrpos):
        continue
    try:
        _chrpos = int(chrpos)
    except (TypeError, ValueError):
        # Non-numeric positions cannot be lifted; leave the record as is.
        continue

    # pyliftover is 0-based, so shift the position in here ...
    lifted = lo.convert_coordinate("chr%s" % chrid, _chrpos - 1)
    if not lifted:
        unmatched += 1
        continue

    new_chrid = lifted[0][0].split("chr")[1]
    # ... and back out here, so hg19pos uses the same convention as chr_pos
    # (the stored value used to be one too small).
    new_chrpos = lifted[0][1] + 1
    matched += 1
    db.gwas.update_many(
        {"chr_id": chrid, "chr_pos": chrpos},
        {"$set": {"hg19chr": new_chrid, "hg19pos": new_chrpos}},
    )

print(unmatched, matched)
Пример #25
0
def read_gwas(args, filename, report=None):
    """Stream the rows of a GWAS summary-statistics file.

    The first line of the file is treated as the header. Column positions
    are taken from the ``gwas:<name>`` entries of ``args`` when given and are
    otherwise auto-detected from the module-level ``GWAS_H_*`` option lists.
    If the build detected for the file differs from ``args['gen:build']``,
    every position is converted with pyliftover.

    Rows with an unexpected column count, rows whose numeric fields cannot
    be parsed, and rows that cannot be lifted are skipped; when ``report`` is
    given they are passed to ``log_error``. A summary of the skipped rows is
    printed once reading stops.

    :param args: option mapping; the ``gwas:*``, ``gen:build`` and
        ``header_only`` keys are read.
    :param filename: path handed to ``fopen``.
    :param report: optional object forwarded to ``log_error``.
    :return: generator of ``((chr, bp), GWASRow)``; ``chr`` is zero-filled
        to two characters and ``bp`` is a string.

    The process exits (``exit``) when a required header or the genome build
    cannot be determined, and on negative sample sizes or odds ratios.
    """
    liftover = None
    # Defined up front so the interrupt handlers below can always report it.
    lineno = 0
    wrong_column_count = float_conv_failed = yes = no = 0
    desc = {}
    default_p, default_std = args['gwas:default:p'], args['gwas:default:se']
    default_n, default_chr = args['gwas:default:n'], args['gwas:default:chr']
    default_beta = args['gwas:default:beta']

    def select(name, options, fail=True):
        """Return the header index for *name*, or None if it is not found.

        An explicit ``--gwas:<name>`` choice wins; otherwise the first entry
        of *options* that matches a header (case-insensitively) is used.
        With ``fail`` set and no ``gwas:default:<name>`` value, a missing
        column terminates the program.
        """
        option_name = 'gwas:' + name
        if not args[option_name] is None:
            desc[name] = args[option_name]
        if name in desc:
            try:
                return header.index(desc[name])
            except ValueError:
                # list.index signals a missing value with ValueError.
                print('Specified header (--gwas:' + name,
                      args[option_name] + ') not found.')
                exit(1)
        for option in options:
            header_upper = list(map(str.upper, header))
            if option.upper() in header_upper:
                desc[name] = option
                return header_upper.index(option.upper())
        if fail and not args.get('gwas:default:' + name):
            print('Could not find a header in GWAS for', name)
            print('  specify with --' + option_name)
            print('suggestions:')
            for part in header:
                print(' * --' + option_name, part)
            exit(1)

    try:
        with fopen(filename) as f:
            for lineno, line in enumerate(f, 1):
                if lineno == 1:
                    # --- header line: work out build and column layout ---
                    if not args['gwas:build'] is None:
                        desc['build'] = args['gwas:build']
                    elif any(hint in line for hint in GWAS_HG19_HINTS):
                        desc['build'] = 'hg19'
                    elif any(hint in line for hint in GWAS_HG18_HINTS):
                        desc['build'] = 'hg18'
                    if '\0' in line:  # iibdgc-trans-ancestry-filtered-summary-stats.tgz contains zero byte garbage
                        garbage_end = line.rindex('\0')
                        line = line[garbage_end + 1:]
                    if args['gwas:header:remove']:
                        line = line.replace(args['gwas:header:remove'], '')
                    header = line.split(args['gwas:sep'])
                    # Position is either one combined chr:bp column or two
                    # separate columns.
                    hpos = select('chr_bp',
                                  GWAS_H_CHR_AND_BP_COMB_OPTIONS,
                                  fail=False)
                    if hpos is None:
                        postype_combined = False
                        hpos_ch = select('chr', GWAS_H_CHR_OPTIONS)
                        hpos_bp = select('bp', GWAS_H_BP_OPTIONS)
                    else:
                        postype_combined = True
                    href = select('effect', GWAS_H_EFF_OPTIONS)
                    hoth = select('other', GWAS_H_OTH_OPTIONS)
                    hfreq = select('freq', GWAS_H_FREQ_OPTIONS)
                    hse = select('se', GWAS_H_SE_OPTIONS)
                    hp = select('p', GWAS_H_PVALUE_OPTIONS)
                    if args['gwas:beta'] is not None:
                        hb = select('beta', GWAS_H_BETA_OPTIONS)
                    elif args['gwas:or'] is not None:
                        # Effect size is given as an odds ratio; hb is None
                        # marks that the log of column hor must be used.
                        hb = None
                        hor = select('or', [])
                    else:
                        hb = select(
                            'beta',
                            GWAS_H_BETA_OPTIONS)  # select default or fail
                    # Sample size: explicit columns, a total column, or
                    # cases + controls.
                    if not args['gwas:n'] is None:
                        hn = [
                            header.index(col)
                            for col in args['gwas:n'].split(',')
                        ]
                        desc['n'] = '+'.join(args['gwas:n'].split(','))
                    elif any(col in header for col in GWAS_H_NTOTAL_OPTIONS):
                        ncol = next(col_ for col_ in GWAS_H_NTOTAL_OPTIONS
                                    if col_ in header)
                        desc['n'] = ncol
                        hn = [header.index(ncol)]
                    elif (any(col in header for col in GWAS_H_NCASE_OPTIONS)
                          and any(col in header
                                  for col in GWAS_H_NCONTROL_OPTIONS)):
                        ncol_a = next(col_ for col_ in GWAS_H_NCASE_OPTIONS
                                      if col_ in header)
                        ncol_b = next(col_ for col_ in GWAS_H_NCONTROL_OPTIONS
                                      if col_ in header)
                        desc['n'] = ncol_a + '+' + ncol_b
                        hn = [header.index(ncol_a), header.index(ncol_b)]
                    elif not args['gwas:default:n']:
                        print(
                            'Could not find a header in GWAS for the number of samples, or the number of cases and controls.'
                        )
                        exit(1)
                    else:
                        hn = None
                    if 'build' not in desc:
                        print(
                            'Could not determine GWAS genome build; use flag --gwas:build <BUILD>.'
                        )
                        exit(1)
                    if desc['build'] != args['gen:build']:
                        liftover = LiftOver(desc['build'], args['gen:build'])
                        print('converting', desc['build'], '->',
                              args['gen:build'])
                    print('= Detected headers =')
                    for k, v in args.items():
                        if k.startswith('gwas:default') and v:
                            # k[13:] drops the 'gwas:default:' prefix.
                            desc[k[13:]] = 'DEFAULT ' + v
                    for k, v in desc.items():
                        print(k.ljust(10), v)
                    if args['header_only']:
                        exit(0)
                    print('= Converting =')
                    reporter = ReporterLine('Reading gwas data.')
                    continue

                # --- data line ---
                parts = line.split(args['gwas:sep'])
                if len(parts) != len(header):
                    # MDD switches halfway to a different format for a small number of non-significant SNPs
                    if report:
                        log_error(report, 'wrong_column_count', gwas=parts)
                    wrong_column_count += 1
                    continue
                if postype_combined:
                    ch, bp, *_ = parts[hpos].split(
                        ':', 2)  # Some append :<SNP>/:<INDEL>, just ignore
                    if default_chr:
                        print(
                            'Default chromosome specified but reading chr:bp column.'
                        )
                        exit(1)
                else:
                    ch = default_chr or parts[hpos_ch]
                    bp = parts[hpos_bp]
                try:
                    if default_n:
                        n = default_n
                    else:
                        # Each column is rounded to the nearest integer.
                        n = sum(int(float(parts[col]) + 0.5) for col in hn)
                        # some GWASs default to n=-9, which is then picked up
                        # by the header autodetector as valid data..
                        if n < 0:
                            print('Negative N!!!')
                            exit(1)
                except ValueError:
                    n = 'NA'

                # Start from this row's raw text so that a failed conversion
                # is reported with the row's own values, never with a value
                # left over from an earlier row (or an unbound name).
                gwas_freq = parts[hfreq]
                if default_beta:
                    gwas_beta = default_beta
                elif hb is None:
                    gwas_beta = parts[hor]
                else:
                    gwas_beta = parts[hb]
                try:
                    if not default_beta:
                        if hb is None:
                            or_ = float(gwas_beta)
                            if or_ < 0:
                                print('negative ODDS ratio. is this a beta?')
                                exit(1)
                            gwas_beta = math.log(or_)
                        else:
                            gwas_beta = float(gwas_beta)
                    gwas_freq = float(gwas_freq)
                except ValueError:
                    row = GWASRow(parts[href].upper(), parts[hoth].upper(),
                                  gwas_freq, gwas_beta, default_std
                                  or parts[hse], default_p or parts[hp],
                                  lineno, ch, bp, n)
                    if report:
                        log_error(report, 'gwas_float_conv_failed', gwas=row)
                    float_conv_failed += 1
                    continue
                row = GWASRow(parts[href].upper(), parts[hoth].upper(),
                              gwas_freq, gwas_beta, default_std or parts[hse],
                              default_p or parts[hp], lineno, ch, bp, n)

                # Normalise the chromosome label: upper case, no CHR prefix,
                # no leading zeros, then the project's letter conversion.
                ch = ch.upper()
                if ch.startswith('CHR'):
                    ch = ch[3:]
                ch = ch.lstrip('0')
                ch = conv_chr_letter(ch)
                if liftover:
                    conv = liftover.convert_coordinate('chr' + ch, int(bp))
                    if conv:
                        # Only the first hit is used.
                        ch, bp, _, _ = conv[0]
                        bp = str(bp)
                        if ch.startswith('chr'):
                            ch = ch[3:]
                        yes += 1
                    else:
                        no += 1
                        if report:
                            log_error(report,
                                      'gwas_build_conv_failed',
                                      gwas=row)
                        continue
                ch = ch.zfill(2)
                yield (ch, bp), row
                if lineno % 40000 == 0:
                    reporter.update(lineno, f.fileno())
    except KeyboardInterrupt:
        print('Aborted reading gwas data at line', lineno)
    except UnicodeDecodeError:
        # IBD turns into gibberish after 95%, we can probably discard that
        print('UnicodeDecodeError, aborted reading gwas data at line', lineno)
    if liftover:
        print('Successfully', desc['build'], '->', args['gen:build'],
              'converted', yes, 'rows')
        print('Build conversion failed for', no,
              'rows (reported as gwas_build_conv_failed).')
    if float_conv_failed:
        print('Numeric conversion failed for', float_conv_failed,
              'rows (reported as gwas_float_conv_failed).')
    if wrong_column_count:
        print('Invalid number of columns for', wrong_column_count,
              'rows (reported as wrong_column_count).')
        print()
    print()
Пример #26
0
    else:
        # Sometimes it includes multiple dbSNP/rsID combinations, so always
        # take the most recent rsID (the rightmost one).
        rsid = cols[7].split(":")[-1]
    phase = [x[0] for x in cols[8:]]
    allele1 = [x[1] for x in cols[8:]]
    allele2 = [x[2] for x in cols[8:]]

    # Only report SNPs
    if type != "snp":
        continue
    log_snps += 1

    # Convert hg19 coordinate to hg38 using pyliftover
    # CGI is 0-based, so feed the start position to pyliftover
    lo_result = lo.convert_coordinate("chr" + chr, int(start))

    # Unable to convert from hg19 to hg38
    if len(lo_result) == 0:
        log_nonconvert += 1
        sys.stderr.write("No conversion\t%s\t%s\t%s\n" % (chr, start, rsid))
        continue

    # Multiple coordinates on hg38
    if len(lo_result) > 1:
        log_multi += 1
        sys.stderr.write("Multiple coordinates\t%s\t%s\t%s\n" %
                         (chr, start, rsid))
        continue

    lo_chr, lo_pos = lo_result[0][:2]
Пример #27
0
def liftover(hgvs_genomic,
             build_from,
             build_to,
             hn,
             reverse_normalizer,
             evm,
             validator,
             specify_tx=False,
             liftover_level=False,
             g_to_g=False):
    """
    Lift a genomic HGVS variant from one genome build to another.

    Step 1, attempt to liftover using a common RefSeq transcript
    Step 2, attempt to liftover using PyLiftover.
    Lift position > Check bases > Lift back and confirm the original position

    Step 2 only runs when step 1 added no data (or was skipped).

    :param hgvs_genomic: genomic variant description; it is passed through
        ``validator.hp.parse`` and a TypeError from that call is logged and
        ignored (presumably because the value is already parsed - confirm)
    :param build_from: name of the source build; names containing 'GRC' are
        matched on '37'/'38', any other name on '19'/'38'
    :param build_to: name of the target build, interpreted like build_from
    :param hn: normalizer; ``hn.normalize`` is applied to the lifted variant
    :param reverse_normalizer: forwarded to ``hgvs_utils.report_hgvs2vcf``
    :param evm: object providing ``relevant_transcripts``; may be None
    :param validator: Validator obj (its hp, hdp, vm and sf attributes are used)
    :param specify_tx: when not False, the only transcript tried in step 1
    :param liftover_level: when 'primary', only NC_ accessions are considered
        in step 1; otherwise NT_ and NW_ accessions are considered as well
    :param g_to_g: when True, step 1 is skipped entirely
    :return: dict keyed by lower-cased build name (both the requested name and
        its GRC/UCSC counterpart), then by accession, holding
        'hgvs_genomic_description' and a 'vcf' dict (chr, pos, ref, alt)
    """

    try:
        hgvs_genomic = validator.hp.parse(hgvs_genomic)
    except TypeError as e:
        logger.debug("Except passed, %s", e)

    # Create return dictionary
    lifted_response = {}
    # Check genome build type
    # from_set/alt_from_set are the keys of the vcf dict that hold the
    # chromosome name in the requested naming scheme and in the other one;
    # lo_from is the build name handed to pyliftover.
    if 'GRC' in build_from:
        from_set = 'grc_chr'
        alt_from_set = 'ucsc_chr'
        if '37' in build_from:
            lo_from = 'hg19'
            alt_build_from = 'hg19'
        elif '38' in build_from:
            lo_from = 'hg38'
            alt_build_from = 'hg38'
        else:
            lo_from = ''
            alt_build_from = ''

    else:
        from_set = 'ucsc_chr'
        alt_from_set = 'grc_chr'
        if '19' in build_from:
            lo_from = 'hg19'
            alt_build_from = 'GRCh37'
        elif '38' in build_from:
            lo_from = 'hg38'
            alt_build_from = 'GRCh38'
        else:
            lo_from = ''
            alt_build_from = ''

    # Same classification for the target build.
    if 'GRC' in build_to:
        to_set = 'grc_chr'
        alt_to_set = 'ucsc_chr'
        if '37' in build_to:
            lo_to = 'hg19'
            alt_build_to = 'hg19'
        elif '38' in build_to:
            lo_to = 'hg38'
            alt_build_to = 'hg38'
        else:
            lo_to = ''
            alt_build_to = ''
    else:
        to_set = 'ucsc_chr'
        alt_to_set = 'grc_chr'
        if '19' in build_to:
            lo_to = 'hg19'
            alt_build_to = 'GRCh37'
        elif '38' in build_to:
            lo_to = 'hg38'
            alt_build_to = 'GRCh38'
        else:
            lo_to = ''
            alt_build_to = ''

    # populate the variant from data
    vcf = hgvs_utils.report_hgvs2vcf(hgvs_genomic, build_from,
                                     reverse_normalizer, validator.sf)

    # Create to and from dictionaries
    # The source variant is recorded under both names of the source build.
    lifted_response[build_from.lower()] = {}
    lifted_response[build_from.lower()][hgvs_genomic.ac] = {
        'hgvs_genomic_description': mystr(hgvs_genomic),
        'vcf': {
            'chr': vcf[from_set],
            'pos': str(vcf['pos']),
            'ref': vcf['ref'],
            'alt': vcf['alt']
        }
    }
    lifted_response[alt_build_from.lower()] = {}
    lifted_response[alt_build_from.lower()][hgvs_genomic.ac] = {
        'hgvs_genomic_description': mystr(hgvs_genomic),
        'vcf': {
            'chr': vcf[alt_from_set],
            'pos': str(vcf['pos']),
            'ref': vcf['ref'],
            'alt': vcf['alt']
        }
    }
    # The target-build dictionaries start out empty
    lifted_response[build_to.lower()] = {}
    lifted_response[alt_build_to.lower()] = {}

    # Get a list of overlapping RefSeq transcripts
    # Note, due to 0 base positions in UTA (I think) occasionally tx will
    # be identified that cannot be mapped to.
    rts_list = validator.hdp.get_tx_for_region(
        hgvs_genomic.ac, 'splign', hgvs_genomic.posedit.pos.start.base - 1,
        hgvs_genomic.posedit.pos.end.base)  #- 1)
    rts_dict = {}
    # tx_list stays False when no transcript is available (or g_to_g is set),
    # which sends the function straight to the pyliftover step.
    tx_list = False
    if g_to_g is True:
        pass
    else:
        # rts_dict is used as an ordered set of transcript identifiers.
        for tx_dat in rts_list:
            rts_dict[tx_dat[0]] = True
        if evm is not None:
            rts_list_2 = evm.relevant_transcripts(hgvs_genomic)
        else:
            rts_list_2 = []
        for tx_dat_2 in rts_list_2:
            rts_dict[tx_dat_2] = True
        if rts_dict != {}:
            tx_list = list(rts_dict.keys())

    # Try to liftover
    if tx_list is not False:
        # selected collects [transcript, genomic accession] pairs.
        selected = []
        # Liftover via a specific tx if it can be done!
        if specify_tx is not False:
            tx_list = [specify_tx]
        for tx in tx_list:
            # identify the first transcript if any
            options = validator.hdp.get_tx_mapping_options(tx)
            for op in options:
                # sfm ends up holding the result of the last lookup that ran;
                # the build_from lookups run after the build_to ones and
                # therefore overwrite them.
                sfm = None
                if op[1].startswith('NC_'):
                    if build_to.startswith('GRC'):
                        sfm = seq_data.to_chr_num_refseq(op[1], build_to)
                    if build_to.startswith('hg'):
                        sfm = seq_data.to_chr_num_ucsc(op[1], build_to)
                    if build_from.startswith('GRC'):
                        sfm = seq_data.to_chr_num_refseq(op[1], build_from)
                    if build_from.startswith('hg'):
                        sfm = seq_data.to_chr_num_ucsc(op[1], build_from)
                    if sfm is not None:
                        selected.append([op[0], op[1]])
                if liftover_level == 'primary':
                    continue
                else:
                    if op[1].startswith('NT_'):
                        if build_to.startswith('GRC'):
                            sfm = seq_data.to_chr_num_refseq(op[1], build_to)
                        if build_to.startswith('hg'):
                            sfm = seq_data.to_chr_num_ucsc(op[1], build_to)
                        if build_from.startswith('GRC'):
                            sfm = seq_data.to_chr_num_refseq(op[1], build_from)
                        if build_from.startswith('hg'):
                            sfm = seq_data.to_chr_num_ucsc(op[1], build_from)
                        if sfm is not None:
                            selected.append([op[0], op[1]])
                    if op[1].startswith('NW_'):
                        if build_to.startswith('GRC'):
                            sfm = seq_data.to_chr_num_refseq(op[1], build_to)
                        if build_to.startswith('hg'):
                            sfm = seq_data.to_chr_num_ucsc(op[1], build_to)
                        if build_from.startswith('GRC'):
                            sfm = seq_data.to_chr_num_refseq(op[1], build_from)
                        if build_from.startswith('hg'):
                            sfm = seq_data.to_chr_num_ucsc(op[1], build_from)
                        if sfm is not None:
                            selected.append([op[0], op[1]])

        # remove duplicate chroms
        # filtered_1 maps genomic accession -> first transcript seen for it.
        filtered_1 = {}
        if selected:
            for chroms in selected:
                if chroms[1] not in list(filtered_1.keys()):
                    filtered_1[chroms[1]] = chroms[0]
            added_data = False
            for key, val in list(filtered_1.items()):
                try:
                    # Note, due to 0 base positions in UTA (I think) occasionally tx will
                    # be identified that cannot be mapped to.
                    # In this instance, do not mark added data as True
                    # Project genomic -> transcript -> genomic accession `key`.
                    hgvs_tx = validator.vm.g_to_t(hgvs_genomic, val)
                    hgvs_alt_genomic = validator.vm.t_to_g(hgvs_tx, key)
                    alt_vcf = hgvs_utils.report_hgvs2vcf(
                        hgvs_alt_genomic, build_to, reverse_normalizer,
                        validator.sf)

                    # Add the to build dictionaries
                    lifted_response[build_to.lower()][hgvs_alt_genomic.ac] = {
                        'hgvs_genomic_description': mystr(hgvs_alt_genomic),
                        'vcf': {
                            'chr': alt_vcf[to_set],
                            'pos': str(alt_vcf['pos']),
                            'ref': alt_vcf['ref'],
                            'alt': alt_vcf['alt']
                        }
                    }
                    lifted_response[alt_build_to.lower()][
                        hgvs_alt_genomic.ac] = {
                            'hgvs_genomic_description':
                            mystr(hgvs_alt_genomic),
                            'vcf': {
                                'chr': alt_vcf[alt_to_set],
                                'pos': str(alt_vcf['pos']),
                                'ref': alt_vcf['ref'],
                                'alt': alt_vcf['alt']
                            }
                        }
                    # Overwrite build from info as PAR may require additional info
                    # NOTE(review): these entries are filed under the source
                    # build but use alt_vcf (computed with build_to) and the
                    # to_set/alt_to_set chromosome keys - confirm intended.
                    lifted_response[build_from.lower()][
                        hgvs_alt_genomic.ac] = {
                            'hgvs_genomic_description':
                            mystr(hgvs_alt_genomic),
                            'vcf': {
                                'chr': alt_vcf[to_set],
                                'pos': str(alt_vcf['pos']),
                                'ref': alt_vcf['ref'],
                                'alt': alt_vcf['alt']
                            }
                        }
                    lifted_response[alt_build_from.lower()][
                        hgvs_alt_genomic.ac] = {
                            'hgvs_genomic_description':
                            mystr(hgvs_alt_genomic),
                            'vcf': {
                                'chr': alt_vcf[alt_to_set],
                                'pos': str(alt_vcf['pos']),
                                'ref': alt_vcf['ref'],
                                'alt': alt_vcf['alt']
                            }
                        }

                    added_data = True

                except vvhgvs.exceptions.HGVSError:
                    continue

            # Step 1 succeeded for at least one accession: no need for step 2.
            if lifted_response != {} and added_data is not False:
                return lifted_response

    # Note: pyliftover uses the UCSC liftOver tool.
    # https://pypi.org/project/pyliftover/
    # Once validated, download the UCSC liftover files from http://hgdownload.cse.ucsc.edu/goldenPath/hg38/liftOver/
    # The structure of the following code comes from VV pymod, so need to create a list
    genome_builds = [build_to]

    # Create liftover vcf
    from_vcf = hgvs_utils.report_hgvs2vcf(hgvs_genomic, lo_from,
                                          reverse_normalizer, validator.sf)

    lo = LiftOver(lo_from, lo_to)

    # Fix the GRC CHR
    # The chromosome is passed to pyliftover with a 'chr' prefix in both cases.
    if from_vcf[from_set].startswith('chr'):
        liftover_list = lo.convert_coordinate(from_vcf[from_set],
                                              int(from_vcf['pos']))
    else:
        my_chrom = 'chr' + from_vcf[from_set]
        liftover_list = lo.convert_coordinate(my_chrom, int(from_vcf['pos']))

    # Create dictionary
    for lifted in liftover_list:
        chrom = lifted[0]
        pos = lifted[1]
        orientated = lifted[2]

        lifted_ref_bases = from_vcf['ref']
        lifted_alt_bases = from_vcf['alt']

        # Inverted sequence
        # A hit on the other strand needs reverse-complemented alleles.
        if orientated != '+':
            my_seq = Seq(lifted_ref_bases)
            lifted_ref_bases = my_seq.reverse_complement()
            your_seq = Seq(lifted_alt_bases)
            lifted_alt_bases = your_seq.reverse_complement()
        accession = seq_data.to_accession(chrom, lo_to)
        if accession is None:
            wrn = 'Unable to identify an equivalent %s chromosome ID for %s' % (
                str(lo_to), str(chrom))
            logger.info(wrn)
            continue
        else:
            # Describe the lifted variant as a delins spanning the reference
            # allele, then let the normalizer simplify it.
            not_delins = accession + ':g.' + str(pos) + '_' + str((
                pos - 1) + len(lifted_ref_bases)) + 'delins' + lifted_alt_bases
            not_delins = str(not_delins)
            hgvs_not_delins = validator.hp.parse_hgvs_variant(not_delins)

            hgvs_lifted = hn.normalize(hgvs_not_delins)
            # Now try map back
            # `lo` is rebound to the reverse converter here; liftover_list was
            # already computed, so the loop being iterated is unaffected.
            lo = LiftOver(lo_to, lo_from)

            # Lift back
            liftback_list = lo.convert_coordinate(chrom, pos)

            for lifted_back in liftback_list:
                # Pull out the good guys!
                # Need to add chr to the from_set
                if not lifted_back[0].startswith('chr'):
                    my_from_chr = 'chr' + lifted_back[0]
                else:
                    my_from_chr = lifted_back[0]

                # Keep the hit only if lifting back lands on the original
                # chromosome and position.
                if lifted_back[0] == from_vcf[from_set] or lifted_back[
                        0] == my_from_chr:
                    if lifted_back[1] == int(from_vcf['pos']):
                        for build in genome_builds:
                            vcf_dict = hgvs_utils.report_hgvs2vcf(
                                hgvs_lifted, build, reverse_normalizer,
                                validator.sf)
                            # Requested build gets its own chromosome naming,
                            # the counterpart build gets the other one.
                            if build.startswith('GRC'):
                                lifted_response[build_to.lower()][
                                    hgvs_lifted.ac] = {
                                        'hgvs_genomic_description':
                                        mystr(hgvs_lifted),
                                        'vcf': {
                                            'chr': vcf_dict['grc_chr'],
                                            'pos': str(vcf_dict['pos']),
                                            'ref': vcf_dict['ref'],
                                            'alt': vcf_dict['alt']
                                        }
                                    }
                                lifted_response[alt_build_to.lower()][
                                    hgvs_lifted.ac] = {
                                        'hgvs_genomic_description':
                                        mystr(hgvs_lifted),
                                        'vcf': {
                                            'chr': vcf_dict['ucsc_chr'],
                                            'pos': str(vcf_dict['pos']),
                                            'ref': vcf_dict['ref'],
                                            'alt': vcf_dict['alt']
                                        }
                                    }
                            else:
                                lifted_response[build_to.lower()][
                                    hgvs_lifted.ac] = {
                                        'hgvs_genomic_description':
                                        mystr(hgvs_lifted),
                                        'vcf': {
                                            'chr': vcf_dict['ucsc_chr'],
                                            'pos': str(vcf_dict['pos']),
                                            'ref': vcf_dict['ref'],
                                            'alt': vcf_dict['alt']
                                        }
                                    }
                                lifted_response[alt_build_to.lower()][
                                    hgvs_lifted.ac] = {
                                        'hgvs_genomic_description':
                                        mystr(hgvs_lifted),
                                        'vcf': {
                                            'chr': vcf_dict['grc_chr'],
                                            'pos': str(vcf_dict['pos']),
                                            'ref': vcf_dict['ref'],
                                            'alt': vcf_dict['alt']
                                        }
                                    }
    return lifted_response
Пример #28
0
    else:
        # Sometimes it includes multiple dbSNP/rsID combinations, so always
        # take the most recent rsID (the rightmost one).
        rsid = cols[7].split(":")[-1]
    phase = [x[0] for x in cols[8:]]
    allele1 = [x[1] for x in cols[8:]]
    allele2 = [x[2] for x in cols[8:]]

    # Only report SNPs
    if type != "snp":
        continue
    log_snps += 1

    # Convert hg19 coordinate to hg38 using pyliftover
    # CGI is 0-based, so feed the start position to pyliftover
    lo_result = lo.convert_coordinate("chr" + chr, int(start))

    # Unable to convert from hg19 to hg38
    if len(lo_result) == 0:
        log_nonconvert += 1
        sys.stderr.write("No conversion\t%s\t%s\t%s\n"%(chr, start, rsid))
        continue

    # Multiple coordinates on hg38
    if len(lo_result) > 1:
        log_multi += 1
        sys.stderr.write("Multiple coordinates\t%s\t%s\t%s\n"%(chr, start, rsid))
        continue

    lo_chr, lo_pos = lo_result[0][:2]
    def addTSSInfo(self, vcfInputFile, vcfOutputFile='output.vcf'):
        """Copy a VCF file, adding a TSSOL INFO field to each lifted record.

        Every record's start and end are converted with the
        hg38ToHg19 chain file; the converted interval is then looked up at
        the FANTOM5 SPARQL endpoint. TSSOL is set to True when one of the
        returned rows satisfies the overlap test below, otherwise False.
        Records that cannot be lifted are written unchanged, without TSSOL.

        Args:
            vcfInputFile: path of the VCF file to read.
            vcfOutputFile: path of the VCF file to write; defaults to the
                previously hard-coded 'output.vcf'.
        """
        totalVar = 0
        tssOLVar = 0

        # Both files are closed (and the output flushed) even if a record
        # fails half-way through.
        with open(vcfInputFile, 'r') as inputHandle, \
                open(vcfOutputFile, 'w') as outputHandle:
            vcf_reader = vcf.Reader(inputHandle)
            vcf_reader.infos['TSSOL'] = VcfInfo('TSSOL', vcf_field_counts['A'], 'String',
                                                'Info indicates whether the variant overlapping with the'
                                                ' transcription start site(TSS)')

            vcf_writer = vcf.VCFWriter(outputHandle, vcf_reader)

            query = SPARQLQueries.sparqlQueries()

            lo = LiftOver('hg38ToHg19.over.chain.gz')

            for record in vcf_reader:
                variantStart = record.start
                variantEnd = record.end
                variantChromosome = record.CHROM
                variantSubType = record.var_subtype
                isOverlapping = False

                # Adding chr prefix to the chromosome
                if "chr" not in variantChromosome:
                    variantChromosome = "chr" + str(record.CHROM)

                # Liftover of the start coordinate with the hg38ToHg19 chain
                startHits = lo.convert_coordinate(variantChromosome, variantStart)

                print(variantStart)
                print(variantEnd)

                # The end is only looked up when the start could be lifted.
                # A failed lookup may be None or an empty list, so test for
                # truthiness rather than comparing with None.
                endHits = None
                if startHits:
                    endHits = lo.convert_coordinate(variantChromosome, variantEnd)

                if startHits and endHits:
                    # As before, the last hit of each lookup is the one used.
                    startHit = startHits[-1]
                    variantChromosomehg19 = startHit[0]
                    variantStarthg19 = startHit[1]
                    variantEndhg19 = endHits[-1][1]

                    # SPARQL query
                    result = query.getTSS('http://ep.dbcls.jp/fantom5/sparql', variantStarthg19, variantEndhg19, variantChromosomehg19)

                    for row in result:
                        values = sparql.unpack_row(row)
                        cageStart = values[1]

                        if variantSubType == 'ins':
                            overlaps = variantStart > cageStart
                        else:
                            overlaps = cageStart > 0

                        # Stop at the first matching row. Previously a
                        # mis-indented break left the loop after the first
                        # row whether or not it matched.
                        if overlaps:
                            isOverlapping = True
                            tssOLVar = tssOLVar + 1
                            break

                    totalVar = totalVar + 1
                    record.add_info('TSSOL', [isOverlapping])
                else:
                    # record.ID can be None, so convert before concatenating.
                    print("No liftover found for this pos = " + str(record.ID))

                vcf_writer.write_record(record)

                # Running totals, printed after every record.
                print("No of variants = " + str(totalVar))
                print("No of tss overlapping variants = " + str(tssOLVar))
class SubmitHiCLiftOver:
    """Lift ENCODE Hi-C TAD boundary files from hg19 to hg38 and submit them.

    ``runLiftover`` walks the released Hi-C experiments, converts every TAD
    file line by line (``parseLine``) and writes the converted file plus three
    bookkeeping TSV files.  ``runSubmit`` walks the same experiments and
    submits the converted files.
    """

    # Search URL shared by the lift-over pass and the submission pass.
    EXPERIMENTS_URL = "https://www.encodeproject.org/search/?type=Experiment&assay_title=Hi-C&status=released"
    # Directory receiving converted files and bookkeeping tables.
    OUTPUT_DIR = "/home/mjp/tadsLiftOverHg19ToHg38"
    # A region whose length changes by more than this many bases is dropped.
    MAX_LENGTH_CHANGE = 5000

    def __init__(self, args):
        """Store the command-line ``args`` and build the hg19 -> hg38 lifter."""
        self.args = args
        self.doLiftOver = LiftOver('hg19', 'hg38')

        # [oldLen, newLen] of every left region that lifted successfully.
        self.lengths_orig = []
        # [oldLen, newLen] of the left regions whose whole line was kept.
        self.lengths_filtered = []
        # [original line, converted line] for every kept line.
        self.oldVsNew = []

    @staticmethod
    def _splitRegion(raw):
        """Split ``"chrom:start-end"`` into its three text fields."""
        fields = raw.split(':')
        bounds = fields[1].split('-')
        return fields[0], bounds[0], bounds[1]

    def splitStrCoordStr(self, raw):
        """Return ``"chrom:start-end"`` as tab-separated ``chrom, start, end``."""
        return "\t".join(self._splitRegion(raw))

    def splitStrCoord(self, raw):
        """Return ``"chrom:start-end"`` as ``[chrom, int(start), int(end)]``."""
        chrom, start, end = self._splitRegion(raw)
        return [chrom, int(start), int(end)]

    def wrapLiftover(self, debug, chrom, start, end, errMsg):
        """Lift the region ``chrom:start-end`` and sanity-check the result.

        Both end points are converted independently.  Returns
        ``[chromLift, startLift, endLift, oldLen, newLen, absDiff]`` or
        ``None`` when either end point cannot be lifted, the end points land
        on different chromosomes, or the old/new length is smaller than 1.
        When ``debug`` is true the reason is printed, prefixed by ``errMsg``.
        """
        lift_start = self.doLiftOver.convert_coordinate(chrom, start)
        if not lift_start:
            if debug:
                print(errMsg + " start", chrom, start)
            return None
        lift_start = lift_start[0]

        lift_end = self.doLiftOver.convert_coordinate(chrom, end)
        if not lift_end:
            if debug:
                print(errMsg + " end", chrom, end)
            return None
        lift_end = lift_end[0]

        if lift_start[0] != lift_end[0]:
            if debug:
                print(errMsg + " no longer same chrom", chrom, start, end, lift_start[0], lift_end[0])
            return None

        chromLift = lift_start[0]
        startLift = lift_start[1]
        endLift = lift_end[1]
        oldLen = end - start
        newLen = endLift - startLift

        if oldLen < 1:
            if debug:
                print(errMsg + " oldLen: negative!", chrom, start, end)
            return None
        if newLen < 1:
            if debug:
                print(errMsg + " newLen: negative!", chromLift, startLift, endLift)
            return None

        return [chromLift, startLift, endLift, oldLen, newLen, abs(newLen - oldLen)]

    def coordToStr(self, c):
        """Format ``[chrom, start, end, ...]`` as ``"chrom:start-end"``."""
        return c[0] + ':' + str(c[1]) + '-' + str(c[2])

    def parseLine(self, line):
        """Convert one TAD line to hg38; return the new line or ``None``.

        Input layout (whitespace separated)::

            chr10 3240001 4120000 boundary.3|hg19|chr10:3240001-3280000___boundary.4|hg19|chr10:4080001-4120000 1.06

        i.e. ``[chrom, start, end, boundary description, value]``.  The
        description holds either one boundary (3 ``|`` fields) or two
        boundaries joined by ``___`` (5 ``|`` fields).  The line is dropped
        (``None``) when any region fails ``wrapLiftover`` or changes length by
        more than ``MAX_LENGTH_CHANGE``.
        """
        toks = line.split()
        leftCoord = [toks[0], int(toks[1]), int(toks[2])]
        mtoks = toks[3].split('|')
        hasRight = 3 != len(mtoks)
        midBoundaryLeft = mtoks[0]

        # The two boundaries are joined by THREE underscores.  Splitting on two
        # left a stray "_" on the right boundary name, which then produced a
        # four-underscore separator in the converted line.
        midParts = mtoks[2].split('___')
        midCoord = self.splitStrCoord(midParts[0])
        if hasRight:
            midBoundaryRight = midParts[1]
            rightCoord = self.splitStrCoord(mtoks[-1])

        leftCoordLift = self.wrapLiftover(False, leftCoord[0], leftCoord[1], leftCoord[2], "left")
        if not leftCoordLift:
            return None
        self.lengths_orig.append([leftCoordLift[3], leftCoordLift[4]])
        if leftCoordLift[5] > self.MAX_LENGTH_CHANGE:
            return None

        midCoordLift = self.wrapLiftover(False, midCoord[0], midCoord[1], midCoord[2], "mid")
        if not midCoordLift or midCoordLift[5] > self.MAX_LENGTH_CHANGE:
            return None

        if hasRight:
            rightCoordLift = self.wrapLiftover(False, rightCoord[0], rightCoord[1], rightCoord[2], "right")
            if not rightCoordLift or rightCoordLift[5] > self.MAX_LENGTH_CHANGE:
                return None

        self.lengths_filtered.append([leftCoordLift[3], leftCoordLift[4]])

        if hasRight:
            mid = [midBoundaryLeft, "hg38-liftOver", self.coordToStr(midCoordLift) + '___' + midBoundaryRight,
                   "hg38-liftOver", self.coordToStr(rightCoordLift)]
        else:
            mid = [midBoundaryLeft, "hg38-liftOver", self.coordToStr(midCoordLift)]

        ret = "\t".join([str(x) for x in leftCoordLift[:3] + ['|'.join(mid)] + [toks[4]]])
        self.oldVsNew.append([line, ret])
        return ret

    def tmpFile(self, accession, assembly, prefix):
        """Return the path of the converted ``.bed.gz`` file for ``accession``."""
        return os.path.join("/home/mjp/tadsLiftOverHg19ToHg38",
                            assembly + "_liftOver_" + prefix + '_' + accession + ".bed.gz")

    def parseOutFile(self, accession, fnp):
        """Convert the gzipped TAD file ``fnp`` and report kept/dropped counts."""
        good = 0
        bad = 0
        # NOTE(review): lines are read and written through binary gzip handles
        # while parseLine works with str separators -- presumably this runs
        # where gzip yields str (Python 2); confirm before porting.
        with gzip.open(fnp) as f:
            with gzip.open(self.tmpFile(accession, 'hg38', 'point'), 'wb') as outF:
                for line in f:
                    newLine = self.parseLine(line)
                    if newLine:
                        outF.write(newLine + '\n')
                        good += 1
                    else:
                        bad += 1
        print("lifted:", accession, good, bad)

    def _writeLengths(self, name, rows):
        """Write ``rows`` as tab-separated lines to ``OUTPUT_DIR/name``."""
        fnp = os.path.join(self.OUTPUT_DIR, name)
        with open(fnp, 'w') as f:
            for r in rows:
                f.write('\t'.join([str(x) for x in r]) + '\n')
        print("wrote", fnp)

    def runLiftover(self):
        """Convert every TAD file of every experiment, then dump bookkeeping."""
        mc = MemCacheWrapper()
        qd = QueryDCC(cache=mc)

        for exp in qd.getExps(self.EXPERIMENTS_URL):
            for f in exp.getTADs():
                f.download()
                self.parseOutFile(f.fileID, f.fnp())

        self._writeLengths("lengths_orig.tsv", self.lengths_orig)
        self._writeLengths("lengths_filtered.tsv", self.lengths_filtered)

        fnp = os.path.join(self.OUTPUT_DIR, "oldVsNew.tsv")
        with open(fnp, 'w') as f:
            for r in self.oldVsNew:
                # r[0] is the raw input line and is written as-is; only the
                # converted line gets a newline appended.
                f.write(r[0])
                f.write(r[1] + '\n')
        print("wrote", fnp)

    def fileJson(self, exp, f, fnp):
        """Build the submission metadata for the converted file ``fnp``."""
        return {
            "dataset": exp.encodeID,
            "file_format": "bed",
            "file_format_type": "bed3+",
            "file_size": os.path.getsize(fnp),
            "md5sum": Utils.md5(fnp),
            "output_type": f.output_type,
            "assembly": "GRCh38",
            "award": "/awards/U41HG007000/",
            "lab": "/labs/zhiping-weng/",
            "derived_from": [f.fileID],
            "submitted_file_name": fnp,
            "aliases": ["zhiping-weng:hic-tad-hg38-liftOver-" + f.fileID]
        }

    def submitFile(self, exp, f):
        """Submit the converted counterpart of the TAD file ``f``."""
        fileAccession = f.fileID
        fnp = self.tmpFile(fileAccession, 'hg38', 'point')
        j = self.fileJson(exp, f, fnp)
        print(j)
        # Resolves to the module-level submitFile() helper, not this method.
        submitFile(self.args, j)

    def runSubmit(self):
        """Authenticate, then submit the converted file of every TAD file."""
        authenticateEncodeTxt(self.args)

        mc = MemCacheWrapper()
        qd = QueryDCC(cache=mc)

        for exp in qd.getExps(self.EXPERIMENTS_URL):
            for f in exp.getTADs():
                f.download()
                self.submitFile(exp, f)
Пример #31
0
# Map both alias columns (0 and 1) of the mapping file to the name in column 2.
mapping_dict = {}
with open(mapping_file, "r") as map_f:
    for line in map_f:
        line_p = line.rstrip("\n").split("\t")
        mapping_dict[line_p[0]] = line_p[2]
        mapping_dict[line_p[1]] = line_p[2]
print(mapping_dict)
enhancer_files = glob.glob(enhancer_dir+"/*.txt")
print(enhancer_files)

# Rewrite every enhancer file: keep only rows whose 7th column has a mapping,
# replace that column with the mapped name and lift the start/end positions.
for e_file in enhancer_files:
    if e_file.endswith(".py") or e_file.endswith("mapping"):
        continue
    with open(e_file, "r") as e:
        e_file_name = e_file.split("/")[-1]
        with open(processed_dir+"converted_"+e_file_name, "w") as p:
            for line in e:
                p_line = line.rstrip("\n").split("\t")
                print(p_line)
                candidate = p_line[6]
                if candidate not in mapping_dict:
                    continue

                # convert_coordinate returns a list of (chrom, pos, strand,
                # score) hits, None for an unknown chromosome, or [] when the
                # locus has no counterpart; only the position belongs in the
                # output column, not the repr of the whole list.
                start_hits = lo.convert_coordinate(p_line[0], int(p_line[1]))
                end_hits = lo.convert_coordinate(p_line[0], int(p_line[2]))
                if not start_hits or not end_hits:
                    print("No liftover found, skipping " + "\t".join(p_line[0:3]))
                    continue
                if start_hits[0][0] != end_hits[0][0]:
                    print("Region split across chromosomes, skipping " + "\t".join(p_line[0:3]))
                    continue

                new_line = p_line[0:6] + [mapping_dict[candidate]]
                new_line[0] = start_hits[0][0]
                new_line[1] = str(start_hits[0][1])
                new_line[2] = str(end_hits[0][1])
                p.write("\t".join(new_line) + "\n")

Пример #32
0
#!/usr/bin/env python3
import pandas as pd
from pyliftover import LiftOver

File = "p-Value_threshold_1_hapmap3_all_variant_effect_non_zero_GRCh37.txt"
Input_file = pd.read_csv(File, index_col=None, header=None, sep=" ")
lo = LiftOver('hg19', 'hg38')
# Columns 6 and 7 receive the hg38 counterparts of the positions in columns 1 and 2.
Input_file[6] = ""
Input_file[7] = ""
# [row label, value of column 5] for every row that could not be lifted completely.
id_not_found = list()
for var in range(0, len(Input_file[1])):
    print(var)
    # str() works for numeric and for plain-string chromosome labels alike.
    chrom = "chr" + str(Input_file.at[var, 0])
    try:
        Asd = lo.convert_coordinate(chrom, Input_file.at[var, 1])
        Asdf = lo.convert_coordinate(chrom, Input_file.at[var, 2])
        # .at writes into the frame itself; Input_file[6][var] = ... is chained
        # assignment and may end up writing to a temporary copy.
        Input_file.at[var, 6] = Asd[0][1]
        Input_file.at[var, 7] = Asdf[0][1]
    except (IndexError, TypeError):
        # IndexError: empty result list; TypeError: None (unknown chromosome).
        id_not_found.append([var, Input_file.at[var, 5]])

# Manual corrections.  The row mask and the column must go into ONE .loc call:
# Input_file.loc[mask][6] = ... assigns to a copy and silently changes nothing.
Input_file.loc[Input_file[5] == "rs12728058", 6] = 555
Input_file.loc[Input_file[5] == "rs12728058", 7] = ""

# Left over from interactive inspection; evaluates a row and discards it.
Input_file.loc[id_not_found[5][0], ]
Input_file.loc[id_not_found[4][0], 6] = 142739784
Input_file.loc[id_not_found[4][0], 7] = 142739784
Пример #33
0
def get_info(snp_code, snp_list):
    """Lift one SNP from hg18 to hg38 and fetch its Ensembl VEP annotation.

    ``snp_list`` is a list of rows; column 0 is the SNP code, column 2 the
    hg18 position, columns 3 and 4 the reference and alternative base.  Every
    SNP is taken to lie on chromosome 20.  For each row matching ``snp_code``
    the position is lifted to hg38, a genomic HGVS id is built from it and
    sent to the Ensembl REST ``vep/human/hgvs`` endpoint; a successful answer
    is written to ``<snp_code>.info``.  Progress and failures are printed.

    Always returns the empty string.
    """
    chr_prefix = "20"
    old_build = "hg18"
    new_build = "hg38"

    # Transpose the rows so that each column becomes one tuple; the index of
    # an element in any column is the row it came from.
    cols = list(zip(*snp_list))
    snp_codes = cols[0] if cols else ()

    if snp_code not in snp_codes:
        print(
            '\nSorry, but', snp_code,
            'is not included in the dataset.\nOr maybe you have misspelled the snp_id?.\nPlease try again.\n'
        )
        return ""

    hg18_coordinates = cols[2]
    ref = cols[3]
    alt = cols[4]

    # Loading the chain file is expensive; do it once, not once per match.
    lo = LiftOver(old_build, new_build)
    url = 'http://rest.ensembl.org/vep/human/hgvs/'
    headers = {"Content-Type": "application/json"}

    for i, code in enumerate(snp_codes):
        if code != snp_code:
            continue

        sys.stdout.write(
            "\n\nUpdating chromosome coordinates to new assembly..")

        # NOTE(review): pyliftover documents its coordinates as 0-based while
        # HGVS positions are 1-based; the input is passed through unchanged as
        # before -- confirm which convention the input file uses.
        # The result is a list of (chrom, pos, strand, score) tuples, None for
        # an unknown chromosome, or [] when the locus has no counterpart.
        pylift_tuple = lo.convert_coordinate('chr' + chr_prefix,
                                             int(hg18_coordinates[i]))
        if not pylift_tuple:
            sys.stdout.write("  Failed \n")
            print('Sorry, position', hg18_coordinates[i], 'of', snp_code,
                  'could not be lifted from', old_build, 'to', new_build)
            continue
        sys.stdout.write("  Done \n")

        hg38_coordinates = int(pylift_tuple[0][1])

        # Genomic HGVS id, e.g. "20:g.22125504G>C?" -- the "." after "g" is
        # part of the notation and was missing from the request before.
        request_id = "{}:g.{}{}>{}?".format(chr_prefix, hg38_coordinates,
                                            ref[i], alt[i])

        r = requests.get(url + request_id, headers=headers)
        if not r.ok:
            print('Sorry, information for ', snp_code,
                  " currently unavailable.", sep='')
            continue

        data = r.json()
        file = snp_code + ".info"
        print('\nQuery successful!\n\nInfo about', snp_code,
              "has been saved in the file:  ", file)
        print('\nYour json file contains the following',
              len(data[0]), 'keys:\n')
        print('\n'.join(data[0].keys()), '\n\n')

        # Same content print(data) produced while stdout was redirected.
        with open(file, 'w') as save:
            save.write(str(data) + '\n')

    return ""
__author__ = 'rajaram'

#Reference : https://pypi.python.org/pypi/pyliftover
#Left over data : http://hgdownload.cse.ucsc.edu/gbdb/hg38/liftOver/

from pyliftover import LiftOver
#lo = LiftOver('hg38', 'hg19')
# Lift 100 consecutive chr1 positions with a local chain file and show the results.
lo = LiftOver('hg38ToHg19.over.chain.gz')
for x in range(0, 100):
    data = lo.convert_coordinate('chr1', 1000000+x)
    # Single-argument print(...) behaves the same on Python 2 and Python 3.
    print(data)
    # None (unknown chromosome) and [] (no counterpart) have nothing to pop.
    if not data:
        continue
    data2 = data.pop()
    print(data2[0])
Пример #35
0
import re

from pyliftover import LiftOver
liftover = LiftOver('hg18', 'hg19')

clonal = pandas.read_csv("nature13600-s1-table-s6-clonal.tsv", sep = "\t")
subclonal = pandas.read_csv("nature13600-s1-table-s7-subclonal.tsv", sep = "\t")

# Named `variants` rather than `all` so the builtin all() is not shadowed.
variants = pandas.concat( [clonal, subclonal] )

# keep only validated variants, i.e. those with a Duplex_P_val smaller 0.01 and not NA
# (values use a decimal comma; na=True makes missing values count as rejected)
rejected = variants['Duplex_P_val'].str.contains('0,0[123456789]|NA|0,[123456789]', na=True)
filtered = variants[ ~rejected.astype(bool) ]

with open("Wang2014_ground_truth_non_synonymous_variants.hg18_to_hg19.tsv", mode='w') as out:
    print( '\t'.join( filtered.columns.values.tolist() ), file=out, end='\n')
    for index, row in filtered.iterrows():
        print( row['chrom'], row['pos'], row['REF'], row['VAR'] )
        # 'pos' is shifted down by one for the lookup and back up afterwards.
        hits = liftover.convert_coordinate(row['chrom'],row['pos'] - 1)
        if not hits:
            # None: unknown chromosome; []: no counterpart in the target build.
            print( 'no hg19 position found, variant skipped:', row['chrom'], row['pos'] )
            continue
        lifted = hits[0]
        row['chrom'] = lifted[0]
        row['pos'] = lifted[1] + 1
        # reclassify clonals' zygosity based on the Duplex_Freq
        if row['class'] == 'clonal':
            freq = float(row['Duplex_Freq'].replace(',', '.'))
            row['zygosity'] = 'hom' if freq >= 0.6 else 'het'
        line = '\t'.join(map(str, row))
        print( line )
        print( line, file=out, end='\n' )

Пример #36
0
import sys
import os

indir = os.environ['indir']
summarystats = os.environ['summarystats']
converted = os.environ['converted']

# download chain file
# hg38 to hg19
lo = LiftOver('hg38', 'hg19')

# read in sumstats
sumstats = pd.read_csv('{}/{}'.format(indir, summarystats), sep='\t')


def _lift_position(chrom, pos):
    """Return the lifted (chrom, pos), or ('chr0', 0) when there is none."""
    hits = lo.convert_coordinate(chrom, pos)
    # None: unknown chromosome; []: locus has no counterpart.
    if not hits:
        return 'chr0', 0
    return hits[0][0], hits[0][1]


# convert coordinates: one lookup per row feeds both new columns
lifted = [_lift_position(c, p) for c, p in zip(sumstats['CHR'], sumstats['POS'])]
sumstats['Lifted_Chrom'] = [chrom for chrom, _ in lifted]
sumstats['Lifted_Loc'] = [pos for _, pos in lifted]

# drop unmatched data, then the columns unnecessary for ldsc
sumstats = sumstats[sumstats.Lifted_Loc != 0].drop(columns=['CHR', 'POS'])
sumstats.rename(columns={
    'Lifted_Chrom': 'CHR',
    'Lifted_Loc': 'POS',
   for line in f:
      
       # Read columns for each variant in the MAF file
       columns = line.split('\t')
  
       # Filter empty rows and headers
       if len(columns)>2 and columns[0] != "Hugo_Symbol":

          pair_key = columns[15] + ' ' + columns[16]

          # Filtering variants in TCGA
          # 1) SNPs
          # 2) This sample comparison exists in GDC
          if columns[9] == "SNP" and pair_key in gdc_pairs:

             start = lo.convert_coordinate('chr' + columns[4], int(columns[5]))
             end = lo.convert_coordinate('chr' + columns[4], int(columns[6]))
             total_variants += 1

             # Check if reference has been correctly crossed
             if start is not None and end is not None and len(start)==1 and len(end)==1:
         
                 refbase = BedTool.seq(start[0][0].replace('chr','') + ':' + str(start[0][1]) + '-' + str(end[0][1]), fastaRef)

                 # Check if reference in TCGA is the same in hg38 ref
                 if refbase == columns[10]:

                     variant_key = ' '.join([start[0][0], str(start[0][1]), str(end[0][1]), start[0][2], columns[15], columns[16]])
                
                     # Create pair if it is not created
                     if pair_key in pair_list:
Пример #38
0
__author__ = 'rajaram'

#Reference : https://pypi.python.org/pypi/pyliftover
#Left over data : http://hgdownload.cse.ucsc.edu/gbdb/hg38/liftOver/

from pyliftover import LiftOver
#lo = LiftOver('hg38', 'hg19')
# Lift 100 consecutive chr1 positions with a local chain file and show the results.
lo = LiftOver('hg38ToHg19.over.chain.gz')
for x in range(0, 100):
    data = lo.convert_coordinate('chr1', 1000000 + x)
    # Single-argument print(...) behaves the same on Python 2 and Python 3.
    print(data)
    # None (unknown chromosome) and [] (no counterpart) have nothing to pop.
    if not data:
        continue
    data2 = data.pop()
    print(data2[0])
Пример #39
0
    
# Clone the three files of the hg19 test case under the hg18 test name.
for suffix in ('desc.xml', 'input.txt', 'key.csv'):
    shutil.copy(os.path.join(hg19_dir, '%s_%s' % (hg19_test, suffix)),
                os.path.join(hg18_dir, '%s_%s' % (hg18_test, suffix)))

# Add a <hg18>on</hg18> tag to the desc.xml
print('Changing desc file')
desc_path = os.path.join(hg18_dir,'%s_desc.xml' %hg18_test)
desc = ET.parse(desc_path)
hg18 = ET.Element('hg18')
hg18.text = 'on'
desc.find('sub_params').append(hg18)
desc.write(desc_path)

# Shift genomic coordinates to hg18
print('Lifting over coordinates')
input_path = os.path.join(hg18_dir,'%s_input.txt' %hg18_test)
with open(input_path,'r') as f:
    lines19 = f.read().split('\n')
lines18 = []
for line_no, line19 in enumerate(lines19, 1):
    # A file ending in a newline yields a final empty element; pass blank
    # lines through untouched instead of failing on the missing columns.
    if not line19.strip():
        lines18.append(line19)
        continue
    elems18 = line19.split('\t')
    # Column 1 is the chromosome, column 2 the position.
    hits = lo.convert_coordinate(elems18[1],int(elems18[2]))
    if not hits:
        # Leaving an hg19 coordinate in an hg18 fixture would be silently
        # wrong, so stop with a message that names the offending line.
        raise ValueError('No hg18 liftover for %s:%s (line %d of %s)'
                         % (elems18[1], elems18[2], line_no, input_path))
    genom18 = hits[0]
    elems18[1] = genom18[0]
    elems18[2] = str(genom18[1])
    lines18.append('\t'.join(elems18))
with open(input_path,'w') as f:
    f.write('\n'.join(lines18))
print('Completed')
print('All completed')
Пример #40
0
class MasterCravatConverter(object):
    """ Convert a file of ambiguous format to .crv format.
        
        Reads in CravatConverter classes in the same directory, selects the
        correct converter, and writes a crv file.
    """

    ALREADYCRV = 2

    def __init__(self, args=None):
        """ Reset all state, then parse the arguments and open the log.

            args defaults to sys.argv when it is missing or empty. Any
            exception raised along the way is passed to the class's
            exception handler.
        """
        try:
            args = args or sys.argv
            # Input file and the format it is read as
            self.input_path = self.f = self.input_format = None
            # Log, output writers and error file
            self.logger = None
            self.crv_writer = self.crs_writer = None
            self.crm_writer = self.crl_writer = None
            self.err_file = None
            # Converter registry
            self.primary_converter = None
            self.converters = {}
            self.possible_formats = []
            self.ready_to_convert = False
            # Command line derived settings
            self.cmd_args = None
            self.output_dir = self.output_base_fname = None
            self.vtracker = VTracker()
            self._parse_cmd_args(args)
            self._setup_logger()
        except Exception as e:
            self.__handle_exception(e)

    def _parse_cmd_args(self, args):
        """ Parse args and derive the input/output/liftover settings.

            Sets input_path, optionally input_format, output_dir (created
            when missing), output_base_fname, input_assembly, do_liftover
            and lifter.
        """
        assemblies = ['hg38'] + list(constants.liftover_chain_paths.keys())
        cli = argparse.ArgumentParser()
        cli.add_argument('path',
                         help='Path to this converter\'s python module')
        cli.add_argument('input', help='File to be converted to .crv')
        cli.add_argument('-f', dest='format', help='Specify an input format')
        cli.add_argument('-n', '--name', dest='name',
                         help='Name of job. Default is input file name.')
        cli.add_argument('-d', '--output-dir', dest='output_dir',
                         help='Output directory. '
                              'Default is input file directory.')
        cli.add_argument('-l', '--liftover', dest='liftover',
                         choices=assemblies, default='hg38',
                         help='Input gene assembly. Will be lifted over to hg38')
        opts = cli.parse_args(args)

        self.input_path = os.path.abspath(opts.input)
        input_dir, input_fname = os.path.split(self.input_path)
        if opts.format:
            self.input_format = opts.format

        # Output goes next to the input unless a directory was given
        self.output_dir = opts.output_dir or input_dir
        if not os.path.exists(self.output_dir):
            os.makedirs(self.output_dir)
        self.output_base_fname = opts.name or input_fname

        # hg38 input needs no liftover; anything else gets its chain file
        self.input_assembly = opts.liftover
        self.do_liftover = self.input_assembly != 'hg38'
        self.lifter = (
            LiftOver(constants.liftover_chain_paths[self.input_assembly])
            if self.do_liftover else None)

    def setup(self):
        """ Do necesarry pre-run tasks """
        if self.ready_to_convert: return
        # Open file handle to input path
        self.f = open(self.input_path)
        # Read in the available converters
        self._initialize_converters()
        # Select the converter that matches the input format
        self._select_primary_converter()

        # A correct .crv file is not processed.
        if self.input_format == 'crv' and \
            self.input_path.split('.')[-1] == 'crv':
            #exit(cravat.util.exit_codes['alreadycrv'])
            exit(1)

        # Open the output files
        self._open_output_files()
        self.ready_to_convert = True

    def _setup_logger(self):
        """ Open a log file and set up log handler """
        self.log_path = os.path.join(self.output_dir,
                                     self.output_base_fname + '.converter.log')
        self.logger = logging.getLogger('converter_log')
        self.logger.propagate = False
        self.logger.setLevel('INFO')
        handler = logging.FileHandler(self.log_path, mode='w')
        formatter = logging.Formatter()
        handler.setFormatter(formatter)
        self.logger.addHandler(handler)
        self.logger.info('MasterConverter log')
        self.logger.info('Opened %s' % time.asctime())
        self.logger.info('Input file: %s' % self.input_path)
        if self.do_liftover:
            self.logger.info('Liftover from %s' % self.input_assembly)

    def _initialize_converters(self):
        """ Load every local converter module and register its converter.

            Each module info of type 'converter' is imported from its script
            path, its CravatConverter class is instantiated, and the instance
            is stored in self.converters under its format_name. Two
            converters claiming the same format raise an Exception.
        """
        module_infos = au.get_local_module_infos_of_type('converter')
        for info in module_infos.values():
            # path based import from https://docs.python.org/3/library/importlib.html#importing-a-source-file-directly
            spec = importlib.util.spec_from_file_location(
                info.name, info.script_path)
            loaded = importlib.util.module_from_spec(spec)
            spec.loader.exec_module(loaded)
            converter = loaded.CravatConverter()
            fmt = converter.format_name
            if fmt in self.converters:
                raise Exception(
                    'Cannot load two converters for format %s' % fmt)
            self.converters[fmt] = converter
        self.possible_formats = list(self.converters.keys())

    def _select_primary_converter(self):
        """ Choose the converter which matches the input format.
            
            If a input format was not specified in the cmd args, uses the 
            check_format() method of the CravatConverters to identify a
            converter which can parse the input file.
        """
        if self.input_format is not None:
            if self.input_format not in self.possible_formats:
                sys.exit('Invalid input format. Please select from [%s]' \
                         %', '.join(self.possible_formats))
        else:
            valid_formats = []
            self.f.seek(0)
            for converter_name, converter in self.converters.items():
                check_success = converter.check_format(self.f)
                self.f.seek(0)
                if check_success: valid_formats.append(converter_name)
            if len(valid_formats) == 0:
                sys.exit('Input format could not be determined. ' +\
                    'Exiting without conversion.')
            elif len(valid_formats) > 1:
                sys.exit('Input format ambiguous in [%s]. '\
                            %', '.join(valid_formats)\
                         +'Please specify an input format.')
            else:
                self.input_format = valid_formats[0]
        self.primary_converter = self.converters[self.input_format]
        self.logger.info('Input format: %s' % self.input_format)

    def _open_output_files(self):
        """ Open the .crv, .crm and .crs writers and the .err file.

            All files live in output_dir and are named after
            output_base_fname. The three writers are CravatWriters set up
            with their column definitions and indexes from constants; the
            .err file is a plain text file. When liftover is enabled a
            fourth writer, <crv path>.<assembly>.var, is opened as well.
        """
        def out_path(extension):
            return os.path.join(self.output_dir,
                                self.output_base_fname + extension)

        # Variant (.crv) writer
        self.wpath = out_path('.crv')
        self.crv_writer = crv = CravatWriter(self.wpath)
        crv.add_columns(constants.crv_def)
        crv.write_definition()
        for index_columns in constants.crv_idx:
            crv.add_index(index_columns)
        self.logger.info('Output file: %s' % self.wpath)

        # Conversion error file
        self.err_path = out_path('.converter.err')
        self.err_file = open(self.err_path, 'w')
        self.logger.info('Error file: %s' % self.err_path)

        # Line mapping (.crm) writer
        self.crm_path = out_path('.crm')
        self.crm_writer = crm = CravatWriter(self.crm_path)
        crm.add_columns(constants.crm_def)
        crm.write_definition()
        for index_columns in constants.crm_idx:
            crm.add_index(index_columns)
        self.logger.info('Map file: %s' % self.crm_path)

        # Sample (.crs) writer, with any converter specific extra columns
        self.crs_path = out_path('.crs')
        self.crs_writer = crs = CravatWriter(self.crs_path)
        crs.add_columns(constants.crs_def)
        if hasattr(self.primary_converter, 'addl_cols'):
            crs.add_columns(self.primary_converter.addl_cols, append=True)
            # The shared definition list in constants is extended as well
            constants.crs_def.extend(self.primary_converter.addl_cols)
        crs.write_definition()
        for index_columns in constants.crs_idx:
            crs.add_index(index_columns)
        self.logger.info('Sample crs file: %s' % self.crs_path)

        # Pre-liftover variant writer
        if not self.do_liftover:
            return
        assembly = self.input_assembly
        assembly_title = assembly.title()
        self.crl_path = '.'.join([self.wpath, assembly, 'var'])
        self.crl_writer = crl = CravatWriter(self.crl_path)
        # Work on a copy so the titles in constants stay generic
        assm_crl_def = copy.deepcopy(constants.crl_def)
        assm_crl_def[1]['title'] = '{0} Chrom'.format(assembly_title)
        assm_crl_def[2]['title'] = '{0} Position'.format(assembly_title)
        crl.add_columns(assm_crl_def)
        crl.write_definition()
        crl.write_names(assembly, assembly_title)

    def run(self):
        """ Run the conversion of the input file with the primary converter.

        Each input line is handed to ``primary_converter.convert_line``; every
        variant it yields is optionally lifted over, registered with
        ``vtracker`` and written to the crv / crl / crm / crs writers.
        Per-line and per-variant failures are counted and reported through
        ``_log_conversion_error``; any other exception is passed to
        ``__handle_exception``.
        """
        try:
            self.setup()
            started = time.time()
            self.logger.info(
                'Conversion start: %s' % time.asctime(time.localtime(started)))
            self.primary_converter.setup(self.f)
            self.f.seek(0)
            lines_read = 0
            lines_written = 0
            error_count = 0
            for lines_read, raw_line in enumerate(self.f, start=1):
                try:
                    # A single input line may expand into several variants,
                    # so the converter hands back a list (or None to skip).
                    variants = self.primary_converter.convert_line(raw_line)
                except Exception as line_err:
                    error_count += 1
                    self._log_conversion_error(lines_read, line_err)
                    continue
                if not variants:
                    continue
                # UIDs already written to the map file for this input line.
                seen_uids = []
                for variant in variants:
                    if variant['ref_base'] == '' \
                       and variant['alt_base'] not in ('A', 'T', 'C', 'G'):
                        error_count += 1
                        self._log_conversion_error(
                            lines_read,
                            BadFormatError(
                                'Reference base required for non SNV'))
                        continue
                    if self.do_liftover:
                        # Keep the pre-liftover coordinates for the crl file.
                        original_variant = copy.copy(variant)
                        try:
                            lifted = self.liftover(variant['chrom'],
                                                   variant['pos'])
                        except LiftoverFailure as lift_err:
                            error_count += 1
                            self._log_conversion_error(lines_read, lift_err)
                            continue
                        variant['chrom'], variant['pos'] = lifted
                    is_new, uid = self.vtracker.addVar(
                        variant['chrom'], int(variant['pos']),
                        variant['ref_base'], variant['alt_base'])
                    variant['uid'] = uid
                    if is_new:
                        lines_written += 1
                        self.crv_writer.write_data(variant)
                        if self.do_liftover:
                            original_variant['uid'] = uid
                            self.crl_writer.write_data(original_variant)
                    if uid not in seen_uids:
                        # Only one .crm row per UID for a given input line.
                        self.crm_writer.write_data({
                            'original_line': lines_read,
                            'tags': variant['tags'],
                            'uid': uid
                        })
                        seen_uids.append(uid)
                    self.crs_writer.write_data(variant)
            finished = time.time()
            self.logger.info(
                'Conversion end: %s' % time.asctime(time.localtime(finished)))
            self.logger.info('Read lines: %d' % lines_read)
            self.logger.info('Error lines: %d' % error_count)
            self.logger.info('Wrote lines: %d' % lines_written)
            elapsed = round(finished - started, 3)
            self.logger.info('Conversion runtime: %s' % elapsed)

            self._close_files()

        except Exception as fatal:
            self.__handle_exception(fatal)

    def liftover(self, old_chrom, old_pos):
        """ Lift a single coordinate over with ``self.lifter``.

        Args:
            old_chrom: Chromosome name, passed to the lifter unchanged.
            old_pos: Position; converted with ``int()`` before the lookup.

        Returns:
            ``(new_chrom, new_pos)`` taken from the first match the lifter
            reports.

        Raises:
            LiftoverFailure: When the lifter reports no match, i.e. it
                returns an empty sequence or ``None``.
        """
        new_coords = self.lifter.convert_coordinate(old_chrom, int(old_pos))
        # NOTE(review): self.lifter is presumably a pyliftover LiftOver, which
        # returns None (not []) for a chromosome missing from the chain file.
        # A plain truthiness test covers both cases, so an unknown chromosome
        # is reported as a LiftoverFailure instead of a TypeError from len().
        if not new_coords:
            raise LiftoverFailure(old_chrom, old_pos)
        best_match = new_coords[0]
        return best_match[0], best_match[1]

    def __handle_exception(self, e):
        """ Report a fatal error and terminate the process.

        The current traceback is written to stderr. When a logger is
        available the exception is logged too and the process exits with
        status 2; otherwise it exits with status 1.
        """
        sys.stderr.write(traceback.format_exc())
        logger = getattr(self, 'logger', None)
        if logger is None:
            sys.exit(1)
        logger.exception(e)
        sys.exit(2)

    def _log_conversion_error(self, ln, e):
        """ Record an exception raised while converting input line ``ln``.

            A tab-separated record (line number, exception class name,
            exception message) is appended to the error file. The exception
            is additionally sent to the logger with its traceback, unless it
            is an InvalidData instance.
        """
        record = '\t'.join((str(ln), str(e.__class__.__name__), str(e)))
        self.err_file.write(record + '\n')
        if isinstance(e, InvalidData):
            return
        self.logger.exception(e)

    def _close_files(self):
        """ Close the input file and every output writer used by the run.

        The liftover writer exists only when liftover is enabled, so it is
        looked up defensively and closed only if present.
        """
        self.f.close()
        self.crv_writer.close()
        self.crm_writer.close()
        self.crs_writer.close()
        # crl_writer is created during setup only when do_liftover is set;
        # previously it was never closed.
        crl_writer = getattr(self, 'crl_writer', None)
        if crl_writer is not None:
            crl_writer.close()
        self.err_file.close()
Пример #41
0
def _plot_gwas_catalog_hits(args, y_max, d_pos_init_chrom):
    """Draw a dashed vertical line for each usable row of ``args.EFO``.

    The file is read as tab-separated text with a header row. Column 7 is
    used as the trait, 11 as the chromosome, 12 as the position and 28 as the
    -log10(p) value. Positions are lifted from hg38 to hg19 before being
    offset by ``d_pos_init_chrom[chromosome]``. Rows of the most frequent
    trait are drawn red, all others orange.
    """
    ## Just make some assumptions about builds here for now.
    ## https://en.wikipedia.org/wiki/Reference_genome
    lo = LiftOver('hg38', 'hg19')

    ## First pass: find the most frequent trait. The header row is skipped so
    ## that its column title cannot win a tie.
    cnt = collections.Counter()
    with open(args.EFO) as f:
        next(f, None)
        for line in f:
            cnt[line.split('\t')[7]] += 1
    ## An empty catalogue has no most common trait.
    trait_most_common = cnt.most_common(1)[0][0] if cnt else None

    ## Second pass: draw the lines.
    with open(args.EFO) as f:
        ## Skip header.
        next(f, None)
        for line in f:
            fields = line.split('\t')
            CHR_ID = fields[11]
            ## Skip if missing data.
            if CHR_ID == '':
                continue
            try:
                CHR_POS = int(fields[12])
            ## Continue if CHR_POS is not an integer.
            except ValueError:
                continue
            y = min(y_max, float(fields[28]))
            try:
                x_offset = d_pos_init_chrom[CHR_ID]
            except KeyError:
                ## Chromosome X is expected to be absent; report anything
                ## else instead of aborting the whole plot.
                if CHR_ID != 'X':
                    print('KeyError', CHR_ID, CHR_POS, file=sys.stderr)
                continue
            lifted = lo.convert_coordinate('chr{}'.format(CHR_ID), CHR_POS)
            ## pyliftover gives [] for an unmappable position and None for a
            ## chromosome that is not in the chain file.
            if not lifted:
                print('IndexError', CHR_ID, CHR_POS, lifted, file=sys.stderr)
                continue
            x = x_offset + lifted[0][1]
            ## Colour most frequently occurring trait red and less frequently
            ## occurring traits orange, because the latter might be junk in
            ## the GWAS catalog.
            if fields[7] == trait_most_common:
                colour = '#FF0000'
            else:
                colour = '#FF8000'
            plt.vlines(x, 0, y, colors=colour, linewidth=0.5, linestyle='--')


def plot_manhattan(
    args, annotations, l_x, l_y, l_c, x_ticks, y_max, d_pos_init_chrom):
    """Draw a Manhattan plot and save it as ``<args.out>.manhattan.png``.

    Args:
        args: Options object; ``min_y``, ``EFO``, ``threshold_p``, ``title``
            and ``out`` are read.
        annotations: Iterable of dicts with the keys ``prob``, ``pos``,
            ``af``, ``rsID``, ``gene_names``, ``x`` and ``y``. Entries whose
            ``prob`` does not exceed ``args.threshold_p`` are printed to
            stdout (tab-separated, keys sorted) and annotated on the plot.
        l_x, l_y, l_c: x values, y values and colours of the scatter points.
        x_ticks: Sequence of pairs, unpacked into the positional arguments of
            ``plt.xticks``.
        y_max: Largest y value; the axis limit is ``max(int(y_max + 3),
            args.min_y)``.
        d_pos_init_chrom: Mapping from chromosome name to its x offset.
    """
    y_max = max(int(y_max + 3), args.min_y)

    if args.EFO:
        _plot_gwas_catalog_hits(args, y_max, d_pos_init_chrom)

    plt.ylabel(r'-log$_{10}$($p$)')

    plt.axhline(-math.log10(args.threshold_p), color='0.2', linewidth=0.5, linestyle='--')
    try:
        plt.ylim((0, y_max))  # todo: make argument
    except Exception as e:
        ## Setting the limit stays best-effort, but failures are reported and
        ## KeyboardInterrupt / SystemExit are no longer swallowed.
        print('plt.ylim failed:', e, file=sys.stderr)

    print('plt.scatter(manhattan)', file=sys.stderr)
    plt.scatter(l_x, l_y, c=l_c, s=3)

    plt.title(args.title, fontsize='small')

    for annotation in annotations:
        if annotation['prob'] > args.threshold_p:
            continue
        print('\t'.join(
            [str(annotation[k]) for k in sorted(annotation.keys())]))
        plt.annotate(
            '\n'.join((
                'p={:.1E}'.format(annotation['prob']),
                'pos={:,}'.format(annotation['pos']),
                'MAF={:.3f}'.format(min(annotation['af'], 1 - annotation['af'])),
                annotation['rsID'],
                ','.join(annotation['gene_names']),
                )),
            xy=(annotation['x'], annotation['y']),
            fontsize='xx-small',
            horizontalalignment='center',
            verticalalignment='bottom',
            rotation=30,
            )

    ## `size` is an alias of `fontsize`, so it is only given once.
    plt.xticks(
        *zip(*x_ticks),
        rotation=-75, fontsize=6)

    print('plt.savefig( {}.manhattan.png )'.format(args.out), file=sys.stderr)
    plt.savefig('{}.manhattan.png'.format(args.out), dpi=600)
Пример #42
0
def LiftDown_hg18(_bim, _hg, _out):
    """Lift the base-pair positions of a six-column marker file down to hg18.

    The input is read as whitespace-separated text without a header, with the
    columns Chr, Label, GD, BP, a1 and a2. Every BP value is lifted as a
    position on 'chr6' (the Chr column is not consulted) and the table is
    written tab-separated to `_out`. Positions that cannot be lifted are
    written as -1 and listed in a warning.

    Args:
        _bim: Path of the input marker file.
        _hg: Human genome build number of the input (e.g. 19 or 38); it is
            formatted into 'hg<_hg>'.
        _out: Path of the output file.

    Returns:
        `_out`.
    """

    HG_input = 'hg{}'.format(_hg)

    df_bim = pd.read_csv(_bim,
                         sep=r'\s+',  # raw string: '\s' is not a valid str escape
                         header=None,
                         dtype=str,
                         names=['Chr', 'Label', 'GD', 'BP', 'a1', 'a2'])

    def _lift_chr6(lifter, bp):
        """Return `bp` lifted by `lifter` on chr6, or -1 when it has no match."""
        if bp < 0:
            # Already failed in an earlier hop (or invalid): keep the sentinel
            # instead of querying the chain with a meaningless position.
            return -1
        hits = lifter.convert_coordinate('chr6', bp)
        # `hits` may be an empty list, or None when the chain lacks chr6.
        return hits[0][1] if hits else -1

    ### Main Liftover ###

    sr_bp = df_bim['BP'].astype(int)

    if HG_input == 'hg38':
        # 'hg38' -> 'hg19' -> 'hg18' is needed.
        # The Liftover tool (by UCSC Genomics Institute) doesn't provide
        # 'hg38' to 'hg18'.
        lo_hg38_to_hg19 = LiftOver(HG_input, 'hg19')
        lo_hg19_to_hg18 = LiftOver('hg19', 'hg18')

        sr_hg19 = sr_bp.map(lambda bp: _lift_chr6(lo_hg38_to_hg19, bp))
        sr_hg18 = sr_hg19.map(lambda bp: _lift_chr6(lo_hg19_to_hg18, bp))

    else:

        lo = LiftOver(HG_input, 'hg18')  # Liftdown to hg18

        sr_hg18 = sr_bp.map(lambda bp: _lift_chr6(lo, bp))

    df_bim['BP'] = sr_hg18  # Setting new BPs (Liftdown)

    ### Markers that failed the Liftdown. ###

    f_failed = sr_hg18 == -1

    if f_failed.any():
        # NOTE(review): the failed markers are still written below with
        # BP == -1; presumably a downstream tool drops negative positions -
        # confirm.
        print(
            std_WARNING_MAIN_PROCESS_NAME +
            "Next markers of Target('{}') failed to Liftdown to hg18. These markers will be excluded."
            .format(_bim))
        print(df_bim[f_failed])

    df_bim.to_csv(_out, sep='\t', header=False, index=False)

    return _out
Пример #43
0
        interval = intrxn[1].split(":")[1].split("-")
    if len(interval) == 2:
        for i in range(int(interval[0]), int(interval[1])):
            dist[i] += 1
print "RNA size:", len(dist)


#Use the following part to liftover mouse coordinates to human 
liftfiles = {"mm28S": "/Users/lu/Documents/chang/rrna/liftover/mmtohs28S.liftoverchain", \
"mm45S": "/Users/lu/Documents/chang/rrna/liftover/mmtohs45S.liftoverchain", \
"Malat1": "/Users/lu/Documents/chang/psoralen/examples/MALAT1/mmtohg_Malat1.liftoverchain"}
if RNAtoplot in liftfiles:
    newdist = [0 for i in range(0, size)]
    lo = LiftOver(liftfiles[RNAtoplot])
    for i in range(0, size):
        lifted = lo.convert_coordinate(RNAtoplot, i, '+')
        if lifted: newdist[lifted[0][1]] += dist[i]
    dist = newdist



figure = plt.figure(figsize=(8,2))
axes = plt.Axes(figure, [.3,.3,.6,.6])
figure.add_axes(axes)
plt.bar(range(0, size), dist, color='k')
axes.spines['top'].set_visible(False)
axes.spines['right'].set_visible(False)
axes.yaxis.set_ticks_position('left')
axes.xaxis.set_ticks_position('bottom')
plt.xlim(0, size)
plt.xlabel(xlab)