예제 #1
0
def run(args):
    """Lift the interval coordinates of a whitespace-delimited file to a new build.

    Reads ``args.input`` line by line. The first line is treated as a header
    and is re-emitted tab-separated. For every other line the first three
    columns are read as chromosome, start and end; start and end are lifted
    independently through the chain file ``args.liftover`` and the resulting
    ``chromosome<TAB>start<TAB>end`` record is written, gzip-compressed, to
    ``args.output``. Any further input columns are not carried over.

    Lines that cannot be parsed or lifted are logged and skipped.
    """
    Utilities.ensure_requisite_folders(args.output)

    logging.info("starting lifting over.")
    liftover = pyliftover.LiftOver(args.liftover)
    with gzip.open(args.output, "w") as _o, open(args.input) as _i:
        for i, line in enumerate(_i):
            if i == 0:
                # Header: normalise whatever whitespace was used to tabs.
                header = "\t".join(line.strip().split()) + "\n"
                _o.write(header.encode())
                continue

            try:
                comps = line.strip().split()
                chromosome = comps[0]
                start = int(comps[1])
                end = int(comps[2])

                _chrs, _s = _l(liftover, chromosome, start)
                _chre, _e = _l(liftover, chromosome, end)
                if _chrs != _chre:
                    logging.warning("{}:{}:{} have different target chromosomes: {}/{}".format(chromosome, start, end, _chrs, _chre))
                # The start's target chromosome is the one that gets reported.
                lifted = "{}\n".format("\t".join([_chrs, str(_s), str(_e)]))
            except Exception as e:
                # Best effort: a bad line must not abort the whole run, but
                # keep the cause so the failure can be diagnosed afterwards.
                logging.info("Error for: %s (%r)", line.strip(), e)
                continue

            # Written outside the try so that real I/O failures (e.g. a full
            # disk) propagate instead of being reported once per line.
            _o.write(lifted.encode())

    logging.info("Finished lifting over.")
def UpdateWarburtonTable1(infile, ref_file, outfile):
    """Lift the interval in the last column of a TSV table from hg18 to hg38.

    The last column of every data row of ``infile`` must look like
    ``chrom:left-right`` (thousands separators ``,`` are allowed in the
    numbers). Both ends are lifted with pyliftover and the row is written,
    with the rewritten interval, to ``<infile without extension>_hg38.tsv``.
    The header row is copied unchanged.

    ``ref_file`` is currently unused, and ``outfile`` is only created or
    truncated: the sequence export that would use them is not implemented.

    Raises IndexError / TypeError when a coordinate has no hg38 counterpart,
    since the first conversion candidate is used unconditionally.
    """
    updated_file = '{0}_hg38.tsv'.format('.'.join(infile.split('.')[:-1]))

    # Context managers guarantee the handles are closed even when a row
    # fails. The original called close() on the *path string* `outfile`,
    # which raised AttributeError after all the work was done.
    with open(infile, 'r') as inhandle, \
            open(outfile, 'w'), \
            open(updated_file, 'w') as updated_handle:
        intable = csv.reader(inhandle, delimiter='\t')
        updated_table = csv.writer(updated_handle, delimiter='\t')

        header = next(intable, None)
        if header is None:
            # Empty input: nothing to convert.
            return
        updated_table.writerow(header)
        lo = pyliftover.LiftOver('hg18', 'hg38')

        for row in intable:
            chrom, interval = row[-1].split(':')
            left, right = interval.split('-')
            left = int(left.replace(',', ''))
            right = int(right.replace(',', ''))

            coord_left = lo.convert_coordinate(chrom, left)[0][1]
            # The reported chromosome is the one the right end maps to.
            chromosome, coord_right = lo.convert_coordinate(chrom, right)[0][:2]
            # Single-argument print: same output under Python 2 and 3.
            print('{0} {1} {2}'.format(chromosome, left, coord_left))
            print('{0} {1} {2}'.format(chromosome, right, coord_right))

            new_line = row[:-1] + [
                '{0}:{1}-{2}'.format(chromosome, coord_left, coord_right)
            ]
            updated_table.writerow(new_line)
예제 #3
0
def liftover(args, d):
    """Return a copy of ``d`` with chromosome/position lifted to a new build.

    ``args.liftover`` is the chain file handed to ``pyliftover.LiftOver``.
    ``d`` is a data frame whose rows expose ``chromosome``, ``position`` and
    ``variant_id``. Rows that cannot be lifted keep their place in the frame
    but get the string "NA" in both columns, so the row count is unchanged.
    """
    logging.info("Performing liftover")
    l = pyliftover.LiftOver(args.liftover)
    new_position = []
    new_chromosome = []
    for t in d.itertuples():
        #NA is important, instead of None or NaN, so that integer positions are not converted to floats by pandas. Yuck!
        _new_chromosome = "NA"
        _new_position = "NA"
        try:
            p = int(t.position)
            l_ = l.convert_coordinate(t.chromosome, p)
            if l_:
                if len(l_) > 1:
                    logging.warning(
                        "Liftover with more than one candidate: %s",
                        t.variant_id)
                # Assign both values at once so a failure can never leave a
                # lifted chromosome paired with an "NA" position.
                _new_chromosome, _new_position = l_[0][0], int(l_[0][1])
        except Exception as e:
            # Still best effort, but no longer a bare `except`, which would
            # also have swallowed KeyboardInterrupt / SystemExit.
            logging.debug("Could not lift %s: %r", t, e)
        new_chromosome.append(_new_chromosome)
        new_position.append(_new_position)

    d = d.assign(chromosome=new_chromosome)
    d = d.assign(position=new_position)

    logging.info("%d variants after liftover", d.shape[0])
    return d
예제 #4
0
def call_liftover(df):
    """Lift the `chrom`/`pos` columns of `df` to build 38 with pyliftover.

    The source build is read from the first entry of the `build` column and
    selects the chain (HG19TO38 or HG18TO38); an unrecognised build logs an
    error and exits the process with status 1. The incoming frame gets the
    original `chrom`, `pos` and `variant_id` values copied into columns
    suffixed with the source build before the lifted values are assigned.
    Returns the frame with `build` set to 'b38'.
    """
    logging.info(' updating genomic coordinates.')
    build = df['build'][0]

    if build in ('hg37', 'hg19', 'b37'):
        chain = HG19TO38
    elif build in ('hg18', 'b18'):
        chain = HG18TO38
    else:
        logging.error(' genome build information is not available.')
        sys.exit(1)

    lifting = pyliftover.LiftOver(chain)

    # Preserve the pre-liftover values under build-specific column names.
    for column in ('chrom', 'pos', 'variant_id'):
        df[column + '_' + build] = df[column]

    lifted = [convert_coords(lifting, t) for t in df.itertuples()]
    new_chrom = [chrom for chrom, _ in lifted]
    new_pos = [pos for _, pos in lifted]

    df = df.assign(chrom=new_chrom, pos=new_pos)

    # update build information in the dataframe
    df['build'] = 'b38'
    logging.info(f' {df.shape[0]} variants after liftover')

    return df
예제 #5
0
def main():
	"""Lift one coordinate column of a tab-separated table and save the result.

	Requires both a liftover chain (params.liftover) and an input table
	(params.table); otherwise the matching help/error message is shown.
	The lifted position goes into column params.ocol, the table is written
	to params.oname, and with params.marey set a Marey Map input file is
	written next to it as well.
	"""
	params = parseArgs()
	if not params.liftover:
		params.display_help("Error: No liftover file provided")
		return

	lo = pyliftover.LiftOver(params.liftover)
	if not params.table:
		params.display_help("Error: No table provided")
		return

	tab = pd.read_csv(params.table, sep="\t")
	print("Read table:")
	print(tab)

	def convert(row):
		# Chromosome names in the table carry no "chr" prefix.
		hits = lo.convert_coordinate("chr" + row[params.chrom], row[params.bp])
		# Only the first conversion candidate is used.
		return int(hits[0][1])

	tab[params.ocol] = tab.apply(convert, axis=1)
	print("Writing the output table:")
	print(tab)
	tab.to_csv(params.oname, sep="\t", index=False)

	if params.marey:
		marey = make_marey(tab, params.chrom, params.ocol)
		print("Created the following Marey Map input:")
		print(marey)
		mout = params.oname + "_mmap.txt"
		marey.to_csv(mout, sep=" ", quoting=csv.QUOTE_NONNUMERIC, index=False)
예제 #6
0
def liftover(v, frm, to):
    """Lift a ``chrom-pos-ref-alt`` variant id from build `frm` to build `to`.

    Returns a list with one lifted variant id per conversion candidate, or an
    empty list when the position cannot be converted. Positions in `v` are
    1-based; they are shifted to pyliftover's 0-based coordinates for the
    lookup and shifted back in the result.
    """
    # Imported lazily because pyliftover is slow to load. The first use of a
    # given frm/to pair may also download the chain data from UCSC.
    import pyliftover

    converter = pyliftover.LiftOver(frm, to)
    chrom, pos, ref, alt = v.split('-')
    hits = converter.convert_coordinate('chr' + chrom, int(pos) - 1)

    lifted = []
    for hit in hits or ():
        target_chrom = hit[0][3:]  # drop the leading "chr"
        target_pos = str(hit[1] + 1)
        lifted.append('-'.join([target_chrom, target_pos, ref, alt]))
    return lifted
예제 #7
0
def dosage_generator(args, variant_mapping=None, weights=None):
    """Build the genotype dosage source selected by the command line arguments.

    Exactly one of args.text_genotypes, args.bgen_genotypes or
    args.vcf_genotypes picks the reader. Variants are restricted to a
    whitelist taken from the keys of `variant_mapping` when it is a non-empty
    dict, otherwise from `weights.rsid`. When args.liftover is set, a
    coordinate-conversion callable is handed to the text and VCF readers.

    Raises Exceptions.InvalidArguments when no reader produced a source.
    """
    liftover_conversion = None
    if args.liftover:
        logging.info("Acquiring liftover conversion")
        liftover_chain = pyliftover.LiftOver(args.liftover)

        def liftover_conversion(chr, pos):
            return Genomics.lift(
                liftover_chain, chr, pos, args.zero_based_positions)

    if variant_mapping and type(variant_mapping) is dict:
        logging.info("Setting whitelist from mapping keys")
        whitelist = set(variant_mapping)
    else:
        logging.info("Setting whitelist from available models")
        whitelist = set(weights.rsid)

    # Reader modules are imported on demand, only for the selected format.
    d = None
    if args.text_genotypes:
        from metax.genotype import DosageGenotype
        d = DosageGenotype.dosage_files_geno_lines(
            args.text_genotypes,
            variant_mapping=variant_mapping,
            whitelist=whitelist,
            skip_palindromic=args.skip_palindromic,
            liftover_conversion=liftover_conversion)
    elif args.bgen_genotypes:
        from metax.genotype import BGENGenotype
        # Note: no liftover conversion is passed to the BGEN reader.
        d = BGENGenotype.bgen_files_geno_lines(
            args.bgen_genotypes,
            variant_mapping=variant_mapping,
            force_colon=args.force_colon,
            use_rsid=args.bgen_use_rsid,
            whitelist=whitelist,
            skip_palindromic=args.skip_palindromic)
    elif args.vcf_genotypes:
        from metax.genotype import CYVCF2Genotype
        d = CYVCF2Genotype.vcf_files_geno_lines(
            args.vcf_genotypes,
            mode=args.vcf_mode,
            variant_mapping=variant_mapping,
            whitelist=whitelist,
            skip_palindromic=args.skip_palindromic,
            liftover_conversion=liftover_conversion)

    if d is None:
        raise Exceptions.InvalidArguments("unsupported genotype input")

    if args.force_mapped_metadata:
        d = Genotype.force_mapped_metadata(d, args.force_mapped_metadata)
    return d
def liftover(args, d):
    """Return `d` with its chromosome and position columns lifted over.

    Every row is converted through `_lift` using the chain file given in
    args.liftover; the row count is left unchanged.
    """
    logging.info("Performing liftover")
    chain = pyliftover.LiftOver(args.liftover)

    lifted = [_lift(chain, row.chromosome, row.position)
              for row in d.itertuples()]
    new_chromosome = [chromosome for chromosome, _ in lifted]
    new_position = [position for _, position in lifted]

    d = d.assign(chromosome=new_chromosome, position=new_position)

    logging.info("%d variants after liftover", d.shape[0])
    return d
예제 #9
0
import pyliftover
import sys
if __name__ == '__main__':
    # Usage: <script> <chromosome without the "chr" prefix> <position>
    # Prints "<target chromosome>,<target position>" for the best hg38 match.
    chrom = "chr" + sys.argv[1]
    pos = int(sys.argv[2])
    lo = pyliftover.LiftOver('hg19ToHg38.over.chain')
    result = lo.convert_coordinate(chrom, pos)
    if not result:
        # None (unknown chromosome) or [] (position not covered by a chain).
        sys.exit('No liftover conversion for {0}:{1}'.format(chrom, pos))
    # Each candidate is (chromosome, position, strand, score); read the fields
    # directly instead of parsing the tuple's string representation.
    target_chrom, target_pos = result[0][0], result[0][1]
    # Single-argument print: same output under Python 2 and 3.
    print('{0},{1}'.format(target_chrom, target_pos))
예제 #10
0
def _insert_row(cursor, table, row):
    """Insert `row` into `table`, binding the values as SQL parameters."""
    # Bound parameters keep quotes, NULLs and blobs intact; splicing str(v)
    # into the statement broke on all three. The table name cannot be bound.
    placeholders = ','.join('?' * len(row))
    cursor.execute(
        'insert into ' + table + ' values(' + placeholders + ')', row)


def _partition_tables(cursor, args):
    """Split args.tables into (tables to convert, tables to copy verbatim)."""
    tables_toconvert = []
    tables_tocopy = []
    for table in args.tables:
        cursor.execute('select * from ' + table + ' limit 1')
        cols = [v[0] for v in cursor.description]
        # A table is converted only if it has the chromosome column (when one
        # is required) and at least one of the position columns.
        has_chrom = args.chromcol is None or args.chromcol in cols
        if has_chrom and any(col in cols for col in args.cols):
            tables_toconvert.append(table)
        else:
            tables_tocopy.append(table)
    return tables_toconvert, tables_tocopy


def _convert_table(src, dst, liftover, table, args, noconversion,
                   count_interval):
    """Copy `table` from `src` to `dst`, lifting its position columns."""
    src.execute('select * from ' + table)
    allcols = [v[0] for v in src.description]
    colnos = [allcols.index(col) for col in args.cols if col in allcols]
    chromcolno = None if args.chromcol is None else allcols.index(args.chromcol)
    count = 0
    for row in src:
        row = list(row)
        # Without a chromosome column the table name is the chromosome.
        # str() lets numerically stored chromosomes (1, 2, ...) work too.
        chrom = table if chromcolno is None else str(row[chromcolno])
        if not chrom.startswith('chr'):
            chrom = 'chr' + chrom
        for colno in colnos:
            pos = int(row[colno])
            liftover_out = liftover.convert_coordinate(chrom, pos)
            if liftover_out is None:
                print('- no liftover mapping:', chrom + ':' + str(pos))
                continue
            if liftover_out == []:
                noconversion.write(
                    table + ':' + ','.join([str(v) for v in row]) + '\n')
                continue
            row[colno] = liftover_out[0][1]
        # NOTE(review): as before, a row with an unconvertible position is
        # still inserted with its original coordinate - confirm that this is
        # intended rather than skipping the row.
        _insert_row(dst, table, row)
        count += 1
        if count % count_interval == 0:
            print('  ' + str(count) + '...')
    print('  ' + table + ': done.', count, 'rows converted')


def _copy_table(src, dst, table, count_interval):
    """Copy every row of `table` from `src` to `dst` unchanged."""
    count = 0
    print('Copying ' + table + '...')
    src.execute('select * from ' + table)
    for row in src:
        _insert_row(dst, table, list(row))
        count += 1
        if count % count_interval == 0:
            print('  ' + str(count) + '...')
    print('  ' + table + ': done.', count, 'rows converted')


def converttohg38(args):
    """Create an hg38 copy of a SQLite database of genomic positions.

    Reads the following attributes of `args`:
      sourcegenome -- 'hg18' or 'hg19'
      db           -- path of the source database
      tables       -- tables to process; None means every table (the list is
                      then stored back on `args`)
      chromcol     -- name of the chromosome column, or None when the table
                      name itself is the chromosome
      cols         -- names of the position columns to lift

    The new database is written next to the source with the suffix
    '.hg38.sqlite'. Rows with a position that could not be converted are
    listed in '<new db>.noconversion'. Tables without the relevant columns
    are copied verbatim. Relies on the `sqlite3` command line tool for the
    schema dump and for listing tables.
    """
    if args.sourcegenome not in ['hg18', 'hg19']:
        print('Source genome should be either hg18 or hg19.')
        exit()
    if not os.path.exists(args.db):
        print(args.db, 'does not exist.')
        exit()
    liftover = pyliftover.LiftOver(
        constants.get_liftover_chain_path_for_src_genome(args.sourcegenome))

    print('Extracting table schema from DB...')
    output = subprocess.check_output(['sqlite3', args.db, '.schema'])
    sqlpath = args.db + '.newdb.sql'
    with open(sqlpath, 'w') as schema_file:
        schema_file.write(output.decode())

    newdbpath = '.'.join(args.db.split('.')[:-1]) + '.hg38.sqlite'
    if os.path.exists(newdbpath):
        print('Deleting existing hg38 DB...')
        os.remove(newdbpath)
    print('Creating ' + newdbpath + '...')
    newdb = sqlite3.connect(newdbpath)
    try:
        newc = newdb.cursor()
        print('Creating same table(s) in ' + newdbpath + '...')
        subprocess.check_output(['sqlite3', newdbpath, '.read ' + sqlpath])

        db = sqlite3.connect(args.db)
        try:
            c = db.cursor()
            if args.tables is None:
                print('tables not given. All tables will be tried.')
                output = subprocess.check_output(
                    ['sqlite3', args.db, '.table'])
                args.tables = output.decode().split()
                args.tables.sort()
                print('The following tables will be examined:',
                      ', '.join(args.tables))

            tables_toconvert, tables_tocopy = _partition_tables(c, args)
            print('Tables to convert:',
                  ', '.join(tables_toconvert) if tables_toconvert else 'none')
            print('Tables to copy:',
                  ', '.join(tables_tocopy) if tables_tocopy else 'none')

            count_interval = 10000
            with open(newdbpath + '.noconversion', 'w') as noconversion:
                for table in tables_toconvert:
                    print('Converting ' + table + '...')
                    _convert_table(c, newc, liftover, table, args,
                                   noconversion, count_interval)
            for table in tables_tocopy:
                _copy_table(c, newc, table, count_interval)
        finally:
            db.close()
        newdb.commit()
    finally:
        newdb.close()
예제 #11
0
def do_liftover(chain_fn, description):
    """Lift the gene coordinates in 'nagalakshmi_annotations.txt'.

    The lifted table is written to
    'nagalakshmi_annotations_lifted_<description>.txt'. The two leading lines
    and the column-label line are copied unchanged. For every gene row the
    columns SGD_Start, SGD_End, 5'-UTR_Start and 3'-UTR_End are converted
    with the chain file `chain_fn` whenever they are non-empty.

    Rows are dropped for gene YBR013C, for chromosome 'chrMito', and for any
    gene with a coordinate that does not lift. Raises ValueError (carrying
    the gene name) when SGD_Start equals SGD_End after lifting.
    """
    original_fn = 'nagalakshmi_annotations.txt'
    lifted_fn = 'nagalakshmi_annotations_lifted_{0}.txt'.format(description)

    keys_to_convert = ['SGD_Start', 'SGD_End', '5\'-UTR_Start', '3\'-UTR_End']

    lo = pyliftover.LiftOver(chain_fn)

    # The input is opened once and both handles are closed on exit; the
    # original opened the input twice and never closed either handle.
    with open(original_fn) as original_fh, open(lifted_fn, 'w') as lifted_fh:
        for _ in range(2):
            lifted_fh.write(original_fh.readline())

        labels_line = original_fh.readline()
        lifted_fh.write(labels_line)
        labels = labels_line.strip().split()

        for line in original_fh:
            fields = line.strip('\n').split('\t')
            name = fields[0]
            if name == 'YBR013C':
                # This gets its 5' UTR deleted by liftover. Ignore it for now
                continue

            # Skip the name column; explicit slices work on Python 2 and 3
            # (zip() is not subscriptable on Python 3).
            gene = dict(zip(labels[1:], map(maybe_int, fields[1:])))

            if gene['Chrom'] == 'chrMito':
                # Renamed in EF4, and not included in weinberg anyways.
                continue

            bad_lift = False
            for key in keys_to_convert:
                if gene[key] != '':
                    # File coordinates are shifted by one for the lookup.
                    lift = lo.convert_coordinate(gene['Chrom'], gene[key] - 1)
                    if not lift:
                        # [] = no conversion, None = chromosome not in chain.
                        print('{0} empty list'.format(gene))
                        bad_lift = True
                        break
                    gene[key] = lift[0][1]
            if bad_lift:
                continue

            # Undo the shift on whichever end is the larger coordinate.
            if gene['SGD_Start'] < gene['SGD_End']:
                # plus strand
                gene['SGD_End'] = gene['SGD_End'] + 1
            elif gene['SGD_Start'] > gene['SGD_End']:
                # minus strand
                gene['SGD_Start'] = gene['SGD_Start'] + 1
            else:
                raise ValueError(name)

            lifted_line = '\t'.join([name] +
                                    [str(gene[key])
                                     for key in labels[1:]]) + '\n'
            lifted_fh.write(lifted_line)
def run(args):
    """Match dbSNP records to a panel's variants and write the allele mapping.

    Does nothing when ``args.output`` already exists. Otherwise:

    * ``args.snp_reference_metadata`` is loaded into a lookup keyed by
      ``chr<chromosome>_<position>`` holding the panel variant id and rsid;
      a duplicated coordinate raises RuntimeError.
    * Every line of the gzipped ``args.db_snp_file`` (after its header) is
      read as tab-separated columns: 1 chromosome, 2 start, 4 rsid,
      6 strand, 7 reference allele, 9 observed alleles ('/'-separated),
      11 variant type. The start is lifted with ``args.liftover`` when a
      chain file is given.
    * Records whose (lifted) position + 1 matches a panel coordinate are
      compared allele by allele, also trying the complemented panel alleles.
      ``swap`` is 1 / -1 for same / exchanged ref-alt order and
      ``strand_reversal`` is 1 / -1 for direct / complemented match.

    Matched records go to ``args.output`` (gzip). Unmatched ones go to
    ``args.discard`` (gzip) when that option is set, and are dropped
    otherwise.
    """
    if os.path.exists(args.output):
        logging.info("Output already exists, nope.")
        return

    Utilities.ensure_requisite_folders(args.output)
    # The discard file is optional; only prepare its folder when requested.
    if args.discard:
        Utilities.ensure_requisite_folders(args.discard)

    if args.liftover:
        logging.info("Acquiring liftover")
        l = pyliftover.LiftOver(args.liftover)
    else:
        logging.info("Will not perform lift over")
        l = None

    logging.info("Loading snp reference metadata")
    snp_reference_metadata = pandas.read_table(args.snp_reference_metadata)
    reference = {}
    for t in snp_reference_metadata.itertuples():
        k = "chr{}_{}".format(t.chromosome, t.position)
        if k in reference:
            raise RuntimeError("coordinate is already present")
        reference[k] = (t.id, t.rsid)

    complement_translation = str.maketrans({"C": "G", "G": "C", "T": "A", "A": "T"})

    columns = ["rsid", "chromosome", "position", "a0", "a1", "strand", "type", "panel_variant_id", "panel_variant_rsid", "panel_variant_a0", "panel_variant_a1", "swap", "strand_reversal"]

    # Raw string: "\d" in a plain literal is an invalid escape sequence.
    # NOTE(review): chr\d+ does not match ids on non-numeric chromosomes
    # (e.g. chrX); such a panel id makes .search() return None below.
    allele_re = re.compile(r"chr\d+_\d+_(.*)_(.*)_b38")

    logging.info("Processing db snp file")
    # `discard` used to be bound only when args.discard was set, yet it was
    # written to and closed unconditionally (NameError). It is now None when
    # disabled, and always closed when opened.
    discard = gzip.open(args.discard, "w") if args.discard else None
    try:
        if discard is not None:
            discard.write(l_(columns))

        with gzip.open(args.output, "w") as result, gzip.open(args.db_snp_file) as db_snp:
            result.write(l_(columns))
            db_snp.readline()  # skip the header line
            for line in db_snp:
                comps = line.decode().strip().split("\t")

                obs_alleles = comps[9].split("/")
                if len(obs_alleles) < 2:
                    continue

                chromosome = comps[1]
                start_0 = comps[2]
                _new_chromosome, _new_position = gwas_parsing._lift(l, chromosome, start_0) if l else (chromosome, int(start_0))

                if _new_chromosome == "NA" or _new_position == "NA":
                    continue

                # The panel lookup key uses the start position plus one.
                k = "{}_{}".format(_new_chromosome, _new_position + 1)
                if k not in reference:
                    continue

                rsid = comps[4]
                strand = comps[6]
                ref_allele = comps[7]
                var_type = comps[11]

                alt_alleles_ = [x for x in obs_alleles if x != ref_allele]
                alt_alleles = set(alt_alleles_)

                panel_variant_id, panel_variant_rsid = reference[k]
                # A missing rsid does not arrive as a string; report it as "NA".
                panel_variant_rsid = panel_variant_rsid if isinstance(panel_variant_rsid, str) else "NA"
                panel_alleles = allele_re.search(panel_variant_id)
                panel_ref_allele = panel_alleles.group(1)
                panel_alt_allele = panel_alleles.group(2)

                # Base-wise complement only; the sequence is not reversed.
                strand_reversed_panel_ref_allele = panel_ref_allele.translate(complement_translation)
                strand_reversed_panel_alt_allele = panel_alt_allele.translate(complement_translation)

                swap, strand_reversal, selected_ref_allele, selected_alt_allele = None, None, ref_allele, alt_alleles_[0]
                if len(panel_ref_allele) == 1 and len(panel_alt_allele) == 1:
                    #snp
                    if panel_ref_allele == ref_allele and panel_alt_allele in alt_alleles:
                        swap, strand_reversal, selected_ref_allele, selected_alt_allele = 1, 1, panel_ref_allele, panel_alt_allele
                    elif panel_ref_allele in alt_alleles and panel_alt_allele == ref_allele:
                        swap, strand_reversal, selected_ref_allele, selected_alt_allele = -1, 1, panel_alt_allele, panel_ref_allele
                    elif strand_reversed_panel_ref_allele == ref_allele and strand_reversed_panel_alt_allele in alt_alleles:
                        swap, strand_reversal, selected_ref_allele, selected_alt_allele = 1, -1, strand_reversed_panel_ref_allele, strand_reversed_panel_alt_allele
                    elif strand_reversed_panel_ref_allele in alt_alleles and strand_reversed_panel_alt_allele == ref_allele:
                        swap, strand_reversal, selected_ref_allele, selected_alt_allele = -1, -1, strand_reversed_panel_alt_allele, strand_reversed_panel_ref_allele
                elif len(panel_ref_allele) > 1 and len(panel_alt_allele) == 1 and ref_allele != "-":
                    #deletion: compare against the panel ref without its first base
                    deleted = panel_ref_allele[1:]
                    strand_reversed_deleted = strand_reversed_panel_ref_allele[1:]
                    for allele_ in alt_alleles:
                        if allele_ == deleted:
                            swap, strand_reversal, selected_ref_allele, selected_alt_allele = 1, 1, allele_, "-"
                        if allele_ == strand_reversed_deleted:
                            swap, strand_reversal, selected_ref_allele, selected_alt_allele = 1, -1, allele_, "-"
                elif len(panel_ref_allele) == 1 and len(panel_alt_allele) > 1 and ref_allele == "-":
                    #insertion: compare against the panel alt without its first base
                    inserted = panel_alt_allele[1:]
                    strand_reversed_inserted = strand_reversed_panel_alt_allele[1:]
                    for allele_ in alt_alleles:
                        if allele_ == inserted:
                            swap, strand_reversal, selected_ref_allele, selected_alt_allele = 1, 1, "-", allele_
                        if allele_ == strand_reversed_inserted:
                            swap, strand_reversal, selected_ref_allele, selected_alt_allele = 1, -1, "-", allele_

                # The record reports the original dbSNP chromosome and start + 1.
                ol = l_([rsid, chromosome, str(int(start_0) + 1), selected_ref_allele, selected_alt_allele, strand, var_type, panel_variant_id, panel_variant_rsid, panel_ref_allele, panel_alt_allele, swap, strand_reversal])
                # swap and strand_reversal are always set together, so this is
                # the same decision the original made (it tested `strand`,
                # which is never None).
                if swap is not None and strand_reversal is not None:
                    result.write(ol)
                elif discard is not None:
                    discard.write(ol)
    finally:
        if discard is not None:
            discard.close()
    logging.info("Done")
예제 #13
0
def Converter(genome_build_in='mm9', genome_build_out='mm10'):
    """Return a pyliftover.LiftOver from `genome_build_in` to `genome_build_out`.

    Both arguments are passed to pyliftover unchanged; the defaults convert
    from mm9 to mm10.
    """
    lifter = pyliftover.LiftOver(genome_build_in, genome_build_out)
    return lifter
예제 #14
0
   2. discard all positions outside of BRCA region
    Note: about 2000 files from 23andme, 200 files from ancestryDNA, 200 files from ftdna
    ftdna files are not processed because its human reference build version is not specified
"""

import glob
import pyliftover
import pdb
import os

# Names of the genotype data providers handled by this script.
SOURCE = ["23andme", "ancestry", "ftdna"]

# Preload every pyliftover converter at import time, because constructing one
# may need to download its chain data from the internet.
# Keys are source build identifiers as strings (presumably NCBI build
# numbers, i.e. "37" for hg19 - TODO confirm against the callers).
# Every value converts to hg38. NOTE: the value for "35" is a *list* of two
# converters to be applied in order, while the other values are a single
# LiftOver object, so callers have to handle both shapes.
LIFT_MAP = {
    "37":
    pyliftover.LiftOver('hg19', 'hg38'),
    "36":
    pyliftover.LiftOver('hg18', 'hg38'),
    # pyliftover refuses to translate from hg17 to hg38
    # therefore it's done in two steps hg17 -> hg19 -> hg38
    "35":
    [pyliftover.LiftOver('hg17', 'hg19'),
     pyliftover.LiftOver('hg19', 'hg38')],
    "34":
    pyliftover.LiftOver('hg16', 'hg38')
}

BRCA_BOUNDARY = {
    "38": {
        "chr17": [43045629, 43125483],
        "chr13": [32315474, 32400266]
예제 #15
0
import pyliftover
import os

# Directory that contains this module, with symlinks resolved.
this_file_folder = os.path.dirname(os.path.realpath(__file__))
# Module-level converter, built once at import time from the
# 'hg19ToHg38.over.chain.gz' chain file located next to this module.
lo = pyliftover.LiftOver(
    os.path.join(this_file_folder, 'hg19ToHg38.over.chain.gz'))
예제 #16
0
#!/usr/bin/env python

import pyliftover

# Two ways of building a converter. Only the `pyliftover` module itself is
# imported here, so the class has to be referenced through it; the bare name
# `LiftOver` raised NameError.

# 1. From a pair of assembly names.
lo = pyliftover.LiftOver('hg17', 'hg18')
# 2. From a chain file on disk.
lo = pyliftover.LiftOver('hg17ToHg18.over.chain.gz')

# A call without arguments, `pyliftover.LiftOver()`, is not valid: the source
# assembly name or chain file is a required argument.

# FROM: https://github.com/konstantint/pyliftover/tree/master/pyliftover
# convert_coordinate(self, chromosome, position, strand='+'):
#         '''
#         Returns a *list* of possible conversions for a given chromosome position.
#         The list may be empty (no conversion), have a single element (unique conversion), or several elements (position mapped to several chains).
#         The list contains tuples (target_chromosome, target_position, target_strand, conversion_chain_score),
#         where conversion_chain_score is the "alignment score" field specified at the chain used to perform conversion. If there
#         are several possible conversions, they are sorted by decreasing conversion_chain_score.