Exemplo n.º 1
0
def get_genes_from_gencode_gtf(gtf_file):
    """
    Parse a GENCODE GTF file and yield one dict per ``gene`` record.

    gtf_file is an iterable of text lines (e.g. an open file handle).

    Comment lines (starting with '#') and blank lines are skipped, as is
    every record whose feature type (third column) is not ``gene``.

    Each yielded dict has the keys gene_id (everything after the first '.'
    removed), gene_name, gene_name_upper, chrom, start, stop, strand,
    xstart and xstop.

    Raises KeyError if a gene record has no gene_id or gene_name attribute.
    """
    for line in gtf_file:
        # A blank line (e.g. a trailing newline at end of file) is not a record.
        if line.startswith('#') or not line.strip():
            continue
        fields = line.strip('\n').split('\t')

        if fields[2] != 'gene':
            continue

        # Drops the first three characters of the sequence name
        # (assumes a 'chr' prefix such as 'chr1' -- TODO confirm for all inputs).
        chrom = fields[0][3:]
        # NOTE(review): GTF coordinates are normally 1-based already; the +1
        # shift is kept unchanged because downstream data depends on it.
        start = int(fields[3]) + 1  # bed files are 0-indexed
        stop = int(fields[4]) + 1

        # Attribute column looks like: key "value"; key "value"; ...
        # Split each item only once so that values containing spaces survive.
        info = {}
        for attribute in fields[8].split(';'):
            parts = attribute.strip().split(None, 1)
            if not parts:
                continue
            value = parts[1] if len(parts) > 1 else ''
            info[parts[0]] = value.strip('"')

        gene_id = info['gene_id'].split('.')[0]
        gene_name = info['gene_name']

        gene = {
            'gene_id': gene_id,
            'gene_name': gene_name,
            'gene_name_upper': gene_name.upper(),
            'chrom': chrom,
            'start': start,
            'stop': stop,
            'strand': fields[6],
            'xstart': xbrowse.get_xpos(chrom, start),
            'xstop': xbrowse.get_xpos(chrom, stop),
        }
        yield gene
Exemplo n.º 2
0
def get_genes_from_gencode_gtf(gtf_file):
    """
    Parse a GENCODE GTF file and yield one dict per ``gene`` record.

    gtf_file is an iterable of text lines (e.g. an open file handle).

    Lines starting with '#', blank lines, and records whose feature type
    (third column) is not ``gene`` are ignored.

    Each yielded dict has the keys gene_id (everything after the first '.'
    removed), gene_name, chrom, start, stop, strand, xstart and xstop.

    Raises KeyError if a gene record has no gene_id or gene_name attribute.
    """
    for line in gtf_file:
        # Skip comments and empty lines; an empty line has no columns to index.
        if line.startswith('#') or not line.strip():
            continue
        fields = line.strip('\n').split('\t')

        if fields[2] != 'gene':
            continue

        # Drops the first three characters of the sequence name
        # (assumes a 'chr' prefix such as 'chr1' -- TODO confirm for all inputs).
        chrom = fields[0][3:]
        # NOTE(review): GTF coordinates are normally 1-based already; the +1
        # shift is kept unchanged because downstream data depends on it.
        start = int(fields[3]) + 1  # bed files are 0-indexed
        stop = int(fields[4]) + 1

        # Attributes are ';'-separated 'key "value"' items. Splitting on the
        # first whitespace run only keeps values that contain spaces intact.
        info = {}
        for item in fields[8].split(';'):
            key_and_value = item.strip().split(None, 1)
            if not key_and_value:
                continue
            raw_value = key_and_value[1] if len(key_and_value) > 1 else ''
            info[key_and_value[0]] = raw_value.strip('"')

        gene_id = info['gene_id'].split('.')[0]

        gene = {
            'gene_id': gene_id,
            'gene_name': info['gene_name'],
            'chrom': chrom,
            'start': start,
            'stop': stop,
            'strand': fields[6],
            'xstart': xbrowse.get_xpos(chrom, start),
            'xstop': xbrowse.get_xpos(chrom, stop),
        }
        yield gene
Exemplo n.º 3
0
def get_exons_from_gencode_gtf(gtf_file):
    """
    Parse a GENCODE GTF file and yield one dict per exon, CDS or UTR record.

    gtf_file is an iterable of text lines (e.g. an open file handle).

    Comment lines (starting with '#'), blank lines and records of any other
    feature type are skipped.

    Each yielded dict has the keys feature_type, transcript_id and gene_id
    (both with everything after the first '.' removed), chrom, start, stop,
    strand, xstart and xstop.

    Raises KeyError if a kept record has no transcript_id or gene_id
    attribute.
    """
    wanted_features = ('exon', 'CDS', 'UTR')
    for line in gtf_file:
        # A blank line (e.g. a trailing newline at end of file) is not a record.
        if line.startswith('#') or not line.strip():
            continue
        fields = line.strip('\n').split('\t')

        feature_type = fields[2]
        if feature_type not in wanted_features:
            continue

        # Drops the first three characters of the sequence name
        # (assumes a 'chr' prefix such as 'chr1' -- TODO confirm for all inputs).
        chrom = fields[0][3:]
        # NOTE(review): GTF coordinates are normally 1-based already; the +1
        # shift is kept unchanged because downstream data depends on it.
        start = int(fields[3]) + 1  # bed files are 0-indexed
        stop = int(fields[4]) + 1

        # Attribute column looks like: key "value"; key "value"; ...
        # Split each item only once so that values containing spaces survive.
        info = {}
        for attribute in fields[8].split(';'):
            parts = attribute.strip().split(None, 1)
            if not parts:
                continue
            value = parts[1] if len(parts) > 1 else ''
            info[parts[0]] = value.strip('"')

        transcript_id = info['transcript_id'].split('.')[0]
        gene_id = info['gene_id'].split('.')[0]

        exon = {
            'feature_type': feature_type,
            'transcript_id': transcript_id,
            'gene_id': gene_id,
            'chrom': chrom,
            'start': start,
            'stop': stop,
            'strand': fields[6],
            'xstart': xbrowse.get_xpos(chrom, start),
            'xstop': xbrowse.get_xpos(chrom, stop),
        }
        yield exon
Exemplo n.º 4
0
def get_exons_from_gencode_gtf(gtf_file):
    """
    Parse a GENCODE GTF file and yield one dict per exon, CDS or UTR record.

    gtf_file is an iterable of text lines (e.g. an open file handle).

    Lines starting with '#', blank lines, and records whose feature type
    (third column) is not 'exon', 'CDS' or 'UTR' are ignored.

    Each yielded dict has the keys feature_type, transcript_id and gene_id
    (both with everything after the first '.' removed), chrom, start, stop,
    strand, xstart and xstop.

    Raises KeyError if a kept record has no transcript_id or gene_id
    attribute.
    """
    for line in gtf_file:
        # Skip comments and empty lines; an empty line has no columns to index.
        if line.startswith('#') or not line.strip():
            continue
        fields = line.strip('\n').split('\t')

        if fields[2] not in ('exon', 'CDS', 'UTR'):
            continue

        # Drops the first three characters of the sequence name
        # (assumes a 'chr' prefix such as 'chr1' -- TODO confirm for all inputs).
        chrom = fields[0][3:]
        feature_type = fields[2]
        # NOTE(review): GTF coordinates are normally 1-based already; the +1
        # shift is kept unchanged because downstream data depends on it.
        start = int(fields[3]) + 1  # bed files are 0-indexed
        stop = int(fields[4]) + 1

        # Attributes are ';'-separated 'key "value"' items. Splitting on the
        # first whitespace run only keeps values that contain spaces intact.
        info = {}
        for item in fields[8].split(';'):
            key_and_value = item.strip().split(None, 1)
            if not key_and_value:
                continue
            raw_value = key_and_value[1] if len(key_and_value) > 1 else ''
            info[key_and_value[0]] = raw_value.strip('"')

        transcript_id = info['transcript_id'].split('.')[0]
        gene_id = info['gene_id'].split('.')[0]

        exon = {
            'feature_type': feature_type,
            'transcript_id': transcript_id,
            'gene_id': gene_id,
            'chrom': chrom,
            'start': start,
            'stop': stop,
            'strand': fields[6],
            'xstart': xbrowse.get_xpos(chrom, start),
            'xstop': xbrowse.get_xpos(chrom, stop),
        }
        yield exon
Exemplo n.º 5
0
def get_genes_in_region(db, chrom, start, stop):
    """
    Return, as a list, every document in db.genes whose xstart..xstop span
    overlaps the region chrom:start-stop. The '_id' field is left out of the
    returned documents.
    """
    region_xstart = get_xpos(chrom, start)
    region_xstop = get_xpos(chrom, stop)
    # Two spans overlap when each one begins no later than the other ends.
    overlap_query = {
        'xstart': {'$lte': region_xstop},
        'xstop': {'$gte': region_xstart},
    }
    cursor = db.genes.find(overlap_query, fields={'_id': False})
    return list(cursor)
Exemplo n.º 6
0
def get_genes_in_region(db, chrom, start, stop):
    """
    List the genes overlapping chrom:start-stop.

    The region bounds are converted with get_xpos, then db.genes is searched
    for documents that start at or before the region end and stop at or after
    the region start. '_id' is excluded from each returned document.
    """
    lower_bound = get_xpos(chrom, start)
    upper_bound = get_xpos(chrom, stop)
    starts_before_region_end = {'$lte': upper_bound}
    stops_after_region_start = {'$gte': lower_bound}
    return list(db.genes.find(
        {'xstart': starts_before_region_end, 'xstop': stops_after_region_start},
        fields={'_id': False},
    ))
Exemplo n.º 7
0
def get_variants_in_region(db, chrom, start, stop):
    """
    Return, as a list, at most SEARCH_LIMIT documents from db.variants whose
    xstart..xstop span overlaps the region chrom:start-stop ('_id' excluded).
    Unclear if this will include CNVs
    """
    region_xstart = get_xpos(chrom, start)
    region_xstop = get_xpos(chrom, stop)
    overlap_query = {
        # the variant must begin at or before the end of the region ...
        'xstart': {'$lte': region_xstop},
        # ... and must end at or after the start of the region
        'xstop': {'$gte': region_xstart},
    }
    cursor = db.variants.find(
        overlap_query,
        fields={'_id': False},
        limit=SEARCH_LIMIT,
    )
    return list(cursor)
Exemplo n.º 8
0
def get_variants_in_region(db, chrom, start, stop):
    """
    Return, as a list, at most SEARCH_LIMIT documents from db.variants whose
    xpos lies between the region start and stop (both inclusive), after
    passing them through add_consequence_to_variants. '_id' is excluded.
    Unclear if this will include CNVs
    """
    xstart = get_xpos(chrom, start)
    xstop = get_xpos(chrom, stop)
    variants = list(db.variants.find({
        'xpos': {'$lte': xstop, '$gte': xstart}
    }, fields={'_id': False}, limit=SEARCH_LIMIT))
    # Return value is not used here; presumably the helper updates the
    # variant dicts in place -- verify against its definition.
    add_consequence_to_variants(variants)
    # `variants` is already a freshly built list, so no second copy is needed.
    return variants
Exemplo n.º 9
0
def get_base_coverage_from_file(base_coverage_file):
    """
    Read a base coverage file and return iter of dicts that look like:
    {
        'xpos': 1e9+1,
        'pos': 1,
        'mean': 0.0,
        'median': 0.0,
        '1': 0.0,
        '5': 0.0,
        '10': 0.0,
        '15': 0.0,
        '20': 0.0,
        '25': 0.0,
        '30': 0.0,
        '50': 0.0,
        '100': 0.0,
    }

    base_coverage_file is an iterable of tab-separated text lines: the first
    column is passed to xbrowse.get_xpos as the chromosome, the second is the
    integer position, and the following eleven columns are the float values
    listed above, in that order.

    Lines starting with '#' and blank lines are skipped. A data line with too
    few columns raises IndexError; a non-numeric value raises ValueError.
    """
    float_header_fields = ('mean', 'median', '1', '5', '10', '15', '20', '25', '30', '50', '100')
    for line in base_coverage_file:
        # A blank line (e.g. a trailing newline at end of file) is not a record.
        if line.startswith('#') or not line.strip():
            continue
        fields = line.strip('\n').split('\t')
        pos = int(fields[1])
        d = {
            'xpos': xbrowse.get_xpos(fields[0], pos),
            'pos': pos,
        }
        # The float columns start right after chromosome and position.
        for i, k in enumerate(float_header_fields):
            d[k] = float(fields[i + 2])
        yield d
Exemplo n.º 10
0
def get_base_coverage_from_file(base_coverage_file):
    """
    Read a base coverage file and return iter of dicts that look like:
    {
        'xpos': 1e9+1,
        'pos': 1,
        'mean': 0.0,
        'median': 0.0,
        '1': 0.0,
        '5': 0.0,
        '10': 0.0,
        '15': 0.0,
        '20': 0.0,
        '25': 0.0,
        '30': 0.0,
        '50': 0.0,
        '100': 0.0,
    }

    Each data line of base_coverage_file is tab-separated: chromosome,
    integer position, then the eleven float columns named above in that
    order. Lines starting with '#' and blank lines are ignored. A data line
    with too few columns raises IndexError; a non-numeric value raises
    ValueError.
    """

    float_header_fields = (
        'mean', 'median', '1', '5', '10', '15', '20', '25', '30', '50', '100'
    )
    for line in base_coverage_file:
        # Skip comments and empty lines; an empty line has no columns to index.
        if line.startswith('#') or not line.strip():
            continue
        fields = line.strip('\n').split('\t')
        position = int(fields[1])
        d = {
            'xpos': xbrowse.get_xpos(fields[0], position),
            'pos': position,
        }
        # Columns 0 and 1 are chromosome and position; floats follow.
        for offset, name in enumerate(float_header_fields, 2):
            d[name] = float(fields[offset])
        yield d
Exemplo n.º 11
0
def get_snp_from_dbsnp_file(dbsnp_file):
    """
    Yield {'xpos': ..., 'rsid': ...} for each tab-separated line of
    dbsnp_file.

    Column 0 is the integer rsid, column 1 the chromosome (trailing 'T'
    characters are removed, so 'MT' becomes 'M') and column 2 a position
    that is shifted by +1 before being converted with xbrowse.get_xpos.
    Lines whose chromosome is 'PAR' after that stripping are skipped.
    """
    for record in dbsnp_file:
        columns = record.split('\t')
        rsid = int(columns[0])
        chrom = columns[1].rstrip('T')
        if chrom != 'PAR':
            position = int(columns[2]) + 1
            yield {'xpos': xbrowse.get_xpos(chrom, position), 'rsid': rsid}
Exemplo n.º 12
0
def region_page(region_id):
    """
    Render the region page for a region id of the form 'chrom-start-stop'.

    A page already stored in the cache under 't-region-<region_id>' is
    returned as is. Otherwise, when the id does not split into exactly three
    '-'-separated parts, or when stop - start exceeds REGION_LIMIT,
    'region.html' is rendered without genes, variants or coverage. In the
    remaining case the genes, variants and per-base coverage for the region
    are looked up and passed to the template.

    Any exception raised while doing so is printed and answered with
    abort(404).
    """
    db = get_db()
    try:
        region = region_id.split('-')
        cache_key = 't-region-{}'.format(region_id)
        t = cache.get(cache_key)
        # NOTE(review): this function only reads the cache; nothing here
        # stores the rendered page under cache_key -- confirm whether it is
        # populated elsewhere.
        if t is None:
            chrom = region[0]
            start = None
            stop = None
            if len(region) == 3:
                chrom, start, stop = region
                start = int(start)
                stop = int(stop)
            # No usable coordinates, or region too large: render an empty page.
            if start is None or stop - start > REGION_LIMIT:
                return render_template(
                    'region.html',
                    genes_in_region=None,
                    variants_in_region=None,
                    chrom=chrom,
                    start=start,
                    stop=stop,
                    coverage=None
                )
            genes_in_region = lookups.get_genes_in_region(db, chrom, start, stop)
            variants_in_region = lookups.get_variants_in_region(db, chrom, start, stop)
            xstart = xbrowse.get_xpos(chrom, start)
            xstop = xbrowse.get_xpos(chrom, stop)
            coverage_array = lookups.get_coverage_for_bases(db, xstart, xstop)
            t = render_template(
                'region.html',
                genes_in_region=genes_in_region,
                variants_in_region=variants_in_region,
                chrom=chrom,
                start=start,
                stop=stop,
                coverage=coverage_array
            )
        # Single-argument print(...) emits the same text on Python 2 and 3.
        print('Rendering region: %s' % region_id)
        return t
    except Exception as e:
        print('Failed on region: %s ;Error= %s' % (region_id, e))
        abort(404)
Exemplo n.º 13
0
def region_page(region_id):
    """
    Render the region page for a region id of the form 'chrom-start-stop'.

    If the cache holds an entry for 't-region-<region_id>' it is returned
    unchanged. If the id does not consist of exactly three '-'-separated
    parts, or stop - start is larger than REGION_LIMIT, 'region.html' is
    rendered with genes, variants and coverage all set to None. Otherwise
    genes, variants and per-base coverage are looked up for the region and
    handed to the template.

    Every exception raised inside the lookup/render path is printed and
    converted into abort(404).
    """
    db = get_db()
    try:
        region = region_id.split('-')
        cache_key = 't-region-{}'.format(region_id)
        t = cache.get(cache_key)
        # NOTE(review): the rendered page is never written back to the cache
        # in this function -- confirm whether that happens elsewhere.
        if t is None:
            chrom = region[0]
            start = None
            stop = None
            if len(region) == 3:
                chrom, start, stop = region
                start = int(start)
                stop = int(stop)
            # No usable coordinates, or region too large: render an empty page.
            if start is None or stop - start > REGION_LIMIT:
                return render_template('region.html',
                                       genes_in_region=None,
                                       variants_in_region=None,
                                       chrom=chrom,
                                       start=start,
                                       stop=stop,
                                       coverage=None)
            genes_in_region = lookups.get_genes_in_region(
                db, chrom, start, stop)
            variants_in_region = lookups.get_variants_in_region(
                db, chrom, start, stop)
            xstart = xbrowse.get_xpos(chrom, start)
            xstop = xbrowse.get_xpos(chrom, stop)
            coverage_array = lookups.get_coverage_for_bases(db, xstart, xstop)
            t = render_template('region.html',
                                genes_in_region=genes_in_region,
                                variants_in_region=variants_in_region,
                                chrom=chrom,
                                start=start,
                                stop=stop,
                                coverage=coverage_array)
        # Single-argument print(...) emits the same text on Python 2 and 3.
        print('Rendering region: %s' % region_id)
        return t
    except Exception as e:
        print('Failed on region: %s ;Error= %s' % (region_id, e))
        abort(404)
Exemplo n.º 14
0
def get_snp_from_dbsnp_file(dbsnp_file):
    """
    Iterate over tab-separated dbSNP lines and yield one
    {'xpos': ..., 'rsid': ...} dict per kept line.

    The rsid is the integer in the first column. The chromosome is the
    second column with trailing 'T' characters removed; lines for which that
    leaves 'PAR' are dropped. The third column is a position, shifted by +1
    and combined with the chromosome through xbrowse.get_xpos.
    """
    for row in dbsnp_file:
        cells = row.split('\t')
        rsid = int(cells[0])
        chrom = cells[1].rstrip('T')
        if chrom == 'PAR':
            continue
        one_based_start = 1 + int(cells[2])
        xpos = xbrowse.get_xpos(chrom, one_based_start)
        yield {'xpos': xpos, 'rsid': rsid}
Exemplo n.º 15
0
def variant_page(variant_str):
    """
    Render the variant page for a variant id of the form 'chrom-pos-ref-alt'.

    The variant is looked up by xpos/ref/alt; when the lookup returns None a
    minimal dict built from the id itself is used instead. If the variant has
    'vep_annotations', they are ordered with order_vep_by_csq, each one gets
    an 'HGVS' entry, and they are grouped as
    consequences[major_consequence][Gene] -> list of annotations.

    Any exception raised while doing so is printed with its traceback and
    answered with abort(404).
    """
    # functools.reduce exists on Python 2.6+ and is the only spelling on
    # Python 3; imported locally so the block does not rely on the builtin.
    from functools import reduce

    db = get_db()
    try:
        chrom, pos, ref, alt = variant_str.split('-')
        pos = int(pos)
        # pos, ref, alt = get_minimal_representation(pos, ref, alt)
        xpos = xbrowse.get_xpos(chrom, pos)
        variant = lookups.get_variant(db, xpos, ref, alt)

        if variant is None:
            variant = {
                'chrom': chrom,
                'pos': pos,
                'xpos': xpos,
                'ref': ref,
                'alt': alt
            }
        consequences = None
        ordered_csqs = None
        if 'vep_annotations' in variant:
            variant['vep_annotations'] = order_vep_by_csq(
                variant['vep_annotations'])  # Adds major_consequence
            ordered_csqs = [
                x['major_consequence'] for x in variant['vep_annotations']
            ]
            # De-duplicates while keeping first-seen order. Quirks kept on
            # purpose: the fold starts from '', so the resulting list begins
            # with an empty string, and `y not in x` is a substring test on
            # the joined text rather than an exact-term comparison.
            ordered_csqs = reduce(
                lambda x, y: ','.join([x, y]) if y not in x else x,
                ordered_csqs, '').split(',')  # Close but not quite there
            consequences = defaultdict(lambda: defaultdict(list))
            for annotation in variant['vep_annotations']:
                annotation['HGVS'] = get_proper_hgvs(annotation)
                consequences[annotation['major_consequence']][
                    annotation['Gene']].append(annotation)
        # Coverage is requested for every base spanned by the reference allele.
        base_coverage = lookups.get_coverage_for_bases(db, xpos,
                                                       xpos + len(ref) - 1)
        any_covered = any(x['has_coverage'] for x in base_coverage)
        metrics = lookups.get_metrics(db, variant)

        # Single-argument print(...) emits the same text on Python 2 and 3.
        print('Rendering variant: %s' % variant_str)
        return render_template('variant.html',
                               variant=variant,
                               base_coverage=base_coverage,
                               consequences=consequences,
                               any_covered=any_covered,
                               ordered_csqs=ordered_csqs,
                               metrics=metrics)
    except Exception:
        print('Failed on variant: %s ; Error= %s' % (
            variant_str, traceback.format_exc()))
        abort(404)
Exemplo n.º 16
0
def variant_page(variant_str):
    """
    Render the variant page for a variant id of the form 'chrom-pos-ref-alt'.

    lookups.get_variant is queried with the xpos, ref and alt taken from the
    id; if it returns None, a dict holding only chrom/pos/xpos/ref/alt is
    rendered instead. When the variant carries 'vep_annotations' they are
    ordered with order_vep_by_csq, annotated with an 'HGVS' entry and grouped
    as consequences[major_consequence][Gene] -> list of annotations.

    Every exception raised inside that path is printed together with its
    traceback and converted into abort(404).
    """
    # functools.reduce exists on Python 2.6+ and is the only spelling on
    # Python 3; imported locally so the block does not rely on the builtin.
    from functools import reduce

    db = get_db()
    try:
        chrom, pos, ref, alt = variant_str.split('-')
        pos = int(pos)
        # pos, ref, alt = get_minimal_representation(pos, ref, alt)
        xpos = xbrowse.get_xpos(chrom, pos)
        variant = lookups.get_variant(db, xpos, ref, alt)

        if variant is None:
            variant = {
                'chrom': chrom,
                'pos': pos,
                'xpos': xpos,
                'ref': ref,
                'alt': alt
            }
        consequences = None
        ordered_csqs = None
        if 'vep_annotations' in variant:
            variant['vep_annotations'] = order_vep_by_csq(variant['vep_annotations'])  # Adds major_consequence
            ordered_csqs = [x['major_consequence'] for x in variant['vep_annotations']]
            # De-duplicates in first-seen order. Quirks kept on purpose: the
            # fold starts from '', so the list begins with an empty string,
            # and `y not in x` is a substring test on the joined text.
            ordered_csqs = reduce(lambda x, y: ','.join([x, y]) if y not in x else x, ordered_csqs, '').split(',')  # Close but not quite there
            consequences = defaultdict(lambda: defaultdict(list))
            for annotation in variant['vep_annotations']:
                annotation['HGVS'] = get_proper_hgvs(annotation)
                consequences[annotation['major_consequence']][annotation['Gene']].append(annotation)
        # Coverage is requested for every base spanned by the reference allele.
        base_coverage = lookups.get_coverage_for_bases(db, xpos, xpos + len(ref) - 1)
        any_covered = any(x['has_coverage'] for x in base_coverage)
        metrics = lookups.get_metrics(db, variant)

        # Single-argument print(...) emits the same text on Python 2 and 3.
        print('Rendering variant: %s' % variant_str)
        return render_template(
            'variant.html',
            variant=variant,
            base_coverage=base_coverage,
            consequences=consequences,
            any_covered=any_covered,
            ordered_csqs=ordered_csqs,
            metrics=metrics
        )
    except Exception:
        print('Failed on variant: %s ; Error= %s' % (variant_str, traceback.format_exc()))
        abort(404)
Exemplo n.º 17
0
def get_variants_from_sites_vcf(sites_vcf):
    """
    Parse exac sites VCF file and return iter of variant dicts, one per
    alternate allele of every record.

    sites_vcf is a file (gzipped) that is iterated line by line, not a file
    path.

    The VEP field names are taken from the '##INFO=<ID=CSQ' header line and
    used to turn each CSQ entry into a dict; only annotations whose 'Feature'
    starts with 'ENST' are kept, and each variant receives those whose
    ALLELE_NUM matches its alternate allele.

    Raises ValueError if a record carries CSQ annotations before any CSQ
    header line has been seen. Missing INFO keys raise KeyError.
    """
    vep_field_names = None
    for line in sites_vcf:
        line = line.strip('\n')
        if line.startswith('##INFO=<ID=CSQ'):
            vep_field_names = line.split('Format: ')[-1].strip('">').split('|')
        if line.startswith('#'):
            continue

        # If we get here, it's a variant line

        # This elegant parsing code below is copied from https://github.com/konradjk/loftee
        fields = line.split('\t')
        # Flag-style INFO entries without '=' are stored as key == value.
        info_field = dict([(x.split('=', 1)) if '=' in x else (x, x) for x in re.split(r';(?=\w)', fields[7])])
        consequence_array = info_field['CSQ'].split(',') if 'CSQ' in info_field else []
        if consequence_array and vep_field_names is None:
            # Without the header the CSQ values cannot be matched to names.
            raise ValueError('CSQ annotations found but no ##INFO=<ID=CSQ header line was seen')
        # Entries with an unexpected number of '|'-separated values are dropped.
        annotations = []
        for csq in consequence_array:
            csq_values = csq.split('|')
            if len(csq_values) == len(vep_field_names):
                annotations.append(dict(zip(vep_field_names, csq_values)))
        coding_annotations = [ann for ann in annotations if ann['Feature'].startswith('ENST')]

        alt_alleles = fields[4].split(',')

        # different variant for each alt allele
        for i, alt_allele in enumerate(alt_alleles):

            # ALLELE_NUM is compared against the 1-based index of the alt allele
            vep_annotations = [ann for ann in coding_annotations if int(ann['ALLELE_NUM']) == i + 1]

            # Variant is just a dict
            # Add some new keys that are allele-specific
            pos, ref, alt = get_minimal_representation(fields[1], fields[3], alt_allele)

            variant = {}
            variant['chrom'] = fields[0]
            variant['pos'] = pos
            variant['rsid'] = fields[2]
            variant['xpos'] = xbrowse.get_xpos(variant['chrom'], variant['pos'])
            variant['ref'] = ref
            variant['alt'] = alt
            variant['xstart'] = variant['xpos']
            # NOTE(review): when ref is longer than alt this puts xstop
            # before xstart -- confirm that is what region queries expect.
            variant['xstop'] = variant['xpos'] + len(variant['alt']) - len(variant['ref'])
            variant['variant_id'] = '{}-{}-{}-{}'.format(variant['chrom'], variant['pos'], variant['ref'], variant['alt'])
            variant['orig_alt_alleles'] = [
                '{}-{}-{}-{}'.format(variant['chrom'], *get_minimal_representation(fields[1], fields[3], x))
                for x in alt_alleles
            ]
            variant['site_quality'] = float(fields[5])
            variant['filter'] = fields[6]
            variant['vep_annotations'] = vep_annotations

            variant['allele_count'] = int(info_field['AC_Adj'].split(',')[i])
            if not variant['allele_count'] and variant['filter'] == 'PASS':
                variant['filter'] = 'AC_Adj0'  # Temporary filter
            variant['allele_num'] = int(info_field['AN_Adj'])

            if variant['allele_num'] > 0:
                variant['allele_freq'] = variant['allele_count'] / float(info_field['AN_Adj'])
            else:
                variant['allele_freq'] = None

            variant['pop_acs'] = dict([(POPS[x], int(info_field['AC_%s' % x].split(',')[i])) for x in POPS])
            variant['pop_ans'] = dict([(POPS[x], int(info_field['AN_%s' % x])) for x in POPS])
            variant['pop_homs'] = dict([(POPS[x], int(info_field['Hom_%s' % x].split(',')[i])) for x in POPS])

            # 'Other' is whatever the adjusted totals leave after the listed populations.
            variant['pop_acs']['Other'] = int(info_field['AC_Adj'].split(',')[i]) - sum(variant['pop_acs'].values())
            variant['pop_ans']['Other'] = int(info_field['AN_Adj']) - sum(variant['pop_ans'].values())
            # AC_Hom is indexed per alt allele like the other per-allele
            # counts above; int() on the whole value fails on multi-allelic sites.
            variant['pop_homs']['Other'] = int(info_field['AC_Hom'].split(',')[i]) - sum(variant['pop_homs'].values())

            variant['genes'] = list({annotation['Gene'] for annotation in vep_annotations})
            variant['transcripts'] = list({annotation['Feature'] for annotation in vep_annotations})

            # Histograms: entry 0 is used for all genotypes, entry i + 1 for
            # this alt allele. Each becomes a list of (mid, count) pairs;
            # list(...) keeps them real lists on Python 3 as well.
            for prefix, key in (('DP', 'genotype_depths'), ('GQ', 'genotype_qualities')):
                if prefix + '_MID' not in info_field:
                    continue
                all_mids = info_field[prefix + '_MID'].split(',')
                all_hists = info_field[prefix + '_HIST'].split(',')
                variant[key] = [
                    list(zip(map(float, all_mids[n].split('|')), map(int, all_hists[n].split('|'))))
                    for n in (0, i + 1)
                ]

            yield variant
Exemplo n.º 18
0
def get_variants_from_sites_vcf(sites_vcf):
    """
    Parse exac sites VCF file and return iter of variant dicts, one per
    alternate allele of every record.

    sites_vcf is a file (gzipped) that is iterated line by line, not a file
    path.

    The VEP field names come from the '##INFO=<ID=CSQ' header line, and the
    histogram mid points from the '##INFO=<ID=DP_HIST' and
    '##INFO=<ID=GQ_HIST' header lines.

    If anything goes wrong while handling a line, the line and the traceback
    are printed and parsing stops; no exception reaches the caller.
    """
    vep_field_names = None
    # Initialised up front so a missing header line is reported clearly
    # rather than surfacing as a NameError further down.
    dp_mids = None
    gq_mids = None
    for line in sites_vcf:
        try:
            line = line.strip('\n')
            if line.startswith('##INFO=<ID=CSQ'):
                vep_field_names = line.split('Format: ')[-1].strip('">').split(
                    '|')
            # The mid points are reused for every variant, so they are stored
            # as real lists (a bare map() is single-use on Python 3).
            if line.startswith('##INFO=<ID=DP_HIST'):
                dp_mids = [
                    float(mid) for mid in
                    line.split('Mids: ')[-1].strip('">').split('|')
                ]
            if line.startswith('##INFO=<ID=GQ_HIST'):
                gq_mids = [
                    float(mid) for mid in
                    line.split('Mids: ')[-1].strip('">').split('|')
                ]
            if line.startswith('#'):
                continue

            # If we get here, it's a variant line
            if vep_field_names is None:
                raise Exception(
                    "VEP_field_names is None. Make sure VCF header is present."
                )
            # This elegant parsing code below is copied from https://github.com/konradjk/loftee
            fields = line.split('\t')
            # Flag-style INFO entries without '=' are stored as key == value.
            info_field = dict([(x.split('=', 1)) if '=' in x else (x, x)
                               for x in re.split(r';(?=\w)', fields[7])])
            consequence_array = info_field['CSQ'].split(
                ',') if 'CSQ' in info_field else []
            # Entries with an unexpected number of '|'-separated values are dropped.
            annotations = []
            for csq in consequence_array:
                csq_values = csq.split('|')
                if len(vep_field_names) == len(csq_values):
                    annotations.append(dict(zip(vep_field_names, csq_values)))
            coding_annotations = [
                ann for ann in annotations if ann['Feature'].startswith('ENST')
            ]

            alt_alleles = fields[4].split(',')

            # different variant for each alt allele
            for i, alt_allele in enumerate(alt_alleles):

                # ALLELE_NUM is compared against the 1-based alt allele index
                vep_annotations = [
                    ann for ann in coding_annotations
                    if int(ann['ALLELE_NUM']) == i + 1
                ]

                # Variant is just a dict
                # Add some new keys that are allele-specific
                pos, ref, alt = get_minimal_representation(
                    fields[1], fields[3], alt_allele)

                variant = {}
                variant['chrom'] = fields[0]
                variant['pos'] = pos
                variant['rsid'] = fields[2]
                variant['xpos'] = xbrowse.get_xpos(variant['chrom'],
                                                   variant['pos'])
                variant['ref'] = ref
                variant['alt'] = alt
                variant['xstart'] = variant['xpos']
                # NOTE(review): when ref is longer than alt this puts xstop
                # before xstart -- confirm that is what callers expect.
                variant['xstop'] = variant['xpos'] + len(variant['alt']) - len(
                    variant['ref'])
                variant['variant_id'] = '{}-{}-{}-{}'.format(
                    variant['chrom'], variant['pos'], variant['ref'],
                    variant['alt'])
                variant['orig_alt_alleles'] = [
                    '{}-{}-{}-{}'.format(
                        variant['chrom'],
                        *get_minimal_representation(fields[1], fields[3], x))
                    for x in alt_alleles
                ]
                variant['site_quality'] = float(fields[5])
                variant['filter'] = fields[6]
                variant['vep_annotations'] = vep_annotations

                variant['allele_count'] = int(
                    info_field['AC_Adj'].split(',')[i])
                if not variant['allele_count'] and variant['filter'] == 'PASS':
                    variant['filter'] = 'AC_Adj0'  # Temporary filter
                variant['allele_num'] = int(info_field['AN_Adj'])

                if variant['allele_num'] > 0:
                    variant['allele_freq'] = variant['allele_count'] / float(
                        info_field['AN_Adj'])
                else:
                    variant['allele_freq'] = None

                variant['pop_acs'] = dict([
                    (POPS[x], int(info_field['AC_%s' % x].split(',')[i]))
                    for x in POPS
                ])
                variant['pop_ans'] = dict([
                    (POPS[x], int(info_field['AN_%s' % x])) for x in POPS
                ])
                variant['pop_homs'] = dict([
                    (POPS[x], int(info_field['Hom_%s' % x].split(',')[i]))
                    for x in POPS
                ])
                variant['hom_count'] = sum(variant['pop_homs'].values())
                # Hemizygote counts are only read for chromosomes X and Y.
                if variant['chrom'] in ('X', 'Y'):
                    variant['pop_hemis'] = dict([
                        (POPS[x], int(info_field['Hemi_%s' % x].split(',')[i]))
                        for x in POPS
                    ])
                    variant['hemi_count'] = sum(variant['pop_hemis'].values())
                variant['quality_metrics'] = dict([(x, info_field[x])
                                                   for x in METRICS
                                                   if x in info_field])

                variant['genes'] = list(
                    {annotation['Gene']
                     for annotation in vep_annotations})
                variant['transcripts'] = list(
                    {annotation['Feature']
                     for annotation in vep_annotations})

                # Histograms: entry 0 is used for all genotypes, entry i + 1
                # for this alt allele; each is paired with the header mids.
                if 'DP_HIST' in info_field:
                    if dp_mids is None:
                        raise Exception(
                            "dp_mids is None. Make sure the DP_HIST header line is present."
                        )
                    dp_hists = info_field['DP_HIST'].split(',')
                    hists_all = [dp_hists[0], dp_hists[i + 1]]
                    variant['genotype_depths'] = [
                        list(zip(dp_mids, map(int, x.split('|'))))
                        for x in hists_all
                    ]
                if 'GQ_HIST' in info_field:
                    if gq_mids is None:
                        raise Exception(
                            "gq_mids is None. Make sure the GQ_HIST header line is present."
                        )
                    gq_hists = info_field['GQ_HIST'].split(',')
                    hists_all = [gq_hists[0], gq_hists[i + 1]]
                    variant['genotype_qualities'] = [
                        list(zip(gq_mids, map(int, x.split('|'))))
                        for x in hists_all
                    ]

                yield variant
        except Exception:
            # Best effort by design: report the offending line, then stop.
            print("Error parsing vcf line: " + line)
            traceback.print_exc()
            break
Exemplo n.º 19
0
    # Convert a tab-separated allele-frequency file (path given as the 'frq'
    # argument) into '<input>.xbrowse.freqs': one line per non-reference
    # allele holding xpos, reference allele, allele and its frequency.
    parser = argparse.ArgumentParser(description=description)
    parser.add_argument('frq')
    args = parser.parse_args()

    filename = args.frq
    if not os.path.exists(filename):
        raise Exception('File does not exist')
    if '.' not in filename:
        raise Exception('Filename must have an extension.')
    out_filename = filename + '.xbrowse.freqs'

    # Context managers close both files even when a line fails to parse.
    with open(out_filename, 'w') as outfile, open(filename) as infile:
        for line in infile:
            # Skip the header row.
            if line.startswith('CHROM'):
                continue
            fields = line.strip('\n').split('\t')
            xpos = get_xpos(fields[0], int(fields[1]))
            # Columns from the fifth onwards are 'allele:frequency' pairs.
            allele_af = {}
            for field in fields[4:]:
                allele, af = field.split(':')
                allele_af[allele] = float(af)
            # The most frequent allele is treated as the reference allele.
            ref_allele = max(allele_af, key=allele_af.get)
            for allele, af in allele_af.items():
                if allele != ref_allele:
                    outfile.write('\t'.join([
                        str(xpos),
                        ref_allele,
                        allele,
                        str(af)
                    ]) + '\n')