Пример #1
0
def cli(context, repeats_file):
    """Table print repeat info

    Parses ``repeats_file`` with ``parse_repeat_file`` and echoes a
    pipe-delimited table: one header row, one separator row and one row
    per repeat entry.

    Args:
        context: Object whose ``abort()`` is called when no repeat info
            could be parsed (presumably the click context -- confirm
            against the command decorators).
        repeats_file: Path of the repeat definitions file to read.
    """
    with open(repeats_file, 'r') as file_handle:
        repeat_information = parse_repeat_file(file_handle)

    if not repeat_information:
        LOG.warning("Could not find any repeat info")
        context.abort()

    # Column names double as the keys looked up in every repeat entry.
    header = [
        "hgnc_id", "hgnc_symbol", "repid", "ru", "normal_max",
        "pathologic_min", "disease",
    ]
    table_line = "| {0} | {1} | {2} | {3} | {4} | {5} | {6} |"
    click.echo(table_line.format(*header))
    click.echo(table_line.format(*(['-------'] * len(header))))
    for entry in repeat_information:
        repeat = repeat_information[entry]
        click.echo(table_line.format(*[repeat[column] for column in header]))
Пример #2
0
def test_parse_repeat_file(repeats_file_handle):
    """Parsing the repeats file handle fixture gives a truthy result."""
    ## GIVEN a file handle with repeat lines
    handle = repeats_file_handle

    ## WHEN parsing the repeat info
    parsed = parse_repeat_file(handle)

    ## THEN assert that some repeat info was returned
    assert parsed
Пример #3
0
def cli(context, vcf, repeats_file, loglevel):
    """Annotate the str variants of a vcf with their str status.

    Every variant is echoed; those for which ``get_repeat_info`` returns
    a truthy value get that value stored under the ``STR_STATUS`` INFO
    key first.

    Args:
        context: Object whose ``abort()`` is called when no repeat info
            could be parsed.
        vcf: Passed straight to ``VCF`` to open the variants.
        repeats_file: Path of the repeat definitions file.
        loglevel: Level handed to ``coloredlogs.install``.
    """
    coloredlogs.install(level=loglevel)

    with open(repeats_file, 'r') as repeats_handle:
        repeat_information = parse_repeat_file(repeats_handle)

    if not repeat_information:
        LOG.warning("Could not find any repeat info")
        context.abort()

    info_key = 'STR_STATUS'
    info_header = {
        "ID": info_key,
        "Number": 'A',
        "Type": "String",
        "Description": "Repeat expansion status. Alternatives in ['normal', 'pre_mutation', 'full_mutation']",
    }

    variants = VCF(vcf)
    variants.add_info_to_header(info_header)

    print_headers(variants)

    for variant in variants:
        status = get_repeat_info(variant, repeat_information)
        if status:
            variant.INFO[info_key] = status
        # The variant is printed whether or not it was annotated.
        click.echo(str(variant).rstrip())
Пример #4
0
def test_parse_malformaed_repeat_line_wrong_value():
    """A repeat line with "hello" as normal_max must raise ValueError."""
    ## GIVEN a header line and a repeat line with a non numeric normal_max
    # Tabs are written as escapes so the separators are visible in review.
    column_line = "#hgnc_id\thgnc_symbol\trepid\tru\tnormal_max\tpathologic_min\tdisease"
    malformed_line = "10548\tATXN1\tATXN1\tCAG\thello\t45\tSCA1"

    ## WHEN parsing the repeat info
    ## THEN assert that an exception is raised
    with pytest.raises(ValueError):
        parse_repeat_file([column_line, malformed_line])
Пример #5
0
def test_parse_repeat_line():
    """A well formed repeat line is parsed into the expected fields."""
    ## GIVEN a header line followed by one valid repeat line
    # Tabs are written as escapes so the separators are visible in review.
    lines = [
        "#hgnc_id\thgnc_symbol\trepid\tru\tnormal_max\tpathologic_min\tdisease",
        "10548\tATXN1\tATXN1\tCAG\t35\t45\tSCA1",
    ]

    ## WHEN parsing the repeat info
    parsed = parse_repeat_file(lines)

    ## THEN assert that the expected repeat info is there
    assert 'ATXN1' in parsed

    ## THEN assert that every field holds the expected value
    expected_fields = {
        'hgnc_id': 10548,
        'hgnc_symbol': 'ATXN1',
        'repid': 'ATXN1',
        'ru': 'CAG',
        'normal_max': 35,
        'pathologic_min': 45,
        'disease': 'SCA1',
    }
    atxn1 = parsed['ATXN1']
    for field, value in expected_fields.items():
        assert atxn1[field] == value
Пример #6
0
def cli(context, repeats_file):
    """Print the repeat info of a json repeats file as a table.

    The file is parsed with ``parse_repeat_file`` using
    ``repeats_file_type='json'``. The second column (``LocusId``) shows
    the key of each entry; all other columns are read from the entry.

    Args:
        context: Object whose ``abort()`` is called when no repeat info
            could be parsed.
        repeats_file: Path of the repeat definitions file to read.
    """
    with open(repeats_file, 'r') as repeats_handle:
        repeat_information = parse_repeat_file(repeats_handle,
                                               repeats_file_type='json')

    if not repeat_information:
        LOG.warning("Could not find any repeat info")
        context.abort()

    columns = [
        "HGNCId", "LocusId", "DisplayRU", "InheritanceMode", "normal_max",
        "pathologic_min", "Disease", "SourceDisplay", "SourceId"
    ]
    row_template = "| {0} | {1} | {2} | {3} | {4} | {5} | {6} | {7} | {8} |"

    click.echo(row_template.format(*columns))
    click.echo(row_template.format(*(['-------'] * 9)))

    for locus in repeat_information:
        details = repeat_information[locus]
        # First cell comes from the entry, second is the entry key itself.
        cells = [details[columns[0]], locus]
        cells.extend(details[name] for name in columns[2:])
        click.echo(row_template.format(*cells))
Пример #7
0
def cli(context, vcf, family_id, repeats_file, loglevel):
    """Annotate str variants with str status

    Parses the repeat definitions, then streams the vcf line by line to
    stdout. ``##`` meta lines are echoed unchanged, the stranger INFO
    header lines are inserted right before the column header line, and
    each variant line is echoed with extra INFO entries when
    ``get_repeat_info`` returns data for it.

    Args:
        context: Object whose ``abort()`` is called when no repeat info
            could be parsed or when a variant line precedes the column
            header line.
        vcf: Path of the vcf file; read through gzip when it ends with
            ``.gz``.
        family_id: Prefix of the ``RankScore`` value, which is written as
            ``family_id:rank_score``.
        repeats_file: Path of the repeat definitions; parsed as json when
            it ends with ``.json``, otherwise as tsv.
        loglevel: Level handed to ``coloredlogs.install``.
    """
    coloredlogs.install(level=loglevel)
    LOG.info("Running stranger version %s", __version__)

    repeats_file_type = 'json' if repeats_file.endswith('.json') else 'tsv'
    LOG.info("Parsing repeats file %s", repeats_file)

    with open(repeats_file, 'r') as file_handle:
        repeat_information = parse_repeat_file(file_handle, repeats_file_type)

    if not repeat_information:
        LOG.warning("Could not find any repeat info")
        context.abort()

    header_definitions = [
        {
            'id': 'STR_STATUS',
            'num': 'A',
            'type': 'String',
            'desc': 'Repeat expansion status. Alternatives in [normal, pre_mutation, full_mutation]'
        },
        {
            'id': 'STR_NORMAL_MAX',
            'num': '1',
            'type': 'Integer',
            'desc': 'Max number of repeats allowed to call as normal'
        },
        {
            'id': 'STR_PATHOLOGIC_MIN',
            'num': '1',
            'type': 'Integer',
            'desc': 'Min number of repeats required to call as pathologic'
        },
        {
            'id': 'SourceDisplay',
            'num': '1',
            'type': 'String',
            'desc': 'Source for variant definition, display'
        },
        {
            'id': 'Source',
            'num': '1',
            'type': 'String',
            'desc': 'Source collection for variant definition'
        },
        {
            'id': 'SourceId',
            'num': '1',
            'type': 'String',
            'desc': 'Source id for variant definition'
        },
        {
            'id': 'SweGenMean',
            'num': '1',
            'type': 'Float',
            'desc': 'Average number of repeat unit copies in population'
        },
        {
            'id': 'SweGenStd',
            'num': '1',
            'type': 'Float',
            'desc': 'Standard deviation of number of repeat unit copies in population'
        },
        {
            'id': 'DisplayRU',
            'num': '1',
            'type': 'String',
            'desc': 'Display repeat unit familiar to clinician'
        },
        {
            'id': 'InheritanceMode',
            'num': '1',
            'type': 'String',
            'desc': 'Main mode of inheritance for disorder'
        },
        {
            'id': 'HGNCId',
            'num': '1',
            'type': 'Integer',
            'desc': 'HGNC gene id for associated disease gene'
        },
        {
            'id': 'RankScore',
            'num': '1',
            'type': 'String',
            'desc': 'RankScore for variant in this family as family(str):score(int)'
        },
        {
            'id': 'Disease',
            'num': '1',
            'type': 'String',
            'desc': 'Associated disorder'
        },
    ]

    stranger_headers = [
        '##INFO=<ID={0},Number={1},Type={2},Description="{3}">'.format(
            hdef['id'], hdef['num'], hdef['type'], hdef['desc'])
        for hdef in header_definitions
    ]

    if vcf.endswith('.gz'):
        LOG.info("Vcf is zipped")
        vcf_handle = getreader('utf-8')(gzip.open(vcf), errors='replace')
    else:
        vcf_handle = open(vcf, mode='r', encoding='utf-8', errors='replace')

    LOG.info("Parsing variants from %s", vcf)
    # Column names from the '#CHROM...' line; unknown until that line is seen.
    header_info = None
    try:
        for line in vcf_handle:
            line = line.rstrip()
            if line.startswith('#'):
                if line.startswith('##'):
                    click.echo(line)
                    continue
                # Print the new header lines describing stranger annotation
                for header in stranger_headers:
                    click.echo(header)
                # Print the vcf header line
                header_info = line[1:].split('\t')
                click.echo(line)
                continue

            if header_info is None:
                # Without the column header the fields cannot be named.
                LOG.warning("Found a variant line before the vcf header line")
                context.abort()

            variant_info = dict(zip(header_info, line.split('\t')))
            variant_info['alts'] = variant_info['ALT'].split(',')
            variant_info['info_dict'] = get_info_dict(variant_info['INFO'])
            repeat_data = get_repeat_info(variant_info, repeat_information)
            if repeat_data:
                info_dict = variant_info['info_dict']
                info_dict['STR_STATUS'] = repeat_data['repeat_strings']
                info_dict['STR_NORMAL_MAX'] = str(repeat_data['lower'])
                info_dict['STR_PATHOLOGIC_MIN'] = str(repeat_data['upper'])
                info_dict['RankScore'] = ':'.join(
                    [str(family_id), str(repeat_data['rank_score'])])
                for annotate_repeat_key in ANNOTATE_REPEAT_KEYS:
                    # Missing or falsy values are not written to the INFO field.
                    if repeat_data.get(annotate_repeat_key):
                        info_dict[annotate_repeat_key] = str(
                            repeat_data[annotate_repeat_key])

            click.echo(get_variant_line(variant_info, header_info))
    finally:
        # Close the vcf (plain or gzip) even when parsing aborts or fails.
        vcf_handle.close()
Пример #8
0
def cli(context, vcf, repeats_file, loglevel):
    """Annotate str variants with str status

    Parses the repeat definitions, then streams the vcf line by line to
    stdout. ``##`` meta lines are echoed unchanged, the stranger INFO
    header lines are inserted right before the column header line, and
    each variant line is echoed with ``STR_STATUS``, ``STR_NORMAL_MAX``
    and ``STR_FULLMUT_MIN`` added when ``get_repeat_info`` returns data
    for it.

    Args:
        context: Object whose ``abort()`` is called when no repeat info
            could be parsed or when a variant line precedes the column
            header line.
        vcf: Path of the vcf file; read through gzip when it ends with
            ``.gz``.
        repeats_file: Path of the repeat definitions; parsed as json when
            it ends with ``.json``, otherwise as tsv.
        loglevel: Level handed to ``coloredlogs.install``.
    """
    coloredlogs.install(level=loglevel)
    LOG.info("Running stranger version %s", __version__)

    repeats_file_type = 'json' if repeats_file.endswith('.json') else 'tsv'
    LOG.info("Parsing repeats file %s", repeats_file)

    with open(repeats_file, 'r') as file_handle:
        repeat_information = parse_repeat_file(file_handle, repeats_file_type)

    if not repeat_information:
        LOG.warning("Could not find any repeat info")
        context.abort()

    header_definitions = [
        {
            'id': 'STR_STATUS', 'num': 'A', 'type': 'String',
            'desc': 'Repeat expansion status. Alternatives in [normal, pre_mutation, full_mutation]'
        },
        {
            'id': 'STR_NORMAL_MAX', 'num': '1', 'type': 'Integer',
            'desc': 'Max number of repeats allowed to call as normal'
        },
        {
            'id': 'STR_FULLMUT_MIN', 'num': '1', 'type': 'Integer',
            'desc': 'Min number of repeats required to call as full mutation'
        }
    ]

    stranger_headers = [
        '##INFO=<ID={0},Number={1},Type={2},Description="{3}">'.format(
            hdef['id'], hdef['num'], hdef['type'], hdef['desc'])
        for hdef in header_definitions
    ]

    if vcf.endswith('.gz'):
        LOG.info("Vcf is zipped")
        vcf_handle = getreader('utf-8')(gzip.open(vcf), errors='replace')
    else:
        vcf_handle = open(vcf, mode='r', encoding='utf-8', errors='replace')

    LOG.info("Parsing variants from %s", vcf)
    # Column names from the '#CHROM...' line; unknown until that line is seen.
    header_info = None
    try:
        for line in vcf_handle:
            line = line.rstrip()
            if line.startswith('#'):
                if line.startswith('##'):
                    click.echo(line)
                    continue
                # Print the new header lines describing stranger annotation
                for header in stranger_headers:
                    click.echo(header)
                # Print the vcf header line
                header_info = line[1:].split('\t')
                click.echo(line)
                continue

            if header_info is None:
                # Without the column header the fields cannot be named.
                LOG.warning("Found a variant line before the vcf header line")
                context.abort()

            variant_info = dict(zip(header_info, line.split('\t')))
            variant_info['alts'] = variant_info['ALT'].split(',')
            variant_info['info_dict'] = get_info_dict(variant_info['INFO'])
            repeat_data = get_repeat_info(variant_info, repeat_information)
            if repeat_data:
                info_dict = variant_info['info_dict']
                info_dict['STR_STATUS'] = repeat_data['repeat_strings']
                info_dict['STR_NORMAL_MAX'] = str(repeat_data['lower'])
                info_dict['STR_FULLMUT_MIN'] = str(repeat_data['upper'])

            click.echo(get_variant_line(variant_info, header_info))
    finally:
        # Close the vcf (plain or gzip) even when parsing aborts or fails.
        vcf_handle.close()