def cli(context, repeats_file):
    """Print the repeat definitions of a repeats file as a pipe-delimited table.

    Args:
        context: object exposing ``abort()``; called when no repeat info
            could be parsed.
        repeats_file: path to the repeats file, opened in text mode and
            handed to ``parse_repeat_file``.
    """
    with open(repeats_file, 'r') as file_handle:
        repeat_information = parse_repeat_file(file_handle)

    if not repeat_information:
        LOG.warning("Could not find any repeat info")
        context.abort()

    # Column names double as the keys looked up in every repeat entry.
    columns = [
        "hgnc_id",
        "hgnc_symbol",
        "repid",
        "ru",
        "normal_max",
        "pathologic_min",
        "disease",
    ]
    row_template = "| {0} | {1} | {2} | {3} | {4} | {5} | {6} |"
    separator = ['-------'] * len(columns)

    click.echo(row_template.format(*columns))
    click.echo(row_template.format(*separator))

    for entry in repeat_information:
        record = repeat_information[entry]
        cells = [record[column] for column in columns]
        click.echo(row_template.format(*cells))
def test_parse_repeat_file(repeats_file_handle):
    """Parsing the repeats file fixture returns a truthy result."""
    ## GIVEN a file handle with repeat lines
    handle = repeats_file_handle

    ## WHEN parsing the repeat info
    parsed = parse_repeat_file(handle)

    ## THEN assert that there are some repeat info returned
    assert parsed
def cli(context, vcf, repeats_file, loglevel):
    """Annotate str variants with str status

    Parses the repeats file, adds a ``STR_STATUS`` INFO definition to the
    VCF header, prints the headers and then echoes every variant, setting
    ``STR_STATUS`` on those for which ``get_repeat_info`` returns a value.

    Args:
        context: object exposing ``abort()``; called when no repeat info
            could be parsed.
        vcf: VCF source handed to ``VCF(...)``.
        repeats_file: path to the repeats file, opened in text mode.
        loglevel: level passed to ``coloredlogs.install``.
    """
    coloredlogs.install(level=loglevel)

    with open(repeats_file, 'r') as file_handle:
        repeat_information = parse_repeat_file(file_handle)

    if not repeat_information:
        LOG.warning("Could not find any repeat info")
        context.abort()

    info_key = 'STR_STATUS'
    info_definition = {
        "ID": info_key,
        "Number": 'A',
        "Type": "String",
        "Description": "Repeat expansion status. Alternatives in ['normal', 'pre_mutation', 'full_mutation']",
    }

    vcf_obj = VCF(vcf)
    vcf_obj.add_info_to_header(info_definition)
    print_headers(vcf_obj)

    for variant in vcf_obj:
        status = get_repeat_info(variant, repeat_information)
        if status:
            variant.INFO[info_key] = status
        # Every variant is printed, annotated or not.
        click.echo(str(variant).rstrip())
def test_parse_malformaed_repeat_line_wrong_value():
    """A repeat line with 'hello' in the normal_max column raises ValueError."""
    ## GIVEN a some malformed repeat info lines
    header_line = "#hgnc_id hgnc_symbol repid ru normal_max pathologic_min disease"
    malformed_line = "10548 ATXN1 ATXN1 CAG hello 45 SCA1"
    lines = [header_line, malformed_line]

    ## WHEN parsing the repeat info
    ## THEN assert that an exception is raised
    with pytest.raises(ValueError):
        parse_repeat_file(lines)
def test_parse_repeat_line():
    """A single repeat line is parsed into an entry keyed by 'ATXN1'."""
    ## GIVEN a some repeat info lines
    header_line = "#hgnc_id hgnc_symbol repid ru normal_max pathologic_min disease"
    repeat_line = "10548 ATXN1 ATXN1 CAG 35 45 SCA1"
    lines = [header_line, repeat_line]

    ## WHEN parsing the repeat info
    parsed = parse_repeat_file(lines)

    ## THEN assert that the expected repeat info is there
    assert 'ATXN1' in parsed

    ## THEN assert that the hgnc_id is there
    assert parsed['ATXN1']['hgnc_id'] == 10548

    # Remaining fields, checked in the order they appear in the header.
    expected_fields = [
        ('hgnc_symbol', 'ATXN1'),
        ('repid', 'ATXN1'),
        ('ru', 'CAG'),
        ('normal_max', 35),
        ('pathologic_min', 45),
        ('disease', 'SCA1'),
    ]
    for field, expected in expected_fields:
        assert parsed['ATXN1'][field] == expected
def cli(context, repeats_file):
    """Table print repeat info

    Parses the repeats file as JSON and echoes one pipe-delimited row per
    entry, preceded by a header row and a separator row.

    Args:
        context: object exposing ``abort()``; called when no repeat info
            could be parsed.
        repeats_file: path to the repeats file, opened in text mode and
            parsed with ``repeats_file_type='json'``.
    """
    with open(repeats_file, 'r') as file_handle:
        repeat_information = parse_repeat_file(
            file_handle, repeats_file_type='json')

    if not repeat_information:
        LOG.warning("Could not find any repeat info")
        context.abort()

    columns = [
        "HGNCId",
        "LocusId",
        "DisplayRU",
        "InheritanceMode",
        "normal_max",
        "pathologic_min",
        "Disease",
        "SourceDisplay",
        "SourceId",
    ]
    row_template = "| {0} | {1} | {2} | {3} | {4} | {5} | {6} | {7} | {8} |"
    separator = ['-------'] * len(columns)

    click.echo(row_template.format(*columns))
    click.echo(row_template.format(*separator))

    for locus_id in repeat_information:
        record = repeat_information[locus_id]
        # The "LocusId" column is the mapping key itself, not a record field.
        cells = [record[columns[0]], locus_id]
        cells.extend(record[column] for column in columns[2:])
        click.echo(row_template.format(*cells))
def cli(context, vcf, family_id, repeats_file, loglevel):
    """Annotate str variants with str status

    Reads repeat definitions from ``repeats_file``, then streams ``vcf`` line
    by line to stdout (via ``click.echo``): ``##`` meta lines are passed
    through, the stranger INFO definitions are inserted right before the
    column header line, and each variant line is re-emitted with the
    annotations returned by ``get_repeat_info`` merged into its INFO field.

    Args:
        context: object exposing ``abort()``; called when no repeat info
            could be parsed or when a variant precedes the column header.
        vcf: path to the VCF file; read through ``gzip`` when the name ends
            with ``.gz``.
        family_id: identifier written as the first part of the ``RankScore``
            annotation (``family:score``).
        repeats_file: path to the repeats file; parsed as ``json`` when the
            name ends with ``.json``, otherwise as ``tsv``.
        loglevel: level passed to ``coloredlogs.install``.
    """
    coloredlogs.install(level=loglevel)
    LOG.info("Running stranger version %s", __version__)

    repeat_information = None
    repeats_file_type = 'tsv'
    if repeats_file.endswith('.json'):
        repeats_file_type = 'json'
    LOG.info("Parsing repeats file %s", repeats_file)

    with open(repeats_file, 'r') as file_handle:
        repeat_information = parse_repeat_file(file_handle, repeats_file_type)

    if not repeat_information:
        LOG.warning("Could not find any repeat info")
        context.abort()

    header_definitions = [
        {
            'id': 'STR_STATUS',
            'num': 'A',
            'type': 'String',
            'desc': 'Repeat expansion status. Alternatives in [normal, pre_mutation, full_mutation]'
        },
        {
            'id': 'STR_NORMAL_MAX',
            'num': '1',
            'type': 'Integer',
            'desc': 'Max number of repeats allowed to call as normal'
        },
        {
            'id': 'STR_PATHOLOGIC_MIN',
            'num': '1',
            'type': 'Integer',
            'desc': 'Min number of repeats required to call as pathologic'
        },
        {
            'id': 'SourceDisplay',
            'num': '1',
            'type': 'String',
            'desc': 'Source for variant definition, display'
        },
        {
            'id': 'Source',
            'num': '1',
            'type': 'String',
            'desc': 'Source collection for variant definition'
        },
        {
            'id': 'SourceId',
            'num': '1',
            'type': 'String',
            'desc': 'Source id for variant definition'
        },
        {
            'id': 'SweGenMean',
            'num': '1',
            'type': 'Float',
            'desc': 'Average number of repeat unit copies in population'
        },
        {
            'id': 'SweGenStd',
            'num': '1',
            'type': 'Float',
            'desc': 'Standard deviation of number of repeat unit copies in population'
        },
        {
            'id': 'DisplayRU',
            'num': '1',
            'type': 'String',
            'desc': 'Display repeat unit familiar to clinician'
        },
        {
            'id': 'InheritanceMode',
            'num': '1',
            'type': 'String',
            'desc': 'Main mode of inheritance for disorder'
        },
        {
            'id': 'HGNCId',
            'num': '1',
            'type': 'Integer',
            'desc': 'HGNC gene id for associated disease gene'
        },
        {
            'id': 'RankScore',
            'num': '1',
            'type': 'String',
            'desc': 'RankScore for variant in this family as family(str):score(int)'
        },
        {
            'id': 'Disease',
            'num': '1',
            'type': 'String',
            'desc': 'Associated disorder'
        },
    ]

    stranger_headers = [
        '##INFO=<ID={0},Number={1},Type={2},Description="{3}">'.format(
            hdef.get('id'), hdef.get('num'), hdef.get('type'),
            hdef.get('desc'))
        for hdef in header_definitions
    ]

    if vcf.endswith('.gz'):
        LOG.info("Vcf is zipped")
        vcf_handle = getreader('utf-8')(gzip.open(vcf), errors='replace')
    else:
        vcf_handle = open(vcf, mode='r', encoding='utf-8', errors='replace')

    LOG.info("Parsing variants from %s", vcf)
    # Column names from the "#CHROM ..." line; unknown until that line is seen.
    header_info = None
    try:
        for line in vcf_handle:
            line = line.rstrip()
            if line.startswith('#'):
                if line.startswith('##'):
                    click.echo(line)
                    continue
                # Print the new header lines describing stranger annotation
                for header in stranger_headers:
                    click.echo(header)
                # Print the vcf header line
                header_info = line[1:].split('\t')
                click.echo(line)
                continue

            if header_info is None:
                # Without the column header the variant fields cannot be named.
                LOG.warning("Found a variant line before the vcf header line")
                context.abort()

            variant_info = dict(zip(header_info, line.split('\t')))
            variant_info['alts'] = variant_info['ALT'].split(',')
            variant_info['info_dict'] = get_info_dict(variant_info['INFO'])
            repeat_data = get_repeat_info(variant_info, repeat_information)

            if repeat_data:
                info_dict = variant_info['info_dict']
                info_dict['STR_STATUS'] = repeat_data['repeat_strings']
                info_dict['STR_NORMAL_MAX'] = str(repeat_data['lower'])
                info_dict['STR_PATHOLOGIC_MIN'] = str(repeat_data['upper'])
                info_dict['RankScore'] = ':'.join(
                    [str(family_id), str(repeat_data['rank_score'])])

                # Only copy over optional annotations that carry a truthy value.
                for annotate_repeat_key in ANNOTATE_REPEAT_KEYS:
                    if repeat_data.get(annotate_repeat_key):
                        info_dict[annotate_repeat_key] = str(
                            repeat_data[annotate_repeat_key])

            click.echo(get_variant_line(variant_info, header_info))
    finally:
        # Release the (possibly gzip-backed) handle even if parsing fails.
        vcf_handle.close()
def cli(context, vcf, repeats_file, loglevel):
    """Annotate str variants with str status

    Reads repeat definitions from ``repeats_file``, then streams ``vcf`` line
    by line to stdout (via ``click.echo``): ``##`` meta lines are passed
    through, the stranger INFO definitions are inserted right before the
    column header line, and each variant line is re-emitted with the
    annotations returned by ``get_repeat_info`` merged into its INFO field.

    Args:
        context: object exposing ``abort()``; called when no repeat info
            could be parsed or when a variant precedes the column header.
        vcf: path to the VCF file; read through ``gzip`` when the name ends
            with ``.gz``.
        repeats_file: path to the repeats file; parsed as ``json`` when the
            name ends with ``.json``, otherwise as ``tsv``.
        loglevel: level passed to ``coloredlogs.install``.
    """
    coloredlogs.install(level=loglevel)
    LOG.info("Running stranger version %s", __version__)

    repeat_information = None
    repeats_file_type = 'tsv'
    if repeats_file.endswith('.json'):
        repeats_file_type = 'json'
    LOG.info("Parsing repeats file %s", repeats_file)

    with open(repeats_file, 'r') as file_handle:
        repeat_information = parse_repeat_file(file_handle, repeats_file_type)

    if not repeat_information:
        LOG.warning("Could not find any repeat info")
        context.abort()

    header_definitions = [
        {
            'id': 'STR_STATUS',
            'num': 'A',
            'type': 'String',
            'desc': 'Repeat expansion status. Alternatives in [normal, pre_mutation, full_mutation]'
        },
        {
            'id': 'STR_NORMAL_MAX',
            'num': '1',
            'type': 'Integer',
            'desc': 'Max number of repeats allowed to call as normal'
        },
        {
            'id': 'STR_FULLMUT_MIN',
            'num': '1',
            'type': 'Integer',
            'desc': 'Min number of repeats required to call as full mutation'
        },
    ]

    stranger_headers = [
        '##INFO=<ID={0},Number={1},Type={2},Description="{3}">'.format(
            hdef.get('id'), hdef.get('num'), hdef.get('type'),
            hdef.get('desc'))
        for hdef in header_definitions
    ]

    if vcf.endswith('.gz'):
        LOG.info("Vcf is zipped")
        vcf_handle = getreader('utf-8')(gzip.open(vcf), errors='replace')
    else:
        vcf_handle = open(vcf, mode='r', encoding='utf-8', errors='replace')

    LOG.info("Parsing variants from %s", vcf)
    # Column names from the "#CHROM ..." line; unknown until that line is seen.
    header_info = None
    try:
        for line in vcf_handle:
            line = line.rstrip()
            if line.startswith('#'):
                if line.startswith('##'):
                    click.echo(line)
                    continue
                # Print the new header lines describing stranger annotation
                for header in stranger_headers:
                    click.echo(header)
                # Print the vcf header line
                header_info = line[1:].split('\t')
                click.echo(line)
                continue

            if header_info is None:
                # Without the column header the variant fields cannot be named.
                LOG.warning("Found a variant line before the vcf header line")
                context.abort()

            variant_info = dict(zip(header_info, line.split('\t')))
            variant_info['alts'] = variant_info['ALT'].split(',')
            variant_info['info_dict'] = get_info_dict(variant_info['INFO'])
            repeat_data = get_repeat_info(variant_info, repeat_information)

            if repeat_data:
                info_dict = variant_info['info_dict']
                info_dict['STR_STATUS'] = repeat_data['repeat_strings']
                info_dict['STR_NORMAL_MAX'] = str(repeat_data['lower'])
                info_dict['STR_FULLMUT_MIN'] = str(repeat_data['upper'])

            click.echo(get_variant_line(variant_info, header_info))
    finally:
        # Release the (possibly gzip-backed) handle even if parsing fails.
        vcf_handle.close()