def get_header(vcf_file_path):
    """Parse the header of a vcf file and return a header object.

    Lines are read until the first line that does not start with '#'.

    Args:
        vcf_file_path(str): Path to vcf

    Returns:
        head: A HeaderParser object
    """
    logger.info("Parsing header of file {0}".format(vcf_file_path))
    head = HeaderParser()
    handle = get_vcf_handle(infile=vcf_file_path)
    # Close the handle even when a header line fails to parse, otherwise the
    # open file leaks on the error path.
    try:
        for line in handle:
            line = line.rstrip()
            if not line.startswith('#'):
                # First non header line: the header is complete
                break
            if line.startswith('##'):
                head.parse_meta_data(line)
            else:
                head.parse_header_line(line)
    finally:
        handle.close()

    return head
def cli(ctx, variant_file, family_file, family_type, gq_treshold, to_json,
        outfile, verbose):
    """Check for pedigree inconsistensies."""
    # NOTE(review): the docstring is left untouched because this looks like a
    # command line entry point whose docstring may be displayed as help text;
    # confirm against the decorators above this definition.
    #
    # The function parses the family file, reads the vcf header from
    # ``variant_file``, runs check_individuals() on the individuals from both
    # sources and stores what later steps need as attributes on ``ctx``.

    # configure root logger to print to STDERR
    loglevel = LEVELS.get(min(verbose, 3))
    configure_stream(level=loglevel)

    if not family_file:
        logger.error("Please provide a family file with -f/--family_file")
        logger.info("Exiting")
        sys.exit(1)

    logger.info("Setting up a family parser")
    family_parser = FamilyParser(family_file, family_type)
    logger.debug("Family parser done")

    # The individuals in the ped file must be present in the variant file:
    families = family_parser.families
    logger.info("Families used in analysis: {0}".format(
        ','.join(list(families.keys()))))

    ctx.gq_treshold = gq_treshold
    ctx.to_json = to_json
    ctx.outfile = outfile
    ctx.families = families
    ctx.individuals = family_parser.individuals

    head = HeaderParser()
    # Only a line that really is the first non header line may be pushed
    # back. Tracking it separately avoids re-adding the last header line when
    # the file has no variants, and avoids an unbound name on empty input.
    first_variant = None
    for line in variant_file:
        line = line.rstrip()
        if not line.startswith('#'):
            first_variant = line
            break
        if line.startswith('##'):
            head.parse_meta_data(line)
        else:
            head.parse_header_line(line)

    # Add the first variant to the iterator
    if first_variant:
        variant_file = itertools.chain([first_variant], variant_file)

    try:
        check_individuals(family_parser.individuals, head.individuals)
    except IOError as e:
        logger.error(e)
        logger.info("Individuals in PED file: {0}".format(
            ', '.join(family_parser.individuals)))
        # The vcf individuals come from the parsed header
        logger.info("Individuals in VCF file: {0}".format(
            ', '.join(head.individuals)))
        logger.info("Exiting...")
        ctx.abort()

    ctx.variant_file = variant_file
    ctx.header_line = head.header
def get_header(vcf_lines):
    """Build a header object from the header lines found in vcf lines.

    Every line starting with '##' is handed to the meta data parser, every
    other line starting with '#' to the header line parser. Lines that do not
    start with '#' are skipped.
    """
    header = HeaderParser()
    for vcf_line in vcf_lines:
        if not vcf_line.startswith('#'):
            continue
        if vcf_line.startswith('##'):
            header.parse_meta_data(vcf_line)
        else:
            header.parse_header_line(vcf_line)
    return header
def test_parse_vcf_lines():
    """
    Test how the header parser behaves with simple vcf lines
    """
    header_parser = HeaderParser()

    # Long descriptions are split over several literals on purpose (explicit
    # line continuation). Every header line must end with a comma, otherwise
    # python silently glues it to the next line.
    header_lines = [
        '##fileformat=VCFv4.2',
        '##FILTER=<ID=LowQual,Description="Low quality">',
        '##INFO=<ID=MQ,Number=1,Type=Float,Description="RMS Mapping Quality">',
        '##INFO=<ID=CNT,Number=A,Type=Integer,Description="Number of times '\
        'this allele was found in external db">',
        '##contig=<ID=1,length=249250621,assembly=b37>',
        '##INFO=<ID=DP_HIST,Number=R,Type=String,Description="Histogram for '\
        'DP; Mids: 2.5|7.5|12.5|17.5|22.5|27.5|32.5|37.5|42.5|47.5|52.5|57.5|'\
        '62.5|67.5|72.5|77.5|82.5|87.5|92.5|97.5">',
        '##FORMAT=<ID=AD,Number=.,Type=Integer,Description="Allelic depths for'\
        ' the ref and alt alleles in the order listed">',
        '##FORMAT=<ID=DP,Number=1,Type=Integer,Description="Read Depth">',
        '##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">',
        # The comma after this line was missing, which merged the GQ line
        # with the reference line into one malformed header line.
        '##FORMAT=<ID=GQ,Number=1,Type=String,Description="GenotypeQuality">',
        '##reference=file:///human_g1k_v37.fasta',
        '#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT\tfather\tmother\tproband',
    ]

    for line in header_lines:
        if line.startswith('##'):
            header_parser.parse_meta_data(line)
        elif line.startswith('#'):
            header_parser.parse_header_line(line)

    assert header_parser.fileformat == "VCFv4.2"
    assert header_parser.individuals == ['father', 'mother', 'proband']
    assert header_parser.vep_columns == []

    assert "MQ" in header_parser.extra_info
    assert header_parser.extra_info["MQ"]['Description'] == "RMS Mapping Quality"
    assert header_parser.extra_info["CNT"]['Number'] == "A"
    assert header_parser.extra_info["CNT"]['Type'] == "Integer"
    assert "CNT" in header_parser.extra_info
    assert "DP_HIST" in header_parser.extra_info

    assert "LowQual" in header_parser.filter_dict
    assert "1" in header_parser.contig_dict

    assert header_parser.header == [
        'CHROM', 'POS', 'ID', 'REF', 'ALT', 'QUAL', 'FILTER', 'INFO',
        'FORMAT', 'father', 'mother', 'proband',
    ]
def test_parse_vcf_lines():
    """
    Test how the header parser behaves with simple vcf lines
    """
    header_parser = HeaderParser()

    # Adjacent string literals are concatenated by python, which is used
    # deliberately for the long descriptions below. It also means that a
    # forgotten comma between two header lines merges them without any error.
    header_lines = [
        '##fileformat=VCFv4.2',
        '##FILTER=<ID=LowQual,Description="Low quality">',
        '##INFO=<ID=MQ,Number=1,Type=Float,Description="RMS Mapping Quality">',
        '##INFO=<ID=CNT,Number=A,Type=Integer,Description="Number of times '\
        'this allele was found in external db">',
        '##contig=<ID=1,length=249250621,assembly=b37>',
        '##INFO=<ID=DP_HIST,Number=R,Type=String,Description="Histogram for '\
        'DP; Mids: 2.5|7.5|12.5|17.5|22.5|27.5|32.5|37.5|42.5|47.5|52.5|57.5|'\
        '62.5|67.5|72.5|77.5|82.5|87.5|92.5|97.5">',
        '##FORMAT=<ID=AD,Number=.,Type=Integer,Description="Allelic depths for'\
        ' the ref and alt alleles in the order listed">',
        '##FORMAT=<ID=DP,Number=1,Type=Integer,Description="Read Depth">',
        '##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">',
        # Trailing comma added here: without it the GQ line and the reference
        # line were parsed as one single header line.
        '##FORMAT=<ID=GQ,Number=1,Type=String,Description="GenotypeQuality">',
        '##reference=file:///human_g1k_v37.fasta',
        '#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT\tfather\tmother\tproband',
    ]

    for line in header_lines:
        if line.startswith('##'):
            header_parser.parse_meta_data(line)
        elif line.startswith('#'):
            header_parser.parse_header_line(line)

    assert header_parser.fileformat == "VCFv4.2"
    assert header_parser.individuals == ['father','mother','proband']
    assert header_parser.vep_columns == []

    assert "MQ" in header_parser.extra_info
    assert header_parser.extra_info["MQ"]['Description'] == "RMS Mapping Quality"
    assert header_parser.extra_info["CNT"]['Number'] == "A"
    assert header_parser.extra_info["CNT"]['Type'] == "Integer"
    assert "CNT" in header_parser.extra_info
    assert "DP_HIST" in header_parser.extra_info

    assert "LowQual" in header_parser.filter_dict
    assert "1" in header_parser.contig_dict

    assert header_parser.header == [
        'CHROM','POS','ID','REF','ALT','QUAL','FILTER','INFO','FORMAT',
        'father','mother','proband',
    ]
def _formated_variants(self, raw_variants, case_obj):
    """Yield variant objects built from raw variant lines.

    The vcf header is read from the file that ``case_obj.variant_source``
    points to. Every line of ``raw_variants`` that does not start with '#'
    is turned into a ``Variant`` to which coordinates, frequencies, scores,
    genotype calls, transcripts, genes and compounds are added.

    Args:
        raw_variants (Iterable): An iterable with variant lines
        case_obj (puzzle.models.Case): A case object

    Yields:
        variant (Variant): One variant object per variant line
    """
    vcf_file_path = case_obj.variant_source

    logger.info("Parsing file {0}".format(vcf_file_path))
    head = HeaderParser()
    handle = get_vcf_handle(infile=vcf_file_path)
    # Parse the header. The handle is closed in ``finally`` so that it does
    # not leak when a header line can not be parsed.
    try:
        for line in handle:
            line = line.rstrip()
            if not line.startswith("#"):
                break
            if line.startswith("##"):
                head.parse_meta_data(line)
            else:
                head.parse_header_line(line)
    finally:
        handle.close()

    header_line = head.header

    # Get the individual ids for individuals in vcf file
    vcf_individuals = set(head.individuals)

    variant_columns = ["CHROM", "POS", "ID", "REF", "ALT", "QUAL", "FILTER"]

    vep_header = head.vep_columns
    snpeff_header = head.snpeff_columns

    index = 0
    for variant_line in raw_variants:
        if variant_line.startswith("#"):
            continue
        index += 1

        # Create a variant dict:
        variant_dict = get_variant_dict(variant_line=variant_line,
                                        header_line=header_line)
        # NOTE: lstrip removes any leading run of these characters, it is
        # not a prefix removal.
        variant_dict["CHROM"] = variant_dict["CHROM"].lstrip("chrCHR")

        # Create a info dict:
        info_dict = get_info_dict(info_line=variant_dict["INFO"])

        # Check if vep annotation:
        vep_string = info_dict.get("CSQ")
        # Check if snpeff annotation:
        snpeff_string = info_dict.get("ANN")

        if vep_string:
            # Get the vep annotations
            vep_info = get_vep_info(vep_string=vep_string,
                                    vep_header=vep_header)
        elif snpeff_string:
            # Get the snpeff annotations
            snpeff_info = get_snpeff_info(snpeff_string=snpeff_string,
                                          snpeff_header=snpeff_header)

        variant = Variant(**{column: variant_dict.get(column, ".")
                             for column in variant_columns})

        logger.debug("Creating a variant object of variant {0}".format(
            variant.get("variant_id")))

        variant["index"] = index
        logger.debug("Updating index to: {0}".format(index))

        variant["start"] = int(variant_dict["POS"])

        if self.variant_type == "sv":
            other_chrom = variant["CHROM"]
            # If we have a translocation:
            if ":" in variant_dict["ALT"] and "<" not in variant_dict["ALT"]:
                other_coordinates = variant_dict["ALT"].strip("ACGTN[]").split(":")
                other_chrom = other_coordinates[0].lstrip("chrCHR")
                other_position = other_coordinates[1]
                variant["stop"] = other_position

                # Set 'infinity' to length if translocation
                variant["sv_len"] = float("inf")
            else:
                variant["stop"] = int(info_dict.get("END", variant_dict["POS"]))
                variant["sv_len"] = variant["stop"] - variant["start"]

            variant["stop_chrom"] = other_chrom
        else:
            variant["stop"] = int(variant_dict["POS"]) + (
                len(variant_dict["REF"]) - len(variant_dict["ALT"]))

        variant["sv_type"] = info_dict.get("SVTYPE")
        variant["cytoband_start"] = get_cytoband_coord(
            chrom=variant["CHROM"], pos=variant["start"])
        if variant.get("stop_chrom"):
            variant["cytoband_stop"] = get_cytoband_coord(
                chrom=variant["stop_chrom"], pos=variant["stop"])

        # It would be easy to update these keys...
        thousand_g = info_dict.get("1000GAF")
        if thousand_g:
            logger.debug("Updating thousand_g to: {0}".format(thousand_g))
            variant["thousand_g"] = float(thousand_g)
            variant.add_frequency("1000GAF", variant.get("thousand_g"))

        # SV specific tag for number of occurances
        occurances = info_dict.get("OCC")
        if occurances:
            logger.debug("Updating occurances to: {0}".format(occurances))
            variant["occurances"] = float(occurances)
            variant.add_frequency("OCC", occurances)

        cadd_score = info_dict.get("CADD")
        if cadd_score:
            logger.debug("Updating cadd_score to: {0}".format(cadd_score))
            variant["cadd_score"] = float(cadd_score)

        rank_score_entry = info_dict.get("RankScore")
        if rank_score_entry:
            # The score of the last family annotation is the one that is kept
            for family_annotation in rank_score_entry.split(","):
                rank_score = family_annotation.split(":")[-1]
            logger.debug("Updating rank_score to: {0}".format(rank_score))
            variant["rank_score"] = float(rank_score)

        genetic_models_entry = info_dict.get("GeneticModels")
        if genetic_models_entry:
            genetic_models = []
            for family_annotation in genetic_models_entry.split(","):
                genetic_models.extend(
                    family_annotation.split(":")[-1].split("|"))
            # Log the models themselves. The previous message formatted
            # ``rank_score``, which is unbound when the variant carries
            # GeneticModels but no RankScore.
            logger.debug("Updating genetic_models to: {0}".format(
                genetic_models))
            variant["genetic_models"] = genetic_models

        # Add genotype calls:
        for individual in case_obj.individuals:
            sample_id = individual.ind_id
            if sample_id not in vcf_individuals:
                continue
            raw_call = dict(zip(variant_dict["FORMAT"].split(":"),
                                variant_dict[sample_id].split(":")))
            # AD may hold a single value (e.g. '.'); fall back to the same
            # empty value that a missing AD gives instead of raising
            # IndexError.
            allele_depths = raw_call.get("AD", ",").split(",")
            ref_depth = allele_depths[0]
            alt_depth = allele_depths[1] if len(allele_depths) > 1 else ""
            variant.add_individual(
                Genotype(
                    sample_id=sample_id,
                    genotype=raw_call.get("GT", "./."),
                    case_id=individual.case_name,
                    phenotype=individual.phenotype,
                    ref_depth=ref_depth,
                    alt_depth=alt_depth,
                    genotype_quality=raw_call.get("GQ", "."),
                    depth=raw_call.get("DP", "."),
                    supporting_evidence=raw_call.get("SU", "0"),
                    pe_support=raw_call.get("PE", "0"),
                    sr_support=raw_call.get("SR", "0"),
                )
            )

        # Add transcript information:
        gmaf = None
        if vep_string:
            for transcript_info in vep_info:
                transcript = self._get_vep_transcripts(transcript_info)
                gmaf_raw = transcript_info.get("GMAF")
                if gmaf_raw:
                    gmaf = float(gmaf_raw.split(":")[-1])
                variant.add_transcript(transcript)

            if gmaf:
                variant.add_frequency("GMAF", gmaf)
                # Use GMAF as the 1000G frequency when none was annotated
                if not variant.thousand_g:
                    variant.thousand_g = gmaf

        elif snpeff_string:
            for transcript_info in snpeff_info:
                transcript = self._get_snpeff_transcripts(transcript_info)
                variant.add_transcript(transcript)

        variant["most_severe_consequence"] = get_most_severe_consequence(
            variant["transcripts"])

        for gene in self._get_genes(variant):
            variant.add_gene(gene)

        self._add_compounds(variant=variant, info_dict=info_dict)

        yield variant
def get_individuals(vcf=None, case_lines=None, case_type='ped'):
    """Get the individuals from a vcf file, and/or a ped file.

    If ``case_lines`` is given the individuals are read from the ped like
    lines. Otherwise, if ``vcf`` is given, the individuals are read from the
    vcf header. If neither is given an empty list is returned.

        Args:
            vcf (str): Path to a vcf
            case_lines(Iterable): Ped like lines
            case_type(str): Format of ped lines

        Returns:
            individuals (list): list with Individuals

        Raises:
            IOError: If the ped like lines do not describe exactly one family
    """
    individuals = []

    if case_lines:
        # read individuals from ped file
        family_parser = FamilyParser(case_lines, family_type=case_type)
        families = family_parser.families
        logger.info("Found families {0}".format(
            ','.join(list(families.keys()))))
        if len(families) != 1:
            message = "Only one family can be used with vcf adapter"
            logger.error(message)
            raise IOError(message)

        case_id = list(families.keys())[0]
        logger.info("Family used in analysis: {0}".format(case_id))

        for ind_id in family_parser.individuals:
            ind = family_parser.individuals[ind_id]
            logger.info("Found individual {0}".format(ind.individual_id))
            individual = Individual(
                ind_id=ind.individual_id,
                case_id=case_id,
                mother=ind.mother,
                father=ind.father,
                sex=str(ind.sex),
                phenotype=str(ind.phenotype),
                variant_source=vcf,
            )
            individuals.append(individual)

    elif vcf:
        # read individuals from vcf file
        case_id = os.path.basename(vcf)
        head = HeaderParser()
        handle = get_vcf_handle(infile=vcf)
        # Make sure the handle is released, also when a header line can not
        # be parsed.
        try:
            for line in handle:
                line = line.rstrip()
                if not line.startswith('#'):
                    break
                if line.startswith('##'):
                    head.parse_meta_data(line)
                else:
                    head.parse_header_line(line)
        finally:
            handle.close()

        for ind in head.individuals:
            # If we only have a vcf file we can not get metadata about the
            # individuals
            individual = Individual(
                ind_id=ind,
                case_id=case_id,
                variant_source=vcf,
            )
            individuals.append(individual)
            logger.debug("Found individual {0} in {1}".format(ind, vcf))

    return individuals
def _formated_variants(self, raw_variants, case_obj):
    """Yield variant objects built from raw variant lines.

    The vcf header is read from the file that ``case_obj.variant_source``
    points to. Every line of ``raw_variants`` that does not start with '#'
    is turned into a ``Variant`` to which coordinates, frequencies, scores,
    genotype calls, transcripts, genes and compounds are added.

    Args:
        raw_variants (Iterable): An iterable with variant lines
        case_obj (puzzle.models.Case): A case object

    Yields:
        variant (Variant): One variant object per variant line
    """
    vcf_file_path = case_obj.variant_source

    logger.info("Parsing file {0}".format(vcf_file_path))
    head = HeaderParser()
    handle = get_vcf_handle(infile=vcf_file_path)
    # Parse the header. The handle is closed in ``finally`` so that it does
    # not leak when a header line can not be parsed.
    try:
        for line in handle:
            line = line.rstrip()
            if not line.startswith('#'):
                break
            if line.startswith('##'):
                head.parse_meta_data(line)
            else:
                head.parse_header_line(line)
    finally:
        handle.close()

    header_line = head.header

    # Get the individual ids for individuals in vcf file
    vcf_individuals = set(head.individuals)

    variant_columns = ['CHROM', 'POS', 'ID', 'REF', 'ALT', 'QUAL', 'FILTER']

    vep_header = head.vep_columns
    snpeff_header = head.snpeff_columns

    index = 0
    for variant_line in raw_variants:
        if variant_line.startswith('#'):
            continue
        index += 1

        # Create a variant dict:
        variant_dict = get_variant_dict(
            variant_line=variant_line,
            header_line=header_line
        )
        # NOTE: lstrip removes any leading run of these characters, it is
        # not a prefix removal.
        variant_dict['CHROM'] = variant_dict['CHROM'].lstrip('chrCHR')

        # Create a info dict:
        info_dict = get_info_dict(
            info_line=variant_dict['INFO']
        )

        # Check if vep annotation:
        vep_string = info_dict.get('CSQ')
        # Check if snpeff annotation:
        snpeff_string = info_dict.get('ANN')

        if vep_string:
            # Get the vep annotations
            vep_info = get_vep_info(
                vep_string=vep_string,
                vep_header=vep_header
            )
        elif snpeff_string:
            # Get the snpeff annotations
            snpeff_info = get_snpeff_info(
                snpeff_string=snpeff_string,
                snpeff_header=snpeff_header
            )

        variant = Variant(
            **{column: variant_dict.get(column, '.')
               for column in variant_columns}
        )

        logger.debug("Creating a variant object of variant {0}".format(
            variant.get('variant_id')))

        variant['index'] = index
        logger.debug("Updating index to: {0}".format(index))

        variant['start'] = int(variant_dict['POS'])

        if self.variant_type == 'sv':
            other_chrom = variant['CHROM']
            # If we have a translocation:
            if ':' in variant_dict['ALT']:
                other_coordinates = variant_dict['ALT'].strip('ACGTN[]').split(':')
                other_chrom = other_coordinates[0].lstrip('chrCHR')
                other_position = other_coordinates[1]
                variant['stop'] = other_position

                # Set 'infinity' to length if translocation
                variant['sv_len'] = float('inf')
            else:
                variant['stop'] = int(info_dict.get('END', variant_dict['POS']))
                variant['sv_len'] = variant['stop'] - variant['start']

            variant['stop_chrom'] = other_chrom
        else:
            variant['stop'] = int(variant_dict['POS']) + (
                len(variant_dict['REF']) - len(variant_dict['ALT']))

        variant['sv_type'] = info_dict.get('SVTYPE')
        variant['cytoband_start'] = get_cytoband_coord(
            chrom=variant['CHROM'], pos=variant['start'])
        if variant.get('stop_chrom'):
            variant['cytoband_stop'] = get_cytoband_coord(
                chrom=variant['stop_chrom'], pos=variant['stop'])

        # It would be easy to update these keys...
        thousand_g = info_dict.get('1000GAF')
        if thousand_g:
            logger.debug("Updating thousand_g to: {0}".format(thousand_g))
            variant['thousand_g'] = float(thousand_g)
            variant.add_frequency('1000GAF', variant.get('thousand_g'))

        # SV specific tag for number of occurances
        occurances = info_dict.get('OCC')
        if occurances:
            logger.debug("Updating occurances to: {0}".format(occurances))
            variant['occurances'] = float(occurances)
            variant.add_frequency('OCC', occurances)

        cadd_score = info_dict.get('CADD')
        if cadd_score:
            logger.debug("Updating cadd_score to: {0}".format(cadd_score))
            variant['cadd_score'] = float(cadd_score)

        rank_score_entry = info_dict.get('RankScore')
        if rank_score_entry:
            # The score of the last family annotation is the one that is kept
            for family_annotation in rank_score_entry.split(','):
                rank_score = family_annotation.split(':')[-1]
            logger.debug("Updating rank_score to: {0}".format(rank_score))
            variant['rank_score'] = float(rank_score)

        genetic_models_entry = info_dict.get('GeneticModels')
        if genetic_models_entry:
            genetic_models = []
            for family_annotation in genetic_models_entry.split(','):
                genetic_models.extend(
                    family_annotation.split(':')[-1].split('|'))
            # Log the models themselves. The previous message formatted
            # ``rank_score``, which is unbound when the variant carries
            # GeneticModels but no RankScore.
            logger.debug("Updating genetic_models to: {0}".format(
                genetic_models))
            variant['genetic_models'] = genetic_models

        # Add genotype calls:
        for individual in case_obj.individuals:
            sample_id = individual.ind_id
            if sample_id not in vcf_individuals:
                continue
            raw_call = dict(zip(
                variant_dict['FORMAT'].split(':'),
                variant_dict[sample_id].split(':'))
            )
            # AD may hold a single value (e.g. '.'); fall back to the same
            # empty value that a missing AD gives instead of raising
            # IndexError.
            allele_depths = raw_call.get('AD', ',').split(',')
            ref_depth = allele_depths[0]
            alt_depth = allele_depths[1] if len(allele_depths) > 1 else ''
            variant.add_individual(Genotype(
                sample_id=sample_id,
                genotype=raw_call.get('GT', './.'),
                case_id=individual.case_name,
                phenotype=individual.phenotype,
                ref_depth=ref_depth,
                alt_depth=alt_depth,
                genotype_quality=raw_call.get('GQ', '.'),
                depth=raw_call.get('DP', '.'),
                supporting_evidence=raw_call.get('SU', '0'),
                pe_support=raw_call.get('PE', '0'),
                sr_support=raw_call.get('SR', '0'),
            ))

        # Add transcript information:
        if vep_string:
            for transcript in self._get_vep_transcripts(variant, vep_info):
                variant.add_transcript(transcript)
        elif snpeff_string:
            for transcript in self._get_snpeff_transcripts(variant, snpeff_info):
                variant.add_transcript(transcript)

        variant['most_severe_consequence'] = get_most_severe_consequence(
            variant['transcripts']
        )

        for gene in self._get_genes(variant):
            variant.add_gene(gene)

        self._add_compounds(variant=variant, info_dict=info_dict)

        yield variant
def cli(variant_file, thousand_g, exac, treshold, outfile, annotate, keyword,
        verbose, logfile):
    """
    Filter vcf variants based on their frequency.

    One can use different sources by addind --keyword multiple times.
    Variants and frequency sources should be splitted and normalized(with vt).
    """
    # NOTE(review): the docstring wording is left untouched because this looks
    # like a command line entry point whose docstring may be displayed as help
    # text; confirm against the decorators above this definition.
    #
    # For every variant the highest frequency found in the requested sources
    # (INFO keywords, 1000G file, ExAC file) is compared with ``treshold``;
    # the variant is printed only when that frequency is below the treshold.
    loglevel = LEVELS.get(min(verbose, 2), "WARNING")
    init_log(root_logger, logfile, loglevel)
    logger = logging.getLogger(__name__)
    #For testing
    logger = logging.getLogger("filter_variants.cli.root")
    logger.info("Running filter_variants version {0}".format(__version__))

    logger.info("Initializing a Header Parser")
    head = HeaderParser()

    # Only a line that really is the first non header line may be pushed
    # back. Tracking it separately avoids re-adding the last header line when
    # the file has no variants (it would then be parsed as a variant), and
    # avoids an unbound name on empty input.
    first_variant = None
    for line in variant_file:
        line = line.rstrip()
        if not line.startswith('#'):
            first_variant = line
            break
        if line.startswith('##'):
            head.parse_meta_data(line)
        else:
            head.parse_header_line(line)

    if first_variant:
        variant_file = itertools.chain([first_variant], variant_file)

    if thousand_g:
        logger.info("Opening 1000G frequency file with tabix open")
        try:
            thousand_g_handle = get_tabix_handle(thousand_g)
        except OSError as e:
            # Exceptions have no ``message`` attribute on python 3, log the
            # exception itself.
            logger.critical(e)
            logger.info("Exiting")
            sys.exit(1)
        logger.debug("1000G frequency file opened")
        if annotate:
            head.add_info(
                "1000GAF",
                "1",
                'Float',
                "Frequency in the 1000G database."
            )

    if exac:
        logger.info("Opening ExAC frequency file with tabix open")
        try:
            exac_handle = get_tabix_handle(exac)
        except OSError as e:
            # See above: ``e.message`` does not exist on python 3
            logger.critical(e)
            logger.info("Exiting")
            sys.exit(1)
        logger.debug("ExAC frequency file opened")
        if annotate:
            head.add_info(
                "ExACAF",
                "1",
                'Float',
                "Frequency in the ExAC database."
            )

    # One plugin per INFO keyword that should be used as frequency source
    plugins = []
    for key in keyword:
        if key not in head.info_dict:
            logger.error("{0} is not defined in vcf header.".format(key))
            logger.info("Exiting")
            sys.exit(1)
        plugins.append(Plugin(
            name=key,
            field='INFO',
            data_type='float',
            separators=[','],
            info_key=key,
            record_rule='max',
        ))

    print_headers(head, outfile)

    for line in variant_file:
        max_freq = 0
        line = line.rstrip()
        variant_line = line.split('\t')
        chrom = variant_line[0].strip('chr')
        position = int(variant_line[1])
        ref = variant_line[3]
        alternative = variant_line[4]
        logger.debug("Checking variant {0}".format(
            '_'.join([chrom, str(position), ref, alternative])
        ))

        for plugin in plugins:
            logger.debug("Getting frequency for {0}".format(plugin.name))
            frequency = plugin.get_value(variant_line=line)
            logger.debug("Found frequency {0}".format(frequency))
            if frequency:
                if float(frequency) > max_freq:
                    logger.debug("Updating max freq")
                    max_freq = float(frequency)

        if thousand_g:
            logger.debug("Getting thousand g frequency")
            frequency = get_frequency(
                chrom=chrom,
                pos=position,
                alt=alternative,
                tabix_reader=thousand_g_handle
            )
            logger.debug("Found frequency {0}".format(frequency))
            if frequency:
                if annotate:
                    line = add_vcf_info(
                        keyword='1000GAF',
                        variant_line=line,
                        annotation=frequency
                    )
                if float(frequency) > max_freq:
                    logger.debug("Updating max freq")
                    max_freq = float(frequency)

        if exac:
            logger.debug("Getting ExAC frequency")
            frequency = get_frequency(
                chrom=chrom,
                pos=position,
                alt=alternative,
                tabix_reader=exac_handle
            )
            logger.debug("Found frequency {0}".format(frequency))
            if frequency:
                if annotate:
                    line = add_vcf_info(
                        keyword='ExACAF',
                        variant_line=line,
                        annotation=frequency
                    )
                if float(frequency) > max_freq:
                    logger.debug("Updating max freq")
                    max_freq = float(frequency)

        if max_freq < treshold:
            print_variant(line, outfile)
        else:
            logger.debug("Frequency {0} is higher than treshold"\
            " {1}. Skip printing variant".format(max_freq, treshold))