def test_variant_printer():
    """Test the variant printer"""
    vcf_file = setup_vcf_file()
    variant_queue = Manager().Queue()
    head = HeaderParser()

    outfile = NamedTemporaryFile(mode='w+t', delete=False, suffix='.vcf')
    outfile.close()

    variant_printer = VariantPrinter(
        task_queue=variant_queue,
        head=head,
        mode='chromosome',
        outfile=outfile.name
    )
    variant_printer.start()

    batch = OrderedDict()

    for line in open(vcf_file):
        line = line.rstrip()
        if line.startswith('#'):
            if line.startswith('##'):
                head.parse_meta_data(line)
            else:
                head.parse_header_line(line)
        else:
            variant_dict = get_variant_dict(line, head.header)
            variant_id = get_variant_id(variant_dict)
            variant_dict['variant_id'] = variant_id
            variant_dict['info_dict'] = get_info_dict(variant_dict['INFO'])
            variant_queue.put(variant_dict)

    variant_queue.put(None)
    variant_printer.join()

    variants = []
    # open() takes encoding as a keyword argument, not the third positional one
    with open(outfile.name, 'r', encoding='utf-8-sig') as f:
        for line in f:
            variants.append(line.rstrip().split('\t'))

    assert variants[0][0] == '1'
    assert variants[0][2] == '11900'
def annotate(context, variant_file, annotate_regions, region_file, cadd_file,
             thousand_g, exac, spidex, outfile, silent, cadd_raw, cosmic,
             max_af, temp_dir, genome_build):
    """
    Annotate vcf variants.

    Annotate variants with a number of different sources.
    Please use --help for more info.
    """
    regions = annotate_regions
    logger.info("Running genmod annotate_variant version {0}".format(__version__))

    if not region_file:
        if genome_build == '37':
            region_file = ensembl_path_37
        elif genome_build == '38':
            region_file = ensembl_path_38

    start_time_analysis = datetime.now()
    annotation_arguments = {}

    variants = get_file_handle(variant_file)

    logger.info("Initializing a Header Parser")
    head = HeaderParser()

    line = None
    for line in variants:
        line = line.rstrip()
        if line.startswith('#'):
            if line.startswith('##'):
                head.parse_meta_data(line)
            else:
                head.parse_header_line(line)
        else:
            break

    # Add the first variant back to the iterator.
    # If the vcf has no variants the last line will be a header.
    if not line.startswith('#'):
        variants = itertools.chain([line], variants)
    else:
        print_headers(head, outfile, silent)
        sys.exit(0)

    header_line = head.header
    annotation_arguments['header_line'] = header_line

    try:
        if regions:
            logger.info("Loading annotations")
            logger.info("Use annotations file: {0}".format(region_file))
            add_regions(head)
            regions_handle = get_file_handle(region_file)
            logger.debug("Adding region trees to arguments")
            annotation_arguments['region_trees'] = build_region_trees(
                regions_handle, padding=4000)

        if exac:
            logger.info("Annotating ExAC frequencies")
            logger.debug("Using ExAC file: {0}".format(exac))
            annotation_arguments['exac'] = get_tabixhandle(exac)
            add_exac(head)

        if thousand_g:
            logger.info("Annotating 1000G frequencies")
            logger.debug("Using 1000G file: {0}".format(thousand_g))
            annotation_arguments['thousand_g'] = get_tabixhandle(thousand_g)
            add_thousandg(head)

        if spidex:
            logger.info("Annotating Spidex z scores")
            logger.debug("Using Spidex file: {0}".format(spidex))
            annotation_arguments['spidex'] = get_tabixhandle(spidex)
            add_spidex(head)

        if cadd_file:
            logger.info("Annotating CADD scores")
            logger.debug("Using CADD file(s): {0}".format(', '.join(cadd_file)))
            annotation_arguments['cadd_files'] = [
                get_tabixhandle(cadd) for cadd in cadd_file
            ]
            add_cadd(head)

        if cadd_raw:
            annotation_arguments['cadd_raw'] = cadd_raw
            add_cadd_raw(head)

        if max_af:
            annotation_arguments['max_af'] = max_af
            if thousand_g:
                add_thousandg_max(head)
            if exac:
                add_exac_max(head)

        if cosmic:
            logger.info("Annotating if variant is in COSMIC")
            logger.debug("Using COSMIC file: {0}".format(cosmic))
            annotation_arguments['cosmic'] = get_tabixhandle(cosmic)
            add_cosmic(head)

    except TabixError as err:
        logger.warning(err)
        context.abort()

    print_headers(head, outfile, silent)

    for variant in variants:
        print_variant(
            variant_line=annotate_variant(variant, annotation_arguments),
            outfile=outfile,
            silent=silent
        )
def models(variant_file, family_file, family_type, reduced_penetrance, vep,
           keyword, phased, strict, silent, processes, whole_gene, outfile,
           temp_dir):
    """
    Annotate genetic models for vcf variants.

    Checks what patterns of inheritance are followed in a VCF file.
    The analysis is family based, so each family that is specified in the
    family file and exists in the variant file will get its own annotation.
    """
    logger = logging.getLogger(__name__)

    ######### This is for logging the command line string #########
    frame = inspect.currentframe()
    args, _, _, values = inspect.getargvalues(frame)
    argument_list = [
        i + '=' + str(values[i]) for i in values
        if values[i] and i not in ['frame']
    ]
    ###########################################################################

    logger.info("Running GENMOD annotate version {0}".format(__version__))
    logger.debug("Arguments: {0}".format(', '.join(argument_list)))

    reduced_penetrance_genes = set()
    nr_reduced_penetrance_genes = 0
    if reduced_penetrance:
        logger.info("Found file with genes that have reduced penetrance")
        for line in reduced_penetrance:
            if not line.startswith('#'):
                nr_reduced_penetrance_genes += 1
                gene_id = line.rstrip().split()[0]
                logger.debug("Adding gene {0} to reduced penetrance genes".format(gene_id))
                reduced_penetrance_genes.add(gene_id)

        logger.info("Found {0} genes with reduced penetrance".format(
            nr_reduced_penetrance_genes))

    if not family_file:
        logger.warning("Please provide a family file with -f/--family_file")
        logger.info("Exiting")
        sys.exit(1)

    logger.info("Setting up a family parser")
    family_parser = FamilyParser(family_file, family_type)
    logger.debug("Family parser done")

    families = {}
    logger.info("Check if the families have any affected")
    for family_id in family_parser.families:
        found_affected = False
        family_obj = family_parser.families[family_id]
        for ind_id in family_obj.individuals:
            ind_obj = family_obj.individuals[ind_id]
            if ind_obj.affected:
                found_affected = True
        if found_affected:
            families[family_id] = family_obj
        else:
            logger.warning("No affected individuals found for family {0}."
                           " Skipping family.".format(family_id))

    if not families:
        logger.warning("Please provide at least one family with affected individuals")
        sys.exit(0)

    # The individuals in the ped file must be present in the variant file:
    logger.info("Families used in analysis: {0}".format(
        ','.join(list(families.keys()))))
    logger.info("Individuals included in analysis: {0}".format(
        ','.join(list(family_parser.individuals.keys()))))

    head = HeaderParser()

    for line in variant_file:
        line = line.rstrip()
        if line.startswith('#'):
            if line.startswith('##'):
                head.parse_meta_data(line)
            else:
                head.parse_header_line(line)
        else:
            break

    # Add the first variant to the iterator
    variant_file = itertools.chain([line], variant_file)

    if vep:
        if not "CSQ" in head.info_dict:
            logger.warning("vep flag is used but there is no CSQ field specified in header")
            logger.info("Please check VCF file")
            logger.info("Exiting...")
            sys.exit(1)
        else:
            logger.info("Using VEP annotation")
    else:
        if not keyword in head.info_dict:
            logger.warning("Annotation key {0} could not be found in VCF header".format(keyword))
            logger.info("Please check VCF file")
            logger.info("Exiting...")
            sys.exit(1)
        else:
            logger.info("Using {0} annotation".format(keyword))

    if "GeneticModels" in head.info_dict:
        logger.warning("Genetic models are already annotated according to vcf"
                       " header.")
        logger.info("Exiting...")
        sys.exit(1)

    logger.info("Adding genmod version to vcf header")
    head.add_version_tracking(
        info_id='genmod',
        version=__version__,
        date=datetime.now().strftime("%Y-%m-%d %H:%M"),
        command_line=' '.join(argument_list)
    )
    logger.debug("Version added")

    logger.info("Adding genetic models to vcf header")
    add_metadata(
        head,
        'info',
        'GeneticModels',
        annotation_number='.',
        entry_type='String',
        description="':'-separated list of genetic models for this variant."
    )
    logger.debug("Genetic models added")

    logger.info("Adding model score to vcf header")
    add_metadata(
        head,
        'info',
        'ModelScore',
        annotation_number='.',
        entry_type='String',
        description="PHRED score for genotype models."
    )
    logger.debug("Model score added")

    logger.info("Adding Compounds to vcf header")
    add_metadata(
        head,
        'info',
        'Compounds',
        annotation_number='.',
        entry_type='String',
        description=("List of compound pairs for this variant. "
                     "The list is split on ',', family id is separated from compounds"
                     " with ':'. Compounds are separated with '|'.")
    )
    logger.debug("Compounds added")

    vcf_individuals = head.individuals
    logger.debug("Individuals found in vcf file: {}".format(', '.join(vcf_individuals)))

    start_time_analysis = datetime.now()

    try:
        check_individuals(family_parser.individuals, vcf_individuals)
    except IOError as e:
        logger.error(e)
        logger.info("Individuals in PED file: {0}".format(
            ', '.join(family_parser.individuals)))
        logger.info("Individuals in VCF file: {0}".format(', '.join(vcf_individuals)))
        logger.info("Exiting...")
        sys.exit(1)

    analysis_individuals = list(family_parser.individuals.keys())
    logger.info("Individuals used in analysis: {0}".format(
        ', '.join(analysis_individuals)))

    ###################################################################
    ### The task queue is where all jobs (in this case batches that ###
    ### represent variants in a region) are put. The consumers will ###
    ### then pick their jobs from this queue.                       ###
    ###################################################################
    logger.debug("Setting up a JoinableQueue for storing variant batches")
    variant_queue = JoinableQueue(maxsize=1000)
    logger.debug("Setting up a Queue for storing results from workers")
    results = Manager().Queue()

    num_model_checkers = processes
    # Adapt the number of processes to the machine that runs the analysis
    logger.info('Number of CPU:s {}'.format(cpu_count()))
    logger.info('Number of model checkers: {}'.format(num_model_checkers))

    # These are the workers that do the heavy part of the analysis
    logger.info('Setting up the workers')
    model_checkers = [
        VariantAnnotator(
            task_queue=variant_queue,
            results_queue=results,
            families=families,
            individuals=analysis_individuals,
            phased=phased,
            strict=strict,
            whole_gene=whole_gene,
            vep=vep,
            reduced_penetrance_genes=reduced_penetrance_genes
        )
        for i in range(num_model_checkers)
    ]

    logger.info('Starting the workers')
    for worker in model_checkers:
        logger.debug('Starting worker {0}'.format(worker))
        worker.start()

    # This process prints the variants to temporary files
    logger.info('Setting up the variant printer')
    if len(model_checkers) == 1:
        print_headers(head=head, outfile=outfile, silent=silent)
        variant_printer = VariantPrinter(
            task_queue=results,
            head=head,
            mode='normal',
            outfile=outfile
        )
    else:
        # We use a temp file to store the processed variants
        logger.debug("Build a tempfile for printing the variants")
        if temp_dir:
            temp_file = NamedTemporaryFile(delete=False, dir=temp_dir)
        else:
            temp_file = NamedTemporaryFile(delete=False)
        temp_file.close()

        variant_printer = VariantPrinter(
            task_queue=results,
            head=head,
            mode='chromosome',
            outfile=temp_file.name
        )

    logger.info('Starting the variant printer process')
    variant_printer.start()

    start_time_variant_parsing = datetime.now()

    # This process parses the original vcf and creates batches to put in the variant queue:
    logger.info('Start parsing the variants')
    chromosome_list = get_batches(
        variants=variant_file,
        batch_queue=variant_queue,
        header=head,
        vep=vep,
        annotation_keyword=keyword
    )

    logger.debug("Put stop signs in the variant queue")
    for i in range(num_model_checkers):
        variant_queue.put(None)

    variant_queue.join()
    results.put(None)
    variant_printer.join()

    if len(model_checkers) > 1:
        sort_variants(infile=temp_file.name, mode='chromosome')

        print_headers(head=head, outfile=outfile, silent=silent)

        with open(temp_file.name, 'r', encoding='utf-8') as f:
            for line in f:
                print_variant(
                    variant_line=line,
                    outfile=outfile,
                    mode='modified',
                    silent=silent
                )

        logger.debug("Removing temp file")
        os.remove(temp_file.name)
        logger.debug("Temp file removed")

    logger.info('Time for whole analysis: {0}'.format(
        str(datetime.now() - start_time_analysis)))
def sort(variant_file, outfile, family_id, silent, position, temp_dir):
    """
    Sort a VCF file based on rank score.
    """
    logger = logging.getLogger(__name__)
    head = HeaderParser()

    logger.info("Running GENMOD sort version {0}".format(__version__))
    start = datetime.now()

    # Create a temporary variant file for sorting
    logger.debug("Creating temporary file for sorting")
    if temp_dir:
        temp_file = NamedTemporaryFile(delete=False, dir=temp_dir)
    else:
        temp_file = NamedTemporaryFile(delete=False)
    temp_file.close()

    # Open the temp file with codecs
    temp_file_handle = open(
        temp_file.name,
        mode='w',
        encoding='utf-8',
        errors='replace'
    )
    logger.debug("Temp file created")

    logger.info("Printing variants to temp file")
    nr_variants = 0
    # Print the variants with rank score in first column
    for line in variant_file:
        line = line.rstrip()
        if line.startswith('#'):
            if line.startswith('##'):
                head.parse_meta_data(line)
            else:
                head.parse_header_line(line)
        else:
            nr_variants += 1
            priority = '0'
            if position:
                chrom = line.split()[0]
                priority = get_chromosome_priority(chrom)
            else:
                priority = get_rank_score(line)

            print_variant(
                variant_line=line,
                priority=priority,
                outfile=temp_file_handle
            )

    temp_file_handle.close()
    logger.info("Variants printed to temp file")
    logger.info("Nr of variants in VCF file: {0}".format(nr_variants))

    sort_mode = 'rank'
    if position:
        sort_mode = 'chromosome'

    logger.info("Sorting variants")
    sort_variants(
        infile=temp_file.name,
        mode=sort_mode
    )
    logger.info("Variants sorted")

    logger.debug("Printing headers")
    print_headers(
        head=head,
        outfile=outfile,
        silent=silent
    )
    logger.debug("Headers printed")

    logger.info("Printing variants")
    with open(temp_file.name, mode='r', encoding='utf-8', errors='replace') as f:
        for variant_line in f:
            print_variant(
                variant_line=variant_line,
                outfile=outfile,
                mode='modified',
                silent=False
            )
    logger.debug("Variants printed")

    logger.info("Removing temp file")
    os.remove(temp_file.name)
    logger.debug("Temp file removed")

    logger.info("Sorting done, time for sorting: {0}".format(datetime.now() - start))
def compound(context, variant_file, silent, outfile, vep, processes, temp_dir):
    """
    Score compound variants in a vcf file based on their rank score.
    """
    logger.info('Running GENMOD score_compounds, version: {0}'.format(__version__))

    variant_file = get_file_handle(variant_file)

    start_time_analysis = datetime.now()

    logger.info("Initializing a Header Parser")
    head = HeaderParser()

    line = None
    for line in variant_file:
        line = line.rstrip()
        if line.startswith('#'):
            if line.startswith('##'):
                head.parse_meta_data(line)
            else:
                head.parse_header_line(line)
        else:
            break
    logger.info("Headers parsed")

    if not line.startswith('#'):
        variant_file = itertools.chain([line], variant_file)
    else:
        print_headers(head=head, outfile=outfile, silent=silent)
        sys.exit(0)

    header_line = head.header
    individuals = head.individuals

    ###################################################################
    ### The task queue is where all jobs (in this case batches that ###
    ### represent variants in a region) are put. The consumers will ###
    ### then pick their jobs from this queue.                       ###
    ###################################################################
    logger.debug("Setting up a JoinableQueue for storing variant batches")
    variant_queue = JoinableQueue(maxsize=1000)
    logger.debug("Setting up a Queue for storing results from workers")
    results = Manager().Queue()

    num_scorers = processes
    # Adapt the number of processes to the machine that runs the analysis
    logger.info('Number of CPU:s {}'.format(cpu_count()))
    logger.info('Number of model checkers: {}'.format(num_scorers))

    # These are the workers that do the heavy part of the analysis
    logger.info('Setting up the workers')
    compound_scorers = [
        CompoundScorer(
            task_queue=variant_queue,
            results_queue=results,
            individuals=individuals,
        )
        for i in range(num_scorers)
    ]

    try:
        logger.info('Starting the workers')
        for worker in compound_scorers:
            logger.debug('Starting worker {0}'.format(worker))
            worker.start()

        # This process prints the variants to temporary files
        logger.info('Setting up the variant printer')

        # We use a temp file to store the processed variants
        logger.debug("Build a tempfile for printing the variants")
        if temp_dir:
            temp_file = NamedTemporaryFile(delete=False, dir=temp_dir)
        else:
            temp_file = NamedTemporaryFile(delete=False)
        temp_file.close()

        variant_printer = VariantPrinter(
            task_queue=results,
            head=head,
            mode='chromosome',
            outfile=temp_file.name
        )

        logger.info('Starting the variant printer process')
        variant_printer.start()

        start_time_variant_parsing = datetime.now()

        # This process parses the original vcf and creates batches to put in the variant queue:
        chromosome_list = get_batches(
            variants=variant_file,
            batch_queue=variant_queue,
            header=head,
            vep=vep,
            results_queue=results
        )

        logger.debug("Put stop signs in the variant queue")
        for i in range(num_scorers):
            variant_queue.put(None)

        variant_queue.join()
        results.put(None)
        variant_printer.join()

        sort_variants(infile=temp_file.name, mode='chromosome')

        print_headers(head=head, outfile=outfile, silent=silent)

        with open(temp_file.name, 'r', encoding='utf-8') as f:
            for line in f:
                print_variant(
                    variant_line=line,
                    outfile=outfile,
                    mode='modified',
                    silent=silent
                )

    except Exception as e:
        logger.warning(e)
        for worker in compound_scorers:
            worker.terminate()
        variant_printer.terminate()
        context.abort()

    finally:
        logger.info("Removing temp file")
        os.remove(temp_file.name)
        logger.debug("Temp file removed")

    logger.info('Time for whole analysis: {0}'.format(
        str(datetime.now() - start_time_analysis)))
def score(variant_file, family_id, family_file, family_type, score_config,
          silent, skip_plugin_check, rank_results, outfile):
    """
    Score variants in a vcf file using a Weighted Sum Model.

    The specific scores should be defined in a config file, see examples on
    github.
    """
    logger = logging.getLogger(__name__)
    logger.info('Running GENMOD score, version: {0}'.format(__version__))

    logger.info("Checking family id")

    if family_file:
        logger.info("Setting up a family parser")
        family_parser = FamilyParser(family_file, family_type)
        logger.debug("Family parser done")
        family_id = list(family_parser.families.keys())[0]

    logger.info("Family used in analysis: {0}".format(family_id))

    ## Check the score config:
    if not score_config:
        logger.warning("Please provide a score config file.")
        logger.info("Exiting")
        sys.exit(1)

    logger.debug("Parsing config file")
    try:
        config_parser = ConfigParser(score_config)
    except ValidateError as e:
        logger.error(e.message)
        logger.info("Exiting")
        sys.exit(1)

    score_categories = list(config_parser.categories.keys())

    logger.debug("Config parsed successfully")

    logger.info("Initializing a Header Parser")
    head = HeaderParser()

    for line in variant_file:
        line = line.rstrip()
        if line.startswith('#'):
            if line.startswith('##'):
                head.parse_meta_data(line)
            else:
                head.parse_header_line(line)
        else:
            break

    logger.info("Check if all score plugins exist in vcf ...")
    if not check_plugins(config_parser, head):
        if not skip_plugin_check:
            logger.error("All score plugins have to be defined in vcf header")
            logger.info("Exiting")
            sys.exit(1)
    else:
        logger.info("All plugins are defined in vcf")

    # Add the first variant to the iterator
    variant_file = itertools.chain([line], variant_file)

    header_line = head.header

    if "RankScore" in head.info_dict:
        logger.warning("Variants already scored according to VCF header")
        logger.info("Please check VCF file")
        logger.info("Exiting...")
        sys.exit(1)

    add_metadata(
        head,
        'info',
        'RankScore',
        annotation_number='.',
        entry_type='String',
        description="The rank score for this variant in this family. family_id:rank_score."
    )

    if rank_results:
        add_metadata(
            head,
            'info',
            'RankResult',
            annotation_number='.',
            entry_type='String',
            description='|'.join(score_categories)
        )

    print_headers(
        head=head,
        outfile=outfile,
        silent=silent
    )

    start_scoring = datetime.now()
    last_twenty = datetime.now()
    nr_of_variants = 1

    for line in variant_file:
        if not line.startswith('#'):
            variant = get_variant_dict(line, header_line)
            variant['info_dict'] = get_info_dict(variant['INFO'])
            rank_score = 0
            # This is for printing results to vcf:
            category_scores = []
            for category in score_categories:
                category_score = get_category_score(variant, category, config_parser)
                logger.debug("Adding category score {0} to rank_score".format(category_score))
                rank_score += category_score
                logger.debug("Updating rank score to {0}".format(rank_score))

                category_scores.append(str(category_score))

            variant = add_vcf_info(
                keyword='RankScore',
                variant_dict=variant,
                annotation="{0}:{1}".format(family_id, rank_score)
            )

            if rank_results:
                variant = add_vcf_info(
                    keyword='RankResult',
                    variant_dict=variant,
                    annotation="|".join(category_scores)
                )

            print_variant(
                variant_dict=variant,
                header_line=header_line,
                outfile=outfile,
                silent=silent
            )

            nr_of_variants += 1

            if nr_of_variants % 20000 == 0:
                logger.info("{0} variants scored.".format(nr_of_variants))
                logger.info("Last 20000 took {0} to score.".format(datetime.now() - last_twenty))
                last_twenty = datetime.now()

    logger.info("Variants scored. Number of variants: {0}".format(nr_of_variants))
    logger.info("Time to score variants: {0}".format(datetime.now() - start_scoring))
def sort(variant_file, outfile, family_id, silent, position, temp_dir):
    """
    Sort a VCF file based on rank score.
    """
    head = HeaderParser()
    variant_file = get_file_handle(variant_file)

    logger.info("Running GENMOD sort version {0}".format(__version__))
    start = datetime.now()

    # Create a temporary variant file for sorting
    logger.debug("Creating temporary file for sorting")
    if temp_dir:
        temp_file = NamedTemporaryFile(delete=False, dir=temp_dir)
    else:
        temp_file = NamedTemporaryFile(delete=False)
    temp_file.close()

    # Open the temp file with codecs
    temp_file_handle = open(temp_file.name, mode='w', encoding='utf-8', errors='replace')
    logger.debug("Temp file created")

    logger.info("Printing variants to temp file")
    nr_variants = 0
    # Print the variants with rank score in first column
    for line in variant_file:
        line = line.rstrip()
        if line.startswith('#'):
            if line.startswith('##'):
                head.parse_meta_data(line)
            else:
                head.parse_header_line(line)
        else:
            nr_variants += 1
            priority = '0'
            if position:
                chrom = line.split()[0]
                priority = get_chromosome_priority(chrom)
            else:
                priority = get_rank_score(line)

            print_variant(variant_line=line, priority=priority, outfile=temp_file_handle)

    temp_file_handle.close()
    logger.info("Variants printed to temp file")
    logger.info("Nr of variants in VCF file: {0}".format(nr_variants))

    sort_mode = 'rank'

    if nr_variants == 0:
        logger.debug("Printing headers")
        print_headers(head=head, outfile=outfile, silent=silent)
        sys.exit(0)

    if position:
        sort_mode = 'chromosome'

    logger.info("Sorting variants")
    sort_variants(infile=temp_file.name, mode=sort_mode)
    logger.info("Variants sorted")

    logger.debug("Printing headers")
    print_headers(head=head, outfile=outfile, silent=silent)
    logger.debug("Headers printed")

    logger.info("Printing variants")
    with open(temp_file.name, mode='r', encoding='utf-8', errors='replace') as f:
        for variant_line in f:
            print_variant(variant_line=variant_line, outfile=outfile, mode='modified', silent=False)
    logger.debug("Variants printed")

    logger.info("Removing temp file")
    os.remove(temp_file.name)
    logger.debug("Temp file removed")

    logger.info("Sorting done, time for sorting: {0}".format(datetime.now() - start))
def score(context, variant_file, family_id, family_file, family_type,
          score_config, silent, skip_plugin_check, rank_results, outfile):
    """
    Score variants in a vcf file using a Weighted Sum Model.

    The specific scores should be defined in a config file, see examples on
    github.
    """
    logger.info('Running GENMOD score, version: {0}'.format(__version__))

    logger.info("Checking family id")

    variant_file = get_file_handle(variant_file)

    if family_file:
        logger.info("Setting up a family parser")
        family_parser = FamilyParser(family_file, family_type)
        logger.debug("Family parser done")
        family_id = list(family_parser.families.keys())[0]

    logger.info("Family used in analysis: {0}".format(family_id))

    ## Check the score config:
    if not score_config:
        logger.warning("Please provide a score config file.")
        context.abort()

    logger.debug("Parsing config file")
    try:
        config_parser = ConfigParser(score_config)
    except ValidateError as e:
        logger.error(e.message)
        context.abort()

    score_categories = list(config_parser.categories.keys())

    logger.debug("Config parsed successfully")

    logger.info("Initializing a Header Parser")
    head = HeaderParser()

    for line in variant_file:
        line = line.rstrip()
        if line.startswith('#'):
            if line.startswith('##'):
                head.parse_meta_data(line)
            else:
                head.parse_header_line(line)
        else:
            break

    logger.info("Check if all score plugins exist in vcf ...")
    if not check_plugins(config_parser, head):
        if not skip_plugin_check:
            logger.error("All score plugins have to be defined in vcf header")
            context.abort()
    else:
        logger.info("All plugins are defined in vcf")

    csq_format = head.vep_columns

    # Add the first variant to the iterator
    if not line.startswith('#'):
        variant_file = itertools.chain([line], variant_file)
    else:
        print_headers(head=head, outfile=outfile, silent=silent)
        sys.exit(0)

    header_line = head.header

    if "RankScore" in head.info_dict:
        logger.warning("Variants already scored according to VCF header")
        logger.info("Please check VCF file")
        context.abort()

    add_metadata(
        head,
        'info',
        'RankScore',
        annotation_number='.',
        entry_type='String',
        description="The rank score for this variant in this family. family_id:rank_score."
    )

    if rank_results:
        add_metadata(
            head,
            'info',
            'RankResult',
            annotation_number='.',
            entry_type='String',
            description='|'.join(score_categories)
        )

    print_headers(head=head, outfile=outfile, silent=silent)

    start_scoring = datetime.now()
    last_twenty = datetime.now()
    nr_of_variants = 1

    for line in variant_file:
        if not line.startswith('#'):
            variant = get_variant_dict(line, header_line)
            variant['info_dict'] = get_info_dict(variant['INFO'])
            rank_score = 0
            # This is for printing results to vcf:
            category_scores = []
            for category in score_categories:
                category_score = get_category_score(
                    variant=variant,
                    category=category,
                    config_parser=config_parser,
                    csq_format=csq_format
                )
                logger.debug("Adding category score {0} to rank_score".format(category_score))
                rank_score += category_score
                logger.debug("Updating rank score to {0}".format(rank_score))

                category_scores.append(str(category_score))

            variant = add_vcf_info(
                keyword='RankScore',
                variant_dict=variant,
                annotation="{0}:{1}".format(family_id, rank_score)
            )

            if rank_results:
                variant = add_vcf_info(
                    keyword='RankResult',
                    variant_dict=variant,
                    annotation="|".join(category_scores)
                )

            print_variant(
                variant_dict=variant,
                header_line=header_line,
                outfile=outfile,
                silent=silent
            )

            nr_of_variants += 1

            if nr_of_variants % 20000 == 0:
                logger.info("{0} variants scored.".format(nr_of_variants))
                logger.info("Last 20000 took {0} to score.".format(datetime.now() - last_twenty))
                last_twenty = datetime.now()

    logger.info("Variants scored. Number of variants: {0}".format(nr_of_variants))
    logger.info("Time to score variants: {0}".format(datetime.now() - start_scoring))
def models(context, variant_file, family_file, family_type, reduced_penetrance,
           vep, keyword, phased, strict, silent, processes, outfile, temp_dir,
           whole_gene):
    """
    Annotate genetic models for vcf variants.

    Checks what patterns of inheritance are followed in a VCF file.
    The analysis is family based, so each family that is specified in the
    family file and exists in the variant file will get its own annotation.
    """
    ######### This is for logging the command line string #########
    frame = inspect.currentframe()
    args, _, _, values = inspect.getargvalues(frame)
    argument_list = [
        i + '=' + str(values[i]) for i in values
        if values[i] and i not in ['frame']
    ]

    variant_file = get_file_handle(variant_file)
    ###########################################################################

    logger.info("Running GENMOD annotate models version {0}".format(__version__))
    logger.debug("Arguments: {0}".format(', '.join(argument_list)))

    reduced_penetrance_genes = set()
    nr_reduced_penetrance_genes = 0
    if reduced_penetrance:
        logger.info("Found file with genes that have reduced penetrance")
        for line in reduced_penetrance:
            if not line.startswith('#'):
                nr_reduced_penetrance_genes += 1
                gene_id = line.rstrip().split()[0]
                logger.debug("Adding gene {0} to reduced penetrance genes".format(gene_id))
                reduced_penetrance_genes.add(gene_id)

        logger.info("Found {0} genes with reduced penetrance".format(
            nr_reduced_penetrance_genes))

    if not family_file:
        logger.warning("Please provide a family file with -f/--family_file")
        context.abort()

    logger.info("Setting up a family parser")
    family_parser = FamilyParser(family_file, family_type)
    logger.debug("Family parser done")

    families = {}
    logger.info("Check if the families have any affected")
    for family_id in family_parser.families:
        found_affected = False
        family_obj = family_parser.families[family_id]
        for ind_id in family_obj.individuals:
            ind_obj = family_obj.individuals[ind_id]
            if ind_obj.affected:
                found_affected = True
        if found_affected:
            families[family_id] = family_obj
        else:
            logger.warning("No affected individuals found for family {0}."
                           " Skipping family.".format(family_id))

    if not families:
        logger.warning("Please provide at least one family with affected individuals")
        context.abort()

    # The individuals in the ped file must be present in the variant file:
    logger.info("Families used in analysis: {0}".format(
        ','.join(list(families.keys()))))
    logger.info("Individuals included in analysis: {0}".format(
        ','.join(list(family_parser.individuals.keys()))))

    head = HeaderParser()

    for line in variant_file:
        line = line.rstrip()
        if line.startswith('#'):
            if line.startswith('##'):
                head.parse_meta_data(line)
            else:
                head.parse_header_line(line)
        else:
            break

    # Add the first variant to the iterator
    if not line.startswith('#'):
        variant_file = itertools.chain([line], variant_file)
    else:
        print_headers(head=head, outfile=outfile, silent=silent)
        sys.exit(0)

    if vep:
        if not "CSQ" in head.info_dict:
            logger.warning("vep flag is used but there is no CSQ field specified in header")
            logger.info("Please check VCF file")
            context.abort()
        else:
            logger.info("Using VEP annotation")
    else:
        if not keyword in head.info_dict:
            logger.warning("Annotation key {0} could not be found in VCF header".format(keyword))
            logger.info("Please check VCF file")
            context.abort()
        else:
            logger.info("Using {0} annotation".format(keyword))

    if "GeneticModels" in head.info_dict:
        logger.warning("Genetic models are already annotated according to vcf"
                       " header.")
        context.abort()

    logger.info("Adding genmod version to vcf header")
    head.add_version_tracking(
        info_id='genmod',
        version=__version__,
        date=datetime.now().strftime("%Y-%m-%d %H:%M"),
        command_line=' '.join(argument_list)
    )
    logger.debug("Version added")

    logger.info("Adding genetic models to vcf header")
    add_metadata(
        head,
        'info',
        'GeneticModels',
        annotation_number='.',
        entry_type='String',
        description="':'-separated list of genetic models for this variant."
    )
    logger.debug("Genetic models added")

    logger.info("Adding model score to vcf header")
    add_metadata(
        head,
        'info',
        'ModelScore',
        annotation_number='.',
        entry_type='String',
        description="PHRED score for genotype models."
    )
    logger.debug("Model score added")

    logger.info("Adding Compounds to vcf header")
    add_metadata(
        head,
        'info',
        'Compounds',
        annotation_number='.',
        entry_type='String',
        description=("List of compound pairs for this variant. "
                     "The list is split on ',', family id is separated from compounds"
                     " with ':'. Compounds are separated with '|'.")
    )
    logger.debug("Compounds added")

    vcf_individuals = head.individuals
    logger.debug("Individuals found in vcf file: {}".format(', '.join(vcf_individuals)))

    try:
        check_individuals(family_parser.individuals, vcf_individuals)
    except IOError as e:
        logger.error(e)
        logger.info("Individuals in PED file: {0}".format(
            ', '.join(family_parser.individuals)))
        logger.info("Individuals in VCF file: {0}".format(', '.join(vcf_individuals)))
        context.abort()

    start_time_analysis = datetime.now()

    analysis_individuals = list(family_parser.individuals.keys())
    logger.info("Individuals used in analysis: {0}".format(
        ', '.join(analysis_individuals)))

    ###################################################################
    ### The task queue is where all jobs (in this case batches that ###
    ### represent variants in a region) are put. The consumers will ###
    ### then pick their jobs from this queue.                       ###
    ###################################################################
    logger.debug("Setting up a JoinableQueue for storing variant batches")
    # One batch consists of all variants from one or several overlapping genes,
    # so there can be a significant amount of variants in a batch for whole
    # genome data...
    variant_queue = JoinableQueue(maxsize=100)
    logger.debug("Setting up a Queue for storing results from workers")
    results = Manager().Queue()

    num_model_checkers = processes
    # Adapt the number of processes to the machine that runs the analysis
    logger.info('Number of CPU:s {}'.format(cpu_count()))
    logger.info('Number of model checkers: {}'.format(num_model_checkers))

    # These are the workers that do the heavy part of the analysis
    logger.info('Setting up the workers')
    try:
        model_checkers = [
            VariantAnnotator(
                task_queue=variant_queue,
                results_queue=results,
                families=families,
                individuals=analysis_individuals,
                phased=phased,
                strict=strict,
                vep=vep,
                reduced_penetrance_genes=reduced_penetrance_genes
            )
            for i in range(num_model_checkers)
        ]

        logger.info('Starting the workers')
        for worker in model_checkers:
            logger.debug('Starting worker {0}'.format(worker))
            worker.start()

        # This process prints the variants to temporary files
        logger.info('Setting up the variant printer')
        if len(model_checkers) == 1:
            print_headers(head=head, outfile=outfile, silent=silent)
            variant_printer = VariantPrinter(
                task_queue=results,
                head=head,
                mode='normal',
                outfile=outfile
            )
        else:
            # We use a temp file to store the processed variants
            logger.debug("Build a tempfile for printing the variants")
            if temp_dir:
                temp_file = NamedTemporaryFile(delete=False, dir=temp_dir)
            else:
                temp_file = NamedTemporaryFile(delete=False)
            temp_file.close()

            variant_printer = VariantPrinter(
                task_queue=results,
                head=head,
                mode='chromosome',
                outfile=temp_file.name
            )

        logger.info('Starting the variant printer process')
        variant_printer.start()

        start_time_variant_parsing = datetime.now()

        # This process parses the original vcf and creates batches to put in the variant queue:
        logger.info('Start parsing the variants')
        chromosome_list = get_batches(
            variants=variant_file,
            batch_queue=variant_queue,
            header=head,
            vep=vep,
            annotation_keyword=keyword
        )

        logger.debug("Put stop signs in the variant queue")
        for i in range(num_model_checkers):
            variant_queue.put(None)

        variant_queue.join()
        results.put(None)
        variant_printer.join()

        if len(model_checkers) > 1:
            sort_variants(infile=temp_file.name, mode='chromosome')

            print_headers(head=head, outfile=outfile, silent=silent)

            with open(temp_file.name, 'r', encoding='utf-8') as f:
                for line in f:
                    print_variant(
                        variant_line=line,
                        outfile=outfile,
                        mode='modified',
                        silent=silent
                    )

    except Exception as err:
        logger.warning(err)
        for worker in model_checkers:
            worker.terminate()
        variant_printer.terminate()
        context.abort()

    finally:
        if len(model_checkers) > 1:
            logger.info("Removing temp file")
            os.remove(temp_file.name)
            logger.debug("Temp file removed")

    logger.info('Time for whole analysis: {0}'.format(
        str(datetime.now() - start_time_analysis)))
def filter(variant_file, annotation, threshold, discard, greater, silent, outfile):
    """
    Filter vcf variants.

    Filter variants based on their annotation.
    """
    logger.info("Running genmod filter version {0}".format(__version__))

    variant_file = get_file_handle(variant_file)
    start_time_analysis = datetime.now()

    logger.info("Initializing a Header Parser")
    head = HeaderParser()

    for line in variant_file:
        line = line.rstrip()
        if line.startswith('#'):
            if line.startswith('##'):
                head.parse_meta_data(line)
            else:
                head.parse_header_line(line)
        else:
            break

    # Add the first variant to the iterator
    variant_file = itertools.chain([line], variant_file)
    header_line = head.header

    if not annotation in head.info_dict:
        logger.warning("Annotation {0} not specified in header".format(annotation))
        logger.info("Please check VCF file")
        logger.info("Exiting...")
        sys.exit(1)

    logger.info("Building a plugin from extract_vcf for {0}".format(annotation))

    annotation_plugin = Plugin(
        name=annotation,
        field='INFO',
        info_key=annotation,
        separators=[','],
        record_rule='min',
        data_type='float'
    )

    logger.debug("Plugin=(field={0},info_key={1},separators={2},record_rule={3}"
                 ",data_type={4})".format('INFO', annotation, "','", 'min', 'float'))

    print_headers(head=head, outfile=outfile, silent=silent)

    nr_of_variants = 0
    nr_of_passed_variants = 0

    for variant in variant_file:
        nr_of_variants += 1
        keep_variant = False

        value = annotation_plugin.get_value(variant_line=variant)
        logger.debug("Found value {0}".format(value))

        if value:
            if greater:
                if value > threshold:
                    keep_variant = True
            else:
                if value < threshold:
                    keep_variant = True
        else:
            if not discard:
                keep_variant = True

        if keep_variant:
            logger.debug("Keeping variant")
            nr_of_passed_variants += 1
            print_variant(
                variant_line=variant,
                outfile=outfile,
                mode='vcf',
                silent=silent
            )
        else:
            logger.debug("Discarding variant")

    logger.info("Number of variants in file {0}".format(nr_of_variants))
    logger.info("Number of variants passing filter {0}".format(nr_of_passed_variants))
    logger.info("Number of variants filtered {0}".format(
        nr_of_variants - nr_of_passed_variants))
def annotate(variant_file, annotate_regions, cadd_file, thousand_g, exac, spidex,
             annotation_dir, outfile, silent, cadd_raw, cosmic, max_af, processes,
             temp_dir):
    """
    Annotate vcf variants.

    Annotate variants with a number of different sources.
    Please use --help for more info.
    """
    logger.info("Running genmod annotate_variant version {0}".format(__version__))

    start_time_analysis = datetime.now()
    annotator_arguments = {}

    logger.info("Initializing a Header Parser")
    head = HeaderParser()

    line = None
    for line in variant_file:
        line = line.rstrip()
        if line.startswith('#'):
            if line.startswith('##'):
                head.parse_meta_data(line)
            else:
                head.parse_header_line(line)
        else:
            break

    # Add the first variant to the iterator
    if line:
        variant_file = itertools.chain([line], variant_file)

    header_line = head.header
    annotator_arguments['header_line'] = header_line

    if annotate_regions:
        logger.info("Loading annotations")
        gene_trees, exon_trees = load_annotations(annotation_dir)
        annotator_arguments['gene_trees'] = gene_trees
        annotator_arguments['exon_trees'] = exon_trees

        add_metadata(
            head,
            'info',
            'Annotation',
            annotation_number='.',
            entry_type='String',
            description='Annotates what feature(s) this variant belongs to.'
        )
        add_metadata(
            head,
            'info',
            'Exonic',
            annotation_number='0',
            entry_type='Flag',
            description='Indicates if the variant is exonic.'
        )

    if exac:
        logger.info("Annotating ExAC frequencies")
        logger.debug("Using ExAC file: {0}".format(exac))
        annotator_arguments['exac'] = exac
        add_metadata(
            head,
            'info',
            'ExACAF',
            annotation_number='1',
            entry_type='Float',
            description="Frequency in the ExAC database."
        )

    if thousand_g:
        logger.info("Annotating 1000G frequencies")
        logger.debug("Using 1000G file: {0}".format(thousand_g))
        annotator_arguments['thousand_g'] = thousand_g
        add_metadata(
            head,
            'info',
            '1000GAF',
            annotation_number='1',
            entry_type='Float',
            description="Frequency in the 1000G database."
        )

    if spidex:
        logger.info("Annotating Spidex z scores")
        logger.debug("Using Spidex file: {0}".format(spidex))
        annotator_arguments['spidex'] = spidex
        add_metadata(
            head,
            'info',
            'SPIDEX',
            annotation_number='1',
            entry_type='Float',
            description="Z score from the spidex database."
        )

    if cadd_file:
        logger.info("Annotating CADD scores")
        logger.debug("Using CADD file(s): {0}".format(', '.join(cadd_file)))
        annotator_arguments['cadd_files'] = cadd_file
        any_cadd_file = True
        add_metadata(
            head,
            'info',
            'CADD',
            annotation_number='1',
            entry_type='Integer',
            description="The CADD relative score for this alternative."
        )

    if cadd_raw:
        annotator_arguments['cadd_raw'] = cadd_raw
        logger.debug("Adding vcf metadata for CADD raw score")
        add_metadata(
            head,
            'info',
            'CADD_raw',
            annotation_number='1',
            entry_type='Float',
            description="The CADD raw score(s) for this alternative(s)."
        )

    if max_af:
        annotator_arguments['max_af'] = max_af
        if thousand_g:
            add_metadata(
                head,
                'info',
                '1000G_MAX_AF',
                annotation_number='1',
                entry_type='Float',
                description="The max af for thousand genomes populations."
            )
        if exac:
            add_metadata(
                head,
                'info',
                'ExAC_MAX_AF',
                annotation_number='1',
                entry_type='Float',
                description="The max af for ExAC populations."
            )

    if cosmic:
        logger.info("Annotating if variant is in COSMIC")
        logger.debug("Using COSMIC file: {0}".format(cosmic))
        annotator_arguments['cosmic'] = cosmic
        add_metadata(
            head,
            'info',
            'COSMIC',
            annotation_number='0',
            entry_type='Flag',
            description="If variant is in COSMIC database."
        )

    ###################################################################
    ### The task queue is where all jobs (in this case batches that ###
    ### represent variants in a region) are put. The consumers will ###
    ### then pick their jobs from this queue.                       ###
    ###################################################################
    logger.debug("Setting up a JoinableQueue for storing variant batches")
    variant_queue = JoinableQueue(maxsize=1000)
    logger.debug("Setting up a Queue for storing results from workers")
    results = Manager().Queue()

    num_annotators = processes
    # Adapt the number of processes to the machine that runs the analysis
    if cadd_file or spidex:
        # We need more power when annotating cadd scores,
        # but if the flag is used that overrides
        if num_annotators == min(4, cpu_count()):
            num_annotators = min(8, cpu_count())

    logger.info('Number of CPU:s {}'.format(cpu_count()))
    logger.info('Number of model checkers: {}'.format(num_annotators))

    # These are the workers that do the heavy part of the analysis
    logger.info('Setting up the workers')
    annotators = [
        VariantAnnotator(
            variant_queue,
            results,
            **annotator_arguments
        )
        for i in range(num_annotators)
    ]

    logger.info('Starting the workers')
    for worker in annotators:
        logger.debug('Starting worker {0}'.format(worker))
        worker.start()

    # This process prints the variants to temporary files.
    # If there is only one annotation process we can print the results as soon
    # as they are done.
    logger.info('Setting up the variant printer')
    if len(annotators) == 1:
        print_headers(head, outfile, silent)
        var_printer = VariantPrinter(
            task_queue=results,
            head=head,
            mode='normal',
            outfile=outfile
        )
    else:
        # We use a temp file to store the processed variants
        logger.debug("Build a tempfile for printing the variants")
        if temp_dir:
            temp_file = NamedTemporaryFile(delete=False, dir=temp_dir)
        else:
            temp_file = NamedTemporaryFile(delete=False)
        temp_file.close()

        var_printer = VariantPrinter(
            task_queue=results,
            head=head,
            mode='chromosome',
            outfile=temp_file.name
        )

    logger.info('Starting the variant printer process')
    var_printer.start()

    start_time_variant_parsing = datetime.now()
    start_time_twenty = datetime.now()
    nr_of_lines = 0

    # This process parses the original vcf and creates batches to put in the variant queue:
    logger.info('Start parsing the variants')
    for line in variant_file:
        line = line.rstrip()
        if not line.startswith('#'):
            variant_queue.put(line)
            nr_of_lines += 1
            if nr_of_lines % 20000 == 0:
                logger.info('{0} variants parsed'.format(nr_of_lines))
                logger.info('Last 20000 took {0} to parse'.format(
                    datetime.now() - start_time_twenty))
                start_time_twenty = datetime.now()

    logger.info('Put stop signs in the variant queue')
    for i in range(num_annotators):
        variant_queue.put(None)

    variant_queue.join()
    results.put(None)
    var_printer.join()

    if len(annotators) > 1:
        logger.info("Start sorting the variants")
        sort_variants(temp_file.name, mode='chromosome')

        logger.info("Print the headers")
        print_headers(head, outfile, silent)

        with open(temp_file.name, 'r', encoding='utf-8') as f:
            for line in f:
                print_variant(
                    variant_line=line,
                    outfile=outfile,
                    mode='modified',
                    silent=silent
                )

        logger.info("Removing temp file")
        os.remove(temp_file.name)
        logger.debug("Temp file removed")

    logger.info('Time for whole analysis: {0}'.format(
        str(datetime.now() - start_time_analysis)))
def filter(variant_file, annotation, threshold, discard, greater, silent, outfile):
    """
    Filter vcf variants.

    Filter variants based on their annotation.
    """
    logger.info("Running genmod filter version {0}".format(__version__))

    start_time_analysis = datetime.now()

    logger.info("Initializing a Header Parser")
    head = HeaderParser()

    for line in variant_file:
        line = line.rstrip()
        if line.startswith('#'):
            if line.startswith('##'):
                head.parse_meta_data(line)
            else:
                head.parse_header_line(line)
        else:
            break

    # Add the first variant to the iterator
    variant_file = itertools.chain([line], variant_file)
    header_line = head.header

    if not annotation in head.info_dict:
        logger.warning("Annotation {0} not specified in header".format(annotation))
        logger.info("Please check VCF file")
        logger.info("Exiting...")
        sys.exit(1)

    logger.info("Building a plugin from extract_vcf for {0}".format(annotation))

    annotation_plugin = Plugin(
        name=annotation,
        field='INFO',
        info_key=annotation,
        separators=[','],
        record_rule='min',
        data_type='float'
    )

    logger.debug("Plugin=(field={0},info_key={1},separators={2},record_rule={3}"
                 ",data_type={4})".format('INFO', annotation, "','", 'min', 'float'))

    print_headers(head=head, outfile=outfile, silent=silent)

    nr_of_variants = 0
    nr_of_passed_variants = 0

    for variant in variant_file:
        nr_of_variants += 1
        keep_variant = False

        value = annotation_plugin.get_value(variant_line=variant)
        logger.debug("Found value {0}".format(value))

        if value:
            if greater:
                if value > threshold:
                    keep_variant = True
            else:
                if value < threshold:
                    keep_variant = True
        else:
            if not discard:
                keep_variant = True

        if keep_variant:
            logger.debug("Keeping variant")
            nr_of_passed_variants += 1
            print_variant(
                variant_line=variant,
                outfile=outfile,
                mode='vcf',
                silent=silent
            )
        else:
            logger.debug("Discarding variant")

    logger.info("Number of variants in file {0}".format(nr_of_variants))
    logger.info("Number of variants passing filter {0}".format(nr_of_passed_variants))
    logger.info("Number of variants filtered {0}".format(
        nr_of_variants - nr_of_passed_variants))