def add_cadd(header):
    """Add cadd annotation to vcf header"""
    logger.info("Adding 'CADD' to vcf header")
    add_metadata(
        header,
        'info',
        'CADD',
        annotation_number='1',
        entry_type='Integer',
        description="The CADD relative score for this alternative."
    )
    return


def add_cadd_raw(header):
    """Add cadd raw annotation to vcf header"""
    logger.info("Adding 'CADD_raw' to vcf header")
    add_metadata(
        header,
        'info',
        'CADD_raw',
        annotation_number='1',
        entry_type='Float',
        description="The CADD raw score(s) for this alternative(s)."
    )
    return


def add_spidex(header):
    """Add spidex annotation to vcf header"""
    logger.info("Adding 'SPIDEX' to vcf header")
    add_metadata(
        header,
        'info',
        'SPIDEX',
        annotation_number='1',
        entry_type='Float',
        description="Z score from the spidex database."
    )
    return


def add_thousandg(header):
    """Add thousand genomes annotation to vcf header"""
    logger.info("Adding '1000GAF' to vcf header")
    add_metadata(
        header,
        'info',
        '1000GAF',
        annotation_number='1',
        entry_type='Float',
        description="Frequency in the 1000G database."
    )
    return


def add_thousandg_max(header):
    """Add thousand genomes max annotation to vcf header"""
    logger.info("Adding '1000G_MAX_AF' to vcf header")
    add_metadata(
        header,
        'info',
        '1000G_MAX_AF',
        annotation_number='1',
        entry_type='Float',
        description="The max af for thousand genomes populations."
    )
    return


def add_exac(header):
    """Add exac annotation to vcf header"""
    logger.info("Adding 'EXACAF' to vcf header")
    add_metadata(
        header,
        'info',
        'EXACAF',
        annotation_number='1',
        entry_type='Float',
        description="Frequency in the ExAC database."
    )
    return


def add_exac_max(header):
    """Add exac max annotation to vcf header"""
    logger.info("Adding 'EXAC_MAX_AF' to vcf header")
    add_metadata(
        header,
        'info',
        'EXAC_MAX_AF',
        annotation_number='1',
        entry_type='Float',
        description="The max af for ExAC populations."
    )
    return


def add_cosmic(header):
    """Add cosmic annotation to vcf header"""
    logger.info("Adding 'COSMIC' to vcf header")
    add_metadata(
        header,
        'info',
        'COSMIC',
        annotation_number='0',
        entry_type='Flag',
        description="If variant is in COSMIC database."
    )
    return


def add_regions(header):
    """Add region annotations to header"""
    logger.info("Adding 'Annotation' to vcf header")
    add_metadata(
        header,
        'info',
        'Annotation',
        annotation_number='.',
        entry_type='String',
        description='Annotates what feature(s) this variant belongs to.'
    )
    return
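
# --- Usage sketch (illustrative only, not part of genmod) --------------------
# The helpers above all follow the same pattern: log the INFO key, then delegate
# to add_metadata(). A minimal, hedged sketch of how they would typically be
# combined, assuming 'head' is a HeaderParser that has already parsed the VCF
# header. The helper name below is hypothetical.
def _example_add_frequency_info(head):
    """Sketch: register the frequency-related INFO fields in one call."""
    add_thousandg(head)
    add_thousandg_max(head)
    add_exac(head)
    add_exac_max(head)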

def models(variant_file, family_file, family_type, reduced_penetrance, vep,
           keyword, phased, strict, silent, processes, whole_gene, outfile,
           temp_dir):
    """
    Annotate genetic models for vcf variants.

    Checks which patterns of inheritance are followed in a VCF file.
    The analysis is family based, so each family that is specified in the
    family file and exists in the variant file will get its own annotation.
    """
    logger = logging.getLogger(__name__)

    ######### This is for logging the command line string #########
    frame = inspect.currentframe()
    args, _, _, values = inspect.getargvalues(frame)
    argument_list = [
        i + '=' + str(values[i]) for i in values
        if values[i] and i not in ['frame']
    ]
    ###########################################################################

    logger.info("Running GENMOD annotate version {0}".format(__version__))
    logger.debug("Arguments: {0}".format(', '.join(argument_list)))

    reduced_penetrance_genes = set()
    nr_reduced_penetrance_genes = 0
    if reduced_penetrance:
        logger.info("Found file with genes that have reduced penetrance")
        for line in reduced_penetrance:
            if not line.startswith('#'):
                nr_reduced_penetrance_genes += 1
                gene_id = line.rstrip().split()[0]
                logger.debug("Adding gene {0} to reduced penetrance genes".format(gene_id))
                reduced_penetrance_genes.add(gene_id)
        logger.info("Found {0} genes with reduced penetrance".format(
            nr_reduced_penetrance_genes))

    if not family_file:
        logger.warning("Please provide a family file with -f/--family_file")
        logger.info("Exiting")
        sys.exit(1)

    logger.info("Setting up a family parser")
    family_parser = FamilyParser(family_file, family_type)
    logger.debug("Family parser done")

    families = {}
    logger.info("Check if the families have any affected individuals")
    for family_id in family_parser.families:
        found_affected = False
        family_obj = family_parser.families[family_id]
        for ind_id in family_obj.individuals:
            ind_obj = family_obj.individuals[ind_id]
            if ind_obj.affected:
                found_affected = True
        if found_affected:
            families[family_id] = family_obj
        else:
            logger.warning("No affected individuals found for family {0}."
                           " Skipping family.".format(family_id))

    if not families:
        logger.warning("Please provide at least one family with affected individuals")
        sys.exit(0)

    # The individuals in the ped file must be present in the variant file:
    logger.info("Families used in analysis: {0}".format(
        ','.join(list(families.keys()))))
    logger.info("Individuals included in analysis: {0}".format(
        ','.join(list(family_parser.individuals.keys()))))

    head = HeaderParser()
    for line in variant_file:
        line = line.rstrip()
        if line.startswith('#'):
            if line.startswith('##'):
                head.parse_meta_data(line)
            else:
                head.parse_header_line(line)
        else:
            break

    # Add the first variant to the iterator
    variant_file = itertools.chain([line], variant_file)

    if vep:
        if not "CSQ" in head.info_dict:
            logger.warning("vep flag is used but there is no CSQ field specified in header")
            logger.info("Please check VCF file")
            logger.info("Exiting...")
            sys.exit(1)
        else:
            logger.info("Using VEP annotation")
    else:
        if not keyword in head.info_dict:
            logger.warning("Annotation key {0} could not be found in VCF header".format(keyword))
            logger.info("Please check VCF file")
            logger.info("Exiting...")
            sys.exit(1)
        else:
            logger.info("Using {0} annotation".format(keyword))

    if "GeneticModels" in head.info_dict:
        logger.warning("Genetic models are already annotated according to vcf header.")
        logger.info("Exiting...")
        sys.exit(1)

    logger.info("Adding genmod version to vcf header")
    head.add_version_tracking(
        info_id='genmod',
        version=__version__,
        date=datetime.now().strftime("%Y-%m-%d %H:%M"),
        command_line=' '.join(argument_list)
    )
    logger.debug("Version added")

    logger.info("Adding genetic models to vcf header")
    add_metadata(
        head,
        'info',
        'GeneticModels',
        annotation_number='.',
        entry_type='String',
        description="':'-separated list of genetic models for this variant."
    )
    logger.debug("Genetic models added")

    logger.info("Adding model score to vcf header")
    add_metadata(
        head,
        'info',
        'ModelScore',
        annotation_number='.',
        entry_type='String',
        description="PHRED score for genotype models."
    )
    logger.debug("Model score added")

    logger.info("Adding Compounds to vcf header")
    add_metadata(
        head,
        'info',
        'Compounds',
        annotation_number='.',
        entry_type='String',
        description=("List of compound pairs for this variant. The list is"
                     " split on ',', family id is separated from compounds"
                     " with ':'. Compounds are separated with '|'.")
    )
    logger.debug("Compounds added")

    vcf_individuals = head.individuals
    logger.debug("Individuals found in vcf file: {}".format(
        ', '.join(vcf_individuals)))

    start_time_analysis = datetime.now()

    try:
        check_individuals(family_parser.individuals, vcf_individuals)
    except IOError as e:
        logger.error(e)
        logger.info("Individuals in PED file: {0}".format(
            ', '.join(family_parser.individuals)))
        logger.info("Individuals in VCF file: {0}".format(
            ', '.join(vcf_individuals)))
        logger.info("Exiting...")
        sys.exit(1)

    analysis_individuals = list(family_parser.individuals.keys())
    logger.info("Individuals used in analysis: {0}".format(
        ', '.join(analysis_individuals)))

    ####################################################################
    ### The task queue is where all jobs (in this case batches that  ###
    ### represent variants in a region) are put. The consumers will  ###
    ### then pick their jobs from this queue.                        ###
    ####################################################################

    logger.debug("Setting up a JoinableQueue for storing variant batches")
    variant_queue = JoinableQueue(maxsize=1000)
    logger.debug("Setting up a Queue for storing results from workers")
    results = Manager().Queue()

    num_model_checkers = processes
    # Adapt the number of processes to the machine that runs the analysis
    logger.info('Number of CPUs: {}'.format(cpu_count()))
    logger.info('Number of model checkers: {}'.format(num_model_checkers))

    # These are the workers that do the heavy part of the analysis
    logger.info('Setting up the workers')
    model_checkers = [
        VariantAnnotator(
            task_queue=variant_queue,
            results_queue=results,
            families=families,
            individuals=analysis_individuals,
            phased=phased,
            strict=strict,
            whole_gene=whole_gene,
            vep=vep,
            reduced_penetrance_genes=reduced_penetrance_genes
        )
        for i in range(num_model_checkers)
    ]

    logger.info('Starting the workers')
    for worker in model_checkers:
        logger.debug('Starting worker {0}'.format(worker))
        worker.start()

    # This process prints the variants to temporary files
    logger.info('Setting up the variant printer')
    if len(model_checkers) == 1:
        print_headers(head=head, outfile=outfile, silent=silent)
        variant_printer = VariantPrinter(
            task_queue=results,
            head=head,
            mode='normal',
            outfile=outfile
        )
    else:
        # We use a temp file to store the processed variants
        logger.debug("Build a tempfile for printing the variants")
        if temp_dir:
            temp_file = NamedTemporaryFile(delete=False, dir=temp_dir)
        else:
            temp_file = NamedTemporaryFile(delete=False)
        temp_file.close()
        variant_printer = VariantPrinter(
            task_queue=results,
            head=head,
            mode='chromosome',
            outfile=temp_file.name
        )

    logger.info('Starting the variant printer process')
    variant_printer.start()

    start_time_variant_parsing = datetime.now()

    # This process parses the original vcf and creates batches to put in the variant queue:
    logger.info('Start parsing the variants')
    chromosome_list = get_batches(
        variants=variant_file,
        batch_queue=variant_queue,
        header=head,
        vep=vep,
        annotation_keyword=keyword
    )

    logger.debug("Put stop signs in the variant queue")
    for i in range(num_model_checkers):
        variant_queue.put(None)

    variant_queue.join()
    results.put(None)
    variant_printer.join()

    if len(model_checkers) > 1:
        sort_variants(infile=temp_file.name, mode='chromosome')
        print_headers(head=head, outfile=outfile, silent=silent)
        with open(temp_file.name, 'r', encoding='utf-8') as f:
            for line in f:
                print_variant(
                    variant_line=line,
                    outfile=outfile,
                    mode='modified',
                    silent=silent
                )
        logger.debug("Removing temp file")
        os.remove(temp_file.name)
        logger.debug("Temp file removed")

    logger.info('Time for whole analysis: {0}'.format(
        str(datetime.now() - start_time_analysis)))
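
# --- Shutdown pattern sketch (illustrative only) ------------------------------
# models() above shuts its worker pool down with one None "stop sign" per
# worker, joins the task queue, and only then signals the printer process.
# A minimal sketch of that pattern with hypothetical argument names:
def _example_queue_shutdown(task_queue, results_queue, workers, printer):
    """Sketch: sentinel-based shutdown of a JoinableQueue worker pool."""
    for _ in workers:
        task_queue.put(None)      # one stop sign per consumer process
    task_queue.join()             # block until every batch is marked done
    results_queue.put(None)       # tell the printer no more results will come
    printer.join()                # wait for the printer to flush and exit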

def score(variant_file, family_id, family_file, family_type, score_config,
          silent, skip_plugin_check, rank_results, outfile):
    """
    Score variants in a vcf file using a Weighted Sum Model.

    The specific scores should be defined in a config file, see examples on
    github.
    """
    logger = logging.getLogger(__name__)
    logger.info('Running GENMOD score, version: {0}'.format(__version__))

    logger.info("Checking family id")
    if family_file:
        logger.info("Setting up a family parser")
        family_parser = FamilyParser(family_file, family_type)
        logger.debug("Family parser done")
        family_id = list(family_parser.families.keys())[0]
    logger.info("Family used in analysis: {0}".format(family_id))

    ## Check the score config:
    if not score_config:
        logger.warning("Please provide a score config file.")
        logger.info("Exiting")
        sys.exit(1)

    logger.debug("Parsing config file")
    try:
        config_parser = ConfigParser(score_config)
    except ValidateError as e:
        logger.error(e.message)
        logger.info("Exiting")
        sys.exit(1)

    score_categories = list(config_parser.categories.keys())
    logger.debug("Config parsed successfully")

    logger.info("Initializing a Header Parser")
    head = HeaderParser()
    for line in variant_file:
        line = line.rstrip()
        if line.startswith('#'):
            if line.startswith('##'):
                head.parse_meta_data(line)
            else:
                head.parse_header_line(line)
        else:
            break

    logger.info("Check if all score plugins exist in vcf ...")
    if not check_plugins(config_parser, head):
        if not skip_plugin_check:
            logger.error("All score plugins have to be defined in vcf header")
            logger.info("Exiting")
            sys.exit(1)
    else:
        logger.info("All plugins are defined in vcf")

    # Add the first variant to the iterator
    variant_file = itertools.chain([line], variant_file)

    header_line = head.header

    if "RankScore" in head.info_dict:
        logger.warning("Variants already scored according to VCF header")
        logger.info("Please check VCF file")
        logger.info("Exiting...")
        sys.exit(1)

    add_metadata(
        head,
        'info',
        'RankScore',
        annotation_number='.',
        entry_type='String',
        description="The rank score for this variant in this family. family_id:rank_score."
    )

    if rank_results:
        add_metadata(
            head,
            'info',
            'RankResult',
            annotation_number='.',
            entry_type='String',
            description='|'.join(score_categories)
        )

    print_headers(
        head=head,
        outfile=outfile,
        silent=silent
    )

    start_scoring = datetime.now()
    last_twenty = datetime.now()
    nr_of_variants = 1

    for line in variant_file:
        if not line.startswith('#'):
            variant = get_variant_dict(line, header_line)
            variant['info_dict'] = get_info_dict(variant['INFO'])
            rank_score = 0
            # This is for printing results to vcf:
            category_scores = []
            for category in score_categories:
                category_score = get_category_score(variant, category, config_parser)
                logger.debug("Adding category score {0} to rank_score".format(category_score))
                rank_score += category_score
                logger.debug("Updating rank score to {0}".format(rank_score))
                category_scores.append(str(category_score))

            variant = add_vcf_info(
                keyword='RankScore',
                variant_dict=variant,
                annotation="{0}:{1}".format(family_id, rank_score)
            )

            if rank_results:
                variant = add_vcf_info(
                    keyword='RankResult',
                    variant_dict=variant,
                    annotation="|".join(category_scores)
                )

            print_variant(
                variant_dict=variant,
                header_line=header_line,
                outfile=outfile,
                silent=silent
            )

            nr_of_variants += 1
            if nr_of_variants % 20000 == 0:
                logger.info("{0} variants scored.".format(nr_of_variants))
                logger.info("Last 20000 took {0} to score.".format(datetime.now() - last_twenty))
                last_twenty = datetime.now()

    logger.info("Variants scored. Number of variants: {0}".format(nr_of_variants))
    logger.info("Time to score variants: {0}".format(datetime.now() - start_scoring))
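
# --- RankScore sketch (illustrative only) -------------------------------------
# score() above implements the Weighted Sum Model by summing each category
# score from the config into rank_score and writing it to the RankScore INFO
# field as 'family_id:rank_score'. A minimal sketch with hypothetical inputs
# (numeric per-category scores):
def _example_rank_score_annotation(family_id, category_scores):
    """Sketch: build the RankScore INFO value from per-category scores."""
    rank_score = sum(category_scores)
    return "{0}:{1}".format(family_id, rank_score)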

def score(context, variant_file, family_id, family_file, family_type,
          score_config, silent, skip_plugin_check, rank_results, outfile):
    """
    Score variants in a vcf file using a Weighted Sum Model.

    The specific scores should be defined in a config file, see examples on
    github.
    """
    logger.info('Running GENMOD score, version: {0}'.format(__version__))

    logger.info("Checking family id")

    variant_file = get_file_handle(variant_file)

    if family_file:
        logger.info("Setting up a family parser")
        family_parser = FamilyParser(family_file, family_type)
        logger.debug("Family parser done")
        family_id = list(family_parser.families.keys())[0]
    logger.info("Family used in analysis: {0}".format(family_id))

    ## Check the score config:
    if not score_config:
        logger.warning("Please provide a score config file.")
        context.abort()

    logger.debug("Parsing config file")
    try:
        config_parser = ConfigParser(score_config)
    except ValidateError as e:
        logger.error(e.message)
        context.abort()

    score_categories = list(config_parser.categories.keys())
    logger.debug("Config parsed successfully")

    logger.info("Initializing a Header Parser")
    head = HeaderParser()
    for line in variant_file:
        line = line.rstrip()
        if line.startswith('#'):
            if line.startswith('##'):
                head.parse_meta_data(line)
            else:
                head.parse_header_line(line)
        else:
            break

    logger.info("Check if all score plugins exist in vcf ...")
    if not check_plugins(config_parser, head):
        if not skip_plugin_check:
            logger.error("All score plugins have to be defined in vcf header")
            context.abort()
    else:
        logger.info("All plugins are defined in vcf")

    csq_format = head.vep_columns

    # Add the first variant to the iterator
    if not line.startswith('#'):
        variant_file = itertools.chain([line], variant_file)
    else:
        print_headers(head=head, outfile=outfile, silent=silent)
        sys.exit(0)

    header_line = head.header

    if "RankScore" in head.info_dict:
        logger.warning("Variants already scored according to VCF header")
        logger.info("Please check VCF file")
        context.abort()

    add_metadata(
        head,
        'info',
        'RankScore',
        annotation_number='.',
        entry_type='String',
        description="The rank score for this variant in this family. family_id:rank_score."
    )

    if rank_results:
        add_metadata(
            head,
            'info',
            'RankResult',
            annotation_number='.',
            entry_type='String',
            description='|'.join(score_categories)
        )

    print_headers(head=head, outfile=outfile, silent=silent)
    start_scoring = datetime.now()
    last_twenty = datetime.now()
    nr_of_variants = 1

    for line in variant_file:
        if not line.startswith('#'):
            variant = get_variant_dict(line, header_line)
            variant['info_dict'] = get_info_dict(variant['INFO'])
            rank_score = 0
            # This is for printing results to vcf:
            category_scores = []
            for category in score_categories:
                category_score = get_category_score(
                    variant=variant,
                    category=category,
                    config_parser=config_parser,
                    csq_format=csq_format
                )
                logger.debug("Adding category score {0} to rank_score".format(
                    category_score))
                rank_score += category_score
                logger.debug("Updating rank score to {0}".format(rank_score))
                category_scores.append(str(category_score))

            variant = add_vcf_info(
                keyword='RankScore',
                variant_dict=variant,
                annotation="{0}:{1}".format(family_id, rank_score)
            )

            if rank_results:
                variant = add_vcf_info(
                    keyword='RankResult',
                    variant_dict=variant,
                    annotation="|".join(category_scores)
                )

            print_variant(
                variant_dict=variant,
                header_line=header_line,
                outfile=outfile,
                silent=silent
            )

            nr_of_variants += 1
            if nr_of_variants % 20000 == 0:
                logger.info("{0} variants scored.".format(nr_of_variants))
                logger.info("Last 20000 took {0} to score.".format(
                    datetime.now() - last_twenty))
                last_twenty = datetime.now()

    logger.info("Variants scored. Number of variants: {0}".format(nr_of_variants))
    logger.info("Time to score variants: {0}".format(datetime.now() - start_scoring))

def models(context, variant_file, family_file, family_type, reduced_penetrance,
           vep, keyword, phased, strict, silent, processes, outfile, temp_dir,
           whole_gene):
    """
    Annotate genetic models for vcf variants.

    Checks which patterns of inheritance are followed in a VCF file.
    The analysis is family based, so each family that is specified in the
    family file and exists in the variant file will get its own annotation.
    """
    ######### This is for logging the command line string #########
    frame = inspect.currentframe()
    args, _, _, values = inspect.getargvalues(frame)
    argument_list = [
        i + '=' + str(values[i]) for i in values
        if values[i] and i not in ['frame']
    ]

    variant_file = get_file_handle(variant_file)
    ###########################################################################

    logger.info("Running GENMOD annotate models version {0}".format(__version__))
    logger.debug("Arguments: {0}".format(', '.join(argument_list)))

    reduced_penetrance_genes = set()
    nr_reduced_penetrance_genes = 0
    if reduced_penetrance:
        logger.info("Found file with genes that have reduced penetrance")
        for line in reduced_penetrance:
            if not line.startswith('#'):
                nr_reduced_penetrance_genes += 1
                gene_id = line.rstrip().split()[0]
                logger.debug("Adding gene {0} to reduced penetrance genes".format(
                    gene_id))
                reduced_penetrance_genes.add(gene_id)
        logger.info("Found {0} genes with reduced penetrance".format(
            nr_reduced_penetrance_genes))

    if not family_file:
        logger.warning("Please provide a family file with -f/--family_file")
        context.abort()

    logger.info("Setting up a family parser")
    family_parser = FamilyParser(family_file, family_type)
    logger.debug("Family parser done")

    families = {}
    logger.info("Check if the families have any affected individuals")
    for family_id in family_parser.families:
        found_affected = False
        family_obj = family_parser.families[family_id]
        for ind_id in family_obj.individuals:
            ind_obj = family_obj.individuals[ind_id]
            if ind_obj.affected:
                found_affected = True
        if found_affected:
            families[family_id] = family_obj
        else:
            logger.warning("No affected individuals found for family {0}."
                           " Skipping family.".format(family_id))

    if not families:
        logger.warning("Please provide at least one family with affected individuals")
        context.abort()

    # The individuals in the ped file must be present in the variant file:
    logger.info("Families used in analysis: {0}".format(
        ','.join(list(families.keys()))))
    logger.info("Individuals included in analysis: {0}".format(
        ','.join(list(family_parser.individuals.keys()))))

    head = HeaderParser()
    for line in variant_file:
        line = line.rstrip()
        if line.startswith('#'):
            if line.startswith('##'):
                head.parse_meta_data(line)
            else:
                head.parse_header_line(line)
        else:
            break

    # Add the first variant to the iterator
    if not line.startswith('#'):
        variant_file = itertools.chain([line], variant_file)
    else:
        print_headers(head=head, outfile=outfile, silent=silent)
        sys.exit(0)

    if vep:
        if not "CSQ" in head.info_dict:
            logger.warning("vep flag is used but there is no CSQ field specified in header")
            logger.info("Please check VCF file")
            context.abort()
        else:
            logger.info("Using VEP annotation")
    else:
        if not keyword in head.info_dict:
            logger.warning("Annotation key {0} could not be found in VCF header".format(
                keyword))
            logger.info("Please check VCF file")
            context.abort()
        else:
            logger.info("Using {0} annotation".format(keyword))

    if "GeneticModels" in head.info_dict:
        logger.warning("Genetic models are already annotated according to vcf header.")
        context.abort()

    logger.info("Adding genmod version to vcf header")
    head.add_version_tracking(
        info_id='genmod',
        version=__version__,
        date=datetime.now().strftime("%Y-%m-%d %H:%M"),
        command_line=' '.join(argument_list)
    )
    logger.debug("Version added")

    logger.info("Adding genetic models to vcf header")
    add_metadata(
        head,
        'info',
        'GeneticModels',
        annotation_number='.',
        entry_type='String',
        description="':'-separated list of genetic models for this variant."
    )
    logger.debug("Genetic models added")

    logger.info("Adding model score to vcf header")
    add_metadata(
        head,
        'info',
        'ModelScore',
        annotation_number='.',
        entry_type='String',
        description="PHRED score for genotype models."
    )
    logger.debug("Model score added")

    logger.info("Adding Compounds to vcf header")
    add_metadata(
        head,
        'info',
        'Compounds',
        annotation_number='.',
        entry_type='String',
        description=("List of compound pairs for this variant. The list is"
                     " split on ',', family id is separated from compounds"
                     " with ':'. Compounds are separated with '|'.")
    )
    logger.debug("Compounds added")

    vcf_individuals = head.individuals
    logger.debug("Individuals found in vcf file: {}".format(
        ', '.join(vcf_individuals)))

    try:
        check_individuals(family_parser.individuals, vcf_individuals)
    except IOError as e:
        logger.error(e)
        logger.info("Individuals in PED file: {0}".format(
            ', '.join(family_parser.individuals)))
        logger.info("Individuals in VCF file: {0}".format(
            ', '.join(vcf_individuals)))
        context.abort()

    start_time_analysis = datetime.now()

    analysis_individuals = list(family_parser.individuals.keys())
    logger.info("Individuals used in analysis: {0}".format(
        ', '.join(analysis_individuals)))

    ####################################################################
    ### The task queue is where all jobs (in this case batches that  ###
    ### represent variants in a region) are put. The consumers will  ###
    ### then pick their jobs from this queue.                        ###
    ####################################################################
    logger.debug("Setting up a JoinableQueue for storing variant batches")
    # One batch consists of all variants from one or several overlapping genes;
    # there can be a significant amount of variants in a batch for whole genome
    # data...
    variant_queue = JoinableQueue(maxsize=100)
    logger.debug("Setting up a Queue for storing results from workers")
    results = Manager().Queue()

    num_model_checkers = processes
    # Adapt the number of processes to the machine that runs the analysis
    logger.info('Number of CPUs: {}'.format(cpu_count()))
    logger.info('Number of model checkers: {}'.format(num_model_checkers))

    # These are the workers that do the heavy part of the analysis
    logger.info('Setting up the workers')
    try:
        model_checkers = [
            VariantAnnotator(
                task_queue=variant_queue,
                results_queue=results,
                families=families,
                individuals=analysis_individuals,
                phased=phased,
                strict=strict,
                vep=vep,
                reduced_penetrance_genes=reduced_penetrance_genes
            )
            for i in range(num_model_checkers)
        ]

        logger.info('Starting the workers')
        for worker in model_checkers:
            logger.debug('Starting worker {0}'.format(worker))
            worker.start()

        # This process prints the variants to temporary files
        logger.info('Setting up the variant printer')
        if len(model_checkers) == 1:
            print_headers(head=head, outfile=outfile, silent=silent)
            variant_printer = VariantPrinter(
                task_queue=results,
                head=head,
                mode='normal',
                outfile=outfile
            )
        else:
            # We use a temp file to store the processed variants
            logger.debug("Build a tempfile for printing the variants")
            if temp_dir:
                temp_file = NamedTemporaryFile(delete=False, dir=temp_dir)
            else:
                temp_file = NamedTemporaryFile(delete=False)
            temp_file.close()
            variant_printer = VariantPrinter(
                task_queue=results,
                head=head,
                mode='chromosome',
                outfile=temp_file.name
            )

        logger.info('Starting the variant printer process')
        variant_printer.start()

        start_time_variant_parsing = datetime.now()

        # This process parses the original vcf and creates batches to put in the variant queue:
        logger.info('Start parsing the variants')
        chromosome_list = get_batches(
            variants=variant_file,
            batch_queue=variant_queue,
            header=head,
            vep=vep,
            annotation_keyword=keyword
        )

        logger.debug("Put stop signs in the variant queue")
        for i in range(num_model_checkers):
            variant_queue.put(None)

        variant_queue.join()
        results.put(None)
        variant_printer.join()

        if len(model_checkers) > 1:
            sort_variants(infile=temp_file.name, mode='chromosome')
            print_headers(head=head, outfile=outfile, silent=silent)
            with open(temp_file.name, 'r', encoding='utf-8') as f:
                for line in f:
                    print_variant(
                        variant_line=line,
                        outfile=outfile,
                        mode='modified',
                        silent=silent
                    )
    except Exception as err:
        logger.warning(err)
        for worker in model_checkers:
            worker.terminate()
        variant_printer.terminate()
        context.abort()
    finally:
        if len(model_checkers) > 1:
            logger.info("Removing temp file")
            os.remove(temp_file.name)
            logger.debug("Temp file removed")

    logger.info('Time for whole analysis: {0}'.format(
        str(datetime.now() - start_time_analysis)))
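
# --- Header-read idiom sketch (illustrative only) -----------------------------
# Both models() variants above consume header lines from the VCF handle and
# then push the first data line back with itertools.chain so the variant loop
# still sees it. A minimal, hedged sketch of that idiom; the helper name is
# hypothetical and assumes the HeaderParser and itertools imports used above.
def _example_read_header(variant_file):
    """Sketch: parse the VCF header and return it plus the rewound iterator."""
    head = HeaderParser()
    line = '#'
    for line in variant_file:
        line = line.rstrip()
        if line.startswith('##'):
            head.parse_meta_data(line)
        elif line.startswith('#'):
            head.parse_header_line(line)
        else:
            break
    if not line.startswith('#'):
        # put the first variant back in front of the remaining iterator
        variant_file = itertools.chain([line], variant_file)
    return head, variant_file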

def annotate(variant_file, annotate_regions, cadd_file, thousand_g, exac, spidex,
             annotation_dir, outfile, silent, cadd_raw, cosmic, max_af, processes,
             temp_dir):
    """
    Annotate vcf variants.

    Annotate variants with a number of different sources.
    Please use --help for more info.
    """
    logger.info("Running genmod annotate_variant version {0}".format(__version__))

    start_time_analysis = datetime.now()
    annotator_arguments = {}

    logger.info("Initializing a Header Parser")
    head = HeaderParser()

    line = None
    for line in variant_file:
        line = line.rstrip()
        if line.startswith('#'):
            if line.startswith('##'):
                head.parse_meta_data(line)
            else:
                head.parse_header_line(line)
        else:
            break

    # Add the first variant to the iterator
    if line:
        variant_file = itertools.chain([line], variant_file)

    header_line = head.header
    annotator_arguments['header_line'] = header_line

    if annotate_regions:
        logger.info("Loading annotations")
        gene_trees, exon_trees = load_annotations(annotation_dir)
        annotator_arguments['gene_trees'] = gene_trees
        annotator_arguments['exon_trees'] = exon_trees

        add_metadata(
            head,
            'info',
            'Annotation',
            annotation_number='.',
            entry_type='String',
            description='Annotates what feature(s) this variant belongs to.'
        )
        add_metadata(
            head,
            'info',
            'Exonic',
            annotation_number='0',
            entry_type='Flag',
            description='Indicates if the variant is exonic.'
        )

    if exac:
        logger.info("Annotating ExAC frequencies")
        logger.debug("Using ExAC file: {0}".format(exac))
        annotator_arguments['exac'] = exac
        add_metadata(
            head,
            'info',
            'ExACAF',
            annotation_number='1',
            entry_type='Float',
            description="Frequency in the ExAC database."
        )

    if thousand_g:
        logger.info("Annotating 1000G frequencies")
        logger.debug("Using 1000G file: {0}".format(thousand_g))
        annotator_arguments['thousand_g'] = thousand_g
        add_metadata(
            head,
            'info',
            '1000GAF',
            annotation_number='1',
            entry_type='Float',
            description="Frequency in the 1000G database."
        )

    if spidex:
        logger.info("Annotating Spidex z scores")
        logger.debug("Using Spidex file: {0}".format(spidex))
        annotator_arguments['spidex'] = spidex
        add_metadata(
            head,
            'info',
            'SPIDEX',
            annotation_number='1',
            entry_type='Float',
            description="Z score from the spidex database."
        )

    if cadd_file:
        logger.info("Annotating CADD scores")
        logger.debug("Using CADD file(s): {0}".format(', '.join(cadd_file)))
        annotator_arguments['cadd_files'] = cadd_file
        any_cadd_file = True
        add_metadata(
            head,
            'info',
            'CADD',
            annotation_number='1',
            entry_type='Integer',
            description="The CADD relative score for this alternative."
        )
        if cadd_raw:
            annotator_arguments['cadd_raw'] = cadd_raw
            logger.debug("Adding vcf metadata for CADD raw score")
            add_metadata(
                head,
                'info',
                'CADD_raw',
                annotation_number='1',
                entry_type='Float',
                description="The CADD raw score(s) for this alternative(s)."
            )

    if max_af:
        annotator_arguments['max_af'] = max_af
        if thousand_g:
            add_metadata(
                head,
                'info',
                '1000G_MAX_AF',
                annotation_number='1',
                entry_type='Float',
                description="The max af for thousand genomes populations."
            )
        if exac:
            add_metadata(
                head,
                'info',
                'ExAC_MAX_AF',
                annotation_number='1',
                entry_type='Float',
                description="The max af for ExAC populations."
            )

    if cosmic:
        logger.info("Annotating if variant is in COSMIC")
        logger.debug("Using COSMIC file: {0}".format(cosmic))
        annotator_arguments['cosmic'] = cosmic
        add_metadata(
            head,
            'info',
            'COSMIC',
            annotation_number='0',
            entry_type='Flag',
            description="If variant is in COSMIC database."
        )

    ####################################################################
    ### The task queue is where all jobs (in this case batches that  ###
    ### represent variants in a region) are put. The consumers will  ###
    ### then pick their jobs from this queue.                        ###
    ####################################################################
    logger.debug("Setting up a JoinableQueue for storing variant batches")
    variant_queue = JoinableQueue(maxsize=1000)
    logger.debug("Setting up a Queue for storing results from workers")
    results = Manager().Queue()

    num_annotators = processes
    # Adapt the number of processes to the machine that runs the analysis
    if cadd_file or spidex:
        # We need more power when annotating cadd scores,
        # but an explicit flag overrides this default
        if num_annotators == min(4, cpu_count()):
            num_annotators = min(8, cpu_count())

    logger.info('Number of CPUs: {}'.format(cpu_count()))
    logger.info('Number of model checkers: {}'.format(num_annotators))

    # These are the workers that do the heavy part of the analysis
    logger.info('Setting up the workers')
    annotators = [
        VariantAnnotator(
            variant_queue,
            results,
            **annotator_arguments
        )
        for i in range(num_annotators)
    ]

    logger.info('Starting the workers')
    for worker in annotators:
        logger.debug('Starting worker {0}'.format(worker))
        worker.start()

    # This process prints the variants to temporary files.
    # If there is only one annotation process we can print the results as soon
    # as they are done.
    logger.info('Setting up the variant printer')
    if len(annotators) == 1:
        print_headers(head, outfile, silent)
        var_printer = VariantPrinter(
            task_queue=results,
            head=head,
            mode='normal',
            outfile=outfile
        )
    else:
        # We use a temp file to store the processed variants
        logger.debug("Build a tempfile for printing the variants")
        if temp_dir:
            temp_file = NamedTemporaryFile(delete=False, dir=temp_dir)
        else:
            temp_file = NamedTemporaryFile(delete=False)
        temp_file.close()
        var_printer = VariantPrinter(
            task_queue=results,
            head=head,
            mode='chromosome',
            outfile=temp_file.name
        )

    logger.info('Starting the variant printer process')
    var_printer.start()

    start_time_variant_parsing = datetime.now()
    start_time_twenty = datetime.now()
    nr_of_lines = 0

    # This process parses the original vcf and puts the variant lines in the variant queue:
    logger.info('Start parsing the variants')
    for line in variant_file:
        line = line.rstrip()
        if not line.startswith('#'):
            variant_queue.put(line)
            nr_of_lines += 1
            if nr_of_lines % 20000 == 0:
                logger.info('{0} variants parsed'.format(nr_of_lines))
                logger.info('Last 20000 took {0} to parse'.format(
                    datetime.now() - start_time_twenty))
                start_time_twenty = datetime.now()

    logger.info('Put stop signs in the variant queue')
    for i in range(num_annotators):
        variant_queue.put(None)

    variant_queue.join()
    results.put(None)
    var_printer.join()

    if len(annotators) > 1:
        logger.info("Start sorting the variants")
        sort_variants(temp_file.name, mode='chromosome')
        logger.info("Print the headers")
        print_headers(head, outfile, silent)
        with open(temp_file.name, 'r', encoding='utf-8') as f:
            for line in f:
                print_variant(
                    variant_line=line,
                    outfile=outfile,
                    mode='modified',
                    silent=silent
                )
        logger.info("Removing temp file")
        os.remove(temp_file.name)
        logger.debug("Temp file removed")

    logger.info('Time for whole analysis: {0}'.format(
        str(datetime.now() - start_time_analysis)))
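
# --- Multi-process output sketch (illustrative only) --------------------------
# When annotate() runs with more than one worker, results arrive out of order,
# so they are written to a NamedTemporaryFile, sorted per chromosome, and only
# then printed after the headers. A minimal sketch of that final step, reusing
# the genmod helpers called above; the function name and arguments are
# hypothetical.
def _example_sorted_output(temp_file_name, head, outfile, silent):
    """Sketch: sort the temp file per chromosome, then print headers and variants."""
    sort_variants(temp_file_name, mode='chromosome')
    print_headers(head, outfile, silent)
    with open(temp_file_name, 'r', encoding='utf-8') as handle:
        for line in handle:
            print_variant(variant_line=line, outfile=outfile,
                          mode='modified', silent=silent)
    os.remove(temp_file_name)  # the temp file is removed once everything is printed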