def test_get_batches_vep():
    """Test batching variants that carry VEP (CSQ) annotations"""
    batch_queue = Queue()
    variants = []
    first_variant = get_variant_line(info="MQ;CSQ=G|ADK")
    second_variant = get_variant_line(pos="2", info="MQ;CSQ=G|ADK")
    variants.append(first_variant)
    variants.append(second_variant)

    header = HeaderParser()
    header.parse_header_line("#{0}".format(HEADER))
    header.vep_columns = ['Allele', 'SYMBOL']

    chromosomes = get_batches(variants=variants, batch_queue=batch_queue, header=header)

    batch_1 = batch_queue.get()
    batch_queue.task_done()
    batch_2 = batch_queue.get()
    batch_queue.task_done()

    assert chromosomes == ['1']
    assert len(batch_1) == 1
    assert len(batch_2) == 1

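# NOTE: The get_batches tests above and below rely on a HEADER constant and a
# get_variant_line() helper that are defined elsewhere in the test suite. A
# minimal sketch of what they might look like follows; the column names, the
# sample id and the default field values are assumptions for illustration only.
HEADER = "CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT\tADM1059A1"

def get_variant_line(chrom="1", pos="1", db_id=".", ref="A", alt="C",
                     qual="100", filt="PASS", info="MQ", form="GT:GQ",
                     genotypes=None):
    """Return a tab separated VCF variant line ending with a newline."""
    genotypes = genotypes or ["0/1:60"]
    return '\t'.join([chrom, pos, db_id, ref, alt, qual, filt, info, form] +
                     genotypes) + '\n'
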
def test_get_batches_new_chromosome():
    """Test that a new batch is started when a new chromosome is found"""
    batch_queue = Queue()
    variants = []
    first_variant = get_variant_line()
    second_variant = get_variant_line(chrom="2")
    variants.append(first_variant)
    variants.append(second_variant)

    header = HeaderParser()
    header.parse_header_line("#{0}".format(HEADER))

    chromosomes = get_batches(variants=variants, batch_queue=batch_queue, header=header)

    batch_1 = batch_queue.get()
    batch_queue.task_done()
    batch_2 = batch_queue.get()
    batch_queue.task_done()

    assert chromosomes == ['1', '2']
    assert len(batch_1) == 1
    assert len(batch_2) == 1

def test_get_batches_two_regions():
    """Test batching variants that belong to two different regions"""
    batch_queue = Queue()
    variants = []
    first_variant = get_variant_line()
    second_variant = get_variant_line(pos="2", info="Annotation=DDD;Exonic")
    variants.append(first_variant)
    variants.append(second_variant)

    header = HeaderParser()
    header.parse_header_line("#{0}".format(HEADER))

    chromosomes = get_batches(variants=variants, batch_queue=batch_queue, header=header)

    batch_1 = batch_queue.get()
    batch_queue.task_done()
    batch_2 = batch_queue.get()
    batch_queue.task_done()

    assert chromosomes == ['1']
    assert len(batch_1) == 1
    assert len(batch_2) == 1

def test_variant_printer():
    """Test the variant printer"""
    vcf_file = setup_vcf_file()
    variant_queue = Manager().Queue()
    head = HeaderParser()

    outfile = NamedTemporaryFile(mode='w+t', delete=False, suffix='.vcf')
    outfile.close()

    variant_printer = VariantPrinter(
        task_queue=variant_queue,
        head=head,
        mode='chromosome',
        outfile=outfile.name
    )
    variant_printer.start()

    batch = OrderedDict()

    # Parse the headers and put the variant dicts in the printer queue
    with open(vcf_file, 'r') as vcf_handle:
        for line in vcf_handle:
            line = line.rstrip()
            if line.startswith('#'):
                if line.startswith('##'):
                    head.parse_meta_data(line)
                else:
                    head.parse_header_line(line)
            else:
                variant_dict = get_variant_dict(line, head.header)
                variant_id = get_variant_id(variant_dict)
                variant_dict['variant_id'] = variant_id
                variant_dict['info_dict'] = get_info_dict(variant_dict['INFO'])
                variant_queue.put(variant_dict)

    variant_queue.put(None)
    variant_printer.join()

    variants = []
    with open(outfile.name, 'r', encoding='utf-8-sig') as f:
        for line in f:
            variants.append(line.rstrip().split('\t'))

    assert variants[0][0] == '1'
    assert variants[0][2] == '11900'

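# NOTE: setup_vcf_file() is a test fixture defined elsewhere in the test suite.
# A rough sketch of what it might do, based on the assertions above (the first
# variant after sorting is on chromosome 1 at position 11900); the exact header
# lines, sample name and variant content are assumptions for illustration only.
def setup_vcf_file():
    """Write a small VCF to a temporary file and return its path."""
    vcf_lines = [
        '##fileformat=VCFv4.2',
        '##INFO=<ID=MQ,Number=1,Type=Float,Description="Mapping Quality">',
        '##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">',
        '#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT\tADM1059A1',
        '1\t11900\t.\tA\tT\t100\tPASS\tMQ=60\tGT\t0/1',
        '2\t34000\t.\tG\tC\t100\tPASS\tMQ=60\tGT\t1/1',
    ]
    vcf_file = NamedTemporaryFile(mode='w+t', delete=False, suffix='.vcf')
    vcf_file.write('\n'.join(vcf_lines) + '\n')
    vcf_file.close()
    return vcf_file.name
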
def test_get_batches_one():
    """Test batching a single variant"""
    batch_queue = Queue()
    variants = []
    first_variant = get_variant_line()

    header = HeaderParser()
    header.parse_header_line("#{0}".format(HEADER))

    variants.append(first_variant)

    chromosomes = get_batches(variants=variants, batch_queue=batch_queue, header=header)

    batch = batch_queue.get()

    assert chromosomes == ['1']
    assert len(batch) == 1

def compound(context, variant_file, silent, outfile, vep, processes, temp_dir):
    """
    Score compound variants in a vcf file based on their rank score.
    """
    logger.info('Running GENMOD score_compounds, version: {0}'.format(__version__))

    variant_file = get_file_handle(variant_file)
    start_time_analysis = datetime.now()

    logger.info("Initializing a Header Parser")
    head = HeaderParser()

    line = None
    for line in variant_file:
        line = line.rstrip()
        if line.startswith('#'):
            if line.startswith('##'):
                head.parse_meta_data(line)
            else:
                head.parse_header_line(line)
        else:
            break

    logger.info("Headers parsed")

    # Add the first variant back to the iterator
    if not line.startswith('#'):
        variant_file = itertools.chain([line], variant_file)
    else:
        print_headers(head=head, outfile=outfile, silent=silent)
        sys.exit(0)

    header_line = head.header
    individuals = head.individuals

    ###################################################################
    ### The task queue is where all jobs (in this case batches that ###
    ### represent variants in a region) are put. The consumers will ###
    ### then pick their jobs from this queue.                       ###
    ###################################################################
    logger.debug("Setting up a JoinableQueue for storing variant batches")
    variant_queue = JoinableQueue(maxsize=1000)

    logger.debug("Setting up a Queue for storing results from workers")
    results = Manager().Queue()

    num_scorers = processes
    # Adapt the number of processes to the machine that runs the analysis
    logger.info('Number of CPU:s {}'.format(cpu_count()))
    logger.info('Number of model checkers: {}'.format(num_scorers))

    # These are the workers that do the heavy part of the analysis
    logger.info('Setting up the workers')
    compound_scorers = [
        CompoundScorer(
            task_queue=variant_queue,
            results_queue=results,
            individuals=individuals,
        )
        for i in range(num_scorers)
    ]

    try:
        logger.info('Starting the workers')
        for worker in compound_scorers:
            logger.debug('Starting worker {0}'.format(worker))
            worker.start()

        # This process prints the variants to temporary files
        logger.info('Setting up the variant printer')

        # We use a temp file to store the processed variants
        logger.debug("Build a tempfile for printing the variants")
        if temp_dir:
            temp_file = NamedTemporaryFile(delete=False, dir=temp_dir)
        else:
            temp_file = NamedTemporaryFile(delete=False)
        temp_file.close()

        variant_printer = VariantPrinter(task_queue=results,
                                         head=head,
                                         mode='chromosome',
                                         outfile=temp_file.name)

        logger.info('Starting the variant printer process')
        variant_printer.start()

        start_time_variant_parsing = datetime.now()

        # This process parses the original vcf and creates batches to put in the variant queue:
        chromosome_list = get_batches(variants=variant_file,
                                      batch_queue=variant_queue,
                                      header=head,
                                      vep=vep,
                                      results_queue=results)

        logger.debug("Put stop signs in the variant queue")
        for i in range(num_scorers):
            variant_queue.put(None)

        variant_queue.join()
        results.put(None)
        variant_printer.join()

        sort_variants(infile=temp_file.name, mode='chromosome')

        print_headers(head=head, outfile=outfile, silent=silent)

        with open(temp_file.name, 'r', encoding='utf-8') as f:
            for line in f:
                print_variant(variant_line=line,
                              outfile=outfile,
                              mode='modified',
                              silent=silent)

    except Exception as e:
        logger.warning(e)
        for worker in compound_scorers:
            worker.terminate()
        variant_printer.terminate()
        context.abort()

    finally:
        logger.info("Removing temp file")
        os.remove(temp_file.name)
        logger.debug("Temp file removed")

    logger.info('Time for whole analysis: {0}'.format(
        str(datetime.now() - start_time_analysis)))

def annotate(context, variant_file, annotate_regions, region_file, cadd_file,
             thousand_g, exac, spidex, outfile, silent, cadd_raw, cosmic,
             max_af, temp_dir, genome_build):
    """
    Annotate vcf variants.

    Annotate variants with a number of different sources.
    Please use --help for more info.
    """
    regions = annotate_regions
    logger.info("Running genmod annotate_variant version {0}".format(__version__))

    if not region_file:
        if genome_build == '37':
            region_file = ensembl_path_37
        elif genome_build == '38':
            region_file = ensembl_path_38

    start_time_analysis = datetime.now()
    annotation_arguments = {}

    variants = get_file_handle(variant_file)

    logger.info("Initializing a Header Parser")
    head = HeaderParser()

    line = None
    for line in variants:
        line = line.rstrip()
        if line.startswith('#'):
            if line.startswith('##'):
                head.parse_meta_data(line)
            else:
                head.parse_header_line(line)
        else:
            break

    # Add the first variant back to the iterator
    # If the vcf has no variants the last line will be a header
    if not line.startswith('#'):
        variants = itertools.chain([line], variants)
    else:
        print_headers(head, outfile, silent)
        sys.exit(0)

    header_line = head.header
    annotation_arguments['header_line'] = header_line

    try:
        if regions:
            logger.info("Loading annotations")
            logger.info("Use annotations file: {0}".format(region_file))
            add_regions(head)
            regions_handle = get_file_handle(region_file)
            logger.debug("Adding region trees to arguments")
            annotation_arguments['region_trees'] = build_region_trees(
                regions_handle, padding=4000)

        if exac:
            logger.info("Annotating ExAC frequencies")
            logger.debug("Using ExAC file: {0}".format(exac))
            annotation_arguments['exac'] = get_tabixhandle(exac)
            add_exac(head)

        if thousand_g:
            logger.info("Annotating 1000G frequencies")
            logger.debug("Using 1000G file: {0}".format(thousand_g))
            annotation_arguments['thousand_g'] = get_tabixhandle(thousand_g)
            add_thousandg(head)

        if spidex:
            logger.info("Annotating Spidex z scores")
            logger.debug("Using Spidex file: {0}".format(spidex))
            annotation_arguments['spidex'] = get_tabixhandle(spidex)
            add_spidex(head)

        if cadd_file:
            logger.info("Annotating CADD scores")
            logger.debug("Using CADD file(s): {0}".format(', '.join(cadd_file)))
            annotation_arguments['cadd_files'] = [
                get_tabixhandle(cadd) for cadd in cadd_file
            ]
            add_cadd(head)

        if cadd_raw:
            annotation_arguments['cadd_raw'] = cadd_raw
            add_cadd_raw(head)

        if max_af:
            annotation_arguments['max_af'] = max_af
            if thousand_g:
                add_thousandg_max(head)
            if exac:
                add_exac_max(head)

        if cosmic:
            logger.info("Annotating if variant is in COSMIC")
            logger.debug("Using COSMIC file: {0}".format(cosmic))
            annotation_arguments['cosmic'] = get_tabixhandle(cosmic)
            add_cosmic(head)

    except TabixError as err:
        logger.warning(err)
        context.abort()

    print_headers(head, outfile, silent)

    for variant in variants:
        print_variant(variant_line=annotate_variant(variant, annotation_arguments),
                      outfile=outfile,
                      silent=silent)

def sort(variant_file, outfile, family_id, silent, position, temp_dir):
    """
    Sort a VCF file based on rank score.
    """
    head = HeaderParser()
    variant_file = get_file_handle(variant_file)

    logger.info("Running GENMOD sort version {0}".format(__version__))
    start = datetime.now()

    # Create a temporary variant file for sorting
    logger.debug("Creating temporary file for sorting")
    if temp_dir:
        temp_file = NamedTemporaryFile(delete=False, dir=temp_dir)
    else:
        temp_file = NamedTemporaryFile(delete=False)
    temp_file.close()

    # Open the temp file for writing
    temp_file_handle = open(temp_file.name,
                            mode='w',
                            encoding='utf-8',
                            errors='replace')
    logger.debug("Temp file created")

    logger.info("Printing variants to temp file")
    nr_variants = 0

    # Print the variants with rank score in first column
    for line in variant_file:
        line = line.rstrip()
        if line.startswith('#'):
            if line.startswith('##'):
                head.parse_meta_data(line)
            else:
                head.parse_header_line(line)
        else:
            nr_variants += 1
            priority = '0'

            if position:
                chrom = line.split()[0]
                priority = get_chromosome_priority(chrom)
            else:
                priority = get_rank_score(line)

            print_variant(variant_line=line, priority=priority, outfile=temp_file_handle)

    temp_file_handle.close()

    logger.info("Variants printed to temp file")
    logger.info("Nr of variants in VCF file: {0}".format(nr_variants))

    sort_mode = 'rank'

    if nr_variants == 0:
        logger.debug("Printing headers")
        print_headers(head=head, outfile=outfile, silent=silent)
        sys.exit(0)

    if position:
        sort_mode = 'chromosome'

    logger.info("Sorting variants")
    sort_variants(infile=temp_file.name, mode=sort_mode)
    logger.info("Variants sorted")

    logger.debug("Printing headers")
    print_headers(head=head, outfile=outfile, silent=silent)
    logger.debug("Headers printed")

    logger.info("Printing variants")
    with open(temp_file.name, mode='r', encoding='utf-8', errors='replace') as f:
        for variant_line in f:
            print_variant(variant_line=variant_line,
                          outfile=outfile,
                          mode='modified',
                          silent=False)
    logger.debug("Variants printed")

    logger.info("Removing temp file")
    os.remove(temp_file.name)
    logger.debug("Temp file removed")

    logger.info("Sorting done, time for sorting: {0}".format(datetime.now() - start))

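# NOTE: get_rank_score() and get_chromosome_priority() are imported from
# genmod's vcf_tools package and are not shown here. As an illustration of the
# kind of value the sort above keys on, a hypothetical extractor for the
# RankScore INFO entry (format "family_id:rank_score", as documented in the
# score command below) could look like this sketch; it is not genmod's
# implementation.
def example_get_rank_score(variant_line, default='-1000'):
    """Return the rank score from a RankScore INFO entry, or a low default."""
    info_field = variant_line.split('\t')[7]
    for entry in info_field.split(';'):
        if entry.startswith('RankScore='):
            # RankScore=family_id:score -> keep the score part
            return entry.split('=')[1].split(':')[-1]
    return default
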
def score(context, variant_file, family_id, family_file, family_type,
          score_config, silent, skip_plugin_check, rank_results, outfile):
    """
    Score variants in a vcf file using a Weighted Sum Model.

    The specific scores should be defined in a config file, see examples on
    github.
    """
    logger.info('Running GENMOD score, version: {0}'.format(__version__))

    logger.info("Checking family id")

    variant_file = get_file_handle(variant_file)

    if family_file:
        logger.info("Setting up a family parser")
        family_parser = FamilyParser(family_file, family_type)
        logger.debug("Family parser done")
        family_id = list(family_parser.families.keys())[0]

    logger.info("Family used in analysis: {0}".format(family_id))

    ## Check the score config:
    if not score_config:
        logger.warning("Please provide a score config file.")
        context.abort()

    logger.debug("Parsing config file")
    try:
        config_parser = ConfigParser(score_config)
    except ValidateError as e:
        logger.error(e.message)
        context.abort()

    score_categories = list(config_parser.categories.keys())

    logger.debug("Config parsed successfully")

    logger.info("Initializing a Header Parser")
    head = HeaderParser()

    for line in variant_file:
        line = line.rstrip()
        if line.startswith('#'):
            if line.startswith('##'):
                head.parse_meta_data(line)
            else:
                head.parse_header_line(line)
        else:
            break

    logger.info("Check if all score plugins exist in vcf ...")
    if not check_plugins(config_parser, head):
        if not skip_plugin_check:
            logger.error("All score plugins have to be defined in vcf header")
            context.abort()
    else:
        logger.info("All plugins are defined in vcf")

    csq_format = head.vep_columns

    # Add the first variant to the iterator
    if not line.startswith('#'):
        variant_file = itertools.chain([line], variant_file)
    else:
        print_headers(head=head, outfile=outfile, silent=silent)
        sys.exit(0)

    header_line = head.header

    if "RankScore" in head.info_dict:
        logger.warning("Variants already scored according to VCF header")
        logger.info("Please check VCF file")
        context.abort()

    add_metadata(
        head,
        'info',
        'RankScore',
        annotation_number='.',
        entry_type='String',
        description="The rank score for this variant in this family. family_id:rank_score."
    )

    if rank_results:
        add_metadata(head,
                     'info',
                     'RankResult',
                     annotation_number='.',
                     entry_type='String',
                     description='|'.join(score_categories))

    print_headers(head=head, outfile=outfile, silent=silent)

    start_scoring = datetime.now()
    last_twenty = datetime.now()
    nr_of_variants = 1

    for line in variant_file:
        if not line.startswith('#'):
            variant = get_variant_dict(line, header_line)
            variant['info_dict'] = get_info_dict(variant['INFO'])
            rank_score = 0
            # This is for printing results to vcf:
            category_scores = []
            for category in score_categories:
                category_score = get_category_score(variant=variant,
                                                    category=category,
                                                    config_parser=config_parser,
                                                    csq_format=csq_format)
                logger.debug("Adding category score {0} to rank_score".format(category_score))
                rank_score += category_score
                logger.debug("Updating rank score to {0}".format(rank_score))
                category_scores.append(str(category_score))

            variant = add_vcf_info(keyword='RankScore',
                                   variant_dict=variant,
                                   annotation="{0}:{1}".format(family_id, rank_score))

            if rank_results:
                variant = add_vcf_info(keyword='RankResult',
                                       variant_dict=variant,
                                       annotation="|".join(category_scores))

            print_variant(variant_dict=variant,
                          header_line=header_line,
                          outfile=outfile,
                          silent=silent)

            nr_of_variants += 1

            if nr_of_variants % 20000 == 0:
                logger.info("{0} variants scored.".format(nr_of_variants))
                logger.info("Last 20000 took {0} to score.".format(
                    datetime.now() - last_twenty))
                last_twenty = datetime.now()

    logger.info("Variants scored. Number of variants: {0}".format(nr_of_variants))
    logger.info("Time to score variants: {0}".format(datetime.now() - start_scoring))

def models(context, variant_file, family_file, family_type, reduced_penetrance,
           vep, keyword, phased, strict, silent, processes, outfile, temp_dir,
           whole_gene):
    """
    Annotate genetic models for vcf variants.

    Checks what patterns of inheritance that are followed in a VCF file.
    The analysis is family based so each family that is specified in the family
    file and exists in the variant file will get its own annotation.
    """
    ######### This is for logging the command line string #########
    frame = inspect.currentframe()
    args, _, _, values = inspect.getargvalues(frame)
    argument_list = [
        i + '=' + str(values[i]) for i in values
        if values[i] and i not in ['frame']
    ]

    variant_file = get_file_handle(variant_file)
    ###########################################################################

    logger.info("Running GENMOD annotate models version {0}".format(__version__))
    logger.debug("Arguments: {0}".format(', '.join(argument_list)))

    reduced_penetrance_genes = set()
    nr_reduced_penetrance_genes = 0
    if reduced_penetrance:
        logger.info("Found file with genes that have reduced penetrance")
        for line in reduced_penetrance:
            if not line.startswith('#'):
                nr_reduced_penetrance_genes += 1
                gene_id = line.rstrip().split()[0]
                logger.debug("Adding gene {0} to reduced penetrance genes".format(gene_id))
                reduced_penetrance_genes.add(gene_id)

        logger.info("Found {0} genes with reduced penetrance".format(
            nr_reduced_penetrance_genes))

    if not family_file:
        logger.warning("Please provide a family file with -f/--family_file")
        context.abort()

    logger.info("Setting up a family parser")
    family_parser = FamilyParser(family_file, family_type)
    logger.debug("Family parser done")

    families = {}
    logger.info("Check if the families have any affected")
    for family_id in family_parser.families:
        found_affected = False
        family_obj = family_parser.families[family_id]
        for ind_id in family_obj.individuals:
            ind_obj = family_obj.individuals[ind_id]
            if ind_obj.affected:
                found_affected = True

        if found_affected:
            families[family_id] = family_obj
        else:
            logger.warning("No affected individuals found for family {0}."
                           " Skipping family.".format(family_id))

    if not families:
        logger.warning("Please provide at least one family with affected individuals")
        context.abort()

    # The individuals in the ped file must be present in the variant file:
    logger.info("Families used in analysis: {0}".format(
        ','.join(list(families.keys()))))
    logger.info("Individuals included in analysis: {0}".format(
        ','.join(list(family_parser.individuals.keys()))))

    head = HeaderParser()

    for line in variant_file:
        line = line.rstrip()
        if line.startswith('#'):
            if line.startswith('##'):
                head.parse_meta_data(line)
            else:
                head.parse_header_line(line)
        else:
            break

    # Add the first variant to the iterator
    if not line.startswith('#'):
        variant_file = itertools.chain([line], variant_file)
    else:
        print_headers(head=head, outfile=outfile, silent=silent)
        sys.exit(0)

    if vep:
        if not "CSQ" in head.info_dict:
            logger.warning("vep flag is used but there is no CSQ field specified in header")
            logger.info("Please check VCF file")
            context.abort()
        else:
            logger.info("Using VEP annotation")
    else:
        if not keyword in head.info_dict:
            logger.warning("Annotation key {0} could not be found in VCF header".format(keyword))
            logger.info("Please check VCF file")
            context.abort()
        else:
            logger.info("Using {0} annotation".format(keyword))

    if "GeneticModels" in head.info_dict:
        logger.warning("Genetic models are already annotated according to vcf"
                       " header.")
        context.abort()

    logger.info("Adding genmod version to vcf header")
    head.add_version_tracking(info_id='genmod',
                              version=__version__,
                              date=datetime.now().strftime("%Y-%m-%d %H:%M"),
                              command_line=' '.join(argument_list))

    logger.debug("Version added")

    logger.info("Adding genetic models to vcf header")
    add_metadata(
        head,
        'info',
        'GeneticModels',
        annotation_number='.',
        entry_type='String',
        description="':'-separated list of genetic models for this variant.")
    logger.debug("Genetic models added")

    logger.info("Adding model score to vcf header")
    add_metadata(head,
                 'info',
                 'ModelScore',
                 annotation_number='.',
                 entry_type='String',
                 description="PHRED score for genotype models.")
    logger.debug("Model score added")

    logger.info("Adding Compounds to vcf header")
    add_metadata(
        head,
        'info',
        'Compounds',
        annotation_number='.',
        entry_type='String',
        description=("List of compound pairs for this variant. "
                     "The list is split on ',', family id is separated from "
                     "compounds with ':'. Compounds are separated with '|'."))
    logger.debug("Compounds added")

    vcf_individuals = head.individuals
    logger.debug("Individuals found in vcf file: {}".format(', '.join(vcf_individuals)))

    try:
        check_individuals(family_parser.individuals, vcf_individuals)
    except IOError as e:
        logger.error(e)
        logger.info("Individuals in PED file: {0}".format(
            ', '.join(family_parser.individuals)))
        logger.info("Individuals in VCF file: {0}".format(', '.join(vcf_individuals)))
        context.abort()

    start_time_analysis = datetime.now()

    analysis_individuals = list(family_parser.individuals.keys())

    logger.info("Individuals used in analysis: {0}".format(
        ', '.join(analysis_individuals)))

    ###################################################################
    ### The task queue is where all jobs (in this case batches that ###
    ### represent variants in a region) are put. The consumers will ###
    ### then pick their jobs from this queue.                       ###
    ###################################################################
    logger.debug("Setting up a JoinableQueue for storing variant batches")
    # One batch consists of all variants from one or several overlapping genes.
    # There can be a significant amount of variants in a batch for whole genome
    # data...
    variant_queue = JoinableQueue(maxsize=100)
    logger.debug("Setting up a Queue for storing results from workers")
    results = Manager().Queue()

    num_model_checkers = processes
    # Adapt the number of processes to the machine that runs the analysis
    logger.info('Number of CPU:s {}'.format(cpu_count()))
    logger.info('Number of model checkers: {}'.format(num_model_checkers))

    # These are the workers that do the heavy part of the analysis
    logger.info('Setting up the workers')
    try:
        model_checkers = [
            VariantAnnotator(task_queue=variant_queue,
                             results_queue=results,
                             families=families,
                             individuals=analysis_individuals,
                             phased=phased,
                             strict=strict,
                             vep=vep,
                             reduced_penetrance_genes=reduced_penetrance_genes)
            for i in range(num_model_checkers)
        ]

        logger.info('Starting the workers')
        for worker in model_checkers:
            logger.debug('Starting worker {0}'.format(worker))
            worker.start()

        # This process prints the variants to temporary files
        logger.info('Setting up the variant printer')
        if len(model_checkers) == 1:
            print_headers(head=head, outfile=outfile, silent=silent)
            variant_printer = VariantPrinter(task_queue=results,
                                             head=head,
                                             mode='normal',
                                             outfile=outfile)
        else:
            # We use a temp file to store the processed variants
            logger.debug("Build a tempfile for printing the variants")
            if temp_dir:
                temp_file = NamedTemporaryFile(delete=False, dir=temp_dir)
            else:
                temp_file = NamedTemporaryFile(delete=False)
            temp_file.close()

            variant_printer = VariantPrinter(task_queue=results,
                                             head=head,
                                             mode='chromosome',
                                             outfile=temp_file.name)

        logger.info('Starting the variant printer process')
        variant_printer.start()

        start_time_variant_parsing = datetime.now()

        # This process parses the original vcf and creates batches to put in the variant queue:
        logger.info('Start parsing the variants')
        chromosome_list = get_batches(variants=variant_file,
                                      batch_queue=variant_queue,
                                      header=head,
                                      vep=vep,
                                      annotation_keyword=keyword)

        logger.debug("Put stop signs in the variant queue")
        for i in range(num_model_checkers):
            variant_queue.put(None)

        variant_queue.join()
        results.put(None)
        variant_printer.join()

        if len(model_checkers) > 1:
            sort_variants(infile=temp_file.name, mode='chromosome')

            print_headers(head=head, outfile=outfile, silent=silent)

            with open(temp_file.name, 'r', encoding='utf-8') as f:
                for line in f:
                    print_variant(variant_line=line,
                                  outfile=outfile,
                                  mode='modified',
                                  silent=silent)

    except Exception as err:
        logger.warning(err)
        for worker in model_checkers:
            worker.terminate()
        variant_printer.terminate()
        context.abort()

    finally:
        if len(model_checkers) > 1:
            logger.info("Removing temp file")
            os.remove(temp_file.name)
            logger.debug("Temp file removed")

    logger.info('Time for whole analysis: {0}'.format(
        str(datetime.now() - start_time_analysis)))

def filter(variant_file, annotation, threshold, discard, greater, silent, outfile):
    """
    Filter vcf variants.

    Filter variants based on their annotation
    """
    logger.info("Running genmod filter version {0}".format(__version__))

    variant_file = get_file_handle(variant_file)

    start_time_analysis = datetime.now()

    logger.info("Initializing a Header Parser")
    head = HeaderParser()

    for line in variant_file:
        line = line.rstrip()
        if line.startswith('#'):
            if line.startswith('##'):
                head.parse_meta_data(line)
            else:
                head.parse_header_line(line)
        else:
            break

    # Add the first variant to the iterator
    variant_file = itertools.chain([line], variant_file)

    header_line = head.header

    if annotation not in head.info_dict:
        logger.warning("Annotation {0} not specified in header".format(annotation))
        logger.info("Please check VCF file")
        logger.info("Exiting...")
        sys.exit(1)

    logger.info("Building a plugin from extract_vcf for {0}".format(annotation))

    annotation_plugin = Plugin(name=annotation,
                               field='INFO',
                               info_key=annotation,
                               separators=[','],
                               record_rule='min',
                               data_type='float')

    logger.debug("Plugin=(field={0},info_key={1},separators={2},record_rule={3}"
                 ",data_type={4})".format('INFO', annotation, "','", 'min', 'float'))

    print_headers(head=head, outfile=outfile, silent=silent)

    nr_of_variants = 0
    nr_of_passed_variants = 0

    for variant in variant_file:
        nr_of_variants += 1
        keep_variant = False

        value = annotation_plugin.get_value(variant_line=variant)
        logger.debug("Found value {0}".format(value))

        if value:
            if greater:
                if value > threshold:
                    keep_variant = True
            else:
                if value < threshold:
                    keep_variant = True
        else:
            if not discard:
                keep_variant = True

        if keep_variant:
            logger.debug("Keeping variant")
            nr_of_passed_variants += 1
            print_variant(variant_line=variant, outfile=outfile, mode='vcf', silent=silent)
        else:
            logger.debug("Discarding variant")

    logger.info("Number of variants in file {0}".format(nr_of_variants))
    logger.info("Number of variants passing filter {0}".format(nr_of_passed_variants))
    logger.info("Number of variants filtered {0}".format(
        nr_of_variants - nr_of_passed_variants))

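# NOTE: Plugin comes from the extract_vcf package; its internals are not shown
# here. As an illustration of the rule the filter configures above (read the
# annotation from the INFO column, split on ',', cast to float and keep the
# minimum), a hypothetical stand-alone equivalent might look like this sketch;
# it is not the extract_vcf implementation.
def example_min_info_value(variant_line, info_key):
    """Return the smallest float found for info_key in the INFO column, or None."""
    info_field = variant_line.split('\t')[7]
    for entry in info_field.split(';'):
        if entry.startswith('{0}='.format(info_key)):
            values = entry.split('=')[1].split(',')
            try:
                return min(float(value) for value in values)
            except ValueError:
                return None
    return None
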
def summarize(variant_file, family_file, frequency_treshold, frequency_keyword,
              cadd_treshold, cadd_keyword, gq_treshold, read_depth_treshold):
    """
    Summarize the variants in a vcf.

    There will be one result line per individual.

    - How many variants found\n
    - How many variants did not satisfy the base call quality threshold. (Default 20)\n
    - How many variants were not covered in all individuals. (Default depth 10)\n
    - How many variants followed each model in each family:\n
        - AR_hom\n
        - AR_comp\n
        - AR_hom_dn\n
        - AR_comp_dn\n
        - AD\n
        - AD_dn\n
        - XD\n
        - XD_dn\n
        - XR\n
        - XR_dn\n
    - How many rare variants (Default maf < 0.02)\n
    - How many high scored cadd. (Default cadd = 0)\n
    - How many rare + high score cadd\n
    - How many no cadd score\n
    - How many indels\n
    - How many indels without cadd score\n
    """
    logger = logging.getLogger("genmod.commands.summarize_variants")

    head = HeaderParser()

    nr_of_variants = 0

    header = ['sample_id', 'nr_of_variants']

    samples = {}

    logger.debug("Setting up a variant parser")
    if variant_file == '-':
        variant_parser = VCFParser(fsock=sys.stdin, check_info=False)
    else:
        variant_parser = VCFParser(infile=variant_file, check_info=False)
    logger.debug("Variant parser setup")

    head = variant_parser.metadata

    for sample_id in head.individuals:
        samples[sample_id] = {}
        samples[sample_id]["nr_of_variants"] = 0

    for variant in variant_parser:
        for sample_id in samples:
            samples[sample_id]["nr_of_variants"] += 1
            print(variant['genotypes'][sample_id].depth_of_coverage)

    print(json.dumps(samples))