Exemplo n.º 1
0
def cli(vcf_file, split_variants, outfile, silent, verbose):
    """Add the 'GeneticModels' INFO entry to a VCF header and print the header.

    The VCF is read from stdin when ``vcf_file`` is '-', otherwise from the
    path in ``vcf_file``.  ``outfile``, ``silent`` and ``verbose`` are
    accepted but not used by this function.
    """
    from vcf_parser import VCFParser

    # '-' selects the piped input stream; anything else is treated as a path.
    if vcf_file == '-':
        source = {'fsock': sys.stdin}
    else:
        source = {'infile': vcf_file}
    variant_parser = VCFParser(split_variants=split_variants, **source)

    header = variant_parser.metadata

    add_metadata(
        header,
        'info',
        'GeneticModels',
        annotation_number='.',
        entry_type='String',
        description="':'-separated list of genetic models for this variant.",
    )
    # Print the header so the freshly added entry can be inspected.
    print_headers(header)
Exemplo n.º 2
0
def test_add_variant_with_genotypes():
    """A variant added with FORMAT and genotype columns is readable per sample."""
    vcf = VCFParser(fileformat="VCFv4.1")
    header = vcf.metadata

    header.add_info(
        info_id='MQ',
        number='1',
        entry_type='Float',
        description="RMS Mapping Quality",
    )

    # The column line names the three samples the genotypes belong to.
    header_line = (
        '#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT\t'
        'father\tmother\tproband'
    )
    header.parse_header_line(header_line)

    record = dict(
        chrom='1', pos='11900', rs_id='.', ref='A', alt='T', qual='100',
        filt='PASS', info="MQ=1", form="GT:GQ",
        genotypes=["0/1:60", "0/1:60", "1/1:60"],
    )
    vcf.add_variant(**record)

    stored = vcf.variants[0]

    assert stored['POS'] == '11900'
    assert stored['mother'] == '0/1:60'
Exemplo n.º 3
0
def test_add_variant():
    """A variant added without genotype columns keeps its CHROM and POS."""
    vcf = VCFParser(fileformat="VCFv4.1")

    info_definition = {
        'info_id': 'MQ',
        'number': '1',
        'entry_type': 'Float',
        'description': "RMS Mapping Quality",
    }
    vcf.metadata.add_info(**info_definition)

    vcf.add_variant(
        chrom='1', pos='11900', rs_id='.', ref='A', alt='T',
        qual='100', filt='PASS', info="MQ=1",
    )

    stored = vcf.variants[0]

    assert stored['CHROM'] == '1'
    assert stored['POS'] == '11900'
Exemplo n.º 4
0
def cli(vcf_file, split_variants, outfile, silent, verbose):
    """Print the header of a VCF read from a path or, for '-', from stdin.

    ``outfile`` and ``silent`` are handed to ``print_headers`` unchanged;
    ``verbose`` is accepted but not used here.
    """
    from vcf_parser import VCFParser

    reading_stdin = vcf_file == '-'
    if not reading_stdin:
        variant_parser = VCFParser(infile=vcf_file,
                                   split_variants=split_variants)
    else:
        variant_parser = VCFParser(fsock=sys.stdin,
                                   split_variants=split_variants)

    print_headers(variant_parser.metadata, outfile, silent)
Exemplo n.º 5
0
def check_families(variant_file):
    """Return the set of family ids named in the 'GeneticModels' annotations.

    The VCF is read from stdin when ``variant_file`` is '-', otherwise from
    the given path.  For every entry of a variant's 'GeneticModels' info
    value, the text before the first ':' is taken as the family id.
    """
    if variant_file == '-':
        parser_kwargs = {'fsock': sys.stdin}
    else:
        parser_kwargs = {'infile': variant_file}

    found = set()
    for variant in VCFParser(**parser_kwargs):
        annotations = variant['info_dict'].get('GeneticModels', None)
        # Variants without the annotation (or with an empty one) are skipped.
        if not annotations:
            continue
        found.update(entry.split(':')[0] for entry in annotations)
    return found
Exemplo n.º 6
0
def test_build_vcf():
    """A parser created without input keeps the requested file format.

    Building a ``VCFParser`` from only a ``fileformat`` must expose that
    value through ``parser.metadata.fileformat``.
    """
    parser = VCFParser(fileformat="VCFv4.1")
    assert parser.metadata.fileformat == "VCFv4.1"
Exemplo n.º 7
0
def test_wrong_formatted_vcf():
    """Iterating a VCF whose header has no '##fileformat' line raises SyntaxError."""
    # Header deliberately starts with an INFO line: no '##fileformat' entry.
    meta_lines = [
        '##INFO=<ID=MQ,Number=1,Type=Float,Description="RMS Mapping Quality">\n',
        '##contig=<ID=1,length=249250621,assembly=b37>\n',
        ('##reference=file:///humgen/gsa-hpprojects/GATK/bundle'
         '/current/b37/human_g1k_v37.fasta\n'),
    ]
    column_line = ('#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT\t'
                   'father\tmother\tproband\n')
    record_lines = [
        '1\t11900\t.\tA\tT\t100\tPASS\tMQ=1\tGT:GQ\t0/1:60\t0/1:60\t1/1:60\n',
        '1\t879585\t.\tA\tT\t100\tPASS\tMQ=1\tGT:GQ\t0/1:60\t0/0:60\t0/1:60\n',
        '1\t879586\t.\tA\tT\t100\tPASS\tMQ=1\tGT:GQ\t0/0:60\t0/1:60\t0/1:60\n',
        '1\t947378\t.\tA\tT\t100\tPASS\tMQ=1\tGT:GQ\t0/0:60\t0/0:60\t0/1:60\n',
        '1\t973348\t.\tG\tA\t100\tPASS\tMQ=1\tGT:GQ\t0/0:60\t0/0:60\t0/1:60\n',
        '3\t879585\t.\tA\tT\t100\tPASS\tMQ=1\tGT:GQ\t0/1:60\t0/0:60\t0/1:60\n',
        '3\t879586\t.\tA\tT\t100\tPASS\tMQ=1\tGT:GQ\t0/0:60\t0/1:60\t0/1:60\n',
        '3\t947378\t.\tA\tT\t100\tPASS\tMQ=1\tGT:GQ\t0/0:60\t0/0:60\t0/1:60\n',
        '3\t973348\t.\tG\tA\t100\tPASS\tMQ=1\tGT:GQ\t0/0:60\t0/0:60\t0/1:60\n',
    ]

    vcf_file = get_vcf_file(meta_lines + [column_line] + record_lines)

    with pytest.raises(SyntaxError):
        for record in VCFParser(vcf_file):
            print(record)
Exemplo n.º 8
0
def cli(vcf_file, verbose):
  """
  Print the transcript of every VEP entry in a VCF as JSON.

  For each (split) variant an ensembl-to-refseq lookup is built from the
  'Ensembl_transcript_to_refseq_transcript' info entries and passed to
  get_transcript together with each VEP entry of the variant's ALT allele.
  ``verbose`` is accepted but not used here.
  """
  from vcf_parser import VCFParser

  for variant in VCFParser(infile=vcf_file, split_variants=True):

    # Lookup from ensembl transcript id to the list of matching refseq ids.
    refseq_by_ensembl = {}
    conversions = variant['info_dict'].get(
      'Ensembl_transcript_to_refseq_transcript', [])
    for conversion in conversions:
      # The part after the first ':' holds '|'-separated 'ensembl>refseq/...'
      # pairs; pairs without a '>' carry no refseq ids and are ignored.
      for pair in conversion.split(':')[1].split('|'):
        parts = pair.split('>')
        if len(parts) <= 1:
          continue
        refseq_by_ensembl[parts[0]] = parts[1].split('/')

    for vep_entry in variant['vep_info'].get(variant['ALT'], []):
      print(get_transcript(vep_entry, refseq_by_ensembl).to_json())
Exemplo n.º 9
0
def test_split_variant():
    """A record with two ALT alleles is split into one variant per allele."""
    meta_lines = [
        '##fileformat=VCFv4.1\n',
        '##INFO=<ID=MQ,Number=1,Type=Float,Description="RMS Mapping Quality">\n',
        '##contig=<ID=1,length=249250621,assembly=b37>\n',
        ('##reference=file:///humgen/gsa-hpprojects/GATK/bundle'
         '/current/b37/human_g1k_v37.fasta\n'),
    ]
    column_line = ('#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT\t'
                   'father\tmother\tproband\n')
    # One multiallelic record: ALT is 'T,C'.
    multiallelic = (
        '1\t11900\t.\tA\tT,C\t100\tPASS\tMQ=1\tGT:GQ\t0/1:60\t0/2:60\t1/2:60\n')

    vcf_file = get_vcf_file(meta_lines + [column_line, multiallelic])
    split_records = list(VCFParser(infile=vcf_file, split_variants=True))

    head_record = split_records[0]
    assert head_record['POS'] == '11900'
    assert head_record['ALT'] == 'T'

    tail_record = split_records[1]
    assert tail_record['POS'] == '11900'
    assert tail_record['ALT'] == 'C'
Exemplo n.º 10
0
def cli(vcf_file, ped_file, vcf_config_file, scout_config_file, family_type,
        variant_type, institute, verbose):
    """Print every variant of a VCF as a mongo variant in JSON.

    Settings start from ``scout_config_file`` when it is given; ``vcf_file``,
    ``ped_file`` and ``institute`` then override the 'load_vcf', 'ped' and
    'institutes' entries.  A VCF file, a ped file and a VCF config file are
    all required: when one is missing, a message is written to stderr and the
    process exits with status 1 so callers can detect the failure.

    ``verbose`` is accepted but not used here.
    """

    from vcf_parser import VCFParser
    # NOTE(review): Case is imported but not referenced in this function.
    from ....models import Case
    from ..config_parser import ConfigParser
    from . import get_case

    setup_configs = {}

    if scout_config_file:
        setup_configs = ConfigParser(scout_config_file)

    # Command line values take precedence over the config file.
    if vcf_file:
        setup_configs['load_vcf'] = vcf_file

    if ped_file:
        setup_configs['ped'] = ped_file

    if institute:
        setup_configs['institutes'] = [institute]

    # Check that the vcf file is provided:
    if not setup_configs.get('load_vcf', None):
        print("Please provide a vcf file.(Use flag '-vcf/--vcf_file')",
              file=sys.stderr)
        sys.exit(1)

    # Check that the ped file is provided:
    if not setup_configs.get('ped', None):
        print("Please provide a ped file.(Use flag '-ped/--ped_file')",
              file=sys.stderr)
        sys.exit(1)

    # Check that the config file is provided:
    if not vcf_config_file:
        print(
            "Please provide a config file.(Use flag '-vcf_config/--vcf_config_file')",
            file=sys.stderr)
        sys.exit(1)

    config_object = ConfigParser(vcf_config_file)

    my_case = get_case(setup_configs, family_type)

    vcf_parser = VCFParser(infile=setup_configs['load_vcf'],
                           split_variants=True)

    individuals = vcf_parser.individuals

    # variant_count is the 1-based position of the variant in the file and is
    # handed to get_mongo_variant together with the variant.
    variant_count = 0
    for variant in vcf_parser:
        variant_count += 1
        mongo_variant = get_mongo_variant(variant, variant_type, individuals,
                                          my_case, config_object,
                                          variant_count)
        print(mongo_variant.to_json())
Exemplo n.º 11
0
def test_add_filedate():
    """Adding a 'filedate' meta line registers the key in ``other_dict``."""
    parser = VCFParser(fileformat="VCFv4.1")
    parser.metadata.add_meta_line(key='filedate', value='20150607')
    assert 'filedate' in parser.metadata.other_dict
Exemplo n.º 12
0
def test_add_variant():
    """``add_variant`` stores CHROM and POS of a record without genotypes."""
    vcf_builder = VCFParser(fileformat="VCFv4.1")

    vcf_builder.metadata.add_info(
        info_id='MQ',
        number='1',
        entry_type='Float',
        description="RMS Mapping Quality",
    )

    fields = dict(chrom='1', pos='11900', rs_id='.', ref='A',
                  alt='T', qual='100', filt='PASS', info="MQ=1")
    vcf_builder.add_variant(**fields)

    first = vcf_builder.variants[0]

    assert first['CHROM'] == '1'
    assert first['POS'] == '11900'
Exemplo n.º 13
0
def test_add_contig():
    """Adding a contig through the metadata registers its id in ``contig_dict``."""
    vcf = VCFParser(fileformat="VCFv4.1")
    header = vcf.metadata

    header.add_contig(contig_id="1", length="249250621")

    assert '1' in header.contig_dict
Exemplo n.º 14
0
 def _variants(self):
     """Yield each variant of ``self.vcf_file`` with positional keys added.

     Every yielded variant gains 'id' (0-based position in the file),
     'index' (1-based position), 'start' (POS as an int) and 'stop'
     (start plus the REF/ALT length difference).
     """
     parser = VCFParser(self.vcf_file, check_info=False)
     for number, record in enumerate(parser, 1):
         record['id'] = number - 1
         record['index'] = number
         start = int(record['POS'])
         record['start'] = start
         length_difference = len(record['REF']) - len(record['ALT'])
         record['stop'] = start + length_difference
         yield record
Exemplo n.º 15
0
def test_add_filter():
    """Adding a filter through the metadata registers its id in ``filter_dict``."""
    vcf = VCFParser(fileformat="VCFv4.1")
    header = vcf.metadata

    filter_definition = {
        'filter_id': "MY_FILTER",
        'description': "The filter description",
    }
    header.add_filter(**filter_definition)

    assert 'MY_FILTER' in header.filter_dict
Exemplo n.º 16
0
def test_add_alt():
    """Adding an ALT definition registers its id in ``alt_dict``."""
    vcf = VCFParser(fileformat="VCFv4.1")
    header = vcf.metadata

    header.add_alt(
        alt_id="MY_ALTERNATIVE",
        description="The alternative description",
    )

    assert 'MY_ALTERNATIVE' in header.alt_dict
Exemplo n.º 17
0
def test_add_format():
    """Adding a FORMAT definition registers its id in ``format_dict``."""
    vcf = VCFParser(fileformat="VCFv4.1")
    header = vcf.metadata

    format_definition = dict(
        format_id="DP",
        number='1',
        entry_type='Integer',
        description="The read depth",
    )
    header.add_format(**format_definition)

    assert 'DP' in header.format_dict
Exemplo n.º 18
0
def cli(vcf_file, verbose):
  """
  Print every gene of every (split) variant in a VCF as JSON.

  ``verbose`` is accepted but not used here.
  """

  from vcf_parser import VCFParser

  for variant in VCFParser(infile=vcf_file, split_variants=True):
    for gene in get_genes(variant):
      print(gene.to_json())
Exemplo n.º 19
0
def test_add_info():
    """Adding an INFO definition registers its id in ``extra_info`` and ``info_dict``."""
    vcf = VCFParser(fileformat="VCFv4.1")
    header = vcf.metadata

    info_definition = dict(
        info_id='MQ',
        number='1',
        entry_type='Float',
        description="RMS Mapping Quality",
    )
    header.add_info(**info_definition)

    assert 'MQ' in header.extra_info
    assert 'MQ' in header.info_dict
Exemplo n.º 20
0
def test_add_variant_with_genotypes():
    """Genotypes passed to ``add_variant`` are stored under the sample names."""
    vcf_builder = VCFParser(fileformat="VCFv4.1")
    metadata = vcf_builder.metadata

    metadata.add_info(info_id='MQ',
                      number='1',
                      entry_type='Float',
                      description="RMS Mapping Quality")

    # Declare the sample columns before adding the genotype calls.
    metadata.parse_header_line(
        '#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT\t'
        'father\tmother\tproband'
    )

    sample_calls = ["0/1:60", "0/1:60", "1/1:60"]
    vcf_builder.add_variant(
        chrom='1',
        pos='11900',
        rs_id='.',
        ref='A',
        alt='T',
        qual='100',
        filt='PASS',
        info="MQ=1",
        form="GT:GQ",
        genotypes=sample_calls,
    )

    first = vcf_builder.variants[0]

    assert first['POS'] == '11900'
    assert first['mother'] == '0/1:60'
Exemplo n.º 21
0
def cli(vcf_file, ped_file, scout_config_file, family_type, variant_type,
        institute, verbose):
    """Print the compounds of every variant in a VCF as JSON.

    Settings start from ``scout_config_file`` when it is given; ``vcf_file``,
    ``ped_file`` and ``institute`` then override the 'load_vcf', 'ped' and
    'institutes' entries.  Both a VCF file and a ped file are required: when
    one is missing, a message is written to stderr and the process exits with
    status 1 so callers can detect the failure.

    ``verbose`` is accepted but not used here.
    """

    from vcf_parser import VCFParser
    # NOTE(review): Case is imported but not referenced in this function.
    from ....models import Case
    from ..config_parser import ConfigParser
    from . import get_case

    setup_configs = {}

    if scout_config_file:
        setup_configs = ConfigParser(scout_config_file)

    # Command line values take precedence over the config file.
    if vcf_file:
        setup_configs['load_vcf'] = vcf_file

    if ped_file:
        setup_configs['ped'] = ped_file

    if institute:
        setup_configs['institutes'] = [institute]

    # Check that the vcf file is provided:
    if not setup_configs.get('load_vcf', None):
        print("Please provide a vcf file.(Use flag '-vcf/--vcf_file')",
              file=sys.stderr)
        sys.exit(1)

    # Check that the ped file is provided:
    if not setup_configs.get('ped', None):
        print("Please provide a ped file.(Use flag '-ped/--ped_file')",
              file=sys.stderr)
        sys.exit(1)

    my_case = get_case(setup_configs, family_type)

    vcf_parser = VCFParser(infile=setup_configs['load_vcf'],
                           split_variants=True)
    for variant in vcf_parser:
        compounds = get_compounds(variant, my_case, variant_type)
        for compound in compounds:
            print(compound.to_json())
Exemplo n.º 22
0
def cli(vcf_file, vcf_config_file, verbose):
    """Print the genotype information of each individual for each variant.

    Both ``vcf_config_file`` and ``vcf_file`` are required; when one is
    missing the process exits with status 1 and the reason on stderr.
    ``verbose`` is accepted but not used here.
    """
    from vcf_parser import VCFParser
    from ..config_parser import ConfigParser

    # sys.exit(<str>) writes the message to stderr and exits with status 1,
    # so a missing argument is reported to the caller as a failure.
    if not vcf_config_file:
        sys.exit('Please provide a vcf config file')

    if not vcf_file:
        sys.exit('Please provide a vcf file')

    configs = ConfigParser(vcf_config_file)

    vcf_parser = VCFParser(infile=vcf_file, split_variants=True)
    individuals = vcf_parser.individuals

    for variant in vcf_parser:
        for individual in individuals:
            genotype_info = get_genotype_information(variant, configs,
                                                     individual)
            print(genotype_info.to_json())
Exemplo n.º 23
0
def annotate(family_file, variant_file, family_type, vep, silent, phased,
             strict, cadd_raw, whole_gene, annotation_dir, cadd_file,
             cadd_1000g, cadd_exac, cadd_esp, cadd_indels, thousand_g, exac,
             outfile, split_variants, processes, dbnfsp, verbose):
    """Annotate variants in a VCF file.\n
        The main function with genmod is to annotate genetic inheritance patterns for variants in families.
        Use flag --family together with a .ped file to describe which individuals in the vcf you wish to check inheritance for in the analysis.
        Individuals that are not present in the ped file will not be considered in the analysis.\n
        It is also possible to use genmod without a family file. In this case the variants will be annotated with a variety of options seen below.
        Please see documentation on github.com/moonso/genmod or genmod/examples/readme.md for more information.
    """
    # Overall flow: read the VCF header, register the metadata entries that
    # match the chosen options, start the consumer and printer processes,
    # feed them batches of variants, then sort the temporary output and
    # print header + variants.

    ######### This is for logging the command line string #########
    # `values` holds the local variables of this frame, i.e. the call
    # arguments plus `frame` itself (hence the explicit 'frame' filter).
    # Only truthy values are kept, as 'name=value' strings.
    frame = inspect.currentframe()
    args, _, _, values = inspect.getargvalues(frame)
    argument_list = [
        i + '=' + str(values[i]) for i in values
        if values[i] and i != 'config' and i != 'frame'
    ]

    if verbose:
        print('\nRunning GENMOD annotate version %s \n' % VERSION,
              file=sys.stderr)

    start_time_analysis = datetime.now()

    ######### Setup a variant parser #########

    # '-' means that the VCF is read from stdin instead of from a path.
    if variant_file == '-':
        variant_parser = VCFParser(fsock=sys.stdin,
                                   split_variants=split_variants)
    else:
        variant_parser = VCFParser(infile=variant_file,
                                   split_variants=split_variants)

    # These are the individuals found in the vcf file
    individuals = variant_parser.individuals

    head = variant_parser.metadata

    # Record the genmod version and the command line in the header
    add_metadata(head,
                 'version',
                 'genmod',
                 version=VERSION,
                 command_line_string=' '.join(argument_list))

    ######### Parse the ped file (if there is one) #########

    families = {}

    if family_file:
        family_parser = FamilyParser(family_file, family_type)
        # The individuals in the ped file must be present in the variant file:
        families = family_parser.families

        for individual in family_parser.individuals:
            if individual not in individuals:
                warning(
                    'All individuals in ped file must be in vcf file! Aborting...'
                )
                warning('Individuals in PED file: %s' %
                        ' '.join(list(family_parser.individuals.keys())))
                warning('Individuals in VCF file: %s' % ' '.join(individuals))
                print('Exiting...', file=sys.stderr)
                # NOTE(review): sys.exit() without an argument exits with
                # status 0 although this is an error path.
                sys.exit()

        # The inheritance related INFO entries are only declared when a ped
        # file is given.
        add_metadata(
            head,
            'info',
            'GeneticModels',
            annotation_number='.',
            entry_type='String',
            description="':'-separated list of genetic models for this variant."
        )
        add_metadata(head,
                     'info',
                     'ModelScore',
                     annotation_number='1',
                     entry_type='Integer',
                     description="PHRED score for genotype models.")
        # NOTE(review): the adjacent string literals below are concatenated
        # without separating spaces ("variant.The", "compoundswith").
        add_metadata(
            head,
            'info',
            'Compounds',
            annotation_number='.',
            entry_type='String',
            description=
            ("List of compound pairs for this variant."
             "The list is splitted on ',' family id is separated with compounds"
             "with ':'. Compounds are separated with '|'."))

    if verbose:
        if family_file:
            print('Starting analysis of families: %s' %
                  ','.join(list(families.keys())),
                  file=sys.stderr)
            print('Individuals included in analysis: %s\n' %
                  ','.join(list(family_parser.individuals.keys())),
                  file=sys.stderr)
    ######### Read to the annotation data structures #########

    gene_trees = {}
    exon_trees = {}

    # If the variants are already annotated (with VEP) we do not need to redo
    # the annotation, so the trees stay empty and no 'Annotation' entry is
    # declared.
    if not vep:

        gene_trees, exon_trees = load_annotations(annotation_dir, verbose)

        add_metadata(
            head,
            'info',
            'Annotation',
            annotation_number='.',
            entry_type='String',
            description='Annotates what feature(s) this variant belongs to.')
    else:
        if verbose:
            print('Using VEP annotation', file=sys.stderr)

    ######### Check which other annotations files that should be used in the analysis #########

    # Any of the CADD sources switches CADD annotation on.
    cadd_annotation = False

    if cadd_file:
        if verbose:
            print('Cadd file! %s' % cadd_file, file=sys.stderr)
        cadd_annotation = True
    if cadd_1000g:
        if verbose:
            print('Cadd 1000G file! %s' % cadd_1000g, file=sys.stderr)
        cadd_annotation = True
    if cadd_esp:
        if verbose:
            print('Cadd ESP6500 file! %s' % cadd_esp, file=sys.stderr)
        cadd_annotation = True
    if cadd_indels:
        if verbose:
            print('Cadd InDel file! %s' % cadd_indels, file=sys.stderr)
        cadd_annotation = True
    if cadd_exac:
        if verbose:
            print('Cadd ExAC file! %s' % cadd_exac, file=sys.stderr)
        cadd_annotation = True

    if cadd_annotation:
        add_metadata(
            head,
            'info',
            'CADD',
            annotation_number='A',
            entry_type='Float',
            description="The CADD relative score for this alternative.")
        # The raw score entry is only declared on request.
        if cadd_raw:
            add_metadata(
                head,
                'info',
                'CADD_raw',
                annotation_number='A',
                entry_type='Float',
                description="The CADD raw score(s) for this alternative(s).")

    if thousand_g:
        if verbose:
            print('1000G frequency file! %s' % thousand_g, file=sys.stderr)
        add_metadata(head,
                     'info',
                     '1000G_freq',
                     annotation_number='A',
                     entry_type='Float',
                     description="Frequency in the 1000G database.")

    if exac:
        if verbose:
            print('ExAC frequency file! %s' % exac, file=sys.stderr)
        add_metadata(head,
                     'info',
                     'ExAC_freq',
                     annotation_number='A',
                     entry_type='Float',
                     description="Frequency in the ExAC database.")

    # dbnfsp is only reported here; no header entry is added for it. It is
    # passed on to the consumers below.
    if dbnfsp:
        if verbose:
            print('dbNFSP file! %s' % dbnfsp, file=sys.stderr)

    ###################################################################
    ### The task queue is where all jobs(in this case batches that  ###
    ### represents variants in a region) is put. The consumers will ###
    ### then pick their jobs from this queue.                       ###
    ###################################################################

    variant_queue = JoinableQueue(maxsize=1000)
    # The consumers will put their results in the results queue
    results = Manager().Queue()

    num_model_checkers = processes
    # Adapt the number of processes to the machine that runs the analysis
    if cadd_annotation:
        # We need more power when annotating cadd scores, but a value set by
        # the user overrides this.
        # NOTE(review): presumably min(4, cpu_count()) is the default of the
        # processes option, so equality is read as "not set by the user" --
        # confirm against the option definition.
        if num_model_checkers == min(4, cpu_count()):
            num_model_checkers = min(8, cpu_count())

    if verbose:
        print('Number of CPU:s %s' % cpu_count(), file=sys.stderr)
        print('Number of model checkers: %s' % num_model_checkers,
              file=sys.stderr)

    # We use a temp file to store the processed variants. delete=False keeps
    # the file after close() so that it can be reopened by name; it is removed
    # explicitly at the end of this function.
    temp_file = NamedTemporaryFile(delete=False)
    temp_file.close()
    # Open the temp file with codecs
    # NOTE(review): `open` is called with encoding/errors keywords --
    # presumably a codecs/io style open imported at file level; confirm.
    temporary_variant_file = open(temp_file.name,
                                  mode='w',
                                  encoding='utf-8',
                                  errors='replace')

    # These are the workers that do the heavy part of the analysis
    model_checkers = [
        VariantConsumer(variant_queue, results, families, phased, vep,
                        cadd_raw, cadd_file, cadd_1000g, cadd_exac, cadd_esp,
                        cadd_indels, thousand_g, exac, dbnfsp, strict, verbose)
        for i in range(num_model_checkers)
    ]

    for w in model_checkers:
        w.start()

    # This process prints the variants to the temporary file
    var_printer = VariantPrinter(results,
                                 temporary_variant_file,
                                 head,
                                 mode='chromosome',
                                 verbosity=verbose)
    var_printer.start()

    # NOTE(review): start_time_variant_parsing is not used again in this
    # function.
    start_time_variant_parsing = datetime.now()

    if verbose:
        print('Start parsing the variants ... \n', file=sys.stderr)

    # This call parses the original vcf and creates batches that are put in
    # the variant queue; its return value is reported below as the
    # chromosomes found in the file.

    chromosome_list = get_batches(variant_parser, variant_queue, individuals,
                                  gene_trees, exon_trees, phased, vep,
                                  whole_gene, verbose)

    # Put stop signs in the variant queue, one per consumer
    for i in range(num_model_checkers):
        variant_queue.put(None)

    # Wait until every queued item has been marked as done, then tell the
    # printer that no more results are coming and wait for it to finish.
    variant_queue.join()
    results.put(None)
    var_printer.join()

    temporary_variant_file.close()

    if verbose:
        print('Cromosomes found in variant file: %s \n' %
              ','.join(chromosome_list),
              file=sys.stderr)
        print('Models checked!\n', file=sys.stderr)

    # Sort the temporary file, then emit the header followed by the variants.
    sort_variants(temp_file.name, mode='chromosome', verbose=verbose)

    print_headers(head, outfile, silent)

    print_variants(temp_file.name, outfile, mode='modified', silent=silent)

    # Remove all temp files:
    # NOTE(review): this is only reached on the success path; an exception
    # raised earlier leaves the temporary file behind.
    os.remove(temp_file.name)

    if verbose:
        print('Time for whole analyis: %s' %
              str(datetime.now() - start_time_analysis),
              file=sys.stderr)
Exemplo n.º 24
0
def analyze(variant_file, family_type, frequency_treshold, frequency_keyword,
            cadd_treshold, cadd_keyword, coverage, gq_treshold, outdir, silent,
            exclude_problematic, verbose):
    """Analyze the annotated variants in a VCF file.

        If there are multiple families in the ped one analysis per family will
        be done. The variants are analyzed in five different categories based
        on what inheritance patterns that are followed.
        The different analyses are:

                AR compound\n
                AR homozygote\n
                Dominant\n
                X linked\n
                Dominant dn\n

        Which variants to be considered are specified in the command line.
        Defaults are (based on a rare disease assumption):

            MAF < 0.02\n
            CADD score > 12\n
            Coverage in all individuals > 7\n
            Call quality > 20\n

        The highest scoring variants of each category is printed to screen.
        The full list of each category is printed to new vcf files in a
        directory specified by the user. Default current dir.
        File names are the same like the input vcf with the name of the
        analysis appended.

    """

    start_time_analysis = datetime.now()

    inheritance_keyword = 'GeneticModels'
    # The families are collected in a first pass over the input.
    families = check_families(variant_file)
    # Base name of the input without its extension; used for the result files.
    input_name = os.path.split(variant_file)[-1]
    file_name = os.path.splitext(input_name)[0]

    if variant_file != '-':
        variant_parser = VCFParser(infile=variant_file)
    else:
        variant_parser = VCFParser(fsock=sys.stdin)

    # NOTE(review): the same parser instance is handed to
    # get_interesting_variants for every family -- confirm that VCFParser can
    # be iterated more than once.
    for family_id in families:
        print('Analysis for family: %s' % family_id)

        head = variant_parser.metadata

        # One result container per inheritance category; they are filled in
        # place by get_interesting_variants.
        dominant_dict = {}
        homozygote_dict = {}
        compound_dict = {}
        x_linked_dict = {}
        dominant_dn_dict = {}

        get_interesting_variants(variant_parser, family_id, dominant_dict,
                                 homozygote_dict, compound_dict, x_linked_dict,
                                 dominant_dn_dict, frequency_treshold,
                                 frequency_keyword, cadd_treshold,
                                 cadd_keyword, gq_treshold, coverage,
                                 exclude_problematic)

        remove_inacurate_compounds(compound_dict, family_id)

        # (hits, file name suffix, print mode) in the order the result files
        # are written.
        reports = (
            (dominant_dict, '_dominant_analysis.vcf', 'dominant'),
            (homozygote_dict, '_homozygote_analysis.vcf', 'homozygote'),
            (compound_dict, '_compound_analysis.vcf', 'compound'),
            (x_linked_dict, '_x_linked_analysis.vcf', 'xlinked'),
            (dominant_dn_dict, '_ad_denovo_analysis.vcf', 'denovo'),
        )
        for hits, suffix, mode in reports:
            # Categories without hits get no result file.
            if len(hits) == 0:
                continue
            report_path = os.path.join(outdir, file_name + suffix)
            print_headers(head, report_path)
            print_results(hits,
                          report_path,
                          family_id,
                          variant_parser.header,
                          cadd_keyword,
                          frequency_keyword,
                          mode=mode)

        print('')

        summary = (
            ('Number of interesting Dominant variants: %s', dominant_dict),
            ('Number of interesting Homozygote variants: %s',
             homozygote_dict),
            ('Number of interesting Compound variants: %s', compound_dict),
            ('Number of interesting X-linked variants: %s', x_linked_dict),
            ('Number of interesting Autosomal Dominant de novo variants: %s',
             dominant_dn_dict),
        )
        for template, hits in summary:
            print(template % len(hits))

        # Elapsed time is measured from the start of the whole call, not from
        # the start of this family.
        elapsed = datetime.now() - start_time_analysis
        print('Time for analysis: %s' % str(elapsed))
Exemplo n.º 25
0
def cli(variant_file, vep, split, outfile, verbose, silent, check_info,
        allele_symbol, logfile, loglevel):
    """
    Tool for parsing vcf files.

    Prints the vcf file to output.
    If --split/-s is used all multiallelic calls will be splitted and printed
    as single variant calls.
    For more information, please see github.com/moonso/vcf_parser.
    """
    # NOTE(review): the docstring above presumably doubles as the command
    # line help text, so its wording is left as is -- confirm before editing.
    #
    # Parameters as used below:
    #   variant_file  -- path to a vcf file, or '-' to read from stdin
    #   vep           -- accepted but not used in this function
    #   split         -- forwarded to VCFParser as ``split_variants``
    #   outfile       -- if set, all lines are written there instead of stdout
    #   verbose       -- selects log level 'INFO' when ``loglevel`` is not set
    #   silent        -- suppresses stdout printing (ignored if outfile is set)
    #   check_info, allele_symbol -- forwarded unchanged to VCFParser
    #   logfile, loglevel         -- forwarded to ``init_log``
    from vcf_parser import logger, init_log

    if not loglevel and verbose:
        loglevel = 'INFO'

    init_log(logger, logfile, loglevel)
    nr_of_variants = 0
    start = datetime.now()

    if variant_file == '-':
        logger.info("Start parsing variants from stdin")
        my_parser = VCFParser(fsock=sys.stdin,
                              split_variants=split,
                              check_info=check_info,
                              allele_symbol=allele_symbol)
    else:
        logger.info(
            "Start parsing variants from file {0}".format(variant_file))
        my_parser = VCFParser(infile=variant_file,
                              split_variants=split,
                              check_info=check_info,
                              allele_symbol=allele_symbol)

    # Exactly one destination is reported: the file wins over stdout, and
    # stdout is only used when no file was given and silent is off.
    out_handle = None
    if outfile:
        out_handle = open(outfile, 'w', encoding='utf-8')
        logger.info("Printing vcf to file {0}".format(outfile))
    elif not silent:
        logger.info("Printing vcf to stdout")
    else:
        logger.info("Skip printing since silent is active")

    def emit(text):
        """Send one line to the output file if given, otherwise to stdout."""
        if out_handle is not None:
            out_handle.write(text + '\n')
        elif not silent:
            print(text)

    try:
        for line in my_parser.metadata.print_header():
            emit(line)
        try:
            for variant in my_parser:
                emit('\t'.join(
                    [variant[head] for head in my_parser.header]))
                nr_of_variants += 1
        except SyntaxError as e:
            # Parsing stops at the first SyntaxError; the message is shown
            # and the summary below still reports what was parsed so far.
            print(e)
    finally:
        # Make sure the output file is flushed and closed even on errors.
        if out_handle is not None:
            out_handle.close()

    logger.info('Number of variants: {0}'.format(nr_of_variants))
    logger.info('Time to parse file: {0}'.format(str(datetime.now() - start)))
Exemplo n.º 26
0
def summarize(variant_file, frequency_treshold, frequency_keyword,
              cadd_treshold, cadd_keyword, gq_treshold, read_depth_treshold):
    """
    Analyze the variants in a vcf, the following will be printed:
    
    - How many variants found\n
    - How many variants did not satisfy the base call 
        quality treshold. (Default 20)\n
    - How many variants were not covered in all individuals. 
        (Default depth 10)\n
    - How many variants followed each model in each family:\n
            - AR_hom\n
            - AR_comp\n
            - AR_hom_dn\n
            - AR_comp_dn\n
            - AD\n
            - AD_dn\n
            - XD\n
            - XD_dn\n
            - XR\n
            - XR_dn\n
        - How many rare variants (Default maf < 0.02)\n
        - How many high scored cadd. (Default cadd = 0)\n
        - How many rare + high score cadd\n
        - How many no cadd score\n
        - How many indels\n
        - How many indels without cadd score\n
    
    """
    # Parameters as used below:
    #   variant_file        -- path to a vcf file, or '-' to read from stdin
    #   frequency_treshold  -- a variant is "rare" when its maf is <= this
    #   frequency_keyword   -- key in the variant info_dict holding frequencies
    #   cadd_treshold       -- a variant is "high cadd" when its score is >= this
    #   cadd_keyword        -- key in the variant info_dict holding cadd scores
    #   gq_treshold         -- minimum genotype quality for every individual
    #   read_depth_treshold -- minimum quality depth for every individual

    def fraction(count, total):
        """Return count / total as a float, or 0.0 when total is zero."""
        # An empty file, or one where no variant passes the quality filters,
        # gives a zero denominator; report 0.0 instead of crashing.
        return float(count) / total if total else 0.0

    vcf_file_name = os.path.splitext(os.path.split(variant_file)[-1])[0]

    print("Searching for family members in file...")
    families = check_families(variant_file)
    print('Found families: %s' % ','.join(families))
    inheritance_keyword = 'GeneticModels'

    inheritance_models = [
        'AR_hom', 'AR_hom_dn', 'AR_comp', 'AR_comp_dn', 'AD', 'AD_dn', 'XD',
        'XD_dn', 'XR', 'XR_dn'
    ]

    # One counter per inheritance model and family.
    family_dict = {
        family_id: {model: 0 for model in inheritance_models}
        for family_id in families
    }

    number_of_variants = 0
    interesting_variants = 0
    rare_variants = 0
    high_cadd_scores = 0
    no_cadd_score = 0
    high_cadd_and_rare = 0
    indels = 0
    indel_no_cadd = 0
    low_genotype = 0
    low_coverage = 0
    low_genotype_and_low_coverage = 0

    analysis_start = datetime.now()

    if variant_file == '-':
        variant_parser = VCFParser(fsock=sys.stdin)
    else:
        variant_parser = VCFParser(infile=variant_file)

    for variant in variant_parser:
        info = variant['info_dict']
        # A missing annotation falls back to '0', i.e. a value of 0.0.
        maf = min(float(frequency)
                  for frequency in info.get(frequency_keyword, '0'))
        cadd_score = max(float(cscore)
                         for cscore in info.get(cadd_keyword, '0'))
        reference = variant['REF']
        alternative = variant['ALT']

        number_of_variants += 1
        genotypes = variant.get('genotypes', {})

        correct_genotype = True
        adequate_depth = True

        # A single individual below a threshold disqualifies the variant.
        for individual in genotypes:
            if genotypes[individual].genotype_quality < gq_treshold:
                correct_genotype = False
            if genotypes[individual].quality_depth < read_depth_treshold:
                adequate_depth = False

        if not correct_genotype:
            low_genotype += 1
            if not adequate_depth:
                low_genotype_and_low_coverage += 1
        if not adequate_depth:
            low_coverage += 1

        # Only variants that meet both the read depth and the genotype
        # quality criteria contribute to the statistics below.
        if correct_genotype and adequate_depth:
            interesting_variants += 1

            high_cadd = cadd_score >= cadd_treshold
            if high_cadd:
                high_cadd_scores += 1
            # A score of exactly 0 is counted as "no cadd score".
            if cadd_score == 0:
                no_cadd_score += 1

            if maf <= frequency_treshold:
                rare_variants += 1
                if high_cadd:
                    high_cadd_and_rare += 1

            # Anything with a multi-character REF or ALT counts as an indel.
            if len(reference) > 1 or len(alternative) > 1:
                indels += 1
                if cadd_score == 0:
                    indel_no_cadd += 1

        # Inheritance models are tallied for every variant, filtered or not.
        for family_id in families:
            models_found = get_inheritance_models(variant, family_id,
                                                  inheritance_keyword)
            if models_found:
                for model in models_found:
                    family_dict[family_id][model] += 1

    print("\n\nSUMMARY OF VARIANTS FOUND IN %s.vcf" % vcf_file_name)
    print(
        "===========================================================================================\n"
    )
    print('Number of variants in file: %s' % number_of_variants)
    print('Number of variants with low genotype quality (gq<%s): %s' %
          (gq_treshold, low_genotype))
    print('Number of variants with low coverage (cov<%s): %s' %
          (read_depth_treshold, low_coverage))
    print(
        'Number of variants with low coverage AND low genotype quality: %s \n\n'
        % low_genotype_and_low_coverage)

    print(
        "The following statistics are for the variants that meet the criterias for genotype quality and read depth.\n"
        "This means that the variants are covered in all individuals.\n"
        "-----------------------------------------------------------------------------------------\n"
    )

    print(
        "Number of variants to be considered in the analysis(according to the statement above): %s \n"
        % (interesting_variants))

    for family_id in families:
        print('Models followed for family %s \n' % family_id)
        for model in inheritance_models:
            print("%s = %s" % (model, family_dict[family_id][model]))

    print('\nNumber of rare (maf<%s): %s. Frequency of all: %.2f' %
          (frequency_treshold, rare_variants,
           fraction(rare_variants, interesting_variants)))
    print(
        'Number of high cadd scores (cadd >= %s): %s. Frequency of all: %.2f' %
        (cadd_treshold, high_cadd_scores,
         fraction(high_cadd_scores, interesting_variants)))
    print('Number of high cadd scores and rare: %s. Frequency of all: %.2f' %
          (high_cadd_and_rare,
           fraction(high_cadd_and_rare, interesting_variants)))
    print('Number of no cadd scores: %s. Frequency of all: %.2f \n' %
          (no_cadd_score, fraction(no_cadd_score, interesting_variants)))
    # NOTE(review): indels are counted among the quality-filtered variants
    # only, yet reported relative to all variants in the file -- confirm
    # that this denominator is intended.
    print('Number of indels: %s. Frequency of all: %.2f' %
          (indels, fraction(indels, number_of_variants)))
    print('Number of indels and no cadd score: %s. Frequency of all: %.2f \n' %
          (indel_no_cadd, fraction(indel_no_cadd, number_of_variants)))
    print('Time for analysis: %s' % str(datetime.now() - analysis_start))
Exemplo n.º 27
0
def summarize(variant_file, family_file, frequency_treshold, frequency_keyword,
              cadd_treshold, cadd_keyword, gq_treshold, read_depth_treshold):
    """
    Summarize the variants in a vcf.
    
    There will be one result line per individual.
    
    - How many variants found\n
    - How many variants did not satisfy the base call 
        quality treshold. (Default 20)\n
    - How many variants were not covered in all individuals. 
        (Default depth 10)\n
    - How many variants followed each model in each family:\n
            - AR_hom\n
            - AR_comp\n
            - AR_hom_dn\n
            - AR_comp_dn\n
            - AD\n
            - AD_dn\n
            - XD\n
            - XD_dn\n
            - XR\n
            - XR_dn\n
        - How many rare variants (Default maf < 0.02)\n
        - How many high scored cadd. (Default cadd = 0)\n
        - How many rare + high score cadd\n
        - How many no cadd score\n
        - How many indels\n
        - How many indels without cadd score\n
    
    """
    # NOTE(review): this implementation only collects the number of variants
    # per sample; the other statistics listed in the docstring are not
    # computed here, and every parameter except ``variant_file`` is unused.
    logger = logging.getLogger("genmod.commands.summarize_variants")

    logger.debug("Setting up a variant parser")
    # '-' means that the vcf is read from stdin instead of from a file.
    if variant_file == '-':
        variant_parser = VCFParser(fsock=sys.stdin, check_info=False)
    else:
        variant_parser = VCFParser(infile=variant_file, check_info=False)
    logger.debug("Variant parser setup")

    head = variant_parser.metadata

    # One result record per individual found in the vcf header.
    samples = {
        sample_id: {"nr_of_variants": 0} for sample_id in head.individuals
    }

    for variant in variant_parser:
        for sample_id in samples:
            samples[sample_id]["nr_of_variants"] += 1
            # NOTE(review): this looks like leftover debug output; it is
            # written to stdout ahead of the JSON summary -- confirm whether
            # it should stay.
            print(variant['genotypes'][sample_id].depth_of_coverage)

    print(json.dumps(samples))
Exemplo n.º 28
0
def load_mongo_db(scout_configs,
                  vcf_configs=None,
                  family_type='cmms',
                  mongo_db='variantDatabase',
                  variant_type='clinical',
                  username=None,
                  password=None,
                  port=27017,
                  host='localhost',
                  rank_score_threshold=0,
                  variant_number_threshold=5000):
    """Populate a mongo database with information from ped and variant files.

    Args:
        scout_configs: mapping read for the keys 'load_vcf' (vcf file to
            load) and 'ped' (pedigree file); also handed to ``get_case``.
        vcf_configs: passed to ``ConfigParser`` to build the config object.
        family_type: passed to ``get_case``.
        mongo_db: name of the database to connect to.
        variant_type: variants of this type are deleted for the case before
            loading, and the value is forwarded when building new variants.
        username, password, port, host: connection settings for ``connect``.
        rank_score_threshold: loading stops at the first variant whose rank
            score for this case is not greater than this value.
        variant_number_threshold: maximum number of variants to load.
    """
    logger = logging.getLogger(__name__)
    # For testing only
    if __name__ == '__main__':
        logger = logging.getLogger("scout.ext.backend.load_mongo")

    ####### Check if the vcf file is on the proper format #######
    vcf_file = scout_configs['load_vcf']
    logger.info(
        "Found a vcf for loading variants into scout: {0}".format(vcf_file))

    logger.info("Connecting to {0}".format(mongo_db))
    connect(mongo_db,
            host=host,
            port=port,
            username=username,
            password=password)

    # The returned handle is not used below; the call is kept in case it has
    # side effects -- TODO confirm whether it can be dropped.
    get_db()

    ped_file = scout_configs['ped']
    logger.info("Found a ped file: {0}".format(ped_file))

    ######## Parse the config file to check for keys ########
    logger.info("Parsing config file")
    config_object = ConfigParser(vcf_configs)

    ######## Get the cases and add them to the mongo db: ########

    logger.info("Get the case from ped file")
    case = get_case(scout_configs, family_type)

    logger.info('Case found in {0}: {1}'.format(ped_file, case.display_name))

    ######## Add the institute to the mongo db: ########

    for institute_name in case['collaborators']:
        if institute_name:
            institute = get_institute(institute_name)
            logger.info("Institute found: {0}".format(institute))
            # Only institutes that are not stored yet are saved.
            try:
                Institute.objects.get(internal_id=institute.internal_id)
                logger.info(
                    "Institute {0} already in database".format(institute))
            except DoesNotExist:
                institute.save()
                logger.info(
                    "Adding new institute {0} to database".format(institute))

    logger.info("Updating case in database")

    update_case(case, variant_type, logger)

    ######## Get the variants and add them to the mongo db: ########

    logger.info("Setting up a variant parser")
    variant_parser = VCFParser(infile=vcf_file,
                               split_variants=True,
                               skip_info_check=True)
    nr_of_variants = 0

    # Old variants of this type are replaced, not merged.
    logger.info("Deleting old variants for case {0}".format(case.case_id))
    Variant.objects(case_id=case.case_id, variant_type=variant_type).delete()
    logger.debug("Variants deleted")

    start_inserting_variants = datetime.now()

    # Get the individuals to see which we should include in the analysis
    ped_individuals = {
        individual.individual_id: individual.display_name
        for individual in case.individuals
    }

    # Keep only the ped individuals that also exist in the vcf file, as a
    # dictionary with individual ids as keys and display names as values.
    individuals = {}
    logger.info("Checking which individuals in ped file exists in vcf")
    for individual_id, display_name in iteritems(ped_individuals):
        logger.debug("Checking individual {0}".format(individual_id))
        if individual_id in variant_parser.individuals:
            logger.debug("Individual {0} found".format(individual_id))
            individuals[individual_id] = display_name
        else:
            logger.warning("Individual {0} is present in ped file but"
                           " not in vcf".format(individual_id))

    logger.info('Start parsing variants')

    # Loading stops at the first variant that does not exceed the rank score
    # threshold (presumably the vcf is sorted on rank score -- verify), or
    # once the maximum number of variants has been stored.
    for variant in variant_parser:
        logger.debug("Parsing variant {0}".format(variant['variant_id']))
        if not float(variant['rank_scores'][
                case.display_name]) > rank_score_threshold:
            logger.info("Lower rank score threshold reaced after {0}"
                        " variants".format(nr_of_variants))
            break

        # The counter is checked before it is incremented, so '>=' is needed
        # to stop at exactly ``variant_number_threshold`` stored variants.
        if nr_of_variants >= variant_number_threshold:
            logger.info("Variant number threshold reached. ({0})".format(
                variant_number_threshold))
            break

        nr_of_variants += 1
        mongo_variant = get_mongo_variant(variant, variant_type, individuals,
                                          case, config_object, nr_of_variants)

        mongo_variant.save()

        if nr_of_variants % 1000 == 0:
            logger.info('{0} variants parsed'.format(nr_of_variants))
Exemplo n.º 29
0
def score(family_file, variant_file, family_type, annotation_dir, vep,
          plugin_file, processes, silent, outfile, verbose):
    """
    Score variants in a vcf file using Weighted Sum Model.
    The specific scores should be defined in a config file, see examples in 
    genmod/configs
    """
    # Parameters as used below:
    #   family_file, family_type -- passed to ``get_genetic_models``
    #   variant_file   -- path to a vcf file, or '-' to read from stdin
    #   annotation_dir -- passed to ``load_annotations`` when ``vep`` is false
    #   vep            -- skip loading annotations; forwarded to get_batches
    #   plugin_file    -- passed to ``check_plugin``; required
    #   processes      -- number of VariantScorer workers to start
    #   silent, outfile -- forwarded to print_headers / print_variants
    #   verbose        -- enables the informational log messages

    # Rebuild a "name=value" list of all truthy arguments so the command line
    # can be recorded in the vcf header.
    frame = inspect.currentframe()
    args, _, _, values = inspect.getargvalues(frame)
    argument_list = [
        i + '=' + str(values[i]) for i in values
        if values[i] and i != 'args' and i != 'frame' and i != 'parser'
    ]

    start_time_analysis = datetime.now()

    if verbose:
        log.info('Running GENMOD score, version: %s \n' % VERSION)

    ## Start by parsing the pedigree file:
    prefered_models = []
    family_id = None

    if family_file:
        prefered_models, family_id = get_genetic_models(
            family_file, family_type)
    else:
        log.critical("Please provide a family file")
        # Non-zero status so that callers can detect the failure.
        sys.exit(1)

    if verbose:
        log.info('Prefered model found in family file: %s \n' %
                 prefered_models)

    if not plugin_file:
        log.critical("Please provide a plugin file")
        sys.exit(1)

    ######### Read to the annotation data structures #########

    gene_trees = {}
    exon_trees = {}

    # If the variants are already annotated we do not need to redo the annotation
    if not vep:
        gene_trees, exon_trees = load_annotations(annotation_dir, verbose)
    else:
        if verbose:
            log.info('Using VEP annotation')

    ## Check the variants:

    if variant_file == '-':
        variant_parser = VCFParser(fsock=sys.stdin, skip_info_check=True)
    else:
        variant_parser = VCFParser(infile=variant_file, skip_info_check=True)

    head = variant_parser.metadata

    add_metadata(head,
                 'version',
                 'genmod_score',
                 version=VERSION,
                 command_line_string=' '.join(argument_list))

    add_metadata(
        head,
        'info',
        'IndividualRankScore',
        annotation_number='.',
        entry_type='String',
        description="Individual rank score for the variant in this family. "
        "This score is NOT corrected for compounds"
    )

    add_metadata(
        head,
        'info',
        'RankScore',
        annotation_number='.',
        entry_type='String',
        description="Combined rank score for the variant in this family. "
        "This score is corrected for compounds"
    )

    alt_dict, score_dict, value_dict, operation_dict = check_plugin(
        plugin_file, variant_parser, verbose)

    ####################################################################
    ### The variant queue is where all jobs(in this case batches that###
    ### represents variants in a region) is put. The consumers will  ###
    ### then pick their jobs from this queue.                        ###
    ####################################################################

    variant_queue = JoinableQueue(maxsize=1000)
    # The consumers will put their results in the results queue
    results = Manager().Queue()

    num_model_scorers = processes

    if verbose:
        # Logger methods take no ``file`` keyword; the destination is decided
        # by the logging configuration.
        log.info('Number of CPU:s %s' % cpu_count())
        log.info('Number of model scorers: %s' % num_model_scorers)

    # Only the name of the temporary file is needed here; it is reopened
    # below in text mode and removed once the variants have been printed.
    temp_file = NamedTemporaryFile(delete=False)
    temp_file.close()

    # We open a variant file to print the variants before sorting:
    temporary_variant_file = open(temp_file.name,
                                  mode='w',
                                  encoding='utf-8',
                                  errors='replace')

    model_scorers = [
        VariantScorer(variant_queue, results, variant_parser.header,
                      prefered_models, family_id, alt_dict, score_dict,
                      value_dict, operation_dict, verbose)
        for i in range(num_model_scorers)
    ]

    for proc in model_scorers:
        proc.start()

    # This process prints the variants to temporary files
    var_printer = VariantPrinter(results,
                                 temporary_variant_file,
                                 head,
                                 mode='score',
                                 verbosity=verbose)

    var_printer.start()

    start_time_variant_parsing = datetime.now()

    if verbose:
        log.info('Start parsing the variants ... \n')

    # get_batches put the variants in the queue and returns all chromosomes
    # found among the variants
    chromosome_list = get_batches(variant_parser,
                                  variant_queue,
                                  individuals=[],
                                  gene_trees=gene_trees,
                                  exon_trees=exon_trees,
                                  phased=False,
                                  vep=vep,
                                  whole_genes=True,
                                  verbosity=verbose)

    # Put stop signs in the variant queue, one per scorer
    for i in range(num_model_scorers):
        variant_queue.put(None)

    variant_queue.join()

    # Stop sign for the printer once all batches are marked as done
    results.put(None)
    var_printer.join()

    temporary_variant_file.close()

    if verbose:
        log.info('Cromosomes found in variant file: %s \n' %
                 ','.join(chromosome_list))
        log.info('Variants scored!\n')

    sort_variants(infile=temp_file.name, mode='rank', verbose=verbose)

    print_headers(head, outfile, silent)

    print_variants(temp_file.name, outfile, mode='modified', silent=silent)

    os.remove(temp_file.name)

    if verbose:
        log.info('Time for whole analyis: %s' %
                 str(datetime.now() - start_time_analysis))