Exemplo n.º 1
0
def load_annotations(annotation_dir, verbose=False):
    """
    Load the pickled gene and exon annotations found in a directory.

    The files 'genes.db' and 'exons.db' inside annotation_dir are unpickled
    and returned. According to the original documentation these are pickled
    interval trees stored in dictionaries.

    A file that can not be opened is not fatal, since it is possible to
    continue the analysis without annotation files. Whatever was not loaded
    is returned as an empty dictionary.

    Arguments:
        annotation_dir: Directory that holds 'genes.db' and 'exons.db'.
        verbose: If True, progress and warnings are reported.

    Returns:
        The tuple (gene_trees, exon_trees).
    """

    gene_trees = {}
    exon_trees = {}

    if verbose:
        print('Reading annotations...\n', file=sys.stderr)

    gene_db = os.path.join(annotation_dir, 'genes.db')
    exon_db = os.path.join(annotation_dir, 'exons.db')

    # NOTE(security): unpickling can execute arbitrary code, so the
    # annotation directory must only contain files from a trusted source.
    try:
        with open(gene_db, 'rb') as f:
            gene_trees = pickle.load(f)
        with open(exon_db, 'rb') as g:
            exon_trees = pickle.load(g)
    except IOError:
        # It is possible to continue the analysis without annotation files
        if verbose:
            warning('No annotations found.')
            warning('You need to build annotations! See documentation.')
    else:
        # Only claim that the files were used when both were really read.
        if verbose:
            print('Annotations used found in: %s, %s\n' % (gene_db, exon_db),
                  file=sys.stderr)

    return gene_trees, exon_trees
Exemplo n.º 2
0
def load_annotations(annotation_dir, verbose=False):
    """
    Load the pickled gene and exon annotations found in a directory.

    The files 'genes.db' and 'exons.db' inside annotation_dir are unpickled
    and returned. According to the original documentation these are pickled
    interval trees stored in dictionaries.

    A file that can not be opened is not fatal, since it is possible to
    continue the analysis without annotation files. Whatever was not loaded
    is returned as an empty dictionary.

    Arguments:
        annotation_dir: Directory that holds 'genes.db' and 'exons.db'.
        verbose: If True, progress and warnings are reported.

    Returns:
        The tuple (gene_trees, exon_trees).
    """

    gene_trees = {}
    exon_trees = {}

    if verbose:
        print('Reading annotations...\n', file=sys.stderr)

    gene_db = os.path.join(annotation_dir, 'genes.db')
    exon_db = os.path.join(annotation_dir, 'exons.db')

    # NOTE(security): unpickling can execute arbitrary code, so the
    # annotation directory must only contain files from a trusted source.
    try:
        with open(gene_db, 'rb') as f:
            gene_trees = pickle.load(f)
        with open(exon_db, 'rb') as g:
            exon_trees = pickle.load(g)
    except IOError:
        # It is possible to continue the analysis without annotation files
        if verbose:
            warning('No annotations found.')
            warning('You need to build annotations! See documentation.')
    else:
        # Only claim that the files were used when both were really read.
        if verbose:
            print('Annotations used found in: %s, %s\n' % (gene_db, exon_db),
                  file=sys.stderr)

    return gene_trees, exon_trees
Exemplo n.º 3
0
def annotate_variant(variant, gene_trees, exon_trees, vep, whole_genes,
                     verbosity):
    """
    Annotate a variant with the regions it belongs to.

    Two keys are set on the variant dictionary (the dictionary is modified
    in place):
        'annotation'     = set of the features (genes) that are overlapped
        'comp_candidate' = Boolean, True if the variant is a compound
                           candidate
    Variants are compound candidates if they are exonic and in a gene.
    If 'whole_genes' is used, intronic variants are also compound candidates.
    If 'vep' is used the annotation is taken from check_vep_annotation and
    the interval trees are not consulted.

    Arguments:
        variant: Variant dictionary, needs the keys 'CHROM', 'POS' and 'ALT'.
        gene_trees: Dictionary with chromosome names (without 'chr') as keys
                    and objects with a find_range(interval) method as values.
        exon_trees: Same layout as gene_trees, used for the exonic check.
        vep: If True, use the VEP annotation of the variant.
        whole_genes: If True, every annotated variant is a candidate.
        verbosity: If True, warn about chromosomes missing in the trees.

    Returns: the variant dictionary with the annotation added
    """

    variant['comp_candidate'] = False
    variant['annotation'] = set()

    # Internally we never use 'chr' in the chromosome names:
    chrom = variant['CHROM']

    if chrom.startswith('chr'):
        chrom = chrom[3:]

    alternatives = variant['ALT'].split(',')
    # When checking which features are overlapped we use the longest
    # alternative
    longest_alt = max(len(alternative) for alternative in alternatives)
    variant_position = int(variant['POS'])
    variant_interval = [variant_position, (variant_position + longest_alt - 1)]

    # If annotated with vep we do not need to check interval trees
    if vep:
        variant['annotation'] = check_vep_annotation(variant)
        if len(variant['annotation']) > 0:
            variant['comp_candidate'] = True
        return variant

    try:
        variant['annotation'] = set(
            gene_trees[chrom].find_range(variant_interval))
    except KeyError:
        # An unknown chromosome leaves the annotation as an empty set
        if verbosity:
            warning(''.join(
                ['Chromosome ', chrom, ' is not in annotation file!']))

    if whole_genes:
        # If compounds are to be checked in whole genes (including introns):
        if len(variant['annotation']) > 0:
            variant['comp_candidate'] = True
    else:
        # Check if exonic:
        try:
            if len(exon_trees[chrom].find_range(variant_interval)):
                variant['comp_candidate'] = True
        except KeyError:
            if verbosity:
                warning(''.join(
                    ['Chromosome ', chrom, ' is not in annotation file!']))

    # The documented contract is to hand the annotated variant back
    return variant
Exemplo n.º 4
0
    def run(self):
        """
        Write the queued variants to the temporary file.

        Batches are fetched from self.task_queue until a None is received,
        then self.temp_file is closed and the method finishes. A batch is
        iterated as a mapping of variant id to variant dictionary.

        Every variant becomes one tab separated line: first the priority,
        then the value of each entry in self.header ('-' if it is missing).
        The priority depends on self.mode:
            'score'      = the 'Individual_rank_score' of the variant,
                           '0' if there is none
            'chromosome' = the chromosome number; X, Y and MT are 23, 24
                           and 25, everything else is 26
        Any other mode raises a SyntaxError when a variant is processed.
        """
        # Ranks for the chromosomes that are not plain numbers
        named_chromosomes = {'X': 23, 'Y': 24, 'MT': 25}
        proc_name = self.name

        if self.verbosity:
            print(('%s: starting!' % proc_name), file=sys.stderr)

        while True:
            batch = self.task_queue.get()

            if self.verbosity and self.task_queue.full():
                warning('Printing queue full')

            # None is the signal that there is nothing more to print
            if batch is None:
                if self.verbosity:
                    print('All variants printed!', file=sys.stderr)
                self.temp_file.close()
                break

            for variant_id in batch:
                record = batch[variant_id]

                if self.mode == 'score':
                    try:
                        rank = record['Individual_rank_score']
                    except KeyError:
                        rank = '0'
                elif self.mode == 'chromosome':
                    contig = record['CHROM']
                    if contig.startswith('chr'):
                        contig = contig[3:]
                    try:
                        rank = int(contig)
                    except ValueError:
                        rank = named_chromosomes.get(contig, 26)
                else:
                    raise SyntaxError("""Need to specify priority mode for 
                                            printing the variants""")

                columns = [str(rank)]
                columns.extend(
                    record.get(entry, '-') for entry in self.header)

                self.temp_file.write('\t'.join(columns) + '\n')
Exemplo n.º 5
0
def annotate_variant(variant, gene_trees, exon_trees, vep, whole_genes, verbosity):
    """
    Annotate a variant with the regions it belongs to.

    Two keys are set on the variant dictionary (the dictionary is modified
    in place):
        'annotation'     = set of the features (genes) that are overlapped
        'comp_candidate' = Boolean, True if the variant is a compound
                           candidate
    Variants are compound candidates if they are exonic and in a gene.
    If 'whole_genes' is used, intronic variants are also compound candidates.
    If 'vep' is used the annotation is taken from check_vep_annotation and
    the interval trees are not consulted.

    Arguments:
        variant: Variant dictionary, needs the keys 'CHROM', 'POS' and 'ALT'.
        gene_trees: Dictionary with chromosome names (without 'chr') as keys
                    and objects with a find_range(interval) method as values.
        exon_trees: Same layout as gene_trees, used for the exonic check.
        vep: If True, use the VEP annotation of the variant.
        whole_genes: If True, every annotated variant is a candidate.
        verbosity: If True, warn about chromosomes missing in the trees.

    Returns: the variant dictionary with the annotation added
    """

    variant['comp_candidate'] = False
    variant['annotation'] = set()

    # Internally we never use 'chr' in the chromosome names:
    chrom = variant['CHROM']

    if chrom.startswith('chr'):
        chrom = chrom[3:]

    alternatives = variant['ALT'].split(',')
    # When checking which features are overlapped we use the longest
    # alternative
    longest_alt = max(len(alternative) for alternative in alternatives)
    variant_position = int(variant['POS'])
    variant_interval = [variant_position, (variant_position + longest_alt - 1)]

    # If annotated with vep we do not need to check interval trees
    if vep:
        variant['annotation'] = check_vep_annotation(variant)
        if len(variant['annotation']) > 0:
            variant['comp_candidate'] = True
        return variant

    try:
        variant['annotation'] = set(gene_trees[chrom].find_range(variant_interval))
    except KeyError:
        # An unknown chromosome leaves the annotation as an empty set
        if verbosity:
            warning(''.join(['Chromosome ', chrom, ' is not in annotation file!']))

    if whole_genes:
        # If compounds are to be checked in whole genes (including introns):
        if len(variant['annotation']) > 0:
            variant['comp_candidate'] = True
    else:
        # Check if exonic:
        try:
            if len(exon_trees[chrom].find_range(variant_interval)):
                variant['comp_candidate'] = True
        except KeyError:
            if verbosity:
                warning(''.join(['Chromosome ', chrom, ' is not in annotation file!']))

    # The documented contract is to hand the annotated variant back
    return variant
Exemplo n.º 6
0
def get_batches(variant_parser,
                batch_queue,
                individuals,
                gene_trees={},
                exon_trees={},
                phased=False,
                vep=False,
                whole_genes=False,
                verbosity=False):
    """
    Create batches and put them into the queue.

    Every variant is annotated with regions by annotate_variant, either from
    the annotation trees or from the VEP terms. Consecutive variants that
    share a feature end up in the same batch. Consecutive variants without
    any feature are collected until the batch holds 10000 entries. A new
    chromosome always starts a new batch. When a batch is complete it is put
    into batch_queue; the last batch is put into the queue after the parser
    is exhausted (also if it is empty).

    A batch is a dictionary {variant_id: variant} with the extra key
    'haploblocks'. If the data is phased, 'haploblocks' holds one
    interval_tree.IntervalTree per individual, built from the intervals
    [start, stop, id] that were collected for the batch. Individuals without
    any collected interval get no tree.

    Arguments:
        variant_parser: Iterable of variant dictionaries, each with the keys
                        'variant_id', 'CHROM' and 'POS'.
        batch_queue: Queue like object with the methods put() and full().
        individuals: Iterable with the individual ids. If phased, the
                     genotype call is read from the variant with the id as
                     key.
        gene_trees, exon_trees: Passed on to annotate_variant. The default
                     dictionaries are shared between calls but are not
                     modified by this function.
        phased: If True, haploblocks are collected.
        vep, whole_genes: Passed on to annotate_variant.
        verbosity: If True, progress is logged.

    Returns:
        List with the chromosome names (without 'chr') in the order they
        were seen. If no variants were parsed the list is [None].
    """
    beginning = True
    # A batch is a dictionary with variants
    batch = {}
    new_chrom = None
    current_chrom = None
    current_features = []
    haploblock_id = 1
    # Haploblocks is a dictionary with list of lists like
    # {ind_id:[[start, stop, id],[start, stop,id],...], ...}
    haploblocks = {ind_id: [] for ind_id in individuals}
    # Start position of the open haploblock for each individual, filled in
    # when the first variant is seen
    haploblock_starts = {}
    nr_of_batches = 0
    chromosomes = []
    # Parse the vcf file:
    if verbosity:
        start_parsing_time = datetime.now()
        start_chrom_time = start_parsing_time
        start_twenty_time = start_parsing_time
        if batch_queue.full():
            warning('Queue full!!')

    nr_of_variants = 0
    for variant in variant_parser:

        variant_id = variant['variant_id']
        nr_of_variants += 1
        new_chrom = variant['CHROM']
        if new_chrom.startswith('chr'):
            new_chrom = new_chrom[3:]

        # Annotate which features the variant belongs to:
        annotate_variant(variant, gene_trees, exon_trees, vep, whole_genes,
                         verbosity)

        new_features = variant['annotation']

        if verbosity:
            if nr_of_variants % 20000 == 0:
                log.info('%s variants parsed!' % nr_of_variants)
                log.info('Last 20.000 took %s to parse.\n' %
                         str(datetime.now() - start_twenty_time))
                start_twenty_time = datetime.now()

        # If we look at the first variant, setup boundary conditions:
        if beginning:
            current_features = new_features
            # Add the variant to each of its features in a batch
            batch[variant_id] = variant
            current_chrom = new_chrom
            batch['haploblocks'] = {}
            if phased:
                # We collect the starts of the haploblocks
                haploblock_starts = {
                    ind_id: int(variant['POS'])
                    for ind_id in individuals
                }
            beginning = False
        else:
            # If we should put the batch in the queue:
            send = True

            if phased:
                for ind_id in individuals:
                    # A new haploblock is indicated by '/' if the data is
                    # phased
                    if '/' in variant.get(ind_id, './.'):
                        # If call is not passed we consider it to be on same
                        # haploblock(GATK recommendations)
                        if variant.get('FILTER', '.') == 'PASS':
                            haploblocks[ind_id].append([
                                haploblock_starts[ind_id],
                                int(variant['POS']) - 1,
                                str(haploblock_id)
                            ])
                            haploblock_id += 1
                            haploblock_starts[ind_id] = int(variant['POS'])

            # Check if we are in a space between features:
            if len(new_features) == 0:
                if len(current_features) == 0:
                    # If the intergenic region is bigger than 10000 we send
                    # it as a batch
                    if len(batch) < 10000:
                        send = False
            # If not check if we are in a region with overlapping features
            elif new_features.intersection(current_features):
                send = False

            # If we are at a new chromosome we finish the current batch:
            if new_chrom != current_chrom:
                chromosomes.append(current_chrom)
                # New chromosome means new batch
                send = True

                if verbosity:
                    log.info('Chromosome %s parsed!' % current_chrom)
                    log.info('Time to parse chromosome %s' %
                             str(datetime.now() - start_chrom_time))
                    start_chrom_time = datetime.now()

                current_chrom = new_chrom

            if send:
                if phased:
                    # Create an interval tree for each individual with the
                    # phasing intervals
                    for ind_id in individuals:
                        # Check if we have just finished an interval
                        if haploblock_starts[ind_id] != int(variant['POS']):
                            haploblocks[ind_id].append([
                                haploblock_starts[ind_id],
                                int(variant['POS']),
                                str(haploblock_id)
                            ])
                            haploblock_id += 1
                        # An individual can be without intervals here (the
                        # open block starts at this position and nothing
                        # was finished since the last batch). Skip the tree
                        # instead of failing on the empty list, the same
                        # outcome as for the last batch.
                        if haploblocks[ind_id]:
                            batch['haploblocks'][
                                ind_id] = interval_tree.IntervalTree(
                                    haploblocks[ind_id],
                                    haploblocks[ind_id][0][0] - 1,
                                    haploblocks[ind_id][-1][1] + 1)
                    haploblocks = {ind_id: [] for ind_id in individuals}

                # Put the job in the queue
                batch_queue.put(batch)
                nr_of_batches += 1
                # Reset the variables
                current_features = new_features
                batch = {}
                batch[variant_id] = variant
                batch['haploblocks'] = {}
            else:
                current_features = current_features.union(new_features)
                batch[variant_id] = variant

    chromosomes.append(current_chrom)
    nr_of_batches += 1

    if verbosity:
        log.info('Chromosome %s parsed!' % current_chrom)
        log.info('Time to parse chromosome %s \n' %
                 str(datetime.now() - start_chrom_time))
        log.info('Variants parsed!')
        log.info('Time to parse variants:%s' %
                 str(datetime.now() - start_parsing_time))
        log.info('Number of variants in variant file:%s\n' % nr_of_variants)
        log.info('Number of batches created:%s\n' % nr_of_batches)

    # Without any variants there is no last position and no haploblock
    # start to close, so the haploblocks are only finished if something was
    # parsed.
    if phased and nr_of_variants > 0:
        # Create an interval tree for each individual with the phasing
        # intervals
        for ind_id in individuals:
            # Check if we have just finished an interval
            if haploblock_starts[ind_id] != int(variant['POS']):
                haploblocks[ind_id].append([
                    haploblock_starts[ind_id],
                    int(variant['POS']),
                    str(haploblock_id)
                ])
                haploblock_id += 1
            try:
                batch['haploblocks'][ind_id] = interval_tree.IntervalTree(
                    haploblocks[ind_id], haploblocks[ind_id][0][0] - 1,
                    haploblocks[ind_id][-1][1] + 1)
            except IndexError:
                # No intervals for this individual in the last batch
                pass

    batch_queue.put(batch)

    return chromosomes
Exemplo n.º 7
0
def get_batches(variant_parser, batch_queue, individuals, gene_trees={},
                exon_trees={}, phased=False, vep=False, whole_genes=False,
                verbosity=False):
    """
    Create batches and put them into the queue.

    Every variant is annotated with regions by annotate_variant, either from
    the annotation trees or from the VEP terms. Consecutive variants that
    share a feature end up in the same batch. Consecutive variants without
    any feature are collected until the batch holds 10000 entries. A new
    chromosome always starts a new batch. When a batch is complete it is put
    into batch_queue; the last batch is put into the queue after the parser
    is exhausted (also if it is empty).

    A batch is a dictionary {variant_id: variant} with the extra key
    'haploblocks'. If the data is phased, 'haploblocks' holds one
    interval_tree.IntervalTree per individual, built from the intervals
    [start, stop, id] that were collected for the batch. Individuals without
    any collected interval get no tree.

    Arguments:
        variant_parser: Iterable of variant dictionaries, each with the keys
                        'variant_id', 'CHROM' and 'POS'.
        batch_queue: Queue like object with the methods put() and full().
        individuals: Iterable with the individual ids. If phased, the
                     genotype call is read from the variant with the id as
                     key.
        gene_trees, exon_trees: Passed on to annotate_variant. The default
                     dictionaries are shared between calls but are not
                     modified by this function.
        phased: If True, haploblocks are collected.
        vep, whole_genes: Passed on to annotate_variant.
        verbosity: If True, progress is logged.

    Returns:
        List with the chromosome names (without 'chr') in the order they
        were seen. If no variants were parsed the list is [None].
    """
    beginning = True
    # A batch is a dictionary with variants
    batch = {}
    new_chrom = None
    current_chrom = None
    current_features = []
    haploblock_id = 1
    # Haploblocks is a dictionary with list of lists like
    # {ind_id:[[start, stop, id],[start, stop,id],...], ...}
    haploblocks = {ind_id: [] for ind_id in individuals}
    # Start position of the open haploblock for each individual, filled in
    # when the first variant is seen
    haploblock_starts = {}
    nr_of_batches = 0
    chromosomes = []
    # Parse the vcf file:
    if verbosity:
        start_parsing_time = datetime.now()
        start_chrom_time = start_parsing_time
        start_twenty_time = start_parsing_time
        if batch_queue.full():
            warning('Queue full!!')

    nr_of_variants = 0
    for variant in variant_parser:

        variant_id = variant['variant_id']
        nr_of_variants += 1
        new_chrom = variant['CHROM']
        if new_chrom.startswith('chr'):
            new_chrom = new_chrom[3:]

        # Annotate which features the variant belongs to:
        annotate_variant(variant, gene_trees, exon_trees, vep, whole_genes,
                         verbosity)

        new_features = variant['annotation']

        if verbosity:
            if nr_of_variants % 20000 == 0:
                log.info('%s variants parsed!' % nr_of_variants)
                log.info('Last 20.000 took %s to parse.\n' %
                         str(datetime.now() - start_twenty_time))
                start_twenty_time = datetime.now()

        # If we look at the first variant, setup boundary conditions:
        if beginning:
            current_features = new_features
            # Add the variant to each of its features in a batch
            batch[variant_id] = variant
            current_chrom = new_chrom
            batch['haploblocks'] = {}
            if phased:
                # We collect the starts of the haploblocks
                haploblock_starts = {
                    ind_id: int(variant['POS'])
                    for ind_id in individuals
                }
            beginning = False
        else:
            # If we should put the batch in the queue:
            send = True

            if phased:
                for ind_id in individuals:
                    # A new haploblock is indicated by '/' if the data is
                    # phased
                    if '/' in variant.get(ind_id, './.'):
                        # If call is not passed we consider it to be on same
                        # haploblock(GATK recommendations)
                        if variant.get('FILTER', '.') == 'PASS':
                            haploblocks[ind_id].append([
                                haploblock_starts[ind_id],
                                int(variant['POS']) - 1,
                                str(haploblock_id)
                            ])
                            haploblock_id += 1
                            haploblock_starts[ind_id] = int(variant['POS'])

            # Check if we are in a space between features:
            if len(new_features) == 0:
                if len(current_features) == 0:
                    # If the intergenic region is bigger than 10000 we send
                    # it as a batch
                    if len(batch) < 10000:
                        send = False
            # If not check if we are in a region with overlapping features
            elif new_features.intersection(current_features):
                send = False

            # If we are at a new chromosome we finish the current batch:
            if new_chrom != current_chrom:
                chromosomes.append(current_chrom)
                # New chromosome means new batch
                send = True

                if verbosity:
                    log.info('Chromosome %s parsed!' % current_chrom)
                    log.info('Time to parse chromosome %s'
                             % str(datetime.now()-start_chrom_time))
                    start_chrom_time = datetime.now()

                current_chrom = new_chrom

            if send:
                if phased:
                    # Create an interval tree for each individual with the
                    # phasing intervals
                    for ind_id in individuals:
                        # Check if we have just finished an interval
                        if haploblock_starts[ind_id] != int(variant['POS']):
                            haploblocks[ind_id].append([
                                haploblock_starts[ind_id],
                                int(variant['POS']),
                                str(haploblock_id)
                            ])
                            haploblock_id += 1
                        # An individual can be without intervals here (the
                        # open block starts at this position and nothing
                        # was finished since the last batch). Skip the tree
                        # instead of failing on the empty list, the same
                        # outcome as for the last batch.
                        if haploblocks[ind_id]:
                            batch['haploblocks'][ind_id] = interval_tree.IntervalTree(
                                haploblocks[ind_id],
                                haploblocks[ind_id][0][0]-1,
                                haploblocks[ind_id][-1][1]+1
                            )
                    haploblocks = {ind_id: [] for ind_id in individuals}

                # Put the job in the queue
                batch_queue.put(batch)
                nr_of_batches += 1
                # Reset the variables
                current_features = new_features
                batch = {}
                batch[variant_id] = variant
                batch['haploblocks'] = {}
            else:
                current_features = current_features.union(new_features)
                batch[variant_id] = variant

    chromosomes.append(current_chrom)
    nr_of_batches += 1

    if verbosity:
        log.info('Chromosome %s parsed!' % current_chrom)
        log.info('Time to parse chromosome %s \n' % str(datetime.now()-start_chrom_time))
        log.info('Variants parsed!')
        log.info('Time to parse variants:%s' % str(datetime.now() - start_parsing_time))
        log.info('Number of variants in variant file:%s\n' % nr_of_variants)
        log.info('Number of batches created:%s\n' % nr_of_batches)

    # Without any variants there is no last position and no haploblock
    # start to close, so the haploblocks are only finished if something was
    # parsed.
    if phased and nr_of_variants > 0:
        # Create an interval tree for each individual with the phasing
        # intervals
        for ind_id in individuals:
            # Check if we have just finished an interval
            if haploblock_starts[ind_id] != int(variant['POS']):
                haploblocks[ind_id].append([
                    haploblock_starts[ind_id],
                    int(variant['POS']),
                    str(haploblock_id)
                ])
                haploblock_id += 1
            try:
                batch['haploblocks'][ind_id] = interval_tree.IntervalTree(
                    haploblocks[ind_id],
                    haploblocks[ind_id][0][0]-1,
                    haploblocks[ind_id][-1][1]+1
                )
            except IndexError:
                # No intervals for this individual in the last batch
                pass

    batch_queue.put(batch)

    return chromosomes