Пример #1
0
    def initial_abundances(eqs, lengths, population):
        '''Divides the raw count of each compatibility class between its
           alleles and converts the divided counts to abundances.

        allele_idx, prior, process_allele and counts_to_abundances are
        resolved from the enclosing scope.

        Arguments:
            eqs: iterable of (allele indices, read count) pairs
            lengths: not read directly by this function
            population: key used to look up prior[allele][population];
                        a falsy value disables the priors

        Returns the result of counts_to_abundances on the divided counts
        and a dict mapping each index to the summed, undivided count of
        every compatibility class it appears in.
        '''

        counts = defaultdict(float)
        undivided_counts = defaultdict(float)

        # Divides counts equally between alleles in the same compatibility class
        def divide_equally(alleles, count):
            n_alleles = len(alleles)
            for allele in alleles:
                counts[allele] += count / n_alleles

        # Divides counts proportionally to allele frequency; alleles without
        # a prior read as 0.0 from the defaultdict and receive no reads
        def divide_prior(alleles, count, allele_prob, total_prob):
            for allele in alleles:
                counts[allele] += count * (allele_prob[allele] / total_prob)

        for alleles, count in eqs:

            # Priors are looked up by the 2-field name of the first allele
            # stored for each index
            allele_prior = defaultdict(float)
            for idx in alleles:
                undivided_counts[idx] += count
                allele = process_allele(allele_idx[idx][0], 2)
                if population and allele in prior:
                    allele_prior[idx] = prior[allele][population]

            # The total is zero when no allele has a prior or when every
            # known prior is zero; dividing by it would raise
            # ZeroDivisionError, so fall back to an equal split
            total_prob = sum(allele_prior.values())
            if total_prob != 0:
                divide_prior(alleles, count, allele_prior, total_prob)
            else:
                divide_equally(alleles, count)

        return counts_to_abundances(counts), undivided_counts
Пример #2
0
def convert_allele(allele, resolution):
    '''Checks nomenclature of input allele and returns converted allele.

    args, p_group, g_group and process_allele are resolved from the
    enclosing scope.

    Arguments:
        allele: allele name; a trailing 'P' or 'G' marks a P-group or
                G-group allele
        resolution: an int number of fields, 'g-group' or 'p-group'

    Returns the converted allele, or the input unchanged when no
    conversion rule applies or no group entry is found.

    Exits through sys.exit when a P-group allele is converted to a
    G-group, or when a grouped allele is converted to more than one field
    without args.force.
    '''
    n_fields = len(allele.split(':'))
    suffix = allele[-1]
    to_fields = isinstance(resolution, int)

    # Input: P-group allele
    if suffix == 'P':
        if resolution == 'g-group':
            sys.exit('[convert] Error: p-group cannot be converted '
                     'to g-group.')

        # Output: 1-field allele unless forced
        if to_fields:
            if resolution > 1 and not args.force:
                sys.exit('[convert] Error: p-group cannot be '
                         'converted to %.0f fields.' % resolution)
            allele = process_allele(allele[:-1], resolution)

    # Input: G-group allele
    elif suffix == 'G':

        # Output: 1-field allele unless forced
        if to_fields:
            if resolution > 1 and not args.force:
                # The original message ran the words together as
                # "convertedto"; the separating space is restored here
                sys.exit('[convert] Error: g-group cannot be converted '
                         'to %.0f fields.' % resolution)
            allele = process_allele(allele[:-1], resolution)

        # Output: P-group allele
        elif resolution == 'p-group':
            ungrouped = allele[:-1]
            if ungrouped in p_group[n_fields]:
                allele = p_group[n_fields][ungrouped]
            else:
                # Retry with one field fewer before giving up
                reduced = process_allele(ungrouped, n_fields - 1)
                if reduced in p_group[n_fields]:
                    allele = p_group[n_fields][reduced]

    # Input: ungrouped allele
    # Output: G-group allele
    elif resolution == 'g-group':
        if allele in g_group[n_fields]:
            allele = g_group[n_fields][allele]
        elif suffix != 'N':
            allele = process_allele(allele, 3)

    # Input: ungrouped allele
    # Output: P-group allele
    elif resolution == 'p-group':
        if allele in p_group[n_fields]:
            allele = p_group[n_fields][allele]

    # Input: ungrouped allele
    # Output: reduced resolution, ungrouped allele
    elif to_fields:
        allele = process_allele(allele, resolution)

    return allele
Пример #3
0
def filter_eqs(complete_genotypes, allele_idx, eq_idx, partial_alleles):
    '''Filters compatibility classes, keeping only the indices that contain
       a partial allele or at least one predicted complete allele.

    Arguments:
        complete_genotypes: dict of gene -> predicted complete alleles
        allele_idx: dict of index -> list of alleles
        eq_idx: dict of group -> gene -> list of (indices, count)
        partial_alleles: iterable of partial alleles

    Returns
        filtered_eqs: dict of group -> gene -> list of [indices, count],
                      with indices reduced to the wanted ones and classes
                      left without any index removed
        allele_eq: dict of group -> 3-field allele -> set of positions in
                   the filtered list of the gene's compatibility classes
    '''

    # Built once: every allele that makes an index worth keeping
    wanted_alleles = set(partial_alleles)
    for alleles in complete_genotypes.values():
        wanted_alleles.update(alleles)

    wanted_indices = {
        index
        for index, alleles in allele_idx.items()
        if alleles and not wanted_alleles.isdisjoint(alleles)
    }

    filtered_eqs = dict()
    for group, eq_list in eq_idx.items():
        filtered_eqs[group] = dict()
        for gene in complete_genotypes:
            if gene not in eq_list:
                continue

            filtered = []
            for indices, count in eq_list[gene]:
                kept = set(indices) & wanted_indices
                if kept:
                    filtered.append([kept, count])
            filtered_eqs[group][gene] = filtered

    # Positions are those of the filtered lists, so they can be used to
    # index filtered_eqs[group][gene] directly
    allele_eq = {group: defaultdict(set) for group in filtered_eqs}
    for group, eq_list in filtered_eqs.items():
        for gene_eqs in eq_list.values():
            for i, (indices, _) in enumerate(gene_eqs):
                for idx in indices:
                    for allele in allele_idx[idx]:
                        allele_eq[group][process_allele(allele, 3)].add(i)

    return filtered_eqs, allele_eq
Пример #4
0
def genotype_gene(gene, gene_count, eqs, lengths, allele_idx, population,
                  prior, tolerance, max_iterations, drop_iterations,
                  drop_threshold, zygosity_threshold):
    '''Runs transcript quantification for one gene, logs the alleles by
       abundance and predicts the genotype.

    allele_eq is passed on to predict_genotype but is not a parameter; it
    is resolved from the enclosing scope.

    Returns
        em_results: list of [index, alleles, abundance]
        genotype: alleles returned by predict_genotype
    '''

    # The population is cleared for every gene outside this set
    genes_with_population = {'A', 'B', 'C', 'DRB1', 'DQB1', 'DQA1'}
    if gene not in genes_with_population:
        population = None

    abundances = expectation_maximization(eqs, lengths, allele_idx,
                                          population, prior, tolerance,
                                          max_iterations, drop_iterations,
                                          drop_threshold)

    em_results = []
    for idx, abundance in abundances.items():
        em_results.append([idx, allele_idx[idx], abundance])

    log.info('\n[genotype] Top alleles by abundance:')
    log.info('\t\t{: <20}    {: >9}'.format('allele', 'abundance'))

    # Highest abundance first; each row shows the first allele of the index
    ranked = sorted(em_results, key=lambda entry: entry[2], reverse=True)
    for entry in ranked:
        top_allele = process_allele(entry[1][0], 3)
        log.info('\t\t{: <20}    {: >8.2f}%'.format(top_allele,
                                                    entry[2] * 100))

    prediction = predict_genotype(eqs, allele_idx, allele_eq, em_results,
                                  gene_count, population, prior,
                                  zygosity_threshold)
    genotype, pair_count = prediction

    summary = '\n[genotype] Most likely genotype explaining {:.0f} reads:'
    log.info(summary.format(pair_count))

    for allele in genotype:
        log.info(f'\t\t{allele}')

    return em_results, genotype
Пример #5
0
def process_hla_dat():
    '''Processes IMGTHLA database, returning complete and partial alleles,
       genes, sequences, UTR and exon coordinates and final exon lengths.

    The database is read from the path held in hla_dat, which is not a
    parameter and is resolved from the enclosing scope.

    Returns a tuple of
        complete_alleles: set of stored alleles with no 'partial' feature
        partial_alleles: set of stored partial alleles whose 2-field name
                         is not shared with any complete allele
        gene_set: set of genes with at least one stored allele
        sequences: dict of allele -> upper-case sequence
        utrs: dict of allele -> {'utr5' / 'utr3': [start, stop]}
        exons: dict of allele -> {exon number: [start, stop]}
        final_exon_length: dict of gene -> [final exon number, get_mode of
                           that exon's length over the complete alleles]
    '''

    # Parses the 'start..stop' field of a feature line; start is shifted
    # down by one, stop is kept as written
    def parse_coordinates(feature_line):
        span = re.split(r'\s+', feature_line)[2].split('..')
        return [int(span[0]) - 1, int(span[1])]

    sequences = dict()
    utrs = defaultdict(dict)
    exons = defaultdict(dict)
    gene_exons = defaultdict(set)

    sequence = partial = exon = False

    gene_set = set()
    complete_alleles = set()
    complete_2fields = set()
    partial_alleles = set()

    with open(hla_dat, 'r') as file:
        lines = file.read().splitlines()

    # The order of the branches matters: the line following an exon
    # feature is only read as its number when no earlier branch claims it
    for line in lines:
        # Denotes end of sequence, add allele to database
        if line.startswith('//'):
            if sequence and allele in exons:
                sequences[allele] = seq
                # number still holds the last exon number read for the entry
                gene_exons[gene].add(number)
                gene_set.add(gene)

                if partial:
                    partial_alleles.add(allele)
                else:
                    complete_alleles.add(allele)
                    complete_2fields.add(process_allele(allele, 2))
            partial = False

        # Denotes partial alleles
        elif line.startswith('FT') and 'partial' in line:
            partial = True

        # Allele name and gene
        elif line.startswith('FT') and 'allele="HLA-' in line:
            allele = re.split('HLA-', re.sub('["\n]', '', line))[1]
            gene = get_gene(allele)

            exon = sequence = False
            seq = ''

        # Exon coordinates
        elif line.startswith('FT') and 'exon' in line:
            exon_coord = parse_coordinates(line)
            exon = True

        # Exon number on following line
        elif exon:
            number = line.split('"')[1]
            exons[allele][number] = exon_coord
            exon = False

        # UTRs
        elif line.startswith('FT') and re.search(r'\sUTR\s', line):
            utr_coord = parse_coordinates(line)

            # A UTR read before any exon of the allele is stored as utr5,
            # any later one as utr3
            if allele not in exons:
                utrs[allele]['utr5'] = utr_coord
            else:
                utrs[allele]['utr3'] = utr_coord

        # Start of sequence
        elif line.startswith('SQ'):
            sequence = True

        # Sequence line; the last whitespace-separated token is dropped
        # (presumably the running base count - confirm against the format)
        elif sequence and line.startswith(' '):
            seq += ''.join(line.split()[:-1]).upper()

    # Keep only partial alleles whose 2-field name has no complete allele
    partial_alleles = {
        allele
        for allele in partial_alleles
        if process_allele(allele, 2) not in complete_2fields
    }

    # Get most common final exon length to truncate stop-loss alleles
    # NOTE(review): exon numbers are compared as strings, so '10' would
    # rank below '2' - confirm no gene has ten or more exons
    final_exon_length = defaultdict(list)
    for allele in complete_alleles:
        gene = get_gene(allele)
        last_exon = max(gene_exons[gene])

        if last_exon not in exons[allele]:
            continue

        start, stop = exons[allele][last_exon]
        final_exon_length[gene].append(stop - start)

    # Replace each list of lengths by [final exon number, typical length];
    # only values of existing keys change, so iterating is safe
    for gene, lengths in final_exon_length.items():
        final_exon_length[gene] = [max(gene_exons[gene]), get_mode(lengths)]

    return (complete_alleles, partial_alleles, gene_set, sequences, utrs,
            exons, final_exon_length)
Пример #6
0
def type_partial(eqs, gene, partial_exons, complete_genotype, partial_alleles,
                 population, prior, tolerance, max_iterations, drop_iterations,
                 drop_threshold, zygosity_threshold):
    '''Types partial alleles.

    For each exon combination, pairs built from the complete genotype and
    the candidate partial alleles are compared by the share of reads they
    explain; the best pair over all combinations is returned.

    allele_eq, allele_idx and lengths are not parameters and are resolved
    from the enclosing scope. zygosity_threshold is accepted but not used.

    Arguments:
        eqs: dict of exon group -> gene -> list of (indices, count)
        gene: gene being typed
        partial_exons: dict of allele -> dict keyed by exon number
        complete_genotype: the two alleles predicted from complete alleles
        partial_alleles: set of partial alleles
        population, prior: used to break ties between equally good pairs
        tolerance, max_iterations, drop_iterations, drop_threshold:
            passed to expectation_maximization

    Returns complete_genotype unchanged when no reads align to the binding
    region or no exon group yields a candidate, otherwise the best pair as
    a tuple (a1, a2).
    '''

    # Both helpers read the loop variable group of the comparison loop
    # below, so that loop must keep binding the raw group key to that name

    # Return count of a single allele
    def get_single_count(a):
        return sum(eqs[group][gene][idx][1] for idx in allele_eq[group][a])

    # Return count of a pair of alleles
    def get_pair_count(a1, a2):
        indices = allele_eq[group][a1] | allele_eq[group][a2]
        return sum(eqs[group][gene][idx][1] for idx in indices)

    if gene not in {'A', 'B', 'C', 'DRB1', 'DQB1', 'DQA1'}:
        population = None

    # Set binding region by class
    if gene.startswith('D'):
        binding_region = "['2']"
    else:
        binding_region = "['2', '3']"

    if gene not in eqs[binding_region]:
        log.info(f'[genotype] No reads aligned to HLA-{gene} binding region')
        return complete_genotype

    # Get group of possible partial alleles by performing transcript
    # quantification on the binding region exons
    results = expectation_maximization(eqs[binding_region][gene], lengths,
                                       allele_idx, population, prior,
                                       tolerance, max_iterations,
                                       drop_iterations, drop_threshold)
    exon_groups = defaultdict(set)
    complete = set(complete_genotype)

    # Map partial alleles to their possible exon combinations; a group is
    # matched when its text without the brackets occurs in the text of the
    # allele's sorted exon numbers
    for idx in results:
        alleles = {process_allele(allele, 3) for allele in allele_idx[idx]}
        for allele in (alleles - complete) & partial_alleles:
            available_exons = str(sorted(partial_exons[allele].keys()))
            for exon_group in eqs.keys():
                if exon_group[1:-1] in available_exons:
                    exon_groups[exon_group].add(allele)

    # Compare pairs of partial alleles and predicted alleles
    overall = []
    for group in sorted(exon_groups, key=len):

        # Skip just exon 2 for class I
        if not gene.startswith('D') and group == "['2']":
            continue

        # Only look at partial alleles that have a different sequence for
        # this combination of exons than the complete alleles
        a1, a2 = complete_genotype
        possible_alleles = {
            allele
            for allele in exon_groups[group]
            if allele_eq[group][allele] != allele_eq[group][a1]
            and allele_eq[group][allele] != allele_eq[group][a2]
        }

        if not possible_alleles:
            continue

        # Filter partial alleles that have only a few more reads
        # than the complete alleles
        min_count = min(get_single_count(a1), get_single_count(a2))
        possible_alleles = {
            allele
            for allele in possible_alleles
            if get_single_count(allele) > 10 + min_count
        }

        total_count = sum(count for _, count in eqs[group][gene])

        # Without reads no pair can be ranked; the percentages below
        # would divide by zero
        if total_count == 0:
            continue

        # Get percent explained reads by complete genotype
        pair_count = get_pair_count(a1, a2)
        explained_percent = round(pair_count / total_count, 8)
        explained_reads = {(a1, a2): explained_percent}

        # Only consider pairs with partial alleles if they explain
        # a greater percentage of reads
        for c1, c2 in combinations(complete | possible_alleles, 2):
            pair_count = get_pair_count(c1, c2)
            if pair_count / total_count > explained_percent:
                explained_reads[(c1, c2)] = round(pair_count / total_count, 8)

        top_perc = max(explained_reads.values())
        explained_reads = {
            key: value
            for key, value in explained_reads.items() if value == top_perc
        }

        # If the top percentage of explained reads is shared by more than
        # one pair, use priors to break the ties
        pair_prior = dict()
        if population and len(explained_reads) > 1:
            for p1, p2 in explained_reads:
                field1 = process_allele(p1, 2)
                field2 = process_allele(p2, 2)
                if field1 not in prior or field2 not in prior:
                    continue

                pair_prior[(p1, p2)] = (prior[field1][population] *
                                        prior[field2][population])

        # Ties are resolved in favour of the pair recorded first, which is
        # the complete genotype whenever it is among the top pairs
        if pair_prior:
            a1, a2 = max(pair_prior, key=pair_prior.get)
        else:
            a1, a2 = next(iter(explained_reads))

        label = re.sub(r"['\[\]]", '', group)
        log.info('\t\texons {: <22}\t{: <28}\t{:.2f}%'.format(
            label, ', '.join([a1, a2]), top_perc * 100))

        overall.append([(a1, a2), top_perc])

    if overall:
        # Highest percentage wins, then the greatest allele names
        return max(overall, key=lambda x: (x[1], x[0][0], x[0][1]))[0]

    return complete_genotype
Пример #7
0
def predict_genotype(eqs, allele_idx, allele_eq, em_results, gene_count,
                     population, prior, zygosity_threshold):
    '''Predicts most likely genotype using scoring based on proportion of
       explained reads, tie-breaking with allele priors.

    Arguments:
        eqs: list of (indices, count) compatibility classes
        allele_idx: dict of index -> list of alleles
        allele_eq: dict of index -> set of positions in eqs
        em_results: list of [index, alleles, abundance]
        gene_count: number of reads used as the denominator of the
                    explained proportion
        population, prior: used to break ties between top pairs
        zygosity_threshold: minor/major nonshared count ratio below which
                            only the major allele is reported

    Returns
        genotype: list of one or two 3-field alleles
        pair_count: number of reads explained by the chosen allele(s)
    '''

    # Returns the compatibility classes of an index or a tuple of indices
    def get_eqs(a):
        if isinstance(a, tuple):
            return set.union(*[allele_eq[idx] for idx in a])
        return allele_eq[a]

    # Returns number of reads explained by an allele
    def get_count(a):
        return sum(eqs[idx][1] for idx in allele_eq[a])

    # Returns number of reads explained by a pair of alleles
    def get_pair_count(a1, a2):
        return sum(eqs[idx][1] for idx in get_eqs(a1) | get_eqs(a2))

    # Returns non-shared counts for a pair of alleles
    def get_nonshared_count(a1, a2):
        a1_eqs, a2_eqs = get_eqs(a1), get_eqs(a2)
        a1_count = sum(eqs[idx][1] for idx in a1_eqs - a2_eqs)
        a2_count = sum(eqs[idx][1] for idx in a2_eqs - a1_eqs)
        return a1_count, a2_count

    # A single result leaves nothing to pair
    if len(em_results) <= 1:
        a1, alleles, _ = em_results[0]
        pair_count = get_count(a1)
        return [process_allele(alleles[0], 3)], pair_count

    # Group indices sharing the same 2-field allele
    grouped_indices = defaultdict(set)
    for idx, alleles, _ in em_results:
        grouped_indices[process_allele(alleles[0], 2)].add(idx)

    grouped_indices = [tuple(v) for v in grouped_indices.values()]

    explained_reads = dict()
    if len(grouped_indices) > 1:
        for a1, a2 in combinations(grouped_indices, 2):
            explained_reads[(a1, a2)] = get_pair_count(a1, a2) / gene_count
    else:
        # Only one group: pair its two lowest indices
        a1, a2 = sorted(grouped_indices[0])[:2]
        pair_count = get_pair_count(a1, a2)
        explained_reads[((a1, ), (a2, ))] = pair_count / gene_count

    # Print information
    log.info('\n[genotype] Pairs by % explained reads:')
    log.info('\t\t{: <28}    {: >7}\t'.format('allele pair', 'explained'))
    for (a1, a2), count in sorted(explained_reads.items(),
                                  key=lambda x: x[1],
                                  reverse=True):
        alleles = ', '.join([
            process_allele(allele_idx[a1[0]][0], 3),
            process_allele(allele_idx[a2[0]][0], 3)
        ])
        log.info('\t\t{: <28}    {: >9.2f}%\t'.format(alleles, count * 100))

    max_count = max(explained_reads.values())
    top_by_reads = {
        pair: count
        for pair, count in explained_reads.items() if count == max_count
    }

    # If more than one pair has the same number of explained reads
    # use allele frequency priors to break the tie
    pair_prior = dict()
    if len(top_by_reads) > 1 and population:
        for a1, a2 in top_by_reads:
            allele1 = process_allele(allele_idx[a1[0]][0], 2)
            allele2 = process_allele(allele_idx[a2[0]][0], 2)
            if allele1 not in prior or allele2 not in prior:
                continue

            pair_prior[(a1, a2)] = (prior[allele1][population] *
                                    prior[allele2][population])

    if pair_prior:
        # Among the pairs with the greatest prior, take the smallest key
        max_prior = max(pair_prior.values())
        a1, a2 = min(pair for pair, value in pair_prior.items()
                     if value >= max_prior)
    else:
        # No tie, no population, or no tied pair with priors for both
        # alleles: keep the first top pair instead of failing on an empty
        # max()
        a1, a2 = next(iter(top_by_reads))

    pair_count = get_pair_count(a1, a2)
    a1_count, a2_count = get_nonshared_count(a1, a2)

    a1 = process_allele(allele_idx[sorted(a1)[0]][0], 3)
    a2 = process_allele(allele_idx[sorted(a2)[0]][0], 3)

    # Zygosity check based on nonshared counts
    log.info('\n[genotype] Checking zygosity')
    if a1_count == a2_count == 0:
        log.info('[genotype] Unable to distinguish ' +
                 'between minor and major alleles')
        genotype = [a1, a2]
    elif a1_count == 0:
        # Only the other allele is reported, so this is a homozygous call
        log.info('[genotype] Likely homozygous: minor allele has no ' +
                 'nonshared reads')
        genotype = [a2]
    elif a2_count == 0:
        log.info('[genotype] Likely homozygous: minor allele has no ' +
                 'nonshared reads')
        genotype = [a1]
    else:
        ratio = min(a1_count / a2_count, a2_count / a1_count)
        if ratio < zygosity_threshold:
            log.info('[genotype] Likely homozygous: minor/major ' +
                     'nonshared count {:.2f}'.format(ratio))
            genotype = [a1] if a1_count > a2_count else [a2]
        else:
            log.info('[genotype] Likely heterozygous: minor/major ' +
                     'nonshared count {:.2f}'.format(ratio))
            genotype = [a1, a2]

    return genotype, pair_count
Пример #8
0
def expectation_maximization(eqs, lengths, allele_idx, population, prior,
                             tolerance, max_iterations, drop_iterations,
                             drop_threshold):
    '''Quantifies allele transcript abundance. Based on the methods
       used in HISAT-genotype (http://dx.doi.org/10.1101/266197).

    Arguments:
        eqs: iterable of (allele indices, read count) compatibility classes
        lengths: dict of index -> length used to normalize counts
        allele_idx: dict of index -> list of alleles
        population, prior: when population is truthy, initial counts are
            divided in proportion to prior[allele][population]
        tolerance: convergence threshold on the change between iterations
        max_iterations: upper bound on the iteration counter
        drop_iterations: iteration from which low-support alleles are
            dropped
        drop_threshold: proportion of the greatest abundance below which an
            allele is dropped

    Returns a dict of index -> abundance for the alleles that remain.
    '''

    # Divides raw counts between alleles for the first iteration
    # of transcript quantification
    def initial_abundances(eqs, lengths, population):

        counts = defaultdict(float)
        undivided_counts = defaultdict(float)

        # Divides counts equally between alleles in the same compatibility class
        def divide_equally(alleles, count):
            n_alleles = len(alleles)
            for allele in alleles:
                counts[allele] += count / n_alleles

        # Divides counts proportionally to allele frequency; alleles without
        # a prior read as 0.0 from the defaultdict and receive no reads
        def divide_prior(alleles, count, allele_prob, total_prob):
            for allele in alleles:
                counts[allele] += count * (allele_prob[allele] / total_prob)

        for alleles, count in eqs:

            allele_prior = defaultdict(float)
            for idx in alleles:
                undivided_counts[idx] += count
                allele = process_allele(allele_idx[idx][0], 2)
                if population and allele in prior:
                    allele_prior[idx] = prior[allele][population]

            # The total is zero when no allele has a prior or when every
            # known prior is zero; dividing by it would raise
            # ZeroDivisionError, so fall back to an equal split
            total_prob = sum(allele_prior.values())
            if total_prob != 0:
                divide_prior(alleles, count, allele_prior, total_prob)
            else:
                divide_equally(alleles, count)

        return counts_to_abundances(counts), undivided_counts

    # Normalizes counts by allele length and convert to abundances
    def counts_to_abundances(counts):
        abundances = defaultdict(float)

        for allele, count in counts.items():
            length = lengths[allele]
            abundances[allele] = count / length

        total_abundance = sum(abundances.values())

        for allele, abundance in abundances.items():
            abundances[allele] = abundance / total_abundance

        return abundances

    # Redistribute counts between alleles in the same compatibility
    # class based on their overall abundance
    def update_abundances(eqs, abundances):
        counts = defaultdict(float)

        for alleles, count in eqs:
            # Alleles that were dropped no longer take part
            alleles = [allele for allele in alleles if allele in abundances]
            total_abundance = sum([abundances[allele] for allele in alleles])

            if total_abundance == 0:
                continue

            for allele in alleles:
                counts[allele] += count * (abundances[allele] /
                                           total_abundance)

        return counts_to_abundances(counts)

    # Drop low support alleles after a specified number of iterations if their
    # abundance is less than a specified proportion of the greatest abundance
    def drop_alleles(eqs, abundances, drop_iterations, drop_threshold,
                     iterations, converged):
        # The first iteration only removes alleles without any abundance
        if iterations == 1:
            abundances = {
                allele: abundance
                for allele, abundance in abundances.items() if abundance > 0.0
            }

        elif iterations >= drop_iterations or converged:
            threshold = drop_threshold * max(abundances.values())
            abundances = {
                allele: abundance
                for allele, abundance in abundances.items()
                if abundance >= threshold
            }
        return abundances, eqs

    # Compute square root of sum of squares
    def SRSS(theta):
        square_sum = 0.0
        for i in theta:
            square_sum += i**2
        return math.sqrt(square_sum)

    # Check if sum difference between two iterations is below tolerance
    def check_convergence(theta0, theta_prime):
        diff = [theta_prime[allele] - theta0[allele] for allele in theta0]
        residual_error = SRSS(diff)
        return residual_error < tolerance

    converged = False
    iterations = 1

    theta0, undivided_counts = initial_abundances(eqs, lengths, population)

    log.info('[genotype] Top 10 alleles by undivided read count:')
    log.info('\t\t{: <20}    {: >10}\t'.format('allele', 'read count'))

    for idx, count in sorted(undivided_counts.items(),
                             key=lambda x: x[1],
                             reverse=True)[:10]:

        log.info('\t\t{: <20}    {: >10.0f}\t'.format(
            process_allele(allele_idx[idx][0], 3), count))

    log.info('\n[genotype] Quantifying allele transcript abundance')

    # SQUAREM - accelerated EM
    # R. Varadhan & C. Roland (doi: 10.1111/j.1467-9469.2007.00585.x)
    # Used by HISAT-genotype, originally used by Sailfish
    while iterations < max_iterations and not converged:
        # Get next two steps
        theta1 = update_abundances(eqs, theta0)
        theta2 = update_abundances(eqs, theta1)
        theta_prime = defaultdict(float)

        r = dict()
        v = dict()
        sum_r = 0.0
        sum_v = 0.0

        # Compute r and v
        for allele in theta1:
            r[allele] = theta1[allele] - theta0[allele]
            v[allele] = (theta2[allele] - theta1[allele]) - r[allele]

        srss_r = SRSS(r.values())
        srss_v = SRSS(v.values())

        if srss_v != 0:
            # Compute step length
            alpha = -(srss_r / srss_v)
            for allele in r:
                value =   theta0[allele] \
                        - 2*alpha*r[allele] \
                        + (alpha**2)*v[allele]

                theta_prime[allele] = value

            step_min = min(theta_prime.values())
            step_max = max(theta_prime.values())

            # Adjust step rather than kicking out alleles with a negative result
            if step_min < 0:
                # Rescale to the range 0..1, then renormalize to sum to one
                theta_prime = {
                    allele: (value - step_min) / (step_max - step_min)
                    for allele, value in theta_prime.items()
                }

                total = sum(theta_prime.values())

                theta_prime = {
                    allele: value / total
                    for allele, value in theta_prime.items()
                }

            # Update abundances with given the new proportions
            theta_prime = update_abundances(eqs, theta_prime)

        else:
            # Two identical successive steps: nothing to accelerate
            theta_prime = theta1

        converged = check_convergence(theta0, theta_prime)

        theta0, eqs = drop_alleles(eqs, theta_prime, drop_iterations,
                                   drop_threshold, iterations, converged)
        iterations += 1

    # The loop also ends when max_iterations is reached, which must not be
    # reported as convergence
    if converged:
        log.info(f'[genotype] EM converged after {iterations} iterations')
    else:
        log.info(f'[genotype] EM stopped after {iterations} iterations ' +
                 'without converging')

    return theta0