Exemplo n.º 1
0
 def addplot(oindexlist, ofracslist, n_seqs, fname, title):
     """Histogram the occurrence fractions and save the annotated plot.

     Fills a 30-bin histogram on [0, 1] with every entry of `ofracslist`,
     draws it, writes the cluster size (`n_seqs`) and the Fay-Wu H value
     (computed from `oindexlist` and `n_seqs`) onto the axes, then finishes
     the figure under `fname` with `title` and records `fname` in `fnames`.

     Closure: `self`, `plotdir`, `fnames` and `restrict_to_region` are taken
     from the enclosing scope, which is not part of this block.
     """
     freq_hist = Hist(30, 0., 1.)
     for fractions in ofracslist:
         freq_hist.fill(fractions)

     _fig, ax = self.plotting.mpl_init()
     freq_hist.mpl_plot(ax, remove_empty_bins=True)

     # Both annotations share the same x position and font settings.
     label_x = 0.65
     label_style = dict(fontsize=20, fontweight='bold')

     # The upper y limit is re-read for each label, exactly as before.
     size_y = 0.8 * ax.get_ylim()[1]
     ax.text(label_x, size_y, 'size: %d' % n_seqs, **label_style)

     h_y = 0.7 * ax.get_ylim()[1]
     h_value = utils.fay_wu_h(line=None,
                              restrict_to_region=restrict_to_region,
                              occurence_indices=oindexlist,
                              n_seqs=n_seqs)
     ax.text(label_x, h_y, 'h: %.2f' % h_value, **label_style)

     # Prefix the axis labels with the region name when one is set.
     if restrict_to_region is not None:
         regionstr = restrict_to_region + ' '
     else:
         regionstr = ''

     self.plotting.mpl_finish(ax,
                              plotdir,
                              fname,
                              title=title,
                              xlabel=regionstr + 'mutation frequency',
                              ylabel=regionstr + 'density of mutations',
                              xticks=[0, 1],
                              log='')
     self.addfname(fnames, fname)
def print_stuff(line):
    """Print a one-row summary of the cluster annotation `line`.

    Columns, in order: index of the current cluster in `sorted_clusters`,
    colored V/D/J gene names, number of unique ids, mean and median of
    `n_mutations`, mean of `mut_freqs`, `len(cluster) / n_total`,
    `cdr3_length / 3`, the translated naive CDR3 and `utils.fay_wu_h(line)`.

    Besides `line`, this reads the module-level names `cluster`,
    `sorted_clusters` and `n_total`; `cluster` is not a parameter, so the
    caller has to bind it (e.g. as a loop variable) before calling.
    """
    rank = sorted_clusters.index(cluster)

    # Only the naive CDR3 nucleotides are used; the iseq=0 sequence is ignored.
    # TODO (from the original author, unfinished): translate the mature CDR3
    # of every sequence so the clusters can be searched with seed sequences.
    naive_cdr3, _iseq0_cdr3 = utils.subset_sequences(
        line, iseq=0, restrict_to_region='cdr3')

    # Translate, then left-justify to 30 characters.
    cdr3_aa = '%-30s' % Seq(naive_cdr3).translate()

    # Clusters holding at least one id that contains '-ig' are shown in red.
    has_ig_id = any('-ig' in uid for uid in line['unique_ids'])
    if has_ig_id:
        cdr3_aa = utils.color('red', cdr3_aa, width=30)

    row_format = '%4s     %s %s %s %5d %5d %5d %7.3f   %8.4f     %2d   %s %4.2f'
    row_fields = (
        rank,
        utils.color_gene(line['v_gene'], width=15),
        utils.color_gene(line['d_gene'], width=15),
        utils.color_gene(line['j_gene'], width=10),
        len(line['unique_ids']),
        numpy.mean(line['n_mutations']),
        numpy.median(line['n_mutations']),
        numpy.mean(line['mut_freqs']),
        # NOTE(review): len() of the global `cluster` key, not of
        # line['unique_ids'] -- confirm the two always agree.
        float(len(cluster)) / n_total,
        (line['cdr3_length'] / 3),
        cdr3_aa,
        utils.fay_wu_h(line, debug=False),
    )
    print(row_format % row_fields)
Exemplo n.º 3
0
def print_stuff(line):
    """Print a detailed report for `line` if its naive CDR3 matches --cdr3.

    Nothing is printed unless `args.cdr3` occurs in the (padded, possibly
    colored) translated naive CDR3 string.  For a match this prints a
    two-line column header, the summary row, the length of the naive
    sequence, the unique ids (via `getkey`), an empty line and finally the
    result of `utils.print_reco_event(line)`.

    Besides `line`, this reads the module-level names `cluster`,
    `sorted_clusters`, `n_total` and `args`.
    """
    rank = sorted_clusters.index(cluster)

    # Per the original author: subset_sequences gives the CDR3 nucleotides of
    # the naive sequence and of the first mutated sequence (iseq=0), where the
    # CDR3 runs from the first base of the cysteine through the last base of
    # the tryptophan.  Only the naive one is used here.
    # TODO (from the original author, unfinished): translate the mature CDR3
    # of every sequence so the clusters can be searched with seed sequences.
    naive_cdr3, _iseq0_cdr3 = utils.subset_sequences(
        line, iseq=0, restrict_to_region='cdr3')

    cdr3_aa = '%-30s' % Seq(naive_cdr3).translate()

    # If a cluster contains one of our seed seqs ('-ig' in the id), color red.
    contains_seed = any('-ig' in uid for uid in line['unique_ids'])
    if contains_seed:
        cdr3_aa = utils.color('red', cdr3_aa, width=30)

    # Only report clusters whose naive CDR3 contains the --cdr3 argument.
    if args.cdr3 not in cdr3_aa:
        return

    print('index    genes                                        size    n muts    SHM     rep frac     CDR3                                FayWuH')
    print('                                                            mean  med                        len  seq')

    row_format = '%4s     %s %s %s %5d %5d %5d %7.3f   %8.4f     %2d   %s %4.2f'
    row_fields = (
        rank,
        utils.color_gene(line['v_gene'], width=15),
        utils.color_gene(line['d_gene'], width=15),
        utils.color_gene(line['j_gene'], width=10),
        len(line['unique_ids']),
        numpy.mean(line['n_mutations']),
        numpy.median(line['n_mutations']),
        numpy.mean(line['mut_freqs']),
        # NOTE(review): len() of the global `cluster` key, not of
        # line['unique_ids'] -- confirm the two always agree.
        float(len(cluster)) / n_total,
        (line['cdr3_length'] / 3),
        cdr3_aa,
        utils.fay_wu_h(line, debug=False),
    )
    print(row_format % row_fields)

    print('%d length of naive seq' % len(line['naive_seq']))
    # Two spaces after the colon: one from the label, one print separator.
    print('unique_ids:  %s' % (getkey(line['unique_ids']),))
    print('')
    # NOTE(review): this prints the *return value* of print_reco_event; if
    # that helper already writes to stdout and returns None, a stray "None"
    # line is emitted here -- confirm against utils.
    print(utils.print_reco_event(line))
def _rank_points(rank):
    """Map a 0-based rank to points: <25 -> 4, <=50 -> 3, <=75 -> 2, <=100 -> 1, else 0."""
    if rank < 25:
        return 4
    if rank <= 50:
        return 3
    if rank <= 75:
        return 2
    if rank <= 100:
        return 1
    return 0


def _fay_wu_h_points(h_value):
    """Map a Fay-Wu H value to points: <=-20 -> 4, <=-10 -> 3, <=0 -> 2, <=10 -> 1, else 0."""
    if h_value <= -20:
        return 4
    if h_value <= -10:
        return 3
    if h_value <= 0:
        return 2
    if h_value <= 10:
        return 1
    return 0


def print_stuff(line):
    """Score the cluster annotation `line` and print a one-row summary.

    The score is the sum of four components:
      * size rank  -- position of the current cluster in `sorted_clusters`;
      * SHM rank   -- position of the current cluster in `shm_clusters`;
      * SFS        -- points derived from `utils.fay_wu_h(line)`;
      * bnAb genes -- 4 points if the V gene (text before the first '*') is
        found in any of `cd4bs_genes`, `glycan_genes`, `bridging_genes` or
        `mper_genes`.

    The printed row is the score followed by: size rank, colored V/D/J genes,
    number of unique ids, mean and median of `n_mutations`, mean of
    `mut_freqs`, `len(cluster) / n_total`, `cdr3_length / 3`, the translated
    naive CDR3 and the Fay-Wu H value.

    Besides `line`, this reads the module-level names `cluster`,
    `sorted_clusters`, `shm_clusters`, `n_total` and the four gene
    collections; `cluster` must be bound by the caller.

    Raises:
        ValueError: from `list.index` if the current cluster is missing from
            `sorted_clusters` or `shm_clusters`.
    """
    cluster_index = sorted_clusters.index(cluster)
    shm_index = shm_clusters.index(cluster)

    # Only the naive CDR3 nucleotides are used; the iseq=0 sequence is ignored.
    # TODO (from the original author, unfinished): translate the mature CDR3
    # of every sequence so the clusters can be searched with seed sequences.
    naive_cdr3, _iseq0_cdr3 = utils.subset_sequences(
        line, iseq=0, restrict_to_region='cdr3')
    cdr3_aa = '%-30s' % Seq(naive_cdr3).translate()
    if any('-ig' in uid for uid in line['unique_ids']):
        cdr3_aa = utils.color('red', cdr3_aa, width=30)

    # Compute Fay-Wu H once; it feeds both the score and the printed row.
    fay_wu_h = utils.fay_wu_h(line, debug=False)

    intscore = 0
    intscore += _rank_points(cluster_index)  # cluster size
    intscore += _rank_points(shm_index)      # SHM
    intscore += _fay_wu_h_points(fay_wu_h)   # SFS

    # bnAb gene usage.  `a or b or c or d` evaluates to the first truthy
    # operand only, so every collection has to be tested explicitly.
    # Beware: this does not consider the CDR3 length of bnAb VH genes.
    v_gene_no_allele = line['v_gene'].split('*')[0]
    bnab_gene_sets = (cd4bs_genes, glycan_genes, bridging_genes, mper_genes)
    if any(v_gene_no_allele in genes for genes in bnab_gene_sets):
        intscore += 4

    print('%4s %4s     %s %s %s %5d %5d %5d %7.3f   %8.4f     %2d   %s %4.2f' % (
        intscore,
        cluster_index,
        utils.color_gene(line['v_gene'], width=15),
        utils.color_gene(line['d_gene'], width=15),
        utils.color_gene(line['j_gene'], width=10),
        len(line['unique_ids']),
        numpy.mean(line['n_mutations']),
        numpy.median(line['n_mutations']),
        numpy.mean(line['mut_freqs']),
        # NOTE(review): len() of the global `cluster` key, not of
        # line['unique_ids'] -- confirm the two always agree.
        float(len(cluster)) / n_total,
        (line['cdr3_length'] / 3),
        cdr3_aa,
        fay_wu_h,
    ))
# Rank every cluster by size (number of unique ids), largest first.
sorted_clusters = sorted(annotations,
                         key=lambda q: len(annotations[q]['unique_ids']),
                         reverse=True)

#### sorted_clusters = [c for c in sorted_clusters if utils.is_functional(annotations[c])] # checks if the cluster contains ANY non-functional sequences

# Total size of the repertoire.
# NOTE(review): this sums len() of each key of `annotations`, whereas the
# ranking above uses len(annotations[q]['unique_ids']) -- confirm that the
# keys are sequences of ids (not joined strings) so the two agree.
n_total = sum(len(key) for key in sorted_clusters)

# Further criteria are only evaluated within the 100 biggest clusters.
biggest_clusters = sorted_clusters[:100]
shm_clusters = sorted(biggest_clusters,
                      key=lambda q: numpy.mean(annotations[q]['mut_freqs']),
                      reverse=True)  # rank by SHM, most mutated first

# Fay-Wu H per cluster, computed once and reused for ranking and statistics.
cluster_sfses = {}
for cluster in biggest_clusters:
    cluster_sfses[cluster] = utils.fay_wu_h(annotations[cluster], debug=False)
sfs_clusters = sorted(biggest_clusters,
                      key=lambda q: cluster_sfses[q])  # rank by SFS score, ascending

# Summary statistics of the Fay-Wu H distribution.
sfs_values = list(cluster_sfses.values())
print(numpy.mean(sfs_values))
print(numpy.std(sfs_values))
for percentile in (5, 10, 50, 80, 90):
    print(numpy.percentile(sfs_values, percentile))

# TODO: create a function that gives the score - it should call a subfunction
# for each metric (i.e. percentile); the superfunction can then weight the
# metrics.  Give 0 points to anyone not in the top 30 percentile.

# The loop variable below MUST stay named `cluster`: print_stuff() reads it as
# a module-level global.

# cluster size: print the biggest clusters
print('\x1b[1;32;40m' + '  printing the largest clusters' + '\x1b[0m')
for cluster in sorted_clusters[:5]:
    print_stuff(annotations[cluster])

# high mean %SHM: print most mutated clusters from 100 biggest clusters
mutclust = int(args.nclust)
print('\x1b[1;32;40m' + '  printing the most mutated clusters (within 100 biggest)' + '\x1b[0m')
for cluster in shm_clusters[:mutclust]:
    print_stuff(annotations[cluster])