def sample_clonesize_stat(sample, samdir, freqs=[], numtop=50, args=None):
    """Summarise how one sample's clones spread across frequency bins.

    Bin ``i`` covers ``stat.freqs[i] <= freq < stat.freqs[i + 1]``; the last
    bin has no upper bound.  A clone whose frequency is below the first
    lower bound is not counted in any bin.

    Args:
        sample: passed to ``CloneSizeStat.set_sample_info``.
        samdir: passed to ``libsample.sample_all_clones`` to load the clones.
        freqs: lower bounds of the bins.  A sorted copy is handed to
            ``CloneSizeStat``, so the caller's list is never modified.
        numtop: how many of the highest-frequency clones have their
            frequency recorded in ``stat.topfreqs``.
        args: optional sequence; when non-empty, its first item replaces
            ``numtop``.

    Returns:
        The populated ``CloneSizeStat`` object.
    """
    stat = CloneSizeStat(sorted(freqs))
    stat.set_sample_info(sample)
    if args:
        numtop = args[0]

    # Largest clones first, so the first <numtop> visited are the top ones.
    by_freq_desc = list(libsample.sample_all_clones(samdir))
    by_freq_desc.sort(key=lambda cl: cl.freq, reverse=True)

    unbounded = float('inf')
    for rank, clone in enumerate(by_freq_desc):
        if rank < numtop:
            stat.topfreqs.append(clone.freq)

        # Every bin is tested (no early exit), one clone at a time.
        for bin_idx, lower in enumerate(stat.freqs):
            has_next = bin_idx + 1 < len(stat.freqs)
            upper = stat.freqs[bin_idx + 1] if has_next else unbounded
            if lower <= clone.freq < upper:
                stat.numclones[bin_idx] += 1
                stat.counts[bin_idx] += clone.count

    # Rescale the raw tallies against the sample totals held on <stat>.
    # NOTE(review): get_pc presumably returns tally / total as a share;
    # confirm the exact scale in libcommon.
    stat.numclones = [libcommon.get_pc(tally, stat.numclone)
                      for tally in stat.numclones]
    stat.counts = [libcommon.get_pc(tally, stat.size)
                   for tally in stat.counts]

    # Cumulative versions of the three series.
    # NOTE(review): the meaning of the extra True argument used for
    # topfreqs is defined in libcommon.get_cumulative -- confirm there.
    stat.numclones_cumul = libcommon.get_cumulative(stat.numclones)
    stat.counts_cumul = libcommon.get_cumulative(stat.counts)
    stat.topfreqs_cumul = libcommon.get_cumulative(stat.topfreqs, True)
    return stat
def sample_lendist_stat(sample, samdir, args=None):
    """Compute the amino-acid length distribution of one sample's clones.

    Two distributions are built, both keyed by ``len(clone.aa)``:

    * ``len2clones`` -- fraction of the counted clones having that length.
      Only clones whose ``vdel`` is not None are counted.
    * ``len2reads`` -- sum of ``clone.freq`` over clones of that length.

    Clones with an empty or missing ``aa`` are skipped entirely.

    Args:
        sample: passed to ``LenDistStat.set_sample_info``.
        samdir: passed to ``libsample.sample_all_clones`` to load the clones.
        args: unused here.

    Returns:
        A ``LenDistStat`` populated through ``set_stats``.
    """
    len2clones = {}
    len2reads = {}
    totalclone = 0
    for clone in libsample.sample_all_clones(samdir):
        if not clone.aa:
            continue
        aalen = len(clone.aa)
        # NOTE(review): the nesting below was reconstructed from a
        # whitespace-collapsed source.  Confirm that only the clone tally
        # (and not the read tally) is restricted to clones with a vdel.
        if clone.vdel is not None:
            totalclone += 1
            len2clones[aalen] = len2clones.get(aalen, 0) + 1
        len2reads[aalen] = len2reads.get(aalen, 0) + clone.freq

    # Convert the number of clones into a fraction of the counted clones.
    # items() is snapshotted into a list so this works on both Python 2
    # and Python 3 (iteritems() no longer exists on Python 3).
    for aalen, numclone in list(len2clones.items()):
        len2clones[aalen] = float(numclone) / totalclone

    stat = LenDistStat()
    stat.set_sample_info(sample)
    stat.set_stats(len2clones, len2reads)
    return stat
def sample_geneusage_stat(sample, samdir, args=None):
    """Compute V/D/J gene usage and VJ/DJ combination usage for one sample.

    For each usage type ('v', 'd', 'j', 'vj', 'dj') two mappings are built:

    * clones: number of clones carrying the gene (or gene pair), divided
      by ``sample.numclone``.
    * reads: sum of ``clone.freq`` over those clones.

    Single genes are keyed by ``clone[<type>]``; combinations are keyed by
    the two gene names joined with "|" (for example ``clone['v']`` and
    ``clone['j']`` for 'vj').

    Args:
        sample: passed to ``GeneUsageStat.set_sample_info``; its
            ``numclone`` attribute is the denominator for clone fractions.
        samdir: passed to ``libsample.sample_all_clones`` to load the clones.
        args: unused here.

    Returns:
        A ``GeneUsageStat`` populated through ``set_stats``.

    Raises:
        ZeroDivisionError: if any gene was tallied while
            ``sample.numclone`` is 0.
    """
    single_types = ('v', 'd', 'j')
    combo_types = ('vj', 'dj')

    # Initialize one empty usage table per type.
    type2gene2clones = {}
    type2gene2reads = {}
    for usage_type in single_types + combo_types:
        type2gene2clones[usage_type] = {}
        type2gene2reads[usage_type] = {}

    # Tally usage.
    for clone in libsample.sample_all_clones(samdir):
        freq = clone.freq
        # Work out this clone's key under every usage type, then tally
        # them all through the same code path.
        usage_keys = [(gtype, clone[gtype]) for gtype in single_types]
        for combo in combo_types:
            pair = "|".join([clone[combo[0]], clone[combo[1]]])
            usage_keys.append((combo, pair))

        for usage_type, key in usage_keys:
            gene2clones = type2gene2clones[usage_type]
            gene2reads = type2gene2reads[usage_type]
            gene2clones[key] = gene2clones.get(key, 0.0) + 1.0
            gene2reads[key] = gene2reads.get(key, 0) + freq

    # Convert the number of clones into a fraction of the sample's clones.
    # values()/items() are used instead of the Python-2-only iteritems();
    # the items are snapshotted because the dict is updated while looping.
    for gene2clones in type2gene2clones.values():
        for gene, numclone in list(gene2clones.items()):
            gene2clones[gene] = float(numclone) / sample.numclone

    # Build the stat object.
    stat = GeneUsageStat()
    stat.set_sample_info(sample)
    stat.set_stats(type2gene2clones, type2gene2reads)
    return stat