def create_maf_distribution(
    seqs, distrib_fhand=None, plot_fhand=None, summary_fhand=None, groups=None, group_kind=None
):
    """Build the distribution of the maf over every snv of the given seqs.

    The maf of each "snv" feature is obtained with calculate_maf_frequency
    (restricted to groups/group_kind when given); falsy results are skipped.
    Nothing is written when no value was collected.  The reference allele is
    not taken into account.
    """
    if groups and group_kind:
        title = "maf (%s: %s)" % (group_kind, ",".join(groups))
    else:
        title = "maf"

    values = CachedArray("f")
    # Flatten the seq -> snv nesting into a single lazy stream of features.
    all_snvs = (feat for seq in seqs for feat in seq.get_features("snv"))
    for feat in all_snvs:
        frequency = calculate_maf_frequency(feat, groups=groups, group_kind=group_kind)
        if frequency:
            values.append(frequency)

    # Without data there is no distribution to create.
    if not list(values):
        return

    create_distribution(
        values,
        labels={"title": title},
        distrib_fhand=distrib_fhand,
        plot_fhand=plot_fhand,
        summary_fhand=summary_fhand,
        bins=None,
        range_=None,
        calculate_freqs=False,
        remove_outliers=False,
    )
def create_het_distribution(
    seqs, distrib_fhand=None, plot_fhand=None, summary_fhand=None, group_kind=None, groups=None, ploidy=2
):
    """Build the distribution of the heterozygosity over every snv of seqs.

    When no group_kind is requested and the snv already carries a
    "heterozygosity" qualifier, that stored value is used; otherwise the
    value is computed with calculate_heterozygosity.  None values are
    skipped and nothing is written when no value was collected.  The
    reference allele is not taken into account.
    """
    if groups and group_kind:
        title = "heterozygosity (%s: %s)" % (group_kind, ",".join(groups))
    else:
        title = "heterozygosity"

    values = CachedArray("f")
    # Flatten the seq -> snv nesting into a single lazy stream of features.
    all_snvs = (feat for seq in seqs for feat in seq.get_features("snv"))
    for feat in all_snvs:
        if group_kind or "heterozygosity" not in feat.qualifiers:
            # A stored qualifier is only valid for the ungrouped case.
            value = calculate_heterozygosity(feat, ploidy, group_kind=group_kind, groups=groups)
        else:
            value = feat.qualifiers["heterozygosity"]
        if value is not None:
            values.append(value)

    # Without data there is no distribution to create.
    if not list(values):
        return

    create_distribution(
        values,
        labels={"title": title},
        distrib_fhand=distrib_fhand,
        plot_fhand=plot_fhand,
        summary_fhand=summary_fhand,
        bins=None,
        range_=None,
        calculate_freqs=False,
        remove_outliers=False,
    )
def create_pic_distribution(
    seqs, distrib_fhand=None, plot_fhand=None, summary_fhand=None, read_groups=None, group_kind=None, groups=None
):
    """Build the distribution of the pic over every snv of the given seqs.

    When no group_kind is requested and the snv already carries a "pic"
    qualifier, that stored value is used; otherwise the value is computed
    with calculate_pic.  None values are skipped and nothing is written when
    no value was collected.  The reference allele is not taken into account.

    read_groups is accepted but not used by this function.
    """
    if groups and group_kind:
        title = "pic (%s: %s)" % (group_kind, ",".join(groups))
    else:
        title = "pic"

    values = CachedArray("f")
    # Flatten the seq -> snv nesting into a single lazy stream of features.
    all_snvs = (feat for seq in seqs for feat in seq.get_features("snv"))
    for feat in all_snvs:
        if group_kind or "pic" not in feat.qualifiers:
            # A stored qualifier is only valid for the ungrouped case.
            value = calculate_pic(feat, group_kind=group_kind, groups=groups)
        else:
            value = feat.qualifiers["pic"]
        if value is not None:
            values.append(value)

    # Without data there is no distribution to create.
    if not list(values):
        return

    create_distribution(
        values,
        labels={"title": title},
        distrib_fhand=distrib_fhand,
        plot_fhand=plot_fhand,
        summary_fhand=summary_fhand,
        bins=None,
        range_=None,
        calculate_freqs=False,
        remove_outliers=False,
    )
def test_basic_distribution(self):
    'It checks the summary written for a small list of integers'
    values = CachedArray(typecode='I')
    values.extend([1, 2, 3, 4, 5, 6, 7, 8, 9, 101, 2, 3, 4, 5, 6, 7, 8, 9])

    distrib_out = StringIO()
    summary_out = StringIO()
    create_distribution(values, distrib_fhand=distrib_out,
                        summary_fhand=summary_out)

    # NOTE(review): the expected text is kept exactly as it appears in this
    # file; confirm that its whitespace matches the layout of the summary
    # that create_distribution really writes.
    expected = '''Statistics for histogram ------------------------- minimum: 1 maximum: 101 average: 10.5556 variance: 486.9136 sum: 190 items: 18'''
    assert expected in summary_out.getvalue()
def main():
    '''Script entry point: it plots the distribution of the blast scores.

    With do_incompat both the 'similarity' and the 'd_incompatibility'
    scores are collected and shown as a 2D histogram image; otherwise only
    the similarity distribution is computed and drawn as a scatter plot
    into the output file.
    '''
    infhand, outfhand, do_incompat, low_memory = set_parameters()
    bins = 20
    range_ = (95, 100)

    blasts = BlastParser(infhand)

    # Which scores are extracted from every alignment result
    if do_incompat:
        score_keys = ['similarity', 'd_incompatibility']
    else:
        score_keys = ['similarity']
    scores = alignment_results_scores(blasts, score_keys)

    if do_incompat:
        # Two scores: 2D histogram shown on screen as an interpolated image
        histogram = numpy.histogram2d(scores[0], scores[1], bins=bins)[0]
        pylab.figure()
        pylab.subplot(111)
        image = pylab.imshow(histogram)
        image.set_interpolation('bilinear')
        pylab.show()
        return

    # One score: plain distribution drawn into the output file
    result = create_distribution(scores, range=range_, bins=bins,
                                 low_memory=low_memory)
    counts = result['distrib'][0]
    edges = result['bin_edges'][0]
    draw_scatter(x_axe=edges[:-1], y_axe=counts, fhand=outfhand)
def bam_distribs(bam_fhand, kind, basename=None, range_=None, grouping=None,
                 sample_size=None, summary_fhand=None, labels=None,
                 plot_file_format='svg'):
    '''It makes the distributions of a bam statistic, one per read group item.

    The statistic is chosen with kind ('coverage', 'mapq' or
    'edit_distance') and the values are split by one of the readgroup items:
    platform, sample or library.

    bam_fhand -- open bam file; only its name attribute is used here.
    kind -- key of the statistic to calculate; an unknown key raises a
        KeyError.
    basename -- when given, a '<basename>.<kind>_<group>.dat' file and a
        '<basename>.<kind>_<group>.<plot_file_format>' plot are written for
        every group.  When None no file handles are passed on.
    range_ -- passed untouched to create_distribution.
    grouping -- read group tag used to split the values.  When None, 'PL' is
        used if the read groups come from more than one platform and 'SM'
        otherwise.
    sample_size -- when not None the bam is first sampled with sample_bam
        into a temporary bam and the statistics are done on that sample.
    summary_fhand -- passed untouched to create_distribution.
    labels -- accepted for interface compatibility only: the labels are
        always built from the predefined ones for the given kind, so a value
        passed here is not used.
    plot_file_format -- extension used for the plot files.

    It returns a dict keyed by (grouping name, group name) with whatever
    create_distribution returns for each group.
    '''
    value_calculator = {'coverage': _get_bam_coverage,
                        'mapq': _get_bam_mapping_quality,
                        'edit_distance': _get_bam_edit_distance}
    plot_labels = {
        'coverage': {'title': "Coverage for %s %s",
                     'xlabel': 'Coverage',
                     'ylabel': 'Num. of positions',
                     'sum': None,
                     'items': 'total sequence length'},
        'mapq': {'title': "Mapping qualities for %s %s",
                 'xlabel': "mapping quality",
                 'ylabel': 'Num. of reads',
                 'sum': None,
                 'items': 'number reads in the sam file'},
        'edit_distance': {'title': "Edit distances for %s %s",
                          'xlabel': "edit distance",
                          'ylabel': 'Num. of reads',
                          'sum': None,
                          'items': 'number reads in the sam file'},
    }

    sampled_bam_fhand = None
    bam = None
    results = {}
    try:
        if sample_size is not None:
            sampled_bam_fhand = NamedTemporaryFile(suffix='.bam')
            sample_bam(bam_fhand, sampled_bam_fhand, sample_size)
            sample_fpath = sampled_bam_fhand.name
        else:
            sample_fpath = bam_fhand.name

        create_bam_index(bam_fpath=sample_fpath)
        bam = pysam.Samfile(sample_fpath, 'rb')
        rgs = get_read_group_info(bam)

        if grouping is None:
            platforms = set(rg['PL'] for rg in rgs.values())
            grouping = 'PL' if len(platforms) > 1 else 'SM'

        # The calculators need the raw read group tag, so the human readable
        # name is derived only after they have been called.
        item_values = value_calculator[kind](bam, rgs, grouping)
        grouping_name = {'PL': 'platform', 'SM': 'sample'}.get(grouping,
                                                                grouping)

        # Only the coverage distribution has its outliers removed
        remove_outliers = kind == 'coverage'

        for group_name, values in item_values.items():
            group_labels = copy.deepcopy(plot_labels[kind])
            group_labels['title'] = group_labels['title'] % (grouping_name,
                                                             group_name)

            distrib_fhand = None
            plot_fhand = None
            try:
                if basename is not None:
                    distrib_fhand = open('%s.%s_%s.dat' % (basename, kind,
                                                           group_name), 'w')
                    plot_fhand = open('%s.%s_%s.%s' % (basename, kind,
                                                       group_name,
                                                       plot_file_format), 'w')
                distrib = create_distribution(values, labels=group_labels,
                                              distrib_fhand=distrib_fhand,
                                              plot_fhand=plot_fhand,
                                              range_=range_,
                                              summary_fhand=summary_fhand,
                                              remove_outliers=remove_outliers)
            finally:
                # The files opened here belong to this function: close them
                # so the output is flushed and no descriptor is leaked.
                for fhand in (distrib_fhand, plot_fhand):
                    if fhand is not None:
                        fhand.close()
            results[(grouping_name, group_name)] = distrib
    finally:
        # Release the bam reader and the sampled temporary bam.  The bam_fhand
        # given by the caller is left open.
        if bam is not None:
            bam.close()
        if sampled_bam_fhand is not None:
            sampled_bam_fhand.close()
    return results