def print_b6_file_stats(self):
    """Write summary statistics of the B6 matrix to standard output.

    The matrix is loaded first (``self.load_b6_matrix()``) when it is still
    empty. The report contains the total number of hits followed by one
    row of mean / std / min / max for each numeric column.
    """
    if not self.matrix:
        self.load_b6_matrix()

    # sys.stdout.write is used for every line (instead of the Python 2-only
    # ``print`` statement) so the output is identical on Python 2 and 3.
    write = sys.stdout.write

    def tabular(label, value):
        # Dots pad the label so that all values start in the same column.
        write('%s %s: %s\n' % (label, '.' * (20 - len(label)), value))

    def info(column):
        values = self.matrix[column]
        return '%-10.2f %-10.2f %-10.2f %-10.2f' % (numpy.mean(values),
                                                    numpy.std(values),
                                                    numpy.min(values),
                                                    numpy.max(values))

    write('\n')
    tabular('Total Hits', pretty_print(len(self.matrix[IDENTITY])))
    write('\n')
    write(' mean std min max\n')
    write('\n')
    tabular('Identity', info(IDENTITY))
    tabular('Alignment Length', info(ALIGNMENT_LENGTH))
    tabular('Mismatches', info(MISMATCHES))
    tabular('Gaps', info(GAPS))
    tabular('Query Start', info(Q_START))
    tabular('Query End', info(Q_END))
    tabular('Target Start', info(S_START))
    tabular('Target End', info(S_END))
    tabular('E-Value', info(E_VALUE))
    tabular('Bit Score', info(BIT_SCORE))
    write('\n')
def get_oligo_reps_dict(html_dict, html_output_directory):
    """Collect per-oligo representative-sequence data for the HTML output.

    For every oligo in ``html_dict['oligos']`` the files that share the
    prefix ``<output_directory_for_reps>/<index>_<oligo>`` are processed:

    * ``_unique.png`` is copied into ``html_output_directory``;
    * ``_unique`` is read through ``u.SequenceSource`` while the reader's
      ``pos`` is <= 20, collecting plain sequences, decorated sequences and
      the frequency token taken from each defline;
    * ``_unique_entropy`` (tab separated ``column<TAB>entropy``) and
      ``_unique_color_per_column.cPickle`` are combined into one HTML
      ``<span>`` per alignment column;
    * ``_unique_BLAST.cPickle`` is loaded when it exists.

    Side effect: ``html_dict['blast_results_found']`` is set to ``True``
    as soon as BLAST results are found for an oligo.

    Returns:
        dict with the keys 'imgs', 'fancy_seqs', 'clear_seqs', 'frequency',
        'component_references' and 'blast_results', each mapping an oligo
        to its value.
    """
    oligos, rep_dir = html_dict['oligos'], html_dict['output_directory_for_reps']

    oligo_reps_dict = {
        'imgs': {},
        'fancy_seqs': {},
        'clear_seqs': {},
        'frequency': {},
        'component_references': {},
        'blast_results': {},
    }

    # The backslash before "column" is emitted literally, exactly as before;
    # it is written as "\\c" only to avoid an invalid escape sequence.
    span_template = ('<span style="background-color: %s;">'
                     '<a onmouseover="popup(\'\\column: %d<br />entropy: %.4f\', 100)" href="">|</a>'
                     '</span>')

    for i, oligo in enumerate(oligos):
        alignment_base_path = os.path.join(rep_dir, '%.5d_' % i + oligo)

        diversity_image_path = alignment_base_path + '_unique.png'
        diversity_image_dest = os.path.join(html_output_directory,
                                            os.path.basename(diversity_image_path))
        shutil.copy2(diversity_image_path, diversity_image_dest)
        oligo_reps_dict['imgs'][oligo] = os.path.basename(diversity_image_dest)

        clear_seqs = oligo_reps_dict['clear_seqs'][oligo] = []
        fancy_seqs = oligo_reps_dict['fancy_seqs'][oligo] = []
        frequencies = oligo_reps_dict['frequency'][oligo] = []

        uniques = u.SequenceSource(alignment_base_path + '_unique')
        try:
            while uniques.next() and uniques.pos <= 20:
                clear_seqs.append(uniques.seq)
                fancy_seqs.append(get_decorated_sequence(uniques.seq,
                                                         html_dict['entropy_components']))
                frequencies.append(pretty_print(uniques.id.split('|')[1].split(':')[1]))
        finally:
            # the reader used to be left open for every oligo
            uniques.close()

        entropy_values_per_column = [0] * html_dict['alignment_length']
        with open(alignment_base_path + '_unique_entropy') as entropy_file:
            for line in entropy_file:
                column, entropy = line.strip().split('\t')
                entropy_values_per_column[int(column)] = float(entropy)

        with open(alignment_base_path + '_unique_color_per_column.cPickle') as pickled:
            color_per_column = cPickle.load(pickled)

        oligo_reps_dict['component_references'][oligo] = ''.join(
            [span_template % (color_per_column[column], column, entropy_values_per_column[column])
             for column in range(0, html_dict['alignment_length'])])

        blast_results_path = alignment_base_path + '_unique_BLAST.cPickle'
        if os.path.exists(blast_results_path):
            html_dict['blast_results_found'] = True
            with open(blast_results_path) as pickled:
                oligo_reps_dict['blast_results'][oligo] = cPickle.load(pickled)
        else:
            oligo_reps_dict['blast_results'][oligo] = None

    return oligo_reps_dict
def print_b6_file_stats(self):
    """Print a table of basic statistics for the columns of the B6 matrix.

    Loads the matrix through ``self.load_b6_matrix()`` if it is an empty
    list, then reports the number of hits and, for each numeric column,
    its mean, standard deviation, minimum and maximum.
    """
    if self.matrix == []:
        self.load_b6_matrix()

    def emit(label, value):
        # label, dotted padding up to a fixed width, then the value
        padding = '.' * (20 - len(label))
        sys.stdout.write('%s %s: %s\n' % (label, padding, value))

    def stats_row(column):
        data = self.matrix[column]
        numbers = (numpy.mean(data), numpy.std(data), numpy.min(data), numpy.max(data))
        return '%-10.2f %-10.2f %-10.2f %-10.2f' % numbers

    print()
    emit('Total Hits', pretty_print(len(self.matrix[IDENTITY])))
    print()
    print(' mean std min max')
    print()

    rows = (
        ('Identity', IDENTITY),
        ('Alignment Length', ALIGNMENT_LENGTH),
        ('Mismatches', MISMATCHES),
        ('Gaps', GAPS),
        ('Query Start', Q_START),
        ('Query End', Q_END),
        ('Target Start', S_START),
        ('Target End', S_END),
        ('E-Value', E_VALUE),
        ('Bit Score', BIT_SCORE),
    )
    for label, column in rows:
        emit(label, stats_row(column))

    print()
def get_oligo_reps_dict(html_dict, html_output_directory):
    """Build the dictionary of representative-sequence data per oligo.

    Each oligo in ``html_dict['oligos']`` has a set of files whose names
    start with ``<output_directory_for_reps>/<index>_<oligo>``. This
    function copies the ``_unique.png`` image into
    ``html_output_directory`` and reads the ``_unique`` sequences (while
    the reader's ``pos`` is <= 20), the ``_unique_entropy`` table, the
    ``_unique_color_per_column.cPickle`` colors and, when present, the
    ``_unique_BLAST.cPickle`` results.

    Side effect: sets ``html_dict['blast_results_found'] = True`` when a
    BLAST result file exists for any oligo.

    Returns:
        dict keyed by 'imgs', 'fancy_seqs', 'clear_seqs', 'frequency',
        'component_references' and 'blast_results'; every value is a dict
        keyed by oligo.
    """

    def _load_pickle(path):
        # Close the handle deterministically instead of leaking it.
        with open(path) as handle:
            return cPickle.load(handle)

    oligos, rep_dir = html_dict['oligos'], html_dict['output_directory_for_reps']

    oligo_reps_dict = {}
    for key in ('imgs', 'fancy_seqs', 'clear_seqs', 'frequency',
                'component_references', 'blast_results'):
        oligo_reps_dict[key] = {}

    for i, oligo in enumerate(oligos):
        alignment_base_path = os.path.join(rep_dir, '%.5d_' % i + oligo)

        diversity_image_path = alignment_base_path + '_unique.png'
        diversity_image_dest = os.path.join(html_output_directory,
                                            os.path.basename(diversity_image_path))
        shutil.copy2(diversity_image_path, diversity_image_dest)
        oligo_reps_dict['imgs'][oligo] = os.path.basename(diversity_image_dest)

        oligo_reps_dict['fancy_seqs'][oligo] = []
        oligo_reps_dict['clear_seqs'][oligo] = []
        oligo_reps_dict['frequency'][oligo] = []

        uniques = u.SequenceSource(alignment_base_path + '_unique')
        try:
            while uniques.next() and uniques.pos <= 20:
                oligo_reps_dict['clear_seqs'][oligo].append(uniques.seq)
                oligo_reps_dict['fancy_seqs'][oligo].append(
                    get_decorated_sequence(uniques.seq, html_dict['entropy_components']))
                oligo_reps_dict['frequency'][oligo].append(
                    pretty_print(uniques.id.split('|')[1].split(':')[1]))
        finally:
            # previously the sequence source was never closed
            uniques.close()

        # columns missing from the entropy file keep an entropy of 0
        entropy_values_per_column = [0] * html_dict['alignment_length']
        with open(alignment_base_path + '_unique_entropy') as entropy_file:
            for line in entropy_file:
                column, entropy = line.strip().split('\t')
                entropy_values_per_column[int(column)] = float(entropy)

        color_per_column = _load_pickle(alignment_base_path + '_unique_color_per_column.cPickle')

        # "\\column" produces the same literal backslash the original
        # "\column" did, without relying on an invalid escape sequence.
        spans = []
        for column in range(0, html_dict['alignment_length']):
            spans.append(
                '<span style="background-color: %s;"><a onmouseover="popup(\'\\column: %d<br />entropy: %.4f\', 100)" href="">|</a></span>'
                % (color_per_column[column], column, entropy_values_per_column[column]))
        oligo_reps_dict['component_references'][oligo] = ''.join(spans)

        blast_results_path = alignment_base_path + '_unique_BLAST.cPickle'
        if os.path.exists(blast_results_path):
            html_dict['blast_results_found'] = True
            oligo_reps_dict['blast_results'][oligo] = _load_pickle(blast_results_path)
        else:
            oligo_reps_dict['blast_results'][oligo] = None

    return oligo_reps_dict
def load_b6_matrix(self):
    """Read every entry of the B6 source into ``self.matrix``.

    Twelve empty column lists are appended to ``self.matrix``; each entry
    line is then split on tabs and field ``n`` is converted with
    ``self.conversion[n]`` and appended to column ``n``. Progress is
    reported on stderr for the first entry and every 10000th one.

    Returns:
        True, once the source is exhausted.
    """
    self.matrix.extend([] for _ in range(12))

    while self.next(raw=True):
        at_checkpoint = self.pos % 10000 == 0 or self.pos == 1
        if at_checkpoint:
            sys.stderr.write('\r[b6_matrix] Reading: %s' % (pretty_print(self.pos)))
            sys.stderr.flush()

        fields = self.entry_line.split('\t')
        for index in range(12):
            raw_value = fields[index]
            self.matrix[index].append(self.conversion[index](raw_value))

    sys.stderr.write('\n')
    return True
def load_b6_matrix(self):
    """Fill ``self.matrix`` column by column from the B6 entries.

    Appends twelve empty lists to ``self.matrix`` and then, for each entry
    returned by ``self.next(raw=True)``, converts the twelve tab-separated
    fields of ``self.entry_line`` with the matching ``self.conversion``
    callable. A progress line is written to stderr at position 1 and at
    every multiple of 10000.

    Returns:
        True after the last entry has been read.
    """
    column_indices = range(0, 12)

    for _ in column_indices:
        self.matrix.append([])

    while True:
        if not self.next(raw=True):
            break

        if self.pos % 10000 == 0 or self.pos == 1:
            message = '\r[b6_matrix] Reading: %s' % (pretty_print(self.pos))
            sys.stderr.write(message)
            sys.stderr.flush()

        b6_columns = self.entry_line.split('\t')
        for n in column_indices:
            converted = self.conversion[n](b6_columns[n])
            self.matrix[n].append(converted)

    sys.stderr.write('\n')
    return True
def length_distribution(fasta, output=None, title=None):
    """Plot the distribution of sequence lengths found in a FASTA file.

    The figure has three panels: a one-line summary (total, mean, std, min,
    max), the number of sequences per length, and the cumulative percent
    of reads per length. Gap characters ('-') are not counted towards a
    sequence's length. The figure is saved as ``<output>.pdf``; if that
    fails it is saved as ``<output>.png`` instead.

    Args:
        fasta: path passed to ``u.SequenceSource``.
        output: path prefix of the figure file; defaults to the FASTA path
            reported by the sequence source.
        title: figure title; defaults to the same FASTA path.
    """
    source = u.SequenceSource(fasta)
    sequence_lengths = []

    try:
        source.reset()
        while source.next():
            if source.pos % 1000 == 0 or source.pos == 1:
                sys.stderr.write('\r[fastalib] Reading: %s' % (source.pos))
                sys.stderr.flush()
            sequence_lengths.append(len(source.seq.replace('-', '')))
        source.reset()
        fasta_file_path = source.fasta_file_path
    finally:
        # close the reader even when reading fails
        source.close()

    sys.stderr.write('\n')

    longest = max(sequence_lengths)
    # leave some room to the right of the longest sequence
    max_seq_len = longest + (int(longest / 100.0) or 10)

    seq_len_distribution = [0] * (max_seq_len + 1)
    for length in sequence_lengths:
        seq_len_distribution[length] += 1
    highest_count = max(seq_len_distribution)

    plt.figure(figsize=(12, 8))
    plt.rcParams.update({'axes.linewidth': 0.9})
    plt.rc('grid', color='0.50', linestyle='-', linewidth=0.1)

    gs = gridspec.GridSpec(20, 1)

    # ---- summary line -----------------------------------------------------
    plt.subplot(gs[1:3])
    plt.subplots_adjust(left=0.05, bottom=0.03, top=0.95, right=0.98)
    plt.grid(False)
    plt.yticks([])
    plt.xticks([])
    total_seqs = len(sequence_lengths)
    plt.text(0.02, 0.5,
             'total: %s / mean: %.2f / std: %.2f / min: %s / max: %s'
             % (pretty_print(total_seqs),
                numpy.mean(sequence_lengths), numpy.std(sequence_lengths),
                min(sequence_lengths),
                max(sequence_lengths)),
             va='center', alpha=0.8, size=12)

    # ---- number of sequences per length -----------------------------------
    plt.subplot(gs[4:11])
    plt.grid(True)
    plt.subplots_adjust(left=0.05, bottom=0.01, top=0.95, right=0.98)

    plt.plot(seq_len_distribution, color='black', alpha=0.3)
    plt.fill_between(range(0, max_seq_len + 1), seq_len_distribution, y2=0, color='black', alpha=0.30)
    plt.ylabel('number of sequences')

    # Floor division keeps the steps integral (range() rejects floats),
    # which is what '/' produced for ints on Python 2.
    xtickstep = (max_seq_len // 50) or 1
    ytickstep = (highest_count // 20) or 1

    plt.xticks(range(xtickstep, max_seq_len + 1, xtickstep), rotation=90, size='xx-small')
    y_ticks = list(range(0, highest_count + 1, ytickstep))
    plt.yticks(y_ticks, y_ticks, size='xx-small')
    plt.xlim(xmin=0, xmax=max_seq_len)
    plt.ylim(ymin=0, ymax=highest_count + (highest_count / 20.0))

    plt.figtext(0.5, 0.96, '%s' % (title or fasta_file_path), weight='black', size='xx-large', ha='center')

    # ---- cumulative percent of reads --------------------------------------
    plt.subplot(gs[12:19])
    plt.subplots_adjust(left=0.05, bottom=0.01, top=0.95, right=0.98)
    plt.grid(True)

    # seq_len_distribution already holds the number of reads per length,
    # so it is reused instead of tallying the lengths a second time.
    percentages = []
    total_percentage = 0
    for count in seq_len_distribution[:max_seq_len]:
        total_percentage += count * 100.0 / total_seqs
        percentages.append(total_percentage)

    plt.xticks(range(xtickstep, max_seq_len + 1, xtickstep), rotation=90, size='xx-small')
    plt.yticks(range(0, 101, 5), ['%d%%' % y for y in range(0, 101, 5)], size='xx-small')
    plt.ylabel('percent of reads')
    plt.xlim(xmin=0, xmax=max_seq_len)
    plt.ylim(ymin=0, ymax=100)
    plt.plot(percentages)
    plt.fill_between(range(0, max_seq_len + 1), percentages + [100], y2=0, color='blue', alpha=0.30)

    # ---- output -------------------------------------------------------------
    if output is None:
        output = fasta_file_path

    try:
        plt.savefig(output + '.pdf')
    except Exception:
        # fall back to PNG when the PDF cannot be written
        plt.savefig(output + '.png')

    try:
        plt.show()
    except Exception:
        # showing the figure is best-effort; the file has been saved already
        pass

    return
def sumvals(arg, clean=None):
    """Return the sum of the values of the mapping ``arg``.

    With a truthy ``clean`` the plain number is returned; otherwise the
    number is passed through ``pretty_print`` first.
    """
    total = sum(arg.values())
    return total if clean else pretty_print(total)
def entropy_analysis(alignment_path, output_file=None, verbose=True, uniqued=False, freq_from_defline=None, weighted=False, qual_stats_dict=None, amino_acid_sequences=False):
    """Compute the entropy of every column of an alignment.

    Args:
        alignment_path: path passed to ``u.SequenceSource``.
        output_file: when given, ``position<TAB>entropy`` lines are written
            to this path, sorted by decreasing entropy.
        verbose: controls progress output and ``run.info`` reporting.
        uniqued: when true each read stands for several identical reads;
            the count is obtained with ``freq_from_defline``.
        freq_from_defline: callable mapping a defline to its frequency. The
            default takes the number after ':' in the first '|'-separated
            token that starts with 'freq'.
        weighted: pass ``qual_stats_dict[position]`` as ``l_qual`` to
            ``entropy`` for every non-uniform column.
        qual_stats_dict: per-position quality stats, needed if ``weighted``.
        amino_acid_sequences: forwarded to ``entropy``.

    Returns:
        list of entropy values in column order. Uniform columns and values
        below 0.00001 are reported as 0.0.

    Raises:
        EntropyError: reads differ in length, a defline yields no frequency
            although ``uniqued`` is set, or ``weighted`` is requested
            without ``qual_stats_dict``.
    """
    if freq_from_defline is None:
        def freq_from_defline(defline):
            return int([t.split(':')[1] for t in defline.split('|') if t.startswith('freq')][0])

    lines = []
    previous_alignment_length = None

    progress = Progress()
    progress.verbose = verbose

    alignment = u.SequenceSource(alignment_path)

    progress.new('Processing the Alignment')

    try:
        # processing the alignment file..
        while alignment.next():
            # check the alignment lengths along the way:
            if previous_alignment_length:
                if previous_alignment_length != len(alignment.seq):
                    raise EntropyError("Not all reads have the same length.")

            # print out process info
            if alignment.pos % 10000 == 0:
                progress.update('Reads processed: %s' % (pretty_print(alignment.pos)))

            # fill 'lines' variable
            if not uniqued:
                lines.append(alignment.seq)
            else:
                try:
                    frequency = freq_from_defline(alignment.id)
                except IndexError:
                    raise EntropyError("Reads declared as unique, but they do not have proper deflines. See help for --uniqued.")

                # one copy of the read per occurrence
                lines.extend([alignment.seq] * frequency)

            previous_alignment_length = len(alignment.seq)

        progress.end()
        if verbose:
            run.info('Number of reads', pretty_print(alignment.pos))
    finally:
        # close the source on the error paths as well
        alignment.close()

    # entropy analysis
    progress.new('Entropy Analysis')
    entropy_tpls = []
    alignment_length = len(lines[0])

    for position in range(0, alignment_length):
        progress.update(P(int(position + 1), alignment_length))

        column_chars = [x[position] for x in lines]

        if len(set(column_chars)) == 1:
            # a uniform column carries no information
            entropy_tpls.append((position, 0.0))
            continue

        column = "".join(column_chars)

        if weighted:
            if not qual_stats_dict:
                raise EntropyError("Weighted entropy is selected, but no qual stats are provided")
            e = entropy(column, l_qual=qual_stats_dict[position], amino_acid_sequences=amino_acid_sequences)
        else:
            e = entropy(column, amino_acid_sequences=amino_acid_sequences)

        entropy_tpls.append((position, 0.0 if e < 0.00001 else e))

    sorted_entropy_tpls = sorted(entropy_tpls, key=operator.itemgetter(1), reverse=True)

    progress.end()

    if verbose:
        entropy_components_larger_than_0 = [e[1] for e in entropy_tpls if e[1] > 0]
        if entropy_components_larger_than_0:
            run.info('Entropy analysis', 'Done (total of %d components greater than 0, mean: %.2f, max: %.2f, min: %.2f).'
                     % (len(entropy_components_larger_than_0),
                        numpy.mean(entropy_components_larger_than_0),
                        numpy.max(entropy_components_larger_than_0),
                        numpy.min(entropy_components_larger_than_0)))
        else:
            run.info('Entropy analysis', 'None of the nucleotide positions posessed any entropy!')

    if output_file:
        with open(output_file, 'w') as entropy_output:
            for _component, _entropy in sorted_entropy_tpls:
                entropy_output.write('%d\t%.4f\n' % (_component, _entropy))
        if verbose:
            run.info('Entropy analysis output file path', output_file)

    return [x[1] for x in entropy_tpls]
def length_distribution(fasta, output=None, title=None):
    """Draw and save the sequence length distribution of a FASTA file.

    Lengths are measured after removing '-' characters. The figure has a
    summary line, a panel with the number of sequences per length and a
    panel with the cumulative percent of reads. It is written to
    ``<output>.pdf``, or to ``<output>.png`` when saving the PDF fails.

    Args:
        fasta: path passed to ``u.SequenceSource``.
        output: path prefix for the figure; defaults to the source's
            ``fasta_file_path``.
        title: title of the figure; defaults to the same path.
    """
    source = u.SequenceSource(fasta)
    sequence_lengths = []

    try:
        source.reset()
        while source.next():
            if source.pos % 1000 == 0 or source.pos == 1:
                sys.stderr.write('\r[fastalib] Reading: %s' % (source.pos))
                sys.stderr.flush()
            sequence_lengths.append(len(source.seq.replace('-', '')))
        source.reset()
        fasta_file_path = source.fasta_file_path
    finally:
        # the reader is closed even if an entry cannot be read
        source.close()

    sys.stderr.write('\n')

    max_seq_len = max(sequence_lengths) + (int(max(sequence_lengths) / 100.0) or 10)

    seq_len_distribution = [0] * (max_seq_len + 1)
    for seq_len in sequence_lengths:
        seq_len_distribution[seq_len] += 1
    peak = max(seq_len_distribution)

    plt.figure(figsize=(12, 8))
    plt.rcParams.update({'axes.linewidth': 0.9})
    plt.rc('grid', color='0.50', linestyle='-', linewidth=0.1)

    gs = gridspec.GridSpec(20, 1)

    # summary line
    plt.subplot(gs[1:3])
    plt.subplots_adjust(left=0.05, bottom=0.03, top=0.95, right=0.98)
    plt.grid(False)
    plt.yticks([])
    plt.xticks([])
    total_seqs = len(sequence_lengths)
    summary = 'total: %s / mean: %.2f / std: %.2f / min: %s / max: %s' \
        % (pretty_print(total_seqs),
           numpy.mean(sequence_lengths), numpy.std(sequence_lengths),
           min(sequence_lengths),
           max(sequence_lengths))
    plt.text(0.02, 0.5, summary, va='center', alpha=0.8, size=12)

    # number of sequences per length
    plt.subplot(gs[4:11])
    plt.grid(True)
    plt.subplots_adjust(left=0.05, bottom=0.01, top=0.95, right=0.98)

    plt.plot(seq_len_distribution, color='black', alpha=0.3)
    plt.fill_between(range(0, max_seq_len + 1), seq_len_distribution, y2=0, color='black', alpha=0.30)
    plt.ylabel('number of sequences')

    # '//' yields integer steps on Python 2 and 3 alike; a float step
    # would make range() raise.
    xtickstep = (max_seq_len // 50) or 1
    ytickstep = (peak // 20) or 1

    plt.xticks(range(xtickstep, max_seq_len + 1, xtickstep), rotation=90, size='xx-small')
    plt.yticks(range(0, peak + 1, ytickstep),
               [y for y in range(0, peak + 1, ytickstep)],
               size='xx-small')
    plt.xlim(xmin=0, xmax=max_seq_len)
    plt.ylim(ymin=0, ymax=peak + (peak / 20.0))

    plt.figtext(0.5, 0.96, '%s' % (title or fasta_file_path), weight='black', size='xx-large', ha='center')

    # cumulative percent of reads
    plt.subplot(gs[12:19])
    plt.subplots_adjust(left=0.05, bottom=0.01, top=0.95, right=0.98)
    plt.grid(True)

    length_abundance = {}
    for seq_len in sequence_lengths:
        length_abundance[seq_len] = length_abundance.get(seq_len, 0) + 1

    percentages = []
    total_percentage = 0
    for i in range(0, max_seq_len):
        if i in length_abundance:
            total_percentage += length_abundance[i] * 100.0 / total_seqs
        percentages.append(total_percentage)

    plt.xticks(range(xtickstep, max_seq_len + 1, xtickstep), rotation=90, size='xx-small')
    plt.yticks(range(0, 101, 5), ['%d%%' % y for y in range(0, 101, 5)], size='xx-small')
    plt.ylabel('percent of reads')
    plt.xlim(xmin=0, xmax=max_seq_len)
    plt.ylim(ymin=0, ymax=100)
    plt.plot(percentages)
    plt.fill_between(range(0, max_seq_len + 1), percentages + [100], y2=0, color='blue', alpha=0.30)

    if output is None:
        output = fasta_file_path

    try:
        plt.savefig(output + '.pdf')
    except Exception:
        # PDF could not be written; save a PNG instead
        plt.savefig(output + '.png')

    try:
        plt.show()
    except Exception:
        # displaying is optional; the figure is already on disk
        pass

    return
def visualize_b6_output(self, title_hint, Q_LENGTH=101):
    """Draw an overview figure of the B6 hits and save it to disk.

    The upper row shows the percent of hits starting / ending at each query
    position and a breakdown of hits per percent-identity level (100% down
    to 90%). The lower row holds box plots for query and target
    coordinates, percent identity, alignment length, mismatches / gaps and
    bit score. The figure is saved as ``<self.b6_source>.tiff``, or as
    ``.png`` if that fails.

    Args:
        title_hint: name shown in the title; when falsy the base name of
            ``self.b6_source`` is used.
        Q_LENGTH: query length, used for the x axis of the upper left panel.
    """
    if not self.matrix:
        self.load_b6_matrix()

    # matplotlib is imported only when a figure is actually requested
    import matplotlib.pyplot as plt
    import matplotlib.gridspec as gridspec

    def _setp(b, c='red'):
        plt.setp(b['medians'], color=c)
        plt.setp(b['whiskers'], color='black', alpha=0.6)
        plt.setp(b['boxes'], color='black', alpha=0.8)
        plt.setp(b['caps'], color='black', alpha=0.6)
        plt.setp(b['fliers'], color='#EEEEEE', alpha=0.01)

    def _box_panel(cells, title, ylabel, data, positions, tick_labels, median_color='red'):
        # one box-plot panel of the lower row
        axis = plt.subplot(cells)
        plt.grid(True)
        plt.title(title)
        if ylabel is not None:
            plt.ylabel(ylabel)
        box = axis.boxplot(data, positions=positions, sym=',', widths=0.7)
        _setp(box, median_color)
        plt.xticks(positions, tick_labels)

    plt.figure(figsize=(24, 12))
    plt.rcParams.update({'axes.linewidth': 0.9})
    plt.rc('grid', color='0.50', linestyle='-', linewidth=0.1)

    gs = gridspec.GridSpec(2, 19)

    #
    # UPPER PANEL, Q_START AND Q_END
    #
    ax1 = plt.subplot(gs[0:15])
    plt.grid(True)
    plt.subplots_adjust(left=0.03, bottom=0.05, top=0.92, right=0.97)
    plt.title('Alignment Start / End Positions for "%s" (Number of Hits: %s)'
              % (os.path.basename(self.b6_source) if not title_hint else title_hint,
                 pretty_print(len(self.matrix[0]))))

    q_starts, q_ends = self.matrix[Q_START], self.matrix[Q_END]

    p1 = [0] * max(q_ends)
    p2 = [0] * max(q_ends)

    # positions are 1-based, list indices 0-based
    for i in q_starts:
        p1[i - 1] += 1
    for i in q_ends:
        p2[i - 1] += 1

    # totals and maxima are computed once instead of once per element
    total_starts, total_ends = sum(p1), sum(p2)
    p1 = [x * 100.0 / total_starts for x in p1]
    p2 = [x * 100.0 / total_ends for x in p2]
    max_p1, max_p2 = max(p1), max(p2)

    for i, percent in enumerate(p1):
        plt.bar([i], [100], color='green', alpha=(percent / max_p1) * 0.8, width=1, edgecolor='green')
    for i, percent in enumerate(p2):
        plt.bar([i], [100], color='purple', alpha=(percent / max_p2) * 0.8, width=1, linewidth=0)

    ax1.plot(p1, c='black', linewidth=3)
    ax1.plot(p1, c='green', label='Alignment Start Position')
    ax1.plot(p2, c='black', linewidth=3)
    ax1.plot(p2, c='red', label='Alignment End Position')
    plt.fill_between(list(range(0, len(p1))), p1, y2=0, color='black', alpha=0.5)
    plt.fill_between(list(range(0, len(p2))), p2, y2=0, color='black', alpha=0.5)

    plt.ylabel('Percent of Hits')
    plt.xlabel('Position')

    # Integer step for range(): '/' gives a float on Python 3, and a step
    # of 0 (Q_LENGTH < 100) would make range() raise.
    tick_step = (Q_LENGTH // 100) or 1
    plt.xticks(list(range(0, Q_LENGTH, tick_step)), list(range(1, Q_LENGTH + 1, tick_step)), rotation=90, size='xx-small')
    percent_ticks = list(range(0, 101, 10))
    percent_labels = ['%s%%' % t for t in percent_ticks]
    plt.yticks(percent_ticks, percent_labels, size='xx-small')
    plt.ylim(ymin=0, ymax=100)
    plt.xlim(xmin=0, xmax=Q_LENGTH - 1)
    plt.legend()

    # UPPER PANEL RIGHT SIDE
    ax1b = plt.subplot(gs[16:19])
    plt.title('Percent Identity Breakdown')
    plt.grid(True)

    identities = self.matrix[IDENTITY]

    # percent of hits with identity >= p, for p = 90 .. 100
    percent_brake_down = [len([True for x in identities if x >= p]) * 100.0 / len(identities)
                          for p in range(90, 101)]

    # percent of hits within each level: [p, p + 1), and >= 100 for the last
    percent_differences = [level - next_level for level, next_level
                           in zip(percent_brake_down, percent_brake_down[1:] + [0.0])]

    # Tick labels run from 100% down to 90%, so the values are reversed to
    # match them. Sorting by magnitude (as was done before) pairs bars with
    # the wrong labels whenever the breakdown is not monotonic.
    percent_differences.reverse()

    ax1b.bar([t + .05 for t in range(0, 11)], percent_differences, width=.9, color='orange')
    plt.xlim(xmax=11)
    plt.ylim(ymax=100, ymin=0)
    plt.xticks([t + .5 for t in range(0, 11)], ['%s%%' % t for t in range(100, 89, -1)], rotation=90, size='xx-small')
    plt.yticks(percent_ticks, percent_labels, size='xx-small')

    plt.xlabel('Percent Identity Level')
    plt.ylabel('Percent of Hits')

    # BOX 1
    _box_panel(gs[19:22], 'Query Alignment Start / End Positions', 'Position in Query',
               [q_starts, q_ends], [0.5, 1.5], ['Start', 'End'])

    # BOX 2
    _box_panel(gs[23:26], 'Target Alignment Start / End Positions', 'Position in Target',
               [self.matrix[S_START], self.matrix[S_END]], [0.5, 1.5], ['Start', 'End'])

    # BOX 3
    _box_panel(gs[27:29], 'Percent Identity to Target', 'Percent',
               identities, [0.5], [], 'purple')
    plt.ylim(ymax=101, ymin=0)

    # BOX 4
    _box_panel(gs[30:32], 'Alignment Length', 'Nucleotide',
               self.matrix[ALIGNMENT_LENGTH], [0.5], [], 'orange')

    # BOX 5
    _box_panel(gs[33:35], 'Mismatches and Gaps', 'Number',
               [self.matrix[MISMATCHES], self.matrix[GAPS]], [0.5, 1.5], ['Mismatches', 'Gaps'], 'brown')

    # BOX 6
    _box_panel(gs[36:38], 'Bit Score', None,
               self.matrix[BIT_SCORE], [0.5], [], 'green')

    try:
        plt.savefig(self.b6_source + '.tiff')
    except Exception:
        # TIFF could not be written; save a PNG instead
        plt.savefig(self.b6_source + '.png')

    try:
        plt.show()
    except Exception:
        # showing the figure is best-effort; it has been saved already
        pass

    return
def visualize_b6_output(self, title_hint, Q_LENGTH=101):
    """Plot a summary figure for the loaded B6 hits and save it.

    Upper row: percent of hits per query start / end position, and the
    percent of hits at each identity level from 100% down to 90%. Lower
    row: box plots of query coordinates, target coordinates, percent
    identity, alignment length, mismatches / gaps and bit score. The figure
    is written to ``<self.b6_source>.tiff`` with ``.png`` as fallback.

    Args:
        title_hint: text for the figure title; if falsy, the base name of
            ``self.b6_source`` is shown instead.
        Q_LENGTH: length of the query; sets the x axis of the position panel.
    """
    if not self.matrix:
        self.load_b6_matrix()

    # imported here, so matplotlib is loaded only when plotting
    import matplotlib.pyplot as plt
    import matplotlib.gridspec as gridspec

    def _setp(b, c='red'):
        plt.setp(b['medians'], color=c)
        plt.setp(b['whiskers'], color='black', alpha=0.6)
        plt.setp(b['boxes'], color='black', alpha=0.8)
        plt.setp(b['caps'], color='black', alpha=0.6)
        plt.setp(b['fliers'], color='#EEEEEE', alpha=0.01)

    def _draw_boxes(grid_cells, panel_title, y_label, series, at, names, median='red'):
        # shared layout of the six box-plot panels
        ax = plt.subplot(grid_cells)
        plt.grid(True)
        plt.title(panel_title)
        if y_label is not None:
            plt.ylabel(y_label)
        _setp(ax.boxplot(series, positions=at, sym=',', widths=0.7), median)
        plt.xticks(at, names)

    plt.figure(figsize=(24, 12))
    plt.rcParams.update({'axes.linewidth': 0.9})
    plt.rc('grid', color='0.50', linestyle='-', linewidth=0.1)

    gs = gridspec.GridSpec(2, 19)

    #
    # UPPER PANEL, Q_START AND Q_END
    #
    ax1 = plt.subplot(gs[0:15])
    plt.grid(True)
    plt.subplots_adjust(left=0.03, bottom=0.05, top=0.92, right=0.97)
    shown_name = title_hint if title_hint else os.path.basename(self.b6_source)
    plt.title('Alignment Start / End Positions for "%s" (Number of Hits: %s)'
              % (shown_name, pretty_print(len(self.matrix[0]))))

    n_positions = max(self.matrix[Q_END])
    start_counts = [0] * n_positions
    end_counts = [0] * n_positions

    # query positions are 1-based
    for position in self.matrix[Q_START]:
        start_counts[position - 1] += 1
    for position in self.matrix[Q_END]:
        end_counts[position - 1] += 1

    # hoisted: the sums and maxima do not change inside the loops
    n_starts = sum(start_counts)
    n_ends = sum(end_counts)
    p1 = [x * 100.0 / n_starts for x in start_counts]
    p2 = [x * 100.0 / n_ends for x in end_counts]
    p1_peak = max(p1)
    p2_peak = max(p2)

    for i in range(0, len(p1)):
        plt.bar([i], [100], color='green', alpha=(p1[i] / p1_peak) * 0.8, width=1, edgecolor='green')
    for i in range(0, len(p2)):
        plt.bar([i], [100], color='purple', alpha=(p2[i] / p2_peak) * 0.8, width=1, linewidth=0)

    ax1.plot(p1, c='black', linewidth=3)
    ax1.plot(p1, c='green', label='Alignment Start Position')
    ax1.plot(p2, c='black', linewidth=3)
    ax1.plot(p2, c='red', label='Alignment End Position')
    plt.fill_between(list(range(0, len(p1))), p1, y2=0, color='black', alpha=0.5)
    plt.fill_between(list(range(0, len(p2))), p2, y2=0, color='black', alpha=0.5)

    plt.ylabel('Percent of Hits')
    plt.xlabel('Position')

    # range() needs a non-zero integer step: '//' instead of '/', and at
    # least 1 so that Q_LENGTH < 100 does not raise.
    step = (Q_LENGTH // 100) or 1
    plt.xticks(list(range(0, Q_LENGTH, step)), list(range(1, Q_LENGTH + 1, step)), rotation=90, size='xx-small')
    plt.yticks([t for t in range(0, 101, 10)], ['%s%%' % t for t in range(0, 101, 10)], size='xx-small')
    plt.ylim(ymin=0, ymax=100)
    plt.xlim(xmin=0, xmax=Q_LENGTH - 1)
    plt.legend()

    # UPPER PANEL RIGHT SIDE
    ax1b = plt.subplot(gs[16:19])
    plt.title('Percent Identity Breakdown')
    plt.grid(True)

    # percent of hits whose identity is at least p, for p = 90 .. 100
    percent_brake_down = []
    for p in range(90, 101):
        at_least_p = len([True for x in self.matrix[IDENTITY] if x >= p])
        percent_brake_down.append(at_least_p * 100.0 / len(self.matrix[IDENTITY]))

    # percent of hits that fall into each single level
    percent_differences = []
    last = len(percent_brake_down) - 1
    for i, percent in enumerate(percent_brake_down):
        if i < last:
            percent_differences.append(percent - percent_brake_down[i + 1])
        else:
            percent_differences.append(percent)

    # The x tick labels are fixed as 100%, 99%, ..., 90%. Reversing puts
    # each value under its own label; the previous sort by magnitude only
    # did so when the breakdown happened to be monotonic.
    percent_differences.reverse()

    ax1b.bar([t + .05 for t in range(0, 11)], percent_differences, width=.9, color='orange')
    plt.xlim(xmax=11)
    plt.ylim(ymax=100, ymin=0)
    plt.xticks([t + .5 for t in range(0, 11)], ['%s%%' % t for t in range(100, 89, -1)], rotation=90, size='xx-small')
    plt.yticks([t for t in range(0, 101, 10)], ['%s%%' % t for t in range(0, 101, 10)], size='xx-small')

    plt.xlabel('Percent Identity Level')
    plt.ylabel('Percent of Hits')

    # BOX 1
    _draw_boxes(gs[19:22], 'Query Alignment Start / End Positions', 'Position in Query',
                [self.matrix[Q_START], self.matrix[Q_END]], [0.5, 1.5], ['Start', 'End'])

    # BOX 2
    _draw_boxes(gs[23:26], 'Target Alignment Start / End Positions', 'Position in Target',
                [self.matrix[S_START], self.matrix[S_END]], [0.5, 1.5], ['Start', 'End'])

    # BOX 3
    _draw_boxes(gs[27:29], 'Percent Identity to Target', 'Percent',
                self.matrix[IDENTITY], [0.5], [], 'purple')
    plt.ylim(ymax=101, ymin=0)

    # BOX 4
    _draw_boxes(gs[30:32], 'Alignment Length', 'Nucleotide',
                self.matrix[ALIGNMENT_LENGTH], [0.5], [], 'orange')

    # BOX 5
    _draw_boxes(gs[33:35], 'Mismatches and Gaps', 'Number',
                [self.matrix[MISMATCHES], self.matrix[GAPS]], [0.5, 1.5], ['Mismatches', 'Gaps'], 'brown')

    # BOX 6
    _draw_boxes(gs[36:38], 'Bit Score', None,
                self.matrix[BIT_SCORE], [0.5], [], 'green')

    try:
        plt.savefig(self.b6_source + '.tiff')
    except Exception:
        # fall back to PNG when TIFF output is not available
        plt.savefig(self.b6_source + '.png')

    try:
        plt.show()
    except Exception:
        # best-effort display; the figure file exists at this point
        pass

    return
def generate_html_output(run_info_dict, html_output_directory=None, entropy_figure=None):
    """Generate the static HTML report for a run.

    Static assets and run output files are copied into the HTML output
    directory, a template context (``html_dict``) is built from a deep
    copy of ``run_info_dict``, one page per oligo is rendered when
    'output_directory_for_reps' is available, and finally the index page
    is rendered.

    Args:
        run_info_dict: run information. Keys are read directly, so missing
            mandatory keys raise ``KeyError``.
        html_output_directory: destination directory; defaults to
            ``<run_info_dict['output_directory']>/HTML-OUTPUT`` and is
            created when missing.
        entropy_figure: optional path of the entropy figure, with or
            without a '.pdf' / '.png' extension. When not given,
            ``run_info_dict['entropy']`` is used.

    Returns:
        Path of the generated ``index.html``.
    """
    if not html_output_directory:
        html_output_directory = os.path.join(run_info_dict['output_directory'], 'HTML-OUTPUT')

    if not os.path.exists(html_output_directory):
        os.makedirs(html_output_directory)

    html_dict = copy.deepcopy(run_info_dict)

    # (source relative to ``absolute``, name in the output directory)
    static_files = (
        ('static/style.css', 'style.css'),
        ('static/header_1.png', 'header.png'),
        ('static/missing_image.png', 'missing.png'),
        ('static/colorbar.png', 'colorbar.png'),
        ('scripts/jquery-1.7.1.js', 'jquery-1.7.1.js'),
        ('scripts/popup.js', 'popup.js'),
        ('scripts/g.pie.js', 'g.pie.js'),
        ('scripts/g.raphael.js', 'g.raphael.js'),
        ('scripts/raphael.js', 'raphael.js'),
        ('scripts/morris.js', 'morris.js'),
    )
    for relative_source, dest_name in static_files:
        shutil.copy2(os.path.join(absolute, relative_source),
                     os.path.join(html_output_directory, dest_name))

    def copy_as(source, dest_name, essential=True):
        """Copy ``source`` into the output directory; return the new base name."""
        dest = os.path.join(html_output_directory, dest_name)

        if essential:
            shutil.copy2(source, dest)
        else:
            # it is ok if you fail to copy files that are not
            # essential..
            try:
                shutil.copy2(source, dest)
            except Exception:
                # the closing quote of Dest used to be printed after the
                # trailing newlines
                sys.stderr.write('\n\n[HTML] Warning: Source file not found\n\tSource: "%s"\n\tDest: "%s"\n\n' % (source, dest))

        return os.path.basename(dest)

    # embarrassingly ad-hoc:
    if entropy_figure and entropy_figure.endswith(('.pdf', '.png')):
        entropy_figure = entropy_figure[:-4]

    for ext in ['png', 'pdf']:
        output_file = 'entropy.%s' % ext
        figure_key = 'entropy_figure_%s' % ext
        # only the PNG is mandatory; a missing PDF just produces a warning
        essential = ext == 'png'

        if entropy_figure:
            html_dict[figure_key] = copy_as(entropy_figure + ('.%s' % ext), output_file, essential=essential)
        else:
            try:
                html_dict[figure_key] = copy_as(run_info_dict['entropy'] + ('.%s' % ext), output_file, essential=essential)
            except Exception:
                # retry after dropping the last four characters of the path
                # (presumably a file extension -- confirm against callers)
                html_dict[figure_key] = copy_as(run_info_dict['entropy'][:-4] + ('.%s' % ext), output_file, essential=essential)

    if run_info_dict['gexf_network_file_path']:
        html_dict['gexf_network_file_path'] = copy_as(run_info_dict['gexf_network_file_path'], 'network.gexf')

    if run_info_dict['sample_mapping']:
        html_dict['sample_mapping'] = copy_as(run_info_dict['sample_mapping'], 'sample_mapping.txt')
    else:
        html_dict['sample_mapping'] = None

    html_dict['matrix_count_file_path'] = copy_as(run_info_dict['matrix_count_file_path'], 'matrix_counts.txt')
    html_dict['matrix_percent_file_path'] = copy_as(run_info_dict['matrix_percent_file_path'], 'matrix_percents.txt')
    html_dict['read_distribution_table_path'] = copy_as(run_info_dict['read_distribution_table_path'], 'read_distribution.txt')
    html_dict['environment_file_path'] = copy_as(run_info_dict['environment_file_path'], 'environment.txt')
    html_dict['oligos_fasta_file_path'] = copy_as(run_info_dict['oligos_fasta_file_path'], 'oligos.fa.txt')
    html_dict['oligos_nexus_file_path'] = copy_as(run_info_dict['oligos_nexus_file_path'], 'oligos.nex.txt')

    def get_figures_dict(html_dict_prefix):
        """Load a pickled figures dict and copy the figures it points to.

        Entries whose '.pdf' and '.png' files both exist are replaced by
        the copied file name without extension, all others by ``None``.
        Returns ``None`` when ``html_dict`` has no '<prefix>_file_path'.
        """
        html_dict_key = '%s_file_path' % html_dict_prefix
        if html_dict_key not in html_dict:
            return None

        with open(html_dict[html_dict_key]) as pickled:
            figures_dict = cPickle.load(pickled)

        for _map in figures_dict:
            for _func in figures_dict[_map]:
                for _op in figures_dict[_map][_func]:
                    figure_path = figures_dict[_map][_func][_op]
                    if os.path.exists(figure_path + '.pdf') and os.path.exists(figure_path + '.png'):
                        dest_prefix = '-'.join([_map, _func, _op])
                        copy_as(figure_path + '.pdf', '%s.pdf' % dest_prefix)
                        copied_png = copy_as(figure_path + '.png', '%s.png' % dest_prefix)
                        # keep the copied name without its extension
                        figures_dict[_map][_func][_op] = '.'.join(copied_png.split('.')[:-1])
                    else:
                        figures_dict[_map][_func][_op] = None
        return figures_dict

    html_dict['figures_dict'] = get_figures_dict('figures_dict')
    html_dict['exclusive_figures_dict'] = get_figures_dict('exclusive_figures_dict')

    if html_dict['generate_sets']:
        html_dict['across_samples_MN_file_path'] = copy_as(run_info_dict['across_samples_MN_file_path'], 'across_samples_max_normalized.txt')
        html_dict['across_samples_SN_file_path'] = copy_as(run_info_dict['across_samples_SN_file_path'], 'across_samples_sum_normalized.txt')
        html_dict['oligo_sets_stackbar_figure'] = copy_as(run_info_dict['stack_bar_with_agglomerated_oligos_file_path'], 'stackbar_with_oligo_sets.png')
        html_dict['oligos_across_samples_figure'] = copy_as(run_info_dict['oligos_across_samples_file_path'], 'oligos_across_samples.png')
        html_dict['oligotype_sets_figure'] = copy_as(run_info_dict['oligotype_sets_across_samples_figure_path'], 'oligotype_sets.png')
        html_dict['matrix_count_oligo_sets_file_path'] = copy_as(run_info_dict['matrix_count_oligo_sets_file_path'], 'matrix_counts_oligo_sets.txt')
        html_dict['matrix_percent_oligo_sets_file_path'] = copy_as(run_info_dict['matrix_percent_oligo_sets_file_path'], 'matrix_percents_oligo_sets.txt')
        html_dict['oligotype_sets_file'] = copy_as(run_info_dict['oligotype_sets_file_path'], 'oligotype_sets.txt')
        with open(run_info_dict['oligotype_sets_file_path']) as sets_file:
            # second tab-separated column: comma-separated members of a set
            html_dict['oligotype_sets'] = [l.strip().split('\t')[1].split(',') for l in sets_file]

    if 'representative_seqs_fasta_file_path' in html_dict:
        html_dict['representative_seqs_fasta_file_path'] = copy_as(run_info_dict['representative_seqs_fasta_file_path'], 'oligo-representatives.fa.txt')
    else:
        html_dict['representative_seqs_fasta_file_path'] = None

    if 'blast_ref_db' in run_info_dict and os.path.exists(run_info_dict['blast_ref_db']):
        html_dict['blast_ref_db_path'] = copy_as(run_info_dict['blast_ref_db'], 'reference_db.fa')

    html_dict['entropy_components'] = [int(x) for x in html_dict['bases_of_interest_locs'].split(',')]
    html_dict['samples_dict'] = get_samples_dict_from_environment_file(run_info_dict['environment_file_path'])
    html_dict['samples'] = sorted(html_dict['samples_dict'].keys())
    html_dict['blast_results_found'] = False

    # get alignment length
    html_dict['alignment_length'] = get_alignment_length(run_info_dict['alignment'])
    # include pretty names
    html_dict['pretty_names'] = pretty_names
    # get purity score colors dict
    html_dict['score_color_dict'] = {}
    gradient = get_list_of_colors(26, colormap='RdYlGn')
    for oligo in run_info_dict['final_purity_score_dict']:
        html_dict['score_color_dict'][oligo] = gradient[int(run_info_dict['final_purity_score_dict'][oligo] * 25)]
    # get total purity score color dict
    html_dict['total_score_color'] = gradient[int(float(run_info_dict['total_purity_score_dict']) * 25)]
    # get colors dict
    html_dict['color_dict'] = get_colors_dict(run_info_dict['colors_file_path'])
    # get abundant oligos list
    html_dict['oligos'] = get_oligos_list(run_info_dict['oligos_fasta_file_path'])
    # get oligo frequencies
    html_dict['frequency'] = {}
    for oligo in html_dict['oligos']:
        html_dict['frequency'][oligo] = pretty_print(
            sum([d[oligo] for d in html_dict['samples_dict'].values() if oligo in d]))
    # get purity score
    html_dict['purity_score'] = run_info_dict['final_purity_score_dict']
    # get total purity score
    html_dict['total_purity_score'] = run_info_dict['total_purity_score_dict']
    # get unique sequence dict (which will contain the most frequent unique sequence for given oligotype)
    if 'output_directory_for_reps' in html_dict:
        html_dict['rep_oligo_seqs_clean_dict'], html_dict['rep_oligo_seqs_fancy_dict'] = get_unique_sequences_dict(html_dict)
        html_dict['oligo_reps_dict'] = get_oligo_reps_dict(html_dict, html_output_directory)
        # "\\#" emits the same literal backslash the original "\#" did
        html_dict['component_reference'] = ''.join(
            ['<a onmouseover="popup(\'\\#%d\', 50)" href="">|</a>' % i
             for i in range(0, html_dict['alignment_length'])])

    # get javascript code for sample pie-charts
    html_dict['pie_charts_js'] = render_to_string('pie_charts_js.tmpl', html_dict)

    # FIXME: code below is very inefficient and causes a huge
    # memory issue. fix it by not using deepcopy.
    # generate individual oligotype pages
    if 'output_directory_for_reps' in html_dict:
        oligos = html_dict['oligos']
        for i, oligo in enumerate(oligos):
            tmp_dict = copy.deepcopy(html_dict)
            tmp_dict['oligo'] = oligo
            tmp_dict['distribution'] = get_oligo_distribution_dict(oligo, html_dict)
            oligo_page = os.path.join(html_output_directory, 'oligo_%s.html' % oligo)

            tmp_dict['index'] = i + 1
            tmp_dict['total'] = len(oligos)
            tmp_dict['prev'] = None
            tmp_dict['next'] = None
            if i > 0:
                tmp_dict['prev'] = 'oligo_%s.html' % oligos[i - 1]
            if i < (len(oligos) - 1):
                tmp_dict['next'] = 'oligo_%s.html' % oligos[i + 1]

            rendered = render_to_string('single_oligo.tmpl', tmp_dict)

            # binary mode: the encoded bytes are written as they are, and
            # the handle is closed right away
            with open(oligo_page, 'wb') as page:
                page.write(rendered.encode("utf-8"))

    # generate index
    index_page = os.path.join(html_output_directory, 'index.html')
    rendered = render_to_string('index_for_oligo.tmpl', html_dict)

    with open(index_page, 'wb') as page:
        page.write(rendered.encode("utf-8"))

    return index_page
def pretify(arg):
    """Return `arg` formatted by `pretty_print`.

    Thin alias: the argument is handed to `pretty_print` unchanged and the
    result is returned as-is.
    """
    formatted = pretty_print(arg)
    return formatted
def topology(topology_dict_path, output_file=None, title=None):
    """Draw the topology graph described by `topology_dict_path`.

    The graph and its node dictionary come from `topology_graph`. Nodes are
    laid out with graphviz ("fdp"), sized by the 'size' attribute found on
    their outgoing edges, and labelled with the edge 'label' and size. When
    the root entry of the node dictionary has a 'freq_curve_img_path', the
    image of every node is drawn on top of that node.

    Args:
        topology_dict_path: path handed to `topology_graph`.
        output_file: when given, the figure is saved there; otherwise it is
            shown interactively.
        title: text written in the upper-left corner ("Topology" if empty).
    """
    G, nodes_dict = topology_graph(topology_dict_path)

    number_of_edges = G.number_of_edges()
    number_of_nodes = G.number_of_nodes()

    print("Loaded %d edges and %d nodes." % (number_of_edges, number_of_nodes))

    plt.figure(figsize=(24, 16))

    # use graphviz to find radial layout
    # twopi, gvcolor, wc, ccomps, tred, sccmap, fdp, circo, neato, acyclic, nop, gvpr, dot
    pos = nx.graphviz_layout(G, prog="fdp")

    # node size is proportional to number of reads went into it
    sizes = dict.fromkeys(G.nodes(), 0.0)
    for (u, v, d) in G.edges(data=True):
        sizes[u] = d['size']

    # scale so the largest node gets 10000, never going below 500. When no
    # node has a size (e.g. a graph without edges) every node simply gets
    # the minimum instead of dividing by zero.
    max_size = max(sizes.values())
    k = 10000.0 / max_size if max_size else 0.0
    for node in sizes:
        scaled = sizes[node] * k
        sizes[node] = scaled if scaled > 500 else 500

    # NOTE(review): `parent_nodes` and `final_nodes` are not bound anywhere in
    # this function; presumably they are module-level names -- confirm,
    # otherwise the calls below raise NameError.
    parent_nodes_network = nx.draw_networkx_nodes(
        G, pos,
        nodelist=parent_nodes,
        node_shape='o',
        node_size=[sizes[i] for i in parent_nodes],
        node_color='#EFEFEF')
    final_nodes_network = nx.draw_networkx_nodes(
        G, pos,
        nodelist=final_nodes,
        node_shape='o',
        node_size=[sizes[i] for i in final_nodes],
        node_color='#FAFFFA')
    parent_nodes_network.set_edgecolor('#888888')
    final_nodes_network.set_edgecolor('#88BB00')

    nx.draw_networkx_edges(G, pos, alpha=0.4, node_size=10, width=1,
                           edge_color='#808080')
    nx.draw_networkx_labels(
        G, pos,
        font_size=8,
        font_weight='bold',
        labels=dict((u, '%s\n(%s)' % (d['label'], pretty_print(d['size'])))
                    for u, v, d in G.edges(data=True)))

    # adjust the plot limits
    xmax = 1.02 * max(x for x, y in pos.values())
    ymax = 1.02 * max(y for x, y in pos.values())
    plt.xlim(0, xmax)
    plt.ylim(0, ymax)
    plt.xticks([])
    plt.yticks([])

    plt.subplots_adjust(hspace=0, wspace=0, right=0.995, left=0.005,
                        top=0.995, bottom=0.005)
    plt.text(0.03, 0.97, title or "Topology",
             fontsize='xx-large',
             fontname="Arial",
             fontweight="bold",
             transform=plt.gca().transAxes)

    # captured before any inset axes are created below, so it keeps
    # referring to the main plot
    main_axes = plt.gca()
    plt.setp(main_axes, frame_on=False)
    #plt.axis('off')

    if 'freq_curve_img_path' in nodes_dict['root']:
        figure = plt.gcf()

        for node in nodes_dict:
            (x, y) = pos[node]
            # data coordinates -> display coordinates
            xt, yt = main_axes.transData.transform((x, y))
            # display coordinates -> figure-fraction coordinates, which is
            # what plt.axes() expects
            xf, yf = figure.transFigure.inverted().transform((xt, yt))
            print("%s %s" % (xf, yf))

            # the root image is drawn larger than the others
            if node == 'root':
                imsize = 0.04
            else:
                imsize = 0.025

            img = mpimg.imread(nodes_dict[node]['freq_curve_img_path'])
            # inset axes centred on the node
            a = plt.axes([xf - imsize / 2.0, yf - imsize / 2.0, imsize, imsize])
            a.imshow(img)
            a.axis('off')

    if output_file:
        plt.savefig(output_file)
    else:
        plt.show()
def show_progress(self, end=False):
    """Report the current position on stderr.

    The line starts with a carriage return, so successive calls overwrite
    each other; `self.pos` is formatted with `pretty_print`. Pass `end=True`
    on the last call to terminate the line with a newline.
    """
    status_line = '\r[b6lib] Reading: %s' % (pretty_print(self.pos))
    sys.stderr.write(status_line)
    sys.stderr.flush()

    if not end:
        return

    sys.stderr.write('\n')
def show_progress(self, end = False):
    """Write a self-overwriting progress line to stderr.

    The message carries `self.pos` formatted by `pretty_print` and begins
    with '\\r' so that it replaces the previously written line. When `end`
    is true a newline is written afterwards to finish the line.
    """
    stream = sys.stderr
    stream.write('\r[b6lib] Reading: %s' % (pretty_print(self.pos)))
    stream.flush()

    if end:
        stream.write('\n')
def generate_html_output(run_info_dict, html_output_directory = None, entropy_figure = None):
    """Generate the HTML report of a run and return the path of its index page.

    Static assets and the result files listed in `run_info_dict` are copied
    into `html_output_directory`, the template context is assembled, one page
    per oligotype is rendered (only when 'output_directory_for_reps' is
    present in the run info) and finally the index page is rendered.

    Args:
        run_info_dict: mapping with the paths and settings of the run. It is
            deep-copied to build the template context and is only read here.
        html_output_directory: destination directory; defaults to
            'HTML-OUTPUT' under run_info_dict['output_directory'] and is
            created when missing.
        entropy_figure: optional path of the entropy figure. A trailing
            '.pdf' / '.png' is stripped and both variants are looked up.

    Returns:
        Path of the generated 'index.html'.

    Raises:
        IOError / OSError: when an essential file cannot be copied or a page
            cannot be written.
        KeyError: when a required key is missing from `run_info_dict`.
    """
    if not html_output_directory:
        html_output_directory = os.path.join(run_info_dict['output_directory'], 'HTML-OUTPUT')

    if not os.path.exists(html_output_directory):
        os.makedirs(html_output_directory)

    html_dict = copy.deepcopy(run_info_dict)

    # (source relative to the package directory, name in the output directory)
    static_files = (('static/style.css', 'style.css'),
                    ('static/header_1.png', 'header.png'),
                    ('static/missing_image.png', 'missing.png'),
                    ('scripts/jquery-1.7.1.js', 'jquery-1.7.1.js'),
                    ('scripts/popup.js', 'popup.js'),
                    ('scripts/g.pie.js', 'g.pie.js'),
                    ('scripts/g.raphael.js', 'g.raphael.js'),
                    ('scripts/raphael.js', 'raphael.js'),
                    ('scripts/morris.js', 'morris.js'))
    for relative_source, dest_name in static_files:
        shutil.copy2(os.path.join(absolute, relative_source),
                     os.path.join(html_output_directory, dest_name))

    def copy_as(source, dest_name, essential = True):
        """Copy `source` into the output directory as `dest_name`; return the new base name."""
        dest = os.path.join(html_output_directory, dest_name)

        if essential:
            shutil.copy2(source, dest)
        else:
            # it is ok if you fail to copy files that are not
            # essential..
            try:
                shutil.copy2(source, dest)
            except EnvironmentError:
                sys.stderr.write('\n\n[HTML] Warning: Source file not found\n\tSource: "%s"\n\tDest: "%s\n\n"' % (source, dest))

        return os.path.basename(dest)

    def copy_entropy_figure(prefix, ext, dest_name):
        """Copy '<prefix>.<ext>'; only the png variant is essential."""
        return copy_as(prefix + ('.%s' % ext), dest_name, essential = (ext == 'png'))

    # embarrassingly ad-hoc:
    if entropy_figure:
        if entropy_figure.endswith('.pdf') or entropy_figure.endswith('.png'):
            entropy_figure = entropy_figure[:-4]

    for ext in ['png', 'pdf']:
        output_file = 'entropy.%s' % ext
        figure_key = 'entropy_figure_%s' % ext

        if entropy_figure:
            html_dict[figure_key] = copy_entropy_figure(entropy_figure, ext, output_file)
        else:
            try:
                html_dict[figure_key] = copy_entropy_figure(run_info_dict['entropy'], ext, output_file)
            except EnvironmentError:
                # retry with the last four characters (presumably a file
                # extension) removed from the entropy path
                html_dict[figure_key] = copy_entropy_figure(run_info_dict['entropy'][:-4], ext, output_file)

    if run_info_dict['gexf_network_file_path']:
        html_dict['gexf_network_file_path'] = copy_as(run_info_dict['gexf_network_file_path'], 'network.gexf')

    if run_info_dict['sample_mapping']:
        html_dict['sample_mapping'] = copy_as(run_info_dict['sample_mapping'], 'sample_mapping.txt')
    else:
        html_dict['sample_mapping'] = None

    # result files stored under the same key in both dicts
    for key, dest_name in (('matrix_count_file_path', 'matrix_counts.txt'),
                           ('matrix_percent_file_path', 'matrix_percents.txt'),
                           ('read_distribution_table_path', 'read_distribution.txt'),
                           ('environment_file_path', 'environment.txt'),
                           ('oligos_fasta_file_path', 'oligos.fa.txt'),
                           ('oligos_nexus_file_path', 'oligos.nex.txt')):
        html_dict[key] = copy_as(run_info_dict[key], dest_name)

    def get_figures_dict(html_dict_prefix):
        """Load the pickled figures dict, copy its figures, and point entries at the copies.

        Returns None when '<prefix>_file_path' is not in the context. Entries
        that lack either the pdf or the png file are set to None.
        """
        html_dict_key = '%s_file_path' % html_dict_prefix
        if html_dict_key not in html_dict:
            return None

        # NOTE: unpickling executes arbitrary code; this file is presumably
        # written by the pipeline itself -- never point it at untrusted data.
        with open(html_dict[html_dict_key]) as pickled:
            figures_dict = cPickle.load(pickled)

        for _map in figures_dict:
            for _func in figures_dict[_map]:
                for _op in figures_dict[_map][_func]:
                    figure_path = figures_dict[_map][_func][_op]
                    if os.path.exists(figure_path + '.pdf') and os.path.exists(figure_path + '.png'):
                        dest_prefix = '-'.join([_map, _func, _op])
                        copy_as(figure_path + '.pdf', '%s.pdf' % dest_prefix)
                        prefix = copy_as(figure_path + '.png', '%s.png' % dest_prefix)
                        # keep the copied name without its extension
                        figures_dict[_map][_func][_op] = '.'.join(prefix.split('.')[:-1])
                    else:
                        figures_dict[_map][_func][_op] = None

        return figures_dict

    html_dict['figures_dict'] = get_figures_dict('figures_dict')
    html_dict['exclusive_figures_dict'] = get_figures_dict('exclusive_figures_dict')

    if html_dict['generate_sets']:
        # (context key, run info key, name in the output directory)
        for html_key, run_key, dest_name in (
                ('across_samples_MN_file_path', 'across_samples_MN_file_path', 'across_samples_max_normalized.txt'),
                ('across_samples_SN_file_path', 'across_samples_SN_file_path', 'across_samples_sum_normalized.txt'),
                ('oligo_sets_stackbar_figure', 'stack_bar_with_agglomerated_oligos_file_path', 'stackbar_with_oligo_sets.png'),
                ('oligos_across_samples_figure', 'oligos_across_samples_file_path', 'oligos_across_samples.png'),
                ('oligotype_sets_figure', 'oligotype_sets_across_samples_figure_path', 'oligotype_sets.png'),
                ('matrix_count_oligo_sets_file_path', 'matrix_count_oligo_sets_file_path', 'matrix_counts_oligo_sets.txt'),
                ('matrix_percent_oligo_sets_file_path', 'matrix_percent_oligo_sets_file_path', 'matrix_percents_oligo_sets.txt'),
                ('oligotype_sets_file', 'oligotype_sets_file_path', 'oligotype_sets.txt')):
            html_dict[html_key] = copy_as(run_info_dict[run_key], dest_name)

        # second tab-separated column holds the comma-separated set members
        with open(run_info_dict['oligotype_sets_file_path']) as sets_file:
            html_dict['oligotype_sets'] = [line.strip().split('\t')[1].split(',') for line in sets_file]

    if 'representative_seqs_fasta_file_path' in html_dict:
        html_dict['representative_seqs_fasta_file_path'] = copy_as(run_info_dict['representative_seqs_fasta_file_path'], 'oligo-representatives.fa.txt')
    else:
        html_dict['representative_seqs_fasta_file_path'] = None

    if 'blast_ref_db' in run_info_dict and os.path.exists(run_info_dict['blast_ref_db']):
        html_dict['blast_ref_db_path'] = copy_as(run_info_dict['blast_ref_db'], 'reference_db.fa')

    html_dict['entropy_components'] = [int(x) for x in html_dict['bases_of_interest_locs'].split(',')]
    # read from the original path: html_dict['environment_file_path'] now
    # holds only the base name of the copy
    html_dict['samples_dict'] = get_samples_dict_from_environment_file(run_info_dict['environment_file_path'])
    html_dict['samples'] = sorted(html_dict['samples_dict'].keys())
    html_dict['blast_results_found'] = False

    # get alignment length
    html_dict['alignment_length'] = get_alignment_length(run_info_dict['alignment'])
    # include pretty names
    html_dict['pretty_names'] = pretty_names
    # get colors dict
    html_dict['color_dict'] = get_colors_dict(run_info_dict['colors_file_path'])
    # get abundant oligos list
    html_dict['oligos'] = get_oligos_list(run_info_dict['oligos_fasta_file_path'])
    # get oligo frequencies
    html_dict['frequency'] = {}
    for oligo in html_dict['oligos']:
        html_dict['frequency'][oligo] = pretty_print(sum([d[oligo] for d in html_dict['samples_dict'].values() if oligo in d]))

    # get unique sequence dict (which will contain the most frequent unique sequence for given oligotype)
    if 'output_directory_for_reps' in html_dict:
        html_dict['rep_oligo_seqs_clean_dict'], html_dict['rep_oligo_seqs_fancy_dict'] = get_unique_sequences_dict(html_dict)
        html_dict['oligo_reps_dict'] = get_oligo_reps_dict(html_dict, html_output_directory)
        html_dict['component_reference'] = ''.join(['<a onmouseover="popup(\'\\#%d\', 50)" href="">|</a>' % i for i in range(0, html_dict['alignment_length'])])

    # get javascript code for sample pie-charts
    html_dict['pie_charts_js'] = render_to_string('pie_charts_js.tmpl', html_dict)

    # generate individual oligotype pages
    if 'output_directory_for_reps' in html_dict:
        oligos = html_dict['oligos']
        total = len(oligos)

        for i, oligo in enumerate(oligos):
            # only top-level keys are set per page, so a shallow copy is
            # enough; deep-copying the whole context for every oligotype was
            # the source of the memory blow-up.
            tmp_dict = copy.copy(html_dict)
            tmp_dict['oligo'] = oligo
            tmp_dict['distribution'] = get_oligo_distribution_dict(oligo, html_dict)
            oligo_page = os.path.join(html_output_directory, 'oligo_%s.html' % oligo)

            tmp_dict['index'] = i + 1
            tmp_dict['total'] = total
            tmp_dict['prev'] = None
            tmp_dict['next'] = None
            if i > 0:
                tmp_dict['prev'] = 'oligo_%s.html' % oligos[i - 1]
            if i < (total - 1):
                tmp_dict['next'] = 'oligo_%s.html' % oligos[i + 1]

            rendered = render_to_string('single_oligo.tmpl', tmp_dict)

            with open(oligo_page, 'w') as page:
                page.write(rendered.encode("utf-8"))

    # generate index
    index_page = os.path.join(html_output_directory, 'index.html')
    rendered = render_to_string('index_for_oligo.tmpl', html_dict)

    with open(index_page, 'w') as page:
        page.write(rendered.encode("utf-8"))

    return index_page
def entropy_analysis(alignment_path, output_file=None, verbose=True, uniqued=False, freq_from_defline=None, weighted=False, qual_stats_dict=None, amino_acid_sequences=False):
    """Compute the entropy of every column of an alignment.

    All reads are loaded in memory (expanded by their frequency when
    `uniqued` is set), then `entropy` is evaluated column by column. Columns
    holding a single distinct character, and columns whose entropy is below
    0.00001, are reported as 0.0.

    Args:
        alignment_path: path given to `u.SequenceSource`.
        output_file: when given, 'position<TAB>entropy' lines are written
            there, sorted by decreasing entropy.
        verbose: controls progress reporting and the `run.info` summaries.
        uniqued: when true every read is repeated as many times as
            `freq_from_defline` says.
        freq_from_defline: callable mapping a defline to its frequency. The
            default takes the integer after the ':' of the first
            '|'-separated field that starts with 'freq'.
        weighted: when true `qual_stats_dict[position]` is passed to
            `entropy` as `l_qual`.
        qual_stats_dict: per-position quality statistics, required when
            `weighted` is set.
        amino_acid_sequences: forwarded to `entropy`.

    Returns:
        List of entropy values, one per column, in column order.

    Raises:
        EntropyError: reads differ in length, a defline lacks the frequency
            while `uniqued` is set, weighted entropy is requested without
            quality statistics, or the alignment holds no reads.
    """
    if freq_from_defline is None:
        freq_from_defline = lambda x: int([t.split(':')[1] for t in x.split('|') if t.startswith('freq')][0])

    lines = []
    previous_alignment_length = None

    progress = Progress()
    progress.verbose = verbose

    alignment = u.SequenceSource(alignment_path)

    # make sure the source is closed even when a read is rejected
    try:
        progress.new('Processing the Alignment')

        # processing the alignment file..
        while alignment.next():
            # check the alignment lengths along the way:
            if previous_alignment_length:
                if previous_alignment_length != len(alignment.seq):
                    raise EntropyError("Not all reads have the same length.")

            # print out process info
            if alignment.pos % 10000 == 0:
                progress.update('Reads processed: %s' % (pretty_print(alignment.pos)))

            # fill 'lines' variable
            if not uniqued:
                lines.append(alignment.seq)
            else:
                try:
                    frequency = freq_from_defline(alignment.id)
                except IndexError:
                    raise EntropyError("Reads declared as unique, but they do not have proper deflines. See help for --uniqued.")

                lines.extend([alignment.seq] * frequency)

            previous_alignment_length = len(alignment.seq)

        progress.end()
        if verbose:
            run.info('Number of reads', pretty_print(alignment.pos))
    finally:
        alignment.close()

    # without this check an empty alignment would surface as an IndexError
    # on lines[0] below
    if not lines:
        raise EntropyError("No reads were found in the alignment.")

    # entropy analysis
    progress.new('Entropy Analysis')
    entropy_tpls = []
    alignment_length = len(lines[0])

    for position in range(0, alignment_length):
        progress.update(P(int(position + 1), alignment_length))

        # build the column once; it serves both the uniformity test and the
        # entropy computation
        column_chars = [x[position] for x in lines]

        if len(set(column_chars)) == 1:
            entropy_tpls.append((position, 0.0))
            continue

        column = "".join(column_chars)

        if weighted:
            if not qual_stats_dict:
                raise EntropyError("Weighted entropy is selected, but no qual stats are provided")
            e = entropy(column, l_qual=qual_stats_dict[position], amino_acid_sequences=amino_acid_sequences)
        else:
            e = entropy(column, amino_acid_sequences=amino_acid_sequences)

        # negligible values are reported as exactly zero
        entropy_tpls.append((position, 0.0 if e < 0.00001 else e))

    progress.end()

    if verbose:
        entropy_components_larger_than_0 = [e[1] for e in entropy_tpls if e[1] > 0]
        if entropy_components_larger_than_0:
            run.info('Entropy analysis', 'Done (total of %d components greater than 0, mean: %.2f, max: %.2f, min: %.2f).' \
                                                        % (len(entropy_components_larger_than_0),
                                                           numpy.mean(entropy_components_larger_than_0),
                                                           numpy.max(entropy_components_larger_than_0),
                                                           numpy.min(entropy_components_larger_than_0)))
        else:
            run.info('Entropy analysis', 'None of the nucleotide positions posessed any entropy!')

    if output_file:
        sorted_entropy_tpls = sorted(entropy_tpls, key=operator.itemgetter(1), reverse=True)

        with open(output_file, 'w') as entropy_output:
            for _component, _entropy in sorted_entropy_tpls:
                entropy_output.write('%d\t%.4f\n' % (_component, _entropy))

        if verbose:
            run.info('Entropy analysis output file path', output_file)

    return [x[1] for x in entropy_tpls]
def topology(topology_dict_path, output_file = None, title = None):
    """Plot the topology graph loaded from `topology_dict_path`.

    `topology_graph` supplies the graph and the node dictionary. The layout
    is computed by graphviz ("fdp"); node sizes follow the 'size' attribute
    of each node's outgoing edges and labels combine the edge 'label' with
    that size. If the root entry of the node dictionary carries a
    'freq_curve_img_path', each node's image is overlaid at the node's
    position.

    Args:
        topology_dict_path: path handed to `topology_graph`.
        output_file: file to save the figure to; when omitted the figure is
            shown interactively instead.
        title: text placed in the upper-left corner ("Topology" if empty).
    """
    G, nodes_dict = topology_graph(topology_dict_path)

    number_of_edges = G.number_of_edges()
    number_of_nodes = G.number_of_nodes()

    print("Loaded %d edges and %d nodes." % (number_of_edges, number_of_nodes))

    plt.figure(figsize = (24, 16))

    # use graphviz to find radial layout
    # twopi, gvcolor, wc, ccomps, tred, sccmap, fdp, circo, neato, acyclic, nop, gvpr, dot
    pos = nx.graphviz_layout(G, prog = "fdp")

    # node size is proportional to number of reads went into it
    sizes = dict.fromkeys(G.nodes(), 0.0)
    for (u, v, d) in G.edges(data = True):
        sizes[u] = d['size']

    # the largest node is scaled to 10000 and nothing is drawn smaller than
    # 500; if every size is zero (e.g. no edges at all) fall back to the
    # minimum rather than dividing by zero.
    max_size = max(sizes.values())
    k = 10000.0 / max_size if max_size else 0.0
    for node in sizes:
        scaled = sizes[node] * k
        sizes[node] = scaled if scaled > 500 else 500

    # NOTE(review): `parent_nodes` and `final_nodes` are not assigned inside
    # this function; presumably they live at module level -- confirm, since
    # otherwise the two calls below fail with NameError.
    parent_nodes_network = nx.draw_networkx_nodes(G, pos,
                                                  nodelist = parent_nodes,
                                                  node_shape = 'o',
                                                  node_size = [sizes[i] for i in parent_nodes],
                                                  node_color = '#EFEFEF')
    final_nodes_network = nx.draw_networkx_nodes(G, pos,
                                                 nodelist = final_nodes,
                                                 node_shape = 'o',
                                                 node_size = [sizes[i] for i in final_nodes],
                                                 node_color = '#FAFFFA')
    parent_nodes_network.set_edgecolor('#888888')
    final_nodes_network.set_edgecolor('#88BB00')

    nx.draw_networkx_edges(G, pos, alpha = 0.4, node_size = 10, width = 1, edge_color = '#808080')
    nx.draw_networkx_labels(G, pos,
                            font_size = 8,
                            font_weight = 'bold',
                            labels = dict((u, '%s\n(%s)' % (d['label'], pretty_print(d['size'])))
                                          for u, v, d in G.edges(data = True)))

    # adjust the plot limits
    xmax = 1.02 * max(x for x, y in pos.values())
    ymax = 1.02 * max(y for x, y in pos.values())
    plt.xlim(0, xmax)
    plt.ylim(0, ymax)
    plt.xticks([])
    plt.yticks([])

    plt.subplots_adjust(hspace = 0, wspace = 0, right = 0.995, left = 0.005, top = 0.995, bottom = 0.005)
    plt.text(0.03, 0.97, title or "Topology",
             fontsize = 'xx-large',
             fontname = "Arial",
             fontweight = "bold",
             transform = plt.gca().transAxes)

    # grabbed before the inset axes below become the current axes
    main_axes = plt.gca()
    plt.setp(main_axes, frame_on = False)
    #plt.axis('off')

    if 'freq_curve_img_path' in nodes_dict['root']:
        figure = plt.gcf()

        for node in nodes_dict:
            (x, y) = pos[node]
            # data coordinates -> display coordinates
            xt, yt = main_axes.transData.transform((x, y))
            # display coordinates -> figure-fraction coordinates (the units
            # plt.axes() takes)
            xf, yf = figure.transFigure.inverted().transform((xt, yt))
            print("%s %s" % (xf, yf))

            # root gets a larger image than the other nodes
            if node == 'root':
                imsize = 0.04
            else:
                imsize = 0.025

            img = mpimg.imread(nodes_dict[node]['freq_curve_img_path'])
            # inset axes centred on the node
            a = plt.axes([xf - imsize / 2.0, yf - imsize / 2.0, imsize, imsize])
            a.imshow(img)
            a.axis('off')

    if output_file:
        plt.savefig(output_file)
    else:
        plt.show()