def main():
    """Compare experimental mutation-rate datasets against a control dataset.

    Command-line arguments (taken from ``sys.argv``):
        1   output folder
        2   genome FASTA path (accepted but not used by this function)
        3   pickled control (normalization) dataset
        4+  pickled experimental datasets

    Each dataset is additionally rescaled with ``normalize_dict_to_max``.
    Outputs, all placed in the output folder: a histogram of the rescaled
    experimental data, a WIG track of the (un-rescaled) control, a WIG track
    of each rescaled experiment, a pickle and a WIG track of each
    ``compare_to_control`` result, and a histogram of those results.
    """
    outfolder, genome_fasta, normalization_file_name = sys.argv[1:4]
    experimental_file_names = sys.argv[4:]

    def dataset_label(path):
        # Base name of the file minus its last dot-separated component.
        return '.'.join(os.path.basename(path).split('.')[:-1])

    def in_outfolder(name):
        return os.path.join(outfolder, name)

    control_dict = mod_utils.unPickle(normalization_file_name)
    rescaled_control_dict = normalize_dict_to_max(control_dict)
    norm_name = dataset_label(normalization_file_name)

    experimental_dict_names = [dataset_label(path) for path in experimental_file_names]
    experimental_dicts = [mod_utils.unPickle(path) for path in experimental_file_names]
    rescaled_experimental_dicts = [normalize_dict_to_max(raw) for raw in experimental_dicts]

    # Single pre-formatted argument: prints the same text as the two-item
    # print statement it replaces, in both statement and function form.
    print('%s %s' % (experimental_dict_names, norm_name))

    normed_mutation_rate_histogram(
        rescaled_experimental_dicts,
        experimental_dict_names,
        in_outfolder('%s_rescaled_mutation_rate_histogram' % norm_name),
        title='mutation rate, rescaled to max',
        xlim=(0, 0.1), min=0, max=1, step=0.001)

    comparisons = []
    rescaled_comparisons = []
    write_wig(control_dict, norm_name, in_outfolder(norm_name))

    datasets = zip(experimental_dict_names, experimental_dicts, rescaled_experimental_dicts)
    for label, raw_dict, rescaled_dict in datasets:
        write_wig(rescaled_dict, label, in_outfolder(label))

        comparison_log2_ratios = compare_to_control(raw_dict, control_dict)
        # The rescaled comparison is computed and collected but never written out.
        rescaled_comparison_log2_ratios = compare_to_control(rescaled_dict, rescaled_control_dict)
        comparisons.append(comparison_log2_ratios)
        rescaled_comparisons.append(rescaled_comparison_log2_ratios)

        comparison_label = label + '_comparison_log2'
        mod_utils.makePickle(comparison_log2_ratios, in_outfolder(comparison_label + '.pkl'))
        write_wig(comparison_log2_ratios, comparison_label, in_outfolder(comparison_label))

    normed_mutation_rate_histogram(
        comparisons,
        experimental_dict_names,
        in_outfolder('%s_comparison_histogram' % norm_name),
        title='log2 experiment/control',
        xlim=(-10, 10), min=-100, max=100, step=0.1)
def main():
    """Write copies of five PDB bundle files with reactivities in the B-factor field.

    Command-line arguments (taken from ``sys.argv``):
        1    output prefix; bundle ``i`` is written to ``<prefix>_bundle<i>.pdb``
        2-6  the five input PDB bundle files, in bundle order 1..5
        7    pickled reactivity data, indexed as ``data['+'][rRNA][residue]``

    For every ``ATOM`` record, characters 60-65 of the line are replaced with
    the reactivity of that residue when the module-level ``rRNA_assignments``
    maps (bundle number, chain) to an rRNA and the residue has a value;
    otherwise they are replaced with zero. ``ANISOU`` records are dropped and
    all other lines are copied through unchanged.

    Raises:
        AssertionError: if a rewritten ATOM line changes length, i.e. the
            formatted value does not fit the six-character field.
    """
    outprefix, bundle1, bundle2, bundle3, bundle4, bundle5, datafile_name = sys.argv[1:8]
    bundles = [bundle1, bundle2, bundle3, bundle4, bundle5]
    reactivities = mod_utils.unPickle(datafile_name)

    for i, bundle_path in enumerate(bundles, 1):
        out_path = outprefix + '_bundle' + str(i) + '.pdb'
        # Context managers guarantee both files are closed even if a line
        # fails to parse or a width check trips part-way through the file.
        with open(bundle_path) as infile, open(out_path, 'w') as outfile:
            for line in infile:
                if line.startswith('ATOM'):
                    chain = line[21]
                    resi = int(line[22:28].strip())
                    if (i in rRNA_assignments
                            and chain in rRNA_assignments[i]
                            and resi in reactivities['+'][rRNA_assignments[i][chain]]):
                        value = reactivities['+'][rRNA_assignments[i][chain]][resi]
                        new_line = '%s%6.3f%s' % (line[:60], value, line[66:])
                    else:
                        new_line = '%s%6.4f%s' % (line[:60], 0.0, line[66:])
                    # The replacement must occupy exactly the six columns it overwrites.
                    assert len(line) == len(new_line), \
                        'rewritten ATOM record changed length: %r' % new_line
                elif line.startswith("ANISOU"):
                    # Anisotropic B-factor records are not needed; drop them.
                    new_line = ''
                else:
                    new_line = line
                outfile.write(new_line)
def main():
    """Plot ROC curves for one dataset against several TP/TN annotation files.

    Command-line arguments (taken from ``sys.argv``):
        1   pickled read 5' end counts
        2   genome FASTA
        3   output prefix handed to ``plot_ROC_curves``
        4+  annotation files, each parsed by ``get_tp_tn`` into a
            (true positives, true negatives) pair

    The data for 'S.c.18S_rRNA' on the '+' strand are normalized once with
    ``winsorize_norm_chromosome_data``. For every cutoff from 0 to just above
    1 in steps of 0.0001, positives are called and, per annotation file, the
    percentage of true negatives called (x) and of true positives called (y)
    is recorded.
    """
    read_5p_ends_file, genome_fasta, outprefix = sys.argv[1:4]
    annotation_files = sys.argv[4:]  # true positive and true negative annotations

    genome_dict = mod_utils.convertFastaToDict(genome_fasta)
    read_5p_ends = mod_utils.unPickle(read_5p_ends_file)
    normed_density_array = winsorize_norm_chromosome_data(
        read_5p_ends, 'S.c.18S_rRNA', '+', genome_dict, 'ACTG')

    # One (label, true positives, true negatives) triple per annotation file.
    reference_sets = []
    for annotation_file in annotation_files:
        positives, negatives = get_tp_tn(annotation_file)
        reference_sets.append((os.path.basename(annotation_file), positives, negatives))

    # label -> [x values (FP rate), y values (TP rate)]
    roc_curves = dict((label, [[], []]) for label, _, _ in reference_sets)

    stepsize = 0.0001
    for cutoff in numpy.arange(0, 1. + 5 * stepsize, stepsize):
        called_p = call_positives(
            normed_density_array, 'S.c.18S_rRNA', '+', genome_dict, 'AC', cutoff)
        for label, positives, negatives in reference_sets:
            tp_called = len(called_p.intersection(positives))
            fp_called = len(called_p.intersection(negatives))
            fp_rates, tp_rates = roc_curves[label]
            fp_rates.append(100. * fp_called / float(len(negatives)))
            tp_rates.append(100. * tp_called / float(len(positives)))

    plot_ROC_curves(roc_curves, outprefix)
def main():
    """Plot ROC curves for several density datasets against one TP/TN annotation.

    Command-line arguments (taken from ``sys.argv``):
        1   annotation file parsed by ``get_tp_tn``
        2   genome FASTA
        3   output prefix handed to ``plot_ROC_curves``
        4+  pickled mutation-density files; the sample name is the file's
            base name up to the first "_back_"

    Each dataset is normalized for "S.c.25S__rRNA" with
    ``winsorize_norm_chromosome_data``. For every cutoff from 0 to just above
    1 in steps of 0.0001 and every sample, positives are called and the
    percentage of true positives (y) and true negatives (x) called is recorded.
    """
    # The FASTA file should be the EXACT one used for the pipeline, and this
    # chromosome name should match the one in the FASTA file exactly.
    chromosome = "S.c.25S__rRNA"

    tp_tn_annotations, genome_fasta, outprefix = sys.argv[1:4]
    density_files = sys.argv[4:]

    sample_names = [os.path.basename(path).split("_back_")[0] for path in density_files]
    mutation_densities = [mod_utils.unPickle(path) for path in density_files]
    genome_dict = mod_utils.convertFastaToDict(genome_fasta)
    normed_density_arrays = [
        winsorize_norm_chromosome_data(density, chromosome, genome_dict, "AC")
        for density in mutation_densities
    ]
    real_tp, real_tn = get_tp_tn(tp_tn_annotations)

    # sample name -> [x values (FP rate), y values (TP rate)]
    roc_curves = dict((name, [[], []]) for name in sample_names)

    stepsize = 0.0001
    for cutoff in numpy.arange(0, 1.0 + 5 * stepsize, stepsize):
        for name, normed_density in zip(sample_names, normed_density_arrays):
            called_p = call_positives(normed_density, chromosome, genome_dict, "AC", cutoff)
            tp_called = len(called_p.intersection(real_tp))
            fp_called = len(called_p.intersection(real_tn))
            fp_rates, tp_rates = roc_curves[name]
            tp_rates.append(100.0 * tp_called / float(len(real_tp)))
            fp_rates.append(100.0 * fp_called / float(len(real_tn)))

    plot_ROC_curves(roc_curves, chromosome, outprefix)
def main():
    """Run subtraction and division normalization for each requested comparison.

    Command-line arguments (taken from ``sys.argv``):
        1   pickled counts for all samples
        2   pickled depths for all samples
        3   minimum mutation count (converted to int)
        4   output prefix
        5+  comparisons, each written as two comma-separated names

    For every comparison the rates and errors returned by ``subtraction_norm``
    and ``division_norm`` are pickled to
    ``<prefix>_<first>_<second>_{sub,div}_{norm,err}.pkl``, and
    ``write_out_counts`` is called once per key of the subtracted rates to
    produce ``<prefix>_<first>_<second>_<key>.txt``.
    """
    all_counts_file, all_depths_file, min_mutations, output_prefix = sys.argv[1:5]
    min_mutations = int(min_mutations)
    all_counts = mod_utils.unPickle(all_counts_file)
    all_depths = mod_utils.unPickle(all_depths_file)

    for pair in sys.argv[5:]:
        comparison = pair.split(',')
        subtracted_rates, subtraction_errors = subtraction_norm(
            all_counts, all_depths, min_mutations, comparison)
        divided_rates, division_errors = division_norm(
            all_counts, all_depths, min_mutations, comparison)

        # Shared stem of every output file for this comparison.
        stem = '%s_%s_%s' % (output_prefix, comparison[0], comparison[1])
        pickle_jobs = (
            (subtracted_rates, '_sub_norm.pkl'),
            (subtraction_errors, '_sub_err.pkl'),
            (divided_rates, '_div_norm.pkl'),
            (division_errors, '_div_err.pkl'),
        )
        for payload, suffix in pickle_jobs:
            mod_utils.makePickle(payload, stem + suffix)

        for rRNA in subtracted_rates:
            write_out_counts(subtracted_rates, subtraction_errors,
                             divided_rates, division_errors,
                             rRNA, '%s_%s.txt' % (stem, rRNA))
def generate_single_mutation_rates_dict(chromosome, start, stop, folder, file_names, strip_suffix):
    """Collect per-position mutation rates over one region from several pickles.

    Args:
        chromosome: key selecting the per-position mapping inside each
            unpickled dataset (``dataset[chromosome][position]``).
        start: first position of the region (inclusive).
        stop: last position of the region (inclusive).
        folder: directory containing the pickled datasets.
        file_names: names of the pickle files inside ``folder``.
        strip_suffix: trailing text removed from each file name to form its
            dataset label. Names that do not end with it are used unchanged.

    Returns:
        dict mapping dataset label -> list of floats, one per position from
        ``start`` to ``stop``; positions absent from the dataset are 0.0.
    """
    combined_mutation_rates = {}
    for file_name in file_names:
        # str.rstrip() treats its argument as a set of characters, not a
        # suffix, and would also eat trailing label characters that happen to
        # be in that set. Remove the suffix explicitly instead.
        if strip_suffix and file_name.endswith(strip_suffix):
            dataset_label = file_name[:-len(strip_suffix)]
        else:
            dataset_label = file_name
        mutation_rates = mod_utils.unPickle(os.path.join(folder, file_name))
        mutation_array = [
            float(mutation_rates[chromosome][position])
            if position in mutation_rates[chromosome] else 0.0
            for position in range(start, stop + 1)
        ]
        combined_mutation_rates[dataset_label] = mutation_array
    return combined_mutation_rates
def main():
    """Subtract a background dataset from experimental mutation-rate datasets.

    Command-line arguments (taken from ``sys.argv``):
        1   output folder (created via ``mod_utils.make_dir``)
        2   genome FASTA path, passed to ``plot_weighted_nts_pie``
        3   pickled background (normalization) dataset
        4+  pickled experimental datasets

    Dataset names are the file base names with their last two dot-separated
    components removed. Outputs, all placed in the output folder: a histogram
    of the experimental data, WIG tracks of the background and of each
    experiment, and for each experiment a pickle, a WIG track and (best
    effort) a nucleotide pie chart of the background-subtracted data, followed
    by a histogram of all background-subtracted datasets.
    """
    outfolder, genome_fasta, normalization_file_name = sys.argv[1:4]
    experimental_file_names = sys.argv[4:]
    mod_utils.make_dir(outfolder)

    normalization_dict = mod_utils.unPickle(normalization_file_name)
    norm_name = '.'.join(os.path.basename(normalization_file_name).split('.')[:-2])
    experimental_dict_names = ['.'.join(os.path.basename(file_name).split('.')[:-2])
                               for file_name in experimental_file_names]
    experimental_dicts = [mod_utils.unPickle(file_name) for file_name in experimental_file_names]

    normed_mutation_rate_histogram(experimental_dicts, experimental_dict_names,
                                   os.path.join(outfolder, 'mutation_rate_histogram'),
                                   title='nonzero positions')

    background_subtracted_sets = []
    write_wig(normalization_dict, norm_name, os.path.join(outfolder, norm_name))
    for name, experimental_dict in zip(experimental_dict_names, experimental_dicts):
        write_wig(experimental_dict, name, os.path.join(outfolder, name))
        background_subtracted = subtract_background(experimental_dict, normalization_dict)
        background_subtracted_sets.append(background_subtracted)
        mod_utils.makePickle(background_subtracted,
                             os.path.join(outfolder, name + '_subtracted.pkl'))
        write_wig(background_subtracted, name + '_subtracted',
                  os.path.join(outfolder, name + '_subtracted'))
        # The pie chart is optional: a failure here must not abort the run,
        # but it is reported rather than silently discarded, and
        # KeyboardInterrupt/SystemExit are no longer swallowed.
        try:
            plot_weighted_nts_pie(background_subtracted, genome_fasta,
                                  '%s backround-subtracted fractions' % name,
                                  os.path.join(outfolder, name + '_sub_pie'))
        except Exception as error:
            sys.stderr.write('WARNING: skipping nucleotide pie chart for %s: %s\n'
                             % (name, error))

    normed_mutation_rate_histogram(background_subtracted_sets, experimental_dict_names,
                                   os.path.join(outfolder, 'back_subtracted_mutation_rate_histogram'),
                                   title='nonzero positions, background subtracted')
def main():
    """Plot ROC curves for several density datasets against one TP/TN annotation.

    Command-line arguments (taken from ``sys.argv``):
        1   annotation file parsed by ``get_tp_tn``
        2   genome FASTA
        3   output prefix handed to ``plot_ROC_curves``
        4+  pickled mutation-density files; the sample name is the file's
            base name

    Each dataset is normalized for 'S.c.18S_rRNA' on the '+' strand with
    ``winsorize_norm_chromosome_data``. For every cutoff from 0 to just above
    1 in steps of 0.0001 and every sample, positives are called and the
    percentage of true positives (y) and true negatives (x) called is recorded.
    """
    tp_tn_annotations, genome_fasta, outprefix = sys.argv[1:4]
    density_files = sys.argv[4:]

    sample_names = [os.path.basename(path) for path in density_files]
    mutation_densities = [mod_utils.unPickle(path) for path in density_files]
    genome_dict = mod_utils.convertFastaToDict(genome_fasta)
    normed_density_arrays = [
        winsorize_norm_chromosome_data(density, 'S.c.18S_rRNA', '+', genome_dict, 'ACTG')
        for density in mutation_densities
    ]
    real_tp, real_tn = get_tp_tn(tp_tn_annotations)

    # sample name -> [x values (FP rate), y values (TP rate)]
    roc_curves = dict((name, [[], []]) for name in sample_names)

    stepsize = 0.0001
    for cutoff in numpy.arange(0, 1. + 5 * stepsize, stepsize):
        for name, normed_density in zip(sample_names, normed_density_arrays):
            called_p = call_positives(
                normed_density, 'S.c.18S_rRNA', '+', genome_dict, 'AC', cutoff)
            tp_called = len(called_p.intersection(real_tp))
            fp_called = len(called_p.intersection(real_tn))
            fp_rates, tp_rates = roc_curves[name]
            tp_rates.append(100. * tp_called / float(len(real_tp)))
            fp_rates.append(100. * fp_called / float(len(real_tn)))

    plot_ROC_curves(roc_curves, outprefix)
def count_reads(lib_settings):
    """
    Tally read starts, coverage and mutations from a gzipped SAM file, pickle
    the tallies, and produce summary plots.

    The SAM file is lib_settings.get_mapped_reads_sam_gz(). Header lines
    (starting with '@'), reads flagged as unmapped, and reads whose MAPQ is
    below lib_settings.get_property('min_mapping_quality') are skipped. Every
    remaining read contributes a weight of 1/NH, where NH is taken from the
    read's NH:i tag.

    Per-position tallies are stored as dict[strand][chrom][position] and
    pickled to lib_settings.get_read_5p_counts(),
    lib_settings.get_positional_coverage() and
    lib_settings.get_mutation_counts(). Aggregate mutation statistics are
    pickled to files named lib_settings.get_counting_prefix() plus a suffix.
    Afterwards normalized mutation rates are computed and pickled to
    lib_settings.get_normalized_mutation_counts(), and the plotting helpers
    are run on the pickled results.

    lib_settings -- settings object providing the paths, property lookup and
                    sample name used above
    Returns nothing.
    """
    # Create empty dicts for storing counts data
    srt_dict = createStrandDict(strands) # Counts for 5' end of read our standard data format
    cov_dict = createStrandDict(strands) # Counts of times covered by a read
    mut_dict = createStrandDict(strands) # Counts of mismatches at a position
    read_mutations = defaultdict(int) #counts different types of mutations relative to read
    genome_mutations = defaultdict(int) #counts different types of mutations relative to genome
    # The four dicts below are keyed by the index into the per-read event
    # lists returned by parse_MDz_and_cigar, not by absolute genome position.
    mutations_by_read_position = defaultdict(dict)
    read_position_coverage = defaultdict(float)
    mutations_by_genome_position = defaultdict(dict)
    genome_position_coverage = defaultdict(float)
    mutated_nts = defaultdict(float) # weighted counts keyed by the first character of a genome-relative event
    read_insertion_sizes = [] # one entry (event[1]) per insertion event seen
    genomic_deletion_sizes = [] # one entry (event[1]) per deletion event seen
    with gzip.open(lib_settings.get_mapped_reads_sam_gz(), 'r') as f:
        for line in f: # Iterate through SAM file lines
            if not line.startswith('@'):
                # Parse line into relevant strings
                fields = line.strip().split('\t')
                ID = fields[0] #the first field in the mapped file corresponds to a unique id number for that read- these should correspond to the names in the raw_seqs dictionary
                flag = int(fields[1])
                '''
                The flag field provides a lot of info about the read, it is the decimal representation of a bit string, each digit of which is true or false
                Bit 0 = The read was part of a pair during sequencing
                Bit 1 = The read is mapped in a pair
                Bit 2 = The query sequence is unmapped
                Bit 3 = The mate is unmapped
                Bit 4 = Strand of query (0=forward 1=reverse)
                So, to see if a flag represents a read on the - strand, we evaluate (16 & 'flag'), where & is the bitwise and operator,
                which will be non-zero (True) only if this read is on the - strand
                '''
                if (4&flag):#if this is an unmapped read, don't bother
                    continue
                if (16&flag):
                    strand = '-'
                else:
                    strand = '+'
                chrom = fields[2]
                MAPQ = int(fields[4])
                if int(MAPQ) >= lib_settings.get_property('min_mapping_quality'):
                    cigarString = fields[5]
                    seq = fields[9]
                    mappingLength = len(seq)
                    qScores = fields[10]
                    # Some lines seem to lack some strings this throws of indexing of NM:i, MD:Z, and NH:i strings
                    NHstr = checkTag('NH:i:',fields)
                    NMstr = checkTag('NM:i:',fields)
                    MDstr = checkTag('MD:Z:',fields)
                    assert 'NM:i' in NMstr
                    assert 'MD:Z' in MDstr
                    assert 'NH:i' in NHstr
                    multiplicity = float(NHstr.split(':')[2])
                    fields = line.strip().split('\t')
                    counts = float(1.0/multiplicity) # Weight of read
                    MDzString = MDstr

                    # Add subdicts for chromosome if needed
                    if chrom not in srt_dict[strand]:
                        srt_dict[strand][chrom] = defaultdict(float)
                    if chrom not in cov_dict[strand]:
                        cov_dict[strand][chrom] = defaultdict(float)
                    if chrom not in mut_dict[strand]:
                        mut_dict[strand][chrom] = defaultdict(float)

                    # Parse cigar string, get genome mapping span, and relative genomic positions covered by read
                    rel_genomic_event_positions, rel_genome_coverage, mutations_rel_genome, mutations_rel_read, readMappingSpan, genomeMappingSpan = parse_MDz_and_cigar(cigarString, MDzString, mappingLength, seq)

                    # Tally events expressed relative to the genome; insertions must not appear here
                    for pos in range(len(mutations_rel_genome)):
                        genome_position_coverage[pos] += counts
                        event = mutations_rel_genome[pos]
                        if not event == 'M': #count if it's not a match
                            assert event[0] != 'I'
                            if event[0] == 'D':
                                genomic_deletion_sizes.append(event[1])
                                # NOTE(review): assumed to apply to deletions only, so that all
                                # deletion sizes are tallied under one key -- confirm nesting
                                event = event[0]
                            if event not in mutations_by_genome_position[pos]:
                                mutations_by_genome_position[pos][event] = 0
                            mutations_by_genome_position[pos][event] += counts
                            genome_mutations[event] += counts
                            if event[0] in 'ATCG':
                                mutated_nts[event[0]] += counts

                    # Tally events expressed relative to the read; deletions must not appear here
                    for pos in range(len(mutations_rel_read)):
                        read_position_coverage[pos] += counts
                        event = mutations_rel_read[pos]
                        if not event == 'M': #count if it's not a match
                            assert event[0] != 'D'
                            if event[0] == 'I':
                                read_insertion_sizes.append(event[1])
                                # NOTE(review): assumed to apply to insertions only -- confirm nesting
                                event = event[0]
                            if event not in mutations_by_read_position[pos]:
                                mutations_by_read_position[pos][event] = 0
                            mutations_by_read_position[pos][event] += counts
                            read_mutations[event] += counts

                    # Set start position of read
                    if strand== '+':
                        start=int(fields[3])
                    else:
                        #When a read maps to the minus strand, bowtie returns the reverse complement, and indicates
                        # where this reverse mapped on the + strand. Thus the original 5' end of the read actually
                        # was x nt downstream on the + strand
                        start=int(fields[3])+genomeMappingSpan-1

                    # translate relative positions to absolute positions
                    genome_cov = readGenomicCoverage(rel_genome_coverage, strand, start) # get genome coverage

                    srt_dict[strand][chrom][start] += counts #just add the number of counts to that start position
                    for pos in genome_cov: # Increment positions for coverage dict
                        cov_dict[strand][chrom][pos] += counts

                    # If mismatches need to parse, get the absolute genomic pos, and increment counters
                    genMismatches = readGenomicCoverage(rel_genomic_event_positions, strand, start)
                    for event_position in genMismatches:
                        mut_dict[strand][chrom][event_position] += counts

    # Persist every tally
    mod_utils.makePickle(srt_dict, lib_settings.get_read_5p_counts())
    mod_utils.makePickle(cov_dict, lib_settings.get_positional_coverage())
    mod_utils.makePickle(mut_dict, lib_settings.get_mutation_counts())

    mod_utils.makePickle(genome_mutations, lib_settings.get_counting_prefix() + '.genome_mutations.pkl')
    mod_utils.makePickle(mutations_by_genome_position, lib_settings.get_counting_prefix() + '.genome_position_mutations.pkl')
    mod_utils.makePickle(genome_position_coverage, lib_settings.get_counting_prefix() + '.genome_position_coverage.pkl')
    mod_utils.makePickle(mutated_nts, lib_settings.get_counting_prefix() + '.nt_mutations.pkl')
    mod_utils.makePickle(read_mutations, lib_settings.get_counting_prefix() + '.read_mutations.pkl')
    mod_utils.makePickle(mutations_by_read_position, lib_settings.get_counting_prefix() + '.read_position_mutations.pkl')
    mod_utils.makePickle(read_position_coverage, lib_settings.get_counting_prefix() + '.read_position_coverage.pkl')
    mod_utils.makePickle(genomic_deletion_sizes, lib_settings.get_counting_prefix() + '.deletion_sizes.pkl')
    mod_utils.makePickle(read_insertion_sizes, lib_settings.get_counting_prefix() + '.insertion_sizes.pkl')

    # Everything below works from the pickles written above, reloaded from disk
    normalized_mutations = normalized_mutation_rates(mod_utils.unPickle(lib_settings.get_mutation_counts()), mod_utils.unPickle(lib_settings.get_positional_coverage()))
    mod_utils.makePickle(normalized_mutations, lib_settings.get_normalized_mutation_counts())

    plot_mutated_nts_pie(mod_utils.unPickle(lib_settings.get_counting_prefix() + '.nt_mutations.pkl'), 'mutated rRNA nts in ' + lib_settings.sample_name, lib_settings.get_counting_prefix()+'.mutated_nts' )
    plot_full_mutation_stats(mod_utils.unPickle(lib_settings.get_counting_prefix() + '.read_mutations.pkl'), mod_utils.unPickle(lib_settings.get_counting_prefix() + '.insertion_sizes.pkl'), mod_utils.unPickle(lib_settings.get_counting_prefix() + '.read_position_mutations.pkl'), mod_utils.unPickle(lib_settings.get_counting_prefix() + '.read_position_coverage.pkl'), 'mutations wrt reads', "insertion size", lib_settings.get_counting_prefix()+'.read_mutations')
    plot_full_mutation_stats(mod_utils.unPickle(lib_settings.get_counting_prefix() + '.genome_mutations.pkl'), mod_utils.unPickle(lib_settings.get_counting_prefix() + '.deletion_sizes.pkl'), mod_utils.unPickle(lib_settings.get_counting_prefix() + '.genome_position_mutations.pkl'), mod_utils.unPickle(lib_settings.get_counting_prefix() + '.genome_position_coverage.pkl'), 'mutations wrt genome', "deletion size", lib_settings.get_counting_prefix()+'.genome_mutations')
    pie_read_5p_ends(mod_utils.unPickle(lib_settings.get_read_5p_counts()), mod_utils.convertFastaToDict(lib_settings.experiment_settings.get_rRNA_fasta()), lib_settings.get_counting_prefix())
    normed_mutation_rate_histogram(mod_utils.unPickle(lib_settings.get_normalized_mutation_counts()), lib_settings.sample_name, lib_settings.get_counting_prefix())