def main():
    """Build and plot ROC curves for 18S rRNA calls against each annotation file.

    Command-line arguments (read from ``sys.argv``):
        1:  pickled read 5' end counts
        2:  genome FASTA file
        3:  output prefix handed to ``plot_ROC_curves``
        4+: true positive / true negative annotation files, one ROC curve each
    """
    read_5p_ends_file, genome_fasta, outprefix = sys.argv[1:4]
    annotation_files = sys.argv[4:]  # true positive and true negative annotations
    genome_dict = mod_utils.convertFastaToDict(genome_fasta)
    read_5p_ends = mod_utils.unPickle(read_5p_ends_file)
    normed_density_array = winsorize_norm_chromosome_data(
        read_5p_ends, 'S.c.18S_rRNA', '+', genome_dict, 'ACTG')

    # One (label, true positives, true negatives) record per annotation file.
    real_tp_tn_data = []
    for annotation_path in annotation_files:
        positives, negatives = get_tp_tn(annotation_path)
        real_tp_tn_data.append((os.path.basename(annotation_path), positives, negatives))

    # label -> [x values (FP rate), y values (TP rate)]
    roc_curves = dict((label, [[], []]) for label, _, _ in real_tp_tn_data)

    stepsize = 0.0001
    cutoffs = numpy.arange(0,1.+5*stepsize, stepsize)
    for cutoff in cutoffs:
        called_p = call_positives(normed_density_array, 'S.c.18S_rRNA', '+', genome_dict, 'AC', cutoff)
        for label, positives, negatives in real_tp_tn_data:
            fp_rates, tp_rates = roc_curves[label]
            # how many false / true positives are called at this cutoff
            false_hits = len(called_p.intersection(negatives))
            true_hits = len(called_p.intersection(positives))
            fp_rates.append(100.*false_hits/float(len(negatives)))  # FP rate on x axis
            tp_rates.append(100.*true_hits/float(len(positives)))  # TP rate on y axis

    plot_ROC_curves(roc_curves, outprefix)
def main():
    """Build and plot one ROC curve per sample for 25S rRNA calls.

    Command-line arguments (read from ``sys.argv``):
        1:  annotation file passed to ``get_tp_tn`` (true positives / true negatives)
        2:  genome FASTA file
        3:  output prefix handed to ``plot_ROC_curves``
        4+: pickled density files, one per sample; the sample name is the
            file's basename up to the first ``"_back_"``

    Raises:
        ZeroDivisionError: if either annotation set is empty while at least
            one density file is supplied (unchanged from the original).
    """
    # the fasta file should be the EXACT one used for the pipeline, and the chromosome name below should match
    # the one in the FASTA file exactly
    chromosome = "S.c.25S__rRNA"

    tp_tn_annotations, genome_fasta, outprefix = sys.argv[1:4]
    density_files = sys.argv[4:]
    sample_names = [os.path.basename(filename).split("_back_")[0] for filename in density_files]
    mutation_densities = [mod_utils.unPickle(pickled_density) for pickled_density in density_files]

    genome_dict = mod_utils.convertFastaToDict(genome_fasta)
    # The original comprehension was missing its closing bracket (syntax error).
    normed_density_arrays = [
        winsorize_norm_chromosome_data(mutation_density, chromosome, genome_dict, "AC")
        for mutation_density in mutation_densities
    ]
    real_tp, real_tn = get_tp_tn(tp_tn_annotations)

    # sample name -> [x values (FP rate), y values (TP rate)]
    roc_curves = {}
    for sample_name in sample_names:
        roc_curves[sample_name] = [[], []]

    # Loop-invariant denominators, computed once rather than per cutoff.
    num_real_tp = float(len(real_tp))
    num_real_tn = float(len(real_tn))

    stepsize = 0.0001
    for cutoff in numpy.arange(0, 1.0 + 5 * stepsize, stepsize):
        for sample_name, normed_density_array in zip(sample_names, normed_density_arrays):
            called_p = call_positives(normed_density_array, chromosome, genome_dict, "AC", cutoff)
            num_tp_called = len(called_p.intersection(real_tp))  # true positives called at this cutoff
            num_fp_called = len(called_p.intersection(real_tn))  # false positives called at this cutoff
            fp_rates, tp_rates = roc_curves[sample_name]
            tp_rates.append(100.0 * num_tp_called / num_real_tp)  # TP rate on y axis
            fp_rates.append(100.0 * num_fp_called / num_real_tn)  # FP rate on x axis

    plot_ROC_curves(roc_curves, chromosome, outprefix)
def plot_weighted_nts_pie(background_subtracted, fasta_genome, title, out_prefix):
    """Save a pie chart of nucleotide identities weighted by background-subtracted counts.

    Args:
        background_subtracted: nested mapping ``strand -> chromosome -> position -> count``.
            Positions are looked up as ``genome[chromosome][position - 1]``
            (presumably 1-based coordinates -- confirm against the producer).
        fasta_genome: FASTA file loaded with ``mod_utils.convertFastaToDict``.
        title: accepted for interface compatibility; not used by this function.
            NOTE(review): possibly intended as the plot title -- confirm before wiring it in.
        out_prefix: output path prefix; the chart is written to ``out_prefix + '.pdf'``.

    Only A, T, C and G contribute to the chart; counts at any other character
    are accumulated but not plotted.
    """
    genome = mod_utils.convertFastaToDict(fasta_genome)
    fig = plt.figure(figsize=(8, 8))
    try:
        plot = fig.add_subplot(111)  # a pie chart of mutated nts weighted by background-subtracted counts
        labels = "ATCG"
        nt_counts = defaultdict(float)
        for strand in background_subtracted:
            for chromosome, position_counts in background_subtracted[strand].items():
                sequence = genome[chromosome]
                for position, count in position_counts.items():
                    nt_counts[sequence[position - 1]] += count

        # Convert the per-nucleotide totals into fractions of the plotted total.
        sizes = numpy.array([nt_counts[nt] for nt in labels])
        total = float(sizes.sum())
        sizes = sizes / total
        merged_labels = ['%s %.3f' % (nt, fraction) for nt, fraction in zip(labels, sizes)]
        plot.pie(sizes, labels=merged_labels, colors=mod_utils.rainbow)

        # `transparent` is a boolean flag; the original passed the string 'True',
        # which only worked because any non-empty string is truthy.
        plt.savefig(out_prefix + '.pdf', transparent=True, format='pdf')
    finally:
        # Release the figure so repeated calls do not accumulate open figures.
        plt.close(fig)
Exemplo n.º 5
 def __init__(self, settings_file):
     """Remember the settings file and load the rRNA sequences it points to."""
     self.settings_file = settings_file
     # Resolve the FASTA location first, then parse it into a dict.
     rRNA_fasta = self.get_rRNA_fasta()
     self.rRNA_seqs = mod_utils.convertFastaToDict(rRNA_fasta)
Exemplo n.º 6
 def __init__(self, settings_file):
     """Remember the settings file and load the rRNA sequences it points to."""
     self.settings_file = settings_file
     # Resolve the FASTA location first, then parse it into a dict.
     rRNA_fasta = self.get_rRNA_fasta()
     self.rRNA_seqs = mod_utils.convertFastaToDict(rRNA_fasta)
def count_reads(lib_settings):
    # Tally per-position read statistics from SAM-formatted lines, pickle them, and plot summaries.
    #
    # For each non-header line whose MAPQ is at least lib_settings' 'min_mapping_quality',
    # the read is weighted by 1/NH (the NH:i tag value) and that weight is added to:
    #   * srt_dict[strand][chrom][start]  -- read start position
    #   * cov_dict[strand][chrom][pos]    -- every genomic position covered by the read
    #   * mut_dict[strand][chrom][pos]    -- every genomic event position reported by parse_MDz_and_cigar
    #   * the mutation-type tallies and the position-within-read / position-within-genome-span tallies
    # All tallies are then pickled to paths supplied by lib_settings, normalized mutation
    # rates are computed from the mutation and coverage pickles, and plotting helpers are called.
    #
    # NOTE(review): this block is not valid Python as shown -- several lines appear to have been
    # lost (see the NOTE(review) markers below). The code is left untouched; restore the missing
    # pieces from the original file before relying on it.

    # Create empty dicts for storing counts data
    # NOTE(review): `strands` is not defined in this function; presumably a module-level name -- confirm.
    srt_dict = createStrandDict(strands) # Counts for 5' end of read our standard data format
    cov_dict = createStrandDict(strands) # Counts of times covered by a read
    mut_dict = createStrandDict(strands) # Counts of mismatches at a position
    read_mutations = defaultdict(int) #counts different types of mutations relative to read
    genome_mutations = defaultdict(int) #counts different types of mutations relative to genome
    # position -> {event: summed read weight}
    mutations_by_read_position = defaultdict(dict)
    read_position_coverage = defaultdict(float)
    mutations_by_genome_position = defaultdict(dict)
    genome_position_coverage = defaultdict(float)
    # first character of a genome-relative event (when it is A/T/C/G) -> summed read weight
    mutated_nts = defaultdict(float)
    # NOTE(review): nothing in the visible code appends to these two lists, so they are pickled empty.
    read_insertion_sizes = []
    genomic_deletion_sizes = []

    # NOTE(review): the expression that opens the input file is missing from the next line.
    with, 'r') as f:
        for line in f: # Iterate through SAM file lines
            if not line.startswith('@'):
                # Parse line into relevant strings

                fields = line.strip().split('\t')
                ID = fields[0] #the first field in the mapped file corresponds to a unique id number for that read- these should correspond to the names in the raw_seqs dictionary
                flag = int(fields[1])
                # NOTE(review): the explanatory text below (through "...on the - strand") is prose that
                # appears to have lost its comment/string delimiters.
                The flag field provides a lot of info about the read, it is the decimal representation of a bit string, each digit of which is true or false

                Bit 0 = The read was part of a pair during sequencing
                Bit 1 = The read is mapped in a pair
                Bit 2 = The query sequence is unmapped
                Bit 3 = The mate is unmapped
                Bit 4 = Strand of query (0=forward 1=reverse)
                So, to see if a flag represents a read on the  - strand, we evaluate (16 & 'flag'), where & is the bitwise and operator,
                which will be non-zero (True) only if this read is on the - strand
                # NOTE(review): this `if` has no body in the visible code; presumably it skipped the read -- confirm.
                if (4&flag):#if this is an unmapped read, don't bother

                # NOTE(review): an `else:` appears to be missing; as written, strand always ends up '+'.
                if (16&flag):
                    strand = '-'
                    strand = '+'
                chrom = fields[2]
                MAPQ = int(fields[4])
                if int(MAPQ) >= lib_settings.get_property('min_mapping_quality'):
                    cigarString = fields[5]
                    seq = fields[9]
                    mappingLength = len(seq)
                    qScores = fields[10]
                    # Some lines seem to lack some strings this throws of indexing of NM:i, MD:Z, and NH:i strings
                    # NOTE(review): checkTag is not defined in the visible code; the asserts below
                    # require each returned string to contain its tag name.
                    NHstr = checkTag('NH:i:',fields)
                    NMstr = checkTag('NM:i:',fields)
                    MDstr = checkTag('MD:Z:',fields)
                    assert 'NM:i' in NMstr
                    assert 'MD:Z' in MDstr
                    assert 'NH:i' in NHstr
                    # The third ':'-separated piece of the NH tag is used as the read's multiplicity.
                    multiplicity = float(NHstr.split(':')[2])

                    # Re-splits the same line; the result is identical to `fields` computed above.
                    fields = line.strip().split('\t')
                    counts = float(1.0/multiplicity) # Weight of read
                    MDzString = MDstr

                    # Add subdicts for chromosome if needed
                    if chrom not in srt_dict[strand]:
                        srt_dict[strand][chrom] = defaultdict(float)
                    if chrom not in cov_dict[strand]:
                        cov_dict[strand][chrom] = defaultdict(float)
                    if chrom not in mut_dict[strand]:
                        mut_dict[strand][chrom] = defaultdict(float)

                    # Parse cigar string, get genome mapping span, and relative genomic positions covered by read
                    rel_genomic_event_positions, rel_genome_coverage, mutations_rel_genome, mutations_rel_read, readMappingSpan, genomeMappingSpan = parse_MDz_and_cigar(cigarString, MDzString, mappingLength, seq)

                    # Tally events by index within the genome-relative event list.
                    for pos in range(len(mutations_rel_genome)):
                        genome_position_coverage[pos] += counts
                        event = mutations_rel_genome[pos]
                        if not event == 'M': #count if it's not a match
                            assert event[0] != 'I'
                            # Collapse any deletion event to the single key 'D'.
                            if event[0] == 'D':
                                event = event[0]
                            if event not in mutations_by_genome_position[pos]:
                                 mutations_by_genome_position[pos][event] = 0
                            mutations_by_genome_position[pos][event] += counts
                            genome_mutations[event] += counts
                            if event[0] in 'ATCG':
                                mutated_nts[event[0]] += counts

                    # Same tally, indexed by position within the read-relative event list.
                    for pos in range(len(mutations_rel_read)):
                        read_position_coverage[pos] += counts
                        event = mutations_rel_read[pos]
                        if not event == 'M': #count if it's not a match
                            assert event[0] != 'D'
                            # Collapse any insertion event to the single key 'I'.
                            if event[0] == 'I':
                                event = event[0]
                            if event not in mutations_by_read_position[pos]:
                                 mutations_by_read_position[pos][event] = 0
                            mutations_by_read_position[pos][event] += counts
                            read_mutations[event] += counts

                    # Set start position of read
                    # NOTE(review): the assignments to `start` (both strand branches) are missing from
                    # the visible code; only this `if` header and the comment below remain.
                    if strand== '+':
                        #When a read maps to the minus strand, bowtie returns the reverse complement, and indicates
                        # where this reverse mapped on the + strand. Thus the original 5' end of the read actually
                        # was x nt downstream on the + strand

                    # translate relative positions to absolute positions
                    genome_cov = readGenomicCoverage(rel_genome_coverage, strand, start) # get genome coverage

                    srt_dict[strand][chrom][start] += counts #just add the number of counts to that start position
                    for pos in genome_cov: # Increment positions for coverage dict
                        cov_dict[strand][chrom][pos] += counts

                    # If mismatches need to parse, get the absolute genomic pos, and increment counters
                    genMismatches = readGenomicCoverage(rel_genomic_event_positions, strand, start)
                    for event_position in genMismatches:
                        mut_dict[strand][chrom][event_position] += counts

    # Persist the three per-strand/per-chromosome count dicts at the paths lib_settings provides.
    mod_utils.makePickle(srt_dict, lib_settings.get_read_5p_counts())
    mod_utils.makePickle(cov_dict, lib_settings.get_positional_coverage())
    mod_utils.makePickle(mut_dict, lib_settings.get_mutation_counts())

    # Persist the summary tallies next to the counting prefix.
    mod_utils.makePickle(genome_mutations, lib_settings.get_counting_prefix() + '.genome_mutations.pkl')
    mod_utils.makePickle(mutations_by_genome_position, lib_settings.get_counting_prefix() + '.genome_position_mutations.pkl')
    mod_utils.makePickle(genome_position_coverage, lib_settings.get_counting_prefix() + '.genome_position_coverage.pkl')

    mod_utils.makePickle(mutated_nts, lib_settings.get_counting_prefix() + '.nt_mutations.pkl')

    mod_utils.makePickle(read_mutations, lib_settings.get_counting_prefix() + '.read_mutations.pkl')
    mod_utils.makePickle(mutations_by_read_position, lib_settings.get_counting_prefix() + '.read_position_mutations.pkl')
    mod_utils.makePickle(read_position_coverage, lib_settings.get_counting_prefix() + '.read_position_coverage.pkl')

    mod_utils.makePickle(genomic_deletion_sizes, lib_settings.get_counting_prefix() + '.deletion_sizes.pkl')

    mod_utils.makePickle(read_insertion_sizes, lib_settings.get_counting_prefix() + '.insertion_sizes.pkl')

    # Derive normalized mutation rates from the just-written mutation and coverage pickles.
    normalized_mutations = normalized_mutation_rates(mod_utils.unPickle(lib_settings.get_mutation_counts()), mod_utils.unPickle(lib_settings.get_positional_coverage()))
    mod_utils.makePickle(normalized_mutations, lib_settings.get_normalized_mutation_counts())

    # Summary plots, each fed by re-reading the pickles written above.
    plot_mutated_nts_pie(mod_utils.unPickle(lib_settings.get_counting_prefix() + '.nt_mutations.pkl'), 'mutated rRNA nts in ' + lib_settings.sample_name, lib_settings.get_counting_prefix()+'.mutated_nts' )
    # NOTE(review): both plot_full_mutation_stats calls below end with a trailing comma and no closing
    # parenthesis; their final argument(s) appear to be missing.
    plot_full_mutation_stats(mod_utils.unPickle(lib_settings.get_counting_prefix() + '.read_mutations.pkl'), mod_utils.unPickle(lib_settings.get_counting_prefix() + '.insertion_sizes.pkl'),
                             mod_utils.unPickle(lib_settings.get_counting_prefix() + '.read_position_mutations.pkl'), mod_utils.unPickle(lib_settings.get_counting_prefix() + '.read_position_coverage.pkl'), 'mutations wrt reads', "insertion size",
    plot_full_mutation_stats(mod_utils.unPickle(lib_settings.get_counting_prefix() + '.genome_mutations.pkl'), mod_utils.unPickle(lib_settings.get_counting_prefix() + '.deletion_sizes.pkl'), mod_utils.unPickle(lib_settings.get_counting_prefix() + '.genome_position_mutations.pkl'),
                             mod_utils.unPickle(lib_settings.get_counting_prefix() + '.genome_position_coverage.pkl'), 'mutations wrt genome', "deletion size",
    pie_read_5p_ends(mod_utils.unPickle(lib_settings.get_read_5p_counts()), mod_utils.convertFastaToDict(lib_settings.experiment_settings.get_rRNA_fasta()), lib_settings.get_counting_prefix())
    normed_mutation_rate_histogram(mod_utils.unPickle(lib_settings.get_normalized_mutation_counts()), lib_settings.sample_name, lib_settings.get_counting_prefix())