Example #1
    def filter_synthetic_sequences(self, reads):
        if self.synthetic_fasta:
            synthetic_sequences = [read.seq for read in fasta.reads(self.synthetic_fasta)]
        else:
            synthetic_sequences = []

        synthetic_lengths = np.zeros(self.max_read_length + 1)
        for read in reads:
            if contaminants.is_synthetic(read, synthetic_sequences):
                synthetic_lengths[len(read.seq)] += 1
            else:
                yield read

        self.write_file('lengths', {'synthetic': synthetic_lengths})
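A minimal usage sketch for the generator above (the experiment object and FASTQ file name are hypothetical; note that the 'lengths' file is only written once the generator has been fully consumed):

filtered = experiment.filter_synthetic_sequences(fastq.reads('sample_R1.fastq'))
kept_reads = list(filtered)  # exhausting the generator triggers write_file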
Example #2
def produce_sw_alignments(reads, genome_dirs, extra_targets):
    targets = []
    for genome_dir in genome_dirs:
        fasta_fns = genomes.get_all_fasta_file_names(genome_dir)
        for fasta_fn in fasta_fns:
            targets.extend(list(fasta.reads(fasta_fn)))
    targets.extend(extra_targets)

    for read in reads:
        alignments = get_local_alignments(read, targets) + get_edge_alignments(read, targets)
        # bowtie2 only retains up to the first space in a qname, so do the same
        # here to allow qnames to be compared
        sanitized_name = up_to_first_space(read.name)
        yield sanitized_name, alignments
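A minimal usage sketch (file and directory names are hypothetical; fastq is assumed to be the same Sequencing-style fastq module used in the other examples on this page):

reads = fastq.reads('sample_R1.fastq')
sw_hits = dict(produce_sw_alignments(reads, ['/path/to/genome_dir'], extra_targets=[]))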
Example #3
def get_oligo_hit_lengths(bam_fn, oligos_fasta_fn, oligos_sam_fn, max_read_length):
    oligo_mappings = load_oligo_mappings(oligos_sam_fn)
    bam_file = pysam.Samfile(bam_fn, "rb")

    oligo_names = [read.name for read in fasta.reads(oligos_fasta_fn)]
    lengths = np.zeros((len(oligo_names), max_read_length + 1), int)

    for oligo_number, oligo_name in enumerate(oligo_names):
        for rname, start, end in oligo_mappings[oligo_name]:
            reads = bam_file.fetch(rname, start, end)
            for aligned_read in reads:
                if not aligned_read.is_secondary:
                    # Can't use qlen here because the bam files omit
                    # the seq and qual of secondary mappings
                    lengths[oligo_number][aligned_read.inferred_length] += 1

    return lengths
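A minimal usage sketch (file names are hypothetical); the result is an array of counts with shape (number of oligos, max_read_length + 1):

lengths = get_oligo_hit_lengths('accepted_hits.bam', 'oligos.fa', 'oligos.sam', max_read_length=50)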
Example #4
def get_oligo_hit_lengths(bam_fn, oligos_fasta_fn, oligos_sam_fn,
                          max_read_length):
    oligo_mappings = load_oligo_mappings(oligos_sam_fn)
    bam_file = pysam.Samfile(bam_fn, 'rb')

    oligo_names = [read.name for read in fasta.reads(oligos_fasta_fn)]
    lengths = np.zeros((len(oligo_names), max_read_length + 1), int)

    for oligo_number, oligo_name in enumerate(oligo_names):
        for rname, start, end in oligo_mappings[oligo_name]:
            reads = bam_file.fetch(rname, start, end)
            for aligned_read in reads:
                if not aligned_read.is_secondary:
                    # Can't use qlen here because the bam files omit
                    # the seq and qual of secondary mappings
                    lengths[oligo_number][aligned_read.inferred_length] += 1

    return lengths
Example #5
def produce_sw_alignments(reads, genome_dirs, extra_targets, max_to_report=5):
    targets = set()

    for genome_dir in genome_dirs:
        fasta_fns = genomes.get_all_fasta_file_names(genome_dir)
        for fasta_fn in fasta_fns:
            targets.update(fasta.reads(fasta_fn))

    targets.update(extra_targets)

    for read in reads:
        alignments = get_local_alignments(read, targets) + get_edge_alignments(
            read, targets)
        alignments = sorted(alignments, key=lambda a: a['score'], reverse=True)
        alignments = alignments[:max_to_report]

        # bowtie2 only retains up to the first space in a qname, so do the same
        # here to allow qnames to be compared
        sanitized_name = up_to_first_space(read.name)
        yield sanitized_name, alignments
Example #6
def test_new_synth():
    import trim
    from Sequencing import fasta, fastq
    sfn = '/home/jah/projects/ribosomes/data/stephanie_markers/stephanie_markers.fa'
    synthetics = [read.seq for read in fasta.reads(sfn)]

    reads = fastq.reads(
        '/home/jah/projects/ribosomes/experiments/belgium_2014_08_07/WT_1_FP/data/WT_1_FP.140731.MiSeq.FCA.lane1.R1.fastq'
    )
    for read in reads:
        trim_at = trim.trim_by_local_alignment(read.seq)
        trimmed_seq = read.seq[:trim_at]
        trimmed_read = fasta.Read(read.name, trimmed_seq)
        old = is_synthetic(trimmed_read, synthetics)
        new = is_synthetic_new(trimmed_read, synthetics)
        if old and not new and trimmed_seq != '':
            print 'old is', old
            print 'new is', new
            print trimmed_seq
            raw_input()
Example #7
def test_new_synth():
    import trim
    from Sequencing import fasta, fastq

    sfn = "/home/jah/projects/ribosomes/data/stephanie_markers/stephanie_markers.fa"
    synthetics = [read.seq for read in fasta.reads(sfn)]

    reads = fastq.reads(
        "/home/jah/projects/ribosomes/experiments/belgium_2014_08_07/WT_1_FP/data/WT_1_FP.140731.MiSeq.FCA.lane1.R1.fastq"
    )
    for read in reads:
        trim_at = trim.trim_by_local_alignment(read.seq)
        trimmed_seq = read.seq[:trim_at]
        trimmed_read = fasta.Read(read.name, trimmed_seq)
        old = is_synthetic(trimmed_read, synthetics)
        new = is_synthetic_new(trimmed_read, synthetics)
        if old and not new and trimmed_seq != "":
            print "old is", old
            print "new is", new
            print trimmed_seq
            raw_input()
Example #8
def plot_oligo_hit_lengths(oligos_fasta_fn, lengths, fig_fn):
    oligo_names = [read.name for read in fasta.reads(oligos_fasta_fn)]
    if len(oligo_names) == 0:
        # If no oligos have been defined, there is no picture to make.
        return None

    fig, ax = plt.subplots(figsize=(18, 12))
    for oligo_name, oligo_lengths, color in zip(oligo_names, lengths, colors):
        denominator = np.maximum(oligo_lengths.sum(), 1)
        normalized_lengths = np.true_divide(oligo_lengths, denominator)
        ax.plot(normalized_lengths, 'o-', color=color, label=oligo_name)

    ax.legend(loc='upper right', framealpha=0.5)

    ax.set_xlim(0, lengths.shape[1] - 1)

    ax.set_xlabel('Length of original RNA fragment')
    ax.set_ylabel('Fraction of fragments')
    ax.set_title('Distribution of fragment lengths overlapping each oligo')

    fig.savefig(fig_fn)
    plt.close(fig)
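A minimal usage sketch combining this with get_oligo_hit_lengths above (file names are hypothetical; the function also relies on a module-level colors sequence of matplotlib color specs):

lengths = get_oligo_hit_lengths('accepted_hits.bam', 'oligos.fa', 'oligos.sam', max_read_length=50)
plot_oligo_hit_lengths('oligos.fa', lengths, 'oligo_hit_lengths.png')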
Example #9
def plot_oligo_hit_lengths(oligos_fasta_fn, lengths, fig_fn):
    oligo_names = [read.name for read in fasta.reads(oligos_fasta_fn)]
    if len(oligo_names) == 0:
        # If no oligos have been defined, there is no picture to make.
        return None

    fig, ax = plt.subplots(figsize=(18, 12))
    for oligo_name, oligo_lengths, color in zip(oligo_names, lengths, colors):
        denominator = np.maximum(oligo_lengths.sum(), 1)
        normalized_lengths = np.true_divide(oligo_lengths, denominator)
        ax.plot(normalized_lengths, "o-", color=color, label=oligo_name)

    ax.legend(loc="upper right", framealpha=0.5)

    ax.set_xlim(0, lengths.shape[1] - 1)

    ax.set_xlabel("Length of original RNA fragment")
    ax.set_ylabel("Fraction of fragments")
    ax.set_title("Distribution of fragment lengths overlapping each oligo")

    fig.savefig(fig_fn)
    plt.close(fig)
Example #10
    def visualize_unmapped(self):
        bowtie2_targets = [(self.file_names['genome'], self.file_names['bowtie2_index_prefix'], 'C,20,0'),
                          ]
        sw_genome_dirs = ['/home/jah/genomes/truseq',
                          '/home/jah/projects/crac/data/organisms/saccharomyces_cerevisiae/EF4/contaminant/fasta/',
                         ]
        extra_targets = [fasta.Read('smRNA_linker', trim.smRNA_linker)]
        if self.synthetic_fasta:
            extra_targets.extend(list(fasta.reads(self.synthetic_fasta)))
        
        def get_reads():
            for i, (seq, count) in enumerate(self.read_file('common_unmapped')['non_long_polyA'].most_common()):
                read = fastq.Read('{0}_{1}'.format(i, count),
                                  seq,
                                  fastq.encode_sanger([40]*len(seq)),
                                 )
                yield read

        visualize_structure.visualize_unpaired_alignments(get_reads,
                                                          sw_genome_dirs,
                                                          extra_targets,
                                                          bowtie2_targets,
                                                          self.file_names['unmapped_structures'],
                                                         )
Example #11
def align_reads(
    target_fasta_fn,
    reads,
    bam_fn,
    min_path_length=15,
    error_fn='/dev/null',
    alignment_type='overlap',
):
    ''' Aligns reads to targets in target_fasta_fn by Smith-Waterman, storing
    alignments in bam_fn and yielding unaligned reads.
    '''
    targets = {r.name: r.seq for r in fasta.reads(target_fasta_fn)}

    target_names = sorted(targets)
    target_lengths = [len(targets[n]) for n in target_names]
    alignment_sorter = sam.AlignmentSorter(
        target_names,
        target_lengths,
        bam_fn,
    )
    statistics = Counter()

    with alignment_sorter:
        for original_read in reads:
            statistics['input'] += 1

            alignments = []

            rc_read = fastq.Read(
                original_read.name,
                utilities.reverse_complement(original_read.seq),
                original_read.qual[::-1],
            )

            for read, is_reverse in ([original_read, False], [rc_read, True]):
                qual = fastq.decode_sanger(read.qual)
                for target_name, target_seq in targets.iteritems():
                    alignment = generate_alignments(read.seq, target_seq,
                                                    alignment_type)[0]
                    path = alignment['path']
                    if len(path) >= min_path_length and alignment['score'] / (
                            2. * len(path)) > 0.8:
                        aligned_segment = pysam.AlignedSegment()
                        aligned_segment.seq = read.seq
                        aligned_segment.query_qualities = qual
                        aligned_segment.is_reverse = is_reverse

                        char_pairs = make_char_pairs(path, read.seq,
                                                     target_seq)

                        cigar = sam.aligned_pairs_to_cigar(char_pairs)
                        clip_from_start = first_query_index(path)
                        if clip_from_start > 0:
                            cigar = [(sam.BAM_CSOFT_CLIP, clip_from_start)
                                     ] + cigar
                        clip_from_end = len(
                            read.seq) - 1 - last_query_index(path)
                        if clip_from_end > 0:
                            cigar = cigar + [
                                (sam.BAM_CSOFT_CLIP, clip_from_end)
                            ]
                        aligned_segment.cigar = cigar

                        read_aligned, ref_aligned = zip(*char_pairs)
                        md = sam.alignment_to_MD_string(
                            ref_aligned, read_aligned)
                        aligned_segment.set_tag('MD', md)

                        aligned_segment.set_tag('AS', alignment['score'])
                        aligned_segment.tid = alignment_sorter.get_tid(
                            target_name)
                        aligned_segment.query_name = read.name
                        aligned_segment.next_reference_id = -1
                        aligned_segment.reference_start = first_target_index(
                            path)

                        alignments.append(aligned_segment)

            if alignments:
                statistics['aligned'] += 1

                sorted_alignments = sorted(alignments,
                                           key=lambda m: m.get_tag('AS'),
                                           reverse=True)
                grouped = utilities.group_by(sorted_alignments,
                                             key=lambda m: m.get_tag('AS'))
                _, highest_group = grouped.next()
                primary_already_assigned = False
                for alignment in highest_group:
                    if len(highest_group) == 1:
                        alignment.mapping_quality = 2
                    else:
                        alignment.mapping_quality = 1

                    if not primary_already_assigned:
                        primary_already_assigned = True
                    else:
                        alignment.is_secondary = True

                    alignment_sorter.write(alignment)
            else:
                statistics['unaligned'] += 1

                # read is still bound to rc_read after the loop above, so yield
                # the original read, as the docstring promises
                yield original_read

        with open(error_fn, 'w') as error_fh:
            for key in ['input', 'aligned', 'unaligned']:
                error_fh.write('{0}: {1:,}\n'.format(key, statistics[key]))
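A minimal usage sketch (file names are hypothetical). Since align_reads is a generator, it has to be consumed for the BAM file and the error summary to be written; the values it yields are the reads that failed to align:

unaligned = list(align_reads('targets.fa',
                             fastq.reads('sample_R1.fastq'),
                             'sw_alignments.bam',
                             error_fn='sw_stats.txt'))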