Exemplo n.º 1
0
            n_unmapped += 1

            # Take only the first part of read1, to make sure quality is high
            seq = reads[reads[1].is_read1].seq[:200]
            seqb = Seq(seq, IUPAC.ambiguous_dna)

            # Save to file, to test local blast
            reads_unmapped.append(reads[reads[1].is_read1])
            if len(reads_unmapped) >= 100:
                break

            continue

            # BLAST it
            blast_xml = NCBIWWW.qblast("blastn", "nr", seqb)
            blast_record = NCBIXML.read(blast_xml)
            ali = blast_record.alignments
            if len(ali):
                ali = ali[0]
                print ali.title
            else:
                print 'No matches found'

        seqs_unmapped = reads_to_seqrecord(reads_unmapped)

    from Bio import SeqIO
    SeqIO.write(seqs_unmapped, '/ebio/ag-neher/home/fzanini/tmp/seqs_for_blast.fastq', 'fastq')
    SeqIO.write(seqs_unmapped, '/ebio/ag-neher/home/fzanini/tmp/seqs_for_blast.fasta', 'fasta')

Exemplo n.º 2
0
def fish_distant_reads(bamfilename, ref,
                       min_mismatches=20, max_mismatches=30,
                       VERBOSE=0, maxseqs=-1):
    '''Fish distant reads from the trash.

    Scan the read pairs of a BAM file and keep the pairs whose total distance
    from the reference lies within [min_mismatches, max_mismatches].

    Parameters:
       bamfilename: path of the BAM file, opened via pysam in 'rb' mode.
       ref: reference sequence. Its length is handed to
            check_overhanging_reads and the sequence itself to
            get_distance_from_consensus.
       min_mismatches: inclusive lower bound on the summed distance of a pair.
       max_mismatches: inclusive upper bound on the summed distance of a pair.
       VERBOSE: verbosity level. At >= 2 a progress counter is printed every
                10000 pairs and a notice is printed when maxseqs is reached;
                at >= 3 the name of every fished pair is printed. The value is
                also forwarded to get_distance_from_consensus.
       maxseqs: stop scanning as soon as exactly this many pairs have been
                collected. The check runs right after a pair is stored, so
                values below 1 (including the default -1) never match and
                impose no limit.

    Returns:
       A tuple (distances, edges, seqs):
         - distances: integer numpy array built from the distance objects of
           the fished pairs, in the order they were found.
         - edges: one entry per fished pair, each a list holding a
           (start, end) tuple for every read of the pair.
         - seqs: list of the items produced by running pair_generator over
           reads_to_seqrecord applied to the fished reads.

    Raises:
       ValueError: if the two reads of a pair carry different names.
    '''
    import numpy as np

    from hivwholeseq.utils.mapping import pair_generator, reads_to_seqrecord
    from hivwholeseq.sequencing.filter_mapped_reads import check_overhanging_reads, \
            get_distance_from_consensus

    distances = []
    seqs = []
    edges = []
    with pysam.Samfile(bamfilename, 'rb') as bamfile:
        for irp, reads in enumerate(pair_generator(bamfile)):
            # Progress report every 10000 pairs
            if VERBOSE >= 2 and not ((irp + 1) % 10000):
                print(irp + 1)

            (read1, read2) = reads

            # Check a few things to make sure we are looking at paired reads
            if read1.qname != read2.qname:
                raise ValueError('Read pair '+str(irp)+': reads have different names!')

            # Ignore unmapped reads
            if read1.is_unmapped or read2.is_unmapped:
                continue

            # Ignore not properly paired reads (this includes mates sitting on
            # different fragments)
            if (not read1.is_proper_pair) or (not read2.is_proper_pair):
                continue

            # Skip pairs flagged by the overhang check against the reference
            # length
            if check_overhanging_reads(reads, len(ref)):
                continue

            # Keep only pairs whose summed distance is inside the window
            # NOTE(review): dc is presumably a per-read array of mismatch
            # counts (it supports .sum()) -- confirm against the helper
            dc = get_distance_from_consensus(ref, reads, VERBOSE=VERBOSE)
            if not (min_mismatches <= dc.sum() <= max_mismatches):
                continue

            if VERBOSE >= 3:
                # Single-argument print: same output under Python 2 and 3
                print('Gotcha! {0}'.format(read1.qname))

            seqs.append(read1)
            seqs.append(read2)
            distances.append(dc)

            # Start of each read and start plus the summed lengths of its
            # CIGAR operations of type 0 and 2 (match and deletion in pysam's
            # encoding)
            edge = [(read.pos,
                     read.pos + sum(bl for bt, bl in read.cigar if bt in (0, 2)))
                    for read in reads]
            edges.append(edge)

            # Two reads are stored per pair, hence the halving
            if len(seqs) // 2 == maxseqs:
                if VERBOSE >= 2:
                    print('Max seqs reached: {0}'.format(maxseqs))
                break

        # Convert while the BAM file is still open, as in the original flow
        seqs = list(pair_generator(reads_to_seqrecord(seqs)))

    distances = np.array(distances, int)
    return (distances, edges, seqs)
Exemplo n.º 3
0
            # Take only the first part of read1, to make sure quality is high
            seq = reads[reads[1].is_read1].seq[:200]
            seqb = Seq(seq, IUPAC.ambiguous_dna)

            # Save to file, to test local blast
            reads_unmapped.append(reads[reads[1].is_read1])
            if len(reads_unmapped) >= 100:
                break

            continue

            # BLAST it
            blast_xml = NCBIWWW.qblast("blastn", "nr", seqb)
            blast_record = NCBIXML.read(blast_xml)
            ali = blast_record.alignments
            if len(ali):
                ali = ali[0]
                print ali.title
            else:
                print 'No matches found'

        seqs_unmapped = reads_to_seqrecord(reads_unmapped)

    from Bio import SeqIO
    SeqIO.write(seqs_unmapped,
                '/ebio/ag-neher/home/fzanini/tmp/seqs_for_blast.fastq',
                'fastq')
    SeqIO.write(seqs_unmapped,
                '/ebio/ag-neher/home/fzanini/tmp/seqs_for_blast.fasta',
                'fasta')