n_unmapped += 1 # Take only the first part of read1, to make sure quality is high seq = reads[reads[1].is_read1].seq[:200] seqb = Seq(seq, IUPAC.ambiguous_dna) # Save to file, to test local blast reads_unmapped.append(reads[reads[1].is_read1]) if len(reads_unmapped) >= 100: break continue # BLAST it blast_xml = NCBIWWW.qblast("blastn", "nr", seqb) blast_record = NCBIXML.read(blast_xml) ali = blast_record.alignments if len(ali): ali = ali[0] print ali.title else: print 'No matches found' seqs_unmapped = reads_to_seqrecord(reads_unmapped) from Bio import SeqIO SeqIO.write(seqs_unmapped, '/ebio/ag-neher/home/fzanini/tmp/seqs_for_blast.fastq', 'fastq') SeqIO.write(seqs_unmapped, '/ebio/ag-neher/home/fzanini/tmp/seqs_for_blast.fasta', 'fasta')
def fish_distant_reads(bamfilename, ref,
                       min_mismatches=20, max_mismatches=30,
                       VERBOSE=0, maxseqs=-1):
    '''Fish distant reads from the trash

    Scan the read pairs of a BAM file and collect those whose total distance
    from the reference lies within [min_mismatches, max_mismatches].

    Parameters:
       bamfilename: path of the BAM file, opened via pysam.Samfile in 'rb' mode
       ref: reference sequence; its length is used for the overhang check and
            it is passed on to get_distance_from_consensus
       min_mismatches: smallest accepted value of dc.sum() (inclusive)
       max_mismatches: largest accepted value of dc.sum() (inclusive)
       VERBOSE: verbosity level; >= 2 prints progress every 10000 pairs and
                the "max seqs" notice, >= 3 prints the name of every hit
       maxseqs: stop as soon as this many pairs have been collected. The
                counter is compared after each hit, so it is at least 1 when
                checked: the default -1 (and 0) never match, i.e. no limit.

    Returns:
       a tuple (distances, edges, seqs) with one entry per collected pair:
       - distances: integer numpy array built from the values returned by
         get_distance_from_consensus (presumably one number per read of the
         pair -- confirm against that helper)
       - edges: list of [(start, end), (start, end)], the reference span of
         each read of the pair
       - seqs: list of the pairs produced by pair_generator over the reads
         converted with reads_to_seqrecord

    Raises:
       ValueError: if the two reads of a pair have different names.
    '''
    import numpy as np

    from hivwholeseq.utils.mapping import pair_generator, reads_to_seqrecord
    from hivwholeseq.sequencing.filter_mapped_reads import \
        check_overhanging_reads, get_distance_from_consensus

    # CIGAR operation codes whose block lengths are added up to find where a
    # read ends on the reference (0 = match, 2 = deletion in SAM numbering).
    ref_consuming_ops = (0, 2)

    def reference_span(read):
        '''Return (start, end) of the read in reference coordinates'''
        aligned_len = sum(bl for (bt, bl) in read.cigar
                          if bt in ref_consuming_ops)
        return (read.pos, read.pos + aligned_len)

    # The reference length does not change while scanning, compute it once
    len_ref = len(ref)

    distances = []
    seqs = []
    edges = []
    with pysam.Samfile(bamfilename, 'rb') as bamfile:
        for irp, reads in enumerate(pair_generator(bamfile)):
            if (VERBOSE >= 2) and (not ((irp + 1) % 10000)):
                print(irp + 1)

            (read1, read2) = reads

            # Check a few things to make sure we are looking at paired reads
            if read1.qname != read2.qname:
                raise ValueError('Read pair '+str(irp)+': reads have different names!')

            # Ignore unmapped reads
            if read1.is_unmapped or read2.is_unmapped:
                continue

            # Ignore not properly paired reads (this includes mates sitting on
            # different fragments)
            if (not read1.is_proper_pair) or (not read2.is_proper_pair):
                continue

            # Check for overhangs beyond the edge
            if check_overhanging_reads(reads, len_ref):
                continue

            # Fish out our reads: keep only pairs in the requested distance window
            dc = get_distance_from_consensus(ref, reads, VERBOSE=VERBOSE)
            if not (min_mismatches <= dc.sum() <= max_mismatches):
                continue

            if VERBOSE >= 3:
                # Single-argument print: same output as the print statement,
                # and also valid where print is a function
                print('Gotcha! ' + str(read1.qname))

            # Reads are stored flat (read1, read2, read1, read2, ...) and
            # paired up again after the conversion below
            seqs.append(read1)
            seqs.append(read2)
            distances.append(dc)
            edges.append([reference_span(read) for read in reads])

            # Two entries per pair, hence the integer division
            if len(seqs) // 2 == maxseqs:
                if VERBOSE >= 2:
                    print('Max seqs reached: ' + str(maxseqs))
                break

        seqs = list(pair_generator(reads_to_seqrecord(seqs)))

    distances = np.array(distances, int)
    return (distances, edges, seqs)
# Take only the first part of read1, to make sure quality is high seq = reads[reads[1].is_read1].seq[:200] seqb = Seq(seq, IUPAC.ambiguous_dna) # Save to file, to test local blast reads_unmapped.append(reads[reads[1].is_read1]) if len(reads_unmapped) >= 100: break continue # BLAST it blast_xml = NCBIWWW.qblast("blastn", "nr", seqb) blast_record = NCBIXML.read(blast_xml) ali = blast_record.alignments if len(ali): ali = ali[0] print ali.title else: print 'No matches found' seqs_unmapped = reads_to_seqrecord(reads_unmapped) from Bio import SeqIO SeqIO.write(seqs_unmapped, '/ebio/ag-neher/home/fzanini/tmp/seqs_for_blast.fastq', 'fastq') SeqIO.write(seqs_unmapped, '/ebio/ag-neher/home/fzanini/tmp/seqs_for_blast.fasta', 'fasta')