def normalize_fasta(fastaFile, refFile, outFile):
    """Rewrite a FASTA file with hashed ids, uppercase bases and fixed strand.

    Steps:
      1. Every record of ``fastaFile`` is written to ``outFile`` with its id
         replaced by the hex Adler-32 checksum of ``name + sequence`` and its
         sequence uppercased.
      2. ``outFile`` is aligned against ``refFile`` with
         ``blasr -bestn 1 -m 1``.  For each output line, the read id is the
         first column up to the first "/"; the read is marked "-" when
         columns 3 and 4 differ and "+" otherwise (presumably these are the
         query/target strand columns -- confirm against the blasr ``-m 1``
         format).
      3. ``outFile`` is overwritten: reads marked "-" are reversed and mapped
         base-by-base through the module-level ``rmap`` table (presumably a
         complement table); reads without an alignment are kept as they are.

    Raises:
        subprocess.CalledProcessError: if blasr exits with a non-zero status.
    """
    # Pass 1: hashed ids + uppercase sequences.
    with FastaReader(fastaFile) as reader, open(outFile, "w") as of:
        for r in reader:
            r_id = hex(zlib.adler32(r.name + r.sequence) & 0xffffffff)
            of.write(">" + r_id + "\n")
            of.write(r.sequence.upper() + "\n")

    # Argument list instead of a shell string, so paths containing spaces or
    # shell metacharacters are passed to blasr untouched.
    output = subprocess.check_output(
        ["blasr", "-bestn", "1", "-m", "1", outFile, refFile])

    direction = {}
    for line in output.splitlines():
        fields = line.split()
        if not fields:
            # blasr reported nothing (or emitted a blank line): no direction.
            continue
        rId = fields[0].split("/")[0]
        direction[rId] = "+" if fields[2] == fields[3] else "-"

    # Pass 2: read everything back (and close the reader) before the same
    # path is reopened for writing.
    outData = []
    with FastaReader(outFile) as reader:
        for r in reader:
            seq = r.sequence.upper()
            if direction.get(r.name, "+") != "+":
                seq = "".join([rmap[c] for c in seq[::-1]])
            outData.append(">" + r.name)
            outData.append(seq)

    with open(outFile, "w") as of:
        of.write("\n".join(outData) + "\n")
def test_runner(self):
    """Test CombineRunner.

    Combines the five size-bin ``cluster_out`` directories found under
    ``test_tool_contract_chunks`` and checks that every combined output file
    exists, that the HQ isoform names agree between FASTA and FASTQ, and that
    the LQ and consensus isoform counts match the expected numbers.
    """
    ipq_opts = IceQuiverHQLQOptions(qv_trim_5=100, qv_trim_3=30)

    chunk_root = op.join(SIV_DATA_DIR, "test_tool_contract_chunks")
    bin_names = ("0to1kb_part0", "1to2kb_part0", "2to3kb_part0",
                 "3to4kb_part0", "4to5kb_part0")
    split_dirs = [op.join(chunk_root, bin_name, "cluster_out")
                  for bin_name in bin_names]
    print(split_dirs)

    # Start from a clean output directory.
    out_combined_dir = op.join(OUT_DIR, "test_CombineUtils", "combined_dir")
    rmpath(out_combined_dir)
    mkdir(out_combined_dir)

    obj = CombineRunner(combined_dir=out_combined_dir,
                        sample_name="mysample",
                        split_dirs=split_dirs,
                        ipq_opts=ipq_opts)
    obj.run()

    # Every combined output must have been produced.
    expected_out_fns = (obj.all_hq_fa, obj.all_hq_fq,
                        obj.all_lq_fa, obj.all_lq_fq,
                        obj.all_consensus_isoforms_fa,
                        obj.all_cluster_report_fn,
                        obj.all_cluster_summary_fn)
    self.assertTrue(all(op.exists(fn) for fn in expected_out_fns))

    def leading_names(reader):
        """Return the first space-delimited token of every record name."""
        return [rec.name.split(' ')[0] for rec in reader]

    expected_hq_isoforms = ['i1_HQ_mysample|c0/f2p16/1826',
                            'i2_HQ_mysample|c2/f9p14/2470',
                            'i2_HQ_mysample|c5/f7p19/2472',
                            'i2_HQ_mysample|c10/f8p16/2457',
                            'i2_HQ_mysample|c98/f2p10/2081',
                            'i2_HQ_mysample|c108/f23p28/2471']
    self.assertEqual(leading_names(FastaReader(obj.all_hq_fa)),
                     expected_hq_isoforms)
    self.assertEqual(leading_names(FastqReader(obj.all_hq_fq)),
                     expected_hq_isoforms)

    expected_lq_isoforms_num = 73
    self.assertEqual(len(list(FastaReader(obj.all_lq_fa))),
                     expected_lq_isoforms_num)

    expected_consensus_isoforms_num = 79
    self.assertEqual(len(list(FastaReader(obj.all_consensus_isoforms_fa))),
                     expected_consensus_isoforms_num)
def writeSummary(self, fa, summary_fn, hq_fa=None, lq_fa=None):
    """Tally isoform statistics into ``self.summary`` and write them out.

    For every record in ``fa`` the number of consensus isoforms is
    incremented and the record's sequence length is added to the total
    number of bases; the summary is then written to ``summary_fn``.

    If ``hq_fa`` (polished high-quality isoforms) is not None, the number of
    records it contains is reported as the polished HQ isoform count.
    If ``lq_fa`` (polished low-quality isoforms) is not None, the number of
    records it contains is reported as the polished LQ isoform count.

    A ZeroDivisionError raised along the way is logged and re-raised as a
    ClusterException("No consensus isoforms predicted.").
    """
    def _num_records(fasta_fn):
        """Count the records in a FASTA file."""
        with FastaReader(fasta_fn) as fasta:
            return sum(1 for _ in fasta)

    try:
        with FastaReader(fa) as consensus:
            for record in consensus:
                self.summary.numConsensusIsoforms += 1
                self.summary.numTotalBases += len(record.sequence)

        if hq_fa is not None:
            self.summary.num_polished_hq_isoforms = 0
            self.summary.num_polished_hq_isoforms += _num_records(hq_fa)

        if lq_fa is not None:
            self.summary.num_polished_lq_isoforms = 0
            self.summary.num_polished_lq_isoforms += _num_records(lq_fa)

        self.summary.write(summary_fn)
    except ZeroDivisionError:
        errMsg = "No consensus isoforms predicted."
        self.add_log(errMsg, level=logging.ERROR)
        raise ClusterException(errMsg)
def run(self):
    """Subset reads based on read annotation and subset rules.

    Reads ``self.inFN`` and keeps the records whose annotation (parsed from
    the record name) satisfies ``self.rules``.  Normally the kept records are
    written as FASTA to ``self.outFN``; when ``self.printReadLengthOnly`` is
    set, only the sequence length of each kept record is written, one per
    line.
    """
    infoMsg = "Extracting reads from {f} based on ".format(f=self.inFN)
    full_length = "true" if self.rules.FL != 0 else "false"
    non_chimeric = "true" if self.rules.nonChimeric != 0 else "false"
    infoMsg += "rules(FullLength={fl}, nonChimeric={nc}).".format(
        fl=full_length, nc=non_chimeric)
    logging.info(infoMsg)

    if self.printReadLengthOnly:
        # Lengths only: no read names, no sequences.
        with FastaReader(self.inFN) as reader, \
                open(self.outFN, 'w') as writer:
            for record in reader:
                annotation = ReadAnnotation.fromString(
                    record.name, self.ignore_polyA)
                if self.satisfy(annotation, self.rules):
                    writer.write("{rl}\n".format(rl=len(record.sequence)))
        return

    with FastaReader(self.inFN) as reader, \
            FastaWriter(self.outFN) as writer:
        for record in reader:
            annotation = ReadAnnotation.fromString(
                record.name, self.ignore_polyA)
            if self.satisfy(annotation, self.rules):
                writer.writeRecord(record.name, record.sequence)
def write_summary(self, summary_fn, isoforms_fa, hq_fa=None, lq_fa=None):
    """Build a ClusterSummary from FASTA files and write it to ``summary_fn``.

    The number of records in ``isoforms_fa`` is counted as the number of
    consensus isoforms, and their sequence lengths are summed as the total
    number of bases.

    If ``hq_fa`` (polished high-quality isoforms) is not None, the number of
    records it contains is reported as the polished HQ isoform count.
    If ``lq_fa`` (polished low-quality isoforms) is not None, the number of
    records it contains is reported as the polished LQ isoform count.

    A ZeroDivisionError raised along the way is logged and re-raised as a
    RuntimeError("No consensus isoforms predicted.").
    """
    def _num_records(fasta_fn):
        """Count the records in a FASTA file."""
        with FastaReader(fasta_fn) as fasta:
            return sum(1 for _ in fasta)

    self.add_log("Writing a summary to {f}".format(f=summary_fn),
                 level=logging.INFO)
    try:
        summary = ClusterSummary()

        with FastaReader(isoforms_fa) as isoforms:
            for record in isoforms:
                summary.numConsensusIsoforms += 1
                summary.numTotalBases += len(record.sequence)

        if hq_fa is not None:
            summary.num_polished_hq_isoforms = 0
            summary.num_polished_hq_isoforms += _num_records(hq_fa)

        if lq_fa is not None:
            summary.num_polished_lq_isoforms = 0
            summary.num_polished_lq_isoforms += _num_records(lq_fa)

        summary.write(summary_fn)
    except ZeroDivisionError:
        errMsg = "No consensus isoforms predicted."
        self.add_log(errMsg, level=logging.ERROR)
        raise RuntimeError(errMsg)
def __init__(self, isoseq_output_fn, reference_transcripts_fn,
             output_analysis_fn, min_true_positive, max_false_positive,
             min_seq_similarity, max_fuzzy_junction):
    """Load isoforms and reference transcripts, then align and filter them.

    Args:
        isoseq_output_fn: isoform file; must end in .fasta/.fa or .fastq/.fq.
            A FASTQ input is additionally converted to FASTA at
            ``output_analysis_fn + ".isoseq.fa"``.
        reference_transcripts_fn: FASTA file of reference transcripts.
        output_analysis_fn: output path; also used as the prefix of the
            converted FASTA file for FASTQ input.
        min_true_positive: stored as-is.
        max_false_positive: stored as-is.
        min_seq_similarity: values greater than 1 are treated as percentages
            and divided by 100; values <= 1 are stored unchanged.
        max_fuzzy_junction: stored as-is.

    Raises:
        ValueError: if ``isoseq_output_fn`` has an unsupported extension.
    """
    self.isoseq_output_fn = isoseq_output_fn
    self.reference_transcripts_fn = reference_transcripts_fn
    self.output_analysis_fn = output_analysis_fn

    if isoseq_output_fn.endswith((".fasta", ".fa")):
        self.isoforms = [r for r in FastaReader(isoseq_output_fn)]
        self.isoseq_output_fa = self.isoseq_output_fn
    elif isoseq_output_fn.endswith((".fastq", ".fq")):
        self.isoforms = [r for r in FastqReader(isoseq_output_fn)]
        # The FASTQ records are re-written as FASTA (qualities dropped).
        self.isoseq_output_fa = self.output_analysis_fn + ".isoseq.fa"
        with FastaWriter(self.isoseq_output_fa) as writer:
            for r in self.isoforms:
                writer.writeRecord(r.name, r.sequence)
    else:
        # Previously this fell through silently, leaving self.isoforms and
        # self.isoseq_output_fa unset and failing later with an unrelated
        # AttributeError.
        raise ValueError(
            "Unsupported isoseq output file %r: expected a .fasta, .fa, "
            ".fastq or .fq extension." % (isoseq_output_fn,))

    self.reference_transcripts = [
        r for r in FastaReader(reference_transcripts_fn)
    ]

    self.min_true_positive = min_true_positive
    self.max_false_positive = max_false_positive
    if min_seq_similarity <= 1:
        self.min_seq_similarity = min_seq_similarity
    else:
        # Given as a percentage; normalise to a fraction.
        self.min_seq_similarity = min_seq_similarity / 100.0
    self.max_fuzzy_junction = max_fuzzy_junction

    self.alns = self.filter_alns(
        self.map_isoforms_to_reference_transcripts())
def _first_fasta_sequence(fasta_fn):
    """Return the sequence of the first record in fasta_fn, or None if it has no records."""
    with FastaReader(fasta_fn) as reader:
        for record in reader:
            return record.sequence
    return None


def gconFunc(tp):
    """Build a consensus for one barcode directory; optionally polish it.

    A module-level function taking a single tuple (the original comment
    notes it is "called because multiprocess").

    Args:
        tp: ``(rootDir, barcode)``; the working directory is
            ``rootDir/barcode``.

    Returns:
        ``FastaRecord(barcode, sequence)`` built from the first record of the
        final consensus file (``q_consensus.fasta`` when quiver ran,
        otherwise ``g_consensus.fa``), or None when the gcon consensus is
        missing/empty or the final consensus file does not exist or has no
        records.
    """
    rootDir, barcode = tp
    bcdir = "/".join((rootDir, barcode))

    # NOTE(review): every external command below is built by string
    # interpolation and run with shell=True, so paths containing spaces or
    # shell metacharacters will break (or be interpreted by the shell).
    ## call gcon
    logging.info("In gconFunc for: %s" % barcode)
    cmd = "gcon.py r --min_cov 3 %s/subreads.fasta %s/seed_read.fasta -d %s" % \
        (bcdir, bcdir, bcdir)
    subprocess.call(cmd, shell=True)

    ## check to see if the file is empty
    # A file with no records at all used to raise IndexError here; it is now
    # treated the same as an empty first sequence.
    if not _first_fasta_sequence("%s/g_consensus.fa" % bcdir):
        return None

    ## check to see if we are going to run quiver
    if not runner.args.noQuiver:
        # setup the blasr / sam / quiver stuff.
        logging.info("Setup regions file, now running blasr through quiver.")
        cmds = [
            ('blasr %s %s/g_consensus.fa -nproc 1 -sam -regionTable %s/region.fofn -out ' +
             '%s/aligned_reads.sam') % (runner.args.inputFofn, bcdir, bcdir, bcdir),
            'samtoh5 %s/aligned_reads.sam %s/g_consensus.fa %s/aligned_reads.cmp.h5' %
            (bcdir, bcdir, bcdir),
            ('loadPulses %s %s/aligned_reads.cmp.h5 -byread -metrics ' +
             'QualityValue,InsertionQV,MergeQV,DeletionQV,DeletionTag,SubstitutionTag,' +
             'SubstitutionQV') % (runner.args.inputFofn, bcdir),
            'cmph5tools.py sort --inPlace %s/aligned_reads.cmp.h5' % bcdir,
            ('quiver -vv --algorithm quiver -p P4-C2.AllQVsMergingByChannelModel '
             '%s/aligned_reads.cmp.h5 --outputFilename %s/q_consensus.fasta ' +
             '--referenceFilename %s/g_consensus.fa') % (bcdir, bcdir, bcdir),
        ]
        for cmd in cmds:
            logging.debug(cmd)
            subprocess.call(cmd, shell=True)
        cFilename = 'q_consensus.fasta'
    else:
        cFilename = 'g_consensus.fa'

    ## append results to output file.
    bcCons = "%s/%s/%s" % (rootDir, barcode, cFilename)
    if not os.path.exists(bcCons):
        return None
    sequence = _first_fasta_sequence(bcCons)
    if sequence is None:
        # Consensus file exists but holds no records.
        return None
    return FastaRecord(barcode, sequence)
def __init__(self, file_name):
    """Open ``file_name`` with the reader matching its extension.

    The extension is compared case-insensitively:
      * .fa / .fasta -> FastaReader
      * .bam         -> openDataFile
      * .xml         -> openDataSet (either contigset.xml or
                        consensusreadset.xml)

    ``self._is_fasta`` is True for FASTA files and for XML datasets that
    open as a ContigSet; it is False otherwise.
    """
    self.file_name = file_name
    self._is_fasta = False
    extension = op.splitext(file_name)[1].upper()
    self.ext = extension

    if extension in (".FA", ".FASTA"):
        self._dataset = FastaReader(file_name)
        self._is_fasta = True
        return

    if extension == ".BAM":
        self._dataset = openDataFile(file_name)
        return

    # either contigset.xml or consensusreadset.xml
    assert extension == ".XML"
    self._dataset = openDataSet(file_name)
    if isinstance(self._dataset, ContigSet):
        self._is_fasta = True
def create_chimeras(input_file, output=None, reference_file=None,
                    alignment_file=None):
    """Build chimeric sequences from ``input_file`` and write them as FASTA.

    The alignments in ``alignment_file`` are grouped by locus and filtered;
    the resulting groups, together with the records of ``input_file``, are
    handed to ``_create_chimeras`` and the result is written with
    ``write_fasta``.

    Args:
        input_file: FASTA file of input sequences.
        output: output FASTA path.  Defaults to ``input_file`` with its
            extension replaced by ``.chimeras.fasta``.
        reference_file: reference to align ``input_file`` against when no
            ``alignment_file`` is supplied.
        alignment_file: existing alignment file; takes precedence over
            ``reference_file`` when both are given.

    Returns:
        The path of the output file.

    Raises:
        IOError: if neither ``alignment_file`` nor ``reference_file`` is
            given.
    """
    import os

    # Check the input files, and align the input file if needed
    if alignment_file is None:
        if not reference_file:
            msg = "create_chimeras requires either an Alignment or a Reference!"
            log.error(msg)
            raise IOError(msg)
        alignment_file = align_best_reference(input_file, reference_file)

    # Set the output file if not specified
    if output is None:
        # splitext only strips a real extension of the final path component,
        # so extension-less names and dotted directory names are handled.
        basename = os.path.splitext(input_file)[0]
        output = '%s.chimeras.fasta' % basename

    # Parse the alignment data and extract the target sequences
    alignments = list(BlasrReader(alignment_file))
    groups = _group_by_locus(alignments)
    groups = _filter_groups(groups)
    sequences = list(FastaReader(input_file))
    chimeras = list(_create_chimeras(groups, sequences))
    write_fasta(chimeras, output)
    return output
def makeBarcodeH5FromBasH5(basH5):
    """The workhorse function for creating a barcode H5 file from a base H5 file.

    Labels the sequencing ZMWs of ``basH5`` (all of them when
    ``runner.args.nZmws`` is negative, otherwise the first ``nZmws``) with a
    BarcodeScorer configured from ``runner.args``, writes the result under
    ``runner.args.outDir`` and returns the path of the written file.
    """
    opts = runner.args

    labeler = BarcodeScorer(basH5,
                            FastaReader(opts.barcodeFile),
                            opts.adapterSidePad,
                            opts.insertSidePad,
                            scoreMode=opts.scoreMode,
                            maxHits=opts.maxAdapters,
                            scoreFirst=opts.scoreFirst,
                            startTimeCutoff=opts.startTimeCutoff)

    zmws = (basH5.sequencingZmws if opts.nZmws < 0
            else basH5.sequencingZmws[0:opts.nZmws])

    logging.debug("Labeling %d ZMWs from: %s" % (len(zmws), basH5.filename))
    labeledZmws = labeler.labelZmws(zmws)
    logging.debug("Labeled %d ZMWs" % len(labeledZmws))

    # Output name: the input's base name with the bas/pls suffix swapped for
    # the barcode extension.
    inBase = os.path.basename(basH5.filename)
    outBase = re.sub(BAS_PLS_REGEX, BARCODE_EXT, inBase)
    outFile = '/'.join((opts.outDir, outBase))
    logging.debug("Writing to: %s" % outFile)

    writeBarcodeH5(labeledZmws, labeler, outFile, opts.saveExtendedInfo)
    return outFile
def fasta_to_plot_group(fasta_file, output_dir):
    """Plot a histogram of sequence lengths and wrap it in a PlotGroup.

    The histogram of the record lengths in ``fasta_file`` is saved as
    ``sequence_length_hist.png`` inside ``output_dir``; the returned
    PlotGroup references the image by file name only.
    """
    with FastaReader(fasta_file) as reader:
        lengths = [len(rec.sequence) for rec in reader]

    from pbreports.plot.helper import get_fig_axes #pylint: disable=import-error
    from pbcommand.models.report import PlotGroup, Plot

    fig, ax = get_fig_axes()

    hist_kwargs = {}
    if len(lengths) == 1:
        # A single value gets an explicit range of +/- 1 around it.
        only = lengths[0]
        hist_kwargs["range"] = (only - 1, only + 1)
    ax.hist(lengths, **hist_kwargs)

    ax.set_title("Sequence Length Histogram")
    ax.set_xlabel("Sequence Length")

    name = "sequence_length_hist.png"
    fig.savefig(os.path.join(output_dir, name))

    return PlotGroup("reference_hist", "Sequence Lengths",
                     plots=[Plot("sequence_lengths", name)])
def _extract_sequences(project, contigs):
    """Yield the final AmpliconAssembly records whose name is in ``contigs``.

    Records are read from
    ``<project>/results/AmpliconAssembly/Final_Sequences.fasta``; a record is
    selected when the first whitespace-delimited token of its name is in
    ``contigs``.
    """
    results_dir = os.path.join(project, 'results', 'AmpliconAssembly')
    sequence_file = os.path.join(results_dir, 'Final_Sequences.fasta')
    for rec in FastaReader(sequence_file):
        if rec.name.split()[0] in contigs:
            yield rec
def run_after(self, rtc, output_dir):
    """Check the outputs of gathering polished isoforms in each bin.

    Asserts that the task's first output file exists, that every HQ and LQ
    isoform file exists in each bin's ``cluster_out`` directory, that the
    first LQ file contains at least one record, and that each bin has a
    submitted-quiver-jobs log.
    """
    self.assertTrue(op.exists(rtc.task.output_files[0]))

    out_dir = op.join(OUT_DIR, "test_gather_polished_isoforms_in_each_bin")
    cluster_out_dirs = [op.join(out_dir, bin_name, "cluster_out")
                        for bin_name in BIN_NAMES]

    def files_in_each_bin(file_names):
        """Join every name in file_names onto every cluster_out directory."""
        return [op.join(cluster_dir, file_name)
                for cluster_dir in cluster_out_dirs
                for file_name in file_names]

    out_hq_fns = files_in_each_bin(HQ_ISOFORMS_FNS)
    print("out_hq_fns %s" % out_hq_fns)
    self.assertTrue(all(op.exists(fn) for fn in out_hq_fns))

    out_lq_fns = files_in_each_bin(LQ_ISOFORMS_FNS)
    print("out_lq_fns %s" % out_lq_fns)
    self.assertTrue(all(op.exists(fn) for fn in out_lq_fns))

    print("out_lq_fa %s is not empty" % out_lq_fns[0])
    n = len(list(FastaReader(out_lq_fns[0])))
    self.assertTrue(n > 0)

    out_logs = [IceFiles(prog_name="", root_dir=cluster_dir).submitted_quiver_jobs_log
                for cluster_dir in cluster_out_dirs]
    print("out_logs %s" % out_logs)
    self.assertTrue(all(op.exists(fn) for fn in out_logs))
def main():
    """Count occurrences of the site AAGCTT in each half of every contig.

    For each record of the ``--assembly`` FASTA, prints the record id, the
    number of (non-overlapping) matches starting before the midpoint of the
    sequence, and the number starting at or after it, separated by spaces.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("-a", "--assembly", help="assembled contigs")
    args = parser.parse_args()

    site = 'AAGCTT'
    for record in FastaReader(args.assembly):
        contig_id, seq = record.id, str(record.sequence)
        # Floor division: the midpoint is an integer position.
        midpoint = len(seq) // 2
        starts = [m.start(0) for m in re.finditer(site, seq)]
        left_count = sum(1 for start in starts if start < midpoint)
        rigt_count = len(starts) - left_count
        print("%s %s %s" % (contig_id, left_count, rigt_count))
def run_fasta_filter(fasta_in, fasta_out, min_seq_length):
    """Copy records strictly longer than ``min_seq_length`` to ``fasta_out``.

    Records whose sequence length is less than or equal to
    ``min_seq_length`` are dropped.  Always returns 0.
    """
    with FastaWriter(fasta_out) as writer, FastaReader(fasta_in) as reader:
        for rec in reader:
            if len(rec.sequence) <= min_seq_length:
                continue
            writer.writeRecord(rec)
    return 0
def testSplit(self):
    """Test FastaSplitter.split().

    The concatenation of the split files, in ``out_fns`` order, must equal
    the 22 reads of the input file.
    """
    def load(fasta_fn):
        """Return the (name, sequence) pairs of a FASTA file."""
        with FastaReader(fasta_fn) as reader:
            return [(rec.name, rec.sequence) for rec in reader]

    fs = FastaSplitter(self.input_fasta, 2, self.out_dir,
                       "testFastaSplitter_split_")
    fs.split()

    splittedReads = []
    for out_fn in fs.out_fns:
        self.assertTrue(op.exists(out_fn))
        splittedReads.extend(load(out_fn))

    # Clean up the split files before comparing.
    fs.rmOutFNs()

    reads = load(self.input_fasta)
    self.assertTrue(len(reads) == 22)
    self.assertTrue(splittedReads == reads)
def isValidFasta(filename):
    """Return True if ``filename`` can be read end-to-end as FASTA.

    The file must first pass the ``isValidFile`` and ``isFastaFile`` checks;
    it is then parsed completely with FastaReader.  Any ordinary exception
    raised while opening or parsing makes the result False.
    """
    if not isValidFile(filename) or not isFastaFile(filename):
        return False
    try:
        # Iterate without materialising the records, and close the reader
        # when done.
        with FastaReader(filename) as reader:
            for _ in reader:
                pass
    except Exception:
        # ``Exception`` rather than a bare ``except`` so that
        # KeyboardInterrupt / SystemExit are not swallowed.
        return False
    return True
def get_the_only_fasta_record(fa):
    """Return the single record of fasta file ``fa``.

    Raises ValueError if the file does not contain exactly one record.
    """
    records = list(FastaReader(fa))
    if len(records) == 1:
        return records[0]
    raise ValueError(
        "Cluster fasta file {fa} must contain only one read.".format(fa=fa))
def test_readFasta(self):
    """The test FASTA has 48 entries; check the first header and sequence."""
    expected_first_sequence = (
        "TTTCTTCCAGTTTGCCAAGGCACGAGTAACAAGCTCACGCAGTTGGGCACTTT"
        "TGAAGATCATTTTCTCAGCCTCCAGAGGATGTTCAATAACTGTGAGGTGGTCC"
        "TTGGGAATTTGGAAATTACCTATGTGCAGAGGAATTATGATCTTTCCTTCTTA"
        "AAGGTTGGTGACTTTGATTTTCCT")

    entries = list(FastaReader(data.getFasta()))
    assert 48 == len(entries)

    first = entries[0]
    assert "ref000001|EGFR_Exon_2" == first.header
    assert expected_first_sequence == first.sequence
def _findCliques(self, alignGraph, readsFa):
    """
    Find all mutually exclusive cliques within the graph.

    alignGraph - a graph, each node represent a read and each edge
                 represents an alignment between two end points.
                 NOTE: the graph is modified in place -- the nodes of every
                 clique found are removed from it.
    readsFa    - FASTA file of reads; reads that ended up in no clique are
                 appended as cliques of size 1.

    Return a dictionary of clique indices and nodes.
        key = index of a clique
        value = nodes within a clique
    Seed nodes are visited in order of decreasing degree (degrees taken from
    the graph before any removal), so clique indices follow that order;
    singleton cliques come last, in file order.
    """
    uc = {}       # To keep cliques found
    # Nodes within any clique.  A set, so that the membership test in the
    # final pass over all reads is O(1) rather than a list scan per read.
    used = set()
    ind = 0       # index of clique to discover

    # Tuples of (node, degree), sorted by degree, descendingly
    deg = sorted(alignGraph.degree().items(),
                 key=lambda x: x[1], reverse=True)
    for d in deg:
        node = d[0]  # node which has the largest degree in alignGraph
        if node not in alignGraph:
            # Already removed as part of an earlier clique.
            continue
        # just get the immediate neighbors since we're looking for perfect
        # cliques
        subGraph = alignGraph.subgraph([node] + alignGraph.neighbors(node))
        subNodes = subGraph.nodes()
        # Convert from networkx.Graph to a sparse matrix
        S, H = pClique.convert_graph_connectivity_to_sparse(
            subGraph, subNodes)
        # index of the 'node' in the sub-graph
        seed_i = subNodes.index(node)
        # Grasp a clique from subGraph, and return indices of clique nodes
        # setting gamma=0.8 means to find quasi-0.8-cliques!
        tQ = pClique.grasp(S, H, gamma=0.8, maxitr=5,
                           given_starting_node=seed_i)
        if len(tQ) > 0:
            c = [subNodes[i] for i in tQ]  # nodes in the clique
            uc[ind] = c                    # Add the clique to uc
            ind += 1
            used.update(c)                 # Add clique nodes to used
            # Remove clique nodes from alignGraph and continue
            alignGraph.remove_nodes_from(c)

    # Every read not covered by a clique becomes its own clique.
    with FastaReader(readsFa) as reader:
        for r in reader:
            rid = r.name.split()[0]
            if rid not in used:
                uc[ind] = [rid]
                ind += 1
    return uc
def get_fasta_stats(fasta, genome_size):
    """Calculate basic fasta stats.

    Args:
        fasta: path of the FASTA file to summarise.
        genome_size: expected genome size, or None.  When given, the NG50 /
            NG90 / NG95 statistics are added to the result.

    Returns:
        dict with keys asm_contigs, asm_total_bp, asm_esize, asm_min,
        asm_max, asm_mean, asm_median, asm_n50, asm_n90, asm_n95, plus
        asm_ng50, asm_ng90, asm_ng95 when ``genome_size`` is not None.  An
        N*/NG* value is None when the threshold is never reached (e.g. the
        assembly is much smaller than ``genome_size``).

    Raises:
        ValueError: if the FASTA file contains no records.
    """
    lengths = sorted((len(record.sequence) for record in FastaReader(fasta)),
                     reverse=True)
    if not lengths:
        # Previously surfaced as an opaque IndexError further down.
        raise ValueError("Fasta file %s contains no records." % (fasta,))

    asm_contigs = len(lengths)
    asm_total_bp = sum(lengths)

    def get_nstat(lens, stat, genome_size=None):
        """Calculate all N* stats.

        ``lens`` must be sorted in descending order.  ``stat`` is the
        fraction of the total that may remain uncovered, so N50 uses 0.50,
        N90 uses 0.10 and N95 uses 0.05.
        """
        total = genome_size if genome_size is not None else sum(lens)
        limit = total * stat
        for num in lens:
            total -= num
            if total <= limit:
                return num
        return None

    asm_n50 = get_nstat(lengths, 0.50)
    asm_n90 = get_nstat(lengths, 0.10)
    asm_n95 = get_nstat(lengths, 0.05)

    asm_min = lengths[-1]
    asm_max = lengths[0]
    # Floor division keeps the integer results the original Python 2 code
    # produced with ``/`` on ints.
    asm_mean = asm_total_bp // asm_contigs

    # Median: the middle element for an odd count, the (floored) mean of the
    # two middle elements for an even count.  The previous code added the
    # same element to itself, so even counts reported only one of the pair.
    mid = asm_contigs // 2
    if asm_contigs % 2:
        asm_median = lengths[mid]
    else:
        asm_median = (lengths[mid - 1] + lengths[mid]) // 2

    asm_esize = sum([x * x for x in lengths]) // asm_total_bp

    fasta_stats = {
        'asm_contigs': asm_contigs,
        'asm_total_bp': asm_total_bp,
        'asm_esize': asm_esize,
        'asm_min': asm_min,
        'asm_max': asm_max,
        'asm_mean': asm_mean,
        'asm_median': asm_median,
        'asm_n50': asm_n50,
        'asm_n90': asm_n90,
        'asm_n95': asm_n95
    }

    if genome_size is not None:
        fasta_stats.update({
            'asm_ng50': get_nstat(lengths, 0.50, genome_size),
            'asm_ng90': get_nstat(lengths, 0.10, genome_size),
            'asm_ng95': get_nstat(lengths, 0.05, genome_size)
        })

    return fasta_stats
def fasta_movie_counts( fasta ):
    """Count the records of ``fasta`` in total and per name prefix.

    The prefix ("movie") of a record is the part of its name before the
    first underscore.  The returned dict maps each prefix to its record
    count, plus the key 'all' holding the total.

    NOTE(review): a record whose prefix is literally "all" is counted into
    the same 'all' entry as the total -- confirm such names cannot occur.
    """
    counts = {'all': 0}
    for record in FastaReader( fasta ):
        movie = record.name.split('_')[0]
        counts['all'] += 1
        # dict.get instead of a bare ``except`` around the increment, which
        # would also have swallowed KeyboardInterrupt and reset the count.
        counts[movie] = counts.get(movie, 0) + 1
    return counts
def _createEntryFromFile(self):
    """Populate this entry's id, info and contigs from the file at ``self._path``.

    The id is the file's base name without its extension.  One
    ReferenceContig is created per FASTA record, with both its name and id
    taken from the record's tag.
    """
    file_base = os.path.basename(self._path)
    self._id = os.path.splitext(file_base)[0]

    self._info = ReferenceInfo(self)
    self._info._file = self._path

    self._contigs = []
    for record in FastaReader(self._path):
        tag = record.getTag()
        contig = ReferenceContig(self)
        contig._name = tag
        contig._id = contig._name
        self._contigs.append(contig)
def main():
    """Cut contigs at breakpoints and write the pieces plus their lengths.

    Command-line options:
        -b/--breakpoint  breakpoint file.  A line with a single token names
                         the current contig; any other non-blank line gives
                         ``start end`` (first two tokens) for a piece of the
                         current contig.
        -a/--assembly    FASTA file of contigs.  The last 10 characters of
                         every sequence are dropped before slicing.
        -o/--outfile     FASTA file the pieces are written to, each named
                         ``<contig>_<start>_<end>``.
        -l/--lenfile     tab-separated ``<piece name>\\t<length>`` table.

    Raises:
        ValueError: if a ``start end`` line appears before any contig line.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("-b", "--breakpoint", help="file containing breakpoints")
    parser.add_argument("-a", "--assembly", help="fasta file containing contigs")
    parser.add_argument("-o", "--outfile", help="new assembly file")
    parser.add_argument("-l", "--lenfile", help="length of contigs")
    args = parser.parse_args()

    id2seq = {}
    with FastaReader(args.assembly) as reader:
        for record in reader:
            id2seq[record.id] = record.sequence[0:-10]

    new_seq = {}
    lenmap = {}
    curr_contig = None
    seq = None
    with open(args.breakpoint, 'r') as breakpoints:
        for line in breakpoints:
            attrs = line.split()
            if not attrs:
                # Blank line: nothing to do.
                continue
            if len(attrs) == 1:
                curr_contig = attrs[0]
                seq = id2seq[curr_contig]
                continue
            if curr_contig is None:
                raise ValueError(
                    "Breakpoint range %r appears before any contig name."
                    % (line,))
            # int() promotes to long automatically on Python 2.
            start = int(attrs[0])
            end = int(attrs[1])
            new_id = curr_contig + '_' + attrs[0] + '_' + attrs[1]
            new_seq[new_id] = seq[start:end]
            # NOTE(review): the slice above holds end - start bases while the
            # recorded length is end - start + 1 -- confirm whether the
            # breakpoint coordinates are meant to be inclusive.
            lenmap[new_id] = end - start + 1

    # The parser defines --outfile; the previous ``args.scaffold`` does not
    # exist and raised AttributeError.
    with FastaWriter(args.outfile) as writer:
        for key in new_seq:
            writer.writeRecord(key, new_seq[key])

    with open(args.lenfile, 'w') as lenfile:
        for key in lenmap:
            lenfile.write(key + "\t" + str(lenmap[key]) + '\n')
def fasta_to_report(fasta_file, output_json):
    """Build a "fasta_report" Report for ``fasta_file``.

    The report carries one attribute, ``num_records`` (the number of records
    in the file), and the plot groups returned by
    ``try_fasta_to_plot_group(fasta_file, output_json)``.
    """
    with FastaReader(fasta_file) as reader:
        nrecords = sum(1 for _ in reader)

    attr = Attribute("num_records", nrecords, "Number of Records")
    plot_groups = try_fasta_to_plot_group(fasta_file, output_json)
    return Report("fasta_report",
                  attributes=[attr],
                  plotgroups=plot_groups)
def get_fasta_readlengths(fasta_file):
    """
    Get the sequence lengths of a FASTA file, sorted in ascending order

    :return: (list) sorted sequence lengths
    """
    with FastaReader(fasta_file) as reader:
        return sorted(len(rec.sequence) for rec in reader)
def test_dosLineEndingsFasta(self):
    """FastaReader and IndexedFastaReader must agree on a DOS-formatted file."""
    plain_entries = list(FastaReader(data.getDosFormattedFasta()))
    indexed_entries = list(IndexedFastaReader(data.getDosFormattedFasta()))

    assert_equal(len(plain_entries), len(indexed_entries))
    for plain, indexed in zip(plain_entries, indexed_entries):
        assert_equal(plain.header, indexed.header)
        # The indexed sequence is sliced in full before comparing.
        assert_equal(plain.sequence, indexed.sequence[:])
def test_readFasta(self):
    """The test FASTA has 48 entries; check the first name, sequence and md5."""
    expected_first_sequence = (
        "TTTCTTCCAGTTTGCCAAGGCACGAGTAACAAGCTCACGCAGTTGGGCACTTT"
        "TGAAGATCATTTTCTCAGCCTCCAGAGGATGTTCAATAACTGTGAGGTGGTCC"
        "TTGGGAATTTGGAAATTACCTATGTGCAGAGGAATTATGATCTTTCCTTCTTA"
        "AAGGTTGGTGACTTTGATTTTCCT")

    entries = list(FastaReader(data.getFasta()))
    assert_equal(48, len(entries))

    first = entries[0]
    assert_equal("ref000001|EGFR_Exon_2", first.name)
    assert_equal(expected_first_sequence, first.sequence)
    assert_equal("e3912e9ceacd6538ede8c1b2adda7423", first.md5)
def __len__(self):
    """Return the number of records in the wrapped input.

    * non-FASTA input: ``len()`` of the underlying dataset;
    * .fa / .fasta file: the file is re-read from ``self.file_name`` and its
      records counted;
    * any other FASTA-like input (a contigset): the records of every
      resource reader of the dataset are counted.
    """
    if not self._is_fasta:
        return len(self._dataset)

    if self.ext in (".FA", ".FASTA"):
        # A separate reader is opened just for counting; close it when done
        # instead of leaking the file handle, and count without building a
        # list of all records.
        with FastaReader(self.file_name) as reader:
            return sum(1 for _ in reader)

    # contigset
    return sum(sum(1 for _ in rr) for rr in self._dataset.resourceReaders())
def get_subset_reads(fasta_fn, cluster_dict, cluster_index, out_file_name):
    """Write the reads of one cluster to ``out_file_name`` as FASTA.

    A record of ``fasta_fn`` is written (name and sequence unchanged) when
    its name is in ``cluster_dict[cluster_index]``.

    NOTE(review): the previous code computed an uppercased copy of each
    sequence but never used it; the sequence is still written as-is --
    confirm that uppercasing was not intended.
    """
    # Both files are closed on exit, including on error.
    with FastaReader(fasta_fn) as reader, \
            open(out_file_name, "w") as out_f:
        for r in reader:
            if r.name in cluster_dict[cluster_index]:
                out_f.write(">" + r.name + "\n")
                out_f.write(r.sequence + "\n")
class CCSInput(object): """ Wrapper class for handling multiple formats specifying CCS sequences. The old convention was to use .fasta, but we would like to be able to pass the classifier a ConsensusReadSet (i.e. .bam files) instead for use within pbsmrtpipe. """ def __init__(self, file_name): self.file_name = file_name self._is_fasta = False self.ext = op.splitext(file_name)[1].upper() if self.ext in [".FA", ".FASTA"]: self._dataset = FastaReader(file_name) self._is_fasta = True elif self.ext == ".BAM": self._dataset = openDataFile(file_name) else: # either contigset.xml or consensusreadset.xml assert self.ext == ".XML" self._dataset = openDataSet(file_name) if isinstance(self._dataset, ContigSet): self._is_fasta = True def __iter__(self): for rec in self._dataset: if not self._is_fasta: rec = CCSBamSequence(rec.peer) yield rec def close(self): """Close all datasets.""" self._dataset.close() def __enter__(self): return self def __exit__(self, exc_type, exc_value, traceback): self.close() def __len__(self): if not self._is_fasta: return len(self._dataset) else: if self.ext in [".FA", ".FASTA"]: return len([r for r in FastaReader(self.file_name)]) else: # contigset n = 0 for rr in self._dataset.resourceReaders(): n += len([r for r in rr]) return n def __delitem__(self, dummy_name): raise NotImplementedError("%s.%s" % (self.__class__.__name__, "__delitem__")) def __setitem__(self, dummy_index, dummy_name): raise NotImplementedError("%s.%s" % (self.__class__.__name__, "__setitem__")) def __getitem__(self, key): raise NotImplementedError("%s.%s" % (self.__class__.__name__, "__getitem__"))
#! /usr/bin/env python
"""Extract the longest N-free stretch of a FASTA record.

Usage: <script> input.fasta

For every record of the input, the length of its longest run of bases
containing no 'N' is printed.  The longest run of the LAST record is then
written (bare sequence, no header, no trailing newline) to
``human_chr14.fa`` in the current directory.
"""
import sys

from pbcore.io import FastaReader

# Stays empty when the input has no records (previously a NameError).
longest_segment = ""

with FastaReader(sys.argv[1]) as reader:
    for record in reader:
        # max() returns the first of equally long segments, matching the
        # strict ``>`` comparison of the hand-written scan it replaces.
        longest_segment = max(record.sequence.split('N'), key=len)
        print(len(longest_segment))

with open("human_chr14.fa", "w") as out_file:
    out_file.write(longest_segment)