class CCSInput(object): """ Wrapper class for handling multiple formats specifying CCS sequences. The old convention was to use .fasta, but we would like to be able to pass the classifier a ConsensusReadSet (i.e. .bam files) instead for use within pbsmrtpipe. """ def __init__(self, file_name): self.file_name = file_name self._is_fasta = False self.ext = op.splitext(file_name)[1].upper() if self.ext in [".FA", ".FASTA"]: self._dataset = FastaReader(file_name) self._is_fasta = True elif self.ext == ".BAM": self._dataset = openDataFile(file_name) else: # either contigset.xml or consensusreadset.xml assert self.ext == ".XML" self._dataset = openDataSet(file_name) if isinstance(self._dataset, ContigSet): self._is_fasta = True def __iter__(self): for rec in self._dataset: if not self._is_fasta: rec = CCSBamSequence(rec.peer) yield rec def close(self): """Close all datasets.""" self._dataset.close() def __enter__(self): return self def __exit__(self, exc_type, exc_value, traceback): self.close() def __len__(self): if not self._is_fasta: return len(self._dataset) else: if self.ext in [".FA", ".FASTA"]: return len([r for r in FastaReader(self.file_name)]) else: # contigset n = 0 for rr in self._dataset.resourceReaders(): n += len([r for r in rr]) return n def __delitem__(self, dummy_name): raise NotImplementedError("%s.%s" % (self.__class__.__name__, "__delitem__")) def __setitem__(self, dummy_index, dummy_name): raise NotImplementedError("%s.%s" % (self.__class__.__name__, "__setitem__")) def __getitem__(self, key): raise NotImplementedError("%s.%s" % (self.__class__.__name__, "__getitem__"))
#! /usr/bin/env python
# Write the longest run of non-'N' bases found in a FASTA record to
# "human_chr14.fa" and print that run's length.
#
# Usage: <this script> input.fasta
#
# Only the last record of the input file is examined.
import sys

from pbcore.io import FastaReader

f = FastaReader(sys.argv[1])
try:
    # Walk through the file, keeping only the final record.
    last_record = None
    for seq in f:
        last_record = seq
    if last_record is None:
        sys.exit("No FASTA records found in %s" % sys.argv[1])

    # Splitting on 'N' yields the segments between N bases.  The strict '>'
    # keeps the first segment when several share the longest length.
    # (Deliberately not named list/max/chr so the builtins stay usable.)
    max_len = 0
    max_seq = ""
    for sec in last_record.sequence.split('N'):
        if len(sec) > max_len:
            max_len = len(sec)
            max_seq = sec

    # Parenthesised form prints the same under Python 2 and Python 3.
    print(len(max_seq))

    # The bare sequence is written: no FASTA header line, no trailing newline.
    with open("human_chr14.fa", "w") as wf:
        wf.write(max_seq)
finally:
    # Release the reader even if splitting or writing fails.
    f.close()
#! /usr/bin/env python
# Write the longest run of non-'N' bases found in a FASTA record to
# "human_chr14.fa" and print that run's length.
#
# Usage: <this script> input.fasta
#
# Only the last record of the input file is examined.
import sys

from pbcore.io import FastaReader

f = FastaReader(sys.argv[1])
try:
    # Walk through the file, keeping only the final record.
    last_record = None
    for seq in f:
        last_record = seq
    if last_record is None:
        sys.exit("No FASTA records found in %s" % sys.argv[1])

    # Splitting on 'N' yields the segments between N bases.  The strict '>'
    # keeps the first segment when several share the longest length.
    # (Deliberately not named list/max/chr so the builtins stay usable.)
    max_len = 0
    max_seq = ""
    for sec in last_record.sequence.split('N'):
        if len(sec) > max_len:
            max_len = len(sec)
            max_seq = sec

    # Parenthesised form prints the same under Python 2 and Python 3.
    print(len(max_seq))

    # The bare sequence is written: no FASTA header line, no trailing newline.
    with open("human_chr14.fa", "w") as wf:
        wf.write(max_seq)
finally:
    # Release the reader even if splitting or writing fails.
    f.close()