def __init__(self, fileName, rowClass=None, typeMap=None, defaultColType=None,
             columns=None, columnNameMapper=None, ignoreExtraCols=False,
             isRdb=False, inFh=None, allowEmpty=False, dialect=csv.excel_tab):
    """Open TSV file and read header into object.  Removes leading # from
    UCSC header.

    fileName - name of file, opened unless inFh is specified
    rowClass - class or class factory function to use for a row.  Must take
        TsvReader and list of string values of columns.
    typeMap - if specified, it maps column names to the type objects to use
        to convert the column.  Unspecified columns will not be converted.
        Key is the column name, value can be either a type or a tuple of
        (parseFunc, formatFunc).  If a type is used, str() is used to
        convert to a printable value.
    defaultColType - if specified, type of unspecified columns
    columns - if specified, the column names to use.  The header should not
        be in the file.
    columnNameMapper - function to map column names to the internal name.
    ignoreExtraCols - should extra columns be ignored?
    isRdb - file is an RDB file, ignore second row (type map still needed).
    inFh - if not None, this is used as the open file, rather than opening
        it.  Closed when the end of file is reached.
    allowEmpty - an empty input results in an EOF rather than an error.
        Should specify this if reading from a database query.
    dialect - a csv dialect object or name.
    """
    self.columns = []
    self.colMap = {}
    self.fileName = fileName
    self.lineNum = 0
    self.rowClass = rowClass if rowClass is not None else TsvRow
    self.columnNameMapper = columnNameMapper
    self.isRdb = isRdb
    self.colTypes = None
    self.ignoreExtraCols = ignoreExtraCols
    if inFh is not None:
        self.inFh = inFh
    else:
        self.inFh = fileOps.opengz(fileName, "r")
    try:
        self.reader = csv.reader(self.inFh, dialect=dialect)
        if columns:
            self.__setupColumns(columns)
        else:
            self.__readHeader(allowEmpty)
        self.__initColTypes(typeMap, defaultColType)
    except Exception:
        self.close()
        raise
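# Hedged usage sketch, not part of the original source: a minimal example of
# reading a typed TSV with this reader.  The import path, the file name
# "scores.tsv", its header columns (name, start, score), and column-attribute
# access via the default TsvRow are assumptions made for illustration.
from pycbio.tsv import TsvReader  # assumed import path

for row in TsvReader("scores.tsv", typeMap={"start": int, "score": float}):
    print("%s\t%d\t%g" % (row.name, row.start, row.score))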
def __init__(self, tabFile, rowClass=None, hashAreComments=False, skipBlankLines=False):
    self.inFh = fileOps.opengz(tabFile)
    self.csvRdr = csv.reader(self.inFh, dialect=csv.excel_tab)
    self.rowClass = rowClass
    self.hashAreComments = hashAreComments
    self.skipBlankLines = skipBlankLines
    self.lineNum = 0
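# Hedged usage sketch, not part of the original source: the enclosing class
# name is not shown in this excerpt; TabFileReader is assumed here, as is
# iteration yielding each row as a list of column strings.
rdr = TabFileReader("regions.tab", hashAreComments=True, skipBlankLines=True)
for row in rdr:
    print("\t".join(row))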
def __init__(self, gxfFile=None, gxfFh=None):
    self.gxfFile = gxfFile if gxfFile is not None else "<unknown>"
    self.openedFile = (gxfFh is None)
    self.fh = fileOps.opengz(gxfFile) if gxfFh is None else gxfFh
    self.line = None
    self.lineNum = 0
    # collect offsets if not compressed
    self.randomAccess = (getattr(self.fh, "tell", None) is not None)
    self.lineFileOffset = 0
    self.lineFileLength = 0  # includes \n for reading
    self.metas = []
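# Hedged sketch, not part of the original source: one way the offset fields
# above could be maintained while reading, so callers can seek back to a line
# when randomAccess is true.  _advance is a hypothetical helper for
# illustration, not the class's actual read method.
def _advance(self):
    self.lineFileOffset += self.lineFileLength  # start of the new current line
    self.line = self.fh.readline()
    self.lineFileLength = len(self.line)  # includes the trailing \n
    self.lineNum += 1
    return self.line if self.line != '' else None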
def __init__(self, asmReport):
    self.metaData = dict()  # metadata headers at the start of the file
    self.seqs = []
    self.bySequenceName = dict()
    self.byGenBankAccn = dict()
    self.byRefSeqAccn = dict()
    self.byUcscStyleName = dict()
    with fileOps.opengz(asmReport) as fh:
        self._parseMetaData(fh)
        self._skipToSeqTable(fh)
        self._parseRecords(fh)
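# Hedged usage sketch, not part of the original source: the enclosing class
# name is not shown; AssemblyReport is assumed, as are the record attribute
# names, which are inferred from the index dictionaries above.  The file name
# and accession are illustrative.
rpt = AssemblyReport("GCF_000001405.39_assembly_report.txt")
rec = rpt.byGenBankAccn.get("CM000663.2")
if rec is not None:
    print(rec.ucscStyleName)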
def parse(self, gff3File, gff3Fh=None):
    """Parse the GFF3 file and return a Gff3Set object.  If gff3File ends
    with .gz or .bz2, it will be decompressed.  If gff3Fh is specified,
    the already-opened stream is parsed instead, with gff3File used only
    for error messages.
    """
    self.fileName = gff3File
    self.lineNumber = 0
    fh = fileOps.opengz(gff3File, 'r') if gff3Fh is None else gff3Fh
    try:
        gff3Set = Gff3Set(self.fileName)
        for line in fh:
            self.lineNumber += 1
            self._parseLine(gff3Set, line[0:-1])
    finally:
        self.fileName = self.lineNumber = None
        if gff3Fh is None:
            fh.close()
    gff3Set.finish()
    return gff3Set
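# Hedged usage sketch, not part of the original source: Gff3Parser is assumed
# as the name of the enclosing class.  Note that when gff3Fh is passed, parse
# leaves closing the stream to the caller.
parser = Gff3Parser()
gff3Set = parser.parse("annot.gff3.gz")
with open("annot2.gff3") as fh:
    gff3Set2 = parser.parse("annot2.gff3", gff3Fh=fh)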
def write_fasta(path_or_handle, name, seq, chunk_size=100):
    """Writes out a fasta file.  If the path ends in .gz, it will be gzipped."""
    if isinstance(path_or_handle, str):
        fh = opengz(path_or_handle, 'w')
    else:
        fh = path_or_handle
    valid_chars = set(string.ascii_letters + "-")
    if not isinstance(seq, (str, unicode)):
        raise RuntimeError("Sequence is not unicode or string")
    bad_chars = {x for x in seq if x not in valid_chars}
    if bad_chars:
        raise RuntimeError("Invalid FASTA character(s) seen in fasta sequence: {}".format(bad_chars))
    fh.write(">%s\n" % name)
    for i in xrange(0, len(seq), chunk_size):
        fh.write("%s\n" % seq[i:i + chunk_size])
    if isinstance(path_or_handle, str):
        fh.close()
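# Usage sketch, not part of the original source: writing a single record; a
# ".gz" suffix makes opengz compress the output transparently.
write_fasta("example.fa.gz", "frag1", "ACGT" * 60, chunk_size=80)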
def read_fasta(path_or_handle, validate='DNA'):
    """Iteratively yields a (name, sequence) tuple for each '>' record it
    encounters, ignoring '#' comment lines.  If validate is 'DNA' or
    'protein', ensures that each row contains only valid characters for
    that alphabet.
    """
    assert validate in ['DNA', 'protein', None], "Valid options for validate are DNA, protein or None"
    if isinstance(path_or_handle, str):
        fh = opengz(path_or_handle)
    else:
        fh = path_or_handle
    line = fh.readline()
    chars_to_remove = "\n "
    if validate == 'DNA':
        valid_chars = set('ACGTUYSWKMBDHVNacgtuyswkmbdhvn.-*')
    elif validate == 'protein':
        valid_chars = set('ABCDEFGHIKLMPQSRTVWXYZUabcdefghiklmpqsrtvwxyzuNn.-*')
    else:
        valid_chars = set()
    while line != '':
        if line[0] == '>':
            name = line[1:-1]
            line = fh.readline()
            seq = array.array('c')
            while line != '' and line[0] != '>':
                line = line.translate(None, chars_to_remove)
                if len(line) > 0 and line[0] != '#':
                    seq.extend(line)
                line = fh.readline()
            if validate is not None:
                bad_chars = {x for x in seq if x not in valid_chars}
                if bad_chars:
                    raise RuntimeError("Invalid FASTA character(s) seen in fasta sequence: {}".format(bad_chars))
            yield name, seq.tostring()
        else:
            line = fh.readline()
    if isinstance(path_or_handle, str):
        fh.close()
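# Usage sketch, not part of the original source: reading back the record
# written by the write_fasta example above; validation raises RuntimeError on
# any character outside the chosen alphabet.
for name, seq in read_fasta("example.fa.gz", validate='DNA'):
    print("%s\t%d" % (name, len(seq)))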
def __init__(self, fileName):
    self.fh = fileOps.opengz(fileName)
def __init__(self, fileName):
    self.fh = None  # required for __del__ if open fails
    self.fh = fileOps.opengz(fileName)
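# Hedged sketch, not part of the original source: the reason self.fh is preset
# to None above is that if opengz raises, __del__ may still run and must not
# hit an AttributeError.  A __del__ written against that pattern might look
# like this (hypothetical; the class's actual __del__ is not shown here):
def __del__(self):
    if self.fh is not None:
        self.fh.close()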
def __init__(self, agpFile):
    self.agpFile = agpFile
    self.agpVersion = None
    self.recs = []
    with fileOps.opengz(agpFile) as fh:
        self._parseLines(fh)
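# Hedged usage sketch, not part of the original source: the enclosing class
# name is not shown; Agp is assumed.  agpVersion is presumably filled in by
# _parseLines from the file's version pragma.
agp = Agp("assembly.agp.gz")
print(agp.agpVersion)
for rec in agp.recs:
    print(rec)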