Example #1
    def __init__(self,
                 fileName,
                 rowClass=None,
                 typeMap=None,
                 defaultColType=None,
                 columns=None,
                 columnNameMapper=None,
                 ignoreExtraCols=False,
                 isRdb=False,
                 inFh=None,
                 allowEmpty=False,
                 dialect=csv.excel_tab):
        """Open TSV file and read header into object.  Removes leading # from
        UCSC header.

        fileName - name of file, opened unless inFh is specified
        rowClass - class or class factory function to use for a row. Must take
            TsvReader and list of string values of columns.
        typeMap - if specified, it maps column names to the type objects to
            use to convert the column.  Unspecified columns will not be
            converted. Key is the column name, value can be either a type
            or a tuple of (parseFunc, formatFunc).  If a type is used,
            str() is used to convert to a printable value.
        defaultColType - if specified, type of unspecified columns
        columns - if specified, the column names to use.  The header
            should not be in the file.
        columnNameMapper - function to map column names to the internal name.
        ignoreExtraCols - should extra columns be ignored?
        isRdb - file is an RDB file, ignore second row (type map still needed).
        inFh - If not None, this is used as the open file, rather than
          opening it.  Closed when the end of file is reached.
        allowEmpty - an empty input results in an EOF rather than an error.
          Should specify this if reading from a database query.
        dialect - a csv dialect object or name.
        """
        self.columns = []
        self.colMap = {}
        self.fileName = fileName
        self.lineNum = 0
        self.rowClass = rowClass
        if rowClass is None:
            self.rowClass = TsvRow
        self.columnNameMapper = columnNameMapper
        self.isRdb = isRdb
        self.colTypes = None
        self.ignoreExtraCols = ignoreExtraCols
        if inFh is not None:
            self.inFh = inFh
        else:
            self.inFh = fileOps.opengz(fileName, "r")
        try:
            self.reader = csv.reader(self.inFh, dialect=dialect)
            if columns:
                self.__setupColumns(columns)
            else:
                self.__readHeader(allowEmpty)
            self.__initColTypes(typeMap, defaultColType)
        except Exception:
            self.close()
            raise
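
A minimal usage sketch for this reader, assuming the class is named TsvReader (as the docstring suggests) and that rows expose columns by name; the file name, column names, and converters below are hypothetical, chosen only to illustrate the typeMap forms described above:

# hypothetical TSV with header columns: chrom  txStart  txEnd  score
typeMap = {"txStart": int,
           "txEnd": int,
           "score": (float, str)}  # (parseFunc, formatFunc) tuple form
reader = TsvReader("genes.tsv", typeMap=typeMap, defaultColType=str)
for row in reader:  # assumes the reader iterates over row objects
    print row.chrom, row.txEnd - row.txStart
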
Example #2
 def __init__(self, tabFile, rowClass=None, hashAreComments=False, skipBlankLines=False):
     self.inFh = fileOps.opengz(tabFile)
     self.csvRdr = csv.reader(self.inFh, dialect=csv.excel_tab)
     self.rowClass = rowClass
     self.hashAreComments = hashAreComments
     self.skipBlankLines = skipBlankLines
     self.lineNum = 0
Example #3
 def __init__(self, gxfFile=None, gxfFh=None):
     self.gxfFile = gxfFile if gxfFile is not None else "<unknown>"
     self.openedFile = (gxfFh is None)
     self.fh = fileOps.opengz(gxfFile) if gxfFh is None else gxfFh
     self.line = None
     self.lineNum = 0
     # collect offsets if not compressed
     self.randomAccess = (getattr(self.fh, "tell", None) is not None)
     self.lineFileOffset = 0
     self.lineFileLength = 0  # includes \n for reading
     self.metas = []
Example #4
 def __init__(self, asmReport):
     self.metaData = dict()  # metadata headers at the start of the file
     self.seqs = []
     self.bySequenceName = dict()
     self.byGenBankAccn = dict()
     self.byRefSeqAccn = dict()
     self.byUcscStyleName = dict()
     with fileOps.opengz(asmReport) as fh:
         self._parseMetaData(fh)
         self._skipToSeqTable(fh)
         self._parseRecords(fh)
Example #5
 def parse(self, gff3File, gff3Fh=None):
     """
     Parse the gff3 files and return a Gff3Set object in format.  If
     gff3File ends with .gz or .bz2, it will decompressed.  If gff3Fh is
     specified, then parse the already opened stream, with gff3File used
     for error message.
     """
     self.fileName = gff3File
     self.lineNumber = 0
     fh = fileOps.opengz(gff3File, 'r') if gff3Fh is None else gff3Fh
     try:
         gff3Set = Gff3Set(self.fileName)
         for line in fh:
             self.lineNumber += 1
             self._parseLine(gff3Set, line[0:-1])
     finally:
         self.fileName = self.lineNumber = None
         if gff3Fh is None:
             fh.close()
     gff3Set.finish()
     return gff3Set
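
A short usage sketch, under the assumption that this parse method belongs to a parser class, here called Gff3Parser, and with illustrative file names:

parser = Gff3Parser()
# parse a possibly compressed file by name
gff3Set = parser.parse("annotations.gff3.gz")
# or parse an already-open stream; the name is then used only in error messages
with open("annotations.gff3") as fh:
    gff3Set = parser.parse("annotations.gff3", gff3Fh=fh)
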
Example #6
def write_fasta(path_or_handle, name, seq, chunk_size=100):
    """Writes out fasta file. if path ends in gz, will be gzipped.
    """
    if isinstance(path_or_handle, str):
        fh = opengz(path_or_handle, 'w')
    else:
        fh = path_or_handle
    valid_chars = {x for x in string.ascii_letters + "-"}
    if not isinstance(seq, (unicode, str)):
        raise RuntimeError("Sequence is not unicode or string")
    try:
        assert all(x in valid_chars for x in seq)
    except AssertionError:
        bad_chars = {x for x in seq if x not in valid_chars}
        raise RuntimeError("Invalid FASTA character(s) see in fasta sequence: {}".format(bad_chars))
    fh.write(">%s\n" % name)
    for i in xrange(0, len(seq), chunk_size):
        fh.write("%s\n" % seq[i:i+chunk_size])
    if isinstance(path_or_handle, str):
        fh.close()
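
A usage sketch for write_fasta; the paths and sequences are made up for illustration. A path ending in .gz yields gzipped output, and an already-open handle is left open for the caller to close:

# single gzipped record, wrapped at the default 100 characters per line
write_fasta("example.fa.gz", "seq_test", "ACGT" * 60)

# several records through one already-open handle
fh = opengz("multi.fa", "w")
write_fasta(fh, "seq1", "ACGTACGT")
write_fasta(fh, "seq2", "GGGG-CCCC")
fh.close()
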
Example #7
def read_fasta(path_or_handle, validate='DNA'):
    """iteratively yields a sequence for each '>' it encounters, ignores '#' lines
    if validate is true, will ensure that each row contains valid DNA fasta characters
    """
    assert validate in ['DNA', 'protein', None], "Valid options for validate are DNA, protein or None"
    if isinstance(path_or_handle, str):
        fh = opengz(path_or_handle)
    else:
        fh = path_or_handle
    line = fh.readline()
    chars_to_remove = "\n "
    if validate == 'DNA':
        valid_chars = set('ACGTUYSWKMBDHVNacgtuyswkmbdhvn.-*')
    elif validate == 'protein':
        valid_chars = set('ABCDEFGHIKLMPQSRTVWXYZUabcdefghiklmpqsrtvwxyzuNn.-*')
    else:
        valid_chars = set()
    while line != '':
        if line[0] == '>':
            name = line[1:-1]
            line = fh.readline()
            seq = array.array('c')
            while line != '' and line[0] != '>':
                line = line.translate(None, chars_to_remove)
                if len(line) > 0 and line[0] != '#':
                    seq.extend(line)
                line = fh.readline()
            if validate is not None:
                try:
                    assert all(x in valid_chars for x in seq)
                except AssertionError:
                    bad_chars = {x for x in seq if x not in valid_chars}
                    raise RuntimeError("Invalid FASTA character(s) see in fasta sequence: {}".format(bad_chars))
            yield name, seq.tostring()
        else:
            line = fh.readline()
    if isinstance(path_or_handle, str):
        fh.close()
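
A matching usage sketch for read_fasta; the input file name is hypothetical. Since the function is a generator, records are yielded lazily as the file is scanned:

for name, seq in read_fasta("example.fa.gz", validate='DNA'):
    print name, len(seq)

# validation can be turned off for sequences containing unusual characters
for name, seq in read_fasta("raw.fa", validate=None):
    pass  # handle (name, seq) as needed
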
Example #8
 def __init__(self, fileName):
     self.fh = fileOps.opengz(fileName)
Example #9
 def __init__(self, fileName):
     self.fh = None  # required for __del__ if open fails
     self.fh = fileOps.opengz(fileName)
Example #10
 def __init__(self, agpFile):
     self.agpFile = agpFile
     self.agpVersion = None
     self.recs = []
     with fileOps.opengz(agpFile) as fh:
         self._parseLines(fh)
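
Every example above funnels file access through fileOps.opengz, which is used for reading compressed or plain files, for writing, and as a context manager. The real implementation ships with the library; the following is only a minimal sketch of an equivalent helper, assuming extension-based dispatch and glossing over details such as text/binary modes:

import gzip
import bz2

def opengz_sketch(fileName, mode="r"):
    # simplified stand-in for fileOps.opengz, based only on how it is called
    # in the examples above; the library function may behave differently
    if fileName.endswith(".gz"):
        return gzip.open(fileName, mode)
    elif fileName.endswith(".bz2"):
        return bz2.BZ2File(fileName, mode)
    else:
        return open(fileName, mode)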