def __init__(self, fnames, keepExons=False, labels=None, verbose=False):
    """
    Driver function to actually parse files. The steps are as follows:

    1) Skip to the first non-comment line
    2) Infer the type from that
    3) Call a type-specific processing function accordingly
       * These call the underlying C code for storage
       * These handle chromosome name conversions (python-level)
    4) Sanity checking (were any regions and feature labels found?)

    Required inputs are as follows:

    fnames: A file name, or a list of file names, of (possibly compressed
            with gzip or bzip2) GTF or BED files.

    Optional input is:

    keepExons: For BED12 files, exons are ignored by default.
    labels: Override the feature labels supplied in the file(s). Only used
            for non-GTF files: the entry at the file's position in
            `fnames` is passed as the feature label, otherwise the base
            name of the file is used. Note that this might instead be
            replaced later in the .features attribute.
    verbose: Whether to print warnings (default: False)

    Raises:

    AssertionError: If `labels` is given but has no entry for the position
                    of a non-GTF file in `fnames`.
    RuntimeError: If no input file yielded a valid region, or if no
                  feature labels were recorded.
    """
    self.fname = []
    self.filename = ""
    self.chroms = []
    self.features = []
    self.tree = tree.initTree()
    self.keepExons = keepExons
    self.verbose = verbose

    if not isinstance(fnames, list):
        fnames = [fnames]

    # Column count handed to parseBED for each inferred type. Anything that
    # is neither GTF, BED3 nor BED6 is parsed as BED12.
    bedColumns = {'BED3': 3, 'BED6': 6}

    # Load the files
    for labelIdx, fname in enumerate(fnames):
        self.filename = fname
        fp = openPossiblyCompressed(fname)
        # try/finally guarantees the handle is released on every path,
        # including the early `continue` and a parser raising.
        try:
            line, labelColumn = self.firstNonComment(fp)
            if line is None:
                # This will only ever happen if a file is empty or just
                # has a header/comment
                continue
            line = line.strip()

            ftype = self.inferType(fp, line, labelColumn)
            if ftype != 'GTF' and labels is not None:
                # labelIdx counts every input file, GTF and empty ones too
                assert len(labels) > labelIdx, \
                    "labels has no entry for input file number {0}".format(labelIdx + 1)
                bname = labels[labelIdx]
            else:
                bname = basename(fname)

            if ftype == 'GTF':
                self.parseGTF(fp, line)
            else:
                self.parseBED(fp, line, bedColumns.get(ftype, 12),
                              feature=bname, labelColumn=labelColumn)
        finally:
            fp.close()

    # Sanity check
    if self.tree.countEntries() == 0:
        raise RuntimeError("None of the input BED/GTF files had valid regions")
    if len(self.features) == 0:
        raise RuntimeError("There were no valid feature labels!")

    # vine -> tree
    self.tree.finish()
def __init__(self, fnames, exonID="exon", transcriptID="transcript", keepExons=False, labels=None, transcript_id_designator="transcript_id", defaultGroup=None, verbose=False):
    """
    Driver function to actually parse files. The steps are as follows:

    1) Skip to the first non-comment line
    2) Infer the type from that
    3) Call a type-specific processing function accordingly
       * These call the underlying C code for storage
       * These handle chromosome name conversions (python-level)
       * These handle labels (python-level, with a C-level numeric attribute)
    4) Sanity checking (do the number of labels make sense?)

    Required inputs are as follows:

    fnames: A file name, or a list of file names, of (possibly compressed
            with gzip or bzip2) GTF or BED files.

    Optional input is:

    exonID: For GTF files, the feature column (column 3) label for
            exons, or whatever else should be stored as exons. The
            default is 'exon', though one could use 'CDS' instead.
    transcriptID: As above, but for transcripts. The default is
                  'transcript'.
    keepExons: For BED12 and GTF files, exons are ignored by default.
    labels: A list of group labels. If non-empty, its length must equal
            the number of labels found while parsing; it then replaces
            them. None (the default) is treated as an empty list.
    transcript_id_designator: For gtf files, this is the key used in a
            searching for the transcript ID. If one sets transcriptID to
            'gene', then transcript_id_designator would need to be
            changed to 'gene_id' or 'gene_name' to extract the gene
            ID/name from the attributes.
    defaultGroup: The default group name. If None, the file name is used.
    verbose: Whether to produce warning messages (default: False)

    Raises:

    RuntimeError: If no input file yielded a valid region, or if the
                  number of supplied labels differs from the number of
                  labels found in the files.
    """
    # None replaces the former mutable `[]` default; normalise it so the
    # rest of the constructor can keep treating `labels` as a sequence.
    if labels is None:
        labels = []

    self.fname = []
    self.filename = ""
    self.chroms = []
    self.exons = []
    self.labels = []
    self.transcriptIDduplicated = []
    self.tree = tree.initTree()
    self.labelIdx = 0
    self.transcript_id_designator = transcript_id_designator
    self.exonID = exonID
    self.transcriptID = transcriptID
    self.keepExons = keepExons
    self.defaultGroup = defaultGroup
    self.verbose = verbose

    # Only set when labels were supplied, exactly as before.
    if labels != []:
        self.already_input_labels = True

    if not isinstance(fnames, list):
        fnames = [fnames]

    # Column count handed to parseBED for each inferred type. Anything that
    # is neither GTF, BED3 nor BED6 is parsed as BED12.
    bedColumns = {'BED3': 3, 'BED6': 6}

    # Load the files
    for fname in fnames:
        self.filename = fname
        fp = openPossiblyCompressed(fname)
        # try/finally guarantees the handle is released on every path,
        # including the early `continue` and a parser raising.
        try:
            line, labelColumn = self.firstNonComment(fp)
            if line is None:
                # This will only ever happen if a file is empty or just
                # has a header/comment
                continue
            line = line.strip()

            ftype = self.inferType(fp, line, labelColumn)
            if ftype == 'GTF':
                self.parseGTF(fp, line)
            else:
                self.parseBED(fp, line, bedColumns.get(ftype, 12), labelColumn)
        finally:
            fp.close()

    # Sanity check
    if self.tree.countEntries() == 0:
        raise RuntimeError("None of the input BED/GTF files had valid regions")

    # Replace labels
    if len(labels) > 0:
        if len(labels) != len(self.labels):
            # Report the counts the message talks about, not the lists
            raise RuntimeError("The number of labels found ({0}) does not match the number input ({1})!".format(len(self.labels), len(labels)))
        self.labels = labels

    # vine -> tree
    self.tree.finish()
def __init__(self, fnames, exonID="exon", transcriptID="transcript", keepExons=False, labels=None, transcript_id_designator="transcript_id", defaultGroup=None, verbose=False):
    """
    Driver function to actually parse files. The steps are as follows:

    1) Skip to the first non-comment line
    2) Infer the type from that
    3) Call a type-specific processing function accordingly
       * These call the underlying C code for storage
       * These handle chromosome name conversions (python-level)
       * These handle labels (python-level, with a C-level numeric attribute)
    4) Sanity checking (do the number of labels make sense?)

    Required inputs are as follows:

    fnames: A file name, or a list of file names, of (possibly compressed
            with gzip or bzip2) GTF or BED files.

    Optional input is:

    exonID: For GTF files, the feature column (column 3) label for
            exons, or whatever else should be stored as exons. The
            default is 'exon', though one could use 'CDS' instead.
    transcriptID: As above, but for transcripts. The default is
                  'transcript'.
    keepExons: For BED12 and GTF files, exons are ignored by default.
    labels: A list of group labels. If non-empty, its length must equal
            the number of labels found while parsing; it then replaces
            them. None (the default) is treated as an empty list.
    transcript_id_designator: For gtf files, this is the key used in a
            searching for the transcript ID. If one sets transcriptID to
            'gene', then transcript_id_designator would need to be
            changed to 'gene_id' or 'gene_name' to extract the gene
            ID/name from the attributes.
    defaultGroup: The default group name. If None, the file name is used.
    verbose: Whether to produce warning messages (default: False)

    Raises:

    RuntimeError: If no input file yielded a valid region, or if the
                  number of supplied labels differs from the number of
                  labels found in the files.
    """
    # None replaces the former mutable `[]` default; normalise it so the
    # rest of the constructor can keep treating `labels` as a sequence.
    if labels is None:
        labels = []

    self.fname = []
    self.filename = ""
    self.chroms = []
    self.exons = []
    self.labels = []
    self.transcriptIDduplicated = []
    self.tree = tree.initTree()
    self.labelIdx = 0
    self.transcript_id_designator = transcript_id_designator
    self.exonID = exonID
    self.transcriptID = transcriptID
    self.keepExons = keepExons
    self.defaultGroup = defaultGroup
    self.verbose = verbose

    # Only set when labels were supplied, exactly as before.
    if labels != []:
        self.already_input_labels = True

    if not isinstance(fnames, list):
        fnames = [fnames]

    # Column count handed to parseBED for each inferred type. Anything that
    # is neither GTF, BED3 nor BED6 is parsed as BED12.
    bedColumns = {'BED3': 3, 'BED6': 6}

    # Load the files
    for fname in fnames:
        self.filename = fname
        fp = openPossiblyCompressed(fname)
        # try/finally guarantees the handle is released on every path,
        # including the early `continue` and a parser raising.
        try:
            line, labelColumn = self.firstNonComment(fp)
            if line is None:
                # This will only ever happen if a file is empty or just
                # has a header/comment
                continue
            line = line.strip()

            ftype = self.inferType(fp, line, labelColumn)
            if ftype == 'GTF':
                self.parseGTF(fp, line)
            else:
                self.parseBED(fp, line, bedColumns.get(ftype, 12), labelColumn)
        finally:
            fp.close()

    # Sanity check
    if self.tree.countEntries() == 0:
        raise RuntimeError(
            "None of the input BED/GTF files had valid regions")

    # Replace labels
    if len(labels) > 0:
        if len(labels) != len(self.labels):
            # Report the counts the message talks about, not the lists
            raise RuntimeError(
                "The number of labels found ({0}) does not match the number input ({1})!"
                .format(len(self.labels), len(labels)))
        self.labels = labels

    # vine -> tree
    self.tree.finish()