def read_cogs(filename):
    """
    Parse a COG listing file into a list of OpenStruct records.

    Only lines shaped like ``[WORD] COGnnnn free text`` are used; every
    other line is ignored. For each accepted line the record gets:

      id        -- the ``COGnnnn`` accession
      name      -- the remaining text after the accession
      parents   -- the word found between the square brackets, kept as a
                   single string (presumably the functional category
                   letter(s) -- confirm against the input file)
      namespace -- always 'cog'

    Returns the records in file order.
    """
    pattern = re.compile(r'\[(\w+)\]\s+(COG\d+)\s+(.*)')
    records = []
    with open(filename, 'r') as handle:
        for text in handle:
            hit = pattern.match(text)
            # Anything that does not start with the bracketed form is skipped
            if hit is None:
                continue
            bracketed, accession, title = hit.groups()
            record = OpenStruct()
            record.id = accession
            record.name = title
            record.parents = bracketed
            record.namespace = 'cog'
            records.append(record)
    return records
def read_tigrfams_by_role(filename): """ This is no longer used! Reads the hierarchical structure of TIGRFams organized into categories called roles. Returns a nested list structure of roles and sub-roles that hold tigrfams. Downloaded file from here: http://cmr.jcvi.org/tigr-scripts/CMR/shared/EvidenceList.cgi?ev_type=TIGRFAM&order_type=role Note the TIGRFams flat file is more complete than the TIGRFams by role file. """ # we'll be making and returning a nested list of tigr roles holding tigrfams tigrfams_by_role = [] with open(filename, 'r') as f: category = None subcategory = None for line in f: # skip blank lines if len(line.strip())==0: continue if line.startswith(" "): fields = line.lstrip(' ').rstrip("\n").split("\t") # skip column headers if fields[0] == 'Accession': continue tigrfam = OpenStruct() tigrfam.id = fields[0] tigrfam.name = fields[1] tigrfam.description = fields[2] subcategory['tigrfams'].append(tigrfam) elif line.startswith(" "): name = line.strip() subcategory = {'name':name, 'tigrfams':[]} category['roles'].append(subcategory) else: name = line.strip() category = {'name':name, 'roles':[]} tigrfams_by_role.append(category) return tigrfams_by_role
def read_cog_categories(filename):
    """
    Load the COG functional categories and their subcategories
    (see http://www.ncbi.nlm.nih.gov/COG/grace/fiew.cgi).

    Each line of the file yields one OpenStruct, returned in file order:

      * A line starting with one capital letter followed by a tab is a
        subcategory. Its id is the first tab-separated column, its name is
        the fourth column, its parents is a one-element tuple holding the
        name of the most recent category line (None if there was none yet)
        and its namespace is "cog subcategory".
      * Any other line is a category. Its name is the line without the
        trailing newline and its namespace is "cog category".
    """
    subcategory_re = re.compile("[A-Z]\t.*")
    entries = []
    current_category = None
    with open(filename, 'r') as handle:
        for raw in handle:
            entry = OpenStruct()
            text = raw.rstrip("\n")
            if subcategory_re.match(raw) is None:
                # Category heading: remember it for the subcategories below
                entry.name = text
                entry.namespace = "cog category"
                current_category = entry.name
            else:
                columns = text.split("\t")
                entry.id = columns[0]
                entry.name = columns[3]
                entry.parents = (current_category,)
                entry.namespace = "cog subcategory"
            entries.append(entry)
    return entries
def read_tigrfams(filename):
    """
    Read the flat listing of TIGRFams.

    The first line of the file is treated as a header and discarded. Every
    following non-empty line is split on tabs and turned into an OpenStruct
    with:

      id          -- first column
      name        -- second column
      description -- third column

    Returns the list of records in file order; an empty or header-only file
    gives an empty list. A data line with fewer than three tab-separated
    columns raises IndexError.

    Note the TIGRFams flat file is more complete than the TIGRFams by role
    file. The flat file is a superset of the by-role file.
    """
    tigrfams = []
    with open(filename, 'r') as f:
        # Skip the header. The builtin next() works on file objects in both
        # Python 2 and Python 3 (the .next() method only exists on Python 2),
        # and the default keeps an empty file from raising StopIteration.
        next(f, None)
        for line in f:
            row = line.rstrip("\n")
            # A completely empty line (e.g. a trailing blank line) has no
            # columns to read, so skip it rather than fail on fields[1].
            if not row:
                continue
            fields = row.split("\t")
            tigrfam = OpenStruct()
            tigrfam.id = fields[0]
            tigrfam.name = fields[1]
            tigrfam.description = fields[2]
            tigrfams.append(tigrfam)
    return tigrfams