def read_tigrfams_by_role(filename):
    """
    This is no longer used!

    Reads the hierarchical structure of TIGRFams organized into categories
    called roles. Returns a nested list structure of roles and sub-roles
    that hold tigrfams.

    Downloaded file from here:
    http://cmr.jcvi.org/tigr-scripts/CMR/shared/EvidenceList.cgi?ev_type=TIGRFAM&order_type=role

    Note the TIGRFams flat file is more complete than the TIGRFams by role
    file.

    The nesting level of a line is taken from its leading spaces:
      * no leading spaces: a top-level role (category);
      * the first indented line under a category, and any later line at
        that depth or shallower: a sub-role;
      * a line indented deeper than the current sub-role: a tab-separated
        TIGRFam row (accession, name, description).
    Blank lines and 'Accession' column-header rows are skipped.

    Args:
        filename: path of the by-role listing to read.

    Returns:
        A list of dicts {'name': ..., 'roles': [...]}. Each role is a dict
        {'name': ..., 'tigrfams': [...]} whose entries are OpenStruct
        objects carrying 'id', 'name' and 'description'.

    Raises:
        ValueError: if an indented line appears before any category line.
    """
    # we'll be making and returning a nested list of tigr roles holding tigrfams
    tigrfams_by_role = []
    with open(filename, 'r') as f:
        category = None
        subcategory = None
        # indentation (in spaces) of the sub-role lines of the current category
        role_indent = None
        for line in f:
            text = line.strip()
            # skip blank lines
            if not text:
                continue

            indent = len(line) - len(line.lstrip(' '))

            if indent == 0:
                # a new category starts a fresh sub-role context so that rows
                # can never be attached to a role of the previous category
                category = {'name': text, 'roles': []}
                tigrfams_by_role.append(category)
                subcategory = None
                role_indent = None
            elif category is None:
                raise ValueError(
                    "Indented line found before any category in %s: %r"
                    % (filename, line))
            elif role_indent is None or indent <= role_indent:
                subcategory = {'name': text, 'tigrfams': []}
                category['roles'].append(subcategory)
                role_indent = indent
            else:
                fields = line.lstrip(' ').rstrip("\r\n").split("\t")
                # skip column headers
                if fields[0] == 'Accession':
                    continue
                tigrfam = OpenStruct()
                tigrfam.id = fields[0]
                tigrfam.name = fields[1]
                tigrfam.description = fields[2]
                subcategory['tigrfams'].append(tigrfam)
    return tigrfams_by_role
def read_tigrfams(filename):
    """
    Read the flat listing of TIGRFams.

    Note the TIGRFams flat file is more complete than the TIGRFams by role
    file. The flat file is a superset of the by-role file.

    The first line of the file is treated as a header and discarded. Every
    following non-blank line is split on tabs; the first three fields are
    stored as 'id', 'name' and 'description'.

    Args:
        filename: path of the tab-separated flat listing.

    Returns:
        A list of OpenStruct objects, one per data line, in file order.
        An empty or header-only file yields an empty list.

    Raises:
        IndexError: if a non-blank data line has fewer than three fields.
    """
    tigrfams = []
    with open(filename, 'r') as f:
        # skip header; the default keeps an empty file from raising
        # StopIteration, and next() works on both Python 2 and 3 file objects
        next(f, None)
        for line in f:
            # skip blank lines (e.g. a trailing newline at end of file)
            if not line.strip():
                continue
            fields = line.rstrip("\r\n").split("\t")
            tigrfam = OpenStruct()
            tigrfam.id = fields[0]
            tigrfam.name = fields[1]
            tigrfam.description = fields[2]
            tigrfams.append(tigrfam)
    return tigrfams
def read_genes(filename, chromosome=None, chromosome_map=None, rna=False):
    """
    Read a tab-separated gene table into a list of gene objects.

    The first line of the file is a title and the second line holds the
    tab-separated column headers. The columns 'Locus_tag', 'Locus',
    'GeneID', 'Strand', 'Start', 'End' and 'Product Name' are read from
    every data line; 'Gi' is read only when that column is present.

    Args:
        filename: path of the gene table.
        chromosome: value stored on every gene as 'chromosome'. When None,
            it is looked up from the title line via chromosome_map.
        chromosome_map: mapping whose keys are substrings searched for in
            the title line and whose values are the chromosome to use.
            Only consulted when chromosome is None.
        rna: when true, each gene's type comes from guess_rna_gene_type()
            applied to its description; otherwise the type is 'CDS'.

    Returns:
        A list of OpenStruct genes in file order, or None when the title or
        header lines cannot be read or the chromosome cannot be determined
        (an error message is printed in that case). If a data line fails to
        parse, the error is printed, reading stops, and the genes parsed so
        far are returned.
    """
    genes = []
    with open(filename, 'r') as f:
        try:
            # first two lines hold title and column headers:
            title = next(f)

            # figure out chromosome from title
            if chromosome is None:
                # tolerate the default chromosome_map=None
                for key in (chromosome_map or {}):
                    if title.find(key) > -1:
                        chromosome = chromosome_map[key]
                        break
            if chromosome is None:
                raise Exception(
                    "Can't figure out chromosome for: %s\ntitle=%s"
                    % (filename, title))

            # parse out column headers: header name -> field index
            columns = dict(
                (name, index)
                for index, name in enumerate(next(f).strip().split("\t")))
        except Exception as e:
            print("Error reading file: %s" % (filename,))
            print(str(type(e)) + ": " + str(e))
            return None

        # pre-bind so the error handler below can always report a line
        line = ''
        try:
            # read line into objects
            for line in f:
                # strip leading and trailing whitespace
                line = line.strip()

                # skip blank lines
                if not line:
                    continue

                fields = line.split("\t")

                gene = OpenStruct()
                gene.name = fields[columns['Locus_tag']]  # locus tag
                if fields[columns['Locus']] != '-':
                    gene.common_name = fields[columns['Locus']]  # locus
                if 'Gi' in columns:
                    gene.gi = int(fields[columns['Gi']])
                gene.geneid = int(fields[columns['GeneID']])
                gene.strand = fields[columns['Strand']]  # '+' or '-'
                gene.start = int(fields[columns['Start']])
                gene.end = int(fields[columns['End']])
                if fields[columns['Product Name']] != '-':
                    gene.description = fields[columns['Product Name']]
                gene.chromosome = chromosome
                if rna:
                    # NOTE(review): description is not assigned when
                    # 'Product Name' is '-'; what this reads then depends
                    # on OpenStruct's handling of unset attributes.
                    gene.type = guess_rna_gene_type(gene.description)
                else:
                    gene.type = 'CDS'
                genes.append(gene)
        except Exception as e:
            # the first unparsable line ends the read; earlier genes are kept
            print("Error reading line: " + line)
            print(str(type(e)) + ": " + str(e))
    return genes