def test_reading_from_guide_table(self):
    """Read table"""
    root = Newick.Clade()
    root.name = "cellular organisms"
    #print(root.depths())
    inf = open("./test-phyloutil/test1/Pseudozyma-antarctica-1.txt", 'r')
    table = util.readTable(inf, header=True)
    #print(table)
    tree = phyloutil.treeFromClassificationTable(table)
    inf.close()
    phyloutil.printTree(tree, sys.stdout)
def test_reading_from_class_table(self):
    """Read table"""
    root = Newick.Clade()
    root.name = "cellular organisms"
    inf = open("./test-phyloutil/test1/Pseudozyma-antarctica-1.txt", 'r')
    table = util.readTable(inf, header=True)
    #print(table)
    tree = phyloutil.treeFromClassificationTable(table)
    inf.close()
    #phyloutil.printTree(root, sys.stdout)
    termlist = list(tree.get_terminals())
    self.assertTrue(termlist[0].name == 'Moesziomyces antarcticus T-34')
def test_run(self):
    """readTable header"""
    fname = "tmp_lightdataframe.txt"
    inf = open(fname, "w")
    inf.write("one\ttwo\tthree\n")
    inf.write("a\tb\t3\n")
    inf.write("a\tb\t33\n")
    inf.close()
    with open(fname, "r") as inf:
        ldf = util.readTable(inf, header=True)
    h = ldf.header
    self.assertTrue(h[1] == "two")
    os.remove(fname)
def test_run(self):
    """readTable basic"""
    fname = "tmp_lightdataframe.txt"
    inf = open(fname, 'w')
    inf.write("one\ttwo\tthree\n")
    inf.write("a\tb\t3\n")
    inf.write("a\tb\t33\n")
    inf.close()
    inf = open(fname, 'r')
    ldf = util.readTable(inf, header=True)
    self.assertTrue(ldf['three'][0] == 3)
    self.assertTrue(ldf['three'][1] == 33)
    inf.close()
    os.remove(fname)
def test_run(self):
    """readTable dictrows"""
    fname = "tmp_lightdataframe.txt"
    inf = open(fname, 'w')
    inf.write("one\ttwo\tthree\n")
    inf.write("a\tb\t3\n")
    inf.write("a\tb\t33\n")
    inf.close()
    inf = open(fname, 'r')
    ldf = util.readTable(inf, header=True)
    for (ri, flds) in enumerate(ldf.dictrows):
        if ri == 0:
            self.assertTrue(flds['three'] == 3)
        if ri == 1:
            self.assertTrue(flds['three'] == 33)
    inf.close()
    os.remove(fname)
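# Illustrative usage sketch, not part of the test suite above: it just restates,
# in one place, the readTable interface these tests exercise (a .header list of
# column names, column access by name with numeric fields converted, and row
# iteration via .dictrows). The scratch file name here is made up; util and os
# are assumed to be imported at module level, as in the tests above.
def readtable_usage_sketch():
    fname = "tmp_sketch.txt"  # hypothetical scratch file
    with open(fname, "w") as outf:
        outf.write("one\ttwo\tthree\n")
        outf.write("a\tb\t3\n")
    with open(fname, "r") as inf:
        tab = util.readTable(inf, header=True)
    print(tab.header)         # ['one', 'two', 'three']
    print(tab['three'][0])    # 3 -- numeric fields are converted, per the tests above
    for row in tab.dictrows:  # each row as a dict keyed by column name
        print(row['one'], row['three'])
    os.remove(fname)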
def __init__(self):
    self.pKa = {'D': 3.9, 'E': 4.3, 'H': 6.1, 'C': 8.3, 'Y': 10.1,
                'K': 10.67, 'R': 12, 'N-term': 8, 'C-term': 3.1}
    self.charges = {'D': -1, 'E': -1, 'H': 1, 'C': -1, 'Y': -1,
                    'K': 1, 'R': 1, 'N-term': 1, 'C-term': -1}
    #self.charges = {'D':-1, 'E':-1, 'H':1, 'K':1, 'R':1, 'N-term':1, 'C-term':-1}
    self.hydrophobicity_scales = {}
    # Hack so that we can store scale information in a file -- need better way to store.
    dir_path = os.path.dirname(os.path.realpath(__file__))
    inf = open(os.path.expanduser(dir_path + "/../data/hydrophobicity-scales.txt"), 'r')
    tab = util.readTable(inf)
    scales = tab.header[1:]
    for scale in scales:
        self.hydrophobicity_scales[scale.replace('.', '-')] = dict(
            zip(tab.col('aa'), tab.col(scale)))
    # Molecular weights of the amino acids in Da, not residues; subtract 18 for residue weight
    self.mw = {'A': 89.09, 'C': 121.16, 'E': 147.13, 'D': 133.10, 'G': 75.07,
               'F': 165.19, 'I': 131.18, 'H': 155.16, 'K': 146.19, 'M': 149.21,
               'L': 131.18, 'N': 132.12, 'Q': 146.15, 'P': 115.13, 'S': 105.09,
               'R': 174.20, 'T': 119.12, 'W': 204.23, 'V': 117.15, 'Y': 181.19,
               'B': 132.61, 'Z': 146.64}
    inf.close()
def read(self, stream, header=True):
    tab = util.readTable(stream, header=header)
    for flds in tab.dictrows:
        self._comp_dict[flds['aa']] = flds['proportion']
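# Illustrative sketch, not from the source: read() above looks up 'aa' and
# 'proportion' columns (those keys come from the method itself), so it expects
# a tab-delimited table with at least those two columns. A made-up input,
# assuming readTable accepts any text stream (e.g. io.StringIO):
#
#   import io
#   example = io.StringIO("aa\tproportion\nA\t0.08\nC\t0.01\n")
#   comp.read(example)  # 'comp' stands in for an instance of the class defining read();
#                       # fills comp._comp_dict, e.g. {'A': 0.08, 'C': 0.01}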
    data_outs.addStream(outf)
else:
    # By default, write to stdout
    data_outs.addStream(sys.stdout)

# Write out parameters
data_outs.write("# Run started {}\n".format(util.timestamp()))
data_outs.write("# Command: {}\n".format(' '.join(sys.argv)))
data_outs.write("# Parameters:\n")
optdict = vars(options)
for (k, v) in optdict.items():
    data_outs.write("#\t{k}: {v}\n".format(k=k, v=v))

# Read background information
aa_volumes = {}
vols = util.readTable(open(os.path.expanduser('~/research/lib/data/harpaz-aa-volumes.txt')))
#print(vols['volume.a3'])
aa_volumes = dict(zip(vols['aa'], [x / 1000.0 for x in vols['mean.volume.a3']]))
#print(aa_volumes)

# Read input
if not os.path.isfile(options.in_fname):
    raise IOError("# Error: file {} does not exist".format(options.in_fname))
(headers, seqs) = biofile.readFASTA(open(options.in_fname, 'r'))  #, key_fxn=biofile.secondField)
if options.translate_sequences:
    seqs = [translate.translate(s) for s in seqs]
zhs = [(h, s) for (h, s) in zip(headers, seqs) if s is not None]
all_keys = [biofile.firstField(h) for (h, s) in zhs]
(headers, seqs) = zip(*zhs)
prot_dict = dict([(biofile.firstField(h), s) for (h, s) in zhs])
gene_orf_dict = dict([(biofile.secondOrFirstField(h), biofile.firstField(h)) for h in headers])
if not os.path.isfile(tree_fname):
    raise IOError("# Error: file {} does not exist".format(tree_fname))
tree_string = ""
with open(tree_fname, 'r') as inf:
    lines = inf.readlines()
for line in lines:
    # Skip comment lines; startswith() also tolerates blank lines
    if not line.strip().startswith('#'):
        tree_string += line.strip()
trees = NewickIO.parse(StringIO(tree_string))
tree = next(trees)

# Read mapping file
map_fname = os.path.expanduser(options.mapping_in_fname)
if not os.path.isfile(map_fname):
    raise IOError("# Error: file {} does not exist".format(map_fname))
with open(map_fname, 'r') as inf:
    map_table = util.readTable(inf, header=True)

# Create mapping
mapping_dict = dict(zip(map_table['species'], map_table['updated.species']))

# Update the FASTA headers
#new_headers = []
#new_seqs = []
seq_dict = {}
header_dict = {}
short_species_names = {}
for (i, h) in enumerate(headers):
    species_name = extractSpeciesName(h)
    short_name = makeShortSpeciesName(species_name)
    try:
for (k, v) in optdict.items():
    data_outs.write("#\t{k}: {v}\n".format(k=k, v=v))

# Read input
if not os.path.isfile(options.in_fname):
    raise IOError("# Error: file {} does not exist".format(options.in_fname))
with open(options.in_fname, 'r') as inf:
    # Read a FASTA file?
    (headers, seqs) = biofile.readFASTA(inf)
info_outs.write("# Read {:d} sequences\n".format(len(headers)))

if options.in_names_fname is not None:
    if not os.path.isfile(options.in_names_fname):
        raise IOError("# Error: file {} does not exist".format(options.in_names_fname))
    with open(options.in_names_fname, 'r') as inf:
        species = util.readTable(inf, header=True)
    species_name_lookup = dict(zip(species['ODB_code'],
                                   [shorten(x) for x in species['Organism']]))
    new_headers = []
    new_seqs = []
    for (hdr, seq) in zip(headers, seqs):
        odb_code = hdr.strip().split()[-1]
        if len(odb_code) == 5:
            try:
                # Rename
                species_name = species_name_lookup[odb_code]
                new_headers.append('{} {}'.format(species_name, hdr))
            except KeyError:
                new_headers.append(hdr)
        else:
            new_headers.append(hdr)
optdict = vars(options)
for (k, v) in optdict.items():
    data_outs.write("#\t{k}: {v}\n".format(k=k, v=v))

# Read input
if not os.path.isfile(options.in_fname):
    raise IOError("# Error: file {} does not exist".format(options.in_fname))
with open(options.in_fname, 'r') as inf:
    # Read a FASTA file?
    (headers, seqs) = biofile.readFASTA(inf)

pref_ids = {}
if not os.path.isfile(options.in_filter_fname):
    raise IOError("# Error: file {} does not exist".format(options.in_filter_fname))
with open(options.in_filter_fname, 'r') as inf:
    tab = util.readTable(inf, header=True)
    pref_ids = dict(zip(tab['species'], tab['orthodb.name']))

# Now go through headers, find multiples, and select one from each.
selected_indices = []  # index into headers and sequences
new_headers = []
new_seqs = []
orthodb_ids = [h.split()[-1] for h in headers]
species_names = [h.split()[0].split('_')[0] for h in headers]
for species_name in list(set(species_names)):
    dupe_indices = [xi for (xi, spec) in enumerate(species_names) if spec == species_name]
    if len(dupe_indices) == 1:
        # No problem, no duplicate
        selected_indices.append(dupe_indices[0])
        continue
parser.add_argument(dest="prot_in_fname", type=str,
                    help="FASTA file containing protein sequences")
parser.add_argument(dest="feature_fname", type=str,
                    help="SGD file containing sequence features")
parser.add_argument(dest="paralog_fname", type=str,
                    help="Yeast Gene Order Browser formatted file of paralog identifications")
parser.add_argument("--aa", dest="do_aa", default=False, action="store_true",
                    help="compute amino-acid frequencies?")
parser.add_argument("--gc", dest="do_gc", default=False, action="store_true",
                    help="compute GC frequencies?")
parser.add_argument("--mw", dest="do_mw", default=False, action="store_true",
                    help="compute molecular weights?")
parser.add_argument("--target-aas", dest="target_aas", type=str, default=translate.AAs(),
                    help="amino acids (e.g. ACDEF) for frequency analysis")
parser.add_argument("-p", "--pseudo", dest="pseudocount", type=float, default=0.0,
                    help="pseudocount to add to all frequencies")
parser.add_argument("-o", "--out", dest="out_fname", type=str, default=None,
                    help="output filename")
options = parser.parse_args()

cdna_dict = biofile.readFASTADict(os.path.expanduser(options.cds_in_fname))
prot_dict = biofile.readFASTADict(os.path.expanduser(options.prot_in_fname))

# Read paralog data from Yeast Gene Order Browser file
ygob_data = util.readTable(open(os.path.expanduser(options.paralog_fname), 'r'))
paralog_dict = {}
for flds in ygob_data.dictrows:
    scer1 = flds['scer1'].strip()
    scer2 = flds['scer2'].strip()
    if not (na.isNA(scer1) or na.isNA(scer2)):
        paralog_dict[scer1] = scer2
        paralog_dict[scer2] = scer1

# Read SGD data
sgd_features = util.readTable(open(os.path.expanduser(options.feature_fname), 'r'), header=False)
'''
http://downloads.yeastgenome.org/curation/chromosomal_feature/SGD_features.README
1. Primary SGDID (mandatory)
2. Feature type (mandatory)
3. Feature qualifier (optional)
if not os.path.isfile(fname):
    raise IOError("# Error: file {} does not exist".format(fname))
tree_root = Newick.Clade()
tree_root.parent = None
tree_root.name = "cellular organisms"

# Get directory of guide file
path = os.path.dirname(fname)
curwd = os.getcwd()
species_names = []
with open(fname, 'r') as inf:
    os.chdir(path)
    tab = util.readTable(inf, header=True)
    rows = tab.dictrows
    if options.debug:
        rows = [x for x in tab.dictrows][:2]
    just_started = True
    for row in rows:
        spec_fname = row['filename']
        #print(spec_fname)
        if not na.isNA(spec_fname):
            spec_inf = util.readTable(open(spec_fname, 'r'), header=True)
            twig = phyloutil.treeFromClassificationTable(spec_inf)
            added = phyloutil.mergeTrees(tree_root, twig, add_to_leaf=just_started)
            if added:
                just_started = False
            species_names.append(row['updated.species'])
            #print(spec_fname)
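# Illustrative sketch, not from the source: the loop above only uses the
# 'filename' and 'updated.species' columns of the guide table, and skips rows
# whose filename is NA. A made-up, tab-delimited guide file consistent with
# that (the first row reuses names appearing in the tests above) might be:
#
#   filename                        updated.species
#   Pseudozyma-antarctica-1.txt     Moesziomyces antarcticus T-34
#   unclassified-isolate.txt        NA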