def get_physical_ppi(partner_bool=True):
    """Query PSICQUIC (intact) for affinity-based physical PPIs of every
    protein in the reference set.

    partner_bool -- True: return interaction-partner data (via
                    get_ppi_partner, keyed through the uniprot->pdb map);
                    False: return {oln: ppi_degree}.

    Returns (d, error_list, db): the result dict, the uniprot IDs whose
    query raised, and the database label used for downstream file naming.
    """
    if partner_bool:
        d_ref = read_in('uniprot', 'oln')
        d_ref2 = read_in('uniprot', 'pdb')
    else:
        d_ref = read_in('Entry', 'Gene names (ordered locus )', 'proteome')
    taxonomy = taxid()[organism]
    db = 'intact'  # oln not supported for ecoli for mint and biogrid
    score_crit = None  # placeholder: no score filtering is applied yet
    d = {}
    error_list = []
    s = PSICQUIC(verbose=False)
    for uniprot, oln in d_ref.iteritems():
        try:
            ppis = s.query(
                db, "{0} AND taxid:{1} AND affinity".format(uniprot, taxonomy))
        except Exception:
            # was a bare 'except:' -- no longer traps SystemExit /
            # KeyboardInterrupt; failed queries are recorded and skipped
            error_list.append(uniprot)
            print("error! can't find ppis for {0}".format(uniprot))
            continue
        if score_crit is not None:  # identity test, not '!= None'
            ppis = get_score(ppis, score_crit)
        if partner_bool:
            d.update(get_ppi_partner(uniprot, d_ref2[uniprot], ppis, d_ref2))
        else:
            d[oln] = get_ppi_degree(ppis)
    if score_crit:
        db += '_{0}'.format(score_crit)
    return d, error_list, db
def __init__(self, verbose):
    # Load the reference mappings this run needs: OLN->PDB and OLN->UniProt
    # from the pre_seq2struc file, plus the per-organism 'length' database
    # as the working input.  Results accumulate in self.d_output.
    self.verbose = verbose
    self.d_ref = read_in('oln', 'pdb', 'pre_seq2struc')
    self.d_ref2 = read_in('oln', 'uniprot', 'pre_seq2struc')
    # NOTE(review): 'organism' appears to be a module-level global -- confirm.
    self.d_input = read_in(*database(organism, 'length'))
    self.d_output = {}
def parse_input(user_input, organism):
    """Translate a raw user property request into its data dict plus metadata.

    Returns (d, label, proteome_subset_bool), where the final bool flags
    that the property is not PDB-derived.
    """
    inputs, sign, operation = get_operation(user_input, organism)
    first = inputs[0]
    if len(inputs) > 1:
        d = reconcile(read_in(*first), read_in(*inputs[1]), sign, operation)
    else:
        # single input: reconcile the dataset against a second, freshly
        # read copy of itself
        d = reconcile(read_in(*first), read_in(*first), sign)
    label = get_label(user_input, first[2])
    return d, label, 'PDB' not in first[2]
def reference(proteome_subset_bool, organism):
    """Build the reference mapping for *organism* and pass it through
    parse_condition.

    ProTherm uses a uniprot->pdb map; a proteome subset is keyed by OLN
    (proteome data usually carry OLN only); the default is oln->pdb.
    """
    if organism == 'protherm':
        args = ('uniprot', 'pdb')
    elif proteome_subset_bool:
        # based on oln cuz proteome data usually have oln only
        args = ('Gene names (ordered locus )', 'Entry', 'proteome')
    else:
        args = ('oln', 'pdb')
    d_ref = read_in(*args, organism=organism)
    return parse_condition(d_ref, organism)
def get_info(organism):
    """Fetch gene names, function, GO molecular function and subcellular
    location annotations from UniProt for every protein in the organism's
    uniprot->pdb reference set.

    Returns {pdb: [genes, location, function, go_molecular_function]}.
    """
    d_ref = read_in('uniprot', 'pdb', organism=organism)
    d = {}
    columns = [
        'id', 'genes', 'comment(function)', 'go(molecular function)',
        'comment(SUBCELLULAR LOCATION)'
    ]
    uniprot_api = UniProtAPI(columns)
    label_list, response = uniprot_api.organism_info(organism=organism)
    # Hoist the column lookups out of the loop: label_list is fixed for the
    # whole response and list.index() is a linear scan per call.
    i_entry = label_list.index('Entry')
    i_genes = label_list.index('Gene names')
    i_function = label_list.index('Function [CC]')
    i_go = label_list.index('Gene ontology (molecular function)')
    i_location = label_list.index('Subcellular location [CC]')
    for line in response:
        word_list = [word.strip() for word in line.split('\t')]
        uniprot = word_list[i_entry]
        if uniprot in d_ref:
            genes = word_list[i_genes]
            # sqlite3 cant handle "" marks; replace() is a no-op when absent
            function = word_list[i_function].replace('"', '')
            function2 = word_list[i_go]
            location = word_list[i_location]
            d[d_ref[uniprot]] = [
                genes,
                location[len('SUBCELLULAR LOCATION: '):],
                function[len("FUNCTION: "):],
                function2
            ]
    return d
def __init__(self, verbose, d_ref):
    """Set up reference data and pull raw CHAIN features from UniProt.

    NOTE(review): the original body assigned ``self.d_ref = d_ref`` and then
    immediately overwrote it with the proteome read-in (a dead store), so
    the ``d_ref`` argument is effectively ignored.  The dead store is
    removed here; the parameter is kept only so existing callers keep
    working -- confirm which data source was actually intended.
    """
    self.verbose = verbose
    # Entry -> OLN mapping from the proteome file (supersedes caller's d_ref)
    self.d_ref = read_in('Entry', 'Gene names (ordered locus )',
                         filename='proteome')
    uniprotapi = UniProtAPI(['id', 'feature(CHAIN)'])
    self.labels, self.raw_data = uniprotapi.organism_info()
    self.d_output = {}
def read_in_index():
    """Map each organism to {index: pdb}, numbering the pdb identifiers in
    sorted order, read from that organism's identify_pdb output file."""
    d = initialize_dict('dict')
    for organism in organism_list:
        pre_d = read_in(
            'pdb', 'uniprot',
            filename='../0-identify_structure/0-identify_pdb/{0}/output.txt'.
            format(organism))
        # number the pdb keys in sorted order (keys are unique, so sorting
        # the keys gives the same order as sorting the items)
        d[organism] = dict(enumerate(sorted(pre_d)))
    return d
if __name__ == "__main__": help_message(help_msg, bool_org_dir=False) d_org = int2organism() d_index = initialize_dict('dict') d_val = initialize_dict('list') protein_property_list = [ 'length', 'abundance', 'evolutionary_rate', 'contact_density', 'PPI_degree', 'dosage_tolerance' ] log_zero_list = [ -1, -1, -4, 1, -1 ] #make into dict #dont log dosage tolerance, already logged for yeast, ecoli is discrete for organism in organism_list: pre_d_i = read_in('pdb', 'uniprot', organism=organism) pre_d_i = collections.OrderedDict(sorted(pre_d_i.items())) d_index[organism] = {i: pdb for i, pdb in enumerate(pre_d_i)} d_ref = read_in('oln', 'pdb', organism=organism) for protein_property in protein_property_list: x_input = database(organism, protein_property) d = read_in(*x_input) d_subset = { pdb: d[oln] for oln, pdb in d_ref.iteritems() if oln in d } d_val[organism].append(d_subset) line_list = prepare_sql(d_org, d_index, d_val, protein_property_list,
import sys, os CWD = os.getcwd() UTLTS_DIR = CWD[:CWD.index('proteomevis_scripts' )] + '/proteomevis_scripts/utlts' sys.path.append(UTLTS_DIR) from parse_user_input import help_message from read_in_file import read_in from parse_data import organism from output import writeout if __name__ == "__main__": help_message( help_msg) #need to adjust help message to allow yeast_ecoli case if organism == 'yeast_ecoli': #dependent on yeast/extra.txt being present d = read_in('uniprot', 'pdb', organism='ecoli') d_old = read_in('uniprot', 'pdb', filename='../ecoli/extra.txt') flag = 'EXTRA' else: d = read_in('uniprot', 'pdb') d_old = read_in( 'uniprot', 'pdb', filename='../../../0-identify_structure/3-length_check/{0}/{1}'. format(organism, 'old_seq2struc.txt')) flag = 'extra' pdb_list = set(d.items()) - set(d_old.items()) d_output = dict(x for x in pdb_list) writeout(['uniprot', 'pdb'], d_output, filename='extra') if organism == 'yeast_ecoli':
def get_file():
    """Return every filename under pdb_image/ (all subdirectories), minus
    the .gitkeep placeholder.

    NOTE(review): .remove raises ValueError if .gitkeep is absent --
    presumably the repo guarantees it exists; confirm.
    """
    f = []
    for (dirpath, dirnames, filenames) in os.walk("pdb_image/"):
        f.extend(filenames)
    f.remove('.gitkeep')
    return f


def update_file_list(file_list, d):
    """Return the entries of file_list whose pdb is NOT referenced by any
    organism in d -- i.e. the orphaned images to delete.

    Bug fix: the previous version called list.remove() once per
    (organism, pdb) occurrence, which raised ValueError when the same pdb
    appeared under two organisms, and was O(n^2).  Building the set of
    wanted filenames once avoids both.
    """
    wanted = set()
    for organism in d:
        for pdb in d[organism]:
            wanted.add(pdb + '.png')
    return [f for f in file_list if f not in wanted]


def remove_image(update):
    """Delete each orphaned image file from disk, printing its path."""
    for pdb_file in update:
        path = '{0}/{1}'.format(get_path(pdb_file), pdb_file)
        print(path)
        os.remove(path)


if __name__ == "__main__":
    help_message(help_msg, bool_org_dir=False)
    file_list = get_file()
    d = initialize_dict('dict')
    for organism in organism_list:
        d[organism] = read_in('pdb', 'uniprot', organism=organism)
    update = update_file_list(file_list, d)
    remove_image(update)
else: want_i = chain_list.index(chain) self.io.set_structure(pre_chain_list[want_i]) self.io.save(self.pdb_file.get_id() + "." + chain + ".pdb") def run(self): self.get_pdb_chain() for pdb, self.chain_list in self.d_input.iteritems(): if os.path.exists("{0}/{1}.pdb".format(DIR, pdb)): self.pdb_file = PDBParser().get_structure( pdb, "{0}/{1}.pdb".format(DIR, pdb)) self.save_pdb_chain_file(None) else: pdb_bundle = glob.glob("{0}/{1}-pdb-bundle*pdb".format( DIR, pdb)) for sub_file in pdb_bundle: translate_chain = read_in_mapping(pdb) self.pdb_file = PDBParser().get_structure(pdb, sub_file) self.save_pdb_chain_file(translate_chain) if __name__ == "__main__": help_message(help_msg) untar() d_input = read_in('pdb', 'uniprot', filename='pre_seq2struc') pdbchain = PDBChain(d_input.keys()) pdbchain.run() print_next_step()
# NOTE(review): run() is a method of a class defined outside this chunk
# (presumably UniProt2PDB, instantiated in the main block below).
def run(self, verbose=''):
    """Collect all PDB info, pick the best chain per protein, optionally
    print details, and return the resulting uniprot->pdb mapping."""
    self.get_all_info()
    self.get_best_pdb_chain()
    if verbose:
        self.print_verbose()
    return self.d_output


def prepare_writeout(d_uniprot_pdb, d_proteome):
    # Pair each uniprot's chosen pdb with its proteome entry (OLN) so both
    # columns can be written out together.
    d_output = {}
    for uniprot, pdb in d_uniprot_pdb.iteritems():
        d_output[uniprot] = [pdb, d_proteome[uniprot]]
    return d_output


if __name__ == '__main__':
    args = help_message(help_msg, bool_add_verbose=True)
    d_proteome = read_in('Entry', 'Gene names (ordered locus )',
                         filename='proteome')
    uniprot2pdb = UniProt2PDB(d_proteome.keys())
    d_uniprot_pdb = uniprot2pdb.run(args.verbose)
    d_output = prepare_writeout(d_uniprot_pdb, d_proteome)
    filename = 'pre_seq2struc'
    writeout(['uniprot', 'pdb', 'oln'],
             d_output,
             filename="new_{0}".format(filename))
    database_update_needed(filename)
res1 = contact[0].id[1] res2 = contact[1].id[1] if not abs(res1 - res2) in [1, 0]: #no nearest neighbors M[res1][res2] = 1 return M + M.T if __name__ == "__main__": help_message(help_msg) extra = '' method = false_or_true("Calculate contact density like Shakh2006 [default Zhou2008]?") if false_or_true("Relax selection criterion 2"): extra += 'pre_output' contact_defn = ['Bloom', 'Shakh'][method] d_input = read_in('pdb', 'oln', filename = extra) d_input1 = read_in('pdb', 'uniprot', filename = extra) d_output = {} for pdb, oln in d_input.iteritems(): protein_contact = ProteinContact(pdb, contact_defn) residues = protein_contact.get_residues() contact_density = protein_contact.contact_matrix().sum() / float(len(residues)) if organism=='protherm': d_output[d_input1[pdb]] = contact_density x_name = 'uniprot' else: d_output[oln] = contact_density x_name = 'oln' filename = 'PDB' if method:
fig = plt.figure() ax = fig.add_axes([0, 0, 1, 1]) ax.set_axis_off() ax.scatter(data_list[1], data_list[0], c=color, s=1) ax.set_xlim([0, 1]) ax.set_ylim([0, 1]) ax.set_aspect('auto') plt.savefig('species.{0}.png'.format(int(d_label[organism])), transparent="True", dpi=350) # plt.show() if __name__ == "__main__": help_message(help_msg, bool_org_dir=False) d_org = int2organism() d_label = {org: i for i, org in d_org.iteritems()} protein_property_list = ['sid', 'tm'] for organism in organism_list: predata_list = [] for protein_property in protein_property_list: x_input = database(organism, protein_property) d = read_in(*x_input) predata_list.append(d) data_ppi = read_in_ppi_partners() data_tup, color = merge(predata_list[0], predata_list[1], data_ppi) data_list = zip(*data_tup) plotout(data_list, color, d_label) print_next_step('../')
data[1], 'bo', c=color[organism], label="$\\rho=${0:.2f} ({1:.2E})\n$n=${2} ({3:.0f}%)".format( r, pvalue, num_list[0], 100 * num_list[0] / num_list[1])) plt.title(title) plt.xlabel(label_list[0]), plt.ylabel(label_list[1]), plt.legend() plt.show() if __name__ == "__main__": organism = which_organism() d_x, xlabel, proteome_subset_bool_x = parse_input( str(raw_input("Property x: ")), organism) d_y, ylabel, proteome_subset_bool_y = parse_input( str(raw_input("Property y: ")), organism) proteome_subset_bool = proteome_subset_bool_x and proteome_subset_bool_y if proteome_subset_bool: proteome_subset_bool = false_or_true( "Include proteins not in ProteomeVis") data, labels = merge(d_x, d_y, proteome_subset_bool, organism) num = len(data[0]) total = len(read_in('Entry', 'Entry', 'proteome', organism=organism)) plotout(organism, data, [num, total], get_title(organism, proteome_subset_bool), [xlabel, ylabel], labels)
line.extend(d_info[organism][pdb]) line.append(int(o)) line_list.append(line) return line_list if __name__ == "__main__": args = help_message(help_msg, bool_add_verbose=True, bool_org_dir=False) #add verbose option d_org = int2organism() d_translate = initialize_dict('dict') d_index = initialize_dict('dict') d_info = initialize_dict('dict') for organism in organism_list: pre_d_i = read_in('pdb', 'uniprot', organism=organism) pre_d_i = collections.OrderedDict(sorted(pre_d_i.items())) d_translate[organism] = pre_d_i d_index[organism] = {i: pdb for i, pdb in enumerate(pre_d_i)} d_info[organism] = get_info(organism) line_list = prepare_sql(d_org, d_translate, d_index, d_info, args) columns = [ 'chain_id', 'pdb', 'uniprot', 'genes', 'location', 'function1', 'function2', 'species' ] write_sqlite = SQLite3('proteomevis_inspect', columns, line_list) write_sqlite.run() print_next_step('../')
url = "http://www.rcsb.org/pdb/download/downloadFile.do?fileFormat=pdb&compression=NO&structureId=" not_available_list = [] for pdb in pdb_list: pdb_name = "{0}.pdb".format(pdb) if not os.path.exists(pdb_name): pdbid = url+str(pdb) content = urllib.urlopen(pdbid).read() if '404 Not Found' in content: not_available_list.append(pdb) else: open(pdb_name, "w" ).write(content) print pdb_name return not_available_list def check(not_available_list): new_list = not_available_list[:] for pdb in not_available_list: if os.path.exists('{0}-pdb-bundle.tar.gz'.format(pdb)) or os.path.exists('{0}-pdb-bundle.tar'.format(pdb)): new_list.remove(pdb) if new_list: print "copy and paste the {0} structures below in the rcsb.org download feature (could not be downloaded programatically)".format(len(new_list)) #obtain bundle case print ",".join(new_list) if __name__ == "__main__": help_message(help_msg) d = read_in('pdb', 'uniprot', 'pre_seq2struc') pdb_list = [x[:4] for x in d] not_available_list = save_pdb_file(set(pdb_list)) check(not_available_list) print_next_step()
float(d_val[organism][1][pdb_pair]), int(d_val[organism][2][pdb_pair]), ppi_bool ] line_list.append(line) count += 1 return line_list if __name__ == "__main__": help_message(help_msg, bool_org_dir=False) d_org = int2organism() d_index = initialize_dict('dict') d_val = initialize_dict('list') for organism in organism_list: pre_d_i = read_in('pdb', 'uniprot', organism=organism) pre_d_i = collections.OrderedDict(sorted(pre_d_i.items())) d_index[organism] = {i: pdb for i, pdb in enumerate(pre_d_i)} for x in [ 'TM', 'SID', 'nal' ]: #, 'align1', 'align2']: sequence alignments takes up 700MB! makes downloading edges impossible d_val[organism].append(read_in(*database(organism, x))) d_ppi = read_in_ppi_partners() line_list = prepare_sql(d_org, d_index, d_ppi, d_val) columns = [ 'id', 'species', 'sourceID', 'targetID', 'tm', 'sid', 'align_length', 'ppi' ] write_sqlite = SQLite3('proteomevis_edge', columns, line_list)