def UIDs2JSON(base_uri, uids, fout):
    """Uses uniprot library from Bosco Ho (https://github.com/boscoh/uniprot)."""
    import json
    import uniprot  ## Bosco Ho (https://github.com/boscoh/uniprot)
    uniprot_data = uniprot.batch_uniprot_metadata(uids, None)
    for uid in uniprot_data.keys():
        for key in list(uniprot_data[uid].keys()):
            if key in ('accs', 'sequence', 'go', 'description'):  # keep the output simple
                del uniprot_data[uid][key]
    json_txt = json.dumps(uniprot_data, sort_keys=True, indent=2)
    fout.write(json_txt + '\n')
    return
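# A minimal usage sketch for UIDs2JSON above; the accession list and the output
# filename are assumptions, not from the original source. base_uri is unused by
# the function, so None is passed.
example_uids = ['P05161', 'P0CG48']
with open('uniprot_metadata.json', 'w') as fout:
    UIDs2JSON(None, example_uids, fout)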
def get_uniprot_loc(seq_id):
    # input should be a string
    if not isinstance(seq_id, str):
        return None
    # a bare accession contains no spaces; anything with spaces is treated as
    # already-fetched record text
    elif ' ' not in seq_id:
        # get all protein info
        uniprot_data = uniprot.batch_uniprot_metadata([seq_id])
        prot_content = uniprot_data.get(seq_id)
        if prot_content is None:
            return None
        prot_content = str(prot_content.get('comment'))
    else:
        prot_content = seq_id
    if prot_content is None:
        return None
    # looking for location
    loc_header = 'SUBCELLULAR LOCATION: '
    loc_idx = prot_content.find(loc_header)
    # if the location is present
    if loc_idx > -1:
        # isolating location information (not including header)
        # now left side begins with the location
        prot_content = prot_content[loc_idx + len(loc_header):]
        # making string a bit shorter by going up to next exclamation mark
        next_exclam = prot_content.find('!')
        prot_content = prot_content[:next_exclam]
        # finding where all punctuation marks occur in what's remaining
        punct_df = pd.DataFrame(list(string.punctuation), columns=['mark'])
        punct_df['pos'] = punct_df.mark.apply(lambda x: prot_content.find(x))
        # ignoring punctuation marks that do not occur
        punct_df = punct_df[punct_df.pos > -1]
        # finding the mark that occurs first
        punct_df = punct_df.sort_values('pos').reset_index(drop=True)
        first_punct = punct_df.pos[punct_df.index == 0].values[0]
        # getting location; removing leading and trailing spaces
        location = prot_content[:first_punct].strip()
        return location
    else:
        return None
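# A minimal usage sketch for get_uniprot_loc above; the accession is an
# arbitrary assumption. The function returns None when no SUBCELLULAR LOCATION
# block is found in the UniProt comment text.
location = get_uniprot_loc('P05161')
print location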
def insert_best_seqid_column(params):
    csv = params['csv']
    uniprot_ids_header = params['uniprot_ids_header']
    delimiter = params['delimiter']
    out_csv = params['output_csv']
    uniprot_cache = params['cache_dir']

    if not csv:
        raise IOError('No file selected')
    if not os.path.isfile(csv):
        raise IOError(csv + ' not found')

    headers = get_headers(csv)
    if uniprot_ids_header not in headers:
        s = "Column header '%s' not found, available headers:\n" % uniprot_ids_header
        for header in headers:
            s += ' ' + header + '\n'
        raise IOError(s)

    logging('Reading %s\n' % csv)
    entries = read_csv(csv)

    all_seqids = []
    for entry in entries:
        tokens = entry[uniprot_ids_header].split(delimiter)
        entry['seqids'] = [s.strip() for s in tokens]
        all_seqids.extend(entry['seqids'])

    logging('Found %d potential Uniprot IDs\n' % len(all_seqids))
    uniprot_data = uniprot.batch_uniprot_metadata(all_seqids, uniprot_cache)

    for entry in entries:
        best_seqid = uniprot.sort_seqids_by_uniprot(entry['seqids'], uniprot_data)[0]
        entry['best_seqid'] = best_seqid
        entry['is_reviewed'] = False
        if best_seqid in uniprot_data:
            entry['is_reviewed'] = uniprot_data[best_seqid]['is_reviewed']

    logging('Writing ')
    logging('%s\n' % os.path.abspath(out_csv), lambda: open_file(out_csv))
    headers = ['best_seqid', 'is_reviewed'] + get_headers(csv)
    rows = [headers]
    for entry in entries:
        rows.append([entry[h] for h in headers])
    write_csv(out_csv, rows)
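# A minimal usage sketch for insert_best_seqid_column above; the file names,
# column header, and delimiter are all assumptions.
params = {
    'csv': 'proteins.csv',
    'uniprot_ids_header': 'uniprot_ids',
    'delimiter': ';',
    'output_csv': 'proteins_with_best_seqid.csv',
    'cache_dir': 'cache',
}
insert_best_seqid_column(params)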
def get_prot_seq(seq_id):
    print seq_id
    # retry a few times in case uniprot tries to sever the connection;
    # the final attempt is allowed to raise
    for attempt in range(4):
        try:
            uniprot_data = uniprot.batch_uniprot_metadata([seq_id])
            break
        except Exception:
            if attempt == 3:
                raise
    prot_content = uniprot_data.get(seq_id)
    if prot_content is None:
        return None
    sequence = str(prot_content.get('sequence'))
    return sequence
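# A minimal usage sketch for get_prot_seq above; the accession is an assumption.
seq = get_prot_seq('P05161')
if seq is not None:
    print 'sequence length:', len(seq)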
def main():
    file_rawdata = "data/MS_ISG15_Raw.tsv"
    MS_dict = read_MS_rawdata(file_rawdata)
    all_uniprot_ids = sorted(MS_dict.keys())
    processed_ids = [f.rsplit('_')[1].rsplit('.')[0] for f in glob.glob('Fasta/*.fa')]
    uniprot_ids = sorted(list(set(all_uniprot_ids).difference(set(processed_ids))))
    print "All IDs: %i" % len(all_uniprot_ids)
    print "Processed IDs: %i" % len(processed_ids)
    print "Remaining IDs: %i" % len(uniprot_ids)
    uniprot_data = uniprot.batch_uniprot_metadata(uniprot_ids[0:150], 'cache')
    parse_structure(uniprot_data)
    parse_ortholog(uniprot_ids)
def get_uniprot_str(seq_id, just_gene=False):
    # get all protein info
    uniprot_data = uniprot.batch_uniprot_metadata([seq_id])
    prot_content = uniprot_data.get(seq_id)
    if prot_content is None:
        return None
    if not just_gene:
        prot_content = json.dumps(prot_content.get('comment'))
    else:
        prot_content = json.dumps(prot_content.get('gene'))[1:-1]
    # stripping out all '\n' carefully
    prot_content = strip_slantn(prot_content)
    return prot_content
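# A minimal usage sketch for get_uniprot_str above, showing both modes;
# the accession is an assumption.
comment_str = get_uniprot_str('P05161')               # comment block as a JSON string
gene_str = get_uniprot_str('P05161', just_gene=True)  # gene record only, outer quotes stripped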
def parse_uniprot_PDB(filename, MS_dict, dict_asa, outfile):
    uniprot_id = filename.rsplit('_')[1].rsplit('.')[0]
    uniprot_data = uniprot.batch_uniprot_metadata([uniprot_id], 'cache')
    uniprot_seq = uniprot_data[uniprot_id]['sequence']
    positions = list(set(MS_dict[uniprot_id]['Position']))
    PDBs = [line.rstrip() for line in open(filename, 'r').readlines()]
    for PDB in PDBs:
        file_PDB_zip = 'PDB/' + PDB + '.pdb.gz'
        file_PDB = 'PDB/' + PDB + '.pdb'
        file_dssp = 'dssp/' + PDB + '.dssp'
        fetchPDB(PDB)
        if len(glob.glob(PDB + '.pdb.gz')) == 0:
            continue
        os.system('mv ' + PDB + '.pdb.gz PDB/')
        os.system('gunzip -f ' + file_PDB_zip)
        os.system('/usr/local/Cellar/dssp/2.1.0/bin/mkdssp ' + file_PDB + ' > ' + file_dssp)
        dict_dssp = reading_dssp(file_dssp, dict_asa)
        atoms = parsePDB(file_PDB)
        get_RSA_all_lys(atoms, dict_dssp, uniprot_id, PDB, outfile)
        chain_ids = sorted(list(set(atoms.getChids())))
        for chain_id in chain_ids:
            chain_atoms = atoms.select('chain ' + chain_id)
            top_aln = align_uniprot_PDB(uniprot_seq, atoms, chain_id)
            if top_aln == 'NA':
                print "No alignment"
                continue
            uniprot_seq_aln, pdb_seq_aln, score, begin, end = top_aln
            for position in positions:
                if position < int(begin):
                    continue
                if position > int(end):
                    continue
                pdb_resi, pdb_aa = identify_position(uniprot_seq_aln, pdb_seq_aln, position, uniprot_id)
                if pdb_resi == 'NA':
                    continue
                pdb_resi = sorted(list(set(chain_atoms.getResnums())))[pdb_resi - 1]
                residueID = chain_id + '-' + str(pdb_resi)
                if residueID not in dict_dssp.keys():
                    continue
                RSA = dict_dssp[residueID]['RSA']
                outfile.write("\t".join(map(str, ['ISG15', uniprot_id, position, PDB, chain_id, pdb_resi, RSA])) + "\n")
def protein_seq_update_celery_nofunction(full_batch=False):
    proteins = None  # Protein.objects.extra(where=["CHAR_LENGTH(sequence) = 0"])
    if full_batch:
        proteins = Protein.objects.all()
    else:
        proteins = Protein.objects.extra(where=["CHAR_LENGTH(sequence) = 0"])
    uniprot_data = uniprot.batch_uniprot_metadata([b.prot_id for b in proteins])
    for key in uniprot_data.keys():
        defaults = {}
        try:
            defaults['sequence'] = uniprot_data[key]['sequence']
        except KeyError:
            pass
        try:
            defaults['description'] = uniprot_data[key]['description']
        except KeyError:
            pass
        prot, _ = Protein.objects.update_or_create(prot_id=key, defaults=defaults)
    return 'Protein sequences updated'
def retrieve_uniprot_meta(uniprotIDs_file):
    with open(uniprotIDs_file, 'r') as f:
        id_list = [uni_id.strip() for uni_id in f]
    meta_dict = uniprot.batch_uniprot_metadata(id_list)
    return meta_dict
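# A minimal usage sketch for retrieve_uniprot_meta above; 'uniprot_ids.txt' is
# an assumed file with one UniProt accession per line.
meta = retrieve_uniprot_meta('uniprot_ids.txt')
for uni_id in meta:
    print uni_id, meta[uni_id].get('description')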
# Clean up caches
os.system('rm cache*')

# Example 1 - reading a fasta file
seqids, fastas = uniprot.read_fasta('example.fasta')
pprint.pprint(seqids, indent=2)

# Example 2 - map identifiers for RefSeq to Uniprot
seqids = "NP_000508.1 NP_001018081.3".split()
pairs = uniprot.batch_uniprot_id_mapping_pairs('P_REFSEQ_AC', 'ACC', seqids)
pprint.pprint(pairs, indent=2)

# Example 3 - get UniProt metadata
uniprot_seqids = [j for i, j in pairs]
uniprot_data = uniprot.batch_uniprot_metadata(uniprot_seqids, 'cache')
pprint.pprint(uniprot_data, indent=2)

# Example 4 - parse for isoforms in metadata
text = open('cache/metadata.0.txt').read()
uniprot_data = uniprot.parse_isoforms(text)
pprint.pprint(uniprot_data)

# Example 5 - chaining commands to map seqids
seqids = "EFG_MYCA1 YP_885981.1 ENSG00000196176 Q91ZU6-8".split()
uniprot_data = uniprot.get_metadata_with_some_seqid_conversions(seqids, 'cache2')
pprint.pprint(uniprot_data, indent=2)
# Example 3 - sequential identifier mapping to UniProt
# identifiers using robust but slow method
seqids = """
EFG_MYCA1
YP_885981.1
CpC231_1796
""".split()
mapping = uniprot.sequentially_convert_to_uniprot_id(seqids, "cache.json")
uniprot_seqids = mapping.values()

# Example 4 - get UniProt metadata
uniprot_data = uniprot.batch_uniprot_metadata(uniprot_seqids, "cache2.txt")
pprint.pprint(uniprot_data, indent=2)
for l in open("cache2.txt"):
    print l.strip()
uniprot.write_fasta("example.output.fasta", uniprot_data, uniprot_seqids)

# Example 5 - chaining commands to make your own
# special mapper
def map_to_refseq(seqids):
    uniprot_mapping = uniprot.sequentially_convert_to_uniprot_id(seqids, "func.cache.json")
# Load hand-curated CycD putative target list
filename = '/data/cycd_targets/cycd_target_uniprot.txt'
targetIDs = pd.read_csv(filename)
already_seen = pd.concat((targetIDs['Entry'], entries))

# Load hit list from PSSM
filename = '/data/cycd_targets/hsap_proteome/hsap_hits>20.csv'
targetIDs = pd.read_csv(filename, sep='\t')
entries = targetIDs['Entry']

# Do a merge to see what's not already seen in the hand-curated list
merged = set(entries) - set(already_seen)

# Fetch and write as FASTA
out_name = '/data/cycd_targets/hsap_hits>20.fasta'
upData = uniprot.batch_uniprot_metadata(merged, 'cache')
uniprot.write_fasta(out_name, upData, merged)
split_fastas(out_name)

PSIPRED_DIR = '/data/cycd_targets/cycd_target_uniprot_individuals'
seqs = []
for filename in os.listdir(PSIPRED_DIR):
    if filename.endswith('.ss2'):
        print 'Working on ', filename
        # Load PSIPRED VFORMAT in a sane way to extract only relevant info
        df = pd.read_csv(os.path.join(PSIPRED_DIR, filename),
                         header=0,
                         delim_whitespace=True,