def _build_uniprot_sequences():
    """Load Swissprot plus Uniprot isoform sequences into a single dict.

    Isoform sequences are merged on top of the Swissprot ones, so on any
    key collision the isoform entry wins.
    """
    swissprot_path = resource_manager.get_create_resource_file('swissprot',
                                                               cached=True)
    isoform_path = resource_manager.get_create_resource_file('isoforms',
                                                             cached=True)
    logger.info("Loading Swissprot sequences...")
    sequences = load_fasta_sequences(swissprot_path)
    logger.info("Loading Uniprot isoform sequences...")
    sequences.update(load_fasta_sequences(isoform_path))
    return sequences
def _build_uniprot_entries():
    """Parse the Uniprot and SARS-CoV-2 entry TSV files into lookup tables.

    Returns a tuple of dicts/sets keyed largely by Uniprot ID: gene name,
    mnemonic (both directions), MGI/RGD cross-references (both directions),
    sequence length, the set of reviewed IDs, parsed features per ID, and
    a flat dict of features keyed by feature ID.
    """
    entry_files = [
        resource_manager.get_create_resource_file('up'),
        resource_manager.get_create_resource_file('up_sars_cov2'),
    ]
    uniprot_gene_name = {}
    uniprot_mnemonic = {}
    uniprot_mnemonic_reverse = {}
    uniprot_mgi = {}
    uniprot_rgd = {}
    uniprot_mgi_reverse = {}
    uniprot_rgd_reverse = {}
    uniprot_length = {}
    uniprot_features = {}
    uniprot_reviewed = set()
    for entries_file in entry_files:
        with open(entries_file, 'r') as fh:
            reader = csv.reader(fh, delimiter='\t')
            next(reader)  # skip the header row
            for (up_id, gene_name, up_mnemonic, rgd, mgi, length, reviewed,
                    features_json) in reader:
                # Track which entries are reviewed (Swissprot) ones
                if reviewed == 'reviewed':
                    uniprot_reviewed.add(up_id)
                uniprot_gene_name[up_id] = gene_name
                uniprot_mnemonic[up_id] = up_mnemonic
                uniprot_mnemonic_reverse[up_mnemonic] = up_id
                uniprot_length[up_id] = int(length)
                # MGI/RGD columns are semicolon-separated; keep the first ID
                if mgi:
                    first_mgi = mgi.split(';')[0]
                    uniprot_mgi[up_id] = first_mgi
                    uniprot_mgi_reverse[first_mgi] = up_id
                if rgd:
                    first_rgd = rgd.split(';')[0]
                    uniprot_rgd[up_id] = first_rgd
                    uniprot_rgd_reverse[first_rgd] = up_id
                uniprot_features[up_id] = [
                    feature_from_json(feat)
                    for feat in json.loads(features_json)
                ]
    # Flatten all features into a dict keyed by feature ID; later entries
    # overwrite earlier ones on ID collision, as in a plain loop.
    features_by_id = {feat.id: feat
                      for feats in uniprot_features.values()
                      for feat in feats}
    return (uniprot_gene_name, uniprot_mnemonic, uniprot_mnemonic_reverse,
            uniprot_mgi, uniprot_rgd, uniprot_mgi_reverse,
            uniprot_rgd_reverse, uniprot_length, uniprot_reviewed,
            uniprot_features, features_by_id)
def _build_hgnc_mappings():
    """Build HGNC name/ID/Uniprot/Entrez cross-reference mappings.

    Returns
    -------
    tuple
        (hgnc_name_to_id, hgnc_id_to_up, up_to_hgnc_id,
         entrez_to_up, up_to_entrez) built from approved HGNC entries.
    """
    hgnc_file = resource_manager.get_create_resource_file('hgnc')
    hgnc_name_to_id = {}
    hgnc_id_to_up = {}
    up_to_hgnc_id = {}
    entrez_to_up = {}
    up_to_entrez = {}
    with gzip.open(hgnc_file, 'rt', encoding='utf-8') as fh:
        csv_rows = csv.reader(fh, delimiter='\t')
        # Skip the header row
        next(csv_rows)
        for row in csv_rows:
            hgnc_id = row[0][5:]
            hgnc_status = row[3]
            if hgnc_status != 'Approved':
                continue
            hgnc_name = row[1]
            hgnc_name_to_id[hgnc_name] = hgnc_id
            # Uniprot: compute the split list on every row. Previously the
            # list was only assigned inside the `if uniprot_id:` branch, so
            # the Entrez block below could reuse a stale list from an
            # earlier row (or raise NameError on the first row) whenever
            # the Uniprot column was empty.
            uniprot_id = row[6]
            uniprot_ids = uniprot_id.split(', ') if uniprot_id else []
            if uniprot_id:
                hgnc_id_to_up[hgnc_id] = uniprot_id
                for upid in uniprot_ids:
                    up_to_hgnc_id[upid] = hgnc_id
            # Entrez: only record mappings when there is an actual Uniprot
            # ID on this row; the old code could map an Entrez ID to an
            # empty string.
            entrez_id = row[5]
            if entrez_id and uniprot_id:
                for upid in uniprot_ids:
                    up_to_entrez[upid] = entrez_id
                entrez_to_up[entrez_id] = uniprot_id
    return hgnc_name_to_id, hgnc_id_to_up, up_to_hgnc_id, \
        entrez_to_up, up_to_entrez
def _build_human_mouse_rat():
    """Map human Uniprot IDs to mouse and rat ortholog Uniprot IDs.

    Reads the HGNC table, takes the first MGI/RGD cross-reference per row,
    and resolves it to a Uniprot ID via the reverse MGI/RGD lookups on `um`.
    """
    hgnc_file = resource_manager.get_create_resource_file('hgnc')
    uniprot_mouse = {}
    uniprot_rat = {}
    with gzip.open(hgnc_file, 'rt', encoding='utf-8') as fh:
        rows = csv.reader(fh, delimiter='\t')
        next(rows)  # skip the header row
        for row in rows:
            human_id, mgi_id, rgd_id = row[6:9]
            if not human_id:
                continue
            if mgi_id:
                # Keep the first MGI ID and strip any "MGI:" prefix
                first_mgi = mgi_id.split(', ')[0]
                if first_mgi.startswith('MGI:'):
                    first_mgi = first_mgi[4:]
                mouse_id = um.uniprot_mgi_reverse.get(first_mgi)
                if mouse_id:
                    uniprot_mouse[human_id] = mouse_id
            if rgd_id:
                # Keep the first RGD ID and strip any "RGD:" prefix
                first_rgd = rgd_id.split(', ')[0]
                if first_rgd.startswith('RGD:'):
                    first_rgd = first_rgd[4:]
                rat_id = um.uniprot_rgd_reverse.get(first_rgd)
                if rat_id:
                    uniprot_rat[human_id] = rat_id
    return uniprot_mouse, uniprot_rat
def _build_refseq_uniprot():
    """Build a mapping from RefSeq IDs to lists of Uniprot IDs.

    A RefSeq ID can map to multiple Uniprot IDs, so values are lists in
    file order.
    """
    mapping_file = resource_manager.get_create_resource_file(
        'refseq_uniprot')
    refseq_up = {}
    with gzip.open(mapping_file, 'rt', encoding='utf-8') as fh:
        for refseq_id, up_id in csv.reader(fh):
            refseq_up.setdefault(refseq_id, []).append(up_id)
    return refseq_up
def _build_uniprot_entries():
    """Parse the Uniprot entries TSV into lookup tables.

    Returns a tuple of dicts/sets keyed by Uniprot ID: gene name, mnemonic
    (both directions), MGI/RGD cross-references (both directions), sequence
    length, the set of reviewed IDs, and signal peptide (begin, end)
    positions ((None, None) when absent).
    """
    up_entries_file = resource_manager.get_create_resource_file('up')
    uniprot_gene_name = {}
    uniprot_mnemonic = {}
    uniprot_mnemonic_reverse = {}
    uniprot_mgi = {}
    uniprot_rgd = {}
    uniprot_mgi_reverse = {}
    uniprot_rgd_reverse = {}
    uniprot_length = {}
    uniprot_signal_peptide = {}
    uniprot_reviewed = set()
    with open(up_entries_file, 'r') as fh:
        reader = csv.reader(fh, delimiter='\t')
        next(reader)  # skip the header row
        for (up_id, gene_name, up_mnemonic, rgd, mgi, length, reviewed,
                signal_peptide) in reader:
            # Track which entries are reviewed (Swissprot) ones
            if reviewed == 'reviewed':
                uniprot_reviewed.add(up_id)
            uniprot_gene_name[up_id] = gene_name
            uniprot_mnemonic[up_id] = up_mnemonic
            uniprot_mnemonic_reverse[up_mnemonic] = up_id
            uniprot_length[up_id] = int(length)
            # MGI/RGD columns are semicolon-separated; keep the first ID
            if mgi:
                first_mgi = mgi.split(';')[0]
                uniprot_mgi[up_id] = first_mgi
                uniprot_mgi_reverse[first_mgi] = up_id
            if rgd:
                first_rgd = rgd.split(';')[0]
                uniprot_rgd[up_id] = first_rgd
                uniprot_rgd_reverse[first_rgd] = up_id
            # Default to (None, None); parse "SIGNAL <beg> <end> ..." if
            # the column is populated and matches the expected format.
            uniprot_signal_peptide[up_id] = (None, None)
            if signal_peptide:
                match = re.match(r'SIGNAL (\d+) (\d+) ', signal_peptide)
                if match:
                    beg_pos, end_pos = match.groups()
                    uniprot_signal_peptide[up_id] = \
                        (int(beg_pos), int(end_pos))
    return (uniprot_gene_name, uniprot_mnemonic, uniprot_mnemonic_reverse,
            uniprot_mgi, uniprot_rgd, uniprot_mgi_reverse,
            uniprot_rgd_reverse, uniprot_length, uniprot_reviewed,
            uniprot_signal_peptide)
def _build_uniprot_sec():
    """Map secondary Uniprot accession numbers to primary accessions.

    Scans the sec_ac resource file for the "Secondary AC" header line;
    the data rows start two lines below it, each containing a secondary
    and a primary accession separated by whitespace. A secondary ID can
    map to multiple primary IDs, so values are lists.

    Returns
    -------
    dict
        Mapping from secondary accession to a list of primary accessions.
    """
    # File containing secondary accession numbers mapped
    # to primary accession numbers
    sec_file = resource_manager.get_create_resource_file('upsec')
    uniprot_sec = {}
    # Use a context manager so the file handle is closed deterministically
    # (the original left it open).
    with open(sec_file, 'rt') as fh:
        lines = fh.readlines()
    for idx, line in enumerate(lines):
        if line.startswith('Secondary AC'):
            # Data rows begin two lines below the header
            for entry in lines[idx + 2:]:
                sec_id, prim_id = entry.split()
                uniprot_sec.setdefault(sec_id, []).append(prim_id)
            # Everything after the header was consumed above, so stop
            # scanning instead of iterating over the data rows again.
            break
    return uniprot_sec
def _get_phospho_site_dataset():
    """Read phosphosite data into dicts keyed by Uniprot ID and by site group.

    Returns
    -------
    tuple
        The first element of the tuple contains the PhosphoSite data
        keyed by Uniprot ID, the second element contains data keyed by
        site group. Both dicts have instances of the PhosphoSite namedtuple
        as values. If the PhosphoSite data file cannot be loaded, returns
        (None, None).
    """
    # Module-level cache: the file is parsed only once; subsequent calls
    # return the cached dicts.
    global _data_by_up
    global _data_by_site_grp
    phosphosite_data_file = resource_manager.get_create_resource_file('psp')
    if _data_by_up is None or _data_by_site_grp is None:
        with open(phosphosite_data_file, 'r') as fh:
            # Get the csv reader generator
            reader = csv.reader(fh, delimiter='\t')
            # Skip 4 rows (file preamble before the data rows)
            for _ in range(4):
                next(reader)
            # Build up a dict by protein: up_id -> residue/position -> sites
            data_by_up = defaultdict(lambda: defaultdict(list))
            data_by_site_grp = defaultdict(list)
            for row in reader:
                site = PhosphoSite(*row)
                # MOD_RSD looks like e.g. "S15-p"; keep the residue+position
                # part before the hyphen
                res_pos = site.MOD_RSD.split('-')[0]
                #res_pos = res_pos[1:] # DANGEROUS: lookup based on pos alone
                # ACC_ID may be isoform-specific ("P12345-2"); the part
                # before the hyphen is the base protein accession
                base_acc_id = site.ACC_ID.split('-')[0]
                data_by_up[site.ACC_ID][res_pos].append(site)
                # If the ID was isoform specific, add to the dict for the whole
                # protein
                if base_acc_id != site.ACC_ID:
                    data_by_up[base_acc_id][res_pos].append(site)
                # Catch the handful of isoforms that have a Uniprot ID without
                # the hyphen
                elif site.ACC_ID in _iso_to_ref_map:
                    ref_id = _iso_to_ref_map[site.ACC_ID]
                    data_by_up[ref_id][res_pos].append(site)
                # To catch additional cases, include an entry for the -1 base ID
                # NOTE: this aliases the same inner dict under the "-1" key
                # (not a copy), so later additions to the base ID are visible
                # under both keys.
                else:
                    data_by_up['%s-1' % base_acc_id] = data_by_up[base_acc_id]
                data_by_site_grp[site.SITE_GRP_ID].append(site)
            _data_by_up = data_by_up
            _data_by_site_grp = data_by_site_grp
    return (_data_by_up, _data_by_site_grp)
def _build_hgnc_mappings():
    """Build HGNC name-to-ID and HGNC-ID-to-Uniprot mappings.

    Only rows whose status is 'Approved' are included.
    """
    hgnc_file = resource_manager.get_create_resource_file('hgnc')
    hgnc_ids = {}
    uniprot_ids = {}
    with open(hgnc_file, 'r') as fh:
        rows = csv.reader(fh, delimiter='\t')
        next(rows)  # skip the header row
        for row in rows:
            if row[3] != 'Approved':
                continue
            # row[0] carries a 5-character prefix (presumably "HGNC:")
            # which is stripped to get the bare ID
            hgnc_id = row[0][5:]
            hgnc_ids[row[1]] = hgnc_id
            # Uniprot
            uniprot_ids[hgnc_id] = row[6]
    return hgnc_ids, uniprot_ids
def _build_uniprot_entries():
    """Parse the Uniprot entries TSV into a collection of lookup tables.

    Returns a tuple of dicts/sets keyed largely by Uniprot ID: gene name
    (None when missing), mnemonic (both directions), MGI/RGD
    cross-references (both directions), sequence length, the set of
    reviewed IDs, parsed features per ID, a flat features-by-ID dict,
    organism IDs, Entrez mappings (both directions), and MGI/RGD gene
    name to Uniprot ID lookups.
    """
    up_entries_file = resource_manager.get_create_resource_file('up')
    uniprot_gene_name = {}
    uniprot_mnemonic = {}
    uniprot_mnemonic_reverse = {}
    uniprot_mgi = {}
    uniprot_rgd = {}
    uniprot_mgi_reverse = {}
    uniprot_rgd_reverse = {}
    uniprot_length = {}
    uniprot_features = {}
    uniprot_reviewed = set()
    organisms_by_id = {}
    uniprot_entrez = {}
    uniprot_entrez_reverse = {}
    mgi_name_to_up = {}
    rgd_name_to_up = {}
    for entries_file in [up_entries_file]:
        with gzip.open(entries_file, 'rt', encoding='utf-8') as fh:
            reader = csv.reader(fh, delimiter='\t')
            next(reader)  # skip the header row
            for (up_id, gene_name, up_mnemonic, rgd, mgi, length, reviewed,
                    organism_id, entrez_id, features_json) in reader:
                # Track which entries are reviewed (Swissprot) ones
                if reviewed == 'reviewed':
                    uniprot_reviewed.add(up_id)
                # Turn empty gene name strings into explicit Nones
                uniprot_gene_name[up_id] = gene_name or None
                uniprot_mnemonic[up_id] = up_mnemonic
                uniprot_mnemonic_reverse[up_mnemonic] = up_id
                uniprot_length[up_id] = int(length)
                # MGI/RGD columns are semicolon-separated; keep the first ID
                if mgi:
                    first_mgi = mgi.split(';')[0]
                    uniprot_mgi[up_id] = first_mgi
                    uniprot_mgi_reverse[first_mgi] = up_id
                    mgi_name_to_up[gene_name] = up_id
                if rgd:
                    first_rgd = rgd.split(';')[0]
                    uniprot_rgd[up_id] = first_rgd
                    uniprot_rgd_reverse[first_rgd] = up_id
                    rgd_name_to_up[gene_name] = up_id
                uniprot_features[up_id] = [
                    feature_from_json(feat)
                    for feat in json.loads(features_json)
                ]
                organisms_by_id[up_id] = organism_id
                # Entrez mappings: the column is semicolon-separated;
                # the forward mapping keeps the last non-empty ID
                for eid in (e.strip() for e in entrez_id.split(';')):
                    if eid:
                        uniprot_entrez[up_id] = eid
                        uniprot_entrez_reverse[eid] = up_id
    # Flatten all features into a dict keyed by feature ID; later entries
    # overwrite earlier ones on ID collision, as in a plain loop.
    features_by_id = {feat.id: feat
                      for feats in uniprot_features.values()
                      for feat in feats}
    return (uniprot_gene_name, uniprot_mnemonic, uniprot_mnemonic_reverse,
            uniprot_mgi, uniprot_rgd, uniprot_mgi_reverse,
            uniprot_rgd_reverse, uniprot_length, uniprot_reviewed,
            uniprot_features, features_by_id, organisms_by_id,
            uniprot_entrez, uniprot_entrez_reverse,
            mgi_name_to_up, rgd_name_to_up)
def _build_refseq_sequences():
    """Load RefSeq protein sequences from the cached FASTA resource."""
    fasta_path = resource_manager.get_create_resource_file('refseq_seq',
                                                           cached=True)
    logger.info("Loading RefSeq protein sequences...")
    return load_fasta_sequences(fasta_path, id_delimiter=' ', id_index=0)