def overlap(self):
    """Print the PDB-set entries that also occur in the BC set.

    Every ID in the ``self.bc90`` FASTA file is translated through the
    mapping returned by ``self.read_pdb_uni()``; IDs missing from that
    mapping are ignored.  Each ID in the ``self.pdb90`` FASTA file that
    equals one of the translated values is printed, followed on the next
    line by the BC ID it was translated from.  If several BC IDs translate
    to the same value, the last one in file order is reported.
    """
    pdb_uni = self.read_pdb_uni()
    cb_ids, _ = tools_fasta.fasta_to_id_seq(self.bc90)
    # Translated ID -> originating BC ID.  The dict serves both as the
    # membership test and as the reverse lookup, so no parallel list of keys
    # is needed.
    cb_pdb_unis = {
        pdb_uni[cb_id]: cb_id for cb_id in cb_ids if cb_id in pdb_uni
    }
    pdb_ids, _ = tools_fasta.fasta_to_id_seq(self.pdb90)
    print("Proteins overlapping between the PDB and BC datasets")
    for pdb_id in pdb_ids:
        if pdb_id in cb_pdb_unis:
            print(pdb_id)
            print(cb_pdb_unis[pdb_id])
def get_human(self):
    """Score the sequences in ``self.human_fp`` and label each one 'Human'.

    Returns:
        A 4-tuple ``(pids, proteome, org, scores)``: the IDs read from the
        FASTA file, two separate lists that each hold ``'Human'`` once per
        ID, and the value returned by ``NormScore().lc_norm_score`` for the
        sequences.
    """
    protein_ids, sequences = tools_fasta.fasta_to_id_seq(self.human_fp)
    lc_scores = NormScore().lc_norm_score(sequences)
    count = len(protein_ids)
    # Built as two distinct list objects, not two names for one list.
    proteome_labels = ['Human'] * count
    organism_labels = ['Human'] * count
    return protein_ids, proteome_labels, organism_labels, lc_scores
def get_yeast(self):
    """Score the sequences in ``self.yeast_fp`` and label each one 'Yeast'.

    Returns:
        A 4-tuple ``(pids, proteome, org, scores)``: the IDs read from the
        FASTA file, two separate lists that each hold ``'Yeast'`` once per
        ID, and the value returned by ``NormScore().lc_norm_score`` for the
        sequences.
    """
    protein_ids, sequences = tools_fasta.fasta_to_id_seq(self.yeast_fp)
    lc_scores = NormScore().lc_norm_score(sequences)
    count = len(protein_ids)
    # Built as two distinct list objects, not two names for one list.
    proteome_labels = ['Yeast'] * count
    organism_labels = ['Yeast'] * count
    return protein_ids, proteome_labels, organism_labels, lc_scores
def train_df(self):
    """Write a labelled training table built from the BC and PDB FASTA files.

    Records from ``self.bc_fpi`` are labelled ``y = 0`` and records from
    ``self.pdb_fpi`` are labelled ``y = 1``; the BC rows come first.  The
    table has the columns ``Protein ID``, ``y``, ``Sequence`` and ``Length``
    (in that order) and is written tab-separated to ``self.train_fpo``.
    """
    # The PDB file is read first, then the BC file.
    pdb_ids, pdb_sequences = tools_fasta.fasta_to_id_seq(self.pdb_fpi)
    pdb_lengths = tools_fasta.get_lengths(pdb_sequences)
    bc_ids, bc_sequences = tools_fasta.fasta_to_id_seq(self.bc_fpi)
    bc_lengths = tools_fasta.get_lengths(bc_sequences)

    all_lengths = bc_lengths + pdb_lengths
    all_ids = bc_ids + pdb_ids
    all_sequences = bc_sequences + pdb_sequences
    labels = [0] * len(bc_ids) + [1] * len(pdb_ids)

    column_order = ['Protein ID', 'y', 'Sequence', 'Length']
    table = pd.DataFrame(
        {
            'Protein ID': all_ids,
            'Sequence': all_sequences,
            'Length': all_lengths,
            'y': labels,
        },
        columns=column_order,
    )
    table.to_csv(self.train_fpo, sep='\t')
def write_scores(self):
    """Score every sequence in ``self.all_fpi`` and save the ranked table.

    The table has the columns ``Protein ID`` and ``LC Score``, is sorted by
    ``LC Score`` in descending order, printed, and then written
    tab-separated to ``self.all_fpo``.
    """
    protein_ids, sequences = tools_fasta.fasta_to_id_seq(self.all_fpi)
    lc_scores = NormScore().lc_norm_score(sequences)
    column_order = ['Protein ID', 'LC Score']
    table = pd.DataFrame(
        {'Protein ID': protein_ids, 'LC Score': lc_scores},
        columns=column_order,
    )
    ranked = table.sort_values(by='LC Score', ascending=False)
    print(ranked)
    ranked.to_csv(self.all_fpo, sep='\t')
def get_scores(self):
    """Score the ``self.yeast_fasta`` records selected by ``self.get_pbody()``.

    A record is kept when its ID is contained in the value returned by
    ``self.get_pbody()``.  The kept IDs and their ``lc_norm_score`` results
    are written tab-separated to ``self.yeast_scores`` with the columns
    ``Protein ID`` and ``LC Score``.
    """
    pbody_ids = self.get_pbody()
    all_ids, all_sequences = tools_fasta.fasta_to_id_seq(self.yeast_fasta)
    # Filter IDs and sequences together so they stay paired.
    selected = [
        (pid, seq)
        for pid, seq in zip(all_ids, all_sequences)
        if pid in pbody_ids
    ]
    kept_ids = [pid for pid, _ in selected]
    kept_sequences = [seq for _, seq in selected]
    lc_scores = NormScore().lc_norm_score(kept_sequences)
    table = pd.DataFrame({'Protein ID': kept_ids, 'LC Score': lc_scores})
    table.to_csv(self.yeast_scores, sep='\t')
def read_fasta(self):
    """Print a low-complexity report for the fifth record (``sla1``).

    Reads ``self.fasta_fpi`` and, for the sequence at index 4, prints in
    this order: its length, the slice ``[954:1244]``, a blank line, the full
    sequence, the value returned by ``tools_lc.display_lc`` for it (called
    with ``self.k``, ``self.lca`` and ``self.lce``), the ``lc_norm_score``
    of the sequence with ``[954:1244]`` removed, and the ``lc_norm_score``
    of the full sequence.

    Raises:
        IndexError: if the file holds fewer than five records.
    """
    _, seqs = tools_fasta.fasta_to_id_seq(self.fasta_fpi)
    norm = NormScore()

    # Only the sla1 analysis is active.  Other records and regions noted in
    # this method, as (index in file, name, region):
    #   0 ent1    [211:457]      1 ent2    [224:616]
    #   2 yap1801 [351:638]      3 yap1802 [319:569]
    #   5 sla2    [348:442]      6 sup35   [0:123]
    sla1 = seqs[4]
    print(len(sla1))
    print(sla1[954:1244])
    print()
    lc_display = tools_lc.display_lc(sla1, self.k, self.lca, self.lce)
    print(sla1)
    print(lc_display)
    # Score with the region cut out, then the full-length score.
    sla1_without_region = sla1[:954] + sla1[1244:]
    print(norm.lc_norm_score([sla1_without_region]))
    print(norm.lc_norm_score([sla1]))
def run(self):
    """Compare full-length scores with scores of the segmented sequences.

    For every record in the ``self.puncta`` FASTA file, the rows of the
    ``self.pfam_puncta`` table whose ``uniprot_acc`` equals the record ID
    are sorted by ``seq_start`` and passed with the sequence to
    ``self.segment_seq``.  Records whose segments total fewer than 100
    residues are only counted.  For the others, a normalised score is
    computed for the full sequence and for the segments.

    Prints the number of records at or above the threshold, the number
    below it, then the mean and the median of both score lists (segment
    scores first), and finally shows overlaid histograms of the two lists.
    """
    protein_ids, sequences = tools_fasta.fasta_to_id_seq(self.puncta)
    pfam_table = pd.read_csv(self.pfam_puncta, sep='\t', index_col=0)

    n_above = 0
    n_below = 0
    segment_norms = []
    full_norms = []
    for protein_id, sequence in zip(protein_ids, sequences):
        hits = pfam_table[pfam_table['uniprot_acc'] == protein_id]
        hits = hits.sort_values(by='seq_start')
        segments = self.segment_seq(sequence, hits)

        # Too little segmented sequence: count the record and move on.
        if sum(len(segment) for segment in segments) < 100:
            n_below += 1
            continue

        n_above += 1
        full_raw, full_len = self.get_segment_scores([sequence])
        full_norm = self.norm_function([full_raw], [full_len])
        seg_raw, seg_len = self.get_segment_scores(segments)
        seg_norm = self.norm_function([seg_raw], [seg_len])
        segment_norms.append(seg_norm[0])
        full_norms.append(full_norm[0])

    print(n_above)
    print(n_below)
    print(np.mean(segment_norms))
    print(np.mean(full_norms))
    print(np.median(segment_norms))
    print(np.median(full_norms))

    hist_options = dict(alpha=0.5, bins=20, range=(-100, 200))
    plt.hist(full_norms, label='Full length scores', **hist_options)
    plt.hist(segment_norms, label='Outside Pfam scores', **hist_options)
    plt.legend()
    plt.show()
def concat_train(self):
    """Write a labelled training table from a BC FASTA file and a PDB table.

    BC records come from the FASTA file ``self.bc_fpi`` and are labelled
    ``y = 0``.  PDB records come from the tab-separated table
    ``self.pdb_fpi`` (first column used as the index; columns
    ``Protein ID``, ``Sequence`` and ``Length`` are read) and are labelled
    ``y = 1``.  BC rows come first.  The result has the columns
    ``Protein ID``, ``y``, ``Length`` and ``Sequence`` (in that order) and
    is written tab-separated to ``self.fpo``.
    """
    bc_ids, bc_sequences = tools_fasta.fasta_to_id_seq(self.bc_fpi)
    bc_lengths = tools_fasta.get_lengths(bc_sequences)

    pdb_table = pd.read_csv(self.pdb_fpi, sep='\t', index_col=0)
    pdb_ids = list(pdb_table['Protein ID'])
    pdb_sequences = list(pdb_table['Sequence'])
    pdb_lengths = list(pdb_table['Length'])

    all_ids = bc_ids + pdb_ids
    all_sequences = bc_sequences + pdb_sequences
    all_lengths = bc_lengths + pdb_lengths
    labels = [0] * len(bc_ids) + [1] * len(pdb_ids)

    column_order = ['Protein ID', 'y', 'Length', 'Sequence']
    table = pd.DataFrame(
        {
            'Protein ID': all_ids,
            'Sequence': all_sequences,
            'Length': all_lengths,
            'y': labels,
        },
        columns=column_order,
    )
    table.to_csv(self.fpo, sep='\t')
def with_pfam(self, fasta_fp, pfam_fp, fpo):
    """Score the proteins that have no Pfam annotation and save the ranking.

    Args:
        fasta_fp: FASTA file with the proteins to examine.
        pfam_fp: Tab-separated Pfam table; its ``uniprot_acc`` column lists
            the IDs that do have an annotation.
        fpo: Output path for the tab-separated result.

    Prints the number of records in the FASTA file.  Every record whose ID
    is absent from ``uniprot_acc`` is scored with
    ``NormScore().lc_norm_score``.  The output has the columns
    ``UniProt ID`` and ``LC Score``, sorted by ``LC Score`` in descending
    order.
    """
    df = pd.read_csv(pfam_fp, sep='\t')
    pfam_ids = set(df['uniprot_acc'])
    pids, seqs = tools_fasta.fasta_to_id_seq(fasta_fp)
    print(len(pids))

    # Collect IDs and sequences in the same pass so each score stays paired
    # with its own ID.  Taking the IDs from a set difference would put them
    # in arbitrary order (and drop duplicates) while the sequences stayed
    # in file order.
    nopfam_ids = []
    nopfam_seqs = []
    for pid, seq in zip(pids, seqs):
        if pid not in pfam_ids:
            nopfam_ids.append(pid)
            nopfam_seqs.append(seq)

    ns = NormScore()
    scores = ns.lc_norm_score(nopfam_seqs)
    df_out = pd.DataFrame(
        {'UniProt ID': nopfam_ids, 'LC Score': scores},
        columns=['UniProt ID', 'LC Score'],
    )
    df_out = df_out.sort_values(by='LC Score', ascending=False)
    df_out.to_csv(fpo, sep='\t')
def percent_pfam(self, fasta_fp, pfam_fp, fpo):
    """Tabulate each protein's LC score and the fraction removed by segmenting.

    Args:
        fasta_fp: FASTA file with the proteins to examine.
        pfam_fp: Tab-separated Pfam table with ``uniprot_acc`` and
            ``seq_start`` columns.
        fpo: Output path for the tab-separated result.

    For every record, the Pfam rows for its ID (sorted by ``seq_start``)
    are passed with the sequence to ``self.segment_seq``.  The stored
    fraction is ``(len(seq) - total segment length) / len(seq)``.  The
    output has the columns ``Uniprot ID``, ``LC Score`` and
    ``Pfam Fraction``, sorted by ``LC Score`` in descending order.  The
    mean fraction is printed last.
    """
    pfam_table = pd.read_csv(pfam_fp, sep='\t')
    protein_ids, sequences = tools_fasta.fasta_to_id_seq(fasta_fp)

    pfam_fractions = []
    for protein_id, sequence in zip(protein_ids, sequences):
        hits = pfam_table[pfam_table['uniprot_acc'] == protein_id]
        hits = hits.sort_values(by='seq_start')
        segments = self.segment_seq(sequence, hits)
        segment_total = sum(len(segment) for segment in segments)
        full_length = len(sequence)
        pfam_fractions.append(
            float(full_length - segment_total) / float(full_length)
        )

    lc_scores = NormScore().lc_norm_score(sequences)
    column_order = ['Uniprot ID', 'LC Score', 'Pfam Fraction']
    table = pd.DataFrame(
        {
            'Uniprot ID': protein_ids,
            'LC Score': lc_scores,
            'Pfam Fraction': pfam_fractions,
        },
        columns=column_order,
    )
    ranked = table.sort_values(by='LC Score', ascending=False)
    ranked.to_csv(fpo, sep='\t')
    print(np.mean(pfam_fractions))
def get_pids(self, fasta):
    """Return the IDs that ``tools_fasta.fasta_to_id_seq`` reads from ``fasta``.

    The sequences read alongside the IDs are discarded.
    """
    protein_ids, _ = tools_fasta.fasta_to_id_seq(fasta)
    return protein_ids