def Decriptor_generator(self, ps):
    """Return one merged dict of PyPro descriptors for the sequence ``ps``.

    ``ps`` is loaded into a fresh ``PyPro`` object, ten descriptor groups are
    computed from it and folded into a single dictionary.  Groups are merged
    in the order they appear below, so if two groups ever share a key the
    later one wins.

    The dipeptide and tripeptide compositions are intentionally not part of
    this set (the tripeptide one was flagged as slow by the original author).
    """
    sequence = PyPro()
    sequence.ReadProteinSequence(ps)

    merged = {}
    merged.update(sequence.GetAAComp())
    merged.update(sequence.GetTriad())
    # The two pseudo amino acid composition calls were marked "takes time".
    merged.update(sequence.GetPAAC(lamda=5, weight=0.5))
    merged.update(sequence.GetAPAAC(lamda=5, weight=0.5))
    merged.update(sequence.GetCTD())
    merged.update(sequence.GetGearyAuto())
    merged.update(sequence.GetMoranAuto())
    merged.update(sequence.GetMoreauBrotoAuto())
    merged.update(sequence.GetQSO())
    merged.update(sequence.GetSOCN())
    return merged
def Protein_gen(data, Proteingroup):
    """Build a descriptor table (header row + one row per sequence).

    For every entry of ``data`` the descriptor groups selected by
    ``Proteingroup`` are computed with ``pydpi``'s ``PyPro`` and concatenated
    into one row.  Columns are then filtered with
    ``Descriptors_Selection.VarinceThreshold`` and the surviving descriptor
    names are stacked on top of the value rows.

    Args:
        data: Iterable of protein sequences; each item is passed to
            ``PyPro.ReadProteinSequence``.
        Proteingroup: Iterable of group codes, each one of the strings
            ``'0'`` .. ``'11'`` (see the table below).  Groups are emitted in
            the order given.

    Returns:
        The result of ``np.append(header_row, values, axis=0)``: row 0 holds
        the descriptor names, the following rows the descriptor values.
        NOTE(review): names are strings and values floats, so NumPy coerces
        them to one common dtype -- confirm what callers expect.

    Raises:
        ValueError: If ``Proteingroup`` contains a code that is not listed in
            the table.  Previously such a code either raised ``NameError`` or
            silently repeated the preceding group's descriptors.
    """
    # Group code -> extractor.  The counts are the descriptor counts noted
    # by the original author.
    extractors = {
        '0': lambda p: p.GetALL(),              # all descriptors, 2049
        '1': lambda p: p.GetAAComp(),           # amino acid composition, 20
        '2': lambda p: p.GetDPComp(),           # dipeptide composition, 400
        '3': lambda p: p.GetTPComp(),           # tripeptide composition, 8000
        '4': lambda p: p.GetMoreauBrotoAuto(),  # Moreau-Broto autocorr., 240
        '5': lambda p: p.GetMoranAuto(),        # Moran autocorrelation, 240
        '6': lambda p: p.GetGearyAuto(),        # Geary autocorrelation, 240
        '7': lambda p: p.GetCTD(),              # composition/transition/
                                                # distribution, 21+21+105
        '8': lambda p: p.GetTriad(),            # conjoint triad features, 343
        '9': lambda p: p.GetSOCN(30),           # sequence order coupling, 60
        '10': lambda p: p.GetQSO(),             # quasi-sequence order, 100
        '11': lambda p: p.GetPAAC(30),          # pseudo amino acid comp., 50
    }

    # Validate before any heavy import or computation so that a bad code
    # fails fast with a clear message.
    groups = list(Proteingroup)
    unknown = [code for code in groups if code not in extractors]
    if unknown:
        raise ValueError(
            "Unknown descriptor group code(s): {!r}; expected one of "
            "'0'..'11'".format(unknown))

    import numpy as np
    from pydpi.pypro import PyPro

    protein = PyPro()
    HP_list, D_list = [], []
    for seq in data:
        protein.ReadProteinSequence(seq)
        keys, values = [], []
        for code in groups:
            res = extractors[code](protein)
            keys.extend(res.keys())
            values.extend(res.values())
        # The header is taken from the first sequence only; later rows are
        # assumed to yield their descriptors in the same order.
        if not D_list:
            HP_list = keys
        D_list.append(values)

    D_Pro = np.zeros((len(D_list), len(HP_list)), dtype=float)
    for k, row in enumerate(D_list):
        D_Pro[k, :] = row

    # Variance threshold std > 0.01 (per the original comment; the actual
    # rule lives in Descriptors_Selection.VarinceThreshold).
    import Descriptors_Selection as DesSe
    ind_var = DesSe.VarinceThreshold(D_Pro)
    D_Pro = D_Pro[:, ind_var]
    HP_list = np.array(HP_list)[ind_var]

    H_Pro = np.reshape(HP_list, (1, len(HP_list)))
    Array_Pro = np.append(H_Pro, D_Pro, axis=0)
    return Array_Pro
def Decriptor_generator(self, ps):
    """Return one merged dict of PyPro descriptors for the sequence ``ps``.

    ``ps`` is loaded into a fresh ``PyPro`` object and ten descriptor groups
    are computed and merged.  Merge order matters only if groups share keys:
    later groups overwrite earlier ones, and the pseudo amino acid
    composition (PAAC) is merged last.
    """
    sequence = PyPro()
    sequence.ReadProteinSequence(ps)

    # PAAC is computed first but merged last, exactly as before.
    paac = sequence.GetPAAC(lamda=5, weight=0.5)
    groups = [
        sequence.GetAPAAC(lamda=5, weight=0.5),
        sequence.GetCTD(),
        sequence.GetDPComp(),
        sequence.GetGearyAuto(),
        sequence.GetMoranAuto(),
        sequence.GetMoreauBrotoAuto(),
        sequence.GetQSO(),
        sequence.GetSOCN(),
        sequence.GetTPComp(),
        paac,
    ]

    merged = {}
    for group in groups:
        merged.update(group)
    return merged
# --- 2-component PCA scatter plot -----------------------------------------
# `df_p` and `X` are defined earlier in the script (not shown here).
# NOTE(review): presumably X is the feature matrix built from df_p and
# column 'e' is a 0/1 class label -- confirm against the code above.
y = df_p['e']
pca = PCA(n_components=2)
X_r = pca.fit(X).transform(X)

plt.figure()
# One scatter series per label value: rows with y == 0 in navy, rows with
# y == 1 in darkorange; the legend entries are the strings '0' and '1'.
for color, i, target_name in zip(['navy', 'darkorange'], [0, 1], ['0', '1']):
    plt.scatter(X_r[y == i, 0], X_r[y == i, 1], color=color, alpha=.8, lw=2,label=target_name)
plt.legend(loc='best', shadow=False, scatterpoints=1)
plt.title('PCA of hmmscan df')

# Collect physicochemical features homology -- pydpi
##
print 'Collecting physicochemical stats per protein...'
# Write one CSV row per FASTA record: id, label, sequence length, followed by
# the Geary autocorrelation and dipeptide composition descriptor values.
with open('./physicochem_annot_training.csv', 'w') as f:
    for record in SeqIO.parse(all_fasta, "fasta"):
        protein = PyPro()
        protein.ReadProteinSequence(str(record.seq))
        desc = protein.GetGearyAuto()
        desc2 = protein.GetDPComp()
        # Merge both descriptor dicts into a copy so `desc` stays untouched.
        z = desc.copy()
        z.update(desc2)
        len_p = str(len(record.seq))
        # The label is the second ':'-separated field of the description;
        # a description without ':' raises IndexError here.
        label = str(record.description).strip().split(':')[1]
        # NOTE(review): `id` shadows the builtin of the same name.
        id = str(record.id)
        # NOTE(review): fields are joined with ',' without any quoting, so an
        # id or label containing a comma would shift the columns.
        row = [id, label, len_p] + [str(i) for i in z.values()]
        f.write(','.join(row) + '\n')

# Read the file back only after the `with` block has closed (and flushed) it.
df_desc = pd.read_table('./physicochem_annot_training.csv', header=None, sep=',')
# Column names come from the descriptor dict of the LAST record processed.
# NOTE(review): `z` is undefined (NameError) if the FASTA file had no
# records; this also assumes every record yields the same keys in the same
# order as z.values() above, and relies on keys() returning a list.
header = z.keys()
df_desc.columns = ['id', 'label', 'seq_length'] + header
print 'Pydpi table: {}'.format(df_desc.columns)
def Decriptor_generator(infile, lamda, weight, maxlag, destype, out_file):
    """Compute protein descriptors for a FASTA-like file and write a TSV.

    Every non-blank line of ``infile`` that does not contain ``">"`` is
    treated as one sequence.  The descriptor group named by ``destype`` is
    computed for each sequence and the rows are written tab-separated,
    without an index column, to ``out_file``.

    Args:
        infile: Path of the input file.
        lamda: Passed as ``int(lamda)`` to ``GetPAAC`` / ``GetAPAAC``; only
            converted when one of those groups is actually computed.
        weight: Passed as ``float(weight)`` to ``GetPAAC`` / ``GetAPAAC`` /
            ``GetQSO``; converted lazily as above.
        maxlag: Passed as ``int(maxlag)`` to ``GetSOCN`` / ``GetQSO``;
            converted lazily as above.
        destype: One of the ``PyPro`` method names listed in the table below,
            ``"All"`` (every group in the table, merged in table order) or
            ``"BinaryDescriptor"`` (delegates to ``BinaryDescriptor``).
        out_file: Path of the tab-separated output file.

    Raises:
        ValueError: If ``destype`` is not a supported name.  Previously this
            surfaced as a ``NameError`` on the first sequence, or as a
            silently empty output file when there were no sequences.
    """
    # Ordered list, so that "All" merges the groups in a fixed sequence.
    extractors = [
        ("GetAAComp", lambda p: p.GetAAComp()),
        ("GetDPComp", lambda p: p.GetDPComp()),
        ("GetTPComp", lambda p: p.GetTPComp()),
        ("GetMoreauBrotoAuto", lambda p: p.GetMoreauBrotoAuto()),
        ("GetMoranAuto", lambda p: p.GetMoranAuto()),
        ("GetGearyAuto", lambda p: p.GetGearyAuto()),
        ("GetCTD", lambda p: p.GetCTD()),
        ("GetPAAC",
         lambda p: p.GetPAAC(lamda=int(lamda), weight=float(weight))),
        ("GetAPAAC",
         lambda p: p.GetAPAAC(lamda=int(lamda), weight=float(weight))),
        ("GetSOCN", lambda p: p.GetSOCN(maxlag=int(maxlag))),
        ("GetQSO",
         lambda p: p.GetQSO(maxlag=int(maxlag), weight=float(weight))),
        ("GetTriad", lambda p: p.GetTriad()),
    ]
    by_name = dict(extractors)

    # Reject an unsupported descriptor type before doing any work.
    if destype not in by_name and destype not in ("All", "BinaryDescriptor"):
        raise ValueError("Unknown destype: {!r}".format(destype))

    # Header lines (anything containing '>') and blank lines carry no
    # sequence.  strip() also removes a trailing '\r' from Windows files.
    with open(infile) as handle:
        list_pep_name = [
            line.strip()
            for line in handle
            if ">" not in line and line.strip()
        ]

    out_df = pd.DataFrame()
    if destype == "BinaryDescriptor":
        # BinaryDescriptor works on the whole list, so call it once instead
        # of once per sequence.  With no sequences the empty frame is
        # written, as before.
        if list_pep_name:
            out_df = BinaryDescriptor(list_pep_name)
    else:
        for seq in list_pep_name:
            protein = PyPro()
            protein.ReadProteinSequence(seq)
            if destype == "All":
                # Compute every group first, then merge in table order.
                parts = [extract(protein) for _, extract in extractors]
                DS = {}
                for D in parts:
                    # NOTE(review): dumps each descriptor dict to stdout;
                    # kept so the tool's output stream is unchanged.
                    print(D)
                    DS.update(D)
            else:
                DS = by_name[destype](protein)
            df = pd.DataFrame(DS, index=[0])
            # Row-by-row concat is kept so the column order of the output is
            # exactly what it was before, whatever the pandas version.
            out_df = pd.concat([out_df, df], axis=0)

    out_df.to_csv(out_file, index=False, sep='\t')