def generate_features(seq):
    """Build the 20-column unscaled feature table for *seq*.

    Expects a list of sequences (wrap a single sequence in a one-element
    list).  Returns a pandas DataFrame holding 10 modlamp global
    descriptors followed by 10 custom-generated features.
    """
    from modlamp.descriptors import GlobalDescriptor

    # Custom features: one row per sequence, produced by the helper
    # defined elsewhere in this module.
    handcrafted = pd.Series(seq).apply(generate_custom_features)

    # modlamp global physicochemical descriptors for the same sequences.
    descriptors = GlobalDescriptor(seq)
    descriptors.calculate_all()
    global_feats = pd.DataFrame(descriptors.descriptor,
                                columns=descriptors.featurenames)

    # Side-by-side concatenation: modlamp columns first, custom second.
    return pd.concat([global_feats, handcrafted], axis=1)
def calculate_peptide_props(fasta_dict):
    """Compute modlamp global descriptors for every record in *fasta_dict*.

    Given a sequence dictionary (as produced by ``get_sequence_dict``),
    returns a list of dictionaries — one per record — mapping each chemical
    property name to its computed value, plus a ``'Peptide_name'`` entry
    holding the record header.  Designed to be written to a csv file using
    ``csv.DictWriter``.
    """
    rows = []
    for name, record in fasta_dict.items():
        descriptors = GlobalDescriptor([str(record.seq)])
        descriptors.calculate_all()
        row = dict(zip(descriptors.featurenames, descriptors.descriptor[0]))
        row['Peptide_name'] = name
        rows.append(row)
    return rows
def propi():
    """Build a combined per-sequence feature table (propy + modlamp).

    Concatenates amino-acid composition, dipeptide composition,
    autocorrelation (Moreau-Broto / Moran / Geary), CTD, type-I pseudo-AAC,
    quasi-sequence-order descriptors and modlamp global physicochemical
    descriptors into a single pandas DataFrame.

    NOTE(review): ``seq`` is read from the enclosing/global scope rather
    than passed as a parameter — presumably a list of peptide strings;
    confirm against the caller.
    """
    # modlamp global physicochemical descriptors (10 columns).
    des_fis = GlobalDescriptor(seq)
    des_fis.calculate_all()
    prop_fis = des_fis.descriptor
    # Amino acid composition
    amino_comp = map(AC.CalculateAAComposition, seq)  # AA
    dipep_comp = map(AC.CalculateDipeptideComposition, seq)  # dipeptides
    # Autocorrelation
    moreau_auto = map(auto.CalculateNormalizedMoreauBrotoAutoTotal, seq)  # Moreau
    moran_auto = map(auto.CalculateMoranAutoTotal, seq)  # Moran
    geary_auto = map(auto.CalculateGearyAutoTotal, seq)  # Geary
    # Composition - Distribution - Transition
    ctd = map(CTD.CalculateCTD, seq)
    # QuasiSequence
    sqa = map(lambda p: qua.GetQuasiSequenceOrder(p, maxlag=5, weight=0.1), seq)
    secq = map(lambda p: qua.GetSequenceOrderCouplingNumber(p, d=1), seq)
    # NOTE(review): feeding ``map`` objects to DataFrame.from_dict is a
    # Python-2 idiom (where map returns a list of dicts); under Python 3
    # these may need wrapping in ``list`` — verify at runtime.
    amino_comp = pd.DataFrame.from_dict(amino_comp)
    amino_comp.reset_index(drop=True, inplace=True)
    dipep_comp = pd.DataFrame.from_dict(dipep_comp)
    dipep_comp.reset_index(drop=True, inplace=True)
    moreau_auto = pd.DataFrame.from_dict(moreau_auto)
    moreau_auto.reset_index(drop=True, inplace=True)
    moran_auto = pd.DataFrame.from_dict(moran_auto)
    moran_auto.reset_index(drop=True, inplace=True)
    geary_auto = pd.DataFrame.from_dict(geary_auto)
    geary_auto.reset_index(drop=True, inplace=True)
    ctd = pd.DataFrame.from_dict(ctd)
    ctd.reset_index(drop=True, inplace=True)
    # PseudoAAC - Type I, weighted over six physicochemical property scales.
    Hydrophobicity = PAAC._Hydrophobicity
    hydrophilicity = PAAC._hydrophilicity
    residuemass = PAAC._residuemass
    pK1 = PAAC._pK1
    pK2 = PAAC._pK2
    pI = PAAC._pI
    clasI_pse = map(
        lambda p: PAAC.GetPseudoAAC(
            p,
            lamda=3,
            weight=0.7,
            AAP=[Hydrophobicity, hydrophilicity, residuemass, pK1, pK2, pI]),
        seq)
    clasI_pse = pd.DataFrame.from_dict(clasI_pse)
    clasI_pse.reset_index(drop=True, inplace=True)
    sqa = pd.DataFrame.from_dict(sqa)
    sqa.reset_index(drop=True, inplace=True)
    secq = pd.DataFrame.from_dict(secq)
    secq.reset_index(drop=True, inplace=True)
    prop_fis = pd.DataFrame(prop_fis)
    # Spanish column labels kept verbatim (runtime strings): length, MW,
    # charge, charge density, isoelectric point, instability index,
    # aromaticity, aliphatic index, Boman index, hydrophobic ratio.
    prop_fis.columns = [
        'Longitud', 'MW', 'Carga', 'DensCarga', 'pIso', 'InestInd', 'Aroma',
        'Alifa', 'Boman', 'HidroRa'
    ]
    # Column-wise concatenation of every descriptor family.
    var = pd.concat([
        amino_comp, dipep_comp, moreau_auto, moran_auto, ctd, clasI_pse, sqa,
        secq, geary_auto, prop_fis
    ], axis=1)
    return var
newFeatures = [ 'MW', 'ChargeDensity', 'pI', 'InstabilityInd', 'Aromaticity', 'AliphaticInd', 'BomanInd', 'HydRatio' ] #writing feature names in excel sheet for i in range(cols + len(aminoAcid) + 1, cols + len(aminoAcid) + len(newFeatures) + 1): writingSheet.cell( row=1, column=i).value = newFeatures[i - (cols + len(aminoAcid) + 1)] for i in range(2, rows + 1): #filling feature value in excel sheet pepSequencee = readingSheet.cell(row=i, column=cols).value desc = GlobalDescriptor(pepSequencee) desc.calculate_all(amide=True) array = desc.descriptor.tolist() countt = 1 for j in range(cols + len(aminoAcid) + 1, cols + len(aminoAcid) + 1 + len(newFeatures)): writingSheet.cell(row=i, column=j).value = float(array[0][countt]) countt += 1 writingBook.save(str(outputFile)) #saving all data to output file ##################################################################TESTING DATA#################################################### trainingData = pd.read_csv(r"test.csv") #reading CSV training data trainingData.to_excel(r"test.xlsx", index=None, header=True) #converting CSV to Excel
def analyze_generated(self, num, fname='analysis.txt', plot=False):
    """ Method to analyze the generated sequences located in `self.generated`.

    Writes duplicate/overlap statistics, length distribution, descriptor-space
    distances (vs. random and amphipathic-helical baselines) and hydrophobic
    moments to *fname*; optionally plots a summary figure.

    :param num: {int} wanted number of sequences to sample
    :param fname: {str} filename to save analysis info to
    :param plot: {bool} whether to plot an overview of descriptors
    :return: file with analysis info (distances)
    """
    with open(fname, 'w') as f:
        print("Analyzing...")
        f.write("ANALYSIS OF SAMPLED SEQUENCES\n==============================\n\n")
        f.write("Nr. of duplicates in generated sequences: %i\n" %
                (len(self.generated) - len(set(self.generated))))
        count = len(set(self.generated) & set(self.sequences))  # get shared entries in both lists
        f.write("%.1f percent of generated sequences are present in the training data.\n" %
                ((count / len(self.generated)) * 100))
        # Filter out sequences containing the invalid residue 'B' before
        # computing length statistics.
        d = GlobalDescriptor(self.generated)
        len1 = len(d.sequences)
        d.filter_aa('B')
        len2 = len(d.sequences)
        d.length()  # d.descriptor now holds per-sequence lengths
        f.write("\n\nLENGTH DISTRIBUTION OF GENERATED DATA:\n\n")
        f.write("Number of sequences too short:\t%i\n" % (num - len1))
        f.write("Number of invalid (with 'B'):\t%i\n" % (len1 - len2))
        f.write("Number of valid unique seqs:\t%i\n" % len2)
        f.write("Mean sequence length: \t\t%.1f ± %.1f\n" %
                (np.mean(d.descriptor), np.std(d.descriptor)))
        f.write("Median sequence length: \t\t%i\n" % np.median(d.descriptor))
        f.write("Minimal sequence length: \t\t%i\n" % np.min(d.descriptor))
        f.write("Maximal sequence length: \t\t%i\n" % np.max(d.descriptor))

        # Autocorrelated 'pepcats' descriptors for training vs. generated
        # sequences.  NOTE(review): s[1:] presumably strips a leading
        # tag/begin token from stored training sequences — confirm format.
        descriptor = 'pepcats'
        seq_desc = PeptideDescriptor([s[1:].rstrip() for s in self.sequences], descriptor)
        seq_desc.calculate_autocorr(7)
        gen_desc = PeptideDescriptor(d.sequences, descriptor)
        gen_desc.calculate_autocorr(7)

        # random comparison set
        self.ran = Random(len(self.generated), np.min(d.descriptor), np.max(d.descriptor))  # generate rand seqs
        probas = count_aas(''.join(seq_desc.sequences)).values()  # get the aa distribution of training seqs
        self.ran.generate_sequences(proba=probas)
        ran_desc = PeptideDescriptor(self.ran.sequences, descriptor)
        ran_desc.calculate_autocorr(7)

        # amphipathic helices comparison set
        self.hel = Helices(len(self.generated), np.min(d.descriptor), np.max(d.descriptor))
        self.hel.generate_sequences()
        hel_desc = PeptideDescriptor(self.hel.sequences, descriptor)
        hel_desc.calculate_autocorr(7)

        # distance calculation: mean pairwise euclidean distance of each
        # comparison set to the training data in descriptor space.
        f.write("\n\nDISTANCE CALCULATION IN '%s' DESCRIPTOR SPACE\n\n" % descriptor.upper())
        desc_dist = distance.cdist(gen_desc.descriptor, seq_desc.descriptor, metric='euclidean')
        f.write("Average euclidean distance of sampled to training data:\t%.3f +/- %.3f\n" %
                (np.mean(desc_dist), np.std(desc_dist)))
        ran_dist = distance.cdist(ran_desc.descriptor, seq_desc.descriptor, metric='euclidean')
        f.write("Average euclidean distance if randomly sampled seqs:\t%.3f +/- %.3f\n" %
                (np.mean(ran_dist), np.std(ran_dist)))
        hel_dist = distance.cdist(hel_desc.descriptor, seq_desc.descriptor, metric='euclidean')
        f.write("Average euclidean distance if amphipathic helical seqs:\t%.3f +/- %.3f\n" %
                (np.mean(hel_dist), np.std(hel_dist)))

        # more simple descriptors: global descriptors scaled to the
        # training distribution before distance computation.
        g_seq = GlobalDescriptor(seq_desc.sequences)
        g_gen = GlobalDescriptor(gen_desc.sequences)
        g_ran = GlobalDescriptor(ran_desc.sequences)
        g_hel = GlobalDescriptor(hel_desc.sequences)
        g_seq.calculate_all()
        g_gen.calculate_all()
        g_ran.calculate_all()
        g_hel.calculate_all()
        sclr = StandardScaler()
        sclr.fit(g_seq.descriptor)  # scaler is fit on training data only
        f.write("\n\nDISTANCE CALCULATION FOR SCALED GLOBAL DESCRIPTORS\n\n")
        desc_dist = distance.cdist(sclr.transform(g_gen.descriptor),
                                   sclr.transform(g_seq.descriptor), metric='euclidean')
        f.write("Average euclidean distance of sampled to training data:\t%.2f +/- %.2f\n" %
                (np.mean(desc_dist), np.std(desc_dist)))
        ran_dist = distance.cdist(sclr.transform(g_ran.descriptor),
                                  sclr.transform(g_seq.descriptor), metric='euclidean')
        f.write("Average euclidean distance if randomly sampled seqs:\t%.2f +/- %.2f\n" %
                (np.mean(ran_dist), np.std(ran_dist)))
        hel_dist = distance.cdist(sclr.transform(g_hel.descriptor),
                                  sclr.transform(g_seq.descriptor), metric='euclidean')
        f.write("Average euclidean distance if amphipathic helical seqs:\t%.2f +/- %.2f\n" %
                (np.mean(hel_dist), np.std(hel_dist)))

        # hydrophobic moments (Eisenberg scale) for all four sets.
        uh_seq = PeptideDescriptor(seq_desc.sequences, 'eisenberg')
        uh_seq.calculate_moment()
        uh_gen = PeptideDescriptor(gen_desc.sequences, 'eisenberg')
        uh_gen.calculate_moment()
        uh_ran = PeptideDescriptor(ran_desc.sequences, 'eisenberg')
        uh_ran.calculate_moment()
        uh_hel = PeptideDescriptor(hel_desc.sequences, 'eisenberg')
        uh_hel.calculate_moment()
        f.write("\n\nHYDROPHOBIC MOMENTS\n\n")
        f.write("Hydrophobic moment of training seqs:\t%.3f +/- %.3f\n" %
                (np.mean(uh_seq.descriptor), np.std(uh_seq.descriptor)))
        f.write("Hydrophobic moment of sampled seqs:\t\t%.3f +/- %.3f\n" %
                (np.mean(uh_gen.descriptor), np.std(uh_gen.descriptor)))
        f.write("Hydrophobic moment of random seqs:\t\t%.3f +/- %.3f\n" %
                (np.mean(uh_ran.descriptor), np.std(uh_ran.descriptor)))
        f.write("Hydrophobic moment of amphipathic seqs:\t%.3f +/- %.3f\n" %
                (np.mean(uh_hel.descriptor), np.std(uh_hel.descriptor)))

    if plot:
        if self.refs:
            a = GlobalAnalysis([uh_seq.sequences, uh_gen.sequences, uh_hel.sequences, uh_ran.sequences],
                               ['training', 'sampled', 'hel', 'ran'])
        else:
            a = GlobalAnalysis([uh_seq.sequences, uh_gen.sequences], ['training', 'sampled'])
        # summary figure saved next to the text report (same stem, .png).
        a.plot_summary(filename=fname[:-4] + '.png')
def describe_sequences():
    """Featurize the peptides in ``BOTH_peptides.json`` and save the matrix
    to ``peptides_array.npy``.

    For every peptide one feature row is built by concatenating: dataset id,
    train/test flag, modlamp peptide/global descriptors, length, C-terminal
    amidation flag, secondary-structure fractions, AAindex values, CTD
    composition, conjoint triads, tetra-peptide indicator bits, residue
    frequency/boolean/absolute counts (mono/di/tri), di-/tri-gram features,
    PAAC and the activity label.
    """
    path = r"C:\Users\Patrick\OneDrive - University College Dublin\Bioinformatics\HemolyticStudies\BOTH_peptides.json"
    aa_letters = [
        'A', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'K', 'L', 'M', 'N', 'P', 'Q',
        'R', 'S', 'T', 'V', 'W', 'Y'
    ]
    di_letters = ["%s%s" % (a, b) for a in aa_letters for b in aa_letters]
    tri_letters = [
        "%s%s%s" % (a, b, c) for a in aa_letters for b in aa_letters
        for c in aa_letters
    ]
    conjoint_letters = ["A", "I", "Y", "H", "R", "D", "C"]
    # Alphabet lookup keyed by sequence type: 1=mono, 2=di, 3=tri, 4=conjoint.
    letters = {
        1: aa_letters,
        2: di_letters,
        3: tri_letters,
        4: conjoint_letters
    }
    #Conjoint src = https://bmcbioinformatics.biomedcentral.com/articles/10.1186/s12859-015-0828-1
    conjoint_dict = {
        "A": "A",
        "G": "A",
        "V": "A",
        "I": "I",
        "L": "I",
        "F": "I",
        "P": "I",
        "Y": "Y",
        "M": "Y",
        "T": "Y",
        "S": "Y",
        "H": "H",
        "N": "H",
        "Q": "H",
        "W": "H",
        "R": "R",
        "K": "R",
        "D": "D",
        "E": "D",
        "C": "C",
    }

    def counter(string, seq_type):
        '''
        A function for counting the number of letters present.
        Returns a dict of letter -> relative frequency.
        '''
        l = len(string)
        d = {i: 0 for i in letters[seq_type]}
        if seq_type == 1:
            for s in string:
                try:
                    d[s] += 1.0
                except KeyError:
                    d[s] = 1.0
            d = {k: d[k] / l for k in d}
        if seq_type == 2:
            for a in range(l - 1):
                s = string[a:a + seq_type]
                try:
                    d[s] += 1.0
                except KeyError:
                    d[s] = 1.0
            d = {k: d[k] / (l - 1) for k in d}
        if seq_type == 3:
            for a in range(l - 2):
                s = string[a:a + seq_type]
                try:
                    d[s] += 1.0
                except KeyError:
                    d[s] = 1.0
            d = {k: d[k] / (l - 2) for k in d}
        return d

    def counter_boolean(string, seq_type):
        '''
        A function recording letter presence (1.0) rather than counts.
        NOTE(review): assignment never raises KeyError, so the try/except
        here is dead code — both branches are identical.
        '''
        l = len(string)
        d = {i: 0 for i in letters[seq_type]}
        if seq_type == 1:
            for s in string:
                try:
                    d[s] = 1.0
                except KeyError:
                    d[s] = 1.0
        if seq_type == 2:
            for a in range(l - 1):
                s = string[a:a + seq_type]
                try:
                    d[s] = 1.0
                except KeyError:
                    d[s] = 1.0
        return d

    def counter_abs(string, seq_type):
        '''
        A function for counting the number of letters present.
        Returns absolute (unnormalized) counts.
        '''
        l = len(string)
        d = {i: 0 for i in letters[seq_type]}
        if seq_type == 1:
            for s in string:
                try:
                    d[s] = d[s] + 1.0
                except KeyError:
                    d[s] = 1.0
        if seq_type == 2:
            for a in range(l - 1):
                s = string[a:a + seq_type]
                try:
                    d[s] = d[s] + 1.0
                except KeyError:
                    d[s] = 1.0
        return d

    def residue_distribution(all_residues, seq_type, dp):
        '''
        Takes as arguments a string with letters, and the type of sequence
        represented.  Returns a 1xN numpy row of relative frequencies,
        alphabetically ordered.  Di-/tri-peptide columns are kept only when
        they appear in enough peptides (dp>=50 / tp>=20 document frequency;
        ``tp`` is read from the enclosing scope).
        '''
        d = counter(all_residues, seq_type)
        if seq_type == 1:
            residue_counts = list(
                sorted([(i, d[i]) for i in letters[seq_type]
                        ]))  ##Removes ambiguous letters
        elif seq_type == 2:
            residue_counts = list(
                sorted([(i, d[i]) for i in letters[seq_type] if dp[i] >= 50]))
        elif seq_type == 3:
            residue_counts = list(
                sorted([(i, d[i]) for i in letters[seq_type] if tp[i] >= 20]))
        elif seq_type == 4:
            residue_counts = list(
                sorted([(i, d[i]) for i in letters[seq_type]]))
        r_c = [i[1] for i in residue_counts]
        dis = np.array([
            r_c,
        ])
        return dis

    def residue_boolean(all_residues, seq_type, dp):
        '''
        Like residue_distribution, but returns presence flags (0/1)
        instead of relative frequencies.
        '''
        d = counter_boolean(all_residues, seq_type)
        if seq_type == 1:
            residue_counts = list(
                sorted([(i, d[i]) for i in letters[seq_type]
                        ]))  ##Removes ambiguous letters
        elif seq_type == 2:
            residue_counts = list(
                sorted([(i, d[i]) for i in letters[seq_type] if dp[i] >= 50]))
        r_c = [i[1] for i in residue_counts]
        dis = np.array([
            r_c,
        ])
        return dis

    def residue_abs(all_residues, seq_type, dp):
        '''
        Like residue_distribution, but returns absolute counts instead of
        relative frequencies.
        '''
        d = counter_abs(all_residues, seq_type)
        if seq_type == 1:
            residue_counts = list(
                sorted([(i, d[i]) for i in letters[seq_type]
                        ]))  ##Removes ambiguous letters
        elif seq_type == 2:
            residue_counts = list(
                sorted([(i, d[i]) for i in letters[seq_type] if dp[i] >= 50]))
        r_c = [i[1] for i in residue_counts]
        dis = np.array([
            r_c,
        ])
        return dis

    with open(path, "r") as f:
        text = f.read()
    # NOTE(review): eval() on file contents executes arbitrary code — only
    # safe because this file is local and trusted; json.loads would be the
    # safe equivalent if the file is valid JSON.
    peptides = eval(text)["Peptides"]
    train_peptides, test_peptides = train_test_split(peptides,
                                                     test_size=0.15,
                                                     random_state=42)
    train_peptides_seqs = [peptide["seq"] for peptide in train_peptides]
    # Tag every peptide with its train/test membership.
    for peptide in peptides:
        if peptide["seq"] in train_peptides_seqs:
            peptide["train"] = True
        else:
            peptide["train"] = False
    print(len([p for p in peptides if p["train"] == True]))
    print(len([p for p in peptides if p["train"] == False]))
    # Data-augmentation experiment: reversed copies of training sequences.
    # The extend() below is disabled, so new_peptides is currently unused.
    new_peptides = []
    for peptide in peptides:
        if peptide["train"] == True:
            new_peptide = peptide.copy()
            new_seq = ''.join(reversed(peptide["seq"]))
            new_peptide["seq"] = new_seq
            new_peptides.append(new_peptide)
    #peptides.extend(new_peptides)
    random.shuffle(peptides)
    print(len([p for p in peptides if p["train"] == True]))
    print(len([p for p in peptides if p["train"] == False]))
    print("doubling complete")
    # Document frequencies: in how many peptides each di-/tri-peptide occurs.
    dp = {i: 0 for i in letters[2]}
    tp = {i: 0 for i in letters[3]}
    name_i = 0
    for peptide in peptides:
        temp_set = set()
        seq = peptide["seq"]
        l = len(seq)
        for a in range(l - 1):
            s = seq[a:a + 2]
            temp_set.add(s)
        for s in temp_set:
            dp[s] = dp[s] + 1
    for peptide in peptides:
        temp_set = set()
        seq = peptide["seq"]
        l = len(seq)
        for a in range(l - 2):
            s = seq[a:a + 3]
            temp_set.add(s)
        for s in temp_set:
            tp[s] = tp[s] + 1
    # Map every sequence into the reduced 7-letter conjoint alphabet.
    for peptide in peptides:
        peptide["conjoint_seq"] = "".join(
            [conjoint_dict[letter] for letter in peptide["seq"]])
    for peptide in peptides:
        globdesc = GlobalDescriptor(peptide["seq"])
        globdesc.calculate_all(amide=peptide["cTer"] == "Amidation")
        # CTD composition, keys sorted for a stable column order.
        ctdc = CTD.CalculateC(peptide["seq"])
        ctdc_keys = list(sorted(list([key for key in ctdc])))
        ctdc_vals = np.array([[ctdc[key] for key in ctdc_keys]])
        conjointtriad = ConjointTriad.CalculateConjointTriad(peptide["seq"])
        conjointtriad_keys = list(sorted(list([key for key in conjointtriad])))
        conjointtriad_vals = np.array(
            [[conjointtriad[key] for key in conjointtriad_keys]])
        # NOTE(review): conjoint_dis is computed but never added to the
        # final feature row below.
        conjoint_dis = residue_distribution(peptide["conjoint_seq"], 4, None)
        #peptide["GlobalDescriptor"] = globdesc
        #print(peptide["GlobalDescriptor"].descriptor)
        #Eisenberg hydrophobicity consensus
        #Take most of the values from here
        # Long append-chain of modlamp scales; the order of these calls
        # fixes the column order of pepdesc.descriptor — do not reorder.
        pepdesc = PeptideDescriptor(peptide["seq"], "eisenberg")
        pepdesc.calculate_global(modality="mean", append=True)
        pepdesc.calculate_global(modality="max", append=True)
        pepdesc.calculate_moment(modality="max", append=True)
        pepdesc.calculate_moment(modality="mean", append=True)
        #pepdesc.calculate_profile(append=True, prof_type = "uH")
        pepdesc.load_scale("Ez")
        pepdesc.calculate_global(modality="mean", append=True)
        pepdesc.calculate_global(modality="max", append=True)
        pepdesc.load_scale("aasi")
        pepdesc.calculate_global(append=True)
        pepdesc.calculate_moment(modality="max", append=True)
        pepdesc.calculate_moment(modality="mean", append=True)
        pepdesc.load_scale("abhprk")
        pepdesc.calculate_global(modality="mean", append=True)
        pepdesc.calculate_global(modality="max", append=True)
        pepdesc.load_scale("charge_acid")
        pepdesc.calculate_global(modality="mean", append=True)
        pepdesc.calculate_global(modality="max", append=True)
        pepdesc.calculate_moment(modality="max", append=True)
        pepdesc.calculate_moment(modality="mean", append=True)
        pepdesc.load_scale("cougar")
        pepdesc.calculate_global(modality="mean", append=True)
        pepdesc.calculate_global(modality="max", append=True)
        pepdesc.load_scale("gravy")
        pepdesc.calculate_global(modality="mean", append=True)
        pepdesc.calculate_global(modality="max", append=True)
        pepdesc.calculate_moment(modality="max", append=True)
        pepdesc.calculate_moment(modality="mean", append=True)
        pepdesc.load_scale("hopp-woods")
        pepdesc.calculate_global(modality="mean", append=True)
        pepdesc.calculate_global(modality="max", append=True)
        pepdesc.calculate_moment(modality="max", append=True)
        pepdesc.calculate_moment(modality="mean", append=True)
        pepdesc.load_scale("kytedoolittle")
        pepdesc.calculate_global(modality="mean", append=True)
        pepdesc.calculate_global(modality="max", append=True)
        pepdesc.calculate_moment(modality="max", append=True)
        pepdesc.calculate_moment(modality="mean", append=True)
        pepdesc.load_scale("ppcali")
        pepdesc.calculate_global(modality="mean", append=True)
        pepdesc.calculate_global(modality="max", append=True)
        pepdesc.load_scale("msw")
        pepdesc.calculate_global(modality="mean", append=True)
        pepdesc.calculate_global(modality="max", append=True)
        pepdesc.load_scale("charge_phys")
        pepdesc.calculate_moment(modality="max", append=True)
        pepdesc.calculate_moment(modality="mean", append=True)
        pepdesc.calculate_global(modality="mean", append=True)
        pepdesc.calculate_global(modality="max", append=True)
        pepdesc.load_scale("flexibility")
        pepdesc.calculate_moment(modality="max", append=True)
        pepdesc.calculate_moment(modality="mean", append=True)
        pepdesc.calculate_global(modality="mean", append=True)
        pepdesc.calculate_global(modality="max", append=True)
        pepdesc.load_scale("bulkiness")
        pepdesc.calculate_moment(modality="max", append=True)
        pepdesc.calculate_moment(modality="mean", append=True)
        pepdesc.calculate_global(modality="mean", append=True)
        pepdesc.calculate_global(modality="max", append=True)
        pepdesc.load_scale("TM_tend")
        pepdesc.calculate_moment(modality="max", append=True)
        pepdesc.calculate_moment(modality="mean", append=True)
        pepdesc.calculate_global(modality="mean", append=True)
        pepdesc.calculate_global(modality="max", append=True)
        pepdesc.load_scale("mss")
        pepdesc.calculate_moment(modality="max", append=True)
        pepdesc.calculate_moment(modality="mean", append=True)
        pepdesc.calculate_global(modality="mean", append=True)
        pepdesc.calculate_global(modality="max", append=True)
        pepdesc.load_scale("t_scale")
        pepdesc.calculate_global(modality="mean", append=True)
        pepdesc.calculate_global(modality="max", append=True)
        pepdesc.load_scale("peparc")
        pepdesc.calculate_arc(modality="max", append=True)
        pepdesc.calculate_arc(modality="mean", append=True)
        # NOTE(review): "msw" is loaded and computed a second time here,
        # duplicating the two columns added above — confirm intent.
        pepdesc.load_scale("msw")
        pepdesc.calculate_global(modality="mean", append=True)
        pepdesc.calculate_global(modality="max", append=True)
        pepdesc.load_scale("polarity")
        pepdesc.calculate_moment(modality="max", append=True)
        pepdesc.calculate_moment(modality="mean", append=True)
        pepdesc.calculate_global(modality="mean", append=True)
        pepdesc.calculate_global(modality="max", append=True)
        pepdesc.load_scale("pepcats")
        pepdesc.calculate_global(modality="mean", append=True)
        pepdesc.calculate_global(modality="max", append=True)
        pepdesc.load_scale("isaeci")
        pepdesc.calculate_global(modality="mean", append=True)
        pepdesc.calculate_global(modality="max", append=True)
        pepdesc.load_scale("refractivity")
        pepdesc.calculate_moment(modality="max", append=True)
        pepdesc.calculate_moment(modality="mean", append=True)
        pepdesc.calculate_global(modality="mean", append=True)
        pepdesc.calculate_global(modality="max", append=True)
        pepdesc.load_scale("z3")
        pepdesc.calculate_global(modality="mean", append=True)
        pepdesc.calculate_global(modality="max", append=True)
        pepdesc.load_scale("z5")
        pepdesc.calculate_global(modality="mean", append=True)
        pepdesc.calculate_global(modality="max", append=True)
        #pepdesc.load_scale("PPCALI")
        #pepdesc.calculate_autocorr(2)
        #peptide["PeptideDescriptor"] = pepdesc
        # PAAC columns sorted numerically by their "PAACn" suffix.
        protein = PyPro()
        protein.ReadProteinSequence(peptide["seq"])
        paac = protein.GetPAAC(lamda=1, weight=0.05)
        paac2 = [[
            paac[a] for a in list(
                sorted([k for k in paac],
                       key=lambda x: int(x.replace("PAAC", ""))))
        ]]
        cTer = np.array([[1 if peptide["cTer"] == "Amidation" else 0]])
        paac = np.array(paac2)
        analysed_seq = ProteinAnalysis(peptide["seq"])
        secondary_structure_fraction = np.array(
            [analysed_seq.secondary_structure_fraction()])
        peptide["TotalDescriptor"] = str(
            np.concatenate((pepdesc.descriptor, globdesc.descriptor), axis=1))
        # Numeric id parsed from the dataset-prefixed id string.
        # NOTE(review): the KeyError fallback assigns scalar 0, not a 1x1
        # array like the success path — np.concatenate below still accepts
        # it only via broadcasting rules; verify.
        try:
            pepid = np.array([[
                int(peptide["id"].replace("HEMOLYTIK", "").replace(
                    "DRAMP", "").replace("DBAASP", ""))
            ]])
        except KeyError:
            pepid = 0
        pep_train = np.array([[1 if peptide["train"] == True else 0]])
        freq_1d = residue_distribution(peptide["seq"], 1, dp)
        freq_2d = residue_distribution(peptide["seq"], 2, dp)
        freq_3d = residue_distribution(peptide["seq"], 3, dp)
        freq_1dbool = residue_boolean(peptide["seq"], 1, dp)
        freq_2dbool = residue_boolean(peptide["seq"], 2, dp)
        freq_1dabs = residue_abs(peptide["seq"], 1, dp)
        freq_2dabs = residue_abs(peptide["seq"], 2, dp)
        len_peptide = np.array([[len(peptide["seq"])]])
        if peptide["activity"] == "YES":
            pepact = 1
        else:
            pepact = 0
        pepact = np.array([[pepact]])
        peptide_di2 = di2(peptide["seq"])
        peptide_di3 = di3(peptide["conjoint_seq"])
        ####################### AAindex #########################
        # (AAindex id, aggregation mode) pairs fed to aaf() below.
        to_get = [
            ("CHAM810101", "mean"),  #Steric Hinderance
            ("CHAM810101", "total"),  #Steric Hinderance
            ("KYTJ820101", "mean"),  #Hydropathy
            ("KLEP840101", "total"),  #Charge
            ("KLEP840101", "mean"),  #Charge
            ("MITS020101", "mean"),  #Amphiphilicity
            ("FAUJ830101", "mean"),  #Hydrophobic parameter pi
            ("GOLD730102", "total"),  #Residue volume
            ("MEEJ800101", "mean"),  #Retention coefficient in HPLC
            ("OOBM850105", "mean"),  #Optimized side chain interaction parameter
            ("OOBM850105", "total"),  #Optimized side chain interaction parameter
            ("VELV850101", "total"),  #Electron-ion interaction parameter
            ("VELV850101", "mean"),  #Electron-ion interaction parameter
            ("PUNT030102", "mean"),  #Knowledge-based membrane-propensity scale from 3D_Helix
            ("BHAR880101", "mean"),  #Average flexibility indeces
            ("KRIW790102", "mean"),  #Fraction of site occupied by water
            ("PLIV810101", "mean"),  #Partition coefficient
            ("ZIMJ680102", "mean"),  #Bulkiness
            ("ZIMJ680102", "total"),  #Bulkiness
            ("ZHOH040101", "mean"),  #Stability scale
            ("CHAM820102", "total"),  #Free energy solubility in water
            #From HemoPi: src = https://github.com/riteshcanfly/Hemopi/blob/master/pcCalculator.java
            ("HOPT810101", "mean"),  #Hydrophilicity
            ("EISD840101", "mean"),  #Hydrophobicity
            ("FAUJ880109", "total"),  #Net Hydrogen
            ("EISD860101", "mean"),  #Solvation
        ]
        tetra_peptides = [
            "KLLL",  # src = https://github.com/riteshcanfly/Hemopi/blob/master/tetrapos.txt
            "GCSC",
            "AAAK",
            "KLLS",
            "LGKL",
            "VLKA",
            "LLGK",
            "LVGA",
            "LSDF",
            "SDFK",
            "SWLR",
            "WLRD",
        ]
        # Binary indicators: does the sequence contain each tetra-peptide?
        tp_bin = []
        for t_p in tetra_peptides:
            if t_p in peptide["seq"]:
                tp_bin.append(1)
            else:
                tp_bin.append(0)
        tp_bin = np.array([tp_bin])
        # NOTE(review): this loop discards every result (x is overwritten
        # each iteration and never used) — dead code kept for fidelity.
        for identifier, mode in to_get:
            x = aaf(peptide["seq"], identifier, mode)
        aminoacidindeces = np.array([[
            aaf(peptide["seq"], identifier, mode)
            for identifier, mode in to_get
        ]])
        # Final 1xN feature row; column groups in this exact order.
        peptide["array"] = np.concatenate(
            (
                pepid,
                pep_train,
                pepdesc.descriptor,
                globdesc.descriptor,
                len_peptide,
                cTer,
                secondary_structure_fraction,
                aminoacidindeces,
                ctdc_vals,
                conjointtriad_vals,
                tp_bin,
                freq_1d,
                freq_2d,
                freq_3d,
                freq_1dbool,
                freq_2dbool,
                freq_1dabs,
                freq_2dabs,
                peptide_di2,
                peptide_di3,  #Conjoint Alphabet
                paac,
                pepact,
            ),
            axis=1)
        #print(peptide["TotalDescriptor"])
    # Stack all per-peptide rows into one matrix and persist it.
    x = np.concatenate([peptide["array"] for peptide in peptides], axis=0)
    np.save("peptides_array", x, allow_pickle=False)
def _add_features_to_peptide_series(self, peptide, index, n_cluster=-1, lpvs=None):
    """Assemble one feature row for *peptide* as a pandas Series over *index*.

    Fills MS intensity, boolean, frequency and count features, chemical
    descriptors (modlamp) and annotation fields.

    :param peptide: object with .start/.stop (position-like, with .index and
        .pos), .slice and .seq — project type, see SequenceRange usage.
    :param index: MultiIndex for the returned Series (feature type, name).
    :param n_cluster: cluster id used for cluster_coverage lookup; -1 = none.
    :param lpvs: optional set of LPV peptides; defaults to empty set.
    :return: pd.Series of features, NaN where not filled.
    """
    # primary intensity weights d = delta, pd = penalty delta
    # TODO only d_start and d_stop depends on impval, pd_start and pd_stop does not because
    # they are always between a d_start and d_stop, and should thus be above imp_val!
    # therefore we can write out d_start as and d_stop as:
    # [before_start, after_start], [befrore_stop, after_stop]
    # thus if we have
    # raw data = [0, 0, 5, 5, 7, 7, 5, 5, 0, 0]
    # then for the peptide        3--------------8
    #     before_start, after_start = [ 0, 5 ]
    # but for the peptide               5--6
    #     before_start, after_start = [ 5, 7 ]
    # by making a none linear model we could formulate the w_start parameter as follows:
    #     w_start * (after_start - max(before_start, imp_val))
    # which is consistent with how we currently do the grid search (imp_val=4):
    #     d_start = 5 - max(0, 4) = 1
    #     d_start = 7 - max(5, 4) = 2
    if lpvs is None:
        lpvs = set()
    i_start = peptide.start.index
    i_stop = peptide.stop.index

    # MS Delta
    # Series starts all-NaN (0 * nan == nan); only known features get set.
    series = pd.Series(np.zeros(len(index)) * np.nan, index=index)
    ms_int = self.ms_intensity_features.type
    series[ms_int, 'start'] = self.start_scores[i_start]
    series[ms_int, 'stop'] = self.stop_scores[i_stop]
    # Interior penalties only exist for peptides longer than 4 residues.
    if 4 < len(peptide):
        penalty = SequenceRange(peptide.start + 1, peptide.stop - 1, validate=False)
        series[ms_int, 'penalty_start'] = self.start_scores[penalty.slice].sum()
        series[ms_int, 'penalty_stop'] = self.stop_scores[penalty.slice].sum()
    else:
        series[ms_int, 'penalty_start'] = series[ms_int, 'penalty_stop'] = 0

    # MS Bool
    b_obs, f_obs = self._calc_observed(peptide)
    series[self.ms_bool_features.type, "first"] = self.h_first[i_start]
    series[self.ms_bool_features.type, "last"] = self.h_last[i_stop]
    series[self.ms_bool_features.type, "observed"] = b_obs

    # MS Frequency
    # ptm weights
    # TODO: should it get extra penalties if there are PTM's between start and end?
    ms_freq = self.ms_frequency_features.type
    series[ms_freq, 'acetylation'] = self.ac_freq[i_start]
    series[ms_freq, 'amidation'] = self.am_freq[i_stop]
    series[ms_freq, 'start'] = self.h_start_freq[i_start]
    series[ms_freq, 'stop'] = self.h_stop_freq[i_stop]
    series[ms_freq, 'observed'] = f_obs
    series[ms_freq, 'sample'] = self.h_sample[peptide.slice].min()
    series[ms_freq, 'ladder'] = \
        self.h_ladder_start[i_start] * self.h_ladder_stop[i_stop]
    series[ms_freq, 'protein_coverage'] = self.protein_coverage
    series[ms_freq, 'cluster_coverage'] = self.cluster_coverage[n_cluster]
    # thise are good features, but there may be better ways to extract them
    series[ms_freq, 'bond'] = self.h_bond[self.get_bond_slice(peptide)].min()

    # MS Counts
    ms_count = self.ms_count_features.type
    series[ms_count, 'start'] = self.start_counts[peptide.start]
    series[ms_count, 'stop'] = self.stop_counts[peptide.stop]
    #  series[ms_count, 'ladder'] = \
    #      self.h_ladder_start[i_start] + self.h_ladder_stop[i_stop]

    ############################################################
    # Chemical
    sequence = self.protein_sequence[peptide.slice]
    peptide_features = GlobalDescriptor(sequence)
    # Treat the peptide as C-terminally amidated when the observed
    # amidation frequency exceeds 5%.
    is_amidated = series[ms_freq, 'amidation'] > 0.05
    peptide_features.calculate_all(amide=is_amidated)
    chem = self.chemical_features.type
    # Copy only the descriptor columns declared in chemical_features.
    for i, name in enumerate(peptide_features.featurenames):
        if name in self.chemical_features.features:
            series[chem, name] = peptide_features.descriptor[0, i]
    eisenberg = PeptideDescriptor(sequence, 'eisenberg')
    eisenberg.calculate_moment()
    series[chem, 'eisenberg'] = eisenberg.descriptor.flatten()[0]

    # Annotations
    series[self.annotations.type, "Known"] = peptide in self.known_peptides
    #  series[self.annotations.type, "Type"] = peptide in self.known_peptides
    series[self.annotations.type, "Cluster"] = n_cluster
    series[self.annotations.type, "Sequence"] = peptide.seq
    series[self.annotations.type, "LPV"] = False  # TODO!
    series[self.annotations.type, "N Flanking"] = \
        self.get_nflanking_region(peptide.start, self.protein_sequence)
    series[self.annotations.type, "C Flanking"] = \
        self.get_cflanking_region(peptide.stop, self.protein_sequence)
    # Overwrites the placeholder above with actual LPV membership.
    series[self.annotations.type, "LPV"] = peptide in lpvs
    # Total observed intensity across samples, only when observed at all.
    if f_obs != 0:
        _pep_index = (slice(None), slice(None), peptide.start.pos, peptide.stop.pos)
        series[self.annotations.type, "Intensity"] = self.df.loc[_pep_index, :].sum().sum()
    return series
def describe_sequences():
    """Featurize five hard-coded test peptides and save the matrix to
    ``hemolytik_array_custom_tests.npy``.

    Smaller test-harness variant of the full dataset featurizer: each row is
    id, modlamp peptide/global descriptors, length, mono-residue frequencies
    and the activity label.
    """
    aa_letters = ['A', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'K', 'L',
                  'M', 'N', 'P', 'Q', 'R', 'S', 'T', 'V', 'W', 'Y']
    di_letters = ["%s%s" % (a, b) for a in aa_letters for b in aa_letters]
    # Alphabet lookup keyed by sequence type: 1=mono, 2=di.
    letters = {1: aa_letters, 2: di_letters}

    def counter(string, seq_type):
        '''
        A function for counting the number of letters present.
        Returns a dict of letter -> relative frequency.
        '''
        l = len(string)
        d = {i: 0 for i in letters[seq_type]}
        if seq_type == 1:
            for s in string:
                try:
                    d[s] += 1.0
                except KeyError:
                    d[s] = 1.0
            d = {k: d[k] / l for k in d}
        if seq_type == 2:
            for a in range(l - 1):
                s = string[a:a + seq_type]
                try:
                    d[s] += 1.0
                except KeyError:
                    d[s] = 1.0
            d = {k: d[k] / (l - 1) for k in d}
        return d

    def residue_distribution(all_residues, seq_type):
        '''
        Takes as arguments a string with letters, and the type of sequence
        represented.  Returns a 1xN numpy row of relative frequencies,
        alphabetically ordered.
        '''
        d = counter(all_residues, seq_type)
        residue_counts = list(sorted([(i, d[i]) for i in letters[seq_type]
                                      ]))  ##Removes ambiguous letters
        r_c = [i[1] for i in residue_counts]
        dis = np.array([r_c, ])
        return dis

    # Hard-coded test fixtures: two hemolytic, three non-hemolytic peptides.
    peptides = [{"seq": "FLPILASLAAKFGPKLFCLVTKKC", "cTer": None, "activity": "YES"},
                {"seq": "ILGPVISTIGGVLGGLLKNL", "cTer": "Amidation", "activity": "YES"},
                {"seq": "GIGGKILSGLKTALKGAAKELASTYLH", "cTer": None, "activity": "NO"},
                {"seq": "GIGSAILSAGKSALKGLAKGLAEHFAN", "cTer": None, "activity": "NO"},
                {"seq": "FLSLIPHAINAVSAIAKHF", "cTer": "Amidation", "activity": "NO"},
                ]
    for peptide in peptides:
        #print(peptide["id"])
        #print(peptide["seq"])
        globdesc = GlobalDescriptor(peptide["seq"])
        globdesc.calculate_all(amide=peptide["cTer"] == "Amidation")
        #peptide["GlobalDescriptor"] = globdesc
        #print(peptide["GlobalDescriptor"].descriptor)
        #Eisenberg hydrophobicity consensus
        #Take most of the values from here
        # Append-chain of modlamp scales; call order fixes the column order
        # of pepdesc.descriptor — do not reorder.
        pepdesc = PeptideDescriptor(peptide["seq"], "eisenberg")
        pepdesc.calculate_global()
        pepdesc.calculate_moment(append=True)
        #pepdesc.calculate_profile(append=True, prof_type = "uH")
        pepdesc.load_scale("Ez")
        pepdesc.calculate_global(append=True)
        pepdesc.load_scale("charge_phys")
        pepdesc.calculate_moment(append=True)
        pepdesc.calculate_global(append=True)
        pepdesc.load_scale("flexibility")
        pepdesc.calculate_moment(append=True)
        pepdesc.calculate_global(append=True)
        pepdesc.load_scale("polarity")
        pepdesc.calculate_moment(append=True)
        pepdesc.calculate_global(append=True)
        pepdesc.load_scale("isaeci")
        pepdesc.calculate_global(append=True)
        pepdesc.load_scale("refractivity")
        pepdesc.calculate_moment(append=True)
        pepdesc.calculate_global(append=True)
        pepdesc.load_scale("z5")
        pepdesc.calculate_global(append=True)
        #peptide["PeptideDescriptor"] = pepdesc
        peptide["TotalDescriptor"] = str(
            np.concatenate((pepdesc.descriptor, globdesc.descriptor), axis=1))
        # Numeric id parsed from "HEMOLYTIK"-prefixed ids; fixtures above
        # have no "id" key, so the fallback 0 is used.
        try:
            pepid = np.array([[int(peptide["id"].replace("HEMOLYTIK", ""))]])
        except KeyError:
            pepid = np.array([[0]])
        freq_1d = residue_distribution(peptide["seq"], 1)
        freq_2d = residue_distribution(peptide["seq"], 2)
        len_peptide = np.array([[len(peptide["seq"])]])
        if peptide["activity"] == "YES":
            pepact = 1
        else:
            pepact = 0
        pepact = np.array([[pepact]])
        peptide_di2 = di2(peptide["seq"])
        # Final 1xN feature row (di-peptide features currently disabled).
        peptide["array"] = np.concatenate((pepid,
                                           pepdesc.descriptor,
                                           globdesc.descriptor,
                                           len_peptide,
                                           freq_1d,
                                           #freq_2d,
                                           #peptide_di2,
                                           pepact,), axis=1)
        #print(peptide["TotalDescriptor"])
    # Stack all per-peptide rows into one matrix and persist it.
    x = np.concatenate([peptide["array"] for peptide in peptides], axis=0)
    print(x)
    np.save("hemolytik_array_custom_tests", x, allow_pickle=False)