def calculate_physiochemical_features(temp_dict, sequence):
    """Add ProtParam descriptors of ``sequence`` to ``temp_dict`` in place.

    :param temp_dict: dictionary that receives the feature entries
    :param sequence: protein sequence handed to ``ProteinAnalysis``
    :return: None; ``temp_dict`` is first updated with the ten named
        descriptors and then with the per-residue entries returned by
        ``get_amino_acids_percent()``
    """
    analysis = ProteinAnalysis(sequence)

    descriptors = {
        "Charge at pH7": analysis.charge_at_pH(7),
        "Instability Index": analysis.instability_index(),
        "Molecular Wt": analysis.molecular_weight(),
        "Aromaticity": analysis.aromaticity(),
    }

    # The extinction coefficient comes back as a pair of values; only their
    # midpoint is stored.
    low, high = analysis.molar_extinction_coefficient()
    descriptors["Molar Extinction Coeff"] = (float(low) + float(high)) / 2

    # GRAVY = grand average of hydropathy; higher value = more hydrophobic.
    descriptors["Gravy"] = analysis.gravy()
    descriptors["Isoelectric pt"] = analysis.isoelectric_point()

    fraction_labels = ("Helix Fraction", "Turn Fraction", "Sheet Fraction")
    helix, turn, sheet = analysis.secondary_structure_fraction()
    descriptors.update(zip(fraction_labels, (helix, turn, sheet)))

    temp_dict.update(descriptors)

    # Merged separately because get_amino_acids_percent() already returns a
    # dictionary of its own.
    temp_dict.update(analysis.get_amino_acids_percent())
def get_coord_array(path, file_name):
    '''
    Function: get the coordinates of all atoms in a pdb file, together with
    the summed charge and mass of its polypeptides

    :param path: directory holding the pdb files; a trailing path separator
                 is optional
    :param file_name: the file name of ****.pdb; the text before the first
                      '.' is used as the structure id
    :return: atom coord array [[x0,y0,z0], [x1,y1,z1], ... , [xn,yn,zn]]
           : charge at pH 7.4, summed over the polypeptides built from the
             structure (first model, per the original author's note)
           : molecular weight, summed over the same polypeptides
    '''
    import os

    parser = PDBParser(PERMISSIVE=1)
    structure_id = file_name.split('.')[0]
    # os.path.join gives the same result as plain concatenation when `path`
    # already ends with a separator, and still works when it does not.
    path_file_name = os.path.join(path, file_name)
    structure = parser.get_structure(structure_id, path_file_name)

    # Accumulate mass and charge over every polypeptide PPBuilder finds.
    mass, charge = 0.0, 0.0
    for polypep in PPBuilder().build_peptides(structure):
        analyzer = ProteinAnalysis(polypep.get_sequence())
        mass += analyzer.molecular_weight()
        charge += analyzer.charge_at_pH(7.4)

    # get_atoms() walks model -> chain -> residue -> atom, i.e. the same order
    # as four nested loops over the structure.
    atom_coord_array = np.array(
        [atom.get_coord() for atom in structure.get_atoms()])

    return atom_coord_array, charge, mass
# Extract every test PDB from the input archive into a temporary directory
# and append the ProtParam features of each parsed record to the collector
# lists / dictionaries defined earlier in the file.
with temppathlib.TemporaryDirectory() as tmpdir:
    # unzip the file with all the test PDBs
    with zipfile.ZipFile(args.infile, "r") as zip_:
        zip_.extractall(tmpdir.path)

    for test_pdb in tmpdir.path.glob("*.pdb"):
        for record in SeqIO.parse(test_pdb, "pdb-atom"):
            # 'X' residues are replaced with 'G' before the analysis.
            sequence = str(record.seq).replace('X', 'G')
            protein = ProteinAnalysis(sequence)

            p_len.append(len(sequence))
            mol_w.append(protein.molecular_weight())
            iso_p.append(protein.isoelectric_point())
            smell.append(protein.aromaticity())
            taste_factor.append(protein.gravy())
            insta_ind.append(protein.instability_index())
            char_at_acid.append(protein.charge_at_pH(1))
            char_at_neutral.append(protein.charge_at_pH(7))
            char_at_base.append(protein.charge_at_pH(14))

            # One call instead of three: the tuple is indexed below.
            ss_fractions = protein.secondary_structure_fraction()
            helter_skeler.append(ss_fractions[0])
            turnip.append(ss_fractions[1])
            garfield.append(ss_fractions[2])

            # The composition dictionaries do not change per residue, so they
            # are fetched once per record rather than once per amino acid.
            residue_counts = protein.count_amino_acids()
            for x in amino_acids:
                n = residue_counts[x]
                # A collector receives the value when its key ends with the
                # residue letter.
                for y in d_count:
                    if y[-1] == x:
                        d_count[y].append(n)

            residue_percents = protein.get_amino_acids_percent()
            for a in amino_acids:
                m = residue_percents[a]
                for b in d_perc:
                    if b[-1] == a:
                        d_perc[b].append(m)
def _most_frequent(sequence, candidates):
    """Return the candidate with the most non-overlapping hits in ``sequence``.

    Ties go to the earliest candidate; when nothing occurs at all the first
    candidate is returned.
    """
    return max(candidates, key=sequence.count)


def main():
    """Build ``features.csv`` from the sequences listed in ``protein_data.csv``.

    ``protein_data.csv`` is read without a header; its first column is used as
    the protein sequence and its second column as the class label.

    Steps:
      1. Record the length of every raw sequence.
      2. Strip the letters B, U, X and Z from the sequences and write the
         result to ``updated_protein_data.csv`` (no header, no index).
      3. Re-read that file and compute, per sequence, the most frequent
         residue and dipeptide, the ProtParam descriptors and the count of
         each of the 20 standard residues.
      4. Write all columns to ``features.csv`` (no index).

    The molecular-weight and first-residue columns are disabled in the output,
    so they are not computed.
    """
    aa = list('ACDEFGHIKLMNPQRSTVWY')
    # All 400 ordered residue pairs: 'AA', 'AC', ..., 'YW', 'YY'.
    dipeptide = [first + second for first in aa for second in aa]

    sequences = pandas.read_csv('protein_data.csv', header=None)

    # LENGTH is measured on the raw sequences, i.e. before stripping.
    lengths = [len(str(raw)) for raw in sequences[0]]

    # remove bad amino acids from sequences
    # Assign the whole column at once: element-wise `sequences[0][i] = ...`
    # is chained assignment, which pandas does not guarantee to write back
    # into the frame (and never does under Copy-on-Write).
    sequences[0] = sequences[0].str.replace('[BUXZ]', '', regex=True)
    sequences.to_csv('updated_protein_data.csv',
                     index_label=None,
                     header=None,
                     index=None)

    data = pandas.read_csv('updated_protein_data.csv', header=None)

    # Each scan starts from scratch, so the residue result can no longer
    # inherit the running maximum / index of the dipeptide scan.
    most_frequent_di = [_most_frequent(seq, dipeptide) for seq in data[0]]
    most_frequent = [_most_frequent(seq, aa) for seq in data[0]]

    # more features
    last_aa = []
    arom = []
    ii = []
    ip = []
    mec_rc = []
    mec_db = []
    ssf_helix = []
    ssf_turn = []
    ssf_sheet = []
    gravy = []
    ph_0 = []
    ph_7 = []
    ph_14 = []
    classes = []
    aa_counts = {residue: [] for residue in aa}

    for row in data.itertuples(index=False):
        sequence = str(row[0])
        analyzed_protein = ProteinAnalysis(sequence)

        composition = analyzed_protein.count_amino_acids()
        for residue in aa:
            aa_counts[residue].append(composition.get(residue))

        last_aa.append(sequence[-1])
        arom.append(analyzed_protein.aromaticity())
        ii.append(analyzed_protein.instability_index())
        ip.append(analyzed_protein.isoelectric_point())

        # Both tuple-returning descriptors are evaluated once and unpacked.
        reduced, bridged = analyzed_protein.molar_extinction_coefficient()
        mec_rc.append(reduced)
        mec_db.append(bridged)

        fractions = analyzed_protein.secondary_structure_fraction()
        ssf_helix.append(fractions[0])
        ssf_turn.append(fractions[1])
        ssf_sheet.append(fractions[2])

        gravy.append(analyzed_protein.gravy())
        ph_0.append(analyzed_protein.charge_at_pH(0.0))
        ph_7.append(analyzed_protein.charge_at_pH(7.0))
        ph_14.append(analyzed_protein.charge_at_pH(14.0))
        classes.append(row[1])

    # Dictionary order is the column order of the written file.
    features = pandas.DataFrame({
        "LENGTH": lengths,
        "most frequent aa": most_frequent,
        "last amino acid": last_aa,
        "most frequence dipeptide": most_frequent_di,
        "aromaticity": arom,
        "instability index": ii,
        "isolectric point": ip,
        "molecular extinction coefficient - reduced cysteines": mec_rc,
        "molecular extinction coefficient - disulfid bridges": mec_db,
        "secondary structure fraction helix": ssf_helix,
        "secondary structure fraction turn": ssf_turn,
        "secondary structure fraction sheet": ssf_sheet,
        "gravy": gravy,
        "charge at ph 0": ph_0,
        "charge at ph 7": ph_7,
        "charge at ph 14": ph_14,
        **aa_counts,
        "CLASS": classes,
    })
    features.to_csv('features.csv', index=None)
def get_ieq_nc(seq, is_iep=True):
    """Return the isoelectric point of ``seq`` or its charge at pH 7.0.

    :param seq: protein sequence handed to ``PA``
    :param is_iep: when truthy (the default) the isoelectric point is
        returned, otherwise the charge at pH 7.0
    :return: the selected value as computed by the ``PA`` instance
    """
    analysis = PA(seq)
    if is_iep:
        return analysis.isoelectric_point()
    return analysis.charge_at_pH(7.0)
def featurise(self, data):
    """
    Build a feature table from sequence records.

    Every record is analysed three times with ``ProteinAnalysis``: the whole
    sequence, its first 50 residues and its last 50 residues.

    Parameters:
    -----------
    data : `list` of `Bio.SeqRecord.SeqRecord`
        The records to featurise; only ``record.seq`` is read.

    Returns:
    -------
    featurised_data : `pandas.DataFrame` (num_data, features)
        One row per record. Columns are, in order: the global descriptors of
        the whole sequence, the mean of each scale in ``self.dicts`` for the
        whole / first-50 / last-50 sequence, and the amino-acid fractions
        (multiplied by 5) for the whole / first-50 / last-50 sequence.
    """
    # (column name, key into self.dicts) for each windowed scale.
    scale_columns = (
        ("hydrophobicity", 'kd'),
        ("flexibility", 'flex'),
        ("hydrophilicity", 'hw'),
        ("surface_accessibility", 'em'),
        ("janin", 'ja'),
    )
    charge_columns = (
        ("charge_at_ph1", 1),
        ("charge_at_ph7", 7),
        ("charge_at_ph12", 12),
    )

    features = collections.defaultdict(list)

    for example in data:
        # ProteinAnalysis works on plain strings, not on Seq objects.
        sequence = str(example.seq)
        whole = ProteinAnalysis(sequence)
        head = ProteinAnalysis(sequence[:50])
        tail = ProteinAnalysis(sequence[-50:])

        # Global descriptors of the complete sequence.
        features["length"].append(whole.length)
        features["molecular_weight"].append(whole.molecular_weight())
        features["isoelectric_point"].append(whole.isoelectric_point())
        features["aromaticity"].append(whole.aromaticity())
        features["instability_index"].append(whole.instability_index())
        features["gravy"].append(whole.gravy())

        extinction = whole.molar_extinction_coefficient()
        reduced, oxidised = extinction
        features["reduced"].append(reduced)
        features["oxidised"].append(oxidised)

        structure = whole.secondary_structure_fraction()
        helix, turn, sheet = structure
        features["helix"].append(helix)
        features["turn"].append(turn)
        features["sheet"].append(sheet)

        for column, ph in charge_columns:
            features[column].append(whole.charge_at_pH(ph))

        # Mean of every scale profile (window 5, edge weight 1.0) for the
        # whole sequence first, then for each terminal segment.
        for prefix, analysis in (("", whole), ("first50_", head),
                                 ("last50_", tail)):
            for column, scale_key in scale_columns:
                profile = analysis.protein_scale(self.dicts[scale_key],
                                                 window=5,
                                                 edge=1.0)
                features[prefix + column].append(np.mean(profile))

        # Amino-acid fractions, each multiplied by 5.
        for key, val in whole.get_amino_acids_percent().items():
            features[key].append(val * 5)
        for prefix, analysis in (("first_50_", head), ("last_50_", tail)):
            for key, val in analysis.get_amino_acids_percent().items():
                features[prefix + str(key)].append(val * 5)

    return pd.DataFrame.from_dict(features)