def get_pI(seq):
    """Return the isoelectric point of the peptide translated from ``seq``.

    ``seq`` is translated with ``mRNA_translate``. Ambiguous residue codes
    (X, B, Z, J, U in either case) and ``*`` characters are removed before
    the peptide is handed to ``ProtParam.ProteinAnalysis``.
    """
    peptide = str(mRNA_translate(seq))
    unambiguous = re.compile("X|B|Z|J|U", re.I).sub("", peptide)
    analysis = ProtParam.ProteinAnalysis(str(unambiguous).replace("*", ""))
    return analysis.isoelectric_point()
def get_Mw(seq):
    """Return the molecular weight of the peptide translated from ``seq``.

    ``seq`` is translated with ``mRNA_translate``. Ambiguous residue codes
    (X, B, Z, J, U in either case) and ``*`` characters are removed before
    the peptide is handed to ``ProtParam.ProteinAnalysis``.
    """
    peptide = str(mRNA_translate(seq))
    unambiguous = re.compile("X|B|Z|J|U", re.I).sub("", peptide)
    analysis = ProtParam.ProteinAnalysis(str(unambiguous).replace("*", ""))
    return analysis.molecular_weight()
def get_gravy(seq):
    """Return the GRAVY value of the peptide translated from ``seq``.

    ``seq`` is translated with ``mRNA_translate``. Ambiguous residue codes
    (X, B, Z, J, U in either case) and ``*`` characters are removed before
    the peptide is handed to ``ProtParam.ProteinAnalysis``.
    """
    peptide = str(mRNA_translate(seq))
    unambiguous = re.compile("X|B|Z|J|U", re.I).sub("", peptide)
    analysis = ProtParam.ProteinAnalysis(str(unambiguous).replace("*", ""))
    return analysis.gravy()
def DoPrint(self):
    """Rewrite the sequence text control, colouring each frame by its score.

    The sequence in ``self.seqRec`` is appended to ``self.seqText`` in
    frames of ``self.frSize.GetValue()`` residues. Each full frame is scored
    against the scale ``self.ppd``: a direct lookup for single-residue
    frames, ``ProteinAnalysis.protein_scale`` otherwise. Frames scoring below
    ``self.llim`` get the low button's colours, frames above ``self.hlim``
    the high button's colours. A trailing partial frame is appended without
    colouring.
    """
    # The old control is destroyed; SetText() is presumably what creates the
    # replacement self.seqText used below -- confirm against SetText().
    self.seqText.Show(False)
    self.seqText.Destroy()
    frVal = self.frSize.GetValue()
    self.SetText()
    maxLen = len(self.seqRec)
    ch = self.hButton.GetBackgroundColour()
    cl = self.lButton.GetBackgroundColour()
    fh = self.hButton.GetForegroundColour()
    fl = self.lButton.GetForegroundColour()
    i = 0
    # "<=" so that the last full frame is scored and coloured as well.
    while i + frVal <= maxLen:
        frame = str(self.seqRec[i:(i + frVal)])
        if frVal == 1:
            # Score of the *current* residue (previously the first residue's
            # score was reused for every position).
            score = self.ppd[frame]
        else:
            protAnal = ProtParam.ProteinAnalysis(frame)
            score = protAnal.protein_scale(self.ppd, frVal)[0]
        self.seqText.AppendText(frame)
        if score < self.llim:
            self.seqText.SetStyle(i, (i + frVal), wx.TextAttr(fl, cl))
        elif score > self.hlim:
            self.seqText.SetStyle(i, (i + frVal), wx.TextAttr(fh, ch))
        i += frVal
    # Append whatever is left (shorter than one frame), including a single
    # trailing residue.
    if i < maxLen:
        self.seqText.AppendText(str(self.seqRec[i:maxLen]))
def get_instablility_index(seq):
    """Return the instability index of the peptide translated from ``seq``.

    ``seq`` is translated with ``mRNA_translate``. Ambiguous residue codes
    (X, B, Z, J, U in either case) and ``*`` characters are removed before
    the peptide is handed to ``ProtParam.ProteinAnalysis``.
    """
    peptide = str(mRNA_translate(seq))
    unambiguous = re.compile("X|B|Z|J|U", re.I).sub("", peptide)
    analysis = ProtParam.ProteinAnalysis(str(unambiguous).replace("*", ""))
    return analysis.instability_index()
def get_basic_properties_features(seq):
    '''
    Collect basic physical properties of a protein sequence in one dict.

    Uses the BioPython/ExPASy ProtParam module for PI, molecular weight,
    GRAVY and aromaticity, then merges in the dicts returned by
    GetAliphaticness(seq) and get_netCharge(seq).

    Keys set here: 'PI' (unrounded), 'Molecular_Weight', 'GRAVY' and
    'Aromaticity' (each rounded to 4 decimals).

    Note: these methods all assume a standard amino-acid alphabet.
    Warning! The returned PI is INACCURATE for a parsed (tail(s) removed)
    subsequence: BioPython's isoelectric_point assumes N and C termini.
    '''
    analysis = pp.ProteinAnalysis(seq)  # BioPython analysis object from str
    prot_pp = {
        'PI': analysis.isoelectric_point(),
        'Molecular_Weight': round(analysis.molecular_weight(), 4),
        'GRAVY': round(analysis.gravy(), 4),
        'Aromaticity': round(analysis.aromaticity(), 4)
    }
    # Flexibility features (GetFlex) are intentionally left out; the original
    # author noted a problem with an empty window.
    prot_pp.update(GetAliphaticness(seq))
    prot_pp.update(get_netCharge(seq))
    return prot_pp
def write_weka_input(weka_input, SHORT_IDENTIFIERS, SEQUENCES, pepstats_dic):
    """
    Function: write_weka_input()

    Purpose:  Write the WEKA arff input file from the query identifiers,
              their sequences and the pepstats-calculated protein features.
              Each data row ends with '?' as the unknown class value.

    Input:    WEKA arff file name, query identifiers, query sequences and
              pepstats dictionary (keyed by the identifier with '>' removed
              and surrounding whitespace stripped).

    Return:   None.
    """
    with open(weka_input, 'w') as arff:
        # One feature vector per identifier; a slot stays empty if there is
        # no matching sequence.
        feature_rows = [[] for __ in range(len(SHORT_IDENTIFIERS))]
        for row, (identifier, sequence) in enumerate(zip(SHORT_IDENTIFIERS, SEQUENCES)):
            key = identifier.replace('>', '').strip()
            # Six fields are expected; only some of them feed the vector.
            (_weight, charge, isoelectric, amino_acid_classes,
             amino_acid_frequencies, _length) = pepstats_dic[key]
            prot = ProtParam.ProteinAnalysis(sequence.replace('*', ''))
            feature_rows[row] = (
                [charge, isoelectric]
                + amino_acid_classes
                + amino_acid_frequencies
                + [GRAVY(sequence)]
                + [prot.aromaticity(), prot.instability_index()]
                + [sequence.count('C')]
            )
        # Write protein feature data to WEKA arff file
        arff.writelines(ARFF_HEADER)
        for vector in feature_rows:
            arff.write(''.join(str(feature) + ',' for feature in vector))
            arff.write('?\n')
    return
def mass( self ):
    """
    Best-effort molecular weight of ``self.sequence``.

    @return float, ProtParam molecular weight; 0.0 when the sequence is
            empty or the ProtParam calculation fails
    """
    if not self.sequence:
        return 0.0
    try:
        return PP.ProteinAnalysis(self.sequence).molecular_weight()
    except Exception:
        # Deliberate best-effort fallback, but no longer swallowing
        # KeyboardInterrupt / SystemExit as the bare except did.
        return 0.0
def test_get_monoisotopic_molecular_weight_identical(self):
    """Monoisotopic weight from ProteinAnalysis matches Bio.SeqUtils.molecular_weight."""
    self.analysis = ProtParam.ProteinAnalysis(self.seq_text, monoisotopic=True)
    from_analysis = self.analysis.molecular_weight()
    protein = Seq(self.seq_text, IUPAC.protein)
    from_sequtils = molecular_weight(protein, monoisotopic=True)
    self.assertAlmostEqual(from_analysis, from_sequtils)
def isoelectric( self ):
    """
    Best-effort isoelectric point of ``self.sequence``.

    @return float, ProtParam iso-electric point rounded to 2 decimals; 0.0
            when the sequence is empty or the ProtParam calculation fails
    """
    if not self.sequence:
        return 0.0
    try:
        r = PP.ProteinAnalysis(self.sequence).isoelectric_point()
    except Exception:
        # Deliberate best-effort fallback, but no longer swallowing
        # KeyboardInterrupt / SystemExit as the bare except did.
        return 0.0
    return round(r, 2)
def prot():
    """Summarise ProtParam properties of the sequence in tool/media/query.txt.

    Returns a 16-tuple: sequence string, length, molecular weight divided by
    1000 (2 decimals), aromaticity, instability index, "Stable"/"Unstable"
    (threshold 40), isoelectric point, second molar extinction coefficient,
    three secondary-structure percentages (elements 0, 2, 1 of
    ``secondary_structure_fraction()``, in that order), and five lists of
    "<residue label> <percent>%" strings, four residues per list.

    NOTE(review): if the FASTA file holds several records, the values of the
    last record are returned; an empty file raises NameError. Confirm this is
    the intended handling.
    """
    label_rows = (
        ["A (Ala):", "C (Cys):", "D (Asp):", "E (Glu):"],
        ["F (Phe):", "G (Gly):", "H (His):", "I (Ile):"],
        ["K (Lys):", "L (Leu):", "M (Met):", "N (Asn):"],
        ["P (Pro):", "Q (Gln):", "R (Arg):", "S (Ser):"],
        ["T (Thr):", "V (Val):", "W (Trp):", "Y (Tyr):"],
    )
    for seq_rec in SeqIO.parse("tool/media/query.txt", "fasta"):
        res = str(seq_rec.seq)
        X = ProtParam.ProteinAnalysis(res)
        count = len(res)
        m = np.round(float("{:.2f}".format(X.molecular_weight())) / 1000, 2)
        a = float("{:.2f}".format(X.aromaticity()))
        i = float("{:.2f}".format(X.instability_index()))
        c = "Unstable" if i > 40 else "Stable"
        p = float("{0:.2f}".format(X.isoelectric_point()))
        mc = X.molar_extinction_coefficient()[1]
        fractions = [part * 100 for part in X.secondary_structure_fraction()]
        hel = np.round(fractions[0], 2)
        turn = np.round(fractions[1], 2)
        shet = np.round(fractions[2], 2)
        # Percentages in the dict's own order, grouped four at a time so they
        # line up with label_rows.
        percents = [frac * 100 for frac in X.get_amino_acids_percent().values()]
        grouped = [
            [np.round(float(value), 2) for value in percents[start:start + 4]]
            for start in range(0, len(percents), 4)
        ]
        ad, fi, kn, ps, ty = (
            [label + " " + str(item) + "%"
             for item, label in zip(list(grouped[row]), label_rows[row])]
            for row in range(5)
        )
    return res, count, m, a, i, c, p, mc, hel, shet, turn, ad, fi, kn, ps, ty
def GetExec(seqRec, frSize):
    """Return the amino-acid fractions of ``seqRec`` as two parallel lists.

    The result is ``[residue_letters, fractions]`` where each fraction from
    ``get_amino_acids_percent()`` is quantized to two decimals and rendered
    as a string. ``frSize`` is accepted but not used here.
    """
    # Calculate protParamData
    fractions = ProtParam.ProteinAnalysis(str(seqRec)).get_amino_acids_percent()
    letters = list(fractions)
    rendered = [
        str(Decimal(fraction).quantize(Decimal(10)**-2))
        for fraction in fractions.values()
    ]
    return [letters, rendered]
def test_get_monoisotopic_molecular_weight_identical(self):
    """Monoisotopic weight from ProteinAnalysis matches Bio.SeqUtils.molecular_weight."""
    # Of limited value: ProteinAnalysis.molecular_weight is internally
    # calling SeqUtils.molecular_weight.
    self.analysis = ProtParam.ProteinAnalysis(self.seq_text, monoisotopic=True)
    from_analysis = self.analysis.molecular_weight()
    protein = Seq(self.seq_text)
    from_sequtils = molecular_weight(protein, seq_type="protein", monoisotopic=True)
    self.assertAlmostEqual(from_analysis, from_sequtils)
def mito_classifier_allwindows(pepstats_dic_aas, pepstats_dic_aas_short, IDENTIFIERS, SEQUENCES, TMP_PATH, WEKA_PATH, SCRIPT_PATH):
    """Classify proteins as mitochondrial or not with a WEKA SMO model.

    Writes ``TMP_PATH + 'mito.arff'`` with one feature row per identifier,
    runs WEKA on it with stdout redirected to
    ``TMP_PATH + 'Mitochondria_Predictions.txt'`` and returns whatever
    ``parse_weka_output`` extracts from that file.

    Parameters:
        pepstats_dic_aas / pepstats_dic_aas_short: dicts keyed by the
            identifier (with '>' removed and whitespace stripped); each value
            unpacks to (molecular weight, charge, isoelectric point,
            amino-acid classes, amino-acid frequencies).
        IDENTIFIERS, SEQUENCES: parallel sequences of ids and proteins.
        TMP_PATH: prefix for temporary files (plain string concatenation, so
            it must already end with a path separator).
        WEKA_PATH: Java classpath for WEKA.
        SCRIPT_PATH: directory containing ``MODEL_FILES``.

    Exits the interpreter with status 1 if WEKA cannot be started or returns
    a non-zero exit code.
    """
    weka = TMP_PATH + 'mito.arff'

    with open(weka, 'w') as f:
        # Create a list of features for each protein
        X = [[] for __ in range(len(IDENTIFIERS))]

        for protein_position, TARGET_ID in enumerate(IDENTIFIERS):
            TARGET_ID = TARGET_ID.replace('>', '').strip()
            sequence = SEQUENCES[protein_position]

            (_mw, charge, isoelectric, amino_acid_classes,
             amino_acid_frequencies) = pepstats_dic_aas[TARGET_ID]
            (_mw_short, charge_short, isoelectric_short, _classes_short,
             amino_acid_frequencies_short) = pepstats_dic_aas_short[TARGET_ID]

            prot = ProtParam.ProteinAnalysis(sequence.replace('*', ''))
            # Computed once instead of three times; the call is a pure
            # function of the sequence.
            structure = prot.secondary_structure_fraction()

            X[protein_position] = (
                [charge, isoelectric]
                + amino_acid_classes
                + amino_acid_frequencies
                + [GRAVY(sequence)]
                + [structure[0], structure[1], structure[2], prot.aromaticity()]
                + [charge_short, isoelectric_short]
                + amino_acid_frequencies_short
            )

        f.writelines(parameters.ARFF_MITOCHONDRIA_HEADER)
        for vector in X:
            for feature in vector:
                f.write(str(feature) + ',')
            # '?' is the unknown class value for every row.
            f.write('?\n')

    ParamList = ['java', '-cp', WEKA_PATH, 'weka.classifiers.functions.SMO',
                 '-l', SCRIPT_PATH + '/MODEL_FILES/MITOCHONDRIA_NOTMITOCHONDRIA.model',
                 '-T', weka, '-p', 'first-last']

    with open(TMP_PATH + 'Mitochondria_Predictions.txt', 'wb') as out:
        try:
            Process = subprocess.Popen(ParamList, shell=False, stdout=out)
            # stdout goes to the file and stderr is inherited, so there is
            # nothing to collect with communicate(); wait() is sufficient.
            returncode = Process.wait()
            if returncode:
                raise Exception("Calling WEKA returned %s" % returncode)
        except Exception as e:
            # Exception (not a bare except) so Ctrl-C is not reported as a
            # WEKA failure.
            print("Error calling WEKA: %s" % e)
            sys.exit(1)

    file_input = TMP_PATH + 'Mitochondria_Predictions.txt'
    predicted_mito = parse_weka_output(file_input, IDENTIFIERS, SEQUENCES, 'Mitochondria', 'Non-Mitochondria')

    return predicted_mito
def compute_params(self):
    """Fill ``self.properties`` with ProtParam scale profiles of the sequence.

    Spaces and ``X`` characters are removed from ``self.sequence`` (the
    attribute is overwritten). ``self.properties`` is then reset and given
    one ``protein_scale`` profile (window 9, edge 0.4) per scale below.
    Returns ``self``.
    """
    self.sequence = self.sequence.replace(' ', '').replace('X', '')
    analysis = ProtParam.ProteinAnalysis(self.sequence)
    scales = (
        # Kyte & Doolittle index of hydrophobicity,
        # J. Mol. Biol. 157:105-132 (1982).
        ('kd', ProtParamData.kd),
        # Normalized flexibility parameters (B-values), average; Vihinen M.,
        # Torkkila E., Riikonen P., Proteins 19(2):141-9 (1994).
        ('Flex', ProtParamData.Flex),
        # Hydrophilicity, Hopp & Wood,
        # Proc. Natl. Acad. Sci. U.S.A. 78:3824-3828 (1981).
        ('hw', ProtParamData.hw),
        # Surface accessibility, Vergoten G & Theophanides T,
        # Biomolecular Structure and Dynamics, pg. 138 (1997).
        ('em', ProtParamData.em),
        # Janin interior-to-surface transfer energy scale.
        ('ja', ProtParamData.ja),
    )
    # DIWV is not included: it requires a modified calculation.
    self.properties = {}
    for key, scale in scales:
        self.properties[key] = analysis.protein_scale(scale, window=9, edge=.4)
    return self
def prot():
    """Summarise ProtParam properties of the sequence in media/query.txt.

    Returns an 8-tuple: sequence string, length, molecular weight,
    aromaticity, instability index, "Stable"/"Instable" (threshold 40),
    isoelectric point (all floats rounded to 2 decimals) and the second
    molar extinction coefficient.

    NOTE(review): if the FASTA file holds several records, the values of the
    last record are returned; an empty file raises NameError. Confirm this is
    the intended handling.
    """
    def two_decimals(number):
        return float("{0:.2f}".format(number))

    for seq_rec in SeqIO.parse("media/query.txt", "fasta"):
        res = str(seq_rec.seq)
        X = ProtParam.ProteinAnalysis(res)
        count = len(res)
        m = two_decimals(X.molecular_weight())
        a = two_decimals(X.aromaticity())
        i = two_decimals(X.instability_index())
        c = "Instable" if i > 40 else "Stable"
        p = two_decimals(X.isoelectric_point())
        mc = X.molar_extinction_coefficient()[1]
    return res, count, m, a, i, c, p, mc
def calc_region_charges(seq, cur_window):
    """Compute an isoelectric point for each sliding window of ``seq``.

    Windows of ``cur_window`` residues start at positions
    ``0 .. len(seq) - cur_window - 1``; one ``pi()`` value per window is
    returned as a list.

    Side effect: the module-level ``IsoelectricPoint.pKcterminal`` and
    ``IsoelectricPoint.pKnterminal`` tables are replaced with empty dicts.
    """
    # internal small regions, so do not deal with C and N terminal charges
    IsoelectricPoint.pKcterminal = {}
    IsoelectricPoint.pKnterminal = {}
    region_charges = []
    for start in range(len(seq) - cur_window):
        window_seq = seq[start:start + cur_window]
        composition = ProtParam.ProteinAnalysis(str(window_seq)).count_amino_acids()
        calculator = IsoelectricPoint.IsoelectricPoint(window_seq, composition)
        region_charges.append(calculator.pi())
    return region_charges
def param(seq):
    """Return (instability index, pI, GRAVY) for the longest ORF of ``seq``.

    ``seq`` is stripped, U is replaced by T (case-insensitive) and the result
    upper-cased. The longest ORF (start ATG; stops TAA, TAG, TGA) is
    translated with ``mRNA_translate``; ambiguous residues (X, B, Z, J, U)
    and flanking ``*`` are removed before the analysis. When the peptide,
    with flanking ``*`` stripped, is empty the result is (0.0, 0.0, 0.0).
    """
    seqRNA = re.compile("U", re.I).sub("T", str(seq).strip()).upper()
    CDS_size1, CDS_integrity, seqCDS = ExtractORF(seqRNA).longest_ORF(
        start=['ATG'], stop=['TAA', 'TAG', 'TGA'])
    seqprot = mRNA_translate(seqCDS)
    pep_len = len(seqprot.strip("*"))
    newseqprot = re.compile("X|B|Z|J|U", re.I).sub("", str(seqprot))
    protparam_obj = ProtParam.ProteinAnalysis(str(newseqprot.strip("*")))
    if pep_len <= 0:
        return (0.0, 0.0, 0.0)
    Instability_index, PI, Gravy = protein_param(protparam_obj)
    return (Instability_index, PI, Gravy)
def compute_aa_composition(protein_sequence: str) -> dict:
    """
    Compute the amino-acid composition of a protein sequence.

    Parameters
    ----------
    protein_sequence: str
        sequence of the protein to be processed (converted with ``str``)

    Returns
    -------
    aa_composition: dict
        result of ``ProteinAnalysis.get_amino_acids_percent()``: the relative
        abundance of each amino acid
    """
    return ProtParam.ProteinAnalysis(str(protein_sequence)).get_amino_acids_percent()
def stats(self, sequences): '''Generate a file with useful data of the protein. Based in ProtParam Tools from Expasy''' from Bio.SeqUtils import ProtParam #output_filename = self.structure_id + "_stats.dat" #stats_file = open(output_filename, "w") for chain in sequences.keys(): print "*************************" print "** ", self.structure_id, "- Chain", chain, " **" print "*************************" seq_stats = ProtParam.ProteinAnalysis(sequences[chain]) ## Printing the aa count... print "Amino acids counts and percents" total_aa = 0 for aa, percent in zip(seq_stats.count_amino_acids(), seq_stats.get_amino_acids_percent()): print aa, ":", seq_stats.count_amino_acids()[aa], "(", round( seq_stats.get_amino_acids_percent()[aa] * 100, 2), "% )" total_aa += seq_stats.count_amino_acids()[aa] print "TOTAL:", total_aa molar_mass = seq_stats.molecular_weight() / 100 print "\nMolecular mass:", molar_mass, "kDa" print "\nIsoelectric point:", round(seq_stats.isoelectric_point(), 2) extintion_coef_Cyst, extintion_coef_noCyst, molar_extintion_coef_Cyst, molar_extintion_coef_noCyst = self.get_extintion_coef( seq_stats, molar_mass) print "\nExtintion coefficient (Cystines) =", extintion_coef_Cyst, "M^-1*cm^-1" print "Extintion coefficient (no Cystines) =", extintion_coef_noCyst, "M^-1*cm^-1" print "\nMolar extintion coefficient (Cystines) [Abs 0.1% (=1 g/l)] =", molar_extintion_coef_Cyst print "Molar extintion coefficient (no Cystines) [Abs 0.1% (=1 g/l)] =", molar_extintion_coef_noCyst, "\n" return
def get_weight(value):
    """Build one bar trace per FASTA record with its molecular weight.

    The FASTA path comes from ``choose_fasta(value)``. The chain label is the
    text after the first ':' in the part of the record id before the first
    '|'. Returns a list of ``go.Bar`` traces named "Chain <label>", each
    holding a single bar.
    """
    traces = []
    with open(choose_fasta(value), "r") as file_fasta:
        for entry in SeqIO.parse(file_fasta, "fasta"):
            chain_parts = entry.id.split("|")[0].split(":")
            weight = pp.ProteinAnalysis(str(entry.seq)).molecular_weight()
            label = "Chain " + chain_parts[1]
            traces.append(go.Bar(x=[label], y=[weight], name=label))
    return traces
def get_percentage_aa(value):
    """Build one bar trace per FASTA record with its amino-acid percentages.

    The FASTA path comes from ``choose_fasta(value)``. For every record the
    fractions from ``get_amino_acids_percent()`` are multiplied by 100 and
    plotted against the ``seq3`` name of each residue. The chain label is the
    text after the first ':' in the part of the record id before the first
    '|'. Returns a list of ``go.Bar`` traces.
    """
    traces = []
    with open(choose_fasta(value), "r") as file_fasta:
        for record in SeqIO.parse(file_fasta, "fasta"):
            chain_parts = record.id.split("|")[0].split(":")
            fractions = pp.ProteinAnalysis(str(record.seq)).get_amino_acids_percent()
            aa_names = [seq3(letter) for letter in fractions]
            aa_percent = [fraction * 100 for fraction in fractions.values()]
            traces.append(
                go.Bar(x=aa_names, y=aa_percent, name="Chain " + chain_parts[1]))
    return traces
def part_two():
    """Add length and secondary-structure fractions to data.csv.

    Reads ``data.csv`` (columns ``identifier`` and ``sequence`` are used),
    prints the list of sequence lengths, and writes ``data_extra.csv`` with
    the columns identifier, sequence, len, sheet, turn, helix. The three
    fractions come from ``ProteinAnalysis.secondary_structure_fraction()``,
    unpacked as (helix, turn, sheet).

    Both files are opened in ``with`` blocks so the output is flushed and
    closed even if a row fails.
    """
    from Bio.SeqUtils import ProtParam

    # newline='' is what the csv module asks for on both read and write.
    with open('data.csv', newline='') as source:
        data = list(csv.DictReader(source))
    l = [len(s['sequence']) for s in data]
    print(l)
    with open('data_extra.csv', 'w', newline='') as target:
        w = csv.DictWriter(target,
                           fieldnames=('identifier', 'sequence', 'len',
                                       'sheet', 'turn', 'helix'))
        w.writeheader()
        for gene in data:
            (h, t, s) = ProtParam.ProteinAnalysis(
                gene['sequence']).secondary_structure_fraction()
            w.writerow({
                'identifier': gene['identifier'],
                'sequence': gene['sequence'],
                'len': len(gene['sequence']),
                'sheet': s,
                'turn': t,
                'helix': h
            })
def mod(sequence):
    """
    Unfinished replacement for ``ProteinAnalysis.protein_scale`` with the DIWV scale.

    ``protein_scale`` cannot be used with ``ProtParamData.DIWV`` because that
    scale requires knowledge of the preceding amino acid:

    >>> p = ProtParam.ProteinAnalysis(sequence)
    >>> p.protein_scale(ProtParamData.DIWV, window=9, edge=.4)

    fails. This function is meant to be the replacement, using a fixed
    window of 9 and an edge weight of 0.4. The original author marked it as
    not implemented; see the review notes in the body before relying on it.

    :param sequence: sequence to score
    :type sequence: str
    :return: DIWV score, one value per window position.
    :rtype: list (NOTE(review): the values are floats produced by a division,
        not ints as previously documented)
    """
    p = ProtParam.ProteinAnalysis(sequence)
    param_dict = ProtParamData.DIWV
    window = 9
    edge = 0.4
    # NOTE(review): relies on the private helper _weight_list and on p.length;
    # confirm both exist in the Biopython version in use.
    weights = p._weight_list(window, edge)
    # Each weight is applied to both flanks; the middle residue has weight 1.
    sum_of_weights = sum(weights) * 2 + 1
    scores = []
    for i in range(p.length - window):
        subsequence = p.sequence[i:i + window]
        score = 0.0
        for j in range(window // 2):
            try:
                # Two-level lookup: residue at j, then the residue after it.
                front = param_dict[subsequence[j]][subsequence[j + 1]]
                # NOTE(review): subsequence holds at most `window` characters,
                # so for j == 0 the indices window - j and window - j + 1 are
                # out of range. That raises IndexError, which the KeyError
                # handler below does not catch.
                back = param_dict[subsequence[window - j]][subsequence[window - j + 1]]
                score += weights[j] * front + weights[j] * back
            except KeyError:
                warn(f'warning: {subsequence[j]} or {subsequence[window - j - 1]} is not a standard amino acid.')
        middle = subsequence[window // 2]
        if middle in param_dict:
            # NOTE(review): the lookups above index param_dict[x][y], so
            # param_dict[middle] is presumably a mapping rather than a number
            # and this addition would fail -- confirm.
            score += param_dict[middle]
        else:
            warn(f'warning: {middle} is not a standard amino acid.')
        scores.append(score / sum_of_weights)
    return scores
def GetExec(seqRec, frSize):
    """Return summary ProtParam values of ``seqRec`` as two parallel lists.

    The result is ``[labels, values]``: molecular weight, aromaticity,
    instability index, mean of ``flexibility()`` and pI, each quantized to
    two decimals and rendered as a string. ``frSize`` is accepted but not
    used here.
    """
    # Calculate protParamData
    pa = ProtParam.ProteinAnalysis(str(seqRec))
    d = Decimal(10)**-2
    flexList = pa.flexibility()
    lenf = len(flexList) * 1.
    flexSum = sum(flexList)

    # (label, callable) pairs evaluated in order, so the mean is only
    # computed when its row is reached.
    entries = (
        ("Mol. Weight:", pa.molecular_weight),
        ("Aromaticity:", pa.aromaticity),
        ("Instability:", pa.instability_index),
        ("Avg. Flexibility:", lambda: flexSum / lenf / 1.),
        ("pI:", pa.isoelectric_point),
    )
    labels = []
    values = []
    for label, compute in entries:
        labels.append(label)
        values.append(str(Decimal(compute()).quantize(d)))
    # An "Avg. Hydropathy:" row (protein_scale with ProtParamData.kd) was
    # disabled by the original author.
    return [labels, values]
def calculate_property(seq_path):
    """Compute primary-sequence properties for every record of a FASTA file.

    For each record, flanking ``*`` characters are stripped and molecular
    weight, instability index, GRAVY and theoretical pI are computed with
    ``ProtParam.ProteinAnalysis``. A property whose calculation raises is
    stored as the string "NA".

    Returns a ``pandas.DataFrame`` with the columns SeqID, molecular_weight,
    instability_index, GRAVY and theoretical_pI, one row per record (empty,
    with the same columns, when the file has no records).
    """
    columns = [
        "SeqID", "molecular_weight", "instability_index", "GRAVY",
        "theoretical_pI"
    ]
    func_dict = {
        "molecular_weight": ProtParam.ProteinAnalysis.molecular_weight,
        "instability_index": ProtParam.ProteinAnalysis.instability_index,
        "GRAVY": ProtParam.ProteinAnalysis.gravy,
        "theoretical_pI": ProtParam.ProteinAnalysis.isoelectric_point
    }
    # Rows are collected first and the frame is built once:
    # DataFrame.append was removed in pandas 2.0 and copied the whole frame
    # on every call.
    rows = []
    for seq in SeqIO.parse(seq_path, "fasta"):
        protein_seq = str(seq.seq).strip("*")
        protein_result = ProtParam.ProteinAnalysis(protein_seq)
        row = {"SeqID": seq.id}
        for key, prot_func in func_dict.items():
            try:
                row[key] = prot_func(protein_result)
            except Exception:
                # Best-effort per property, without trapping
                # KeyboardInterrupt / SystemExit.
                row[key] = "NA"
        rows.append(row)
    return pd.DataFrame(rows, columns=columns)
def calculate_potential(fasta, strand, outfile):
    '''
    Compute per-transcript features and predict coding potential with LIBSVM.

    For every record of ``fasta`` the function derives the peptide length of
    the longest ORF, the Fickett score of the full transcript, the value
    returned by ``protein_param`` (used as the pI) and the ORF integrity, and
    writes them to ``outfile + ".feat"`` (tab separated) and
    ``outfile + ".tmp.1"`` (LIBSVM format). It then runs a shell pipeline of
    svm-scale, svm-predict, awk and paste that appends the feature table plus
    probability and label to ``outfile``, which already holds the header
    line.

    Parameters: ``fasta`` is the input FASTA path, ``strand`` is passed
    through to ``FindCDS(...).longest_orf``, ``outfile`` is the result path
    and the prefix of the temporary files.

    Returns 0 on success and -1 when the pipeline exits with a non-zero
    status; in the failure case the ``.feat`` file is left in place.
    '''
    strinfoAmbiguous = re.compile("X|B|Z|J|U", re.I)
    ptU = re.compile("U", re.I)
    # NOTE(review): the three handles are opened without a context manager,
    # so they stay open if the feature loop raises.
    ftmp_feat = open(outfile + ".feat", "w")
    ftmp_svm = open(outfile + ".tmp.1", "w")
    ftmp_result = open(outfile, "w")
    # The result file gets only its header here; the data rows are appended
    # by the shell pipeline further down.
    ftmp_result.write("\t".join(
        map(str, [
            "#ID", "transcript_length", "peptide_length", "Fickett_score",
            "pI", "ORF_integrity", "coding_probability", "label"
        ])) + "\n")
    ftmp_result.close()
    fickett_obj = Fickett()
    for seq in seqio.fasta_read(fasta):
        seqid = seq.id
        # Work on a DNA alphabet: U -> T, then upper-case.
        seqRNA = ptU.sub("T", str(seq.seq).strip())
        '''seqRNA:transcript full sequence'''
        seqRNA = seqRNA.upper()
        seqCDS, start_pos, orf_strand, orf_fullness = FindCDS(
            seqRNA).longest_orf(strand)
        '''seqCDS:longest ORF'''
        seqprot = mRNA_translate(seqCDS)
        # Length of the translation as returned, "*" characters included.
        pep_len = len(seqprot)  # pep_len = len(seqprot.strip("*"))
        newseqprot = strinfoAmbiguous.sub("", str(seqprot))
        '''exclude ambiguous amio acid X, B, Z, J, Y in peptide sequence'''
        # NOTE(review): the regex above removes X, B, Z, J and U; the "Y" in
        # the string statement above does not match it.
        # The Fickett score is taken over the whole transcript, not the ORF.
        fickett_score = fickett_obj.fickett_value(seqRNA)
        protparam_obj = ProtParam.ProteinAnalysis(str(newseqprot.strip("*")))
        if pep_len > 0:
            # fickett_score = fickett_obj.fickett_value(seqCDS)
            isoelectric_point = protein_param(protparam_obj)
        else:
            # No peptide: mark the ORF integrity as -1 and use a pI of 0.0.
            # fickett_score = 0.0
            orf_fullness = -1
            isoelectric_point = 0.0
        ftmp_feat.write("\t".join(
            map(str, [
                seqid,
                len(seqRNA), pep_len, fickett_score, isoelectric_point,
                orf_fullness
            ])) + "\n")
        # LIBSVM line: the literal label "999" followed by features 1-4.
        ftmp_svm.write("".join(
            map(str, [
                "999", " 1:", pep_len, " 2:", fickett_score, " 3:",
                isoelectric_point, " 4:", orf_fullness
            ])) + "\n")
    ftmp_feat.close()
    ftmp_svm.close()
    # return 0
    # calculate the coding probability using LIBSVM
    sys.stderr.write("\n[INFO] Predicting coding potential, please wait ...\n")
    # set directories and check that the required tools exist
    data_dir = os.path.join(
        os.path.dirname(pkg_resources.resource_filename(
            "CPC2", "__init__.py")), "data") + os.path.sep
    lib_dir = os.path.join(
        os.path.dirname(pkg_resources.resource_filename("CPC2",
                                                        "__init__.py")),
        "libs")
    app_svm_scale = os.path.join(lib_dir, "libsvm/libsvm-3.18/svm-scale")
    app_svm_predict = os.path.join(lib_dir, "libsvm/libsvm-3.18/svm-predict")
    # These checks only print a message to stderr; they do not stop the run.
    sp.call(
        'test -x ' + app_svm_scale +
        ' || echo \"[ERROR] No excutable svm-scale on CPC2 path!\" > /dev/stderr',
        shell=True)
    sp.call(
        'test -x ' + app_svm_predict +
        ' || echo \"[ERROR] No excutable svm-predict on CPC2 path!\" > /dev/stderr',
        shell=True)
    model = os.path.join(data_dir, 'cpc2.range')
    # NOTE(review): the pipeline is one shell string built by concatenation,
    # so an `outfile` containing spaces or shell metacharacters breaks it (or
    # is interpreted by the shell).
    # Step 1: scale the features with the stored range -> .tmp.2
    cmd = app_svm_scale + ' -r ' + model + ' ' + outfile + '.tmp.1 > ' + outfile + '.tmp.2 &&'
    # Step 2: predict with probability estimates; .tmp.1 is overwritten with
    # the predictions.
    cmd = cmd + app_svm_predict + ' -b 1 -q ' + outfile + '.tmp.2 ' + data_dir + 'cpc2.model ' + outfile + '.tmp.1 &&'
    # Step 3: map the first column (1/0) to coding/noncoding next to the
    # second column -> .tmp.2
    cmd = cmd + 'awk -vOFS="\\t" \'{if ($1 == 1){print $2,"coding"} else if ($1 == 0){print $2,"noncoding"}}\' ' + outfile + '.tmp.1 > ' + outfile + '.tmp.2 &&'
    # Step 4: join features and predictions and append them to the result.
    cmd = cmd + 'paste ' + outfile + '.feat ' + outfile + '.tmp.2 >>' + outfile
    command = sp.Popen(cmd, shell=True, stdout=sp.PIPE, stderr=sp.PIPE)
    (outtext, errtext) = command.communicate()
    exitstatus = command.returncode
    # The temporary LIBSVM files are removed whatever the outcome.
    os.system('rm -f ' + outfile + '.tmp.1 ' + outfile + '.tmp.2')
    if exitstatus == 0:
        rm_cmd = "rm -f " + outfile + '.feat'
        sp.getoutput(rm_cmd)
        sys.stderr.write("\n[INFO] Running Done!\n")
        return 0
    else:
        sys.stderr.write(
            "\n[ERROR] Prediction error! Exit code: {}\n".format(exitstatus))
        sys.stderr.write(outtext.decode())
        sys.stderr.write(errtext.decode())
        return -1
def setUp(self):
    """Store the reference protein sequence and a ProteinAnalysis built from it."""
    sequence = "MAEGEITTFTALTEKFNLPPGNYKKPKLLYCSNGGHFLRILPDGTVDGTRDRSDQHIQLQLSAESVGEVYIKSTETGQYLAMDTSGLLYGSQTPSEECLFLERLEENHYNTYTSKKHAEKNWFVGLKKNGSCKRGPRTHYGQKAILFLPLPV"
    self.seq_text = sequence
    self.analysis = ProtParam.ProteinAnalysis(sequence)
""" Module to map info on tree """ # Load libraries import sys from Bio import SeqIO from Bio.SeqUtils import ProtParam # Load ancestral sequences node_dict = {} handle = open(sys.argv[1], "rU") for record in SeqIO.parse(handle, "fasta"): sequence = record.seq.tostring() sequence = sequence.replace("-", "") analysed_protein = ProtParam.ProteinAnalysis(sequence) # Compute some properties pI = analysed_protein.isoelectric_point() MW = analysed_protein.molecular_weight() # print record.id, pI, MW node_dict[record.id] = pI handle.close() # Load tree tree_tab = [] tree_file = open(sys.argv[2], "r") while 1: line = tree_file.readline() if line == "":
def test_get_monoisotopic_molecular_weight(self):
    """Test calculating the monoisotopic molecular weight"""
    monoisotopic_analysis = ProtParam.ProteinAnalysis(self.seq_text, monoisotopic=True)
    self.analysis = monoisotopic_analysis
    weight = monoisotopic_analysis.molecular_weight()
    self.assertAlmostEqual(weight, 17092.53)