Пример #1
0
def get_pI(seq):
    """Return the isoelectric point of the peptide translated from ``seq``.

    ``seq`` is passed to ``mRNA_translate``. Every occurrence of the letters
    X, B, Z, J and U (either case) and every ``*`` is removed from the result
    before it is handed to ``ProtParam.ProteinAnalysis``.
    """
    peptide = str(mRNA_translate(seq))
    ambiguous_letters = re.compile("X|B|Z|J|U", re.I)
    cleaned = ambiguous_letters.sub("", peptide).replace("*", "")
    analysis = ProtParam.ProteinAnalysis(cleaned)
    return analysis.isoelectric_point()
Пример #2
0
def get_Mw(seq):
    """Return the molecular weight of the peptide translated from ``seq``.

    The translation produced by ``mRNA_translate`` is stripped of the letters
    X, B, Z, J and U (case-insensitively) and of ``*`` characters; the rest is
    analysed with ``ProtParam.ProteinAnalysis``.
    """
    translated = str(mRNA_translate(seq))
    ambiguity_pattern = re.compile("X|B|Z|J|U", re.I)
    without_ambiguous = ambiguity_pattern.sub("", translated)
    without_stops = without_ambiguous.replace("*", "")
    return ProtParam.ProteinAnalysis(without_stops).molecular_weight()
Пример #3
0
def get_gravy(seq):
    """Return the GRAVY value of the peptide translated from ``seq``.

    The output of ``mRNA_translate`` has the letters X, B, Z, J and U (any
    case) and all ``*`` characters removed before ``ProteinAnalysis.gravy()``
    is evaluated on what remains.
    """
    protein_text = str(mRNA_translate(seq))
    protein_text = re.compile("X|B|Z|J|U", re.I).sub("", protein_text)
    protein_text = protein_text.replace("*", "")
    analyser = ProtParam.ProteinAnalysis(protein_text)
    return analyser.gravy()
Пример #4
0
 def DoPrint(self):
     """Re-render the sequence text, colouring each frame by its scale value.

     The current text control is hidden and destroyed, ``self.SetText()`` is
     called (presumably it recreates ``self.seqText`` -- verify), and the
     sequence in ``self.seqRec`` is appended frame by frame, ``frSize``
     residues at a time. A frame whose value is below ``self.llim`` gets the
     colours of ``self.lButton``; a frame above ``self.hlim`` gets the colours
     of ``self.hButton``. Residues left after the last processed frame are
     appended unstyled.
     """
     self.seqText.Show(False)
     self.seqText.Destroy()
     frVal = self.frSize.GetValue()
     self.SetText()
     maxLen = len(self.seqRec)
     ch = self.hButton.GetBackgroundColour()
     cl = self.lButton.GetBackgroundColour()
     fh = self.hButton.GetForegroundColour()
     fl = self.lButton.GetForegroundColour()
     i = 0
     # NOTE(review): the strict "<" leaves a final full-length frame unstyled
     # (it is appended as plain text below) -- confirm whether intended.
     while i < maxLen - frVal:
         frame = str(self.seqRec[i:(i + frVal)])
         if frVal == 1:
             # Fresh one-element list per residue. The previous code kept
             # appending to one list, so vals[0] always held the value of
             # the first residue and every frame was coloured like it.
             vals = [self.ppd[str(self.seqRec[i])]]
         else:
             protAnal = ProtParam.ProteinAnalysis(frame)
             vals = protAnal.protein_scale(self.ppd, frVal)
         self.seqText.AppendText(frame)
         if vals[0] < self.llim:
             self.seqText.SetStyle(i, (i + frVal), wx.TextAttr(fl, cl))
         elif vals[0] > self.hlim:
             self.seqText.SetStyle(i, (i + frVal), wx.TextAttr(fh, ch))
         i += frVal
     # Append whatever is left, including a single trailing residue (the old
     # "i < maxLen - 1" test silently dropped it).
     if i < maxLen:
         self.seqText.AppendText(str(self.seqRec[i:maxLen]))
Пример #5
0
def get_instablility_index(seq):
    """Return the instability index of the peptide translated from ``seq``.

    The translation from ``mRNA_translate`` is cleaned of the letters X, B, Z,
    J and U (case-insensitive) and of ``*`` before being analysed with
    ``ProtParam.ProteinAnalysis``.
    """
    raw_peptide = str(mRNA_translate(seq))
    ambiguous = re.compile("X|B|Z|J|U", re.I)
    usable_peptide = ambiguous.sub("", raw_peptide).replace("*", "")
    return ProtParam.ProteinAnalysis(usable_peptide).instability_index()
Пример #6
0
def get_basic_properties_features(seq):
    '''
    Collect basic physical properties of a protein sequence, as provided by
    the BioPython/ExPASy ProtParam module.

    Returns a dict with the keys 'PI', 'Molecular_Weight', 'GRAVY' and
    'Aromaticity' (the last three rounded to 4 decimals; 'PI' is left
    unrounded), updated with whatever ``GetAliphaticness(seq)`` and
    ``get_netCharge(seq)`` return.

    Note: these methods all assume a standard amino-acid alphabet.
    Warning: the returned PI is inaccurate for a parsed sub-sequence with
    its tail(s) removed, because ``isoelectric_point`` assumes N and C
    termini.
    '''
    analysis = pp.ProteinAnalysis(seq)  # ProtParam analysis object from str

    iso_point = analysis.isoelectric_point()
    weight = analysis.molecular_weight()
    hydropathy = analysis.gravy()
    aromatic_fraction = analysis.aromaticity()
    aliphatic_features = GetAliphaticness(seq)
    charge_features = get_netCharge(seq)

    # GetFlex-based flexibility features are deliberately left out; the
    # original author noted a problem with empty windows.
    features = {'PI': iso_point}
    features['Molecular_Weight'] = round(weight, 4)
    features['GRAVY'] = round(hydropathy, 4)
    features['Aromaticity'] = round(aromatic_fraction, 4)
    features.update(aliphatic_features)
    features.update(charge_features)
    return features
Пример #7
0
def write_weka_input(weka_input, SHORT_IDENTIFIERS, SEQUENCES, pepstats_dic):
    """ Function: write_weka_input()

        Purpose:  Write the WEKA arff input file from the query identifiers,
                  their sequences and the pepstats-calculated features.
                  Each data line holds the comma-terminated features of one
                  protein followed by '?'.

        Input:    WEKA arff file name, query identifiers, the matching
                  sequences and the pepstats dictionary (keyed by the
                  identifier with '>' removed and whitespace stripped).

        Return:   None.
    """
    with open(weka_input, 'w') as arff:
        # One feature list per identifier; entries without a paired sequence
        # stay empty and are written as a bare '?' line.
        feature_rows = [[] for __ in range(len(SHORT_IDENTIFIERS))]

        for row_index, (raw_id, sequence) in enumerate(zip(SHORT_IDENTIFIERS, SEQUENCES)):
            lookup_key = raw_id.replace('>', '').strip()

            # Only charge, pI, classes and frequencies are used below.
            (_molecular_weight, charge, isoelectric, amino_acid_classes,
             amino_acid_frequencies, _length) = pepstats_dic[lookup_key]

            prot = ProtParam.ProteinAnalysis(sequence.replace('*', ''))

            feature_rows[row_index] = (
                [charge, isoelectric]
                + amino_acid_classes
                + amino_acid_frequencies
                + [GRAVY(sequence)]
                + [prot.aromaticity(), prot.instability_index()]
                + [sequence.count('C')]
            )

        # Header first, then one line per protein.
        arff.writelines(ARFF_HEADER)
        for row in feature_rows:
            arff.write("".join(str(feature) + ',' for feature in row))
            arff.write('?\n')

    return
Пример #8
0
 def mass( self ):
     """
     Molecular weight of ``self.sequence`` as computed by ProtParam.

     Best-effort: an empty/missing sequence, or any error raised while
     analysing it, yields 0.0 instead of an exception.

     @return float, ProtParam molecular weight
     """
     if not self.sequence:
         return 0.0
     try:
         return PP.ProteinAnalysis(self.sequence).molecular_weight()
     except Exception:
         # Was a bare "except:", which also swallowed KeyboardInterrupt and
         # SystemExit; ordinary analysis errors still map to 0.0.
         return 0.0
Пример #9
0
 def test_get_monoisotopic_molecular_weight_identical(self):
     """Monoisotopic weight from ProteinAnalysis must match Bio.SeqUtils.molecular_weight."""
     self.analysis = ProtParam.ProteinAnalysis(self.seq_text, monoisotopic=True)
     from_analysis = self.analysis.molecular_weight()
     protein_seq = Seq(self.seq_text, IUPAC.protein)
     from_sequtils = molecular_weight(protein_seq, monoisotopic=True)
     self.assertAlmostEqual(from_analysis, from_sequtils)
Пример #10
0
 def isoelectric( self ):
     """
     Isoelectric point of ``self.sequence`` as computed by ProtParam,
     rounded to two decimals.

     Best-effort: an empty/missing sequence, or any error raised while
     analysing it, yields 0.0 instead of an exception.

     @return float, ProtParam iso-electric point
     """
     if not self.sequence:
         return 0.0
     try:
         r = PP.ProteinAnalysis(self.sequence).isoelectric_point()
         return round(r, 2)
     except Exception:
         # Was a bare "except:", which also swallowed KeyboardInterrupt and
         # SystemExit; ordinary analysis errors still map to 0.0.
         return 0.0
Пример #11
0
def prot():
    """Summarise the protein stored in ``tool/media/query.txt``.

    Only the last FASTA record of the file is analysed. If the file yields
    no record, ``res`` is never bound and the function fails with
    ``UnboundLocalError``.

    Returns the tuple ``(res, count, m, a, i, c, p, mc, hel, shet, turn,
    ad, fi, kn, ps, ty)``:

    * ``res`` / ``count`` -- the sequence string and its length;
    * ``m`` -- molecular weight divided by 1000, rounded to 2 decimals;
    * ``a``, ``i``, ``p`` -- aromaticity, instability index and isoelectric
      point, each rounded to 2 decimals;
    * ``c`` -- "Unstable" when ``i > 40``, otherwise "Stable";
    * ``mc`` -- element 1 of ``molar_extinction_coefficient()``;
    * ``hel``, ``turn``, ``shet`` -- elements 0, 1 and 2 of
      ``secondary_structure_fraction()`` as percentages (note that ``shet``
      precedes ``turn`` in the returned tuple);
    * ``ad`` .. ``ty`` -- five lists of "<label> <percent>%" strings, four
      amino acids each.
    """
    for seq_rec in SeqIO.parse("tool/media/query.txt", "fasta"):
        res = str(seq_rec.seq)
    X = ProtParam.ProteinAnalysis(res)
    count = len(res)

    m = np.round(float("{:.2f}".format(X.molecular_weight())) / 1000, 2)
    a = float("{:.2f}".format(X.aromaticity()))
    i = float("{:.2f}".format(X.instability_index()))
    c = "Unstable" if i > 40 else "Stable"
    p = float("{0:.2f}".format(X.isoelectric_point()))
    mc = X.molar_extinction_coefficient()[1]

    ss_percent = [fraction * 100 for fraction in X.secondary_structure_fraction()]
    hel, turn, shet = (np.round(ss_percent[k], 2) for k in range(3))

    # Percentages in the order the composition dict yields them.
    # NOTE(review): the hard-coded labels assume that order is alphabetical
    # by one-letter code (A, C, D, ... Y) -- confirm.
    percentages = [
        fraction * 100 for fraction in X.get_amino_acids_percent().values()
    ]
    rounded_rows = [
        [np.round(float(value), 2) for value in percentages[start:start + 4]]
        for start in range(0, len(percentages), 4)
    ]
    label_rows = (
        ["A (Ala):", "C (Cys):", "D (Asp):", "E (Glu):"],
        ["F (Phe):", "G (Gly):", "H (His):", "I (Ile):"],
        ["K (Lys):", "L (Leu):", "M (Met):", "N (Asn):"],
        ["P (Pro):", "Q (Gln):", "R (Arg):", "S (Ser):"],
        ["T (Thr):", "V (Val):", "W (Trp):", "Y (Tyr):"],
    )
    formatted_rows = []
    for row_number, labels in enumerate(label_rows):
        row_values = rounded_rows[row_number]
        formatted_rows.append([
            label + " " + str(value) + "%"
            for value, label in zip(row_values, labels)
        ])
    ad, fi, kn, ps, ty = formatted_rows

    return res, count, m, a, i, c, p, mc, hel, shet, turn, ad, fi, kn, ps, ty
Пример #12
0
def GetExec(seqRec, frSize):
    """Return the amino-acid composition of ``seqRec`` as two parallel lists.

    The first list holds the keys of ``get_amino_acids_percent()``; the
    second holds the matching values as strings quantized to two decimal
    places. ``frSize`` is accepted but not used.
    """
    composition = ProtParam.ProteinAnalysis(str(seqRec)).get_amino_acids_percent()
    residues = list(composition)
    formatted = [
        str(Decimal(composition[residue]).quantize(Decimal(10)**-2))
        for residue in residues
    ]
    return [residues, formatted]
Пример #13
0
 def test_get_monoisotopic_molecular_weight_identical(self):
     """Monoisotopic weight from ProteinAnalysis must match Bio.SeqUtils.molecular_weight."""
     # As the original author noted, this check is of limited value:
     # ProteinAnalysis.molecular_weight delegates to SeqUtils.molecular_weight.
     self.analysis = ProtParam.ProteinAnalysis(self.seq_text, monoisotopic=True)
     from_analysis = self.analysis.molecular_weight()
     sequence = Seq(self.seq_text)
     from_sequtils = molecular_weight(sequence, seq_type="protein", monoisotopic=True)
     self.assertAlmostEqual(from_analysis, from_sequtils)
Пример #14
0
def mito_classifier_allwindows(pepstats_dic_aas, pepstats_dic_aas_short, IDENTIFIERS, SEQUENCES, TMP_PATH, WEKA_PATH, SCRIPT_PATH):
    """Classify proteins as mitochondrial or not with the WEKA SMO model.

    Writes ``TMP_PATH + 'mito.arff'`` with one feature line per identifier,
    runs ``java -cp WEKA_PATH weka.classifiers.functions.SMO`` on it with
    the model ``SCRIPT_PATH + '/MODEL_FILES/MITOCHONDRIA_NOTMITOCHONDRIA.model'``,
    stores WEKA's standard output in ``TMP_PATH + 'Mitochondria_Predictions.txt'``
    and returns what ``parse_weka_output`` makes of that file.

    Parameters:
        pepstats_dic_aas, pepstats_dic_aas_short: dicts keyed by the cleaned
            identifier ('>' removed, stripped), each giving a 5-tuple
            (molecular weight, charge, pI, amino-acid classes, amino-acid
            frequencies).
        IDENTIFIERS, SEQUENCES: parallel sequences of ids and protein strings.
        TMP_PATH: prefix for the temporary files (concatenated as-is, so it
            must end with a path separator).
        WEKA_PATH: Java class path for WEKA.
        SCRIPT_PATH: directory containing ``MODEL_FILES``.

    On a non-zero WEKA exit status, or any other error while launching it,
    an error message is printed and the process exits with status 1.
    """
    weka = TMP_PATH + 'mito.arff'

    with open(weka, 'w') as f:

        # Create a list of features for each protein
        X = [[] for __ in range(len(IDENTIFIERS))]

        for protein_position, TARGET_ID in enumerate(IDENTIFIERS):
            TARGET_ID = TARGET_ID.replace('>', '').strip()
            sequence = SEQUENCES[protein_position]

            # Molecular weight is unpacked but not used as a feature.
            (molecular_weight, charge, isoelectric,
             amino_acid_classes, amino_acid_frequencies) = pepstats_dic_aas[TARGET_ID]

            prot = ProtParam.ProteinAnalysis(sequence.replace('*', ''))

            (molecular_weight_short, charge_short, isoelectric_short,
             amino_acid_classes_short, amino_acid_frequencies_short) = pepstats_dic_aas_short[TARGET_ID]

            gravy = GRAVY(sequence)
            # Computed once per protein (previously three separate calls).
            fractions = prot.secondary_structure_fraction()

            X[protein_position] = (
                [charge, isoelectric]
                + amino_acid_classes
                + amino_acid_frequencies
                + [gravy]
                + [fractions[0], fractions[1], fractions[2], prot.aromaticity()]
                + [charge_short, isoelectric_short]
                + amino_acid_frequencies_short
            )

        f.writelines(parameters.ARFF_MITOCHONDRIA_HEADER)
        for vector in X:
            for feature in vector:
                f.write(str(feature) + ',')
            f.write('?\n')

    ParamList = ['java', '-cp', WEKA_PATH, 'weka.classifiers.functions.SMO',
                 '-l', SCRIPT_PATH + '/MODEL_FILES/MITOCHONDRIA_NOTMITOCHONDRIA.model',
                 '-T', weka, '-p', 'first-last']

    with open(TMP_PATH + 'Mitochondria_Predictions.txt', 'wb') as out:
        try:
            # stdout goes straight to the file and stderr is inherited, so
            # there is nothing to read back; waiting for the exit status is
            # enough. (The old communicate() always returned (None, None),
            # which made its stdout/stderr branches unreachable.)
            Process = subprocess.Popen(ParamList, shell=False, stdout=out)
            Process.wait()

            if Process.returncode:
                raise Exception("Calling WEKA returned %s" % Process.returncode)
        except Exception as e:
            # Was a bare "except:", which also trapped KeyboardInterrupt.
            print("Error calling WEKA: %s" % e)
            sys.exit(1)

    file_input = TMP_PATH + 'Mitochondria_Predictions.txt'
    predicted_mito = parse_weka_output(file_input, IDENTIFIERS, SEQUENCES, 'Mitochondria', 'Non-Mitochondria')

    return predicted_mito
 def compute_params(self):
     """Compute sliding-window ProtParam scales for ``self.sequence``.

     Spaces and upper-case 'X' characters are first removed from
     ``self.sequence`` (the attribute is updated in place).
     ``self.properties`` is then rebuilt as a dict holding, for each of the
     scales below, the result of ``protein_scale(scale, window=9, edge=.4)``.
     Returns ``self`` so calls can be chained.

     Scales (key -> ProtParamData table):
       kd   -- Kyte & Doolittle hydrophobicity, J. Mol. Biol. 157:105-132 (1982)
       Flex -- normalized flexibility parameters (B-values), Vihinen M.,
               Torkkila E., Riikonen P., Proteins 19(2):141-9 (1994)
       hw   -- Hopp & Wood hydrophilicity, PNAS 78:3824-3828 (1981)
       em   -- surface accessibility, Vergoten G & Theophanides T,
               Biomolecular Structure and Dynamics, pg. 138 (1997)
       ja   -- Janin interior-to-surface transfer energy scale
     """
     self.sequence = self.sequence.replace(' ', '').replace('X', '')
     analysis = ProtParam.ProteinAnalysis(self.sequence)
     self.properties = {}
     for scale_name in ('kd', 'Flex', 'hw', 'em', 'ja'):
         scale_table = getattr(ProtParamData, scale_name)
         self.properties[scale_name] = analysis.protein_scale(
             scale_table, window=9, edge=.4)
     # DIWV is not included: it requires a modified scoring routine.
     return self
Пример #16
0
def prot():
    """Summarise the protein stored in ``media/query.txt``.

    Only the last FASTA record of the file is analysed. If the file yields
    no record, ``res`` is never bound and the function fails with
    ``UnboundLocalError``.

    Returns ``(res, count, m, a, i, c, p, mc)``: the sequence and its
    length, then molecular weight, aromaticity, instability index
    (each rounded to 2 decimals), the label "Instable" when the index
    exceeds 40 and "Stable" otherwise, the isoelectric point rounded to
    2 decimals, and element 1 of ``molar_extinction_coefficient()``.
    """
    for seq_rec in SeqIO.parse("media/query.txt", "fasta"):
        res = str(seq_rec.seq)
    X = ProtParam.ProteinAnalysis(res)

    m = float("{0:.2f}".format(X.molecular_weight()))
    a = float("{0:.2f}".format(X.aromaticity()))
    i = float("{0:.2f}".format(X.instability_index()))
    c = "Instable" if i > 40 else "Stable"
    p = float("{0:.2f}".format(X.isoelectric_point()))
    mc = X.molar_extinction_coefficient()[1]
    return res, len(res), m, a, i, c, p, mc
Пример #17
0
def calc_region_charges(seq, cur_window):
    """Compute the isoelectric point of successive windows of a sequence.

    A window of ``cur_window`` residues is taken at every start position
    from 0 up to, but not including, ``len(seq) - cur_window``; the list of
    ``IsoelectricPoint(...).pi()`` values, one per window, is returned.

    Side effect: ``IsoelectricPoint.pKcterminal`` and
    ``IsoelectricPoint.pKnterminal`` are replaced with empty dicts at module
    level, because these internal regions should not get C- and N-terminal
    charges. The replacement is not undone afterwards.
    """
    IsoelectricPoint.pKcterminal = {}
    IsoelectricPoint.pKnterminal = {}
    region_charges = []
    for start in range(len(seq) - cur_window):
        window_seq = seq[start:start + cur_window]
        composition = ProtParam.ProteinAnalysis(str(window_seq)).count_amino_acids()
        calculator = IsoelectricPoint.IsoelectricPoint(window_seq, composition)
        region_charges.append(calculator.pi())
    return region_charges
Пример #18
0
def param(seq):
    """Return ``(Instability_index, PI, Gravy)`` for the longest ORF of ``seq``.

    ``seq`` is stripped, has every U/u replaced by T and is upper-cased.
    The longest ORF (start ATG; stops TAA, TAG, TGA) found by ``ExtractORF``
    is translated with ``mRNA_translate``. The letters X, B, Z, J and U
    (any case) and the leading/trailing ``*`` are removed from the peptide
    before it is analysed. When the translated peptide, with its
    leading/trailing ``*`` stripped, is empty, ``(0.0, 0.0, 0.0)`` is
    returned instead of calling ``protein_param``.
    """
    ambiguous_letters = re.compile("X|B|Z|J|U", re.I)
    uracil = re.compile("U", re.I)
    transcript = uracil.sub("T", str(seq).strip()).upper()
    CDS_size1, CDS_integrity, coding_seq = ExtractORF(transcript).longest_ORF(
        start=['ATG'], stop=['TAA', 'TAG', 'TGA'])
    peptide = mRNA_translate(coding_seq)
    pep_len = len(peptide.strip("*"))
    cleaned = ambiguous_letters.sub("", str(peptide))
    analysis = ProtParam.ProteinAnalysis(str(cleaned.strip("*")))
    if not pep_len:
        return (0.0, 0.0, 0.0)
    instability, iso_point, gravy = protein_param(analysis)
    return (instability, iso_point, gravy)
Пример #19
0
def compute_aa_composition(protein_sequence: str) -> dict:
    """
    Compute the amino-acid composition of a protein sequence.

    Parameters
    ----------
    protein_sequence: str
        sequence of the protein to analyse (converted with ``str`` first)

    Returns
    -------
    aa_composition: dict
        the mapping returned by ``ProteinAnalysis.get_amino_acids_percent()``,
        i.e. the relative abundance of each amino acid
    """
    sequence_text = str(protein_sequence)
    return ProtParam.ProteinAnalysis(sequence_text).get_amino_acids_percent()
Пример #20
0
    def stats(self, sequences):
        '''Print ProtParam-style statistics for every chain in ``sequences``.

        For each key of ``sequences`` (used as the chain id) the mapped
        sequence is analysed with ``ProtParam.ProteinAnalysis`` and a report
        is printed to standard output: per-residue counts and percentages,
        molecular mass, isoelectric point and the extinction coefficients
        returned by ``self.get_extintion_coef``. Nothing is written to disk
        (the file output below is commented out). Returns ``None``.

        Based on the ProtParam tool from ExPASy. Uses Python 2 ``print``
        statements.
        '''

        from Bio.SeqUtils import ProtParam

        #output_filename = self.structure_id + "_stats.dat"

        #stats_file = open(output_filename, "w")

        for chain in sequences.keys():
            print "*************************"
            print "**  ", self.structure_id, "- Chain", chain, "   **"
            print "*************************"

            seq_stats = ProtParam.ProteinAnalysis(sequences[chain])

            ## Printing the aa count...
            print "Amino acids counts and percents"
            total_aa = 0
            # Iterating the two dicts yields their keys; only ``aa`` is used,
            # ``percent`` is ignored and the values are looked up again below.
            for aa, percent in zip(seq_stats.count_amino_acids(),
                                   seq_stats.get_amino_acids_percent()):
                print aa, ":", seq_stats.count_amino_acids()[aa], "(", round(
                    seq_stats.get_amino_acids_percent()[aa] * 100, 2), "% )"
                total_aa += seq_stats.count_amino_acids()[aa]
            print "TOTAL:", total_aa

            # NOTE(review): the weight is divided by 100 but printed as kDa.
            # If molecular_weight() returns Daltons (presumably), kDa would
            # need / 1000. The same value is passed to get_extintion_coef
            # below, so confirm what that helper expects before changing it.
            molar_mass = seq_stats.molecular_weight() / 100

            print "\nMolecular mass:", molar_mass, "kDa"

            print "\nIsoelectric point:", round(seq_stats.isoelectric_point(),
                                                2)

            # The helper returns four values: two extinction coefficients and
            # two "Abs 0.1%" values, each with and without cystines.
            extintion_coef_Cyst, extintion_coef_noCyst, molar_extintion_coef_Cyst, molar_extintion_coef_noCyst = self.get_extintion_coef(
                seq_stats, molar_mass)

            print "\nExtintion coefficient (Cystines) =", extintion_coef_Cyst, "M^-1*cm^-1"
            print "Extintion coefficient (no Cystines) =", extintion_coef_noCyst, "M^-1*cm^-1"

            print "\nMolar extintion coefficient (Cystines) [Abs 0.1% (=1 g/l)] =", molar_extintion_coef_Cyst
            print "Molar extintion coefficient (no Cystines) [Abs 0.1% (=1 g/l)] =", molar_extintion_coef_noCyst, "\n"

        return
Пример #21
0
def get_weight(value):
    """Build one bar trace per FASTA record with its molecular weight.

    The FASTA file is the one ``choose_fasta(value)`` points to. For each
    record the id is split on "|" and then ":"; the second ":"-separated
    field of the first part names the chain. Returns the list of ``go.Bar``
    traces, each holding a single bar labelled "Chain <name>".
    """
    traces = []
    with open(choose_fasta(value), "r") as file_fasta:
        for entry in SeqIO.parse(file_fasta, "fasta"):
            chain_fields = entry.id.split("|")[0].split(":")
            weight = pp.ProteinAnalysis(str(entry.seq)).molecular_weight()
            label = "Chain " + chain_fields[1]
            traces.append(go.Bar(x=[label], y=[weight], name=label))
    return traces
Пример #22
0
def get_percentage_aa(value):
    """Build one bar trace per FASTA record with its amino-acid percentages.

    The FASTA file is the one ``choose_fasta(value)`` points to. For each
    record the composition from ``get_amino_acids_percent()`` is converted
    to percentages (x 100) and the keys are mapped through ``seq3``. The
    trace is named "Chain <name>", where <name> is the second
    ":"-separated field of the first "|"-separated part of the record id.
    """
    traces = []
    with open(choose_fasta(value), "r") as file_fasta:
        for record in SeqIO.parse(file_fasta, "fasta"):
            chain_fields = record.id.split("|")[0].split(":")
            composition = pp.ProteinAnalysis(str(record.seq)).get_amino_acids_percent()
            residue_names = [seq3(letter) for letter in composition]
            percentages = [fraction * 100 for fraction in composition.values()]
            traces.append(
                go.Bar(x=residue_names,
                       y=percentages,
                       name="Chain " + chain_fields[1]))
    return traces
Пример #23
0
def part_two():
    """Extend ``data.csv`` with length and secondary-structure fractions.

    Reads every row of ``data.csv`` (columns ``identifier`` and ``sequence``
    are used), prints the list of sequence lengths, and writes
    ``data_extra.csv`` with the columns identifier, sequence, len, sheet,
    turn and helix. The three fractions come from
    ``ProteinAnalysis.secondary_structure_fraction()``, unpacked in the
    order (helix, turn, sheet).

    Both files are opened in ``with`` blocks so they are closed, and the
    output flushed, even if a row fails.
    """
    # Imported before any file is touched, so a missing Biopython cannot
    # leave a truncated output file behind.
    from Bio.SeqUtils import ProtParam

    # newline='' is what the csv module asks for on files it reads or writes.
    with open('data.csv', newline='') as source:
        data = list(csv.DictReader(source))

    l = [len(s['sequence']) for s in data]
    print(l)

    with open('data_extra.csv', 'w', newline='') as target:
        w = csv.DictWriter(target,
                           fieldnames=('identifier', 'sequence', 'len',
                                       'sheet', 'turn', 'helix'))
        w.writeheader()
        for gene in data:
            (h, t, s) = ProtParam.ProteinAnalysis(
                gene['sequence']).secondary_structure_fraction()
            w.writerow({
                'identifier': gene['identifier'],
                'sequence': gene['sequence'],
                'len': len(gene['sequence']),
                'sheet': s,
                'turn': t,
                'helix': h
            })
def mod(sequence):
    """
    Sliding-window score for the DIWV (dipeptide instability weight) scale.

    ``ProtParam.ProteinAnalysis.protein_scale`` cannot be used with
    ``ProtParamData.DIWV``, because that table is indexed by a residue AND
    the residue that follows it, whereas ``protein_scale`` looks up single
    residues. This function follows the same weighting scheme (window of 9,
    edge weight 0.4, middle weight 1) but scores each window position with
    the DIWV value of the dipeptide starting at that position.

    NOTE(review): the earlier version was marked "not implemented" and could
    not run (it indexed past the window and added a nested dict to a float).
    Scoring each position by "residue + next residue" is this rewrite's
    interpretation of the intent -- confirm before relying on the numbers.

    Each window therefore spans ``window + 1`` residues, so a sequence of
    length ``n`` yields ``n - window`` scores (none when ``n <= window``).
    Dipeptides containing a non-standard residue trigger a warning and
    contribute nothing to the score.

    :param sequence: sequence to score
    :type sequence: str
    :return: DIWV scores, one per window
    :rtype: list[float]
    """
    p = ProtParam.ProteinAnalysis(sequence)
    param_dict = ProtParamData.DIWV
    window = 9
    edge = 0.4
    half = window // 2
    weights = p._weight_list(window, edge)
    # Each outer weight is applied twice (front and back); the middle is 1.
    sum_of_weights = sum(weights) * 2 + 1
    scores = []
    for i in range(p.length - window):
        # One residue more than the window, so that every window position,
        # including the last, has a successor to form a dipeptide with.
        subsequence = p.sequence[i:i + window + 1]
        score = 0.0
        for j in range(half):
            # Walk from the outside of the window towards the middle.
            k = window - j - 1
            try:
                front = param_dict[subsequence[j]][subsequence[j + 1]]
                back = param_dict[subsequence[k]][subsequence[k + 1]]
            except KeyError:
                warn(f'warning: {subsequence[j]} or {subsequence[k]} is not a standard amino acid.')
            else:
                score += weights[j] * front + weights[j] * back
        middle = subsequence[half]
        try:
            score += param_dict[middle][subsequence[half + 1]]
        except KeyError:
            warn(f'warning: {middle} is not a standard amino acid.')
        scores.append(score / sum_of_weights)
    return scores
Пример #25
0
def GetExec(seqRec, frSize):
    """Return labelled ProtParam summary values for ``seqRec``.

    The result is ``[labels, values]``: ``labels`` holds "Mol. Weight:",
    "Aromaticity:", "Instability:", "Avg. Flexibility:" and "pI:";
    ``values`` holds the matching numbers as strings quantized to two
    decimals. The average flexibility is the mean of ``flexibility()``;
    when that list is empty the division raises ``ZeroDivisionError``.
    ``frSize`` is accepted but not used.
    """
    analysis = ProtParam.ProteinAnalysis(str(seqRec))
    two_places = Decimal(10)**-2

    def as_text(number):
        # Fixed-point text with two decimals.
        return str(Decimal(number).quantize(two_places))

    flex_values = analysis.flexibility()
    flex_count = len(flex_values) * 1.
    flex_total = 0
    for flex in flex_values:
        flex_total += flex

    # (label, zero-argument callable) pairs, evaluated in display order.
    rows = (
        ("Mol. Weight:", analysis.molecular_weight),
        ("Aromaticity:", analysis.aromaticity),
        ("Instability:", analysis.instability_index),
        ("Avg. Flexibility:", lambda: flex_total / flex_count / 1.),
        ("pI:", analysis.isoelectric_point),
    )
    labels = []
    values = []
    for label, compute in rows:
        labels.append(label)
        values.append(as_text(compute()))
    # An "Avg. Hydropathy:" row (protein_scale with ProtParamData.kd) is
    # intentionally not produced.
    return [labels, values]
def calculate_property(seq_path):
    """Tabulate basic ProtParam properties for every record of a FASTA file.

    For each record, leading/trailing ``*`` characters are stripped from
    the sequence and the molecular weight, instability index, GRAVY and
    theoretical pI are computed. A property whose computation raises is
    recorded as the string "NA".

    Returns a ``pandas.DataFrame`` with the columns SeqID, molecular_weight,
    instability_index, GRAVY and theoretical_pI, one row per record (an
    empty frame with these columns when the file has no records).
    """
    columns = [
        "SeqID", "molecular_weight", "instability_index", "GRAVY",
        "theoretical_pI"
    ]
    func_dict = {
        "molecular_weight": ProtParam.ProteinAnalysis.molecular_weight,
        "instability_index": ProtParam.ProteinAnalysis.instability_index,
        "GRAVY": ProtParam.ProteinAnalysis.gravy,
        "theoretical_pI": ProtParam.ProteinAnalysis.isoelectric_point
    }
    # Rows are collected in a list and the frame is built once at the end:
    # DataFrame.append no longer exists in pandas 2.x, and it copied the
    # whole frame on every call.
    rows = []
    for seq in SeqIO.parse(seq_path, "fasta"):
        protein_seq = str(seq.seq).strip("*")
        protein_result = ProtParam.ProteinAnalysis(protein_seq)
        tmp_dict = {"SeqID": seq.id}
        for key, Prot_func in func_dict.items():
            try:
                tmp_dict[key] = Prot_func(protein_result)
            except Exception:
                # Narrowed from BaseException so Ctrl-C is not swallowed.
                tmp_dict[key] = "NA"
        rows.append(tmp_dict)
    return pd.DataFrame(rows, columns=columns)
Пример #27
0
def calculate_potential(fasta, strand, outfile):
    '''
    Calculate the CPC2 features of every transcript in ``fasta`` and assess
    coding potential with the bundled LIBSVM model.

    For each transcript the longest ORF on ``strand`` is translated, and the
    peptide length, Fickett score (of the full transcript), isoelectric
    point and ORF integrity are written to ``outfile + ".feat"`` and, in
    LIBSVM format, to ``outfile + ".tmp.1"``. ``svm-scale`` and
    ``svm-predict`` are then run through the shell, and the predictions are
    pasted next to the features and appended to ``outfile`` below its header
    line.

    Returns 0 on success (the ``.feat`` file is removed) and -1 when the
    shell pipeline fails (its output is echoed to stderr and the ``.feat``
    file is kept). The two ``.tmp`` files are always removed.

    All paths placed on the shell command line are quoted, so ``outfile``
    and the install location may contain spaces or shell metacharacters.
    '''
    import shlex  # local import: only needed to quote the pipeline's paths

    strinfoAmbiguous = re.compile("X|B|Z|J|U", re.I)
    ptU = re.compile("U", re.I)
    feat_path = outfile + ".feat"
    svm_path_1 = outfile + ".tmp.1"
    svm_path_2 = outfile + ".tmp.2"

    # Context managers guarantee the feature files are closed (and flushed)
    # before the external tools read them, even if a transcript fails.
    with open(feat_path, "w") as ftmp_feat, open(svm_path_1, "w") as ftmp_svm:
        # The result file starts with the header only; the data lines are
        # appended by `paste` at the end of the pipeline below.
        with open(outfile, "w") as ftmp_result:
            ftmp_result.write("\t".join([
                "#ID", "transcript_length", "peptide_length", "Fickett_score",
                "pI", "ORF_integrity", "coding_probability", "label"
            ]) + "\n")
        fickett_obj = Fickett()
        for seq in seqio.fasta_read(fasta):
            seqid = seq.id
            # seqRNA: full transcript sequence, U -> T, upper-cased
            seqRNA = ptU.sub("T", str(seq.seq).strip()).upper()
            # seqCDS: longest ORF
            seqCDS, start_pos, orf_strand, orf_fullness = FindCDS(
                seqRNA).longest_orf(strand)
            seqprot = mRNA_translate(seqCDS)
            pep_len = len(seqprot)  # length is taken before any stripping
            # exclude ambiguous amino acids X, B, Z, J, U from the peptide
            newseqprot = strinfoAmbiguous.sub("", str(seqprot))
            fickett_score = fickett_obj.fickett_value(seqRNA)
            protparam_obj = ProtParam.ProteinAnalysis(
                str(newseqprot.strip("*")))
            if pep_len > 0:
                isoelectric_point = protein_param(protparam_obj)
            else:
                # No peptide at all: flag the ORF as absent.
                orf_fullness = -1
                isoelectric_point = 0.0
            ftmp_feat.write("\t".join(
                map(str, [
                    seqid,
                    len(seqRNA), pep_len, fickett_score, isoelectric_point,
                    orf_fullness
                ])) + "\n")
            ftmp_svm.write("".join(
                map(str, [
                    "999", " 1:", pep_len, " 2:", fickett_score, " 3:",
                    isoelectric_point, " 4:", orf_fullness
                ])) + "\n")

    # calculate the coding probability using LIBSVM
    sys.stderr.write("\n[INFO] Predicting coding potential, please wait ...\n")

    # set directories and check that the required tools exist
    package_dir = os.path.dirname(
        pkg_resources.resource_filename("CPC2", "__init__.py"))
    data_dir = os.path.join(package_dir, "data") + os.path.sep
    lib_dir = os.path.join(package_dir, "libs")

    app_svm_scale = os.path.join(lib_dir, "libsvm/libsvm-3.18/svm-scale")
    app_svm_predict = os.path.join(lib_dir, "libsvm/libsvm-3.18/svm-predict")
    if not os.access(app_svm_scale, os.X_OK):
        sys.stderr.write("[ERROR] No excutable svm-scale on CPC2 path!\n")
    if not os.access(app_svm_predict, os.X_OK):
        sys.stderr.write("[ERROR] No excutable svm-predict on CPC2 path!\n")

    model = os.path.join(data_dir, 'cpc2.range')
    quote = shlex.quote
    awk_program = ('{if ($1 == 1){print $2,"coding"} '
                   'else if ($1 == 0){print $2,"noncoding"}}')
    # scale -> predict (overwrites .tmp.1) -> label (overwrites .tmp.2)
    # -> paste features and labels onto the result file.
    cmd = " && ".join([
        "{} -r {} {} > {}".format(
            quote(app_svm_scale), quote(model), quote(svm_path_1),
            quote(svm_path_2)),
        "{} -b 1 -q {} {} {}".format(
            quote(app_svm_predict), quote(svm_path_2),
            quote(data_dir + 'cpc2.model'), quote(svm_path_1)),
        'awk -vOFS="\\t" {} {} > {}'.format(
            quote(awk_program), quote(svm_path_1), quote(svm_path_2)),
        "paste {} {} >> {}".format(
            quote(feat_path), quote(svm_path_2), quote(outfile)),
    ])
    command = sp.Popen(cmd, shell=True, stdout=sp.PIPE, stderr=sp.PIPE)
    (outtext, errtext) = command.communicate()
    exitstatus = command.returncode

    # Same effect as `rm -f`: a missing or undeletable file is not an error.
    for tmp_path in (svm_path_1, svm_path_2):
        try:
            os.remove(tmp_path)
        except OSError:
            pass

    if exitstatus == 0:
        try:
            os.remove(feat_path)
        except OSError:
            pass
        sys.stderr.write("\n[INFO] Running Done!\n")
        return 0
    else:
        sys.stderr.write(
            "\n[ERROR] Prediction error! Exit code: {}\n".format(exitstatus))
        sys.stderr.write(outtext.decode())
        sys.stderr.write(errtext.decode())
        return -1
Пример #28
0
 def setUp(self):
     """Store the reference protein sequence and a ProteinAnalysis built from it."""
     reference_sequence = "MAEGEITTFTALTEKFNLPPGNYKKPKLLYCSNGGHFLRILPDGTVDGTRDRSDQHIQLQLSAESVGEVYIKSTETGQYLAMDTSGLLYGSQTPSEECLFLERLEENHYNTYTSKKHAEKNWFVGLKKNGSCKRGPRTHYGQKAILFLPLPV"
     self.seq_text = reference_sequence
     self.analysis = ProtParam.ProteinAnalysis(reference_sequence)
Пример #29
0
""" Module to map info on tree """

# Load libraries

import sys
from Bio import SeqIO
from Bio.SeqUtils import ProtParam

# Load ancestral sequences: map each FASTA record id to the isoelectric
# point of its gap-free sequence.
node_dict = {}
# Plain "r" mode: the "U" flag of the former "rU" mode is rejected by
# Python 3.11+. The with-block also closes the file if parsing fails.
with open(sys.argv[1], "r") as handle:
    for record in SeqIO.parse(handle, "fasta"):
        # str(seq) replaces Seq.tostring(), which current Biopython no
        # longer provides.
        sequence = str(record.seq)
        sequence = sequence.replace("-", "")
        analysed_protein = ProtParam.ProteinAnalysis(sequence)

        # Compute some properties
        pI = analysed_protein.isoelectric_point()
        # MW is computed but not stored; the call is kept so that a
        # sequence the weight calculation rejects still fails here.
        MW = analysed_protein.molecular_weight()
        # print record.id, pI, MW
        node_dict[record.id] = pI


# Load tree
tree_tab = []
tree_file = open(sys.argv[2], "r")
while 1:
    line = tree_file.readline()
    if line == "":
Пример #30
0
 def test_get_monoisotopic_molecular_weight(self):
     """Check the monoisotopic molecular weight against the expected 17092.53."""
     monoisotopic_analysis = ProtParam.ProteinAnalysis(self.seq_text,
                                                       monoisotopic=True)
     self.analysis = monoisotopic_analysis
     observed_weight = monoisotopic_analysis.molecular_weight()
     self.assertAlmostEqual(observed_weight, 17092.53)