Пример #1
0
def main():
    """Ask for a protein FASTA file and report parameters of its sequence.

    The file name is read from standard input and handed to ``read_fasta``.
    The sequence, its molecular weight, amino-acid counts and isoelectric
    point are printed, and the same three values are written (one per line)
    to ``Valgu_parameetrid.txt`` in the current working directory.
    """
    fasta = input()
    sequence = read_fasta(fasta)
    print(sequence)
    analysed_seq = ProteinAnalysis(str(sequence))

    # Each value is computed once and reused for the console and the file.
    mol_weight = analysed_seq.molecular_weight()
    print("\n","Molekulaarmass:",mol_weight)
    aa_counts = analysed_seq.count_amino_acids()
    print("\n","Aminohapete arv:",aa_counts)
    iso_point = analysed_seq.isoelectric_point()
    print("\n","Isoelektriline punkt:",iso_point)

    # The context manager closes the file even if a write fails.
    with open("Valgu_parameetrid.txt", "w") as text_file:
        text_file.write(str(mol_weight))
        text_file.write("\n")
        text_file.write(str(aa_counts))
        text_file.write("\n")
        text_file.write(str(iso_point))
Пример #2
0
def protParam(seq):
    """Print a ProtParam-style report for the protein sequence ``seq``.

    Printed, in order: amino-acid counts, amino-acid percentages, molecular
    weight, GRAVY, isoelectric point, aromaticity and an extinction
    coefficient derived from the W, Y and C counts.  Nothing is returned.
    """
    params = ProteinAnalysis(seq)
    mw = params.molecular_weight()
    c_aa = params.count_amino_acids()
    p_aa = params.get_amino_acids_percent()
    gravy = params.gravy()
    aromaticity = params.aromaticity()
    isoelectric_point = params.isoelectric_point()
    # Per-residue contributions used by this code: W=5690, Y=1280, C=120.
    # NOTE(review): the printed label says "Assuming reduced" although a
    # cysteine term is included -- confirm which convention is intended.
    ext_coeff = c_aa["W"] * 5690 + c_aa["Y"] * 1280 + c_aa["C"] * 120

    print("Amino acid count")
    pprint.pprint(c_aa)
    print("Amino acid percent")
    pprint.pprint(p_aa)
    print("Molecular weight")
    print("%f Da"%mw)
    print("Gravy")
    print(gravy)
    print("Isoelectric point")
    print(isoelectric_point)
    print("Aromaticity")
    print(aromaticity)
    print("Extinction coefficient: %d M-1cm-1 (Assuming reduced)"%ext_coeff)
    print("")
def get_protein_analysis(aa):
    """Return a flat list of ``ProteinAnalysis`` descriptors for ``aa``.

    The list holds, in order: molecular weight, aromaticity, instability
    index, isoelectric point and GRAVY, followed by every value returned by
    ``secondary_structure_fraction()``.
    """
    pa = ProteinAnalysis(aa)
    descriptors = [
        pa.molecular_weight(),
        pa.aromaticity(),
        pa.instability_index(),
        pa.isoelectric_point(),
        pa.gravy(),
    ]
    descriptors.extend(pa.secondary_structure_fraction())
    return descriptors
Пример #4
0
def prot_feats_seq(seq):
    """Build a numeric feature vector for one protein sequence.

    The vector is the concatenation of three L2-normalised rows: the
    percentages of the 20 standard amino acids, the output of
    ``twomerFromSeq`` and the output of ``threemerFromSeq``.

    ``molecular_weight()`` is called only for its side effect: per the
    original author's note it raises when the sequence contains 'X', which
    lets callers skip such sequences.
    """
    residues = 'ACDEFGHIKLMNPQRSTVWY'

    def _unit_row(values):
        # L2-normalise ``values`` as a single row and hand back that row.
        scaled = normalize(np.atleast_2d(np.array(values)),
                           norm='l2',
                           copy=True,
                           axis=1,
                           return_norm=False)
        return scaled[0]

    analysis = ProteinAnalysis(str(seq))
    analysis.molecular_weight()  # validity check; the value is not used
    percent = analysis.get_amino_acids_percent()

    features = []
    features.extend(_unit_row([percent[res] for res in residues]))
    features.extend(_unit_row(twomerFromSeq(str(seq))))
    features.extend(_unit_row(threemerFromSeq(str(seq))))
    return np.array(features)
Пример #5
0
def properties(toxin_faa, antitoxin_faa, out):
    """Write pI, weight and instability for the two genes of each locus.

    Args:
        toxin_faa: Path to the toxin protein FASTA file.
        antitoxin_faa: Path to the antitoxin protein FASTA file.
        out: Output prefix; the table goes to ``<out>.properties.txt``.

    Returns:
        The path of the tab-separated file that was written.

    Records for which ``getNameAndPosition`` yields a falsy start are
    skipped.  Sequences that still contain ``*`` after trailing/leading
    stops are stripped, or that contain ``X``, get 0 for every property as
    a missing-data flag.  Only loci with exactly two genes are written.
    """
    # Build a dictionary of {locus:[{properties:values},{properties:values}]}
    from collections import defaultdict
    loci = defaultdict(list)
    from Bio import SeqIO
    for f in [toxin_faa, antitoxin_faa]:
        # Parse FASTA files.  Plain 'r' is used: the 'U' flag was removed
        # in Python 3.11 and text mode already handles universal newlines.
        with open(f, 'r') as handle:
            for record in SeqIO.parse(handle, 'fasta'):
                locus, start = getNameAndPosition(record)
                if not start:
                    continue
                aaseq = str(record.seq).strip("*")
                # Sequences with missing positions or premature stops are
                # not analysed; they get 0 as a flag for missing data.
                if "*" not in aaseq and "X" not in aaseq:
                    data = ProteinAnalysis(aaseq)
                    loci[locus].append({
                        'start': start,
                        'pI': data.isoelectric_point(),
                        'weight': data.molecular_weight(),
                        'instability': data.instability_index()
                    })
                else:
                    loci[locus].append({
                        'start': start,
                        'pI': 0,
                        'weight': 0,
                        'instability': 0
                    })

    # Order genes in a locus positionally
    loci = orderPairs(loci)

    # Write to output file
    outfile = ".".join([out, "properties", "txt"])
    with open(outfile, 'w') as o:
        header = "\t".join([
            "locus", "gene1_pI", "gene2_pI", "gene1_weight", "gene2_weight",
            "gene1_instability", "gene2_instability"
        ])

        o.write("#" + header.upper() + "\n")
        # .items() works on Python 2 and 3; .iteritems() is Python 2 only.
        for locus, gene in loci.items():
            if len(gene) != 2:
                continue
            line = [str(value) for value in (
                locus, gene[0]['pI'], gene[1]['pI'], gene[0]['weight'],
                gene[1]['weight'], gene[0]['instability'],
                gene[1]['instability']
            )]
            o.write("\t".join(line) + "\n")
    return outfile
Пример #6
0
 def molecular_weight_printer(filename):
     """Print a slice of ``filename`` and the summed peptide weight.

     ``filename[25:29]`` is printed first (presumably the PDB identifier
     embedded in the path -- TODO confirm), then the sum of the molecular
     weights of every polypeptide built from ``structure(filename)``.
     """
     builder = PPBuilder()
     print(filename[25:29])
     total_weight = sum(
         ProteinAnalysis(str(peptide.get_sequence())).molecular_weight()
         for peptide in builder.build_peptides(structure(filename))
     )
     print(total_weight)
Пример #7
0
def checksize(peptide):
    """Return True when ``peptide`` passes the length and weight filters.

    A peptide is accepted only if it has at least 6 residues and the
    molecular weight of ``seq_weight_corrections(peptide)`` lies between
    400 and 6000, both inclusive.
    """
    if len(peptide) < 6:
        return False
    weight = ProteinAnalysis(seq_weight_corrections(peptide)).molecular_weight()
    return 400 <= weight <= 6000
Пример #8
0
def get_params(fasta_file, out_file):
    """Write molecular weight and pI of every FASTA record to a CSV file.

    Args:
        fasta_file: Path of the FASTA file to read.
        out_file: Path of the CSV file to create; it starts with the
            header ``UniprotID,MW,pI`` and gets one line per record.

    Every 'X' is removed from a sequence before it is analysed.
    """
    with open(out_file, "w") as csv_out:
        csv_out.write("UniprotID,MW,pI\n")
        with open(fasta_file, "r") as fasta_in:
            for entry in SeqIO.parse(fasta_in, "fasta"):
                cleaned = str(entry.seq).replace("X", "")
                protein = ProteinAnalysis(cleaned)
                weight = str(protein.molecular_weight())
                iso_point = str(protein.isoelectric_point())
                csv_out.write(entry.id + "," + weight + "," + iso_point + "\n")
Пример #9
0
def molecular_weight(filename):
    """Return the summed molecular weight of all peptides in a structure.

    Polypeptides are built from ``structure(filename)`` and the molecular
    weight of each one's sequence is added up.
    """
    # The original author notes that filename[25:29] can be returned as
    # well if the PDB identifier is wanted alongside the weight.
    builder = PPBuilder()
    return sum(
        ProteinAnalysis(str(peptide.get_sequence())).molecular_weight()
        for peptide in builder.build_peptides(structure(filename))
    )
Пример #10
0
def prot_feats(filename):
    """Compute feature vectors for every usable record of a FASTA file.

    Args:
        filename: FASTA file (anything ``SeqIO.parse`` accepts).

    Returns:
        ``(XX, ids)``: an array with one ``prot_feats_seq`` row per kept
        record, and the list of the matching record ids.

    Records whose molecular weight or features cannot be computed are
    skipped; per the original note this happens e.g. for sequences that
    contain 'X'.
    """
    XX = []
    ids = []

    for rec in SeqIO.parse(filename, "fasta"):
        X = ProteinAnalysis(str(rec.seq))
        try:
            # Raises for sequences the analysis cannot handle (e.g. 'X').
            X.molecular_weight()
            f = list(prot_feats_seq(str(rec.seq)))
        except Exception:
            # Exception (not a bare except) so Ctrl-C / SystemExit still
            # terminate a long run instead of being silently swallowed.
            continue
        XX.append(f)
        ids.append(rec.id)

    XX = np.array(XX)

    return XX, ids
Пример #11
0
    def test_molecular_weight(self):
        """Check Lantipeptide.molecular_weight against ProteinAnalysis."""
        peptide = Lantipeptide(23, 42, 17, 'Class-I')
        peptide.core = "MAGICHAT"
        # Thr is assumed to be dehydrated, so 18.02 is subtracted.
        expected = ProteinAnalysis(
            "MAGICHAT", monoisotopic=False).molecular_weight() - 18.02
        self.assertAlmostEqual(expected, peptide.molecular_weight)
        self.assertAlmostEqual(expected, peptide._weight)

        # A value stored in _weight is what the property reports.
        peptide._weight = 42
        self.assertEqual(42, peptide.molecular_weight)
Пример #12
0
def biopython_protein_analysis(inseq):
    """Utilize Biopython's ProteinAnalysis module to return general sequence properties of an amino acid string.

    For full definitions see: http://biopython.org/DIST/docs/api/Bio.SeqUtils.ProtParam.ProteinAnalysis-class.html

    Args:
        inseq: Amino acid sequence; it is passed through ``cast_to_str``
            before analysis.

    Returns:
        dict: Dictionary of sequence properties, every key suffixed with
        ``-biop``. Some definitions include:
        instability_index: Any value above 40 means the protein is unstable (has a short half life).
        percent_helix_naive / percent_turn_naive / percent_strand_naive:
            elements 0, 1 and 2 of ``secondary_structure_fraction()``.

    TODO:
        Finish definitions of dictionary

    """

    inseq = ssbio.protein.sequence.utils.cast_to_str(inseq)

    analysed_seq = ProteinAnalysis(inseq)

    info_dict = {}
    info_dict['amino_acids_content-biop'] = analysed_seq.count_amino_acids()
    info_dict['amino_acids_percent-biop'] = analysed_seq.get_amino_acids_percent()
    info_dict['length-biop'] = analysed_seq.length
    info_dict['monoisotopic-biop'] = analysed_seq.monoisotopic
    info_dict['molecular_weight-biop'] = analysed_seq.molecular_weight()
    info_dict['aromaticity-biop'] = analysed_seq.aromaticity()
    info_dict['instability_index-biop'] = analysed_seq.instability_index()
    # TODO: What is flexibility?
    info_dict['flexibility-biop'] = analysed_seq.flexibility()
    info_dict['isoelectric_point-biop'] = analysed_seq.isoelectric_point()

    # grand average of hydrophobicity
    info_dict['gravy-biop'] = analysed_seq.gravy()

    # The secondary-structure fractions are computed once and then split
    # into one key per element.
    fractions = analysed_seq.secondary_structure_fraction()
    info_dict['percent_helix_naive-biop'] = fractions[0]
    info_dict['percent_turn_naive-biop'] = fractions[1]
    info_dict['percent_strand_naive-biop'] = fractions[2]

    return info_dict
def find_composition(df_original):
    """Append per-sequence composition and physico-chemical columns.

    For every value in ``df_original['seq']`` the function computes amino
    acid percentages for the whole sequence, for its first ``first_n``
    residues and for its last ``last_n`` residues, plus length, weight,
    GRAVY, flexibility mean/std, secondary-structure fractions,
    isoelectric point and aromaticity.

    Args:
        df_original: DataFrame with a ``seq`` column; it is not modified.

    Returns:
        A new DataFrame: a copy of ``df_original`` concatenated column-wise
        with the computed feature columns.  The feature frame has a fresh
        0..n-1 index, so alignment assumes ``df_original`` uses the same
        index -- TODO confirm with callers.
    """
    df_copy = df_original.copy()

    column_names = []
    for ch in codes:
        column_names.extend(
            [ch + '_percent', ch + '_percent_first', ch + '_percent_last'])
    column_names.extend([
        'len', 'weight', 'gravy', 'flex_mean', 'flex_std', 'ss_helix',
        'ss_turn', 'ss_sheet', 'iep', 'aromaticity'
    ])

    # Rows are collected in a list and turned into a frame once:
    # DataFrame.append was removed in pandas 2.0 and copied the whole frame
    # on every iteration.
    rows = []
    for seq in tqdm(df_copy['seq']):
        row = {}
        sequence = str(seq)
        analysed = ProteinAnalysis(sequence)
        analysed_first = ProteinAnalysis(sequence[:first_n])
        analysed_last = ProteinAnalysis(sequence[-last_n:])

        row['len'] = analysed.length
        row['ss_helix'], row['ss_turn'], row['ss_sheet'] = analysed.secondary_structure_fraction()
        row['iep'] = analysed.isoelectric_point()

        # overall
        for aa, percent in analysed.get_amino_acids_percent().items():
            row[aa + '_percent'] = percent

        # first N
        for aa, percent in analysed_first.get_amino_acids_percent().items():
            row[aa + '_percent_first'] = percent

        # last N
        for aa, percent in analysed_last.get_amino_acids_percent().items():
            row[aa + '_percent_last'] = percent

        row['weight'] = analysed.molecular_weight()
        row['gravy'] = analysed.gravy()
        row['aromaticity'] = analysed.aromaticity()
        flexibility = analysed.flexibility()
        row['flex_mean'] = np.mean(flexibility)
        row['flex_std'] = np.std(flexibility)
        rows.append(row)

    df = pd.DataFrame(rows)
    # Declared columns come first (present even when never filled); any
    # key not covered by ``codes`` is kept after them, as append did.
    extra_columns = [c for c in df.columns if c not in column_names]
    df = df.reindex(columns=column_names + extra_columns)

    return pd.concat([df_copy, df], axis=1)
Пример #14
0
def analysis(listofaas, outlist):
    """Append ``[weight, instability, pI]`` to ``outlist`` for each protein.

    Args:
        listofaas: Iterable of amino-acid sequences.
        outlist: List that is extended in place, one 3-item list per
            sequence that could be analysed.

    Sequences for which the analysis raises ``ValueError`` or ``KeyError``
    are skipped and contribute nothing.  Returns ``None``.
    """
    for prot in listofaas:
        try:
            p = ProteinAnalysis(prot)
            row = [
                p.molecular_weight(),
                p.instability_index(),
                p.isoelectric_point(),
            ]
        except (ValueError, KeyError):
            # The original kept a per-iteration counter here that was reset
            # on every pass and never read; skipping is the only effect.
            continue
        outlist.append(row)
Пример #15
0
    def molecular_weight(self):  #overrides base class
        """Return the molecular weight, estimating unknown residues.

        Without unknown residues the base-class value is returned.
        Otherwise the weight of the sequence with every residue listed in
        ``UNKNOWNS`` removed is computed and scaled by
        ``len(sequence) / len(known residues)``, i.e. each unknown residue
        is counted with the average weight of the known ones.
        """
        if not self._contains_unknown:
            return super(Protein, self).molecular_weight()

        known = ''.join(aa for aa in self._sequence if aa not in UNKNOWNS)
        mw = ProteinAnalysis(known).molecular_weight()
        # Just increase by the average weight of the known amino acids.
        mw *= len(self._sequence) / len(known)
        return mw
def phyChemProps(seq):
    """Return ten physico-chemical descriptors of ``seq`` as a list.

    Order of the returned values: aromaticity, the three elements of
    ``secondary_structure_fraction()``, GRAVY, instability index,
    isoelectric point, molecular weight and the two elements of
    ``molar_extinction_coefficient()``.
    """
    X = ProteinAnalysis(seq)
    aromaticity = X.aromaticity()
    # Multi-valued results are computed once instead of once per element.
    fractions = X.secondary_structure_fraction()
    gravy = X.gravy()
    instability = X.instability_index()
    iso_point = X.isoelectric_point()
    weight = X.molecular_weight()
    extinction = X.molar_extinction_coefficient()
    return [
        aromaticity,
        fractions[0],
        fractions[1],
        fractions[2],
        gravy,
        instability,
        iso_point,
        weight,
        extinction[0],
        extinction[1],
    ]
Пример #17
0
def pep_param(pep):
    """Return ``[weight, pI, *amino-acid counts]`` for the peptide ``pep``.

    The isoelectric point is replaced by the string ``'na'`` when ``pep``
    is empty.  The counts are the values of ``count_amino_acids()`` in the
    order that dictionary yields them.
    """
    analysis = ProteinAnalysis(pep)
    weight = analysis.molecular_weight()
    iso_point = analysis.isoelectric_point() if len(pep) > 0 else 'na'
    return [weight, iso_point] + list(analysis.count_amino_acids().values())
Пример #18
0
def molecular_weight(fastas):
    """Encode each sequence as a min-max scaled monoisotopic weight.

    Args:
        fastas: Iterable of entries whose item 1 is the sequence; '-'
            characters are removed before analysis.

    Returns:
        A list of rows: the header ``["Weight"]`` followed by one
        single-element row per entry holding
        ``(mw - 513.222346) / (9577.017286 - 513.222346)``.
    """
    lower, upper = 513.222346, 9577.017286
    rows = [["Weight"]]
    for entry in fastas:
        ungapped = re.sub('-', '', entry[1])
        protein = ProteinAnalysis(ungapped)
        protein.monoisotopic = True
        scaled = (protein.molecular_weight() - lower) / (upper - lower)
        rows.append([scaled])
    return rows
Пример #19
0
def properties(toxin_faa,antitoxin_faa,out):
    """Tabulate pI, weight and instability for two-gene loci.

    Args:
        toxin_faa: Path to the toxin protein FASTA file.
        antitoxin_faa: Path to the antitoxin protein FASTA file.
        out: Output prefix; results are written to ``<out>.properties.txt``.

    Returns:
        The name of the tab-separated output file.

    Records without a start position (as reported by
    ``getNameAndPosition``) are ignored.  A sequence that contains ``X``
    or an internal ``*`` is recorded with 0 for all three properties.
    Loci that do not hold exactly two genes are left out of the table.
    """
    # Build a dictionary of {locus:[{properties:values},{properties:values}]}
    from collections import defaultdict
    loci = defaultdict(list)
    from Bio import SeqIO
    for f in [toxin_faa,antitoxin_faa]:
        # Parse FASTA files.  Mode 'r' replaces 'rU', which Python 3.11
        # no longer accepts.
        with open(f,'r') as handle:
            for record in SeqIO.parse(handle,'fasta'):
                locus,start = getNameAndPosition(record)
                if not start:
                    continue
                aaseq = str(record.seq).strip("*")
                # Sequences with missing positions or premature stops get
                # 0 as a flag for missing data instead of real values.
                if "*" not in aaseq and "X" not in aaseq:
                    data = ProteinAnalysis(aaseq)
                    loci[locus].append({ 'start':  start,
                                         'pI':     data.isoelectric_point(),
                                         'weight': data.molecular_weight(),
                                         'instability': data.instability_index() })
                else:
                    loci[locus].append({ 'start': start,
                                         'pI': 0, 'weight':0 ,
                                         'instability': 0 })

    # Order genes in a locus positionally
    loci = orderPairs(loci)

    # Write to output file
    outfile = ".".join([out,"properties","txt"])
    with open(outfile,'w') as o:
        header = "\t".join(["locus",
                            "gene1_pI","gene2_pI",
                            "gene1_weight","gene2_weight",
                            "gene1_instability","gene2_instability" ])

        o.write("#"+ header.upper() + "\n")
        # items() exists on Python 2 and 3 mappings; iteritems() does not.
        for locus, gene in loci.items():
            if len(gene) != 2:
                continue
            first, second = gene[0], gene[1]
            line = [ str(value) for value in (
                         locus,first['pI'],second['pI'],
                         first['weight'],second['weight'],
                         first['instability'],second['instability']) ]
            o.write("\t".join(line)+"\n")
    return outfile
Пример #20
0
def compute_molecular_weight(aa_seq):
    """Calculate the molecular weight of an amino acid sequence.

    Parameters:
        aa_seq : the amino acid sequence (must be a string object)

    Return:
        Whatever ``ProteinAnalysis(aa_seq).molecular_weight()`` yields for
        the sequence.

    Example of usage:
      >> MolecularWeight = compute_molecular_weight(aa_seq)
      >> print(MolecularWeight)
    """
    return ProteinAnalysis(aa_seq).molecular_weight()
def bio_feat(record):
    """Return a numeric feature vector for one sequence record.

    The sequence text is cleaned first: every 'X' is dropped and 'U', 'B'
    and 'Z' are replaced by 'C', 'N' and 'Q'.  The returned array holds, in
    order: cleaned length, molecular weight, isoelectric point, instability
    index, the secondary-structure fractions, the amino-acid counts and
    the amino-acid percentages.

    Args:
        record: Object with a ``seq`` attribute (presumably a Biopython
            ``SeqRecord`` -- TODO confirm).
    """
    # Plain string handling replaces the MutableSeq(...).toseq() round
    # trip, which only converted back to the same text and depends on a
    # method that newer Biopython releases deprecate.
    clean_seq = str(record.seq).replace("X", "")
    clean_seq = clean_seq.replace("U", "C")
    clean_seq = clean_seq.replace("B", "N")
    clean_seq = clean_seq.replace('Z', 'Q')

    ### features
    seq_length = len(clean_seq)
    analysed_seq = ProteinAnalysis(clean_seq)
    molecular_weight = analysed_seq.molecular_weight()
    amino_percent = analysed_seq.get_amino_acids_percent().values()
    isoelectric_points = analysed_seq.isoelectric_point()
    count = analysed_seq.count_amino_acids().values()
    instability_index = analysed_seq.instability_index()
    secondary_structure_fraction = analysed_seq.secondary_structure_fraction()
    return np.array([seq_length, molecular_weight, isoelectric_points, instability_index] + list(secondary_structure_fraction) + list(count) + list(amino_percent))
def biochemical_properties(sequence: str) -> Dict[str, Any]:
    """Collect biochemical descriptors of ``sequence`` in a dictionary.

    Keys, in insertion order: 'Isoelectric point', 'Molecular weight',
    'Aromaticity', 'Instability index', 'GRAVY', 'H-bonding percent',
    'Melting temp' and 'LCC'.
    """
    protein = ProteinAnalysis(sequence)
    # These two objects are constructed but not read below; they are kept
    # because their constructors may validate the input -- TODO confirm.
    descriptor_object = PyPro.GetProDes(sequence)
    sequence_object = Seq(sequence)
    # TODO(Ahmed): Verify that all these calculations are actually returning reasonable values
    # For example, it says the percent composition of every amino acid is zero when I run
    # calculate_biochem_properties.biochemical_properties('qwertyipasdfghklcvnm')
    result = {}
    result['Isoelectric point'] = protein.isoelectric_point()
    result['Molecular weight'] = protein.molecular_weight()  # Daltons? Amu? g/mol?
    result['Aromaticity'] = protein.aromaticity()
    result['Instability index'] = protein.instability_index()
    result['GRAVY'] = protein.gravy()
    result['H-bonding percent'] = h_bonding_percent(sequence)
    result['Melting temp'] = melting_temp(sequence)
    result['LCC'] = lcc.lcc_simp(sequence)
    return result
Пример #23
0
 def amino_acid_analysis(self):
     """
     Add residue fractions, length, IEP, molecular weight and GRAVY columns.

     One ``fraction_<res>`` column is added per entry of RESIDUES, plus
     ``length``.  ``IEP`` is filled for every row; ``molecular_weight`` is
     filled only when the sequence has no 'X' or 'B', and ``gravy`` only
     when it additionally has no 'U'.
     """
     for res in RESIDUES:
         counts = self.df["sequence"].str.count(res)
         self.df["fraction_" + res] = counts / self.df["sequence"].str.len()
     self.df["length"] = self.df["sequence"].str.len()

     progress = tqdm(self.df.iterrows(), total=self.df.shape[0])
     for index, row in progress:
         seq = row["sequence"]
         seqanalysis = ProteinAnalysis(seq)
         # Result is not used below; the call is kept as in the original.
         acidist = seqanalysis.get_amino_acids_percent()
         self.df.loc[index, "IEP"] = seqanalysis.isoelectric_point()
         if "X" in seq or "B" in seq:
             continue
         self.df.loc[index, "molecular_weight"] = seqanalysis.molecular_weight()
         if "U" not in seq:
             self.df.loc[index, "gravy"] = seqanalysis.gravy()
Пример #24
0
    def __init__(self, sequence):
        """Compute and store a set of descriptors for ``sequence``.

        Values come from three places: Biopython's ``ProteinAnalysis``,
        ``calculate_*`` helpers defined elsewhere, and R code evaluated
        through ``r(...)`` after loading the R ``Peptides`` package.

        Args:
            sequence: Amino-acid sequence; presumably a string of
                one-letter codes -- TODO confirm against callers.
        """
        self.sequence = sequence
        self.sequence_length = len(sequence)
        analysis = ProteinAnalysis(sequence)

        # Descriptors taken from ProteinAnalysis or from local helpers.
        self.amino_acid_percents = analysis.get_amino_acids_percent()
        self.amino_acids_composition = calculate_amino_acids_composition(sequence)
        self.aromaticity = analysis.aromaticity()
        self.instability = analysis.instability_index()
        self.flexibility = calculate_flexibility(sequence)
        # Each entry pairs a display name with a per-residue scale table;
        # the tables themselves are defined outside this method.
        protein_scale_parameters = [{'name': 'Hydrophilicity', 'dictionary': hw},
                                    {'name': 'Surface accessibility', 'dictionary': em},
                                    {'name': 'Janin Interior to surface transfer energy scale', 'dictionary': ja},
                                    {'name': 'Bulkiness', 'dictionary': bulkiness},
                                    {'name': 'Polarity', 'dictionary': polarity},
                                    {'name': 'Buried residues', 'dictionary': buried_residues},
                                    {'name': 'Average area buried', 'dictionary': average_area_buried},
                                    {'name': 'Retention time', 'dictionary': retention_time}]
        self.protein_scales = calculate_protein_scales(analysis, protein_scale_parameters)
        self.isoelectric_point = analysis.isoelectric_point()
        self.secondary_structure_fraction = calculate_secondary_structure_fraction(analysis)
        self.molecular_weight = analysis.molecular_weight()
        # Stored under the name kyte_plot, but the value is gravy().
        self.kyte_plot = analysis.gravy()
        self.pefing = calculate_pefing(sequence)

        # The next parameters are calculated with the R package Peptides.
        r('require(Peptides)')
        # NOTE(review): the sequence is pasted into R source text, so a
        # sequence containing a double quote would break (or alter) the
        # evaluated R code -- confirm inputs are validated upstream.
        r('sequence = "{0}"'.format(sequence))
        self.aliphatic_index = r('aindex(sequence)')[0]
        self.boman_index = r('boman(sequence)')[0]
        self.charges = calculate_charges(sequence, 1.0, 14.0, 0.5, 'Lehninger')
        # NOTE(review): this evaluates R's seq(sequence); given the
        # attribute name a hydrophobicity call may have been intended --
        # confirm.
        self.hydrophobicity = r('seq(sequence)')[0]
        # Named angles passed to the hydrophobic-moment and peptide-type
        # helpers below.
        angles = [{'name': 'Alpha-helix', 'angle': -47},
                  {'name': '3-10-helix', 'angle': -26},
                  {'name': 'Pi-helix', 'angle': -80},
                  {'name': 'Omega', 'angle': 180},
                  {'name': 'Antiparallel beta-sheet', 'angle': 135},
                  {'name': 'Parallel beta-sheet', 'angle': 113}]
        # The polygly-polypro entry is added only when the P and G
        # percentages together exceed 0.3.
        if self.amino_acid_percents['P'] + self.amino_acid_percents['G'] > 0.3:
            angles.append({'name': 'Polygly-polypro helix', 'angle': 153})
        self.hydrophobic_moments = calculate_hydrophobic_moments(sequence, angles)
        self.kidera_factors = calculate_kidera_factors(sequence)
        self.peptide_types = calculate_peptide_types(sequence, angles)
Пример #25
0
def protein_analysis():
    """Show the sequence form and, on submit, store the analysis results.

    Appears to be a web2py-style controller (uses ``session``, ``request``,
    ``FORM`` and ``redirect`` from the surrounding environment).  Users
    without ``session.username`` are redirected to the log-in page.  When
    the form is accepted, the cleaned upper-cased sequence and its
    ProteinAnalysis results are stored in ``session`` and the browser is
    redirected to ``protein_analysis_output``.

    Returns:
        ``dict(form=form)`` when the form has not been accepted.
    """
    # Identity comparison is the correct test for None.
    if session.username is None:
        redirect(URL(r=request,f='../account/log_in'))
    from Bio.SeqUtils.ProtParam import ProteinAnalysis
    form = FORM(TABLE(
            TR("Amino acid sequence:  ",
               TEXTAREA(_type="text", _name="sequence",
                        requires=IS_NOT_EMPTY())),
            INPUT(_type="submit", _value="SUBMIT")))
    if form.accepts(request.vars,session):
        session['sequence'] = seqClean(form.vars.sequence.upper())
        X = ProteinAnalysis(session['sequence'])
        session['aa_count'] = X.count_amino_acids()
        session['percent_aa'] = X.get_amino_acids_percent()
        session['mw'] = X.molecular_weight()
        session['aromaticity'] = X.aromaticity()
        session['instability'] = X.instability_index()
        session['flexibility'] = X.flexibility()
        session['pI'] = X.isoelectric_point()
        session['sec_struct'] = X.secondary_structure_fraction()
        redirect(URL(r=request, f='protein_analysis_output'))
    return dict(form=form)
Пример #26
0
def analyze_proteins(cdss):
    """Attach molecular weight and isoelectric point to every CDS.

    Args:
        cdss: Iterable of dict-like CDS entries with at least the keys
            'sequence', 'contig', 'start', 'stop', 'strand' and 'frame'.

    Each entry gets a ``'seq_stats'`` ``OrderedDict`` with the keys
    'molecular_weight' and 'isoelectric_point'.  A value that cannot be
    computed is logged as a warning and stored as NaN.  Returns ``None``.
    """
    for cds in cdss:
        seq = ProteinAnalysis(cds['sequence'])
        seq_stats = OrderedDict()
        # ``except Exception`` keeps the best-effort behaviour without
        # swallowing KeyboardInterrupt / SystemExit like a bare except.
        try:
            seq_stats['molecular_weight'] = seq.molecular_weight()
        except Exception:
            log.warning(
                'could not calc molecular weight! contig=%s, start=%i, stop=%i, strand=%s, frame=%s',
                cds['contig'], cds['start'], cds['stop'], cds['strand'],
                cds['frame'])
            seq_stats['molecular_weight'] = float('nan')
        try:
            seq_stats['isoelectric_point'] = seq.isoelectric_point()
        except Exception:
            log.warning(
                'could not calc isoelectric point! contig=%s, start=%i, stop=%i, strand=%s, frame=%s',
                cds['contig'], cds['start'], cds['stop'], cds['strand'],
                cds['frame'])
            seq_stats['isoelectric_point'] = float('nan')
        cds['seq_stats'] = seq_stats
def get_features(seq):
    """Compute global features of a protein sequence.

    Parameters
    ----------
    seq : str
        protein sequence

    Return
    ----------
    dictionary:
        Counts and ratios computed on the raw sequence (undefined
        residues, length, entropy, hydrophobic / polar counts, buried
        score), followed by ProteinAnalysis values and per-residue counts
        computed after the undefined symbols X, B, Z, ', O and U have
        been removed.

    """
    undefined = frozenset(['X', 'B', 'Z', "'", 'O', 'U'])

    features = {
        'undefined_count': sum(1 for x in seq if x in undefined),
        'length': len(seq),
    }
    features['perc_undefined_count'] = features['undefined_count'] / features['length']
    features['entropy'] = entropy(seq)
    features['ideal_entropy'] = entropy_ideal(len(seq))
    features['perc_entropy'] = features['entropy'] / features['ideal_entropy']
    features['hydr_count'] = len([x for x in seq if x in hydrophobic_proteins])
    features['polar_count'] = len([x for x in seq if x in polar_proteins])
    features['buried'] = sum(buried[x] for x in seq if x in hydrophobic_proteins)

    # Everything below works on the sequence without undefined symbols.
    seq = ''.join(x for x in seq if x not in undefined)
    protein = ProteinAnalysis(seq)

    features['gravy'] = protein.gravy()
    features['molecular_weight'] = protein.molecular_weight()
    features['aromaticity'] = protein.aromaticity()
    features['instability_index'] = protein.instability_index()
    features['isoelectric_point'] = protein.isoelectric_point()
    helix, turn, sheet = protein.secondary_structure_fraction()
    features['helix'] = helix
    features['turn'] = turn
    features['sheet'] = sheet

    features.update(protein.count_amino_acids())
    return features
Пример #28
0
def GetFeatures (My_seq):
    """Return a dictionary of ProteinAnalysis features for ``My_seq``.

    Single-valued entries: 'Molecular_weight', 'Aromaticity',
    'Instability_index', 'Isoelectric_point'.  Multi-valued entries:
    'Flexibility', 'Second_structure_fraction', 'Count_amino_acids' and
    'Amino_acids_percent'.
    """
    Features = {}

    # The analysis object is built once; the original also constructed a
    # second instance that was immediately discarded.
    analysed_seq = ProteinAnalysis(My_seq)

    # Single-valued features
    Features["Molecular_weight"] = analysed_seq.molecular_weight()
    Features["Aromaticity"] = analysed_seq.aromaticity()
    Features["Instability_index"] = analysed_seq.instability_index()
    Features["Isoelectric_point"] = analysed_seq.isoelectric_point()

    # Multi-valued features (types as noted by the original author)
    Features["Flexibility"] = analysed_seq.flexibility()  # list
    Features["Second_structure_fraction"] = analysed_seq.secondary_structure_fraction()  # 3-tuple
    Features["Count_amino_acids"] = analysed_seq.count_amino_acids()  # dict, 20 entries
    Features["Amino_acids_percent"] = analysed_seq.get_amino_acids_percent()  # dict, 20 entries

    return Features
Пример #29
0
    def seqs_to_features(self, seqs, no_seqs):
        """Turn sequences into a ``(no_seqs, 32)`` feature matrix.

        ``seqs`` is an iterable of iterables of sequences; they are
        flattened in order and each sequence fills one row.  Columns 0-22
        hold the relative frequency of each letter of the alphabet below;
        the last nine columns hold, from the end backwards: molecular
        weight, aromaticity, instability index, isoelectric point, the
        three secondary-structure fractions, sequence length and GRAVY.
        For the ProteinAnalysis values X, B and U are replaced by A.
        """
        alphabet = 'ABCDEFGHIKLMNPQRSTUVWXY'  # no JOZ
        X = np.zeros((no_seqs, 32))
        for row, seq in enumerate(chain(*seqs)):
            # relative letter frequencies
            for col, letter in enumerate(alphabet):
                X[row, col] = seq.count(letter) / len(seq)

            # remaining descriptors, on the sequence without X / B / U
            substituted = seq.replace('X', 'A').replace('B', 'A').replace('U', 'A')
            analysis = ProteinAnalysis(substituted)
            X[row, -1] = analysis.molecular_weight()
            X[row, -2] = analysis.aromaticity()
            X[row, -3] = analysis.instability_index()
            X[row, -4] = analysis.isoelectric_point()
            fractions = analysis.secondary_structure_fraction()
            X[row, -5] = fractions[0]
            X[row, -6] = fractions[1]
            X[row, -7] = fractions[2]
            X[row, -8] = len(seq)
            X[row, -9] = analysis.gravy()  # mean hydrophobicity
        return X
Пример #30
0
    def __init__(self, sequence):
        """Store ``sequence`` and compute its mean per-residue properties.

        ``self.extended`` is True when the sequence contains any residue
        code that is a key of ``ProtSeqProp.trans_dict`` (the ambiguous
        residues).  Properties are computed here only for non-extended
        sequences and are stored in the ``self.prop`` dictionary.
        """
        self.seq = sequence

        self.prop = dict()

        # Check whether any ambiguous residue occurs in the sequence.
        self.extended = any(
            res in sequence for res in ProtSeqProp.trans_dict)

        if not self.extended:
            def _mean_over_residues(table):
                # Average of a per-residue lookup table over the sequence.
                return np.mean([table[res] for res in sequence])

            PA_analysis = ProteinAnalysis(sequence)
            # Mean residue weight, shifted and scaled by fixed constants
            # (presumably a dataset mean and standard deviation -- TODO
            # confirm their origin).
            self.prop['mean_molecular_weight'] = (
                (PA_analysis.molecular_weight() / len(self.seq)) -
                136.90020499999997) / 30.081960849078222
            # The original assigned this key twice with the same value.
            self.prop['mean_surface_accessibility'] = _mean_over_residues(
                ProtSeqProp.em)
            self.prop['mean_kd_hydrophobicity'] = _mean_over_residues(
                ProtSeqProp.kd)
            self.prop['mean_flexibility'] = _mean_over_residues(
                ProtSeqProp.Flex)
            self.prop['mean_hydrophilicity'] = _mean_over_residues(
                ProtSeqProp.hw)
            self.prop['mean_ja'] = _mean_over_residues(ProtSeqProp.ja)
            # Mean DIWV value over all adjacent residue pairs.
            alph_list = list(sequence)
            self.prop['instability'] = np.mean([
                ProtSeqProp.DIWV[x][y]
                for x, y in zip(alph_list[:-1], alph_list[1:])
            ])
Пример #31
0
def get_coord_array(path, file_name):
    '''
    Collect the coordinates of all atoms of a PDB file, plus charge and mass.

    :param path: directory prefix of the PDB files; it is concatenated
                 directly with file_name, so it must end with a separator
    :param file_name: the file name of ****.pdb; the part before the first
                      '.' is used as structure id
    :return: atom coord array
                [[x0,y0,z0],
                 [x1,y1,z1],
                    ... ,
                 [xn,yn,zn]]
           : charge, summed over all built polypeptides at pH 7.4
           : mass, summed molecular weight of all built polypeptides
    '''
    structure = PDBParser(PERMISSIVE=1).get_structure(
        file_name.split('.')[0], path + file_name)

    # Mass and charge are accumulated over the polypeptides that
    # PPBuilder builds from the structure.
    mass, charge = 0.0, 0.0
    for peptide in PPBuilder().build_peptides(structure):
        protein = ProteinAnalysis(peptide.get_sequence())
        mass += protein.molecular_weight()
        charge += protein.charge_at_pH(7.4)

    # Walk model -> chain -> residue -> atom and keep every coordinate.
    coords = [
        atom.get_coord()
        for model in structure
        for chain in model
        for residue in chain
        for atom in residue
    ]
    return np.array(coords), charge, mass
Пример #32
0
def physchem_props(ara_d):
    """Attach physicochemical properties to every usable protein in ara_d.

    ``ara_d`` maps a protein name to a dict holding at least ``"sequence"``
    (and ``"pos"`` for sequences containing ``*``). Entries are handled as
    follows:

    * sequence contains ``X``: skipped silently;
    * sequence contains ``*``: skipped, and the protein name is printed
      when its ``"pos"`` value is not an empty list;
    * otherwise: a ``"Properties"`` dict is stored on the entry with the
      keys ``mol_weight``, ``gravy``, ``aromaticity``, ``instab_index``,
      ``flexi``, ``iso_point`` and ``seq_struct``.

    The dict is modified in place and also returned.
    """
    for protein, entry in ara_d.items():
        seq = entry["sequence"]
        if "X" in seq:
            # Skip non-usable sequences, only negs
            continue
        if "*" in seq:
            if entry["pos"] != []:
                print(protein)
            continue

        a_seq = ProteinAnalysis(seq)
        # The literal is evaluated top to bottom, i.e. the analysis methods
        # run in the listed order.
        entry["Properties"] = {
            "mol_weight": a_seq.molecular_weight(),
            "gravy": a_seq.gravy(),
            "aromaticity": a_seq.aromaticity(),
            "instab_index": a_seq.instability_index(),
            "flexi": a_seq.flexibility(),
            "iso_point": a_seq.isoelectric_point(),
            "seq_struct": a_seq.secondary_structure_fraction(),
        }
    return ara_d
Пример #33
0
    def on_enter(self, *args):
        """Update the weight and protein-name labels when screen #3 is entered.

        The sequence is read from ``no_header_sequence.txt``; its molecular
        weight, converted to kDa and rounded to three decimals, is written to
        ``self.weight.text``. The title of the first BLAST hit stored in
        ``my_blast.xml`` (the part of ``hit_def`` before the first ``>``) is
        written to ``self.protname.text``.

        If ``my_blast.xml`` is missing or empty, or it contains no alignment
        with an HSP, a short explanatory message is shown in
        ``self.protname.text`` instead of failing on an undefined title.

        :param args: extra positional arguments; not used here.
        """
        # reads the no_header_sequence.txt file to calculate Mw in kDa
        with open("no_header_sequence.txt") as sequence_file:
            noHeader = sequence_file.read()
        print("noHeader: ", noHeader)
        analysed_seq = ProteinAnalysis(noHeader)
        Mw = analysed_seq.molecular_weight()  # Mw g/mol
        Mw_kDa = round(Mw / 1000, 3)  # Mw kDa

        # Dictionary with count for each amino acid
        print(analysed_seq.count_amino_acids())

        # updates protein weight in kDa on the screen
        self.weight.text = str(Mw_kDa) + " kDa"

        try:
            size = os.stat('my_blast.xml').st_size
        except OSError:
            # A missing result file is handled like an empty one.
            size = 0

        if size == 0:  # if no xml file created
            reduced_title = "BLAST search failed.\nCheck your FASTA file and try again."
        else:
            # Fallback shown when the record holds no usable hit.
            reduced_title = "No BLAST hits found."
            with open("my_blast.xml") as result_handle:
                blast_record = NCBIXML.read(result_handle)

            for alignment in blast_record.alignments:
                if not alignment.hsps:
                    continue
                # takes only the first result
                print("hit_def:", alignment.hit_def)
                reduced_title = alignment.hit_def.split('>')[0]
                print(reduced_title)
                break

        # updates sequence identity on the app screen
        self.protname.text = reduced_title
    def parse_nuc_sequence(self, n_seq, id=None, desc=None):
        """
        Parse one nucleotide sequence and record it with its derived values.

        The sequence is translated with ``self.translate_nucleotides``, its GC
        content is computed with ``self.calculate_gc_content`` and the
        translated protein is analysed with BioPython's ``ProteinAnalysis``
        (amino acid percentages, molecular weight, instability index and
        aromaticity).

        All values are computed first and only then appended to the
        per-record lists on ``self``, so a failure in any step leaves every
        list untouched and the lists stay the same length.

        Any exception is caught, reported on stdout together with its
        traceback, and not re-raised.

        Keyword arguments:
        n_seq -- nucleotide sequence string
        id -- id obtained from FASTA file record (default None)
        desc -- description obtained from FASTA file record (default None)
        """

        try:
            # translate nucleotide string sequence
            p_seq = self.translate_nucleotides(n_seq)

            # GC content
            gc_content = self.calculate_gc_content(n_seq)

            # protein analysis methods
            analysis = ProteinAnalysis(p_seq)
            amino_acid_percent = analysis.get_amino_acids_percent()
            molecular_weight = analysis.molecular_weight()
            instability_index = analysis.instability_index()
            aromaticity = analysis.aromaticity()

            # Nothing above failed: commit the whole record in one go.
            self.id.append(id)
            self.description.append(desc)
            self.nucleotide_sequence.append(n_seq)
            self.protein_sequence.append(p_seq)
            self.gc_content.append(gc_content)
            self.amino_acid_dict.append(amino_acid_percent)
            self.molecular_weight.append(molecular_weight)
            self.instability_index.append(instability_index)
            self.aromaticity.append(aromaticity)

        except Exception as e:
            print('-'*80)
            print(f"Exception in parsing uploaded virus sequence: {e}")
            traceback.print_exc(file=sys.stdout)
            print('-'*80)
Пример #35
0
def biopython_proteinanalysis_seq(seq, scaling=False):
    """Return a flat dict of ``ProteinAnalysis`` descriptors for ``seq``.

    Keys, in insertion order:

    * ``flex:min`` / ``flex:max`` / ``flex:std`` -- statistics of the
      flexibility profile;
    * ``gravy``, ``instability_index``, ``isoelectric_point``;
    * ``molar_extinction_coefficient_reduced`` and
      ``molar_extinction_coefficient_cysteines`` -- the two values returned
      by ``molar_extinction_coefficient()``;
    * ``molecular_weight``;
    * ``percent_helix_naive`` / ``percent_turn_naive`` /
      ``percent_strand_naive`` -- the three values returned by
      ``secondary_structure_fraction()``, in that order;
    * ``percent:<aa>`` for every amino acid reported by
      ``get_amino_acids_percent()``, sorted by letter;
    * ``prop_res_<name>`` for every entry of the module-level
      ``property_residues`` mapping: the summed fraction of its residues.

    ``scaling`` is accepted but not used by this function.
    """
    analysis = ProteinAnalysis(seq)

    flexibility = np.array(analysis.flexibility())
    features = {
        'flex:min': flexibility.min(),
        'flex:max': flexibility.max(),
        'flex:std': flexibility.std(),
        'gravy': analysis.gravy(),
        'instability_index': analysis.instability_index(),
        'isoelectric_point': analysis.isoelectric_point(),
    }

    reduced, cystines = analysis.molar_extinction_coefficient()
    features['molar_extinction_coefficient_reduced'] = reduced
    features['molar_extinction_coefficient_cysteines'] = cystines

    features['molecular_weight'] = analysis.molecular_weight()

    helix, turn, strand = analysis.secondary_structure_fraction()
    features['percent_helix_naive'] = helix
    features['percent_turn_naive'] = turn
    features['percent_strand_naive'] = strand

    fractions = analysis.get_amino_acids_percent()
    for letter in sorted(fractions.keys()):
        features['percent:%s' % letter] = fractions[letter]

    # Residues missing from the composition count as 0.
    for group_name, residues in list(property_residues.items()):
        features['prop_res_%s' % group_name] = sum(
            [fractions.get(residue, 0) for residue in residues])

    return features
Пример #36
0
def draw_sequence(sequence, mode = 'simple', alphabet = None):
    """Render a sequence as HTML helper objects.

    ``mode == 'simple'`` returns a monospace DIV holding one SPAN per
    position. A space is inserted every 10 positions and a line break every
    60, with the running position number at both ends of each completed line.

    ``mode == 'protparams'`` returns a DIV holding that same sequence DIV, a
    'Protein Parameters' heading and a table with the length, the molecular
    weight and the isoelectric point. The weight and pI rows are left out
    when computing them raises ``KeyError``.

    Any other ``mode`` returns ``None``. ``alphabet`` is accepted but not
    used.
    """

    def numbered_sequence_div():
        # Shared renderer for both modes.
        container = DIV(_style='font-family:monospace',_class='raw-sequence')
        width = len(str(len(sequence))) + 1

        def line_start(number):
            # Right-aligned position label that opens a line.
            return XML((str(number) + ' ').rjust(width).replace(' ', '&nbsp;'))

        for index, residue in enumerate(sequence):
            if index == 0:
                container.append(line_start(1))
            else:
                if index % 10 == 0:
                    container.append(' ')
                if index % 60 == 0:
                    # Close the finished line with its last position, then
                    # open the next one.
                    container.append(
                        XML((str(index)).ljust(width).replace(' ', '&nbsp;')))
                    container.append(BR())
                    container.append(line_start(index + 1))
            container.append(
                SPAN(residue, _class='seq-position', _title=index + 1))
        return container

    if mode == 'protparams':
        wrapper = DIV()
        from Bio.SeqUtils.ProtParam import ProteinAnalysis
        wrapper.append(numbered_sequence_div())
        wrapper.append(H3('Protein Parameters'))

        table = TABLE(_style= "width:200px;")
        analysis = ProteinAnalysis(sequence)
        table.append(
            TR(SPAN('Length:', _class='line-header'), '%i aa' % len(sequence)))
        try:
            weight_kda = round(analysis.molecular_weight() / 1000, 0)
            table.append(
                TR(SPAN('MW:', _class='line-header'), '%i KDa' % weight_kda))
        except KeyError:
            pass
        try:
            table.append(
                TR(SPAN('pI:', _class='line-header'),
                   '%1.2f' % analysis.isoelectric_point()))
        except KeyError:
            pass
        wrapper.append(table)
        return wrapper

    if mode == 'simple':
        return numbered_sequence_div()
Пример #37
0
from Bio.SeqUtils.ProtParam import ProteinAnalysis
from Bio.SeqUtils import ProtParamData
from Bio import SeqIO
# For every record of the FASTA file, print each ProteinAnalysis result on
# its own line, in the fixed order listed below.
with open('../../samples/pdbaa') as fasta_handle:
    for record in SeqIO.parse(fasta_handle, 'fasta'):
        analysis = ProteinAnalysis(str(record.seq))
        for method_name in (
                'count_amino_acids',
                'get_amino_acids_percent',
                'molecular_weight',
                'aromaticity',
                'instability_index',
                'flexibility',
                'isoelectric_point',
                'secondary_structure_fraction',
        ):
            # Looked up and called one at a time, so output appears as each
            # value becomes available.
            print(getattr(analysis, method_name)())
        # Profile over the ProtParamData.kd scale; 9 and .4 are passed
        # positionally after the scale.
        print(analysis.protein_scale(ProtParamData.kd, 9, .4))
Пример #38
0
#!/usr/bin/env python

import sys
from Bio import SeqIO
from Bio.SeqUtils.ProtParam import ProteinAnalysis

# Header row. The names are listed in the same order as the values written
# for each record below; the seventh value is a.aromaticity(), so the column
# is labelled accordingly.
sys.stdout.write("ID\tMW\tIP\tgravy\tlength\tinstability\taromaticity\tSequence\n")

# One tab-separated row per FASTA record read from standard input.
for record in SeqIO.parse(sys.stdin, "fasta"):
    a = ProteinAnalysis(str(record.seq))

    properties = [
        record.id,
        a.molecular_weight(),
        a.isoelectric_point(),
        a.gravy(),
        a.length,
        a.instability_index(),
        a.aromaticity(),
        # always last column to make the output more readable
        a.sequence,
    ]
    sys.stdout.write('\t'.join(map(str, properties)) + "\n")

Пример #39
0
def main(argv):
    """Add a sampled abundance to FASTA headers, then filter and subsample.

    Command-line options are parsed by ``argparse`` from ``sys.argv``; the
    ``argv`` parameter is accepted but not used.

    For every entry yielded by ``nextEntry(infile)``:

    * an existing ``[# ... #]`` meta block is removed from the header and its
      comma-separated elements are kept, except those containing
      ``intensity``;
    * a new ``intensity=<value>`` element, drawn with
      ``sampleAbundance(mu, sigma)``, is appended and the meta block is
      written back at the end of the header;
    * if BioPython can be imported, entries whose molecular weight lies
      outside ``[--weight_low, --weight_up]`` are dropped (an upper bound
      <= 0 means unlimited).

    With ``--sample N`` only N entries are kept: the first N, or a random N
    when ``--random`` is given. When fewer than N entries are available, all
    of them are written.

    :raises RuntimeError: when run on a Python older than 2.7.
    """
    ## we use ArgumentParser, which requires 2.7
    if sys.version_info < (2, 7):
        raise RuntimeError("This script requires python 2.7 or greater")

    ## add weight filtering functionality if BioPython is available
    try:
        from Bio.SeqUtils.ProtParam import ProteinAnalysis
        has_biopython = True
    except Exception:
        # Deliberately broad: whatever prevents BioPython from loading, the
        # optional weight filter is simply switched off.
        has_biopython = False

    parser = argparse.ArgumentParser(description='Add abundance to FASTA files.')
    parser.add_argument('infile', type=argparse.FileType('r'), help='Input FASTA file')
    parser.add_argument('outfile', type=argparse.FileType('w'), help='Output FASTA file')

    parser.add_argument('--mu', dest='mu', action='store', default=3, help='mean of gaussian in log space')
    parser.add_argument('--sigma', dest='sigma', action='store', default=1, help='sd of gaussian in log space')
    parser.add_argument('--sample', dest='sample', action='store', default=0, help='Number of entries to keep (for sampling a bigger FASTA file)')
    parser.add_argument('--random', dest='random', action='store_true', help='Randomly shuffle entries before sampling (only if --sample is given). If not given, the first \'X\' samples are used.')
    if has_biopython:
        parser.add_argument('--weight_low', dest='weight_low', action='store', default=0, help='minimum molecular weight of protein')
        parser.add_argument('--weight_up', dest='weight_up', action='store', default=0, help='Maximum molecular weight of protein (use 0 for unlimited)')
    else:
        print("Warning: protein weight filtering not supported, as BioPython module is not installed.")

    ## argument parsing
    args = parser.parse_args()
    fileobj = args.infile
    fileoutobj = args.outfile
    sample_size = int(args.sample)
    sample_random = bool(args.random)
    mu = float(args.mu)
    sigma = float(args.sigma)
    if has_biopython:
        weight_low = float(args.weight_low)
        weight_up = float(args.weight_up)
        if weight_up <= 0:
            weight_up = sys.float_info.max

    ## matches the "[# ... #]" meta information inside a header
    meta_pattern = re.compile(r"\[# *(.*) *#\]")

    ## BioPython does not like some AA letters - they need replacement
    replacements = (
        ("U", "C"),  ## "U" (Selenocystein) -> "C" (Cystein)
        ("X", "P"),  ## "X" (unknown) -> "P" (Proline) [arbitrary choice - but weight of 115 is very close to averagine]
        ("B", "N"),  ## "B" (Asparagine or aspartic acid) -> "N" (Asparagine)
        ("Z", "Q"),  ## "Z" (Glutamine or glutamic acid) -> "Q" (Glutamine)
        ("J", "L"),  ## "J" (Leucine or Isoleucine) -> "L" (Leucine)
    )

    ## list of final entries
    fasta_entries = []

    for entry in nextEntry(fileobj):
        header = entry.header
        ## check if it contains 'intensity'?
        m = meta_pattern.search(header)
        if m:
            header_new = header.replace(m.group(0), "")  ## delete meta
            ## keep every meta element except an old intensity
            other = [element for element in m.group(1).split(',')
                     if element.find("intensity") == -1]
        else:
            header_new = header  ## nothing to replace
            other = []

        ## create new metainfo array
        other.append("intensity=" + str(sampleAbundance(mu, sigma)))

        entry.header = header_new.rstrip() + "[# " + ", ".join(other) + " #]"

        if has_biopython:
            sequence = "".join(entry.sequence.split("\n"))
            for ambiguous, substitute in replacements:
                sequence = sequence.replace(ambiguous, substitute)
            weight = ProteinAnalysis(sequence).molecular_weight()
            if not (weight_low <= weight <= weight_up):
                continue

        fasta_entries.append(entry.header + "\n" + entry.sequence)

        ## only read to sample size (the rest is thrown away anyways)
        if sample_size > 0 and not sample_random and len(fasta_entries) >= sample_size:
            break

    ## select subset (if required)
    if sample_size > 0:
        ## a real list, so that it can be shuffled in place
        indices = list(range(len(fasta_entries)))
        ## random sampling only makes sense if we take a subset
        if sample_random and sample_size < len(fasta_entries):
            random.shuffle(indices)
        ## slicing copes with fewer entries than requested
        fasta_entries = [fasta_entries[i] for i in indices[:sample_size]]

    ## write to file
    for entry in fasta_entries:
        fileoutobj.write(entry)
Пример #40
0



# Formatted molecular weights of the selected proteins; y_axis receives a 1
# for each of them. x_axis is the very same list object as data_mwt.
data_mwt = []
y_axis = []
x_axis = data_mwt


for record in SeqIO.parse(seq_file, "fasta"):
    temp_seq = str(record.seq)
    analysis_seq = ProteinAnalysis(temp_seq)

    description = record.description
    # Only records described as a ribosomal protein or subunit are reported.
    if not ("ribosomal protein" in description
            or "ribosomal subunit" in description):
        continue

    weight = analysis_seq.molecular_weight()
    if weight < 20000:
        weight_text = '%.2f' % weight
        data_mwt.append(weight_text)
        y_axis.append(1)

        # One line per hit: running number, description, weight, pI.
        text_out.setTextColor(QColor('blue'))
        text_out.append(",".join([
            str(len(data_mwt)),
            description,
            weight_text,
            '%.2f' % (analysis_seq.isoelectric_point()),
        ]))

    # Created for every ribosomal record, whatever its weight.
    csv_write = csv.writer(output)
Пример #41
0
from Bio.SeqUtils.ProtParam import ProteinAnalysis
from Bio.SeqUtils import ProtParamData
import sys
import json

# The request is a JSON object given as the first command-line argument. It
# carries the protein under "Sequence" and the requested outputs under
# "Options".
inp = json.loads(sys.argv[1])

seq = inp["Sequence"]
options = inp["Options"]

X = ProteinAnalysis(seq)

data = dict()

if "MW" in options:
    data["MW"] = X.molecular_weight()

if "EC280" in options:
    aa_count = X.count_amino_acids()
    # 1490 per tyrosine and 5500 per tryptophan.
    ec280 = 1490 * aa_count["Y"] + 5500 * aa_count["W"]
    if "hasDisulfide" in options:
        # 125 per disulfide bond, i.e. per PAIR of cysteines. An unpaired
        # cysteine contributes nothing, so an odd count is rounded down
        # instead of being credited half a bond.
        ec280 += 125.0 * (aa_count["C"] // 2)
    data["EC280"] = ec280

if "PI" in options:
    data["PI"] = X.isoelectric_point()

if "AACont" in options:
    ratios = X.get_amino_acids_percent()
    # Fractions converted to percentages.
    data["AACont"] = {aa: ratios[aa] * 100. for aa in ratios}

# Function-call form of print: valid on both Python 2 and Python 3.
print(json.dumps(data))