def return_equal_charge_concentrations(protein_file,
                                       RNA_file,
                                       protein_conc=1000,
                                       count_histidine=0,
                                       count_gfp=1):
    """Map each RNA record to ``protein_conc * charge / len(rna)``.

    Args:
        protein_file: FASTA source handed to ``SeqIO.parse`` for the protein.
        RNA_file: FASTA source handed to ``SeqIO.parse`` for the RNAs.
        protein_conc: Scale factor applied to the protein charge.
        count_histidine: When truthy, add 0.5 per histidine (fraction of 'H'
            times sequence length) to the charge.
        count_gfp: When truthy, add the (countPos - countNeg) of the
            hard-coded sequence bound to the GFP object below.

    Returns:
        dict keyed by the RNA record id up to the first '(' with the value
        ``protein_conc * charge / len(rna.seq)``.
    """
    protein_records = list(SeqIO.parse(protein_file, 'fasta'))
    rna_records = list(SeqIO.parse(RNA_file, 'fasta'))

    # The charge is reassigned (not accumulated) on every pass, so only the
    # last record of the protein file contributes.
    # NOTE(review): presumably the file holds a single protein - confirm.
    net_charge = 0
    for record in protein_records:
        protein_obj = SequenceParameters(str(record.seq))
        net_charge = protein_obj.get_countPos() - protein_obj.get_countNeg()

    gfp_obj = SequenceParameters(
        "MVSKGEELFTGVVPILVELDGDVNGHKFSVSGEGEGDATYGKLTLKFICTTGKLPVPWPTLVTTLTYGVQCFSRYPDHMKQHDFFKSAMPEGYVQERTIFFKDDGNYKTRAEVKFEGDTLVNRIELKGIDFKEDGNILGHKLEYNYNSHNVYIMADKQKNGIKVNFKIRHNIEDGSVQLADHYQQNTPIGDGPVLLPDNHYLSTQSALSKDPNEKRDHMVLLEFVTAAGITLGMDELYK"
    )

    if count_histidine:
        # protein_obj is the object built for the last protein record above
        histidine_fraction = protein_obj.get_amino_acid_fractions()['H']
        net_charge = net_charge + 0.5 * histidine_fraction * (
            protein_obj.get_length())
    if count_gfp:
        gfp_charge = gfp_obj.get_countPos() - gfp_obj.get_countNeg()
        net_charge = net_charge + gfp_charge

    return {
        rna.id.split('(')[0]: protein_conc * net_charge / len(rna.seq)
        for rna in rna_records
    }
Пример #2
0
def get_features_charge(seq):
    """Collect the charge-related features of ``seq`` into one dictionary.

    The values come from the module-level helpers (``FCR``, ``NCPR``, ...)
    and from a ``SequenceParameters`` object built on the same sequence.
    """
    params = SequenceParameters(seq)

    features = {}
    features['FCR'] = FCR(seq)
    features['NCPR'] = NCPR(seq)
    features['net_charge'] = net_charge(seq)
    features['net_charge_P'] = net_charge_P(seq)
    features['RK_ratio'] = RK_ratio(seq)
    features['ED_ratio'] = ED_ratio(seq)
    features['kappa'] = params.get_kappa()
    features['omega'] = params.get_Omega()
    features['SCD'] = params.get_SCD()
    return features
Пример #3
0
def get_features_physchem(seq):
    """Collect the physicochemical features of ``seq`` into one dictionary.

    Residue-class fractions and the isoelectric point come from module-level
    helpers; hydropathy and PPII propensity come from ``SequenceParameters``;
    'loglen' is the base-2 logarithm of the sequence length.
    """
    params = SequenceParameters(seq)

    features = {}
    features['fraction_acidic'] = fraction_acidic(seq)
    features['fraction_basic'] = fraction_basic(seq)
    features['fraction_aliphatic'] = fraction_aliphatic(seq)
    features['fraction_aromatic'] = fraction_aromatic(seq)
    features['fraction_polar'] = fraction_polar(seq)
    features['fraction_disorder'] = fraction_disorder(seq)
    features['fraction_chainexp'] = fraction_chainexp(seq)
    features['hydropathy'] = params.get_uversky_hydropathy()
    features['isopoint'] = predict_isoelectric_point(seq)
    features['loglen'] = log2(len(seq))
    features['PPII_propensity'] = params.get_PPII_propensity()
    return features
Пример #4
0
def feat_charge(seq):
    """Return the charge-related features of ``seq`` keyed by feature name."""
    cider = SequenceParameters(seq)
    return dict(
        FCR=FCR(seq),
        NCPR=NCPR(seq),
        net_charge=net_charge(seq),
        net_charge_P=net_charge_P(seq),
        RK_ratio=RK_ratio(seq),
        ED_ratio=ED_ratio(seq),
        kappa=cider.get_kappa(),
        omega=cider.get_Omega(),
        SCD=cider.get_SCD(),
    )
Пример #5
0
def feat_physchem(seq):
    """Return the physicochemical features of ``seq`` keyed by feature name.

    'loglen' is log2 of the sequence length and 'iso_point' is taken from
    ``ProteinAnalysis(seq).isoelectric_point()``.
    """
    cider = SequenceParameters(seq)
    return dict(
        frac_acidic=frac_acidic(seq),
        frac_basic=frac_basic(seq),
        frac_aliphatic=frac_aliphatic(seq),
        frac_chainexp=frac_chainexp(seq),
        frac_polar=frac_polar(seq),
        frac_aromatic=frac_aromatic(seq),
        frac_disorder=frac_disorder(seq),
        loglen=log2(len(seq)),
        hydropathy=cider.get_uversky_hydropathy(),
        iso_point=ProteinAnalysis(seq).isoelectric_point(),
        PPII_prop=cider.get_PPII_propensity(),
    )
Пример #6
0
    def test_get_reduced_alphabet_sequence_predefined_alphabets(self):
        """Check size and length of reduced-alphabet sequences for each predefined alphabet."""
        predefined_sizes = [2,3,4,5,6,8,10,11, 12,15,18,20]

        for alphabet_size in predefined_sizes:

            # SP_60 holds all 20 amino acids, so its reduced sequence must use
            # exactly alphabet_size distinct symbols
            reduced_full = self.SP_60.get_reduced_alphabet_sequence(alphabet_size)[0]
            self.assertEqual(len(set(reduced_full)), alphabet_size)

            # NOTE(review): if generate_random_sequence returns one string,
            # the loop below walks it residue by residue - confirm intent.
            random_seqs = testTools.generate_random_sequence(10)

            for candidate in random_seqs:

                # reducing the alphabet must not change the sequence length
                reduced = SequenceParameters(candidate).get_reduced_alphabet_sequence(alphabet_size)[0]
                self.assertEqual(len(reduced), len(candidate))
Пример #7
0
    def test_get_reduced_alphabet_sequence_predefined_alphabets(self):
        """Verify reduced-alphabet output for every predefined alphabet size."""
        for n_symbols in [2,3,4,5,6,8,10,11, 12,15,18,20]:

            # A sequence containing all 20 amino acids must collapse onto
            # exactly n_symbols distinct residues.
            symbols_used = set(self.SP_60.get_reduced_alphabet_sequence(n_symbols)[0])
            self.assertEqual(len(symbols_used), n_symbols)

            # NOTE(review): generate_random_sequence may return a single
            # string, in which case each item below is one residue - confirm.
            random_seqs = testTools.generate_random_sequence(10)

            for original in random_seqs:

                # build the object and compare lengths before/after reduction
                seq_obj = SequenceParameters(original)
                converted = seq_obj.get_reduced_alphabet_sequence(n_symbols)[0]
                self.assertEqual(len(converted), len(original))
Пример #8
0
def encode(cdr3):
    """Encode a CDR3 string as a per-residue string of group letters.

    'X' is replaced by 'V', then the first residue and the last three are
    dropped. The trimmed sequence is passed to
    ``get_linear_sequence_composition`` with ``blobLen=1`` and the
    module-level ``grps``. Positions where track 0 equals 1 become "N",
    track 1 "P", tracks 2 and 3 "H"; anything left as "0.0" becomes "X".
    """
    trimmed = cdr3.replace("X", "V")[1:-3]
    composition = SequenceParameters(str(trimmed)).get_linear_sequence_composition(
        blobLen=1, grps=grps)
    tracks = composition[1]

    # Seeding with track 0 makes np.where yield strings, which is why
    # unmatched positions later compare equal to "0.0".
    letters = np.where(tracks[0]==1, "N", tracks[0])
    for track_index, letter in ((1, "P"), (2, "H"), (3, "H")):
        letters = np.where(tracks[track_index]==1, letter, letters)
    letters = np.where(letters=="0.0", "X", letters)

    return "".join(letters)
Пример #9
0
def feat_complexity(seq):
    """Return 'wf_complexity' for ``seq`` merged with ``rep_fractions(seq)``."""
    # get_linear_complexity returns a 2xN matrix (positions and complexity
    # vector); with blobLen equal to the sequence length, the first
    # complexity entry is taken as the whole-sequence value.
    complexity_matrix = SequenceParameters(seq).get_linear_complexity(
        blobLen=len(seq))
    features = {'wf_complexity': complexity_matrix[1][0]}

    # Entries from rep_fractions win on a key clash, as in the ** unpacking form
    features.update(rep_fractions(seq))
    return features
Пример #10
0
def get_features_complexity(seq):
    """Return the sequence-complexity features of ``seq`` as a dictionary.

    One 'repeat_<residues>' entry is produced per residue set, using
    ``fraction_regex`` with a pattern matching runs of two or more residues
    from that set; 'wf_complexity' is added last.
    """
    residue_sets = ('Q', 'N', 'S', 'G', 'E', 'D', 'K', 'R', 'P',
                    'QN', 'RG', 'FG', 'SG', 'SR', 'KAP', 'PTS')

    features = {
        'repeat_' + residues: fraction_regex(seq, f'[{residues}]' + '{2,}')
        for residues in residue_sets
    }

    # get_linear_complexity returns a 2xN matrix (complexity vector and
    # residue positions); with blobLen=len(seq) the first complexity entry is used.
    complexity_matrix = SequenceParameters(seq).get_linear_complexity(blobLen=len(seq))
    features['wf_complexity'] = complexity_matrix[1][0]

    return features
Пример #11
0
    def test_get_linear_WF_complexity(self):
        """Smoke-test ``get_linear_complexity('WF', ...)`` over a grid of keyword values.

        No values are asserted; the test passes when none of the calls raise.
        The original ran the window-size (blobLen) sweep twice through a
        copy-pasted loop; each sweep now runs exactly once.
        """
        random_seqs = testTools.generate_random_sequence_list(10, minLen=15, maxLen=500)

        sweep_values = [1,2,3,4,5,10]
        sweeps = [
            # general test
            ('alphabetSize', [5]),
            # all predefined alphabets
            ('alphabetSize', [2,3,4,5,6,8,10,11,12,15,18,20]),
            # window sizes
            ('blobLen', sweep_values),
            # step sizes
            ('stepSize', sweep_values),
            # word sizes
            ('wordSize', sweep_values),
        ]

        for keyword, values in sweeps:
            for value in values:
                for seq in random_seqs:
                    SequenceParameters(seq).get_linear_complexity('WF', **{keyword: value})
Пример #12
0
def get_polyamp_regions(inputFasta,outputFile):
    """Write one tab-separated row per charge-balanced region found in each protein.

    Every FASTA record is scored with ``score_polyamp``; regions holding both
    positive (R/K) and negative (D/E) residues in a ratio of at least
    ``charge_bal`` in both directions get a row. Rows whose kappa exceeds the
    module-level ``kappa`` threshold are written as an empty placeholder.

    Reads the module-level names ``window_size``, ``cut_off``, ``charge_bal``
    and ``kappa``.

    NOTE(review): the 'rU'/'wb' modes and the bare ``/`` divisions look like
    Python 2 idioms - confirm the target interpreter before porting.
    """
    header = ['accession','name','highly charged region','r/(r+k)','d/(d+e)','length','charge_density','kappa']
    # per-residue substitutions applied before computing kappa
    substitutions = {'X': 'S', 'S': 'T', 'U': 'T'}

    with open(inputFasta,'rU') as infile, open(outputFile,'wb') as outf:
        reader = SeqIO.parse(infile,'fasta')
        writer = csv.writer(outf, delimiter='\t')
        writer.writerow(header)
        for record in reader:
            # accession is the second '|'-separated field of the record id
            acc = record.id.split('|')[1]
            # name is whatever follows '=' in the third-from-last description token
            name_token = record.description.split()[-3]
            name = name_token[name_token.find('=')+1:]
            regions = score_polyamp(str(record.seq),window_size,cut_off*window_size)
            for region in regions:
                r,k = region.count('R'), region.count('K')
                d,e = region.count('D'), region.count('E')
                pos = r+k
                neg = d+e
                balanced = pos>0 and neg>0 and pos/neg>=charge_bal and neg/pos>=charge_bal
                if not balanced:
                    continue

                length = len(region)
                r_ratio = r/pos
                d_ratio = d/neg
                region_noX = ''.join(substitutions.get(ch, ch) for ch in region)
                region_kappa = SequenceParameters(region_noX).get_kappa_X(grp1 = ['E','D'], grp2 = ['K','R'])
                if region_kappa<=kappa:
                    row = [acc,name, region, str(r_ratio),str(d_ratio), length,(pos+neg)/length ,str(region_kappa)]
                else:
                    # balanced region whose kappa is above the threshold
                    row = [acc,name, '', 0,0, 0,0,1]
                writer.writerow(row)
Пример #13
0
def get_polyamp_regions(inputFasta,outputFile):
    """Look for polyampholyte regions in each protein and write them to a TSV file.

    For each FASTA record the candidate regions come from ``score_polyamp``.
    A region is reported when it has both R/K and D/E residues and their
    counts are within ``charge_bal`` of each other in both directions. The
    row carries the region and its statistics if its kappa is at most the
    module-level ``kappa``, otherwise an empty placeholder row is written.

    Depends on the module-level ``window_size``, ``cut_off``, ``charge_bal``
    and ``kappa``.

    NOTE(review): file modes 'rU'/'wb' and plain ``/`` division suggest
    Python 2 - verify before running under Python 3.
    """
    columns = ['accession','name','highly charged region','r/(r+k)','d/(d+e)','length','charge_density','kappa']
    # residues rewritten before the kappa calculation
    swap = {'X': 'S', 'S': 'T', 'U': 'T'}

    with open(inputFasta,'rU') as infile, open(outputFile,'wb') as outf:
        reader = SeqIO.parse(infile,'fasta')
        writer = csv.writer(outf, delimiter='\t')
        writer.writerow(columns)
        for entry in reader:
            id_fields = entry.id.split('|')
            acc = id_fields[1]
            tagged_name = entry.description.split()[-3]
            name = tagged_name[tagged_name.find('=')+1:]
            hits = score_polyamp(str(entry.seq),window_size,cut_off*window_size)
            for hit in hits:
                r,k = hit.count('R'), hit.count('K')
                d,e = hit.count('D'), hit.count('E')
                neg = d+e
                pos = r+k
                if pos>0 and neg>0 and pos/neg>=charge_bal and neg/pos>=charge_bal:
                    # placeholder row, used when kappa is above the threshold
                    to_write = [acc,name, '', 0,0, 0,0,1]
                    hit_len = len(hit)
                    r_ratio = r/pos
                    d_ratio = d/neg
                    cleaned = ''.join([swap.get(residue, residue) for residue in hit])
                    hit_kappa = SequenceParameters(cleaned).get_kappa_X(grp1 = ['E','D'], grp2 = ['K','R'])
                    if hit_kappa<=kappa:
                        to_write = [acc,name, hit, str(r_ratio),str(d_ratio), hit_len,(pos+neg)/hit_len ,str(hit_kappa)]
                    writer.writerow(to_write)
Пример #14
0
def apply_attribute_kappa(proteome):
    """Attach a 'kappa' attribute to every protein in ``proteome``.

    The proteome is first validated with ``interface_tools.check_proteome``.
    For each protein, kappa is computed from ``protein.sequence`` with
    ``SequenceParameters(...).get_kappa()`` and stored through
    ``protein.add_attribute('kappa', ...)``.

    The earlier comments described NCPR and an NCPR track (copied from a
    sibling function); they now describe what the code does. The unused
    local copy of the sequence was dropped.
    """
    interface_tools.check_proteome(proteome,
                                   'apply_attribute_kappa (apis.localcider)')

    for protein in proteome:

        # convert the protein sequence into its kappa value
        kappa = SequenceParameters(protein.sequence).get_kappa()

        # store kappa as a protein attribute
        protein.add_attribute('kappa', kappa)
Пример #15
0
 def in_out_kappa(self):
     """Print kappa for the in-motif and out-of-motif parts of each selected sequence.

     Rows of the tab-separated table at ``self.train_fpi`` with ``y == 0``
     are used. Each sequence is split by ``motif_seq.LcSeq(...).seq_in_motif()``
     and the two kappa values are printed, followed by an empty line.
     """
     table = pd.read_csv(self.train_fpi, sep='\t', index_col=0)
     selected = table[table['y'] == 0]
     for sequence in list(selected['Sequence']):
         motif = motif_seq.LcSeq(sequence, self.k, self.lca, 'lca')
         in_seq, out_seq = motif.seq_in_motif()
         for part in (in_seq, out_seq):
             print(SequenceParameters(part).get_kappa())
         print('')
Пример #16
0
def get_kappa(sequence):
    """Return kappa for an amino acid sequence.

    Kappa ranges between 0 and 1; per the original note, low kappa
    corresponds to extended and high kappa to compacted conformations.
    """
    # Build the sequence object and query kappa in one step
    return SequenceParameters(sequence).get_kappa()
Пример #17
0
from localcider.sequenceParameters import SequenceParameters
from natsort import natsorted
import matplotlib.pyplot as plt

#Assigning path and fasta files to be analyzed
# NOTE(review): os is used here but no `import os` is visible in this snippet - confirm it is imported.
path = "ecoli/input/"
#Sorting the file names in natural order
filelist = natsorted(os.listdir(path))

#First cider command: load every fasta file into a SequenceParameters object
list_of_SeqObjs = [SequenceParameters(sequenceFile=path + name) for name in filelist]

#Cider commands to get the parameters of interest, one output line per input file.
#The with-block closes the output file even if one of the calls raises, and
#zip pairs each object with its file name directly instead of searching the
#list with index() on every iteration.
with open("cider_ecoli.dat", "w") as f:
    for name, obj in zip(filelist, list_of_SeqObjs):
        f_pos = obj.get_fraction_positive()
        f_neg = obj.get_fraction_negative()
        mhyd = obj.get_uversky_hydropathy()
        mnc = obj.get_mean_net_charge(pH=7)
        kappa = obj.get_kappa()
        f.write("%s %f %f %f %f %f\n" % (name, f_pos, f_neg, mhyd, mnc, kappa))

#For plotting mean net charge vs mean hydropathy
Пример #18
0
    print(file)
    contents = open(file).read().split('>')
    #can't use SeqParam(seqfile=file)
    #because all the sequences are appended to each other.
    output = open(file + "_charge", 'w+')
    for protein in [x for x in contents if x]:

        header = protein[0:protein.index('\n')]
        seq = protein[protein.index('\n'):-1]

        print(header)
        if ('X' in seq):
            print("Warning: unspecified protein encountered.")
        seq = seq.replace('X', '')

        seq_param = SequenceParameters(seq)

        #mean_net_charge is always positive, whereas
        # net_charge_per_residue is alternating
        net_charge = seq_param.get_NCPR(pH=7.0) * seq_param.get_length()
        print(net_charge)

        output.write(header)
        output.write(", ")
        output.write(str(7.0))
        output.write(str(", "))
        output.write(str(seq_param.get_molecular_weight()))
        output.write(str(", "))
        output.write(str(net_charge))
        output.write('\n')
Пример #19
0
class TestComplexityFunctions(unittest.TestCase):
    """Tests for reduced-alphabet sequences and linear complexity of SequenceParameters.

    The three per-algorithm smoke tests share one helper instead of three
    copy-pasted bodies (each of which ran the window-size sweep twice), and
    the reference-value check compares absolute differences so that errors
    in either direction are caught.
    """

    # alphabet sizes exercised by the reduced-alphabet and complexity tests
    ALPHABET_SIZES = [2,3,4,5,6,8,10,11,12,15,18,20]

    # values swept for window size, step size and word size
    SWEEP_VALUES = [1,2,3,4,5,10]

    def setUp(self):
        self.SP_60 = SequenceParameters('QWERTYIPASDFGHKLCVNMQWERTYIPASDFGHKLCVNMQWERTYIPASDFGHKLCVNM')
        self.SP_10 = SequenceParameters('KDNIKHVPGG')

    def _run_complexity_sweeps(self, complexity_type):
        """Call get_linear_complexity for one algorithm over all keyword sweeps; passes if nothing raises."""
        random_seqs = testTools.generate_random_sequence_list(10, minLen=15, maxLen=500)

        sweeps = [
            # general test
            ('alphabetSize', [5]),
            # all alphabets
            ('alphabetSize', self.ALPHABET_SIZES),
            # window sizes
            ('blobLen', self.SWEEP_VALUES),
            # step sizes
            ('stepSize', self.SWEEP_VALUES),
            # word sizes
            ('wordSize', self.SWEEP_VALUES),
        ]
        for keyword, values in sweeps:
            for value in values:
                for seq in random_seqs:
                    SequenceParameters(seq).get_linear_complexity(complexity_type, **{keyword: value})

    def test_get_reduced_alphabet_sequence_predefined_alphabets(self):
        for i in self.ALPHABET_SIZES:

            # checks that a sequence with all 20 amino acids returns a reduced-alphabet sequence
            # made up of exactly i distinct residues
            self.assertEqual(len(set(self.SP_60.get_reduced_alphabet_sequence(i)[0])), i)

            # NOTE(review): if generate_random_sequence returns a single
            # string, the loop below iterates residue by residue - confirm.
            random_seqs = testTools.generate_random_sequence(10)

            for j in random_seqs:

                # build obj
                SP = SequenceParameters(j)

                # check reduced alphabet sequence length matches
                self.assertEqual(len(SP.get_reduced_alphabet_sequence(i)[0]), len(j))

    def test_get_linear_WF_complexity(self):
        self._run_complexity_sweeps('WF')

    def test_get_linear_LC_complexity(self):
        self._run_complexity_sweeps('LC')

    def test_get_linear_LZW_complexity(self):
        self._run_complexity_sweeps('LZW')

    def test_complexity_values(self):

        out = {}

        # WF alphabet size = 4
        out['WF'] = np.array([ 0.92321967,  0.92321967,  0.92321967,  0.88048202,  0.88048202,
                               0.92321967,  0.92321967,  0.92321967,  0.94773092,  0.92321967,
                               0.86096405,  0.86096405,  0.86096405,  0.68048202,  0.68048202,
                               0.86096405,  0.92321967,  0.92321967,  0.96096405,  0.9854753 ,
                               0.92321967,  0.92321967,  0.92321967,  0.88048202,  0.88048202,
                               0.92321967,  0.92321967,  0.92321967,  0.94773092,  0.92321967,
                               0.86096405,  0.86096405,  0.86096405,  0.68048202,  0.68048202,
                               0.86096405,  0.92321967,  0.92321967,  0.96096405,  0.9854753 ,
                               0.92321967,  0.92321967,  0.92321967,  0.88048202,  0.88048202,
                               0.92321967,  0.92321967,  0.92321967,  0.94773092,  0.92321967,
                               0.86096405])

        # LZW alphabet size = 4
        out['LZW'] = np.array([ 0.9,  0.9,  0.9,  0.9,  0.9,  0.9,  0.9,  0.9,  0.8,  0.8,  0.8,
                                0.8,  0.8,  0.8,  0.8,  0.9,  0.9,  0.9,  1. ,  1. ,  0.9,  0.9,
                                0.9,  0.9,  0.9,  0.9,  0.9,  0.9,  0.8,  0.8,  0.8,  0.8,  0.8,
                                0.8,  0.8,  0.9,  0.9,  0.9,  1. ,  1. ,  0.9,  0.9,  0.9,  0.9,
                                0.9,  0.9,  0.9,  0.9,  0.8,  0.8,  0.8])

        # LC alphabet size = 2
        out['LC'] = np.array([ 0.625,  0.5  ,  0.5  ,  0.5  ,  0.5  ,  0.5  ,  0.625,  0.75 ,
                            0.625,  0.75 ,  0.625,  0.75 ,  0.875,  0.75 ,  0.625,  0.625,
                            0.625,  0.625,  0.75 ,  0.75 ,  0.625,  0.5  ,  0.5  ,  0.5  ,
                            0.5  ,  0.5  ,  0.625,  0.75 ,  0.625,  0.75 ,  0.625,  0.75 ,
                            0.875,  0.75 ,  0.625,  0.625,  0.625,  0.625,  0.75 ,  0.75 ,
                            0.625,  0.5  ,  0.5  ,  0.5  ,  0.5  ,  0.5  ,  0.625,  0.75 ,
                            0.625,  0.75 ,  0.625])

        # Compare absolute differences: a signed "actual - expected < tol"
        # check would accept any value far below the reference.
        for complexity_type, alphabet_size in (('WF', 4), ('LZW', 4), ('LC', 2)):
            actual = self.SP_60.get_linear_complexity(complexity_type, alphabetSize=alphabet_size)[1]
            self.assertTrue((np.abs(actual - out[complexity_type]) < 0.00001).all())
Пример #20
0
 def setUp(self):
     """Build the two SequenceParameters fixtures shared by the tests."""
     # 60-residue fixture: the 20 amino acids repeated three times
     sixty_mer = 'QWERTYIPASDFGHKLCVNMQWERTYIPASDFGHKLCVNMQWERTYIPASDFGHKLCVNM'
     # 10-residue fixture
     ten_mer = 'KDNIKHVPGG'
     self.SP_60 = SequenceParameters(sixty_mer)
     self.SP_10 = SequenceParameters(ten_mer)
Пример #21
0
class TestComplexityFunctions(unittest.TestCase):
    """Tests for reduced-alphabet sequences and linear complexity of SequenceParameters.

    Changes from the previous version: the ``assertEquals`` alias (removed
    in Python 3.12) is no longer used; the reference-value check compares
    absolute differences so deviations in either direction fail; and the
    three per-algorithm smoke tests share one helper instead of three
    copy-pasted bodies that each ran the window-size sweep twice.
    """

    # alphabet sizes exercised by the reduced-alphabet and complexity tests
    ALPHABET_SIZES = [2,3,4,5,6,8,10,11,12,15,18,20]

    # values swept for window size, step size and word size
    SWEEP_VALUES = [1,2,3,4,5,10]

    def setUp(self):
        self.SP_60 = SequenceParameters('QWERTYIPASDFGHKLCVNMQWERTYIPASDFGHKLCVNMQWERTYIPASDFGHKLCVNM')
        self.SP_10 = SequenceParameters('KDNIKHVPGG')

    def _check_linear_complexity_runs(self, complexity_type):
        """Run get_linear_complexity for one algorithm across every keyword sweep; passes if nothing raises."""
        random_seqs = testTools.generate_random_sequence_list(10, minLen=15, maxLen=500)

        # general test
        for seq in random_seqs:
            SequenceParameters(seq).get_linear_complexity(complexity_type, alphabetSize=5)

        keyword_sweeps = (
            ('alphabetSize', self.ALPHABET_SIZES),  # all alphabets
            ('blobLen', self.SWEEP_VALUES),         # window sizes
            ('stepSize', self.SWEEP_VALUES),        # step sizes
            ('wordSize', self.SWEEP_VALUES),        # word sizes
        )
        for keyword, values in keyword_sweeps:
            for value in values:
                for seq in random_seqs:
                    SequenceParameters(seq).get_linear_complexity(complexity_type, **{keyword: value})

    def test_get_reduced_alphabet_sequence_predefined_alphabets(self):
        for i in self.ALPHABET_SIZES:

            # checks that a sequence with all 20 amino acids returns a reduced-alphabet sequence
            # made up of exactly i distinct residues
            self.assertEqual(len(set(self.SP_60.get_reduced_alphabet_sequence(i)[0])), i)

            # NOTE(review): if generate_random_sequence returns a single
            # string, the loop below iterates residue by residue - confirm.
            random_seqs = testTools.generate_random_sequence(10)

            for j in random_seqs:

                # build obj
                SP = SequenceParameters(j)

                # check reduced alphabet sequence length matches
                self.assertEqual(len(SP.get_reduced_alphabet_sequence(i)[0]), len(j))

    def test_get_linear_WF_complexity(self):
        self._check_linear_complexity_runs('WF')

    def test_get_linear_LC_complexity(self):
        self._check_linear_complexity_runs('LC')

    def test_get_linear_LZW_complexity(self):
        self._check_linear_complexity_runs('LZW')

    def test_complexity_values(self):

        out = {}

        # WF alphabet size = 4
        out['WF'] = np.array([ 0.92321967,  0.92321967,  0.92321967,  0.88048202,  0.88048202,
                               0.92321967,  0.92321967,  0.92321967,  0.94773092,  0.92321967,
                               0.86096405,  0.86096405,  0.86096405,  0.68048202,  0.68048202,
                               0.86096405,  0.92321967,  0.92321967,  0.96096405,  0.9854753 ,
                               0.92321967,  0.92321967,  0.92321967,  0.88048202,  0.88048202,
                               0.92321967,  0.92321967,  0.92321967,  0.94773092,  0.92321967,
                               0.86096405,  0.86096405,  0.86096405,  0.68048202,  0.68048202,
                               0.86096405,  0.92321967,  0.92321967,  0.96096405,  0.9854753 ,
                               0.92321967,  0.92321967,  0.92321967,  0.88048202,  0.88048202,
                               0.92321967,  0.92321967,  0.92321967,  0.94773092,  0.92321967,
                               0.86096405])

        # LZW alphabet size = 4
        out['LZW'] = np.array([ 0.9,  0.9,  0.9,  0.9,  0.9,  0.9,  0.9,  0.9,  0.8,  0.8,  0.8,
                                0.8,  0.8,  0.8,  0.8,  0.9,  0.9,  0.9,  1. ,  1. ,  0.9,  0.9,
                                0.9,  0.9,  0.9,  0.9,  0.9,  0.9,  0.8,  0.8,  0.8,  0.8,  0.8,
                                0.8,  0.8,  0.9,  0.9,  0.9,  1. ,  1. ,  0.9,  0.9,  0.9,  0.9,
                                0.9,  0.9,  0.9,  0.9,  0.8,  0.8,  0.8])

        # LC alphabet size = 2
        out['LC'] = np.array([ 0.625,  0.5  ,  0.5  ,  0.5  ,  0.5  ,  0.5  ,  0.625,  0.75 ,
                            0.625,  0.75 ,  0.625,  0.75 ,  0.875,  0.75 ,  0.625,  0.625,
                            0.625,  0.625,  0.75 ,  0.75 ,  0.625,  0.5  ,  0.5  ,  0.5  ,
                            0.5  ,  0.5  ,  0.625,  0.75 ,  0.625,  0.75 ,  0.625,  0.75 ,
                            0.875,  0.75 ,  0.625,  0.625,  0.625,  0.625,  0.75 ,  0.75 ,
                            0.625,  0.5  ,  0.5  ,  0.5  ,  0.5  ,  0.5  ,  0.625,  0.75 ,
                            0.625,  0.75 ,  0.625])

        # Compare absolute differences: a signed "actual - expected < tol"
        # check would accept any value far below the reference.
        for complexity_type, alphabet_size in (('WF', 4), ('LZW', 4), ('LC', 2)):
            actual = self.SP_60.get_linear_complexity(complexity_type, alphabetSize=alphabet_size)[1]
            self.assertTrue((np.abs(actual - out[complexity_type]) < 0.00001).all())
Пример #22
0
def _score_sequences(sequences, getter_name):
    """Return one score per sequence via the named SequenceParameters getter; 0 on failure."""
    scores = []
    for sequence in sequences:
        try:
            scores.append(getattr(SequenceParameters(str(sequence)), getter_name)())
        except Exception:
            # Best-effort scoring: sequences that cannot be analysed score 0.
            # Unlike a bare except, this lets KeyboardInterrupt and
            # SystemExit propagate.
            scores.append(0)
    return scores


def aa_analysis(df, property):
    """Score both sides of each 'Amino_acids' change and return the scores and their delta.

    Args:
        df: DataFrame with an 'Amino_acids' column holding 'AA1/AA2'
            strings; rows where the column is null are dropped.
        property: "ncpr" (scored with ``get_NCPR``) or "uversky_hydropathy"
            (scored with ``get_uversky_hydropathy``). For any other value
            ``df`` is returned unchanged.

    Returns:
        For a recognised property, a DataFrame with the columns 'AA1_Iso',
        'AA2_Iso' and 'AA_Iso_Delta' (AA2 minus AA1). The '_Iso' names are
        kept for compatibility although the values are not isoelectric points.
    """
    if property == "ncpr":
        getter_name = "get_NCPR"
    elif property == "uversky_hydropathy":
        getter_name = "get_uversky_hydropathy"
    else:
        return df

    # Copy the filtered frame so the column assignments below act on an
    # independent object rather than on a slice of the caller's DataFrame.
    df = df[pd.notnull(df['Amino_acids'])].copy()
    df[["AA1","AA2"]] = df['Amino_acids'].str.split('/',expand=True)

    df["AA1_Iso"] = _score_sequences(df["AA1"], getter_name)
    df["AA2_Iso"] = _score_sequences(df["AA2"], getter_name)
    df["AA_Iso_Delta"] = df["AA2_Iso"] - df["AA1_Iso"]
    return df[["AA1_Iso", "AA2_Iso", "AA_Iso_Delta"]]
Пример #23
0
 def setUp(self):
     """Create the SequenceParameters objects used across the test methods."""
     fixtures = (
         # the 20 amino acids repeated three times (60 residues)
         ('SP_60', 'QWERTYIPASDFGHKLCVNMQWERTYIPASDFGHKLCVNMQWERTYIPASDFGHKLCVNM'),
         # short 10-residue sequence
         ('SP_10', 'KDNIKHVPGG'),
     )
     for attribute, sequence in fixtures:
         setattr(self, attribute, SequenceParameters(sequence))
Пример #24
0
#!/usr/bin/env python

import sys
import os
import json
import numpy as np
from pandas import *

# import the relevant code
import localcider
from localcider.sequenceParameters import SequenceParameters

# create an empty list
list_of_SeqObjs = []

with open("FUS_mammals.seq") as f:
    # for each ortholog in the file (one sequence per line)
    for seq in f:
        try:
            # Lines read from a file keep their trailing newline, which is
            # not part of the sequence, so strip surrounding whitespace.
            list_of_SeqObjs.append(SequenceParameters(seq.strip()))
        except localcider.SequenceFileParserException:
            # if we encounter a file parsing error just skip that sequence
            # (the handler body was previously mis-indented with mixed
            # spaces and tabs, which is a syntax error)
            continue

# for each sequence object, print its kappa value
for obj in list_of_SeqObjs:
    # the call form of print works on Python 3 and, for one argument, on Python 2
    print(obj.get_kappa())