def return_equal_charge_concentrations(protein_file, RNA_file, protein_conc=1000, count_histidine=0, count_gfp=1):
    """Return, per RNA record, ``protein_conc * net_positive / len(rna.seq)``.

    ``net_positive`` is ``get_countPos() - get_countNeg()`` of the protein,
    optionally increased by half the histidine count and by the same
    positive-minus-negative count of the hard-coded GFP sequence.

    Args:
        protein_file: FASTA source handed to ``SeqIO.parse``.
        RNA_file: FASTA source handed to ``SeqIO.parse``.
        protein_conc: Multiplier applied to the net positive count.
        count_histidine: When truthy, add ``0.5 * fraction('H') * length``.
        count_gfp: When truthy, add the GFP positive-minus-negative count.

    Returns:
        dict mapping the part of each RNA id before the first ``'('`` to the
        computed value.

    NOTE(review): every protein record overwrites the running value, so only
    the last record in ``protein_file`` contributes. With an empty protein
    file and ``count_histidine`` set, ``protein_obj`` is unbound. Both
    behaviours are kept exactly as found.
    """
    protein_records = list(SeqIO.parse(protein_file, 'fasta'))
    rna_records = list(SeqIO.parse(RNA_file, 'fasta'))

    net_positive = 0
    for record in protein_records:
        protein_obj = SequenceParameters(str(record.seq))
        net_positive = protein_obj.get_countPos() - protein_obj.get_countNeg()

    gfp = SequenceParameters(
        "MVSKGEELFTGVVPILVELDGDVNGHKFSVSGEGEGDATYGKLTLKFICTTGKLPVPWPTLVTTLTYGVQCFSRYPDHMKQHDFFKSAMPEGYVQERTIFFKDDGNYKTRAEVKFEGDTLVNRIELKGIDFKEDGNILGHKLEYNYNSHNVYIMADKQKNGIKVNFKIRHNIEDGSVQLADHYQQNTPIGDGPVLLPDNHYLSTQSALSKDPNEKRDHMVLLEFVTAAGITLGMDELYK"
    )

    if count_histidine:
        # Histidine is weighted at half a positive residue.
        histidine_fraction = protein_obj.get_amino_acid_fractions()['H']
        net_positive = net_positive + 0.5 * histidine_fraction * protein_obj.get_length()

    if count_gfp:
        net_positive = net_positive + (gfp.get_countPos() - gfp.get_countNeg())

    return {
        rna.id.split('(')[0]: protein_conc * net_positive / len(rna.seq)
        for rna in rna_records
    }
def get_features_charge(seq):
    """Collect every charge-related feature of ``seq`` into one dictionary.

    Six values come from module-level helper functions called on the raw
    sequence; kappa, omega and SCD come from a ``SequenceParameters`` object
    built from the same sequence.
    """
    params = SequenceParameters(seq)

    # (dictionary key, helper applied directly to the sequence)
    sequence_helpers = (
        ('FCR', FCR),
        ('NCPR', NCPR),
        ('net_charge', net_charge),
        ('net_charge_P', net_charge_P),
        ('RK_ratio', RK_ratio),
        ('ED_ratio', ED_ratio),
    )
    features = {label: helper(seq) for label, helper in sequence_helpers}

    features['kappa'] = params.get_kappa()
    features['omega'] = params.get_Omega()
    features['SCD'] = params.get_SCD()
    return features
def get_features_physchem(seq):
    """Collect every physicochemical feature of ``seq`` into one dictionary.

    Residue-class fractions and the isoelectric point come from module-level
    helpers; hydropathy and PPII propensity come from a ``SequenceParameters``
    object; ``loglen`` is ``log2`` of the sequence length.
    """
    params = SequenceParameters(seq)

    features = {}
    features['fraction_acidic'] = fraction_acidic(seq)
    features['fraction_basic'] = fraction_basic(seq)
    features['fraction_aliphatic'] = fraction_aliphatic(seq)
    features['fraction_aromatic'] = fraction_aromatic(seq)
    features['fraction_polar'] = fraction_polar(seq)
    features['fraction_disorder'] = fraction_disorder(seq)
    features['fraction_chainexp'] = fraction_chainexp(seq)
    features['hydropathy'] = params.get_uversky_hydropathy()
    features['isopoint'] = predict_isoelectric_point(seq)
    features['loglen'] = log2(len(seq))
    features['PPII_propensity'] = params.get_PPII_propensity()
    return features
def feat_charge(seq):
    """Return a dictionary of charge-related features for ``seq``.

    Sequence-level values are computed by module-level helpers; kappa, omega
    and SCD are read from a ``SequenceParameters`` object.
    """
    params = SequenceParameters(seq)

    result = {}
    result['FCR'] = FCR(seq)
    result['NCPR'] = NCPR(seq)
    result['net_charge'] = net_charge(seq)
    result['net_charge_P'] = net_charge_P(seq)
    result['RK_ratio'] = RK_ratio(seq)
    result['ED_ratio'] = ED_ratio(seq)
    result['kappa'] = params.get_kappa()
    result['omega'] = params.get_Omega()
    result['SCD'] = params.get_SCD()
    return result
def feat_physchem(seq):
    """Return a dictionary of physicochemical features for ``seq``.

    Residue-class fractions come from module-level ``frac_*`` helpers, the
    isoelectric point from ``ProteinAnalysis``, and hydropathy / PPII
    propensity from a ``SequenceParameters`` object.
    """
    params = SequenceParameters(seq)

    # (dictionary key, helper applied directly to the sequence)
    fraction_helpers = (
        ('frac_acidic', frac_acidic),
        ('frac_basic', frac_basic),
        ('frac_aliphatic', frac_aliphatic),
        ('frac_chainexp', frac_chainexp),
        ('frac_polar', frac_polar),
        ('frac_aromatic', frac_aromatic),
        ('frac_disorder', frac_disorder),
    )
    result = {label: helper(seq) for label, helper in fraction_helpers}

    result['loglen'] = log2(len(seq))
    result['hydropathy'] = params.get_uversky_hydropathy()
    result['iso_point'] = ProteinAnalysis(seq).isoelectric_point()
    result['PPII_prop'] = params.get_PPII_propensity()
    return result
def test_get_reduced_alphabet_sequence_predefined_alphabets(self):
    """Check reduced-alphabet output for every predefined alphabet size.

    For each size, the reduced form of ``self.SP_60`` must use exactly that
    many distinct symbols, and the reduced form of each item yielded by
    ``testTools.generate_random_sequence(10)`` must keep the item's length.
    """
    alphabet_sizes = (2, 3, 4, 5, 6, 8, 10, 11, 12, 15, 18, 20)

    for size in alphabet_sizes:
        # SP_60 contains all 20 residues, so the reduced sequence should
        # contain exactly `size` distinct symbols.
        reduced_reference = self.SP_60.get_reduced_alphabet_sequence(size)[0]
        self.assertEqual(len(set(reduced_reference)), size)

        for candidate in testTools.generate_random_sequence(10):
            reduced = SequenceParameters(candidate).get_reduced_alphabet_sequence(size)[0]
            # Reduction must not change the sequence length.
            self.assertEqual(len(reduced), len(candidate))
def encode(cdr3):
    """Encode a CDR3 string as a string of group letters.

    'X' is replaced by 'V', the first character and last three characters are
    dropped, and the per-position composition (``blobLen=1``, module-level
    ``grps``) is mapped to letters: row 0 -> 'N', row 1 -> 'P', rows 2 and
    3 -> 'H'. Positions still equal to the string "0.0" become 'X'.
    """
    trimmed = cdr3.replace("X", "V")[1:-3]
    composition = SequenceParameters(str(trimmed)).get_linear_sequence_composition(blobLen=1, grps=grps)
    tracks = composition[1]

    # Seed with row 0; np.where turns the remaining numeric entries into strings.
    labels = np.where(tracks[0] == 1, "N", tracks[0])
    for row, letter in ((1, "P"), (2, "H"), (3, "H")):
        labels = np.where(tracks[row] == 1, letter, labels)

    # Anything no group claimed is still the stringified zero.
    labels = np.where(labels == "0.0", "X", labels)
    return "".join(labels)
def feat_complexity(seq):
    """Return complexity features: ``'wf_complexity'`` plus ``rep_fractions(seq)``.

    Keys returned by ``rep_fractions`` override ``'wf_complexity'`` on a clash.
    """
    # Per the original note, get_linear_complexity returns a 2xN matrix holding
    # the complexity vector and the matching residue positions; with the window
    # set to the full sequence length, [1][0] is taken as the single value.
    wf_complexity = SequenceParameters(seq).get_linear_complexity(blobLen=len(seq))[1][0]
    repeat_features = rep_fractions(seq)
    return {'wf_complexity': wf_complexity, **repeat_features}
def get_features_complexity(seq):
    """Return dictionary of all features associated with sequence complexity.

    One ``'repeat_<motif>'`` entry per motif, computed by ``fraction_regex``
    with a pattern matching two or more consecutive characters drawn from the
    motif's letters, plus ``'wf_complexity'``.
    """
    repeat_motifs = ('Q', 'N', 'S', 'G', 'E', 'D', 'K', 'R', 'P',
                     'QN', 'RG', 'FG', 'SG', 'SR', 'KAP', 'PTS')

    features = {
        'repeat_' + motif: fraction_regex(seq, f'[{motif}]' + '{2,}')
        for motif in repeat_motifs
    }

    # Per the original note, get_linear_complexity returns a 2xN matrix holding
    # the complexity vector and the matching residue positions; with the window
    # set to the full sequence length, [1][0] is taken as the single value.
    features['wf_complexity'] = SequenceParameters(seq).get_linear_complexity(blobLen=len(seq))[1][0]
    return features
def test_get_linear_WF_complexity(self):
    """Smoke-test ``get_linear_complexity('WF', ...)`` over several options.

    No values are asserted; the test passes as long as no call raises.
    """
    random_seqs = testTools.generate_random_sequence_list(10, minLen=15, maxLen=500)

    def run_on_all(**options):
        # Apply one option setting to every random sequence.
        for seq in random_seqs:
            SequenceParameters(seq).get_linear_complexity('WF', **options)

    # general test
    run_on_all(alphabetSize=5)

    # test all alphabets
    for size in (2, 3, 4, 5, 6, 8, 10, 11, 12, 15, 18, 20):
        run_on_all(alphabetSize=size)

    sweep = (1, 2, 3, 4, 5, 10)

    # test a range of window sizes (the sweep runs twice, as it always has)
    for _ in range(2):
        for window in sweep:
            run_on_all(blobLen=window)

    # test a range of step-sizes
    for step in sweep:
        run_on_all(stepSize=step)

    # test a range of word-sizes
    for word in sweep:
        run_on_all(wordSize=word)
def get_polyamp_regions(inputFasta,outputFile):
    """Write the mixed-charge regions of every FASTA record to a TSV file.

    Regions come from ``score_polyamp``. A region is written when it has both
    positive (R/K) and negative (D/E) residues, both charge ratios are at
    least ``charge_bal``, and its kappa is at most the module-level ``kappa``.
    ``window_size``, ``cut_off``, ``charge_bal`` and ``kappa`` are read from
    module scope.

    NOTE(review): in the flattened source the final ``writerow`` could sit
    either inside the kappa check or at region-loop level (which would also
    emit a placeholder row for rejected regions). This version assumes the
    former -- confirm against the original layout.
    """
    # look for mixed charge regions in each protein. Each entry is preceded by its accession
    header = ['accession','name','highly charged region','r/(r+k)','d/(d+e)','length','charge_density','kappa']
    # Residues rewritten before the kappa calculation: X -> S, S/U -> T.
    substitutions = {'X': 'S', 'S': 'T', 'U': 'T'}

    with open(inputFasta,'rU') as infile, open(outputFile,'wb') as outf:
        records = SeqIO.parse(infile,'fasta')
        writer = csv.writer(outf, delimiter='\t')
        writer.writerow(header)

        for record in records:
            acc = record.id.split('|')[1]
            name_field = record.description.split()[-3]
            name = name_field[name_field.find('=')+1:]

            regions = score_polyamp(str(record.seq),window_size,cut_off*window_size)
            if len(regions) == 0:
                continue

            for region in regions:
                r, k = region.count('R'), region.count('K')
                d, e = region.count('D'), region.count('E')
                pos = r + k
                neg = d + e
                balanced = pos>0 and neg>0 and pos/neg>=charge_bal and neg/pos>=charge_bal
                if not balanced:
                    continue

                length = len(region)
                r_ratio = r/pos
                d_ratio = d/neg
                region_noX = ''.join(substitutions.get(char, char) for char in region)
                kap = SequenceParameters(region_noX).get_kappa_X(grp1 = ['E','D'], grp2 = ['K','R'])
                if kap<=kappa:
                    writer.writerow([acc, name, region, str(r_ratio), str(d_ratio),
                                     length, (pos+neg)/length, str(kap)])
def get_polyamp_regions(inputFasta,outputFile):
    """Write the polyampholyte regions of every FASTA record to a TSV file.

    Regions come from ``score_polyamp``. A region is written when it has both
    positive (R/K) and negative (D/E) residues, both charge ratios are at
    least ``charge_bal``, and its kappa is at most the module-level ``kappa``.
    ``window_size``, ``cut_off``, ``charge_bal`` and ``kappa`` are read from
    module scope.

    NOTE(review): in the flattened source the final ``writerow`` could sit
    either inside the kappa check or at region-loop level (which would also
    emit a placeholder row for rejected regions). This version assumes the
    former -- confirm against the original layout.
    """
    # look for polyampholyte regions in each protein and write them to the
    # output file; each entry is preceded by its accession
    header = ['accession','name','highly charged region','r/(r+k)','d/(d+e)','length','charge_density','kappa']
    # Residues rewritten before the kappa calculation: X -> S, S/U -> T.
    substitutions = {'X': 'S', 'S': 'T', 'U': 'T'}

    with open(inputFasta,'rU') as infile, open(outputFile,'wb') as outf:
        records = SeqIO.parse(infile,'fasta')
        writer = csv.writer(outf, delimiter='\t')
        writer.writerow(header)

        for record in records:
            acc = record.id.split('|')[1]
            name_field = record.description.split()[-3]
            name = name_field[name_field.find('=')+1:]

            regions = score_polyamp(str(record.seq),window_size,cut_off*window_size)
            if len(regions) == 0:
                continue

            for region in regions:
                r, k = region.count('R'), region.count('K')
                d, e = region.count('D'), region.count('E')
                pos = r + k
                neg = d + e
                balanced = pos>0 and neg>0 and pos/neg>=charge_bal and neg/pos>=charge_bal
                if not balanced:
                    continue

                length = len(region)
                r_ratio = r/pos
                d_ratio = d/neg
                region_noX = ''.join(substitutions.get(char, char) for char in region)
                kap = SequenceParameters(region_noX).get_kappa_X(grp1 = ['E','D'], grp2 = ['K','R'])
                if kap<=kappa:
                    writer.writerow([acc, name, region, str(r_ratio), str(d_ratio),
                                     length, (pos+neg)/length, str(kap)])
def apply_attribute_kappa(proteome):
    """Add a ``'kappa'`` attribute to every protein in ``proteome``.

    The proteome is first validated with ``interface_tools.check_proteome``;
    kappa is then computed from each protein's sequence with localcider.
    """
    interface_tools.check_proteome(proteome, 'apply_attribute_kappa (apis.localcider)')

    for protein in proteome:
        # convert the protein sequence into its kappa value ...
        seq = protein.sequence
        kappa_value = SequenceParameters(seq).get_kappa()

        # ... and store it on the protein as the 'kappa' attribute
        protein.add_attribute('kappa', kappa_value)
def in_out_kappa(self):
    """Print kappa of the in-motif and out-of-motif parts of selected sequences.

    Reads the tab-separated file ``self.train_fpi``, keeps rows whose ``'y'``
    column equals 0, splits each ``'Sequence'`` with
    ``motif_seq.LcSeq(...).seq_in_motif()`` and prints the kappa of both
    parts followed by an empty line.
    """
    table = pd.read_csv(self.train_fpi, sep='\t', index_col=0)
    selected = table[table['y'] == 0]

    for seq in list(selected['Sequence']):
        in_seq, out_seq = motif_seq.LcSeq(seq, self.k, self.lca, 'lca').seq_in_motif()
        # in-motif part first, then the remainder
        for part in (in_seq, out_seq):
            print(SequenceParameters(part).get_kappa())
        print('')
def get_kappa(sequence):
    """Return the kappa value of an amino acid sequence.

    Per the original note, kappa ranges between 0 and 1: low kappa means an
    extended conformation, high kappa a compacted one.
    """
    # Build the localcider sequence object and read kappa straight from it.
    return SequenceParameters(sequence).get_kappa()
from localcider.sequenceParameters import SequenceParameters
from natsort import natsorted
import matplotlib.pyplot as plt

# Directory holding the FASTA files to be analyzed
path = "ecoli/input/"

# Natural-sort the file names so output rows follow a predictable order
filelist = natsorted(os.listdir(path))

# Load every FASTA file into a SequenceParameters object. All files are
# loaded before the output file is opened, so a bad input aborts the run
# without leaving a partial output file behind.
list_of_SeqObjs = []
for filename in filelist:
    list_of_SeqObjs.append(SequenceParameters(sequenceFile=path + filename))

# One output row per input file:
# name, fraction positive, fraction negative, hydropathy, mean net charge, kappa.
# The with-block guarantees the file is closed even if a calculation raises.
with open("cider_ecoli.dat", "w") as f:
    # filelist and list_of_SeqObjs are parallel, so pair them directly
    # instead of searching the object list for each row.
    for filename, obj in zip(filelist, list_of_SeqObjs):
        f_pos = obj.get_fraction_positive()
        f_neg = obj.get_fraction_negative()
        mhyd = obj.get_uversky_hydropathy()
        mnc = obj.get_mean_net_charge(pH=7)
        kappa = obj.get_kappa()
        f.write("%s %f %f %f %f %f\n" % (filename, f_pos, f_neg, mhyd, mnc, kappa))

#For plotting mean net charge vs mean hydropathy
print(file)

# Records are split by hand on '>' because SequenceParameters(sequenceFile=file)
# would append all the sequences in the file to each other.
with open(file) as fasta_handle:
    contents = fasta_handle.read().split('>')

# The with-block guarantees the results file is flushed and closed.
with open(file + "_charge", 'w+') as output:
    for protein in contents:
        if not protein:
            # text before the first '>' (or an empty chunk) is not a record
            continue

        # First line is the header; everything after it is sequence.
        header, _, body = protein.partition('\n')
        # Drop every line break / space instead of slicing off one trailing
        # character, which lost a residue when the file had no final newline.
        seq = ''.join(body.split())
        print(header)

        if 'X' in seq:
            print("Warning: unspecified protein encountered.")
            seq = seq.replace('X', '')

        seq_param = SequenceParameters(seq)
        # Scale the per-residue net charge (NCPR) by the length to get the
        # total net charge at pH 7.0.
        net_charge = seq_param.get_NCPR(pH=7.0) * seq_param.get_length()
        print(net_charge)

        # header, pH, molecular weight, net charge
        fields = [header, str(7.0), str(seq_param.get_molecular_weight()), str(net_charge)]
        output.write(", ".join(fields) + '\n')
class TestComplexityFunctions(unittest.TestCase):
    """Tests for the reduced-alphabet and linear-complexity methods of SequenceParameters."""

    # Alphabet sizes swept by the reduced-alphabet and complexity tests
    ALPHABET_SIZES = (2, 3, 4, 5, 6, 8, 10, 11, 12, 15, 18, 20)
    # Values swept for the window (blobLen), step and word sizes
    SWEEP_VALUES = (1, 2, 3, 4, 5, 10)

    def setUp(self):
        """Build the two fixed sequences shared by the tests."""
        self.SP_60 = SequenceParameters('QWERTYIPASDFGHKLCVNMQWERTYIPASDFGHKLCVNMQWERTYIPASDFGHKLCVNM')
        self.SP_10 = SequenceParameters('KDNIKHVPGG')

    @staticmethod
    def _within_tolerance(actual, expected, tolerance=0.00001):
        """Return True when every element of ``actual`` is within ``tolerance`` of ``expected``.

        The absolute difference is used so that values far *below* the
        expectation fail as well as values far above it.
        """
        difference = np.abs(np.asarray(actual) - np.asarray(expected))
        return bool((difference < tolerance).all())

    def _exercise_linear_complexity(self, complexity_type):
        """Call ``get_linear_complexity`` for one complexity type over a range of options.

        Nothing is asserted; the check is that no call raises.
        """
        random_seqs = testTools.generate_random_sequence_list(10, minLen=15, maxLen=500)

        # general test
        for seq in random_seqs:
            SequenceParameters(seq).get_linear_complexity(complexity_type, alphabetSize=5)

        # every alphabet, then a range of window, step and word sizes
        sweeps = (
            ('alphabetSize', self.ALPHABET_SIZES),
            ('blobLen', self.SWEEP_VALUES),
            ('stepSize', self.SWEEP_VALUES),
            ('wordSize', self.SWEEP_VALUES),
        )
        for keyword, values in sweeps:
            for value in values:
                for seq in random_seqs:
                    SequenceParameters(seq).get_linear_complexity(complexity_type, **{keyword: value})

    def test_get_reduced_alphabet_sequence_predefined_alphabets(self):
        """Reduced sequences use exactly ``i`` symbols and keep their length."""
        for i in self.ALPHABET_SIZES:
            # a sequence with all 20 amino acids must reduce to a sequence
            # containing exactly i distinct residues
            self.assertEqual(len(set(self.SP_60.get_reduced_alphabet_sequence(i)[0])), i)

            random_seqs = testTools.generate_random_sequence(10)
            for j in random_seqs:
                SP = SequenceParameters(j)
                # reduced alphabet sequence length must match the input
                self.assertEqual(len(SP.get_reduced_alphabet_sequence(i)[0]), len(j))

    def test_get_linear_WF_complexity(self):
        """Smoke-test the 'WF' complexity measure."""
        self._exercise_linear_complexity('WF')

    def test_get_linear_LC_complexity(self):
        """Smoke-test the 'LC' complexity measure."""
        self._exercise_linear_complexity('LC')

    def test_get_linear_LZW_complexity(self):
        """Smoke-test the 'LZW' complexity measure."""
        self._exercise_linear_complexity('LZW')

    def test_complexity_values(self):
        """Compare the complexity tracks of SP_60 with stored reference values."""
        out = {}

        # WF, alphabet size = 4
        out['WF'] = np.array([
            0.92321967, 0.92321967, 0.92321967, 0.88048202, 0.88048202,
            0.92321967, 0.92321967, 0.92321967, 0.94773092, 0.92321967,
            0.86096405, 0.86096405, 0.86096405, 0.68048202, 0.68048202,
            0.86096405, 0.92321967, 0.92321967, 0.96096405, 0.9854753,
            0.92321967, 0.92321967, 0.92321967, 0.88048202, 0.88048202,
            0.92321967, 0.92321967, 0.92321967, 0.94773092, 0.92321967,
            0.86096405, 0.86096405, 0.86096405, 0.68048202, 0.68048202,
            0.86096405, 0.92321967, 0.92321967, 0.96096405, 0.9854753,
            0.92321967, 0.92321967, 0.92321967, 0.88048202, 0.88048202,
            0.92321967, 0.92321967, 0.92321967, 0.94773092, 0.92321967,
            0.86096405])

        # LZW, alphabet size = 4
        out['LZW'] = np.array([
            0.9, 0.9, 0.9, 0.9, 0.9, 0.9, 0.9, 0.9, 0.8, 0.8,
            0.8, 0.8, 0.8, 0.8, 0.8, 0.9, 0.9, 0.9, 1.0, 1.0,
            0.9, 0.9, 0.9, 0.9, 0.9, 0.9, 0.9, 0.9, 0.8, 0.8,
            0.8, 0.8, 0.8, 0.8, 0.8, 0.9, 0.9, 0.9, 1.0, 1.0,
            0.9, 0.9, 0.9, 0.9, 0.9, 0.9, 0.9, 0.9, 0.8, 0.8,
            0.8])

        # LC, alphabet size = 2
        out['LC'] = np.array([
            0.625, 0.5, 0.5, 0.5, 0.5, 0.5, 0.625, 0.75, 0.625, 0.75,
            0.625, 0.75, 0.875, 0.75, 0.625, 0.625, 0.625, 0.625, 0.75, 0.75,
            0.625, 0.5, 0.5, 0.5, 0.5, 0.5, 0.625, 0.75, 0.625, 0.75,
            0.625, 0.75, 0.875, 0.75, 0.625, 0.625, 0.625, 0.625, 0.75, 0.75,
            0.625, 0.5, 0.5, 0.5, 0.5, 0.5, 0.625, 0.75, 0.625, 0.75,
            0.625])

        for complexity_type, alphabet_size in (('WF', 4), ('LZW', 4), ('LC', 2)):
            observed = self.SP_60.get_linear_complexity(complexity_type, alphabetSize=alphabet_size)[1]
            self.assertTrue(
                self._within_tolerance(observed, out[complexity_type]),
                msg='%s complexity differs from the reference values' % complexity_type)
def setUp(self):
    """Create the two fixed SequenceParameters objects used by the tests."""
    sixty_residue_sequence = 'QWERTYIPASDFGHKLCVNMQWERTYIPASDFGHKLCVNMQWERTYIPASDFGHKLCVNM'
    ten_residue_sequence = 'KDNIKHVPGG'

    self.SP_60 = SequenceParameters(sixty_residue_sequence)
    self.SP_10 = SequenceParameters(ten_residue_sequence)
class TestComplexityFunctions(unittest.TestCase):
    """Tests for the reduced-alphabet and linear-complexity methods of SequenceParameters."""

    # Alphabet sizes swept by the reduced-alphabet and complexity tests
    ALPHABET_SIZES = (2, 3, 4, 5, 6, 8, 10, 11, 12, 15, 18, 20)
    # Values swept for the window (blobLen), step and word sizes
    SWEEP_VALUES = (1, 2, 3, 4, 5, 10)

    def setUp(self):
        """Build the two fixed sequences shared by the tests."""
        self.SP_60 = SequenceParameters('QWERTYIPASDFGHKLCVNMQWERTYIPASDFGHKLCVNMQWERTYIPASDFGHKLCVNM')
        self.SP_10 = SequenceParameters('KDNIKHVPGG')

    @staticmethod
    def _within_tolerance(actual, expected, tolerance=0.00001):
        """Return True when every element of ``actual`` is within ``tolerance`` of ``expected``.

        The absolute difference is used so that values far *below* the
        expectation fail as well as values far above it.
        """
        difference = np.abs(np.asarray(actual) - np.asarray(expected))
        return bool((difference < tolerance).all())

    def _exercise_linear_complexity(self, complexity_type):
        """Call ``get_linear_complexity`` for one complexity type over a range of options.

        Nothing is asserted; the check is that no call raises.
        """
        random_seqs = testTools.generate_random_sequence_list(10, minLen=15, maxLen=500)

        # general test
        for seq in random_seqs:
            SequenceParameters(seq).get_linear_complexity(complexity_type, alphabetSize=5)

        # every alphabet, then a range of window, step and word sizes
        sweeps = (
            ('alphabetSize', self.ALPHABET_SIZES),
            ('blobLen', self.SWEEP_VALUES),
            ('stepSize', self.SWEEP_VALUES),
            ('wordSize', self.SWEEP_VALUES),
        )
        for keyword, values in sweeps:
            for value in values:
                for seq in random_seqs:
                    SequenceParameters(seq).get_linear_complexity(complexity_type, **{keyword: value})

    def test_get_reduced_alphabet_sequence_predefined_alphabets(self):
        """Reduced sequences use exactly ``i`` symbols and keep their length."""
        for i in self.ALPHABET_SIZES:
            # a sequence with all 20 amino acids must reduce to a sequence
            # containing exactly i distinct residues
            self.assertEqual(len(set(self.SP_60.get_reduced_alphabet_sequence(i)[0])), i)

            random_seqs = testTools.generate_random_sequence(10)
            for j in random_seqs:
                SP = SequenceParameters(j)
                # reduced alphabet sequence length must match the input
                self.assertEqual(len(SP.get_reduced_alphabet_sequence(i)[0]), len(j))

    def test_get_linear_WF_complexity(self):
        """Smoke-test the 'WF' complexity measure."""
        self._exercise_linear_complexity('WF')

    def test_get_linear_LC_complexity(self):
        """Smoke-test the 'LC' complexity measure."""
        self._exercise_linear_complexity('LC')

    def test_get_linear_LZW_complexity(self):
        """Smoke-test the 'LZW' complexity measure."""
        self._exercise_linear_complexity('LZW')

    def test_complexity_values(self):
        """Compare the complexity tracks of SP_60 with stored reference values."""
        out = {}

        # WF, alphabet size = 4
        out['WF'] = np.array([
            0.92321967, 0.92321967, 0.92321967, 0.88048202, 0.88048202,
            0.92321967, 0.92321967, 0.92321967, 0.94773092, 0.92321967,
            0.86096405, 0.86096405, 0.86096405, 0.68048202, 0.68048202,
            0.86096405, 0.92321967, 0.92321967, 0.96096405, 0.9854753,
            0.92321967, 0.92321967, 0.92321967, 0.88048202, 0.88048202,
            0.92321967, 0.92321967, 0.92321967, 0.94773092, 0.92321967,
            0.86096405, 0.86096405, 0.86096405, 0.68048202, 0.68048202,
            0.86096405, 0.92321967, 0.92321967, 0.96096405, 0.9854753,
            0.92321967, 0.92321967, 0.92321967, 0.88048202, 0.88048202,
            0.92321967, 0.92321967, 0.92321967, 0.94773092, 0.92321967,
            0.86096405])

        # LZW, alphabet size = 4
        out['LZW'] = np.array([
            0.9, 0.9, 0.9, 0.9, 0.9, 0.9, 0.9, 0.9, 0.8, 0.8,
            0.8, 0.8, 0.8, 0.8, 0.8, 0.9, 0.9, 0.9, 1.0, 1.0,
            0.9, 0.9, 0.9, 0.9, 0.9, 0.9, 0.9, 0.9, 0.8, 0.8,
            0.8, 0.8, 0.8, 0.8, 0.8, 0.9, 0.9, 0.9, 1.0, 1.0,
            0.9, 0.9, 0.9, 0.9, 0.9, 0.9, 0.9, 0.9, 0.8, 0.8,
            0.8])

        # LC, alphabet size = 2
        out['LC'] = np.array([
            0.625, 0.5, 0.5, 0.5, 0.5, 0.5, 0.625, 0.75, 0.625, 0.75,
            0.625, 0.75, 0.875, 0.75, 0.625, 0.625, 0.625, 0.625, 0.75, 0.75,
            0.625, 0.5, 0.5, 0.5, 0.5, 0.5, 0.625, 0.75, 0.625, 0.75,
            0.625, 0.75, 0.875, 0.75, 0.625, 0.625, 0.625, 0.625, 0.75, 0.75,
            0.625, 0.5, 0.5, 0.5, 0.5, 0.5, 0.625, 0.75, 0.625, 0.75,
            0.625])

        # assertTrue replaces the deprecated assertEquals alias, which was
        # removed from unittest in Python 3.12.
        for complexity_type, alphabet_size in (('WF', 4), ('LZW', 4), ('LC', 2)):
            observed = self.SP_60.get_linear_complexity(complexity_type, alphabetSize=alphabet_size)[1]
            self.assertTrue(
                self._within_tolerance(observed, out[complexity_type]),
                msg='%s complexity differs from the reference values' % complexity_type)
def _score_sequences(sequences, getter_name):
    """Return one score per sequence by calling ``getter_name`` on a SequenceParameters object.

    Any sequence that cannot be scored yields 0.
    """
    scores = []
    for sequence in sequences:
        try:
            value = getattr(SequenceParameters(str(sequence)), getter_name)()
        except Exception:
            # Best-effort fallback kept from the original, narrowed from a bare
            # ``except`` so KeyboardInterrupt / SystemExit still propagate.
            value = 0
        scores.append(value)
    return scores


def aa_analysis(df, property):
    """Score both sides of each ``'ref/alt'`` entry in ``df['Amino_acids']``.

    Args:
        df: DataFrame with an ``'Amino_acids'`` column holding
            ``'<AA1>/<AA2>'`` strings; rows where it is null are dropped.
        property: ``"ncpr"`` (uses ``get_NCPR``) or ``"uversky_hydropathy"``
            (uses ``get_uversky_hydropathy``).

    Returns:
        A new DataFrame with columns ``AA1_Iso``, ``AA2_Iso`` and
        ``AA_Iso_Delta`` (``AA2_Iso - AA1_Iso``). Despite the ``_Iso`` suffix
        the columns hold the selected property, not an isoelectric point.
        For any other ``property`` the input ``df`` is returned unchanged.
    """
    getter_by_property = {
        "ncpr": "get_NCPR",
        "uversky_hydropathy": "get_uversky_hydropathy",
    }
    getter_name = getter_by_property.get(property)
    if getter_name is None:
        return df

    # Copy the filtered rows so the column assignments below never write
    # into (or warn about) a slice of the caller's frame.
    df = df[pd.notnull(df['Amino_acids'])].copy()
    df[["AA1", "AA2"]] = df['Amino_acids'].str.split('/', expand=True)

    df["AA1_Iso"] = _score_sequences(df["AA1"], getter_name)
    df["AA2_Iso"] = _score_sequences(df["AA2"], getter_name)
    df["AA_Iso_Delta"] = df["AA2_Iso"] - df["AA1_Iso"]
    return df[["AA1_Iso", "AA2_Iso", "AA_Iso_Delta"]]
#!/usr/bin/env python
import sys
import os
import json
import numpy as np
from pandas import *

# import the relevant code
import localcider
from localcider.sequenceParameters import SequenceParameters

# create an empty list
list_of_SeqObjs = []

with open("FUS_mammals.seq") as f:
    # one ortholog sequence per line
    for line in f:
        # drop the trailing newline and skip blank lines so only real
        # sequence text reaches SequenceParameters
        seq = line.strip()
        if not seq:
            continue
        try:
            list_of_SeqObjs.append(SequenceParameters(seq))
        except localcider.SequenceFileParserException:
            # if we encounter a file parsing error just skip that sequence
            continue

# print kappa for each sequence that was loaded
for obj in list_of_SeqObjs:
    # print() with one argument behaves the same on Python 2 and Python 3
    print(obj.get_kappa())