def Run_Uverskey(Fasta1, Fasta2, OutFile):
    """Plot mean hydropathy against net absolute charge for two FASTA sets.

    Both files are read with ``load_fasta_file`` and converted with one
    two-feature ``FeatureSet``:

    * ``mean_hydropathy`` -- hydropathy mapping followed by ``average``
    * ``net_abs_charge``  -- charge mapping followed by ``average_absolute``

    Both mappings are built with ``default=0``. Sequences from ``Fasta1``
    are drawn with the legend label "Amyload", those from ``Fasta2`` with
    the label "Disprot". A fixed black line from (-0.78, 0.0) to
    (0.835, 0.5) is added and the figure is saved to ``OutFile``.
    """
    first_set = load_fasta_file(Fasta1)
    second_set = load_fasta_file(Fasta2)

    charge_feature = Feature(get_aa2charge(default=0)).then(average_absolute)
    hydropathy_feature = Feature(get_aa2hydropathy(default=0)).then(average)

    # Hydropathy is registered first, charge second.
    feature_set = FeatureSet("uversky")
    feature_set.add(hydropathy_feature, name="mean_hydropathy")
    feature_set.add(charge_feature, name="net_abs_charge")

    first_converted = feature_set(first_set)
    second_converted = feature_set(second_set)

    # First population: fetch each coordinate by feature name.
    first_x = first_converted.columns(feature="mean_hydropathy")[0]
    first_y = first_converted.columns(feature="net_abs_charge")[0]
    plt.plot(first_x, first_y, '.', label="Amyload")

    # Second population: read both coordinates from the compacted set.
    second_columns = compact(second_converted).columns()
    second_x = second_columns[0]
    second_y = second_columns[1]
    plt.plot(second_x, second_y, '.', label="Disprot")

    # Fixed reference line drawn in black.
    plt.plot([-0.78, 0.835], [0.0, 0.5], 'k')

    plt.xlabel("mean hydrophobicity")
    plt.ylabel("net abs charge")
    plt.legend()
    plt.savefig(OutFile)
def get_df_from_file(fname):
    """
    Loads data from a file to a dataframe.

    The identifiers are set as the index ('names') of the dataframe.
    The sequence entries are stored in columns a1, a2,... and so on,
    one column per sequence position.

    The number of columns equals the length of the longest sequence, so
    files with sequences of different lengths are accepted (shorter rows
    are left with missing values in the trailing columns). A file with
    no sequences yields an empty dataframe.

    :param fname: path passed to ``load_fasta_file``.
    :return: ``pandas.DataFrame`` indexed by sequence identifier.
    """
    # Collect the sequence data and the matching identifiers
    names, dataset = [], []
    for seq in load_fasta_file(fname):
        dataset.append(seq.data)
        names.append(seq.identifier)

    # One header per position of the longest sequence; for equal-length
    # input this is the same width the 2-D shape of the data would give.
    width = max(len(row) for row in dataset) if dataset else 0
    headers = ['a' + str(i + 1) for i in range(width)]

    # Generate the dataframe and index it by identifier
    df = pd.DataFrame(dataset, columns=headers)
    df['names'] = names
    df = df.set_index('names')
    return df
def run(Fasta1, Fasta2, windows_per_frame, overlap_factor, xlabel, ylabel,
        pop1_label, pop2_label, htmlOutDir, htmlFname, Workdirpath):
    """Run a 2-D local Fisher analysis on two FASTA sets and emit a report.

    Steps, in order:

    1. Create ``htmlOutDir`` if it does not exist yet.
    2. Load ``Fasta1`` and ``Fasta2``.
    3. Convert both sets with a feature set holding the volume mapping and
       the hydropathy mapping, each followed by ``average``.
    4. Call ``local_fisher_2d`` with ``windows_per_frame`` and
       ``overlap_factor`` converted to ``int``.
    5. Plot the result to ``out.png`` in the current working directory.
    6. Call ``HTML_Gen`` with ``Workdirpath/htmlOutDir/htmlFname``.
    """
    if not os.path.exists(htmlOutDir):
        os.makedirs(htmlOutDir)

    positive_set = load_fasta_file(Fasta1)
    negative_set = load_fasta_file(Fasta2)

    # Quantitative features: volume first, hydropathy second.
    volume_feature = Feature(get_aa2volume()).then(average)
    hydropathy_feature = Feature(get_aa2hydropathy()).then(average)

    feature_set = FeatureSet("volume'n'hydropathy")
    for feature in (volume_feature, hydropathy_feature):
        feature_set.add(feature)

    positive_converted = feature_set(positive_set)
    negative_converted = feature_set(negative_set)

    # Local Fisher test over the 2-D feature space.
    fisher_result = local_fisher_2d(
        positive_converted,
        negative_converted,
        windows_per_frame=int(windows_per_frame),
        overlap_factor=int(overlap_factor),
    )

    # The image always goes to the current working directory.
    # NOTE(review): the HTML page is generated under Workdirpath/htmlOutDir;
    # confirm the page expects the image at this location.
    image_path = os.path.join(os.getcwd(), "out.png")
    _plot_local_fisher_2d(
        fisher_result,
        xlabel=xlabel,
        ylabel=ylabel,
        pop1_label=pop1_label,
        pop2_label=pop2_label,
        out_file_path=image_path,
    )

    html_path = os.path.join(Workdirpath, htmlOutDir, htmlFname)
    HTML_Gen(html_path)
def Run_ngrams(fasta1, fasta2, OutFile ):
    """Fit Zipf's law to 3-gram counts of a FASTA set and plot the fit.

    The sequences in ``fasta2`` are passed to ``zipf_law_fit`` with ``n=3``
    and ``verbose=True``. The returned n-gram counts, sorted in descending
    order, are plotted against their rank together with the fitted curve
    ``rank ** -slope`` scaled to the same total count. Both axes are
    logarithmic. The figure is saved to ``OutFile``.

    :param fasta1: not used for the plot. Earlier versions computed pattern
        features from this file and discarded the results; the parameter is
        kept so existing callers continue to work.
    :param fasta2: FASTA file whose 3-gram distribution is fitted.
    :param OutFile: path handed to ``plt.savefig``.
    """
    amyload_pos_seq = load_fasta_file(fasta2)

    result_fit = zipf_law_fit(amyload_pos_seq, n=3, verbose=True)

    counts = sorted(result_fit["ngram_counts"], reverse=True)
    ranks = range(1, len(counts) + 1)
    slope = result_fit["slope"]

    # Normalising constant of the fitted distribution over the observed ranks.
    harmonic_num = sum(rank ** -slope for rank in ranks)
    # Loop-invariant: summed once instead of once per rank.
    total_count = sum(counts)
    fitted_counts = [(rank ** -slope) / harmonic_num * total_count
                     for rank in ranks]

    plt.plot(ranks, counts, 'k', label="empirical")
    plt.plot(ranks, fitted_counts, 'k--',
             label="Zipf's law\nslope: {:.2f}".format((slope)))
    plt.xlabel('rank')
    plt.ylabel('count')
    plt.xscale('log')
    plt.yscale('log')
    plt.legend()
    plt.savefig(OutFile)
def encoded_seq_from_file(fname, dirname, particle, index):
    """
    Function to encode the sequences from a file using AAindex.
    The encoded sequences are then padded to maximum length.

    :param fname: name of the fasta file inside ``dirname``.
    :param dirname: directory holding the file (joined with '/').
    :param particle: 'virus' or 'mouse'; selects how sequence identifiers
        are reformatted before they are used as dictionary keys.
    :param index: passed to ``_get_feature_map`` to build the encoder.
    :return: dict mapping reformatted identifier -> padded encoded data.
    :raises ValueError: if ``particle`` is not 'virus' or 'mouse', or if
        the file yields no sequences.
    """
    # Reject an unknown particle up front: otherwise nothing would be
    # encoded and the failure would surface later as an opaque ValueError
    # from max() on an empty list.
    if particle not in ('virus', 'mouse'):
        raise ValueError(
            "particle must be 'virus' or 'mouse', got %r" % (particle,))

    # Load the fasta file
    path = dirname + '/' + fname
    f = load_fasta_file(path)
    feat_map = _get_feature_map(index)

    # Create a dictionary with keys as identifiers
    # and their values as the data.
    enc = {}
    if particle == 'virus':
        for seq in f:
            seq_id = _change_format_virus(seq.identifier)
            enc[seq_id] = feat_map(seq).data
    else:
        for seq in f:
            seq_id = _change_format_mouse(seq.identifier)
            # Report identifiers missing from the known mouse id set;
            # they are still encoded.
            if seq_id not in ids_set_mouse:
                # Single-argument form prints the same text on
                # Python 2 and Python 3.
                print("%s %s" % (seq.identifier, seq_id))
            enc[seq_id] = feat_map(seq).data

    if not enc:
        raise ValueError("no sequences found in %s" % path)

    # Pad all sequences to maximum value in the
    # dataset.
    maxlen = max(len(val) for val in enc.values())
    enc = _pad_encoding(enc, maxlen)

    # Check if all values have lengths
    # equal to maxlen.
    for val in enc.values():
        assert len(val) == maxlen
    return enc
import os
import sys
sys.path.insert(0, os.path.abspath('..'))

from quantiprot.utils.io import load_fasta_file
from quantiprot.utils.feature import Feature, FeatureSet
from quantiprot.metrics.aaindex import get_aaindex_file
from quantiprot.metrics.basic import average

# Load data:
seq = load_fasta_file("data/Alphasyn.fasta")

# Build a feature: average polarity (Graham, 1974), AAindex entry: GRAR740102:
feat = Feature(get_aaindex_file("GRAR740102")).then(average)

# Add the feature to new feature set:
fs = FeatureSet("my set")
fs.add(feat)

# Process sequences:
res_seq = fs(seq)

# Export average polarities
res = res_seq.columns()
# Parenthesised single-argument print: same output on Python 2,
# and valid syntax on Python 3.
print(res)
sys.path.insert(0, os.path.abspath('..'))

from quantiprot.utils.io import load_fasta_file
from quantiprot.utils.feature import Feature, FeatureSet
from quantiprot.metrics.aaindex import get_aa2hydropathy
from quantiprot.metrics.basic import identity

# Ngram-related imports
from quantiprot.metrics.ngram import pattern_match, pattern_count
from quantiprot.analysis.ngram import ngram_count
from quantiprot.analysis.ngram import zipf_law_fit

from matplotlib import pyplot as plt

# Load some data
alphasyn_seq = load_fasta_file("data/Alphasyn.fasta")
amyload_pos_seq = load_fasta_file("data/Amyload_positive.fasta")

# Find and count matches to a pattern 'VT'
fs_aa = FeatureSet("aa patterns")
fs_aa.add(identity)
fs_aa.add(pattern_match, pattern='VT', padded=True)
fs_aa.add(pattern_count, pattern='VT')

result_seq = fs_aa(alphasyn_seq)

# Show the first three converted sequences. The parenthesised
# single-argument print gives the same output on Python 2 and is valid
# syntax on Python 3.
for seq in result_seq[:3]:
    print(seq)

# ...and something much more subtle:
# Map a sequence to the hydropathy scale, and search for the pattern 0.0 - 2.0
# feature2 if args.quantify2 in ['rec', 'det', 'pal', 'ratio_det', 'ratio_pal']: feat2 = feat2.then(quantify_method[args.quantify2], metric=args.metric2, radius=float(args.radius2), dim=int(args.dim2), tau=int(args.tau2), det_len=int(args.diaglen2), pal_len=int(args.diaglen2)) else: feat2 = feat2.then(quantify_method[args.quantify2]) # Add the features to a FeatureSet fs = FeatureSet("fs") fs.add(feat1) fs.add(feat2) # Convert and plot input1 sequences in the 2d space input_seq1 = load_fasta_file(args.input1) conv_seq1 = fs(input_seq1) conv_data1_x = conv_seq1.columns(feature=feat1.name)[0] conv_data1_y = conv_seq1.columns(feature=feat2.name)[0] plt.plot(conv_data1_x, conv_data1_y, '.', label="input1") # Convert and plot input1 sequences in the 2d space if args.input2 is not None: input_seq2 = load_fasta_file(args.input2) conv_seq2 = fs(input_seq2) conv_data2_x = conv_seq2.columns(feature=feat1.name)[0] conv_data2_y = conv_seq2.columns(feature=feat2.name)[0] plt.plot(conv_data2_x, conv_data2_y, '.', label="input2") # Show legend and labels plt.xlabel(feat1.name)
def getLabelIndex(fromFile):
    """Return the sorted integer labels that follow the first space.

    The input is echoed to stdout. Everything up to and including the
    first space is discarded; the remainder is split on single spaces,
    its last token is dropped, and the remaining tokens are converted to
    ``int`` and returned in ascending order.

    Raises ``IndexError`` when the input contains no space and
    ``ValueError`` when a kept token is not an integer.
    """
    print(fromFile)
    # Keep only what comes after the first space (the leading field is dropped).
    after_first_field = fromFile.split(' ', 1)[1:]
    tokens = after_first_field[0].split(' ')
    # The final token is never a label, so it is left out.
    labels = [int(token) for token in tokens[:-1]]
    labels.sort()
    return labels


# Load the 'xxxxx.fasta' sequence set (the same file is read twice)
alphasyn_seq = load_fasta_file("./Benchmark/benchmark3.fasta")
alphasyn_seq1 = load_fasta_file("./Benchmark/benchmark3.fasta")

# Collect the length and identifier of every sequence
fastaLength = []
fastaID = []
count = 0
for seq in alphasyn_seq1:
    fastaID.append(seq.identifier)
    fastaLength.append(len(seq.data))

# Total number of residues over all sequences
for leng in fastaLength:
    count = count + leng

print(fastaLength)
help='num. of classes (default: 3)') group_simplify.add_argument('-t', '--iterations', default=0, help='num. of iterations for kmeans (default: 0)') group_ngrams = parser.add_argument_group('N-grams') group_ngrams.add_argument('-n', '--n', default='1', help='n-gram size (default: 1)') group_ngrams.add_argument('-m', '--metric', default='identity', choices=['identity', 'taxi', 'euclid', 'sup', 'inf'], help='metric for matching n-grams (default: identity)') group_ngrams.add_argument('-r', '--radius', default=0.0, help='similarity radius (default: 0.0)') args = parser.parse_args() # Load the 'input' sequence set input_seq = load_fasta_file(args.input, unique=False) # Retrieve AAindex mapping for the 'property' if args.property is not None: try: aa_mapping = get_aaindex_file(args.property) except ValueError: aa_mapping = get_aaindex_www(args.property) # Simplify if and as requested if args.simplify is not None: aa_mapping = simplify(aa_mapping, aa_mapping.__name__+"/"+args.classes, method=args.simplify, k=int(args.classes), iters=int(args.iterations)) # Assign 'default' value for the Mapping
import os
import sys
sys.path.insert(0, os.path.abspath('..'))

from quantiprot.utils.io import load_fasta_file
from quantiprot.utils.sequence import SequenceSet
from quantiprot.utils.sequence import merge

# NOTE: every print below uses the parenthesised single-argument form,
# which gives the same output on Python 2 and is valid syntax on Python 3.

# Load protein sequences from 'data/Amyload_positive.fasta':
amyload_pos_seq = load_fasta_file("data/Amyload_positive.fasta")

# Display the sequence set, then its first three sequences:
print(amyload_pos_seq)
for seq in amyload_pos_seq[:3]:
    print(seq)

# Find a sequence 'AMY438|7-13|Sup35' in 'amyload_pos_seq':
my_seq_index = amyload_pos_seq.ids().index("AMY438|7-13|Sup35")
my_seq = amyload_pos_seq[my_seq_index]
print(my_seq)

# And copy the sequence to a new sequence set:
my_seq_set = SequenceSet("my seq set")
my_seq_set.add(my_seq)
print(my_seq_set)

# Try again to add the same sequence to 'my_seq_set' with 'unique' = True:
my_seq_set.add(my_seq)
print(my_seq_set)
from quantiprot.metrics.basic import average
from quantiprot.metrics.ngram import pattern_match, pattern_count
from quantiprot.utils.sequence import compact, subset
from quantiprot.metrics.ngram import NgramFeatureSet
from quantiprot.metrics.alphabet import PROTEIN
from Bio import SeqIO

# Record the length of every record in "sequence_2.fasta" (read with Biopython)
length_seqs = []
for record in SeqIO.parse("sequence_2.fasta", "fasta"):
    length_seqs.append(len(record))
    #print((record))

# Load the same file again, this time as a quantiprot sequence set
seq = load_fasta_file("sequence_2.fasta")

# Collect the identifiers, then keep only the text between the first '['
# and the first ']' of each identifier.
# NOTE(review): if an identifier has no '[' or ']', str.find returns -1 and
# the slice silently yields a different substring -- confirm all identifiers
# carry a bracketed part.
SequenceIds = []
SequenceIds2_list = []
for i in SequenceSet.ids(seq):
    SequenceIds.append(i)
for i in SequenceIds:
    SequenceIds2 = i[i.find("[") + 1:i.find("]")]
    SequenceIds2_list.append(SequenceIds2)

# Gather important protein features: each one maps a sequence through an
# AAindex entry (named by its accession) and then applies `average`.
polarity = Feature(get_aaindex_file("GRAR740102")).then(average)
hydropathy = Feature(get_aaindex_file("KYTJ820101")).then(average)
iso_point = Feature(get_aaindex_file("ZIMJ680104")).then(average)
pk_COOH = Feature(get_aaindex_file("JOND750102")).then(average)
entropy_form = Feature(get_aaindex_file("HUTJ700103")).then(average)
melting_point = Feature(get_aaindex_file("FASG760102")).then(average)
###quantiprot analysis, it will write out tables with AMK properties for segregating sites # in order to see what are the physiochemical differences between the haplotypes from quantiprot.utils.io import load_fasta_file from quantiprot.utils.sequence import SequenceSet from quantiprot.utils.sequence import subset, columns from quantiprot.utils.feature import Feature, FeatureSet # Conversions-related imports: from quantiprot.utils.mapping import simplify from quantiprot.metrics.aaindex import get_aa2charge, get_aa2hydropathy, get_aa2volume, get_aa2mj from quantiprot.metrics.aaindex import get_aaindex_file from quantiprot.metrics.basic import identity import numpy as np fapr = load_fasta_file('prot_segregating.fasta') #load fasta fs = FeatureSet("myTLRset") fs.add(get_aa2charge()) fs.add(get_aa2volume()) fs.add(get_aa2mj()) fs.add(get_aa2hydropathy()) convfapr = fs(fapr) metrics = ["formal_charge", "volume", "miyazawa-jernigan", "hydropathy"] # which metrics of AMK to generate #print convfapr for m in metrics: outf = open(m + ".tsv", "w") with outf as f: h = np.matrix(columns(convfapr, feature=m, transpose=True))
import os
import sys
sys.path.insert(0, os.path.abspath('..'))

from quantiprot.utils.io import load_fasta_file
from quantiprot.utils.feature import FeatureSet
from quantiprot.metrics.aaindex import get_aa2mj
from quantiprot.metrics.rqa import RQAFeatureSet
from quantiprot.metrics.basic import average

from matplotlib import pyplot as plt

# Load the HET-E1 sequence with WD40 repeats:
hete1_seq = load_fasta_file("data/HETE1_PODAS.fasta")

# Prepare FeatureSet for conversion from aa to Miyazawa-Jernigan hydrophobicity:
mj_fs = FeatureSet("mj")
mj_fs.add(get_aa2mj())

# Prepare specialized FeatureSet with basic RQA parameters calculated
# over 100aa window, then smoothed over the 10aa window:
rqa_fs = RQAFeatureSet("rqa", features=['recurrence', 'determinism'],
                       window=100, metric='taxi', radius=4, dim=4, det_len=8)
rqa_fs.then_all(average, window=10)

# Parenthesised single-argument print: same output on Python 2,
# and valid syntax on Python 3.
print(rqa_fs)
import os
import sys
sys.path.insert(0, os.path.abspath('..'))

# Uversky plot
from quantiprot.utils.io import load_fasta_file
from quantiprot.utils.feature import Feature, FeatureSet
from quantiprot.utils.sequence import compact
from quantiprot.metrics.aaindex import get_aa2charge, get_aa2hydropathy
from quantiprot.metrics.basic import average, average_absolute

from matplotlib import pyplot as plt

# Load the two sequence sets to compare
amyload_seq = load_fasta_file("data/Amyload_positive.fasta")
disprot_seq = load_fasta_file("data/Disprot.fasta")

# Non-standard letters in Disprot assigned neutral charge and hydropathy:
net_abs_charge = Feature(get_aa2charge(default=0)).then(average_absolute)
mean_hydropathy = Feature(get_aa2hydropathy(default=0)).then(average)

# Register both features under explicit names so they can be
# selected by name below
uversky_fs = FeatureSet("uversky")
uversky_fs.add(mean_hydropathy, name="mean_hydropathy")
uversky_fs.add(net_abs_charge, name="net_abs_charge")

# Convert both sequence sets with the same feature set
amyload_uversky_seq = uversky_fs(amyload_seq)
disprot_uversky_seq = uversky_fs(disprot_seq)

# First approach to get hydrophobicity/charge pairs
# (select each coordinate by feature name)
amyload_data_x = amyload_uversky_seq.columns(feature="mean_hydropathy")[0]
amyload_data_y = amyload_uversky_seq.columns(feature="net_abs_charge")[0]
plt.plot(amyload_data_x, amyload_data_y, '.', label="Amyload")
default=1, help='rqa: embedding dimension (default: 1)') group_rqa.add_argument('-u', '--tau', default=0, help='rqa: embedding delay tau (default: 0)') group_rqa.add_argument( '-l', '--diaglen', default=2, help='rqa: minimal diagonal length for det/pal (default: 2)') args = parser.parse_args() # Load the 'input' sequence set input_seq = load_fasta_file(args.input) # Retrieve AAindex mapping for the 'property' if args.property is not None: try: aa_mapping = get_aaindex_file(args.property) except ValueError: aa_mapping = get_aaindex_www(args.property) # Assign 'default' value for the Mapping try: aa_mapping.default = float(args.default) except (TypeError, ValueError): aa_mapping.default = args.default # Simplify if and as requested