def gappy_pair_data(sequences, maximum_distance, **args):
    """Generate a dataset object that contains all pairs of letters in a
    sequence that are within a certain distance of each other.

    :Parameters:
      - `sequences` - a list of sequences from which to construct the gappy
        pair representation or a Fasta file that contains the sequences
      - `maximum_distance` - the maximum distance between pairs of letters

    :Keywords:
      - `normalize` - whether to normalize the dataset [default: True]
      - `prefix` - a string to be added to the name of each feature
        (useful when combining spectrum features from several sources)
      - `skip` - a list of characters that should be skipped in computing
        the features
    """
    # Idiomatic keyword handling: dict.get replaces the repeated
    # "if key in args" boilerplate of the original.
    prefix = args.get('prefix', '')
    normalize = args.get('normalize', True)
    skip = args.get('skip', [])
    # A string argument is interpreted as the name of a Fasta file.
    # isinstance replaces the type(x) == type('') anti-pattern.
    if isinstance(sequences, str):
        sequences = fasta_read(sequences)
    data = SparseDataSet(
        generate_gappy_pairs(sequences, maximum_distance, prefix, skip))
    if normalize:
        # L2 (Euclidean) normalization of each example's feature vector.
        data.normalize(2)
    return data
def gappy_pair_data(sequences, maximum_distance, **args):
    """Build a SparseDataSet whose features are gappy pairs: all pairs of
    letters in a sequence that are separated by at most `maximum_distance`
    positions.

    :Parameters:
      - `sequences` - a list of sequences, or the name of a Fasta file
        containing them
      - `maximum_distance` - maximum separation between the letters of a pair

    :Keywords:
      - `normalize` - L2-normalize the dataset [default: True]
      - `prefix` - string prepended to every feature name (useful when
        combining features from several sources)
      - `skip` - characters to ignore when computing the features
    """
    prefix = ''
    normalize = True
    skip = []
    if 'prefix' in args:
        prefix = args['prefix']
    if 'normalize' in args:
        normalize = args['normalize']
    if 'skip' in args:
        skip = args['skip']
    # A string is taken to be a Fasta file name.
    if type(sequences) == str:
        sequences = fasta_read(sequences)
    pair_features = generate_gappy_pairs(
        sequences, maximum_distance, prefix, skip)
    data = SparseDataSet(pair_features)
    if normalize:
        data.normalize(2)
    return data
def spectrum_data(sequences, k1, k2=None, **args):
    """Generate a dataset object that represents the spectrum of a sequence,
    i.e. its kernel function is the spectrum kernel.

    Reference: C. Leslie, E. Eskin, and W.S. Noble. The spectrum kernel:
    A string kernel for SVM protein classification.

    :Parameters:
      - `sequences` - either a name of a fasta file that contains the
        sequences or a list of sequences
      - `k1` - the length of the substrings to consider
      - `k2` - if k2 is provided then strings whose length is between k1 and
        k2 are used in constructing the spectrum.

    :Keywords:
      - `normalize` - whether to normalize the dataset [default: True]
      - `prefix` - a string to be added to the name of each feature
        (useful when combining spectrum features from several sources)
      - `skip` - a list of characters that should be skipped in computing
        the spectrum [default: []].  Whenever a character in this list is
        encountered, the substring is not included in the spectrum
      - `mismatch` - whether to allow a single mismatch [default: False]
    """
    prefix = args.get('prefix', '')
    normalize = args.get('normalize', True)
    skip = args.get('skip', [])
    mismatch = args.get('mismatch', False)
    if k2 is None:
        # NOTE(review): this default makes the loop below also include
        # substrings of length k1 + 1, not just k1 -- confirm this is the
        # intended default before relying on it.
        k2 = k1 + 1
    # Choose the generator once; both produce (feature -> count) mappings.
    if mismatch:
        spectrum_generator = generate_single_mismatch_spectrum
    else:
        spectrum_generator = generate_spectrum
    # A string argument is interpreted as a Fasta file name.
    if isinstance(sequences, str):
        sequences = fasta_read(sequences)
    data = SparseDataSet(spectrum_generator(sequences, k1, prefix, skip))
    if normalize:
        data.normalize(2)
    # Each substring length is normalized separately, then merged, so every
    # k contributes a unit-norm slice of the combined feature vector.
    for k in range(k1 + 1, k2 + 1):
        data2 = SparseDataSet(spectrum_generator(sequences, k, prefix, skip))
        if normalize:
            data2.normalize(2)
        data.addFeatures(data2)
    return data
def spectrum_data(sequences, k1, k2=None, **args):
    """Construct a SparseDataSet implementing the spectrum kernel: features
    are counts of substrings of length k1 (up to k2 when given).

    Reference: C. Leslie, E. Eskin, and W.S. Noble. The spectrum kernel:
    A string kernel for SVM protein classification.

    :Parameters:
      - `sequences` - a fasta file name, or a list of sequences
      - `k1` - length of the substrings to consider
      - `k2` - when given, substring lengths k1..k2 are all used

    :Keywords:
      - `normalize` - L2-normalize each per-length slice [default: True]
      - `prefix` - string prepended to every feature name
      - `skip` - characters whose presence excludes a substring [default: []]
      - `mismatch` - allow a single mismatch [default: False]
    """
    prefix = args['prefix'] if 'prefix' in args else ''
    normalize = args['normalize'] if 'normalize' in args else True
    skip = args['skip'] if 'skip' in args else []
    mismatch = args['mismatch'] if 'mismatch' in args else False
    if k2 is None:
        k2 = k1 + 1
    if mismatch:
        generator = generate_single_mismatch_spectrum
    else:
        generator = generate_spectrum
    if type(sequences) == str:
        sequences = fasta_read(sequences)
    # Build each per-length spectrum, normalize it on its own, and merge
    # everything into the dataset created on the first iteration.
    data = None
    for length in range(k1, k2 + 1):
        part = SparseDataSet(generator(sequences, length, prefix, skip))
        if normalize:
            part.normalize(2)
        if data is None:
            data = part
        else:
            data.addFeatures(part)
    return data
def positional_kmer_data(sequences, k1, k2=None, **args):
    """Generate a dataset object that represents kmers that occur in specific
    positions.  When using weighting, this is essentially the 'weighted
    degree' kernel of Sonnenburg et al.

    :Parameters:
      - `sequences` - the name of a fasta file that contains the sequences
        or a list of sequences
      - `k1` - the smallest length kmer to consider
      - `k2` - if k2 is provided then strings whose length is between k1 and
        k2 are used in constructing the kernel

    :Keywords:
      - `normalize` - whether to normalize the dataset [default: True]
      - `prefix` - a string to be added to the name of each feature
        (useful when combining spectrum features from several sources)
      - `skip` - a list of characters that should be skipped in computing
        the features [default: []]
      - `weighted` - whether to use the weighting of Sonnenburg et al
        [default: equal weights]
      - `shift` - whether to consider a shift [default: False]
      - `shift_start` - the position in the seq to start shifting [default: 0]
      - `shift_end` - the position in the sequence to stop shifting
        [default: end of sequence]
    """
    prefix = args.get('prefix', '')
    normalize = args.get('normalize', True)
    skip = args.get('skip', [])
    shift = args.get('shift', False)
    shift_start = args.get('shift_start', 0)
    if k2 is None:
        k2 = k1 + 1
    # A string argument is interpreted as a Fasta file name; conversion must
    # happen before the shift_end default, which needs a sequence length.
    if isinstance(sequences, str):
        sequences = fasta_read(sequences)
    # Default shift range spans the first sequence; assumes all sequences
    # share the same length -- TODO confirm against callers.
    shift_end = args.get('shift_end', len(sequences[0]) - 1)
    # BUG FIX: the original wrote `if weighted in args`, testing membership
    # of the boolean False among keyword names (always False), so the
    # 'weighted' keyword was silently ignored.
    weighted = args.get('weighted', False)
    # NOTE(review): both branches produce equal weights -- the Sonnenburg
    # weighting formula was never filled in.  Preserved as-is to avoid a
    # behavior change; the branch is kept so the formula can be added later.
    if weighted:
        weights = [1.0 for _ in range(k1, k2 + 1)]
    else:
        weights = [1.0 for _ in range(k1, k2 + 1)]
    data = SparseDataSet(generate_positional_kmers(
        sequences, k1, prefix, skip, weights[0],
        shift, shift_start, shift_end))
    if normalize:
        data.normalize(2)
    # Each kmer length is normalized separately, then merged.
    for k in range(k1 + 1, k2 + 1):
        data2 = SparseDataSet(generate_positional_kmers(
            sequences, k, prefix, skip, weights[k - k1],
            shift, shift_start, shift_end))
        if normalize:
            data2.normalize(2)
        data.addFeatures(data2)
    return data
def positional_kmer_data(sequences, k1, k2=None, **args):
    """Generate a dataset object that represents kmers that occur in specific
    positions.  When using weighting, this is essentially the 'weighted
    degree' kernel of Sonnenburg et al.

    :Parameters:
      - `sequences` - the name of a fasta file that contains the sequences
        or a list of sequences
      - `k1` - the smallest length kmer to consider
      - `k2` - if k2 is provided then strings whose length is between k1 and
        k2 are used in constructing the kernel

    :Keywords:
      - `normalize` - whether to normalize the dataset [default: True]
      - `prefix` - a string to be added to the name of each feature
      - `skip` - characters that are skipped when computing the features
      - `weighted` - whether to use the weighting of Sonnenburg et al
        [default: equal weights]
      - `shift` - whether to consider a shift [default: False]
      - `shift_start` - position in the sequence to start shifting [default: 0]
      - `shift_end` - position in the sequence to stop shifting
        [default: end of sequence]
    """
    prefix = args.get('prefix', '')
    normalize = args.get('normalize', True)
    skip = args.get('skip', [])
    shift = args.get('shift', False)
    shift_start = args.get('shift_start', 0)
    if k2 is None:
        k2 = k1 + 1
    # Convert a file name to a list of sequences before computing the
    # shift_end default, which depends on the first sequence's length.
    if isinstance(sequences, str):
        sequences = fasta_read(sequences)
    # Assumes all sequences are the same length as the first -- TODO confirm.
    shift_end = args.get('shift_end', len(sequences[0]) - 1)
    # BUG FIX: original read `if weighted in args:` -- membership test of the
    # boolean False against the keyword names -- so weighted was never set
    # from the caller's keyword.
    weighted = args.get('weighted', False)
    # NOTE(review): the two branches are currently identical (uniform 1.0
    # weights); the Sonnenburg weighting was evidently never implemented.
    # Behavior preserved.
    if weighted:
        weights = [1.0] * (k2 - k1 + 1)
    else:
        weights = [1.0] * (k2 - k1 + 1)
    data = SparseDataSet(generate_positional_kmers(
        sequences, k1, prefix, skip, weights[0],
        shift, shift_start, shift_end))
    if normalize:
        data.normalize(2)
    for k in range(k1 + 1, k2 + 1):
        extra = SparseDataSet(generate_positional_kmers(
            sequences, k, prefix, skip, weights[k - k1],
            shift, shift_start, shift_end))
        if normalize:
            extra.normalize(2)
        data.addFeatures(extra)
    return data