def get_phyche_index(k, phyche_list, alphabet):
    """Collect the requested physicochemical values for every k-mer.

    :param k: the k-mer length, forwarded to make_kmer_list and get_phyche_factor_dic.
    :param phyche_list: the index names whose values should be kept.
    :param alphabet: the alphabet, forwarded to make_kmer_list and get_phyche_factor_dic.
    :return: a dict mapping each k-mer to the list of kept values; every list
             is empty when phyche_list is empty.
    """
    # Nothing was requested: answer with empty lists and skip the factor lookup.
    if len(phyche_list) == 0:
        return dict((kmer, []) for kmer in make_kmer_list(k, alphabet))

    factor_table = get_phyche_factor_dic(k, alphabet)
    selected = {}
    for kmer in make_kmer_list(k, alphabet):
        kept = selected.setdefault(kmer, [])
        # Item 0 of an entry is matched against phyche_list, item 1 is the value kept.
        kept.extend(entry[1] for entry in factor_table[kmer] if entry[0] in phyche_list)
    return selected
def make_pseknc_vector(sequence_list, phyche_value, k=2, w=0.05, lamada=1, alphabet=index_list.DNA, theta_type=1):
    """Generate the pseknc vector.

    Each returned row holds the normalized k-mer frequencies of one sequence
    followed by its weighted theta factors; every component is divided by
    ``1 + w * sum(theta_list)`` and rounded to 8 decimals.

    :param sequence_list: the sequences to encode.
    :param phyche_value: the physicochemical values handed to the theta helpers.
    :param k: the k-mer length used for the frequency part.
    :param w: the weight applied to the theta factors.
    :param lamada: forwarded to the theta helpers; lamada + k is also the
                   minimum accepted sequence length.
    :param alphabet: the alphabet the k-mer list is built from.
    :param theta_type: 1 uses get_parallel_factor, 2 uses get_series_factor,
                       3 uses get_parallel_factor with k fixed to 2.
    :return: a list with one feature vector (a list of floats) per sequence.
    :raises ValueError: if theta_type is not 1, 2 or 3.
    :raises SystemExit: with status 1 when a sequence is shorter than lamada + k.
    """
    kmer = make_kmer_list(k, alphabet)
    min_length = lamada + k
    vector = []

    for sequence in sequence_list:
        if len(sequence) < k or min_length > len(sequence):
            error_info = "Sorry, the sequence length must be larger than " + str(min_length)
            sys.stderr.write(error_info)
            # Rejecting the input is a failure, so report a non-zero exit status.
            sys.exit(1)

        # Get the nucleotide frequency in the DNA sequence.
        fre_list = [frequency(sequence, str(key)) for key in kmer]
        fre_sum = float(sum(fre_list))

        # Get the normalized occurrence frequency of nucleotide in the DNA sequence.
        # NOTE: this division raises ZeroDivisionError when the counts sum to zero.
        fre_list = [e / fre_sum for e in fre_list]

        # Get the theta_list.
        if 1 == theta_type:
            theta_list = get_parallel_factor(k, lamada, sequence, phyche_value, alphabet)
        elif 2 == theta_type:
            theta_list = get_series_factor(k, lamada, sequence, phyche_value, alphabet)
        elif 3 == theta_type:
            theta_list = get_parallel_factor(k=2, lamada=lamada, sequence=sequence,
                                             phyche_value=phyche_value, alphabet=alphabet)
        else:
            # Without this branch theta_list would be unbound below.
            raise ValueError("theta_type must be 1, 2 or 3, got " + repr(theta_type))
        theta_sum = sum(theta_list)

        # Generate the vector according the Equation 9.
        denominator = 1 + w * theta_sum

        temp_vec = [round(f / denominator, 8) for f in fre_list]
        temp_vec.extend(round(w * theta / denominator, 8) for theta in theta_list)

        vector.append(temp_vec)

    return vector
def gen_vector(tng_list, n):
    """Build one occurrence-frequency vector per top-n-gram entry.

    :param tng_list: the generated top-n-gram list.
    :param n: the n most frequent amino acids in the amino acid frequency profiles;
              used as the k-mer length when enumerating the grams over const.PROTEIN.
    :return: a list of vectors; each component is count(gram) / len(entry),
             rounded to 4 decimals, in the order given by make_kmer_list.
    """
    all_grams = make_kmer_list(n, const.PROTEIN)

    result = []
    for entry in tng_list:
        total = len(entry)
        # Multiplying by 1.0 keeps the division floating-point.
        result.append([round((entry.count(gram) * 1.0) / total, 4) for gram in all_grams])
    return result
def make_pseknc_vector(sequence_list, phyche_value, k=2, w=0.05, lamada=1, alphabet=index_list.DNA, theta_type=1):
    """Generate the pseknc vector.

    Each returned row holds the normalized k-mer frequencies of one sequence
    followed by its weighted theta factors; every component is divided by
    ``1 + w * sum(theta_list)`` and rounded to 8 decimals.

    :param sequence_list: the sequences to encode.
    :param phyche_value: the physicochemical values handed to the theta helpers.
    :param k: the k-mer length used for the frequency part.
    :param w: the weight applied to the theta factors.
    :param lamada: forwarded to the theta helpers; lamada + k is also the
                   minimum accepted sequence length.
    :param alphabet: the alphabet the k-mer list is built from.
    :param theta_type: 1 uses get_parallel_factor, 2 uses get_series_factor,
                       3 uses get_parallel_factor with k fixed to 2.
    :return: a list with one feature vector (a list of floats) per sequence.
    :raises ValueError: if theta_type is not 1, 2 or 3.
    :raises SystemExit: with status 1 when a sequence is shorter than lamada + k.
    """
    kmer = make_kmer_list(k, alphabet)
    min_length = lamada + k
    vector = []

    for sequence in sequence_list:
        if len(sequence) < k or min_length > len(sequence):
            error_info = "Sorry, the sequence length must be larger than " + str(min_length)
            sys.stderr.write(error_info)
            # Rejecting the input is a failure, so report a non-zero exit status.
            sys.exit(1)

        # Get the nucleotide frequency in the DNA sequence.
        fre_list = [frequency(sequence, str(key)) for key in kmer]
        fre_sum = float(sum(fre_list))

        # Get the normalized occurrence frequency of nucleotide in the DNA sequence.
        # NOTE: this division raises ZeroDivisionError when the counts sum to zero.
        fre_list = [e / fre_sum for e in fre_list]

        # Get the theta_list.
        if 1 == theta_type:
            theta_list = get_parallel_factor(k, lamada, sequence, phyche_value, alphabet)
        elif 2 == theta_type:
            theta_list = get_series_factor(k, lamada, sequence, phyche_value, alphabet)
        elif 3 == theta_type:
            theta_list = get_parallel_factor(k=2, lamada=lamada, sequence=sequence,
                                             phyche_value=phyche_value, alphabet=alphabet)
        else:
            # Without this branch theta_list would be unbound below.
            raise ValueError("theta_type must be 1, 2 or 3, got " + repr(theta_type))
        theta_sum = sum(theta_list)

        # Generate the vector according the Equation 9.
        denominator = 1 + w * theta_sum

        temp_vec = [round(f / denominator, 8) for f in fre_list]
        temp_vec.extend(round(w * theta / denominator, 8) for theta in theta_list)

        vector.append(temp_vec)

    return vector