def MotifEnumeration(Dna, k, d):
    """Return the k-mers that occur in every string of ``Dna`` with <= d mismatches.

    Candidates are the k-mers produced by
    ``km.getKmersAndNeighborhood(Dna[0], k, d)`` (presumably the k-mers of the
    first string together with their d-neighbours -- confirm against ``km``).
    Because they come from the first string's neighbourhood they are taken to
    match that string; each one is then checked against every remaining string.

    Args:
        Dna: Sequence of DNA strings.
        k: Length of the motifs to look for.
        d: Maximum Hamming distance allowed between a motif and its occurrence.

    Returns:
        The qualifying motifs, sorted and joined with single spaces. The result
        is an empty string when ``Dna`` is empty or no motif qualifies.
    """
    if not Dna:
        return ''

    # Every neighbourhood k-mer is a candidate. The former pairwise pre-filter
    # compared each k-mer against kmers[1:], which kept everything except
    # (sometimes) the first element and returned nothing at all when there was
    # only one candidate.
    candidates = set(km.getKmersAndNeighborhood(Dna[0], k, d))

    # Split the remaining strings once instead of once per candidate.
    other_kmers = [list(km.getKmers(text, k)) for text in Dna[1:]]

    Patterns = {
        candidate
        for candidate in candidates
        # any() stops at the first close-enough k-mer, all() at the first
        # string that has none.
        if all(
            any(hd.hammingDistance(candidate, kmer) <= d for kmer in kmers)
            for kmers in other_kmers
        )
    }
    return ' '.join(sorted(Patterns))
def MostProbableProfile(sequence, size, matrix):
    """Return the k-mer of ``sequence`` with the highest profile score.

    Each k-mer returned by ``km.getKmers(sequence, size)`` is scored with
    ``ProfileMatrixCalc(kmer, matrix)``. On ties the k-mer that first appeared
    wins.

    Args:
        sequence: Text to split into k-mers.
        size: Length of the k-mers.
        matrix: Profile passed through to ``ProfileMatrixCalc``.

    Returns:
        The best-scoring k-mer.

    Raises:
        ValueError: If ``km.getKmers`` yields no k-mers.
    """
    scores = {
        kmer: ProfileMatrixCalc(kmer, matrix)
        for kmer in km.getKmers(sequence, size)
    }
    # Iterating the dict follows insertion order, so the earliest k-mer with
    # the maximal score is the one returned.
    return max(scores, key=scores.get)
def approximatePatternMatch(pattern, sequence, k):
    """Count the windows of ``sequence`` within ``k`` mismatches of ``pattern``.

    The windows are the k-mers of length ``len(pattern)`` returned by
    ``getKmers``; a window counts when its Hamming distance to ``pattern`` is
    at most ``int(k)``.

    Args:
        pattern: Pattern to look for.
        sequence: Text to scan.
        k: Maximum number of mismatches; anything ``int()`` accepts.

    Returns:
        The number of matching windows.
    """
    windows = getKmers(sequence, len(pattern))
    return sum(
        1
        for window in windows
        if hammingDistance(window, pattern) <= int(k)
    )
def mostFrequentKmerLimitHamming(text, kmer_size, mismatch):
    """Tally the matches reported for every k-mer of ``text``.

    For each k-mer returned by ``getKmers(text, kmer_size)`` the items yielded
    by ``kmersFrequencyMatches(text, kmer, mismatch)`` are counted.

    Args:
        text: Text to analyse.
        kmer_size: Length of the k-mers.
        mismatch: Mismatch limit forwarded to ``kmersFrequencyMatches``.

    Returns:
        A dict mapping each reported match to the number of times it was seen.
    """
    counts = {}
    for kmer in getKmers(text, kmer_size):
        for match in kmersFrequencyMatches(text, kmer, mismatch):
            # The first sighting counts as 1; it used to be stored as 0,
            # leaving every total one short.
            counts[match] = counts.get(match, 0) + 1
    return counts
def extract(parameters):
    """Search for k-mer subsets reaching the objective score and save them.

    The training data is loaded and turned into a samples matrix restricted by
    a variance threshold. Populations are then generated, scored, selected,
    crossed over and mutated by the ``algorithm`` helpers, for up to
    ``parameters["n_iterations"]`` iterations of ``n_attempts`` attempts each.
    Progress is printed, and the solutions collected are written with
    ``kmers.saveExtractedKmers``.

    Args:
        parameters: Mapping read with the keys "training_fasta", "k",
            "variance_threshold", "n_genes", "n_iterations", "n_chromosomes",
            "objective_score", "crossover_rate", "mutation_rate",
            "n_solutions" and "k_mers_path".

    Returns:
        None. The results are saved to ``parameters["k_mers_path"]``.
    """
    # Table of solutions
    solutions = []
    # Number of attempts at the first iteration
    n_attempts = 1
    # Flag recording whether the objective score has been reached
    objective = False
    # Population retained from one attempt to the next
    temporaryPopulation = []
    # Load the training data
    D = data.loadData(parameters["training_fasta"])
    # Get the k-mers existing in the sequences
    K = kmers.getKmers(parameters["k"], D)
    # Generate the samples matrix (X) and the target values (y)
    X, y = matrix.generateSamplesTargets(D, K , parameters["k"])
    # Variance threshold preprocessing (returns the reduced X and K)
    X, K = algorithm.varianceThreshold(X, K, parameters["variance_threshold"])
    # Get the number of features (size of X along axis 1)
    n_features = numpy.size(X, 1)
    # Initialize the number of genes
    n_genes = parameters["n_genes"]
    # Initialize gene indexes
    genes = algorithm.generateGenes(n_features)
    # Initialize the weights
    weights = algorithm.initialWeights(genes)
    # Iterate through the number of iterations
    for n in range(parameters["n_iterations"]):
        # Reset the best scores seen during this iteration
        max_global_weighted_score = 0
        max_global_unweighted_score = 0
        # Iterate through the number of attempts
        for attempt in range(n_attempts):
            print("Iteration: " + str(n + 1) + " | Attempt(s):", str(attempt + 1) + " / " + str(n_attempts))
            # Generate the initial population
            if n == 0:
                population = algorithm.generateInitialPopulation(parameters["n_chromosomes"], genes, n_genes, weights)
            # Generate the next population
            else:
                population = algorithm.generateNextPopulation(parameters["n_chromosomes"], genes, n_genes, weights)
                # NOTE(review): the merge is assumed to belong to this branch
                # (nothing has been retained yet on the first iteration) -- confirm.
                population = algorithm.mergePopulation(population, temporaryPopulation)
            # Evaluate the population
            scores = algorithm.fitnessCalculation(X, y, population)
            # Update the maximum scores
            max_global_weighted_score, max_global_unweighted_score = algorithm.getScores(scores, max_global_weighted_score, max_global_unweighted_score)
            # Check if there are some solutions
            solutions = algorithm.checkSolutions(solutions, population, scores,
                                                 parameters["objective_score"])
            # Check if the goal is reached (once True the flag is never re-evaluated)
            if objective == False:
                objective = algorithm.checkObjective(parameters["objective_score"], scores)
            # Display the progress of the search
            print("Number of genes :", n_genes, "\n")
            # On the last attempt without success, grow the number of genes
            if objective == False and attempt + 1 == n_attempts:
                n_genes = n_genes + 1
            # Select the part of the next generation
            selection = algorithm.selection(scores, population)
            # Update weights
            weights = algorithm.updateWeights(weights, selection, n_features)
            # Apply crossovers
            selection = algorithm.crossover(selection, parameters["crossover_rate"])
            # Apply mutation
            selection = algorithm.mutation(selection, parameters["mutation_rate"], genes, n_genes, objective, n_attempts, attempt)
            # Empty, in place, the list currently held as the retained population
            temporaryPopulation.clear()
            # Rebind the retained population to the new selection
            temporaryPopulation = selection
            # If the objective is not reached, update the number of attempts
            # (takes effect on the next iteration; the current range is already fixed)
            if attempt + 1 == n_attempts and objective == False:
                n_attempts = algorithm.compute_n_attempts(parameters["objective_score"], max_global_weighted_score, max_global_unweighted_score)
            # If the objective is reached, reset the number of attempts to 1
            elif attempt + 1 == n_attempts and objective == True:
                n_attempts = 1
        # If the number of solutions is reached, stop the algorithm
        # NOTE(review): assumed to sit at the iteration-loop level so that it
        # ends the whole search rather than only the current attempts -- confirm.
        if parameters["n_solutions"] <= len(solutions): break
    # Save the identified solutions
    print("Identified solutions (" + str(len(solutions)) + ") saved at : " + parameters["k_mers_path"])
    kmers.saveExtractedKmers(K = K, solutions = solutions, path = parameters["k_mers_path"])