Пример #1
0
def fit(parameters):
    """Train and save one linear SVM model per k-mers file.

    For every entry found in the ``k_mers_path`` directory, the k-mers are
    loaded, a samples matrix is built from the training sequences, a linear
    ``SVC`` (with ``probability=True``) is fitted and the fitted model is
    dumped with joblib as ``<stem>.pkl`` in the ``model_path`` directory,
    where ``<stem>`` is the k-mers file name up to its first ``"."``.

    Args:
        parameters: Mapping providing the keys ``"training_fasta"``,
            ``"k_mers_path"`` (directory of k-mers files), ``"k"`` and
            ``"model_path"`` (output directory).
    """
    # Loop-invariant lookups are done once, outside the loop
    k_mers_dir = parameters["k_mers_path"]
    model_dir = parameters["model_path"]
    k = parameters["k"]

    # Load the training data
    D_train = data.loadData(parameters["training_fasta"])

    # Iterate through the k-mers files
    for fasta in os.listdir(k_mers_dir):
        # Get the k-mers of the current file
        K = kmers.loadKmers(os.path.join(k_mers_dir, fasta))
        # Generate the samples matrix (X_train) and the target values (y_train)
        X_train, y_train = matrix.generateSamplesTargets(D_train, K, k)
        # Instantiate a linear svm classifier
        clf = SVC(kernel='linear',
                  C=1,
                  probability=True,
                  random_state=0,
                  cache_size=1000)
        # Fit the classifier
        clf.fit(X_train, y_train)
        # Keep the part of the file name before the first "."; partition()
        # returns the whole name when there is no "." (index() would raise)
        file_name = fasta.partition(".")[0]
        model_file = file_name + ".pkl"
        # Save the model
        joblib.dump(clf, os.path.join(model_dir, model_file))
        # Information message
        print(f"Model: {model_file} saved at: {model_dir}")
Пример #2
0
def predict(parameters):
    """Predict the class of each testing sequence and save the result as CSV.

    The k-mers and the classifier are loaded from disk, the testing
    sequences are converted to a samples matrix and the predictions are
    written to ``prediction_path`` with the header ``id,y_pred``.

    Args:
        parameters: Mapping providing the keys ``"model_path"``,
            ``"k_mers_path"``, ``"testing_fasta"``, ``"prediction_path"``
            and ``"evaluation_mode"``. The evaluation is only performed when
            ``str(parameters["evaluation_mode"])`` equals ``"True"``.
    """
    # Get the path of the model file
    model_path = str(parameters["model_path"])
    # Get the path of the k-mers file
    k_mers_path = str(parameters["k_mers_path"])
    # Get the testing fasta file
    file_path = str(parameters["testing_fasta"])
    # Get the prediction file path
    prediction_path = str(parameters["prediction_path"])
    # Get the evaluation mode
    evaluation_mode = str(parameters["evaluation_mode"])

    # Load the testing data
    D = data.loadData(file_path)
    # Get the set of k-mers
    K = kmers.loadKmers(k_mers_path)
    # The k-mers length is taken from the first key
    k = len(list(K.keys())[0])
    # Generate the samples matrix (X) and the target values (y)
    X, y = matrix.generateSamplesTargets(D, K, k)
    # Load the classifier
    clf = joblib.load(model_path)
    # Predict the sequences
    y_pred = clf.predict(X)

    if evaluation_mode == "True":
        if len(y) == 0:
            print("Evaluation cannot be performed because target values are not given")
        else:
            print("Classification report \n", classification_report(y, y_pred))

    # The context manager closes the file even if a write fails
    with open(prediction_path, "w") as f:
        # Write the header
        f.write("id,y_pred\n")
        # A distinct loop name avoids shadowing the target values ``y``;
        # the f-string also accepts labels that are not plain strings
        for i, label in enumerate(y_pred):
            f.write(f"{D[i][0]},{label}\n")

    # Displays a confirmation message
    print("Predictions saved at the path:", prediction_path)
Пример #3
0
def fit(parameters):
    """Fit a classifier on the training sequences and dump it to disk.

    Args:
        parameters: Mapping providing the keys ``"model_path"`` (output
            file of the model), ``"k_mers_path"`` (k-mers file) and
            ``"training_fasta"`` (training sequences).
    """
    # Read the three paths, converted to strings, in a single pass
    model_file, k_mers_file, training_file = (
        str(parameters[key])
        for key in ("model_path", "k_mers_path", "training_fasta")
    )

    # Training sequences first, then the k-mers used as features
    sequences = data.loadData(training_file)
    k_mers = kmers.loadKmers(k_mers_file)

    # The length of the first k-mer gives the value of k
    first_k_mer = list(k_mers.keys())[0]
    k_length = len(first_k_mer)

    # Samples matrix and target values
    samples, targets = matrix.generateSamplesTargets(sequences, k_mers, k_length)

    # Build and train the classifier
    classifier = svm()
    classifier.fit(samples, targets)

    # Persist the trained model and confirm
    joblib.dump(classifier, model_file)
    print("Model saved at the path:", model_file)
0
def predict(parameters):
    """Predict classes by summing the probabilities of several sub-models.

    Each k-mers file of the ``k_mers_path`` directory is paired with a model
    file of the ``model_path`` directory. For each pair the membership
    probabilities of the testing sequences are computed and summed; the
    predicted class of a sequence is the one with the highest summed score.
    The predictions are written to ``<prediction_path>/prediction.csv`` with
    the header ``id,y_pred``.

    Args:
        parameters: Mapping providing the keys ``"testing_fasta"``,
            ``"k_mers_path"``, ``"model_path"``, ``"k"``,
            ``"evaluation_mode"`` (the evaluation runs only when it equals
            the string ``"True"``) and ``"prediction_path"`` (directory).
    """
    k_mers_dir = parameters["k_mers_path"]
    model_dir = parameters["model_path"]

    # Classes of the first loaded model (None until a model is loaded)
    classes = None
    # Summed membership probabilities (None until a model is evaluated)
    probabilities = None
    # Target values; stays empty when no k-mers/model pair is found
    y_test = []

    # Load the testing data
    D_test = data.loadData(parameters["testing_fasta"])

    # os.listdir() returns entries in arbitrary order, so both listings are
    # sorted to pair each k-mers file with its model deterministically.
    # NOTE(review): assumes k-mers files and models share the same stems so
    # that both sorted lists line up - confirm against the training step.
    k_mers_files = sorted(os.listdir(k_mers_dir))
    model_files = sorted(os.listdir(model_dir))

    # Compute the membership probabilities for each model
    for fasta, model in zip(k_mers_files, model_files):
        # Load the current model (once)
        clf = joblib.load(os.path.join(model_dir, model))
        # NOTE(review): the class order of the first model is reused for
        # every sub-model - confirm all models were fitted on the same labels
        if classes is None:
            classes = clf.classes_
        # Get the current k-mers
        K = kmers.loadKmers(os.path.join(k_mers_dir, fasta))
        # Generate the samples matrix (X_test) and the target values (y_test)
        X_test, y_test = matrix.generateSamplesTargets(D_test, K,
                                                       parameters["k"])
        scores = clf.predict_proba(X_test)
        if probabilities is None:
            # Membership probabilities of the initial sub-model
            probabilities = scores
        else:
            # Sum the membership probabilities of the additional sub-models
            probabilities += scores

    # For each sequence keep the class with the highest summed score
    # (argmax returns the first index in case of ties)
    if probabilities is None:
        y_pred = []
    else:
        y_pred = [classes[numpy.argmax(p)] for p in probabilities]

    # Evaluation is only performed on explicit request
    if parameters["evaluation_mode"] == "True":
        if len(y_test) == 0:
            print(
                "Evaluation cannot be performed because target values are not given"
            )
        else:
            print("Classification report \n",
                  classification_report(y_test, y_pred))

    # The context manager closes the file even if a write fails
    output_file = os.path.join(parameters["prediction_path"], "prediction.csv")
    with open(output_file, "w") as f:
        # Write the header
        f.write("id,y_pred\n")
        # Save each prediction next to the identifier of its sequence
        for i, label in enumerate(y_pred):
            f.write(f"{D_test[i][0]},{label}\n")

    # Displays a confirmation message
    print("Predictions saved at the path:", parameters["prediction_path"])
Пример #5
0
def extract(parameters):
	"""Search for k-mers subsets reaching an objective score and save them.

	The training sequences are converted to a samples matrix, filtered by
	``algorithm.varianceThreshold``, and then an iterative population-based
	search is run through the helpers of ``algorithm`` (population
	generation, fitness calculation, selection, crossover, mutation).
	The identified solutions are saved with ``kmers.saveExtractedKmers``.

	Args:
		parameters: Mapping providing the keys ``"training_fasta"``,
			``"k"``, ``"variance_threshold"``, ``"n_genes"``,
			``"n_iterations"``, ``"n_chromosomes"``, ``"objective_score"``,
			``"crossover_rate"``, ``"mutation_rate"``, ``"n_solutions"``
			and ``"k_mers_path"``.
	"""
	# Table of solutions
	solutions = []
	# Number of attempts at the first iteration
	n_attempts = 1
	# Flag set once algorithm.checkObjective reports the objective as reached
	objective = False
	# Population retained at each iteration
	temporaryPopulation = []	
	# Load the training data
	D = data.loadData(parameters["training_fasta"])
	# Get the k-mers existing in the sequences
	K = kmers.getKmers(parameters["k"], D)
	# Generate the samples matrix (X) and the target values (y)
	X, y = matrix.generateSamplesTargets(D, K , parameters["k"])
	# Variance threshold preprocessing (returns the updated X and K)
	X, K = algorithm.varianceThreshold(X, K, parameters["variance_threshold"])
	# Get the number of features (size of X along axis 1)
	n_features = numpy.size(X, 1)
	# Initialize the number of genes
	n_genes = parameters["n_genes"]
	# Initialize gene indexes
	genes = algorithm.generateGenes(n_features)
	# Initialize the weights
	weights = algorithm.initialWeights(genes)

	# Iterate through the number of iterations
	for n in range(parameters["n_iterations"]):
		# Reset the global scores at the start of each iteration
		max_global_weighted_score = 0
		max_global_unweighted_score = 0
		# Iterate through the number of attempts; range(n_attempts) is
		# evaluated once here, so updates to n_attempts made inside the loop
		# only take effect at the next iteration
		for attempt in range(n_attempts):
			print("Iteration: " + str(n + 1) + " | Attempt(s):", str(attempt + 1) + " / " + str(n_attempts))
			# Generate the initial population
			if n == 0: 
				population = algorithm.generateInitialPopulation(parameters["n_chromosomes"], genes, n_genes, weights)
			# Generate the next population and merge it with the retained one
			else:
				population = algorithm.generateNextPopulation(parameters["n_chromosomes"], genes, n_genes, weights)
				population = algorithm.mergePopulation(population, temporaryPopulation)
			# Evaluate the population
			scores = algorithm.fitnessCalculation(X, y, population)
			# Update the maximum scores
			max_global_weighted_score, max_global_unweighted_score = algorithm.getScores(scores, max_global_weighted_score, max_global_unweighted_score)
			# Check if there are some solutions
			solutions = algorithm.checkSolutions(solutions, population, scores, parameters["objective_score"])
			# Check if the goal is reached (only evaluated while it has not been reached yet)
			if objective ==  False: objective = algorithm.checkObjective(parameters["objective_score"], scores)
			# Display the progress of the search
			print("Number of genes :", n_genes, "\n")
			# On the last attempt, add one gene if the objective is still not
			# reached; this happens before the mutation call below, which
			# receives the updated n_genes
			if objective == False and attempt + 1 == n_attempts: n_genes = n_genes + 1
			# Select the part of the next generation
			selection = algorithm.selection(scores, population)
			# Update weights
			weights = algorithm.updateWeights(weights, selection, n_features)
			# Apply crossovers
			selection = algorithm.crossover(selection, parameters["crossover_rate"])
			# Apply mutation
			selection = algorithm.mutation(selection, parameters["mutation_rate"], genes, n_genes, objective, n_attempts, attempt)
			# Empty, in place, the list currently bound to temporaryPopulation
			# NOTE(review): if selection aliased that same list object it
			# would be emptied as well - confirm the helpers return new lists
			temporaryPopulation.clear()
			# Rebind the temporary population to the selection
			temporaryPopulation = selection
			# If the objective is not reached, update the number of attempts
			if attempt + 1 == n_attempts and objective == False: n_attempts = algorithm.compute_n_attempts(parameters["objective_score"], max_global_weighted_score, max_global_unweighted_score)
			# If the objective is reached, reset the number of attempts to 1
			elif attempt + 1 == n_attempts and objective == True: n_attempts = 1
		# If the requested number of solutions is reached, stop the algorithm
		if parameters["n_solutions"] <= len(solutions): break
	# Save the identified solutions
	print("Identified solutions (" + str(len(solutions)) + ") saved at : " + parameters["k_mers_path"])
	kmers.saveExtractedKmers(K = K, solutions = solutions, path = parameters["k_mers_path"])