def get_changed_background_sig_idx(exposures, background_sigs):
    """Map background-signature indices onto the zero-stripped exposure vector.

    ``exposures`` is indexed by signature. Callers drop every zero entry and
    then need to know where each background signature ended up in the
    compacted vector.

    Positions are computed directly from the indices instead of by looking
    exposure values up, so two signatures that happen to carry the same
    exposure cannot be confused with each other. ``background_sigs`` is left
    untouched.

    Parameters:
        exposures: sequence of numeric exposures, one per signature.
        background_sigs: iterable of integer indices into ``exposures``.

    Returns:
        list of int: for every background index whose exposure is non-zero
        (in the order given), its position among the non-zero exposures.

    Raises:
        IndexError: if a background index is out of range for ``exposures``.
    """
    # Position each non-zero entry would occupy once the zeros are removed.
    compact_position = {}
    for original_idx, value in enumerate(exposures):
        if value != 0:
            compact_position[original_idx] = len(compact_position)

    size = len(exposures)
    # The filter runs first, so an out-of-range index raises IndexError there;
    # the modulo only normalises valid negative indices.
    return [
        compact_position[idx % size]
        for idx in background_sigs
        if exposures[idx] != 0
    ]
def add_connected_sigs(background_sigs, allsigids):
    """Extend a set of signature indices with their linked partner signatures.

    Some signatures are treated as a group: if any member of a group is among
    the background signatures, every member of that group is added.

    Parameters:
        background_sigs: indices into ``allsigids`` of the current
            background signatures.
        allsigids: signature names, resolved through the ``sub`` helpers.

    Returns:
        The sorted result of ``sub.get_indeces`` for the original names plus
        all partners of any group that was hit.
    """
    linked_groups = (
        ("SBS2", "SBS13"),
        ("SBS7a", "SBS7b", "SBS7c", "SBS7d"),
        ("SBS10a", "SBS10b"),
        ("SBS17a", "SBS17b"),
    )

    present_names = set(sub.get_items_from_index(allsigids, background_sigs))

    # Groups are tested against the names present on entry, not against the
    # names accumulated while scanning.
    partners = set()
    for group in linked_groups:
        if not present_names.isdisjoint(group):
            partners.update(group)

    expanded_names = list(present_names | partners)
    expanded_indices = sub.get_indeces(allsigids, expanded_names)
    expanded_indices.sort()
    return expanded_indices
def add_remove_signatures(W,
                          sample,
                          metric="l2",
                          solver="nnls",
                          background_sigs=[],
                          permanent_sigs=[],
                          candidate_sigs="all",
                          add_penalty=0.05,
                          remove_penalty=0.01,
                          check_rule_negatives=[],
                          checkrule_penalty=1.00,
                          allsigids=False,
                          directory=os.getcwd() +
                          "/Signature_assaignment_logfile.txt",
                          connected_sigs=True,
                          verbose=False):
    """Pick a signature composition for one sample by layered add/remove steps.

    Each layer tries every candidate signature that is not yet in the
    background: the candidate is offered to ``add_signatures`` and the result
    is pruned by ``remove_all_single_signatures``. The candidate with the
    lowest distance wins the layer. If that distance beats the best distance
    so far, its non-zero signatures become the new background and another
    layer runs; otherwise the search stops. Progress is appended to the log
    file at ``directory``.

    Parameters:
        W: 2-D array whose columns are signatures.
        sample: 1-D array of mutation counts for the sample (it is indexed
            as ``sample[:, np.newaxis]``).
        metric, solver: accepted for interface compatibility; this function
            does not read them (both sub-steps are called with ``"l2"``).
        background_sigs: indices of signatures assumed present at the start.
            The caller's list is never modified in place.
        permanent_sigs: indices re-added to the background for every
            candidate and passed to the removal step as protected.
        candidate_sigs: ``"all"`` or an iterable of column indices of ``W``.
        add_penalty: ``cutoff`` forwarded to ``add_signatures``.
        remove_penalty: ``cutoff`` forwarded to
            ``remove_all_single_signatures``.
        check_rule_negatives, checkrule_penalty: forwarded to
            ``add_signatures``.
        allsigids: signature names, or a bool to fall back to the candidate
            indices as names.
        directory: path of the log file, opened in append mode.
        connected_sigs: when ``True``, linked signatures are pulled into the
            background through ``add_connected_sigs``.
        verbose: print progress to stdout.

    Returns:
        tuple: ``(background_sigs, finalactivities, original_distance,
        cosine_similarity, kldiv, correlation,
        cosine_similarity_with_four_signatures)``. The last item is 1.0
        unless some accepted layer had exactly four signatures.
    """
    # The context manager guarantees the log is flushed and closed even when
    # one of the fitting steps raises.
    with open(directory, 'a') as lognote:
        maxmutation = np.sum(np.array(sample))
        always_background = copy.deepcopy(permanent_sigs)
        M = sample

        # Only compare against the sentinel when a string was passed; an
        # array compared with "all" has no unambiguous truth value.
        if isinstance(candidate_sigs, str) and candidate_sigs == "all":
            candidate_sigs = list(range(W.shape[1]))

        original_distance = np.inf  # best distance over all layers so far
        layer = 0
        # Stays at 1.0 unless an accepted composition has exactly four
        # signatures.
        cosine_similarity_with_four_signatures = 1.0

        # Set the signature names.
        if not isinstance(allsigids, bool):
            allsigids = sub.get_items_from_index(allsigids, candidate_sigs)
        else:
            allsigids = candidate_sigs

        while True:
            if verbose:
                print("\n\n\n\n!!!!!!!!!!!!!!!!!!STARTING LAYER: ", layer)
            lognote.write(
                "\n\n!!!!!!!!!!!!!!!!!!!!!!!!! LAYER: {} !!!!!!!!!!!!!!!!!!!!!!!!!\n"
                .format(layer))
            layer = layer + 1
            layer_original_distance = np.inf  # best distance in this layer
            sigsToBeAdded = list(set(candidate_sigs) - set(background_sigs))

            if not isinstance(allsigids, bool):
                allsigidsToBeAdded = sub.get_items_from_index(
                    allsigids, sigsToBeAdded)
            else:
                allsigidsToBeAdded = sigsToBeAdded

            # NOTE(review): if the very first layer has no candidate left,
            # `selected_signatures` and `activities` are never assigned and
            # the logging below raises NameError - confirm callers always
            # leave at least one candidate outside the background.
            for i, j in zip(sigsToBeAdded, allsigidsToBeAdded):
                loop_sig = [i]
                background_sigs = list(
                    set(background_sigs).union(set(always_background)))
                if connected_sigs == True:
                    background_sigs = add_connected_sigs(
                        background_sigs, list(allsigids))

                add_exposures, add_distance, _ = add_signatures(
                    W,
                    M[:, np.newaxis],
                    presentSignatures=copy.deepcopy(background_sigs),
                    toBeAdded=loop_sig,
                    metric="l2",
                    verbose=False,
                    check_rule_negatives=check_rule_negatives,
                    check_rule_penalty=checkrule_penalty,
                    cutoff=add_penalty)
                added_idx = np.nonzero(add_exposures)[0]
                if verbose:
                    print(
                        "\n\n\n################## Add Index {} ########################"
                        .format(j))
                    print(added_idx)
                    print(sigsToBeAdded)
                    print(add_distance)

                remove_exposures, remove_distance, _ = remove_all_single_signatures(
                    W,
                    add_exposures,
                    M,
                    background_sigs=always_background,
                    metric="l2",
                    verbose=False,
                    cutoff=remove_penalty)
                kept_idx = np.nonzero(remove_exposures)[0]
                if verbose:
                    print("\n################## Remove ########################")
                    print(kept_idx)
                    print(remove_distance)

                # Keep the add-step result only when the removal step left
                # the set of active signatures untouched. The index arrays
                # are compared element-wise; `.all()` on each side would
                # only compare two truthiness flags.
                if np.array_equal(added_idx, kept_idx):
                    distance = add_distance
                    exposures = add_exposures
                else:
                    distance = remove_distance
                    exposures = remove_exposures

                if distance < layer_original_distance:
                    selected_signatures = np.nonzero(exposures)[0]
                    layer_original_distance = distance
                    activities = exposures

            if verbose:
                print(
                    "\n\n#################### After Add-Remove #################################"
                )
                print(selected_signatures)
                print(layer_original_distance)
            lognote.write("Best Signature Composition {}\n".format(
                sub.get_items_from_index(allsigids, selected_signatures)))
            lognote.write("L2 Error % {}\n".format(
                round(layer_original_distance, 2)))
            lognote.write("Cosine Similarity {}\n".format(
                round(cos_sim(M, np.dot(W, activities)), 2)))

            if layer_original_distance < original_distance:
                # The layer improved the fit: adopt it and try another layer.
                original_distance = layer_original_distance
                background_sigs = list(selected_signatures)
                finalactivities = activities
                if len(background_sigs) == 4:
                    cosine_similarity_with_four_signatures = cos_sim(
                        M, np.dot(W, finalactivities))
            else:
                # No improvement: report statistics for the last accepted
                # composition and stop.
                N = np.dot(W, finalactivities)
                cosine_similarity = cos_sim(M, N)
                kldiv = round(scipy.stats.entropy(M, N), 3)
                correlation, _ = scipy.stats.pearsonr(M, N)
                correlation = round(correlation, 2)
                break

        # Round the exposures, then push any rounding surplus/deficit onto
        # the largest exposure so the total still equals the mutation count.
        maxcoef = np.max(finalactivities)
        idxmaxcoef = list(finalactivities).index(maxcoef)
        finalactivities = np.round(finalactivities)
        if np.sum(finalactivities) != maxmutation:
            finalactivities[idxmaxcoef] = round(
                finalactivities[idxmaxcoef]) + maxmutation - np.sum(
                    finalactivities)

        if verbose:
            print("\n########################## Final ###########################")
            print(background_sigs)
            print(original_distance)
            print(finalactivities)

        lognote.write(
            "\n#################### Final Composition #################################\n"
        )
        # NOTE(review): this logs the best composition of the last layer
        # tried, which is not necessarily the accepted one - confirm intent.
        lognote.write("{}\n".format(
            sub.get_items_from_index(allsigids, selected_signatures)))
        lognote.write("L2 Error % {}\n".format(round(original_distance, 2)))
        lognote.write("Cosine Similarity {}\n".format(
            round(cos_sim(M, np.dot(W, finalactivities)), 2)))

    return (background_sigs, finalactivities, original_distance,
            cosine_similarity, kldiv, correlation,
            cosine_similarity_with_four_signatures)
def remove_all_single_signatures(W,
                                 H,
                                 genomes,
                                 metric="l2",
                                 solver="nnls",
                                 cutoff=0.05,
                                 background_sigs=[],
                                 verbose=False):
    """Greedily drop signatures whose removal barely worsens the fit.

    Starting from the non-zero entries of ``H``, every pass refits the sample
    with each non-protected signature left out in turn and keeps the removal
    that increases the distance the least. Passes repeat until the smallest
    increase exceeds ``cutoff`` or a single signature remains.

    Parameters:
        W: 2-D array whose columns are signatures.
        H: 1-D exposure vector for the sample, one entry per column of ``W``.
        genomes: 1-D array of mutation counts for the sample.
        metric: ``"l2"`` (relative L2 error) or ``"cosine"``
            (1 - cosine similarity).
        solver: ``"nnls"`` or ``"slsqp"``.
        cutoff: largest increase in distance tolerated for a removal.
        background_sigs: indices (columns of ``W``) that must never be
            removed. The caller's list is not modified.
        verbose: print progress to stdout.

    Returns:
        tuple: ``(exposures, distance, cosine_similarity)`` where
        ``exposures`` has one entry per column of ``W``.
    """
    signature_ids = sub.make_letter_ids(idlenth=W.shape[1],
                                        mtype="Signature ")
    successList = [0, [], 0]
    # Protected indices, always expressed as columns of W.
    background_sig = copy.deepcopy(background_sigs)

    # Baseline: refit using exactly the signatures active in H.
    base_H_index = list(np.nonzero(H)[0])
    reg = nnls(W[:, base_H_index], genomes)
    base_weights = reg[0]
    if metric == "cosine":
        originalSimilarity = 1 - cos_sim(
            genomes, np.dot(W[:, base_H_index], base_weights))
        if verbose:
            print("originalSimilarity", 1 - originalSimilarity)
    elif metric == "l2":
        originalSimilarity = np.linalg.norm(
            genomes - np.dot(W[:, base_H_index], base_weights),
            ord=2) / np.linalg.norm(genomes, ord=2)
        if verbose:
            print("originalSimilarity", originalSimilarity)

    oldExposures = np.round(H)

    # Nothing can be removed unless at least two signatures are active.
    if len(oldExposures[np.nonzero(oldExposures)]) > 1:
        Flag = True
    else:
        Flag = False
        if metric == "cosine":
            return oldExposures, 1 - originalSimilarity, cos_sim(
                genomes, np.dot(W, H))
        else:
            return oldExposures, originalSimilarity, cos_sim(
                genomes, np.dot(W, H))

    current_signatures = sub.get_items_from_index(signature_ids,
                                                  np.nonzero(oldExposures)[0])
    while Flag:
        # `reference_exposures` is the vector whose non-zero entries define
        # the columns considered in this pass.
        if len(successList[1]) == 0:
            reference_exposures = oldExposures
            initialZerosIdx = list(np.where(oldExposures == 0)[0])
            selectableIdx = list(np.where(oldExposures > 0)[0])
        elif len(successList[1]) > 1:
            reference_exposures = successList[1]
            initialZerosIdx = list(np.where(successList[1] == 0)[0])
            selectableIdx = list(np.where(successList[1] > 0)[0])
        else:
            print("iteration is completed")

        maxmutation = round(np.sum(genomes))
        Winit = W[:, selectableIdx]
        record = [np.inf, [], 1]  # [difference, exposures, distance]
        l = Winit.shape[1]

        # Translate the protected columns of W into positions inside Winit.
        # This is always derived from the original indices and the vector
        # that defines Winit; re-mapping an already re-mapped list would
        # shift or drop the protection after the first removal. A copy is
        # passed so the helper cannot alter `background_sig`.
        compact_background = get_changed_background_sig_idx(
            list(reference_exposures), list(background_sig))

        for i in range(l):
            if i in compact_background:
                continue
            loopSelection = list(range(l))
            del loopSelection[i]
            W1 = Winit[:, loopSelection]

            # Initial guess and constraints for the SLSQP solver. They are
            # built for both solvers so the random stream is consumed in
            # the same way regardless of `solver`.
            x0 = np.random.rand(l - 1, 1) * maxmutation
            x0 = x0 / np.sum(x0) * maxmutation
            bnds = create_bounds([], genomes, W1.shape[1])
            cons1 = {'type': 'eq', 'fun': constraints1, 'args': [genomes]}

            if solver == "slsqp":
                sol = minimize(parameterized_objective2_custom,
                               x0,
                               args=(W1, genomes),
                               bounds=bnds,
                               constraints=cons1,
                               tol=1e-15)
                newExposure = list(sol.x)
                # The reconstruction is needed by the similarity step below;
                # without it this solver path fails or reuses a stale value.
                newSample = np.dot(W1, sol.x)
            if solver == "nnls":
                reg = nnls(W1, genomes)
                weights = reg[0]
                newSample = np.dot(W1, weights)
                # Rescale so the exposures add up to the mutation count.
                normalised_weights = weights / sum(weights)
                solution = normalised_weights * sum(genomes)
                newExposure = list(solution)

            # Put a zero back for the left-out signature, then for every
            # signature that was already zero, to restore full length.
            newExposure.insert(i, 0)
            initialZerosIdx.sort()
            for zeros in initialZerosIdx:
                newExposure.insert(zeros, 0)
            newExposure = np.array(newExposure)

            if verbose:
                print("\nRemoving {}".format(current_signatures[i]))
            if metric == "cosine":
                newSimilarity = 1 - cos_sim(genomes, newSample)
                if verbose:
                    print("newSimilarity", 1 - newSimilarity)
            elif metric == "l2":
                newSimilarity = np.linalg.norm(
                    genomes - newSample, ord=2) / np.linalg.norm(genomes,
                                                                 ord=2)
                if verbose:
                    print("newSimilarity", newSimilarity)

            difference = newSimilarity - originalSimilarity
            if difference < 0:
                difference = cutoff + 1e-100
            if verbose:
                print("difference", difference)
            if difference < record[0]:
                record = [difference, newExposure, newSimilarity]
            if verbose:
                print("--------------------------------------")

        if verbose:
            print(
                "\n############################\n############################")
            selected_sigs = sub.get_items_from_index(
                signature_ids, np.nonzero(record[1])[0])
            dropped_sig = list(set(current_signatures) - set(selected_sigs))
            print("Dropped Signature: {}".format(dropped_sig))
            current_signatures = selected_sigs
            print("New Similarity", record[2])
            print("Similarity Difference: {}".format(record[2] -
                                                     originalSimilarity))
            print("Current Signatures: {}".format(selected_sigs))
            print(
                "****************************\n****************************\n\n"
            )

        if record[0] > cutoff:
            # Even the cheapest removal costs too much: keep the last state.
            Flag = False
        elif len(record[1][np.nonzero(record[1])]) == 1:
            successList = record
            Flag = False
        else:
            successList = record
            originalSimilarity = record[2]

    # No removal was accepted: report the rounded input exposures.
    if len(successList[1]) == 0:
        successList = [0.0, oldExposures, originalSimilarity]

    if verbose:
        print("\n")
        print("Final Exposure")
        print(successList[1])
        if metric == "cosine":
            print("Final Similarity: ", 1 - successList[2])
        if metric == "l2":
            print("Final Similarity: ", successList[2])

    H = successList[1]
    # First value is the exposure vector, second the distance under
    # `metric`, third the cosine similarity of the reconstruction.
    return H, successList[2], cos_sim(genomes, np.dot(W, H))