def tune_and_eval(self, results_file, params=None, feature_names=None,
                  njobs=50, kfold=10, optimized_for='f1_macro'):
    '''
    Tune the model with K-fold cross-validation and, when feature names
    are supplied, report the important random-forest features.

    :param results_file: path prefix of the results; '_RF' is appended to it
    :param params: parameters to be tuned; RFClassifier.params_tuning is
        used when None
    :param feature_names: names handed to generate_RF_important_features;
        the feature report is skipped when None
    :param njobs: forwarded as n_jobs to the cross-validation helper
    :param kfold: number of folds
    :param optimized_for: score forwarded to the cross-validation helper
    :return: None
    '''
    grid = RFClassifier.params_tuning if params is None else params
    output_prefix = results_file + '_RF'

    self.CV = KFoldCrossVal(self.X, self.Y, folds=kfold)
    self.CV.tune_and_evaluate(self.model,
                              parameters=grid,
                              score=optimized_for,
                              file_name=output_prefix,
                              n_jobs=njobs)

    if feature_names is None:
        return

    # Only the fitted estimator is needed; the other entries are unpacked
    # so that an unexpected pickle layout still fails loudly.
    (_labels, _conf, _labels_again, _best_score, fitted_model,
     _cv_results, _best_params,
     (_cv_pred, _cv_true, _isolates),
     (_test_pred, _test_true)) = FileUtility.load_obj(output_prefix + '.pickle')
    self.generate_RF_important_features(fitted_model, feature_names,
                                        results_file)
def load_precalculated(file_path):
    '''
    Read previously stored results back from disk.

    :param file_path: path passed unchanged to FileUtility.load_obj
    :return: whatever FileUtility.load_obj returns for that path
    '''
    stored_results = FileUtility.load_obj(file_path)
    return stored_results
def tune_and_eval(self, results_file, params=None, njobs=50, kfold=10,
                  feature_names=None, optimized_for='f1_macro'):
    '''
    K-fold cross-validation

    :param results_file: file to save the results; '_SVM' is appended to it
    :param params: parameters to be tuned; SVM.params_tuning when None
    :param njobs: number of cores
    :param kfold: number of folds
    :param feature_names: names handed to generate_SVM_important_features;
        the feature report is skipped when None
    :param optimized_for: score forwarded to the cross-validation helper
    :return: None
    '''
    # Identity test: `== None` would invoke __eq__ on whatever was passed.
    if params is None:
        params = SVM.params_tuning

    CV = KFoldCrossVal(self.X, self.Y, folds=kfold)
    CV.tune_and_evaluate(self.model,
                         parameters=params,
                         score=optimized_for,
                         file_name=results_file + '_SVM',
                         n_jobs=njobs)

    if feature_names is not None:
        # Only best_estimator_ is used below; the full unpack documents
        # (and checks) the layout of the stored list.
        [
            nested_scores, cv_dicts, label_set, conf, label_set,
            best_score_, best_estimator_, cv_results_, best_params_,
            (cv_predictions_pred, cv_predictions_trues, isolates),
            (Y_test_pred, Y_test)
        ] = FileUtility.load_obj(results_file + '_SVM.pickle')
        self.generate_SVM_important_features(best_estimator_, feature_names,
                                             results_file)
def get_cv_res(filename):
    '''
    Summarise the best cross-validation row of a stored result file.

    The row is the one with the highest 'mean_test_f1_macro' in
    cv_results_. The stored conf object is printed as a side effect.

    :param filename: path of the pickle passed to FileUtility.load_obj; it
        must hold a 7-element list
        [label_set, conf, best_score_, best_estimator_, cv_results_,
        best_params_, pred]
    :return: dict with the rounded macro F1 under 'f1_macro', LaTeX
        "mean (plus-minus) std" strings for the F1 / precision / recall
        metrics, the source file name under 'file', and str() of
        conf['auc_macro'] and best_score_ under 'auc_macro' and 'score'
    '''
    [_label_set, conf, best_score_, _best_estimator, cv_results_,
     _best_params, _pred] = FileUtility.load_obj(filename)
    print(conf)

    idx = np.argmax(cv_results_['mean_test_f1_macro'])

    def mean_pm_std(metric):
        # "<mean> $\pm$ <std>" for the selected row, both rounded to 2 digits.
        mean = np.round(cv_results_['mean_test_' + metric][idx], 2)
        std = np.round(cv_results_['std_test_' + metric][idx], 2)
        return str(mean) + ' $\\pm$ ' + str(std)

    res = dict()
    res['f1_macro'] = np.round(cv_results_['mean_test_f1_macro'][idx], 2)
    res['f1_macro*'] = mean_pm_std('f1_macro')
    res['f1_micro'] = mean_pm_std('f1_micro')
    res['precision_micro'] = mean_pm_std('precision_micro')
    res['precision_macro'] = mean_pm_std('precision_macro')
    res['recall_micro'] = mean_pm_std('recall_micro')
    res['recall_macro'] = mean_pm_std('recall_macro')
    # The original referenced the undefined name `file` here.
    res['file'] = filename
    res['auc_macro'] = str(conf['auc_macro'])
    res['score'] = str(best_score_)
    return res
def tune_and_eval_predefined(self, results_file, isolates, folds,
                             params=None, feature_names=None, njobs=50):
    '''
    Tune the model on predefined folds and, when feature names are
    supplied, report the important random-forest features.

    :param results_file: path prefix of the results; '_RF' is appended to it
    :param isolates: forwarded to PredefinedFoldCrossVal
    :param folds: forwarded to PredefinedFoldCrossVal
    :param params: parameter grid to tune; a built-in random-forest grid
        is used when None
    :param feature_names: names handed to generate_RF_important_features;
        the feature report is skipped when None
    :param njobs: forwarded as n_jobs to the cross-validation helper
    :return: None
    '''
    if params is None:
        params = [{
            "n_estimators": [100, 200, 500, 1000],
            "criterion": ["entropy"],  # "gini",
            'max_features': ['sqrt', 'auto'],  # 'auto',
            'min_samples_split': [2, 5, 10],  # 2,5,10
            'min_samples_leaf': [1, 2],
            'class_weight': ['balanced', None]
        }]

    self.CV = PredefinedFoldCrossVal(self.X, self.Y, isolates, folds)
    self.CV.tune_and_evaluate(self.model,
                              parameters=params,
                              score='f1_macro',
                              file_name=results_file + '_RF',
                              n_jobs=njobs)

    if feature_names is not None:
        # Load once; two list layouts are accepted for the stored object.
        saved = FileUtility.load_obj(results_file + '_RF.pickle')
        try:
            [
                label_set, conf, best_score_, best_estimator_, cv_results_,
                best_params_, (y_predicted, Y, label_set)
            ] = saved
        except (ValueError, TypeError):
            # Only an unpacking mismatch should select the shorter layout;
            # a bare except would also swallow KeyboardInterrupt etc.
            [
                label_set, best_score_, best_estimator_, cv_results_,
                best_params_, (Y, label_set)
            ] = saved
        self.generate_RF_important_features(best_estimator_, feature_names,
                                            results_file, 1000)
def __init__(self):
    '''
    Load the precomputed object stored in '../data_config/seg2sec.pickle'
    and set up the colour dictionary for secondary structures.
    '''
    self.seq2freqstructs = FileUtility.load_obj(
        '../data_config/seg2sec.pickle')
    # color dictionary for secondary structures
    # NOTE(review): the original built this as a local that was discarded
    # when __init__ returned; it is kept on the instance so it is usable.
    # Confirm no class-level `color_dict` is meant to take precedence.
    self.color_dict = {
        'e': 'yellow',
        'g': 'blue',
        'h': 'blue',
        'n': 'red',
        's': 'red',
        't': 'red'
    }
def load_alpha_distribution(self):
    '''
    Fit `st.alpha` to the mean normalised difference curve of the stored
    samples and keep the fitted parameters in self.alpha_param.

    The pickle 'data_config/swiss_1000_samples.pickle' is indexed as
    [vocab][i] for vocab in 10000, 20000, ..., 990000 and i in 0..999.
    For each i the negated first difference along vocab is taken, the
    collection is passed through normalize_mat and averaged over samples.

    :return: None
    '''
    swiss_size_change = FileUtility.load_obj(
        'data_config/swiss_1000_samples.pickle')
    vocab_sizes = np.arange(10000, 1000000, 10000)

    all_samples = []
    for i in tqdm.tqdm(range(0, 1000)):
        sample = [swiss_size_change[vocab][i] for vocab in vocab_sizes]
        all_samples.append(-np.diff(sample))

    # The original normalised twice, the second time only to compute a
    # standard deviation that was never used.
    sample_mat = np.mean(normalize_mat(all_samples), axis=0)
    self.alpha_param = st.alpha.fit(sample_mat)
def create_excel_file(input_path, output_path):
    '''
    Collect the result pickles below input_path into one Excel workbook.

    Every '*.pickle' found by FileUtility.recursive_glob is expected to be
    named '<feature>_CV_<scheme>_<classifier>.pickle' and to hold the
    9-element result list unpacked below. Two sheets are written: 'Test'
    (scores of Y_test vs Y_test_pred) and 'Cross-validation' (scores of
    the cross-validation predictions). Rows are sorted by macroF1
    (descending), then feature and classifier. Nothing is written when no
    pickle is found.

    :param input_path: directory searched for result pickles
    :param output_path: path of the Excel file to create
    :return: None
    '''
    files_cv = FileUtility.recursive_glob(input_path, '*.pickle')
    if len(files_cv) == 0:
        return
    files_cv.sort()

    columns = ['feature', 'CV', 'classifier', 'accuracy', 'Precision',
               'Recall', 'F1', 'macroF1']
    table_test = {column: [] for column in columns}
    table_cv = {column: [] for column in columns}

    # Kept from the original: silences warnings process-wide from here on.
    import warnings
    warnings.filterwarnings('ignore')

    def add_row(table, rep, cv_scheme, classifier, y_true, y_pred):
        # Append one result row; all scores are rounded to two decimals.
        table['feature'].append(rep)
        table['classifier'].append(classifier)
        table['CV'].append(cv_scheme)
        table['Precision'].append(np.round(precision_score(y_true, y_pred), 2))
        table['Recall'].append(np.round(recall_score(y_true, y_pred), 2))
        table['F1'].append(np.round(f1_score(y_true, y_pred), 2))
        table['macroF1'].append(
            np.round(f1_score(y_true, y_pred, average='macro'), 2))
        table['accuracy'].append(np.round(accuracy_score(y_true, y_pred), 2))

    for path in files_cv:
        [label_set, conf, label_set, best_score_, best_estimator_,
         cv_results_, best_params_,
         (cv_predictions_pred, cv_predictions_trues, isolates),
         (Y_test_pred, Y_test)] = FileUtility.load_obj(path)
        rep = path.split('/')[-1].split('_CV_')[0]
        after_cv = path.split('_CV_')[1].split('_')
        CV_scheme = after_cv[0]
        classifier = after_cv[1].split('.')[0]
        add_row(table_test, rep, CV_scheme, classifier, Y_test, Y_test_pred)
        add_row(table_cv, rep, CV_scheme, classifier,
                cv_predictions_trues, cv_predictions_pred)

    sort_keys = ['macroF1', 'feature', 'classifier']
    sort_order = [False, True, True]
    df1 = pd.DataFrame(data=table_test, columns=columns)
    df2 = pd.DataFrame(data=table_cv, columns=columns)
    df1.sort_values(sort_keys, ascending=sort_order, inplace=True)
    df2.sort_values(sort_keys, ascending=sort_order, inplace=True)

    # ExcelWriter.save() no longer exists in pandas 2.x; the context
    # manager writes and closes the workbook on every pandas version.
    with pd.ExcelWriter(output_path) as writer:
        df1.to_excel(writer, sheet_name='Test', index=False)
        df2.to_excel(writer, sheet_name='Cross-validation', index=False)
def make_activation_function(file_name, X, last_layer=None):
    '''
    Rebuild a network from stored weights, run it on X and save the output.

    Layer sizes are read from the fourth '_'-separated field of the file's
    base name, split on '-'. A size below 1 becomes a Dropout layer, any
    other size a Dense layer. When last_layer is truthy it is appended as
    one more size and that final Dense layer uses softmax instead of relu.

    :param file_name: path of the stored weights (FileUtility.load_obj)
    :param X: input matrix; X.shape[1] is the input dimension
    :param last_layer: optional size of an extra final layer
    :return: the output of model.predict(X); it is also written with
        np.savetxt to file_name with the base name's first '_' field
        replaced by 'activationlayer'
    '''
    pretrained_weights = FileUtility.load_obj(file_name)

    base_name = file_name.split('/')[-1]
    h_sizes = [float(token) for token in base_name.split('_')[3].split('-')]
    if last_layer:
        h_sizes = h_sizes + [last_layer]
    final_idx = len(h_sizes) - 1

    model = Sequential()
    for layer_idx, h_layer_size in enumerate(h_sizes):
        if layer_idx == 0:
            # The first layer fixes the input dimension.
            model.add(
                Dense(int(h_layer_size),
                      input_dim=X.shape[1],
                      weights=pretrained_weights[0],
                      activation='relu'))
            continue
        if h_layer_size < 1:
            model.add(
                Dropout(h_layer_size, weights=pretrained_weights[layer_idx]))
            continue
        is_output = layer_idx == final_idx and last_layer
        model.add(
            Dense(int(h_layer_size),
                  weights=pretrained_weights[layer_idx],
                  activation='softmax' if is_output else 'relu'))

    activations = model.predict(X)
    output_path = file_name.replace(base_name.split('_')[0],
                                    'activationlayer')
    np.savetxt(output_path, activations)
    return activations
def get_pretrained_model(self, file_name, trainable):
    '''
    Build and compile a model that starts with stored pretrained layers.

    Layer sizes come from the fourth '_'-separated field of the file's
    base name, split on '-'. Sizes below 1 become Dropout layers, the rest
    Dense relu layers. The layers listed in self.model_arch (if any) are
    stacked on top, followed by a softmax layer with self.C units.

    :param file_name: path of the stored weights (FileUtility.load_obj)
    :param trainable: trainable flag given to every pretrained layer
    :return: the model compiled with categorical cross-entropy, adam and
        the accuracy metric
    '''
    pretrained_weights = FileUtility.load_obj(file_name)
    size_field = file_name.split('/')[-1].split('_')[3]
    h_sizes = [float(token) for token in size_field.split('-')]

    model = Sequential()

    # Pretrained part: every layer receives its stored weights.
    for layer_idx, h_layer_size in enumerate(h_sizes):
        if layer_idx == 0:
            layer = Dense(int(h_layer_size),
                          input_dim=self.X.shape[1],
                          weights=pretrained_weights[0],
                          activation='relu',
                          trainable=trainable)
        elif h_layer_size < 1:
            layer = Dropout(h_layer_size,
                            weights=pretrained_weights[layer_idx],
                            trainable=trainable)
        else:
            layer = Dense(int(h_layer_size),
                          weights=pretrained_weights[layer_idx],
                          activation='relu',
                          trainable=trainable)
        model.add(layer)

    # Optional freshly initialised layers on top of the pretrained ones.
    if self.model_arch:
        for h_layer_size in self.model_arch:
            if h_layer_size < 1:
                model.add(Dropout(h_layer_size))
            else:
                model.add(Dense(h_layer_size, activation='relu'))

    model.add(Dense(self.C, activation='softmax'))
    model.compile(loss='categorical_crossentropy',
                  optimizer='adam',
                  metrics=['accuracy'])
    return model
def tune_and_eval_predefined(self, results_file, isolates, folds_file,
                             test_file, params=None, njobs=50,
                             feature_names=None, optimized_for='f1_macro'):
    '''
    Tune the model on predefined folds and, when feature names are
    supplied, report the important SVM features.

    :param results_file: path prefix of the results; '_SVM' is appended
    :param isolates: forwarded to PredefinedFoldCrossVal
    :param folds_file: forwarded to PredefinedFoldCrossVal
    :param test_file: forwarded to PredefinedFoldCrossVal
    :param params: parameters to be tuned; SVM.params_tuning when None
    :param njobs: forwarded as n_jobs to the cross-validation helper
    :param feature_names: names handed to generate_SVM_important_features;
        the feature report is skipped when None
    :param optimized_for: score forwarded to the cross-validation helper
    :return: None
    '''
    # Identity test: `== None` would invoke __eq__ on whatever was passed.
    if params is None:
        params = SVM.params_tuning

    self.CV = PredefinedFoldCrossVal(self.X, self.Y, isolates, folds_file,
                                     test_file)
    self.CV.tune_and_evaluate(self.model,
                              parameters=params,
                              score=optimized_for,
                              file_name=results_file + '_SVM',
                              n_jobs=njobs)

    if feature_names is not None:
        # Only best_estimator_ is used below; the full unpack documents
        # (and checks) the layout of the stored list.
        [
            nested_scores, cv_dicts, label_set, conf, label_set,
            best_score_, best_estimator_, cv_results_, best_params_,
            (cv_predictions_pred, cv_predictions_trues, isolates),
            (Y_test_pred, Y_test)
        ] = FileUtility.load_obj(results_file + '_SVM.pickle')
        self.generate_SVM_important_features(best_estimator_, feature_names,
                                             results_file)
def load_history(filename, fileout):
    '''
    Plot the stored training history as a loss-versus-epoch figure.

    :param filename: pickle holding an 8-element list whose last entry is
        the tuple (loss_values, val_loss_values, epochs)
    :param fileout: output path without extension; '.pdf' is appended
    :return: None (the figure is saved and then shown)
    '''
    stored = FileUtility.load_obj(filename)
    (_latex_line, _p_micro, _r_micro, _f1_micro,
     _p_macro, _r_macro, _f1_macro, history) = stored
    loss_values, val_loss_values, epochs = history

    # Global matplotlib styling, applied in the original order.
    rc = matplotlib.rcParams
    rc['mathtext.fontset'] = 'stix'
    style_settings = (
        ('font.family', 'STIXGeneral'),
        ('mathtext.fontset', 'custom'),
        ('mathtext.rm', 'Bitstream Vera Sans'),
        ('mathtext.it', 'Bitstream Vera Sans:italic'),
        ('mathtext.bf', 'Bitstream Vera Sans:bold'),
        ("axes.edgecolor", "black"),
        ("axes.linewidth", 0.6),
    )
    for key, value in style_settings:
        rc[key] = value
    plt.rc('text', usetex=True)

    plt.plot(epochs, loss_values, 'ro', label='Loss for train set')
    plt.plot(epochs, val_loss_values, 'b+', label='Loss for test set')
    plt.xlabel('Epochs')
    plt.ylabel('Loss')
    plt.legend(loc=1,
               prop={'size': 8},
               ncol=1,
               edgecolor='black',
               facecolor='white',
               frameon=True)
    plt.title(
        'Loss with respect to the number of epochs for train and test sets')
    plt.savefig(fileout + '.pdf')
    plt.show()
def tune_and_evaluate(self, estimator, parameters, cv_inner=5,
                      score='f1_macro', n_jobs=-1, file_name='results',
                      NUM_TRIALS=3):
    '''
    Run nested and non-nested grid-search cross-validation and save the
    results with FileUtility.save_obj.

    :param estimator: estimator handed to GridSearchCV
    :param parameters: parameter grid handed to GridSearchCV
    :param cv_inner: number of splits of the inner StratifiedKFold
    :param score: score used for refit and for the nested scores
    :param n_jobs: n_jobs of the grid searches
    :param file_name: directory/tuning/classifier/features/ prefix handed
        to FileUtility.save_obj
    :param NUM_TRIALS: number of nested cross-validation trials; trial i
        uses random_state=i for the inner folds
    :return: None. The saved list is [nested_scores, cv_dicts, label_set,
        conf, label_set, best_score_, best_estimator_, cv_results_,
        best_params_, (cv_predictions_pred, cv_predictions_trues,
        isolates), (Y_test_pred, Y_test)]
    '''
    print(
        'ummaaaaad injaaaa ==============================================')
    self.nested_scores = []
    cv_dicts = []

    # Loop for each trial
    for trial in tqdm.tqdm(range(NUM_TRIALS)):
        # Choose cross-validation techniques for the inner and outer loops,
        # independently of the dataset.
        # E.g "GroupKFold", "LeaveOneOut", "LeaveOneGroupOut", etc.
        inner_cv = StratifiedKFold(n_splits=cv_inner,
                                   shuffle=True,
                                   random_state=trial)
        # parameter search and scoring
        self.greed_search = GridSearchCV(estimator=estimator,
                                         param_grid=parameters,
                                         cv=inner_cv,
                                         scoring=self.scoring,
                                         refit=score,
                                         error_score=0,
                                         n_jobs=n_jobs,
                                         verbose=0)
        # Nested CV with parameter optimization (scores)
        nested_score = cross_val_score(self.greed_search,
                                       X=self.X,
                                       y=self.Y,
                                       cv=self.cv,
                                       n_jobs=1,
                                       scoring=score)
        self.nested_scores.append(nested_score)
        # Nested CV with parameter optimization (predictions)
        cv_dict_pred = cross_val_predict(self.greed_search,
                                         X=self.X,
                                         y=self.Y,
                                         cv=self.cv,
                                         n_jobs=1)
        cv_dicts.append(cv_dict_pred)

    # Non_nested parameter search and scoring
    self.greed_search = GridSearchCV(estimator=estimator,
                                     param_grid=parameters,
                                     cv=self.cv,
                                     scoring=self.scoring,
                                     refit=score,
                                     error_score=0,
                                     n_jobs=n_jobs,
                                     verbose=0)
    self.greed_search.fit(X=self.X, y=self.Y)

    # get the cv results
    cv_predictions_pred = []
    cv_predictions_trues = []
    isolates = []
    for train, test in self.cv:
        # NOTE(review): this refits best_estimator_ in place, so the
        # estimator saved (and used on X_test) below is the one fitted on
        # the last fold's training part - confirm this is intended.
        self.greed_search.best_estimator_.fit(
            self.X[train, :], [self.Y[idx] for idx in train])
        preds = self.greed_search.best_estimator_.predict(self.X[test, :])
        cv_predictions_pred.extend(preds)
        cv_predictions_trues.extend(self.Y[idx] for idx in test)
        isolates.extend(test)

    label_set = sorted(set(self.Y))
    isolates = [self.train_isolate_list[iso] for iso in isolates]
    conf = confusion_matrix(cv_predictions_trues,
                            cv_predictions_pred,
                            labels=label_set)
    Y_test_pred = self.greed_search.best_estimator_.predict(self.X_test)

    # save in file
    # The original re-loaded the object afterwards via the undefined name
    # `filename` (NameError) into variables nobody read; that was removed.
    FileUtility.save_obj(file_name, [
        self.nested_scores, cv_dicts, label_set, conf, label_set,
        self.greed_search.best_score_, self.greed_search.best_estimator_,
        self.greed_search.cv_results_, self.greed_search.best_params_,
        (cv_predictions_pred, cv_predictions_trues, isolates),
        (Y_test_pred, self.Y_test)
    ])
def result_visualization(filename):
    '''
    Print the LaTeX line stored in a result file.

    :param filename: pickle holding an 8-element list whose first entry is
        the LaTeX line and whose last entry is a 3-tuple
        (loss_values, val_loss_values, epochs)
    :return: None
    '''
    stored = FileUtility.load_obj(filename)
    # Unpack the whole record so an unexpected layout still raises.
    (latex_line, _p_micro, _r_micro, _f1_micro,
     _p_macro, _r_macro, _f1_macro,
     (_loss_values, _val_loss_values, _epochs)) = stored
    print(latex_line)
def generate_report(full_path, pred_test, domain, setting):
    '''
    Evaluate the test predictions and write a PDF report.

    The report (full_path + 'final_report.pdf') holds the accuracy /
    F1 table, a confusion-matrix plot, a contingency table relating
    misclassification to label transitions (with chi-square and G-test
    p-values) and the learning curve read from
    full_path + 'history.pickle'.

    :param full_path: path prefix for every file read or written
    :param pred_test: test results; pred_test[i][0] holds the predicted
        and pred_test[i][1] the true score arrays of entry i (argmax over
        axis 1 gives the label). Exactly the entries 0..513 are read.
    :param domain: experiment name part used in the report and file names
    :param setting: experiment name part used in the report and file names
    :return: (acc_test, conf_mat, conf_mat_column_mapping,
        contingency_metric, chi2_res_pval, gtest_res_pval)
    '''
    # Error location analysis
    error_edge = 0
    error_NOTedge = 0
    correct_edge = 0
    correct_NOTedge = 0

    all_pred = []
    all_true = []

    # NOTE(review): 514 is hard-coded; the report text refers to CB513.
    for i in tqdm.tqdm(range(0, 514)):
        pred = np.array([np.argmax(x, axis=1) for x in pred_test[i][0]])
        true = np.array([np.argmax(x, axis=1) for x in pred_test[i][1]])

        # extend() instead of `a = a + b`, which re-copied the list each time
        all_pred.extend(pred.tolist())
        all_true.extend(true.tolist())

        diff = np.diff(true)
        errors = {y for x, y in np.argwhere(pred != true)}
        corrects = set(range(len(pred[0]))) - errors

        # A transition position plus its direct neighbours count as "edge".
        edges_edge = [y for x, y in np.argwhere(diff != 0)]
        edges_before = [x - 1 for x in edges_edge if x - 1 >= 0]
        edges_after = [x + 1 for x in edges_edge if x + 1 < len(pred[0])]
        edges = set(edges_edge + edges_before + edges_after)

        # contingency matrix
        error_edge += len(errors & edges)
        error_NOTedge += len(errors - edges)
        correct_edge += len(corrects & edges)
        correct_NOTedge += len(corrects - edges)

    all_pred = list(itertools.chain(*all_pred))
    all_true = list(itertools.chain(*all_true))

    acc_test = accuracy_score(all_true, all_pred)
    f1_macro = f1_score(all_true, all_pred, average='macro')
    f1_micro = f1_score(all_true, all_pred, average='micro')

    conf_mat = confusion_matrix(all_true, all_pred, labels=list(range(1, 9)))
    conf_mat_column_mapping = {3: 'E (Beta sheet)', 4: 'G (3-10 Helix)',
                               2: 'B (Beta bridge)', 6: 'H (Alpha helix)',
                               8: 'T (Turn)', 1: 'L (Loop)', 7: 'S (Bend)',
                               5: 'I (Pi Helix)'}

    contingency_metric = [[error_edge, error_NOTedge],
                          [correct_edge, correct_NOTedge]]

    # Chi2 test
    chi2_res = scipy.stats.chi2_contingency(contingency_metric,
                                            correction=True)
    chi2_res_pval = chi2_res[1]

    # log-likelihood ratio (i.e. the "G-test")
    gtest_res = scipy.stats.chi2_contingency(contingency_metric,
                                             lambda_="log-likelihood",
                                             correction=True)
    gtest_res_pval = gtest_res[1]

    # https://stackoverflow.com/questions/51864730/python-what-is-the-process-to-create-pdf-reports-with-charts-from-a-db
    experiment_tag = F"{domain}_{setting}"
    confusion_stem = full_path + 'confusion' + experiment_tag
    cmap = sns.cubehelix_palette(light=1, as_cmap=True)
    create_mat_plot(conf_mat,
                    [conf_mat_column_mapping[x] for x in list(range(1, 9))],
                    'Confusion matrix of protein secondary structure prediction',
                    confusion_stem, 'Predicted Label', 'True Label',
                    filetype='png', annot=False, cmap=cmap)

    pdf = MyFPDF()
    pdf.add_page()
    pdf.set_xy(0, 0)
    html = (
        ' <h2>DeepPrime2Sec Report on Protein Secondary Structure Prediction</h2>'
        F' <h3>Experiment name: {domain} - {setting} </h3>'
        ' <hr/>'
        ' <H3 align="left">The performance on CB513</H3>'
        ' <h4>Report on the accuracy</h4>'
        ' <table border="1" align="center" width="70%">'
        ' <thead><tr><th width="30%">Test-set Accuray</th>'
        '<th width="30%">Test-set micro F1</th>'
        '<th width="30%">Test-set macro F1</th></tr></thead>'
        ' <tbody>'
        F' <tr><td>{round(acc_test,3)}</td><td>{round(f1_micro,3)}</td>'
        F'<td>{round(f1_macro,3)}</td></tr>'
        ' </tbody>'
        ' </table>'
        ' <hr/>'
        ' <h4>Confusion matrix</h4> '
    )
    pdf.write_html(html)
    pdf.image(confusion_stem + '.png',
              x=50, y=None, w=100, h=0, type='', link='')

    # The first header cell is a literal backslash; it is written as '\\'
    # because '\<' is an invalid escape sequence.
    html = (
        ' <center>'
        F" <image src='confusion{domain}_{setting}.png'/>"
        ' </center>'
        ' <hr/>'
        ' <h4>Error analysis</h4>'
        ' <h5>Contingency table for location analysis of the misclassified amino acids</h5>'
        ' <table border="1" align="center" width="100%">'
        ' <thead><tr><th width="30%">\\</th>'
        '<th width="30%">Located at the PSS transition</th>'
        '<th width="30%">NOT Located at the PSS transition</th></tr></thead>'
        ' <tbody>'
        F' <tr><td><b>Miss-classified</b></td><td>{error_edge}</td>'
        F'<td>{error_NOTedge}</td></tr>'
        F' <tr><td><b>Truely classified</b></td><td>{correct_edge}</td>'
        F'<td>{correct_NOTedge}</td></tr>'
        ' </tbody>'
        ' </table>'
        ' <br/>'
        F' <b>P-value for Chi-square test</b> = {chi2_res_pval}'
        ' <br/>'
        F' <b>P-value for G-test</b> = {gtest_res_pval}'
        ' <hr/>'
        ' <br/>'
        ' <br/>'
        ' <br/>'
        ' <h4>Learning curve</h4> '
    )
    pdf.write_html(html)

    # learning curve
    history_dict = FileUtility.load_obj(full_path + 'history.pickle')
    plt.clf()
    loss_values = history_dict['loss']
    val_loss_values = history_dict['val_loss']
    epochs = range(1, len(loss_values) + 1)

    matplotlib.rcParams['mathtext.fontset'] = 'stix'
    matplotlib.rcParams['font.family'] = 'STIXGeneral'
    matplotlib.rcParams['mathtext.fontset'] = 'custom'
    matplotlib.rcParams['mathtext.rm'] = 'Bitstream Vera Sans'
    matplotlib.rcParams['mathtext.it'] = 'Bitstream Vera Sans:italic'
    matplotlib.rcParams['mathtext.bf'] = 'Bitstream Vera Sans:bold'
    matplotlib.rcParams["axes.edgecolor"] = "black"
    matplotlib.rcParams["axes.linewidth"] = 0.6

    plt.plot(epochs, loss_values, 'ro', label='Loss for train set')
    plt.plot(epochs, val_loss_values, 'b', label='Loss for test set')
    plt.xlabel('Epochs')
    plt.ylabel('Loss')
    plt.legend(loc=1, prop={'size': 8}, ncol=1, edgecolor='black',
               facecolor='white', frameon=True)
    plt.title('Loss with respect to the number of epochs for train and test sets')

    learning_curve_png = full_path + 'learning_curve' + experiment_tag + '.png'
    plt.savefig(learning_curve_png, dpi=300)
    pdf.image(learning_curve_png,
              x=50, y=None, w=100, h=0, type='', link='')
    pdf.output(full_path + 'final_report.pdf', 'F')

    return (acc_test, conf_mat, conf_mat_column_mapping, contingency_metric,
            chi2_res_pval, gtest_res_pval)
def biomarker_extraction(self, labeler, label_mapper, phenoname,
                         p_value_threshold=0.05, pos_label=None,
                         neg_label=None, excel=0):
    '''
    Detect NPE markers for a phenotype, analyse them and write the outputs.

    Steps: select the samples whose label is in label_mapper and save
    their mapped labels; run NPEMarkerDetection unless its output exists
    (and self.override != 1); build or load the NPEMarkerAnlaysis object;
    generate the tree; optionally the excel file, the t-SNE plot and the
    heatmap. Progress is printed and appended to self.log_file.

    :param labeler: callable taking a file name, or mapping keyed by file
        name, that yields the sample's label
    :param label_mapper: mapping from label to class; samples whose label
        is not in it are skipped
    :param phenoname: phenotype name used in all output file names
    :param p_value_threshold: forwarded to NPEMarkerAnlaysis
    :param pos_label: positive-class label for the heatmap / t-SNE plot
    :param neg_label: negative-class label for the heatmap / t-SNE plot
    :param excel: when 1, the marker excel file and a t-SNE plot labelled
        'Negative' / 'Positive' are created
    :return: None
    '''
    def label_of(sample_file):
        # labeler is either a function or a lookup table.
        if callable(labeler):
            return labeler(sample_file)
        return labeler[sample_file]

    print('\t✔ NPE Marker detection is started..')
    start = time.time()

    rep_base_path = (self.output_directory_inter + 'npe_representation/' +
                     self.dbname + '_uniquepiece_' +
                     str(self.rep_sampling_depth))
    marker_dir = self.output_directory_inter + 'npe_marker_files/'
    final_dir = self.output_directory + 'final_outputs/'
    state_dir = final_dir + 'save_states/'
    matrix_path = rep_base_path + '.npz'
    feature_file_path = rep_base_path + '_features'
    y_path = rep_base_path + '_' + phenoname + '_Y.txt'

    filenames = [
        x.split('/')[-1]
        for x in FileUtility.load_list(rep_base_path + '_meta')
    ]

    # CHECK EXISTING LABELS
    selected_samples = [
        idx for idx, sample_file in enumerate(filenames)
        if label_of(sample_file) in label_mapper
    ]
    Y = [
        str(label_mapper[label_of(filenames[sample_id])])
        for sample_id in selected_samples
    ]
    FileUtility.save_list(y_path, Y)

    DiTaxaWorkflow.ensure_dir(marker_dir)
    if self.override == 1 or not DiTaxaWorkflow.exists(
            marker_dir + '_'.join([phenoname, 'chi2_relative.fasta'])):
        with warnings.catch_warnings():
            warnings.simplefilter("ignore")
            G16s = NPEMarkerDetection(matrix_path, y_path, feature_file_path,
                                      marker_dir + phenoname,
                                      selected_samples)
            G16s.extract_markers()
        end = time.time()
        spent = end - start
        message = ('biomarker extraction ' + phenoname + ' ' + str(spent) +
                   ' seconds , using ' + str(self.num_p) + ' cores')
        print('\t✔ ' + message)
        self.log_file.append(message)
    else:
        print(
            '\t✔ Biomarker are already extracted. Thus, the statistical test was bypassed'
        )
        self.log_file.append(
            ' Biomarker are already extracted. Thus, the statistical test was bypassed'
        )
    FileUtility.save_list(self.output_directory + 'logfile.txt',
                          self.log_file)

    print('\t✔ Taxonomic assignment of the markers..')
    phenotypes = [
        label_of(filenames[sample_id]) for sample_id in selected_samples
    ]
    fasta_file = marker_dir + phenoname + '_chi2_relative.fasta'
    # Redundant markers are only removed for at most 2000 sequences.
    remove_redundants = not (
        len(FileUtility.read_fasta_sequences(fasta_file)) > 2000)

    FileUtility.ensure_dir(state_dir)
    if self.override == 1 or not DiTaxaWorkflow.exists(
            state_dir + phenoname + '.pickle'):
        start = time.time()
        Final_OBJ = NPEMarkerAnlaysis(fasta_file,
                                      matrix_path,
                                      feature_file_path,
                                      phenotypes,
                                      label_mapper,
                                      selected_samples,
                                      p_value_threshold=p_value_threshold,
                                      remove_redundants=remove_redundants,
                                      num_p=self.num_p,
                                      blastn_path=self.blastn_path)
        end = time.time()
        spent = end - start
        DiTaxaWorkflow.ensure_dir(final_dir)
        FileUtility.save_obj(state_dir + phenoname, Final_OBJ)
        message = ('Marker analysis and alignment ' + phenoname + ' ' +
                   str(spent) + ' seconds, using ' + str(self.num_p) +
                   'cores')
        print('\t✔ ' + message)
        self.log_file.append(message)
    else:
        Final_OBJ = FileUtility.load_obj(state_dir + phenoname + '.pickle')
        print('\t✔ The aligned markers already existed and are loaded!')
        self.log_file.append(
            'The aligned markers already existed and are loaded!')
    FileUtility.save_list(self.output_directory + 'logfile.txt',
                          self.log_file)

    # generating the tree
    Final_OBJ.generate_tree(final_dir, phenoname)

    # Inputs of the t-SNE plot. They are defined before both plotting
    # branches so that neither branch can reference an unbound name.
    X_addr = matrix_path
    feature_addr = feature_file_path
    markers = marker_dir + phenoname + '_finalmarker_list.txt'
    tsne_file = final_dir + phenoname + '_tsne.pdf'

    if excel == 1:
        print('\t✔ Creating marker excel file..')
        Final_OBJ.generate_excel(final_dir + phenoname + '.xlsx', phenoname)
        print('\t✔ Creating t-sne plot..')
        DiTaxaWorkflow.plot_res(tsne_file,
                                X_addr,
                                feature_addr,
                                markers,
                                y_path,
                                labels=['Negative', 'Positive'])

    if pos_label and neg_label:
        print('\t✔ Creating marker heatmap..')
        Final_OBJ.update_matrix_by_markers_N()
        Final_OBJ.generate_heatmap(final_dir + phenoname + '_heatmap',
                                   pos_label=pos_label,
                                   neg_label=neg_label)
        if not excel == 1:
            # The plot was not produced above; use the real class labels.
            print('\t✔ Creating t-sne plot..')
            DiTaxaWorkflow.plot_res(tsne_file,
                                    X_addr,
                                    feature_addr,
                                    markers,
                                    y_path,
                                    labels=[neg_label, pos_label])

    DiTaxaWorkflow.temp_cleanup()
    print(
        '\t⬛ Marker detection and analysis completed. You can find the results at '
        + self.output_directory +
        ', in partuclar at final_outputs subdirectory.')