def org_classification():
    '''
    Cross-validate a DNNMutliclass16S model on the hard-coded "org"
    restricted 6-mer dataset.

    All paths and hyper-parameters are fixed in the body; results are written
    under '../../datasets/results/org/classifier/nn' by cross_validation.
    '''
    feature_path = '../../datasets/processed_data/org/K/6-mer_org_restrictedkmer.npz'
    label_path = '../../datasets/processed_data/org/K/org_label_restrictedkmer.txt'

    # The network is fed a dense array, so the stored sparse matrix is expanded.
    features = FileUtility.load_sparse_csr(feature_path).toarray()
    labels = FileUtility.load_list(label_path)

    # NOTE(review): model_arch presumably alternates layer widths and dropout
    # rates -- confirm against DNNMutliclass16S.
    architecture = [1024, 0.2, 256, 0.1, 256, 0.1, 128, 0.1, 64]
    network = DNNMutliclass16S(features, labels, model_arch=architecture)
    network.cross_validation(
        '../../datasets/results/org/classifier/nn',
        gpu_dev='2',
        n_fold=10,
        epochs=30,
        batch_size=100,
        model_strct='mlp',
    )
def __init__(self, fasta_file, matrix_path, feature_file_path, phenotypes, phenotype_mapping, selected_samples, p_value_threshold=0.01, remove_redundants=False, num_p=4, blastn_path=''):
    '''
    Load the marker data for the selected samples and align the markers.

    :param fasta_file: fasta file whose sequence ids are read into self.seq_IDS
    :param matrix_path: path of the sparse matrix to load
    :param feature_file_path: path of the feature list to load
    :param phenotypes: stored as self.phenotypes
    :param phenotype_mapping: stored as self.phenotype_mapping
    :param selected_samples: row selector applied to the loaded matrix
    :param p_value_threshold: forwarded to self.align_markers_parallel
    :param remove_redundants: stored as self.remove_redundants
    :param num_p: stored as self.num_p
    :param blastn_path: when non-empty, appended to the PATH environment variable

    The taxonomy index is read from 'db/ez_idx_taxonomy.txt', relative to the
    current working directory.
    '''
    if blastn_path:
        # os.pathsep instead of a literal ':' keeps this correct on platforms
        # whose PATH separator differs.
        os.environ['PATH'] += os.pathsep + blastn_path

    self.num_p = num_p
    self.seq_IDS = FileUtility.read_fasta_sequences_ids(fasta_file)
    self.remove_redundants = remove_redundants

    # Each taxonomy line is "<id> <level;level;...>"; split every line once.
    self.ez_taxa_dict = {}
    for line in FileUtility.load_list('db/ez_idx_taxonomy.txt'):
        fields = line.split()
        self.ez_taxa_dict[fields[0]] = fields[1].split(';')

    # Select the sample rows on the sparse matrix itself rather than
    # materialising the whole matrix as a dense array first.
    self.mat = csr_matrix(FileUtility.load_sparse_csr(matrix_path))[selected_samples, :]
    # Canonicalise so the result matches what a dense round-trip would give
    # (no duplicate entries, no explicitly stored zeros).
    self.mat.sum_duplicates()
    self.mat.eliminate_zeros()

    self.features = FileUtility.load_list(feature_file_path)

    # Statement order is kept as in the original: alignment and redundancy
    # detection run before the phenotype attributes are assigned.
    self.align_markers_parallel(p_value_threshold)
    self.redundant_columns_indentification()
    self.phenotype_mapping = phenotype_mapping
    self.phenotypes = phenotypes
def eco_all_classification():
    '''
    Cross-validate a DNNMutliclass16S model on the hard-coded "eco_all"
    restricted 6-mer dataset (10 folds, 20 epochs, batch size 10).
    '''
    # Alternative architecture noted in the original source:
    # [1024,0.2,256,0.1,256,0.1,128,0.1,64]
    data_files = (
        '../../datasets/processed_data/eco_all_classes/6-mer_eco_restrictedmer_all.npz',
        '../../datasets/processed_data/eco_all_classes/eco_label_restrictedkmer_all.txt',
    )
    feature_path, label_path = data_files

    features = FileUtility.load_sparse_csr(feature_path).toarray()
    labels = FileUtility.load_list(label_path)

    network = DNNMutliclass16S(
        features, labels, model_arch=[1024, 0.2, 512, 0.2, 512, 0.1, 256])
    network.cross_validation(
        '../../datasets/results/eco_all/nn',
        gpu_dev='1',
        n_fold=10,
        epochs=20,
        batch_size=10,
        model_strct='mlp',
    )
def eco_all_classification_transfer_learning():
    '''
    Cross-validate on the hard-coded "eco_all" dataset starting from a
    previously pickled model.

    cross_validation is called with pretrained_model=True, trainable=False and
    model_strct pointing at the pickled eco_10000 classifier.
    '''
    # Alternative architecture noted in the original source:
    # [1024,0.2,256,0.1,256,0.1,128,0.1,64]
    feature_path = '../../datasets/processed_data/eco_all_classes/6-mer_eco_restrictedmer_all.npz'
    label_path = '../../datasets/processed_data/eco_all_classes/eco_label_restrictedkmer_all.txt'
    pretrained_path = '../../datasets/results/eco_10000/classifiers/nn_layers_mlp_1024-0.2-512-0.2-512_0.88.pickle'

    features = FileUtility.load_sparse_csr(feature_path).toarray()
    labels = FileUtility.load_list(label_path)

    network = DNNMutliclass16S(features, labels, model_arch=[512, 0.1, 256, 0.1, 128])

    run_settings = {
        'gpu_dev': '6',
        'pretrained_model': True,
        'trainable': False,
        'n_fold': 5,
        'epochs': 10,
        'batch_size': 10,
        'model_strct': pretrained_path,
    }
    network.cross_validation('../../datasets/results/eco_all/nn', **run_settings)
def crohns_disease():
    '''
    Cross-validate a DNNMutliclass16S model on the hard-coded Crohn's disease
    6-mer dataset (3 folds, 25 epochs, batch size 10).
    '''
    # Alternative architecture noted in the original source:
    # [1024,0.2,256,0.1,256,0.1,128,0.1,64]
    crohn_root = '../../datasets/processed_data/crohn/'
    del crohn_root  # paths below are kept as whole literals on purpose

    features = FileUtility.load_sparse_csr(
        '../../datasets/processed_data/crohn/sample-size/6-mers_rate_complete1359_seq_5000.npz'
    ).toarray()
    labels = FileUtility.load_list(
        '../../datasets/processed_data/crohn/data_config/labels_disease_complete1359.txt'
    )

    layers = [512, 0.2, 256, 0.2, 128, 0.1, 64, 16]
    network = DNNMutliclass16S(features, labels, model_arch=layers)
    network.cross_validation(
        '../../datasets/results/crohn/classifier/nn',
        gpu_dev='2',
        n_fold=3,
        epochs=25,
        batch_size=10,
        model_strct='mlp',
    )
def test():
    '''
    Cross-validate a DNNMutliclass16S model on the hard-coded body-sites NPE
    representation (3 folds, 300 epochs, batch size 10).
    '''
    feature_path = '../body-sites/npe_rate_5000.npz'
    label_path = '../body-sites/npe_representations_labels/labels_phen.txt'

    dense_features = FileUtility.load_sparse_csr(feature_path).toarray()
    labels = FileUtility.load_list(label_path)

    network = DNNMutliclass16S(
        dense_features,
        labels,
        model_arch=[512, 0.2, 256, 0.2, 128, 0.1, 64],
    )
    cv_options = dict(gpu_dev='2', n_fold=3, epochs=300, batch_size=10, model_strct='mlp')
    network.cross_validation('../body-sites/nn', **cv_options)
def DNN_classifier(out_dir, X_file, Y_file, arch, gpu_id, epochs, batch_size):
    '''
    Run 10-fold cross-validation of a DNN on a stored feature matrix.

    :param out_dir: first argument passed to cross_validation
    :param X_file: path of the sparse feature matrix (densified before use)
    :param Y_file: path of the label list; every label is cast to int
    :param arch: forwarded as model_arch
    :param gpu_id: forwarded as gpu_dev
    :param epochs: forwarded to cross_validation
    :param batch_size: forwarded to cross_validation
    '''
    # k-mer data, expanded to a dense array
    sparse_features = FileUtility.load_sparse_csr(X_file)
    dense_features = sparse_features.toarray()

    # integer-coded labels
    int_labels = list(map(int, FileUtility.load_list(Y_file)))

    network = DNN(dense_features, int_labels, model_arch=arch)
    network.cross_validation(
        out_dir,
        gpu_dev=gpu_id,
        n_fold=10,
        epochs=epochs,
        batch_size=batch_size,
        model_strct='mlp',
    )
def __init__(self, X_file, Y_file, features_file, path, selected_samples):
    '''
    Load the feature matrix, labels and feature names for the selected samples.

    :param X_file: path of the sparse feature matrix to load
    :param Y_file: path of the label list; every label is cast to int
    :param features_file: path of the feature-name list
    :param path: stored unchanged as self.path
    :param selected_samples: row selector applied to the loaded matrix
    '''
    # Select the sample rows on the sparse matrix itself rather than
    # materialising the whole matrix as a dense array first.
    self.X = csr_matrix(FileUtility.load_sparse_csr(X_file))[selected_samples, :]
    # Canonicalise so the result matches what a dense round-trip would give
    # (no duplicate entries, no explicitly stored zeros).
    self.X.sum_duplicates()
    self.X.eliminate_zeros()

    self.Y = [int(label) for label in FileUtility.load_list(Y_file)]
    self.features = FileUtility.load_list(features_file)
    self.path = path
def DNN_classifier(X_file, Y_file, arch, out_dir, dataset_name, gpu_id, epochs, batch_size):
    '''
    Run 10-fold cross-validation of a DNNMutliclass16S model on a stored
    feature matrix.

    :param X_file: path of the sparse feature matrix (densified before use)
    :param Y_file: path of the label list
    :param arch: forwarded as model_arch
    :param out_dir: prefix of the results location
    :param dataset_name: suffix of the results location
    :param gpu_id: forwarded as gpu_dev
    :param epochs: forwarded to cross_validation
    :param batch_size: forwarded to cross_validation

    The results location is out_dir + 'nn_classification_results_' +
    dataset_name; no path separator is inserted between the parts.
    '''
    # k-mer data
    dense_features = FileUtility.load_sparse_csr(X_file).toarray()
    # labels
    labels = FileUtility.load_list(Y_file)

    network = DNNMutliclass16S(dense_features, labels, model_arch=arch)

    results_location = out_dir + 'nn_classification_results_' + dataset_name
    network.cross_validation(
        results_location,
        gpu_dev=gpu_id,
        n_fold=10,
        epochs=epochs,
        batch_size=batch_size,
        model_strct='mlp',
    )
def load_data(self, prefix_list=None):
    '''
    Load the feature matrix, feature names and strain list for each prefix.

    For every prefix the files
    <representation_path><prefix>_feature_vect.npz,
    <representation_path><prefix>_feature_list.txt and
    <representation_path><prefix>_strains_list.txt
    are loaded into self.X, self.feature_names and self.strains, keyed by prefix.

    :param prefix_list: iterable of file prefixes; None (the default) loads nothing
    :return: None
    '''
    # The default used to be iterated directly, which raised TypeError.
    if prefix_list is None:
        return

    for save_pref in prefix_list:
        base = self.representation_path + save_pref
        vector_file = '_'.join([base, 'feature', 'vect.npz'])
        print('@@@' + vector_file)
        self.X[save_pref] = FileUtility.load_sparse_csr(vector_file)
        self.feature_names[save_pref] = FileUtility.load_list(
            '_'.join([base, 'feature', 'list.txt']))
        self.strains[save_pref] = FileUtility.load_list(
            '_'.join([base, 'strains', 'list.txt']))
def load_data(self, dir, prefix_list):
    '''
    Load the feature matrix, feature names and isolate list for each prefix.

    :param dir: string prepended to every prefix to form the file stem
    :param prefix_list: iterable of file prefixes
    :return: None; results are stored in self.X, self.feature_names and
        self.isolates, keyed by prefix
    '''
    def stem_file(stem, kind, tail):
        # Files are named "<stem>_<kind>_<tail>".
        return '_'.join([stem, kind, tail])

    for save_pref in prefix_list:
        stem = dir + save_pref
        print('@@@' + stem_file(stem, 'feature', 'vect.npz'))
        self.X[save_pref] = FileUtility.load_sparse_csr(
            stem_file(stem, 'feature', 'vect.npz'))
        self.feature_names[save_pref] = FileUtility.load_list(
            stem_file(stem, 'feature', 'list.txt'))
        self.isolates[save_pref] = FileUtility.load_list(
            stem_file(stem, 'isolates', 'list.txt'))
def classical_classifier(out_dir, X_file, Y_file, model, cores):
    '''
    Tune and evaluate a classical classifier on a stored feature matrix.

    :param out_dir: passed to tune_and_eval as the results location
    :param X_file: path of the sparse feature matrix
    :param Y_file: path of the label list; every label is cast to int
    :param model: 'RF' (random forest), 'SVM' (support vector machine) or
        'LR' (logistic regression)
    :param cores: forwarded to tune_and_eval as njobs
    :raises ValueError: if model is not one of the supported names
    '''
    supported_models = ('RF', 'SVM', 'LR')
    # Reject an unknown model before any file is read; previously the data was
    # loaded and then nothing happened, with no indication of the mistake.
    if model not in supported_models:
        raise ValueError(
            'Unsupported model %r; expected one of %s'
            % (model, ', '.join(supported_models)))

    X = FileUtility.load_sparse_csr(X_file)
    # labels
    Y = [int(y) for y in FileUtility.load_list(Y_file)]

    # Only the chosen class name is looked up, as in the original branches.
    if model == 'RF':
        classifier = RFClassifier(X, Y)
    elif model == 'SVM':
        classifier = SVM(X, Y)
    else:
        classifier = LogRegression(X, Y)

    # Results (best parameters, confusion matrix, best estimator, per-fold
    # results) are stored at out_dir.
    classifier.tune_and_eval(out_dir, njobs=cores)
def classical_classifier(X_file, Y_file, model, out_dir, dataset_name, cores):
    '''
    Tune and evaluate a random forest or SVM on a stored feature matrix.

    :param X_file: path of the sparse feature matrix
    :param Y_file: path of the label list
    :param model: 'RF' selects the random forest; any other value selects the SVM
    :param out_dir: directory prefix of the results location
    :param dataset_name: suffix of the results location
    :param cores: forwarded to tune_and_eval as n_jobs
    '''
    features = FileUtility.load_sparse_csr(X_file)
    # labels
    labels = FileUtility.load_list(Y_file)

    # Random forest when asked for explicitly, support vector machine otherwise.
    estimator_cls = RFClassifier if model == 'RF' else SVM
    estimator = estimator_cls(features, labels)

    # Best parameters, confusion matrix, best estimator and per-fold results
    # are stored at this address.
    results_address = out_dir + '/classification_results_' + dataset_name
    estimator.tune_and_eval(results_address, n_jobs=cores)
def plot_res(file_address, X_addr, features_addr, selected_addr, label_addr, labels=('Negative', 'Positive')):
    '''
    Save a two-panel t-SNE figure: all features vs. the selected markers.

    :param file_address: output path passed to plt.savefig
    :param X_addr: path of the sparse feature matrix
    :param features_addr: path of the list naming the columns of the matrix
    :param selected_addr: path of the list of selected feature names; names
        that are not in the feature list are ignored
    :param label_addr: path of the label list
    :param labels: two display names, used for the labels '0' and '1'

    Side effect: rebinds the module-level ``color_schemes`` list.
    NOTE(review): presumably DiTaxaWorkflow.plot_scatter reads that global via
    color_schemes_idx -- confirm.
    '''
    global color_schemes

    # 30-colour hex palette; it starts the second scheme and is the third one.
    hex_palette = [
        '#ff0505', '#f2a041', '#cdff05', '#04d9cb', '#45a8ff', '#8503a6',
        '#590202', '#734d02', '#4ab304', '#025359', '#0454cc', '#ff45da',
        '#993829', '#ffda45', '#1c661c', '#05cdff', '#1c2f66', '#731f57',
        '#b24a04', '#778003', '#0e3322', '#024566', '#0404d9', '#e5057d',
        '#66391c', '#31330e', '#3ee697', '#2d7da6', '#20024d', '#33011c',
    ]
    # Named colours, in the order they were keyed in the original mapping
    # (only the names were ever used, so the hex values are dropped).
    named_colors = [
        'aliceblue', 'antiquewhite', 'aqua', 'aquamarine', 'azure', 'beige',
        'bisque', 'black', 'blanchedalmond', 'blue', 'blueviolet', 'brown',
        'burlywood', 'cadetblue', 'chartreuse', 'chocolate', 'coral',
        'cornflowerblue', 'cornsilk', 'crimson', 'cyan', 'darkblue',
        'darkcyan', 'darkgoldenrod', 'darkgray', 'darkgreen', 'darkkhaki',
        'darkmagenta', 'darkolivegreen', 'darkorange', 'darkorchid',
        'darkred', 'darksalmon', 'darkseagreen', 'darkslateblue',
        'darkslategray', 'darkturquoise', 'darkviolet', 'deeppink',
        'deepskyblue', 'dimgray', 'dodgerblue', 'firebrick', 'floralwhite',
        'forestgreen', 'fuchsia', 'gainsboro', 'ghostwhite', 'gold',
        'goldenrod', 'gray', 'green', 'greenyellow', 'honeydew', 'hotpink',
        'indianred', 'indigo', 'ivory', 'khaki', 'lavender', 'lavenderblush',
        'lawngreen', 'lemonchiffon', 'lightblue', 'lightcoral', 'lightcyan',
        'lightgoldenrodyellow', 'lightgreen', 'lightgray', 'lightpink',
        'lightsalmon', 'lightseagreen', 'lightskyblue', 'lightslategray',
        'lightsteelblue', 'lightyellow', 'lime', 'limegreen', 'linen',
        'magenta', 'maroon', 'mediumaquamarine', 'mediumblue',
        'mediumorchid', 'mediumpurple', 'mediumseagreen', 'mediumslateblue',
        'mediumspringgreen', 'mediumturquoise', 'mediumvioletred',
        'midnightblue', 'mintcream', 'mistyrose', 'moccasin', 'navajowhite',
        'navy', 'oldlace', 'olive', 'olivedrab', 'orange', 'orangered',
        'orchid', 'palegoldenrod', 'palegreen', 'paleturquoise',
        'palevioletred', 'papayawhip', 'peachpuff', 'peru', 'pink', 'plum',
        'powderblue', 'purple', 'red', 'rosybrown', 'royalblue',
        'saddlebrown', 'salmon', 'sandybrown', 'seagreen', 'seashell',
        'sienna', 'silver', 'skyblue', 'slateblue', 'slategray', 'snow',
        'springgreen', 'steelblue', 'tan', 'teal', 'thistle', 'tomato',
        'turquoise', 'violet', 'wheat', 'white', 'whitesmoke', 'yellow',
        'yellowgreen',
    ]
    color_schemes = [
        ['green', 'blue', 'red', 'gold', 'cyan'],
        hex_palette + named_colors,
        list(hex_palette),
    ]

    X = FileUtility.load_sparse_csr(X_addr)
    features = FileUtility.load_list(features_addr)
    features_selected = FileUtility.load_list(selected_addr)

    # Map each feature name to its first column once, instead of scanning the
    # feature list twice per selected feature.
    first_column = {}
    for column, feature in enumerate(features):
        first_column.setdefault(feature, column)
    idx = [first_column[x] for x in features_selected if x in first_column]
    X_selected = X[:, idx]

    Y = FileUtility.load_list(label_addr)

    # Embedding order is kept as in the original (full matrix first).
    X_tsne = DiTaxaWorkflow.get_tsne(X)
    X_red_tsne = DiTaxaWorkflow.get_tsne(X_selected)

    f = plt.figure(figsize=(16, 8))
    ax1 = f.add_subplot(121)
    ax2 = f.add_subplot(122)

    panels = (
        (ax1, X_tsne, '(i) t-SNE over NPE representations'),
        (ax2, X_red_tsne, '(ii) t-SNE over selected markers'),
    )
    for axis, embedding, title in panels:
        # A fresh label_dict is built per call, as the original did.
        DiTaxaWorkflow.plot_scatter(
            axis, embedding, Y, 't-SNE 1', 't-SNE 0', title,
            legend_hide=False,
            legend_loc=9,
            legend_size=10,
            label_dict={'0': labels[0], '1': labels[1]},
            color_schemes_idx=0)

    plt.savefig(file_address)
    plt.close()