def run_main(args):
    ################################################# START SECTION OF LOADING PARAMETERS #################################################
    # Read parameters
    epochs = args.epochs
    dim_au_out = args.bottleneck  # 8, 16, 32, 64, 128, 256, 512
    freeze = args.freeze_pretrain
    source_model_path = args.source_model_path
    target_model_path = args.target_model_path
    log_path = args.logging_file
    encoder_hdims = args.source_h_dims.split(",")
    encoder_hdims = list(map(int, encoder_hdims))
    pretrain = args.pretrain
    prediction = args.predition
    reduce_model = args.dimreduce
    predict_hdims = args.p_h_dims.split(",")
    predict_hdims = list(map(int, predict_hdims))
    load_model = True

    # Misc
    now = time.strftime("%Y-%m-%d-%H-%M-%S")
    # Initialize logging and std out
    out_path = log_path + now + ".err"
    log_path = log_path + now + ".log"
    out = open(out_path, "w")
    sys.stderr = out

    # Logging information
    logging.basicConfig(level=logging.INFO,
                        filename=log_path,
                        filemode='a',
                        format='%(asctime)s - %(pathname)s[line:%(lineno)d] - %(levelname)s: %(message)s')
    logging.getLogger('matplotlib.font_manager').disabled = True
    logging.info(args)

    # Save arguments
    args_df = ut.save_arguments(args, now)
    ################################################# END SECTION OF LOADING PARAMETERS #################################################

    # Select the GPU device
    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
    # Assuming that we are on a CUDA machine, this should print a CUDA device:
    logging.info(device)
    torch.cuda.set_device(device)
    ################################################# END SECTION OF LOADING BULK DATA #################################################

    ################################################# START SECTION OF MODEL CONSTRUCTION #################################################
    # Construct target encoder
    if reduce_model == "AE":
        encoder = AEBase(input_dim=data.shape[1], latent_dim=dim_au_out, h_dims=encoder_hdims)
        loss_function_e = nn.MSELoss()
    elif reduce_model == "VAE":
        encoder = VAEBase(input_dim=data.shape[1], latent_dim=dim_au_out, h_dims=encoder_hdims)
    # elif reduce_model == "CVAE":
    #     # The number of conditions equals the number of clusters
    #     encoder = CVAEBase(input_dim=data.shape[1], n_conditions=len(set(data_c)),
    #                        latent_dim=dim_au_out, h_dims=encoder_hdims)

    if torch.cuda.is_available():
        encoder.cuda()

    logging.info("Target encoder structure is: ")
    logging.info(encoder)

    encoder.to(device)
    optimizer_e = optim.Adam(encoder.parameters(), lr=1e-2)
    loss_function_e = nn.MSELoss()
    exp_lr_scheduler_e = lr_scheduler.ReduceLROnPlateau(optimizer_e)

    # Load the source model before transfer
    if prediction == "regression":
        dim_model_out = 1
    else:
        dim_model_out = 2

    # Load AE model
    if reduce_model == "AE":
        source_model = PretrainedPredictor(input_dim=Xsource_train.shape[1], latent_dim=dim_au_out,
                                           h_dims=encoder_hdims, hidden_dims_predictor=predict_hdims,
                                           output_dim=dim_model_out, pretrained_weights=None, freezed=freeze)
        source_model.load_state_dict(torch.load(source_model_path))
        source_encoder = source_model
    # Load VAE model
    elif reduce_model in ["VAE", "CVAE"]:
        source_model = PretrainedVAEPredictor(input_dim=Xsource_train.shape[1], latent_dim=dim_au_out,
                                              h_dims=encoder_hdims, hidden_dims_predictor=predict_hdims,
                                              output_dim=dim_model_out, pretrained_weights=None, freezed=freeze,
                                              z_reparam=bool(args.VAErepram))
        source_model.load_state_dict(torch.load(source_model_path))
        source_encoder = source_model
    logging.info("Load pretrained source model from: " + source_model_path)

    source_encoder.to(device)
    ################################################# END SECTION OF MODEL CONSTRUCTION #################################################

    ################################################# START SECTION OF SC MODEL PRETRAINING #################################################
    # Pretrain the target encoder
    # Pretrain using the autoencoder if pretrain is not False
    if str(pretrain) != '0':
        # Re-train the target encoder only if no stored weights exist on disk
        train_flag = True
        pretrain = str(pretrain)
        try:
            encoder.load_state_dict(torch.load(pretrain))
            logging.info("Load pretrained target encoder from " + pretrain)
            train_flag = False
        except:
            logging.warning("Loading failed, proceed to re-train model")
        logging.info("Pretraining finished")
    ################################################# END SECTION OF SC MODEL PRETRAINING #################################################

    ################################################# START SECTION OF TRANSFER LEARNING TRAINING #################################################
    # Use ADDA transfer learning
    if args.transfer == 'ADDA':
        # Set discriminator model
        discriminator = Predictor(input_dim=dim_au_out, output_dim=2)
        discriminator.to(device)
        loss_d = nn.CrossEntropyLoss()
        optimizer_d = optim.Adam(encoder.parameters(), lr=1e-2)
        exp_lr_scheduler_d = lr_scheduler.ReduceLROnPlateau(optimizer_d)

        # Adversarial training
        discriminator, encoder, report_, report2_ = t.train_ADDA_model(
            source_encoder, encoder, discriminator,
            dataloaders_source, dataloaders_pretrain,
            loss_d, loss_d,
            # Should these all be optimizer_d?
            optimizer_d, optimizer_d,
            exp_lr_scheduler_d, exp_lr_scheduler_d,
            epochs, device,
            target_model_path)
        logging.info("Transfer ADDA finished")

    # DaNN model
    elif args.transfer == 'DaNN':
        # Set predictor loss
        loss_d = nn.CrossEntropyLoss()
        optimizer_d = optim.Adam(encoder.parameters(), lr=1e-2)
        exp_lr_scheduler_d = lr_scheduler.ReduceLROnPlateau(optimizer_d)

        # Set DaNN model
        DaNN_model = DaNN(source_model=source_encoder, target_model=encoder)
        DaNN_model.to(device)

        def loss(x, y, GAMMA=args.GAMMA_mmd):
            result = mmd.mmd_loss(x, y, GAMMA)
            return result

        loss_distribution = loss

        # Train DaNN model
        DaNN_model, report_ = t.train_DaNN_model(
            DaNN_model,
            dataloaders_source, dataloaders_pretrain,
            # Should this also be optimizer_d?
            optimizer_d, loss_d,
            epochs, exp_lr_scheduler_d,
            dist_loss=loss_distribution,
            load=load_model,
            weight=args.mmd_weight,
            save_path=target_model_path + "_DaNN.pkl")

        encoder = DaNN_model.target_model
        source_model = DaNN_model.source_model
        logging.info("Transfer DaNN finished")
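
# The DaNN branch above penalizes the distance between source and target
# embeddings with mmd.mmd_loss(x, y, GAMMA). For reference, the function below
# is a minimal sketch of a Gaussian-kernel MMD with the same call signature.
# It is illustrative only: the kernel form and the bandwidth convention for
# GAMMA are assumptions, since this excerpt does not show the mmd module.
import torch


def mmd_loss_sketch(x, y, GAMMA=1e3):
    """Biased estimate of squared MMD between two batches x and y (2-D tensors)."""

    def rbf_kernel(a, b):
        # k(a, b) = exp(-||a - b||^2 / GAMMA); this bandwidth convention is an
        # assumption, not taken from the repository's mmd module.
        sq_dist = torch.cdist(a, b) ** 2
        return torch.exp(-sq_dist / GAMMA)

    return (rbf_kernel(x, x).mean() + rbf_kernel(y, y).mean()
            - 2 * rbf_kernel(x, y).mean())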
def run_main(args):
    # Define parameters
    epochs = args.epochs
    dim_au_out = args.bottleneck  # 8, 16, 32, 64, 128, 256, 512
    dim_dnn_in = dim_au_out
    dim_dnn_out = 1
    select_drug = args.drug
    na = args.missing_value
    data_path = args.data_path
    label_path = args.label_path
    test_size = args.test_size
    valid_size = args.valid_size
    g_disperson = args.var_genes_disp
    model_path = args.model_store_path
    pretrain_path = args.pretrain_path
    log_path = args.logging_file
    batch_size = args.batch_size
    encoder_hdims = args.ft_h_dims.split(",")
    preditor_hdims = args.p_h_dims.split(",")
    reduce_model = args.dimreduce
    prediction = args.predition
    encoder_hdims = list(map(int, encoder_hdims))
    preditor_hdims = list(map(int, preditor_hdims))

    # Read data
    data_r = pd.read_csv(data_path, index_col=0)
    label_r = pd.read_csv(label_path, index_col=0)
    label_r = label_r.fillna(na)

    now = time.strftime("%Y-%m-%d-%H-%M-%S")
    log_path = log_path + now + ".txt"
    log = open(log_path, "w")
    sys.stdout = log
    print(args)

    # Filter out na values
    selected_idx = label_r.loc[:, select_drug] != na

    if g_disperson is not None:
        hvg, adata = ut.highly_variable_genes(data_r, min_disp=g_disperson)
        # Rename columns in case duplications exist
        data_r.columns = adata.var_names
        # Extract the HVGs
        data = data_r.loc[selected_idx, hvg]
    else:
        data = data_r.loc[selected_idx, :]

    # Extract labels
    label = label_r.loc[selected_idx, select_drug]

    # Scale data
    mmscaler = preprocessing.MinMaxScaler()
    lbscaler = preprocessing.MinMaxScaler()

    data = mmscaler.fit_transform(data)
    label = label.values.reshape(-1, 1)

    if prediction == "regression":
        label = lbscaler.fit_transform(label)
        dim_model_out = 1
    else:
        le = LabelEncoder()
        label = le.fit_transform(label)
        dim_model_out = 2
    # label = label.values.reshape(-1,1)

    print(np.std(data))
    print(np.mean(data))

    # Split into train, valid, and test sets
    X_train_all, X_test, Y_train_all, Y_test = train_test_split(data, label, test_size=test_size, random_state=42)
    X_train, X_valid, Y_train, Y_valid = train_test_split(X_train_all, Y_train_all, test_size=valid_size, random_state=42)

    # Up-sampling (fit_sample was renamed fit_resample in newer imbalanced-learn releases)
    X_train, Y_train = SMOTE().fit_sample(X_train, Y_train)

    print(data.shape)
    print(label.shape)
    print(X_train.shape, Y_train.shape)
    print(X_test.shape, Y_test.shape)
    print(X_train.max())
    print(X_train.min())

    # Select the training device
    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
    # Assuming that we are on a CUDA machine, this should print a CUDA device:
    print(device)
    torch.cuda.set_device(device)

    # Construct datasets and data loaders
    X_trainTensor = torch.FloatTensor(X_train).to(device)
    X_validTensor = torch.FloatTensor(X_valid).to(device)
    X_testTensor = torch.FloatTensor(X_test).to(device)
    X_allTensor = torch.FloatTensor(data).to(device)

    if prediction == "regression":
        Y_trainTensor = torch.FloatTensor(Y_train).to(device)
        Y_validTensor = torch.FloatTensor(Y_valid).to(device)
    else:
        Y_trainTensor = torch.LongTensor(Y_train).to(device)
        Y_validTensor = torch.LongTensor(Y_valid).to(device)

    train_dataset = TensorDataset(X_trainTensor, X_trainTensor)
    valid_dataset = TensorDataset(X_validTensor, X_validTensor)
    test_dataset = TensorDataset(X_testTensor, X_testTensor)
    all_dataset = TensorDataset(X_allTensor, X_allTensor)

    X_trainDataLoader = DataLoader(dataset=train_dataset, batch_size=batch_size, shuffle=True)
    X_validDataLoader = DataLoader(dataset=valid_dataset, batch_size=batch_size, shuffle=True)
    X_allDataLoader = DataLoader(dataset=all_dataset, batch_size=batch_size, shuffle=True)

    # Construct TensorDatasets for the predictor
    trainreducedDataset = TensorDataset(X_trainTensor, Y_trainTensor)
    validreducedDataset = TensorDataset(X_validTensor, Y_validTensor)

    trainDataLoader_p = DataLoader(dataset=trainreducedDataset, batch_size=batch_size, shuffle=True)
    validDataLoader_p = DataLoader(dataset=validreducedDataset, batch_size=batch_size, shuffle=True)

    dataloaders_train = {'train': trainDataLoader_p, 'val': validDataLoader_p}

    if bool(args.pretrain):
        dataloaders_pretrain = {'train': X_trainDataLoader, 'val': X_validDataLoader}

        if reduce_model == "AE":
            encoder = AEBase(input_dim=data.shape[1], latent_dim=dim_au_out, h_dims=encoder_hdims)
        elif reduce_model == "VAE":
            encoder = VAEBase(input_dim=data.shape[1], latent_dim=dim_au_out, h_dims=encoder_hdims)
        # model = VAE(dim_au_in=data_r.shape[1], dim_au_out=128)

        if torch.cuda.is_available():
            encoder.cuda()

        print(encoder)
        encoder.to(device)

        optimizer_e = optim.Adam(encoder.parameters(), lr=1e-2)
        loss_function_e = nn.MSELoss()
        exp_lr_scheduler_e = lr_scheduler.ReduceLROnPlateau(optimizer_e)

        if reduce_model == "AE":
            encoder, loss_report_en = ut.train_extractor_model(
                net=encoder, data_loaders=dataloaders_pretrain, optimizer=optimizer_e,
                loss_function=loss_function_e, n_epochs=epochs,
                scheduler=exp_lr_scheduler_e, save_path=pretrain_path)
        elif reduce_model == "VAE":
            encoder, loss_report_en = ut.train_VAE_model(
                net=encoder, data_loaders=dataloaders_pretrain, optimizer=optimizer_e,
                n_epochs=epochs, scheduler=exp_lr_scheduler_e, save_path=pretrain_path)
        print("Pretraining finished")

    # Train the predictor model
    if reduce_model == "AE":
        model = PretrainedPredictor(input_dim=X_train.shape[1], latent_dim=dim_au_out, h_dims=encoder_hdims,
                                    hidden_dims_predictor=preditor_hdims, output_dim=dim_model_out,
                                    pretrained_weights=pretrain_path, freezed=bool(args.freeze_pretrain))
    elif reduce_model == "VAE":
        model = PretrainedVAEPredictor(input_dim=X_train.shape[1], latent_dim=dim_au_out, h_dims=encoder_hdims,
                                       hidden_dims_predictor=preditor_hdims, output_dim=dim_model_out,
                                       pretrained_weights=pretrain_path, freezed=bool(args.freeze_pretrain))
    print(model)

    if torch.cuda.is_available():
        model.cuda()
    model.to(device)

    # Define optimizer
    optimizer = optim.Adam(model.parameters(), lr=1e-2)

    if prediction == "regression":
        loss_function = nn.MSELoss()
    else:
        loss_function = nn.CrossEntropyLoss()

    exp_lr_scheduler = lr_scheduler.ReduceLROnPlateau(optimizer)

    preditor_path = model_path + reduce_model + select_drug + '.pkl'
    load_model = os.path.exists(preditor_path)

    model, report = ut.train_predictor_model(model, dataloaders_train, optimizer, loss_function,
                                             epochs, exp_lr_scheduler, load=load_model, save_path=preditor_path)

    dl_result = model(X_testTensor).detach().cpu().numpy()
    # torch.save(model.feature_extractor.state_dict(), preditor_path + "encoder.pkl")

    print('Performances: R/Pearson/Mse/')
    if prediction == "regression":
        print(r2_score(dl_result, Y_test))
        print(pearsonr(dl_result.flatten(), Y_test.flatten()))
        print(mean_squared_error(dl_result, Y_test))
    else:
        lb_results = np.argmax(dl_result, axis=1)
        pb_results = np.max(dl_result, axis=1)
        print(classification_report(Y_test, lb_results))
        print(average_precision_score(Y_test, pb_results))
        print(roc_auc_score(Y_test, pb_results))
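
# How a script like this is typically driven: a minimal argparse harness whose
# flag names mirror the attributes read above (args.data_path, args.drug, and
# so on; argparse maps --foo_bar to args.foo_bar). This is a sketch only; the
# default values below are placeholders, not the project's real defaults.
if __name__ == "__main__":
    import argparse

    parser = argparse.ArgumentParser()
    parser.add_argument('--epochs', type=int, default=500)              # placeholder
    parser.add_argument('--bottleneck', type=int, default=32)           # placeholder
    parser.add_argument('--drug', type=str, default='Cisplatin')        # placeholder drug name
    parser.add_argument('--missing_value', default=1)                   # placeholder
    parser.add_argument('--data_path', type=str, default='data/bulk_expression.csv')  # placeholder
    parser.add_argument('--label_path', type=str, default='data/bulk_labels.csv')     # placeholder
    parser.add_argument('--test_size', type=float, default=0.2)
    parser.add_argument('--valid_size', type=float, default=0.2)
    parser.add_argument('--var_genes_disp', type=float, default=None)
    parser.add_argument('--model_store_path', type=str, default='saved/models/')
    parser.add_argument('--pretrain_path', type=str, default='saved/models/pretrain.pkl')
    parser.add_argument('--logging_file', type=str, default='saved/logs/log')
    parser.add_argument('--batch_size', type=int, default=200)
    parser.add_argument('--ft_h_dims', type=str, default='512,256')
    parser.add_argument('--p_h_dims', type=str, default='256,128')
    parser.add_argument('--dimreduce', type=str, default='AE')
    # The flag spelling matches the attribute read above (args.predition)
    parser.add_argument('--predition', type=str, default='classification')
    parser.add_argument('--pretrain', type=int, default=1)
    parser.add_argument('--freeze_pretrain', type=int, default=0)
    args = parser.parse_args()
    run_main(args)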
def run_main(args):
    epochs = args.epochs
    dim_au_out = args.bottleneck  # 8, 16, 32, 64, 128, 256, 512
    na = args.missing_value
    data_path = args.target_data
    test_size = args.test_size
    valid_size = args.valid_size
    g_disperson = args.var_genes_disp
    min_n_genes = args.min_n_genes
    max_n_genes = args.max_n_genes
    source_model_path = args.source_model_path
    target_model_path = args.target_model_path
    log_path = args.logging_file
    batch_size = args.batch_size
    encoder_hdims = args.source_h_dims.split(",")
    encoder_hdims = list(map(int, encoder_hdims))
    source_data_path = args.source_data
    pretrain = args.pretrain
    model = args.model

    # Misc
    now = time.strftime("%Y-%m-%d-%H-%M-%S")
    log_path = log_path + now + ".txt"
    export_name = data_path.replace("/", "")

    # If the target model file does not exist, derive a file name
    if not os.path.exists(target_model_path):
        target_model_path = target_model_path + "/pretrain_" + export_name + "_" + now + ".pkl"

    log = open(log_path, "w")
    sys.stdout = log

    # Load data and preprocessing
    adata = pp.read_sc_file(data_path)
    sc.pp.filter_cells(adata, min_genes=200)
    sc.pp.filter_genes(adata, min_cells=3)
    adata = pp.cal_ncount_ngenes(adata)

    sc.pl.violin(adata, ['n_genes_by_counts', 'total_counts', 'pct_counts_mt'],
                 jitter=0.4, multi_panel=True, save=export_name)
    sc.pl.scatter(adata, x='total_counts', y='pct_counts_mt')
    sc.pl.scatter(adata, x='total_counts', y='n_genes_by_counts')

    # Preprocess data by filtering
    adata = pp.receipe_my(adata, l_n_genes=min_n_genes, r_n_genes=max_n_genes,
                          filter_mincells=args.min_c, filter_mingenes=args.min_g,
                          normalize=True, log=True)

    # Select highly variable genes
    sc.pp.highly_variable_genes(adata, min_disp=g_disperson, max_disp=np.inf, max_mean=6)
    sc.pl.highly_variable_genes(adata, save=export_name)

    adata.raw = adata
    adata = adata[:, adata.var.highly_variable]

    # Prepare to normalize and split target data
    data = adata.X
    mmscaler = preprocessing.MinMaxScaler()
    data = mmscaler.fit_transform(data)

    # Split data into train and valid sets
    Xtarget_train, Xtarget_valid = train_test_split(data, test_size=valid_size, random_state=42)
    print(Xtarget_train.shape, Xtarget_valid.shape)

    # Select the GPU device
    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
    # Assuming that we are on a CUDA machine, this should print a CUDA device:
    print(device)
    torch.cuda.set_device(device)

    # Construct datasets and data loaders
    Xtarget_trainTensor = torch.FloatTensor(Xtarget_train).to(device)
    Xtarget_validTensor = torch.FloatTensor(Xtarget_valid).to(device)
    X_allTensor = torch.FloatTensor(data).to(device)

    train_dataset = TensorDataset(Xtarget_trainTensor, Xtarget_trainTensor)
    valid_dataset = TensorDataset(Xtarget_validTensor, Xtarget_validTensor)
    all_dataset = TensorDataset(X_allTensor, X_allTensor)

    Xtarget_trainDataLoader = DataLoader(dataset=train_dataset, batch_size=batch_size, shuffle=True)
    Xtarget_validDataLoader = DataLoader(dataset=valid_dataset, batch_size=batch_size, shuffle=True)

    dataloaders_pretrain = {'train': Xtarget_trainDataLoader, 'val': Xtarget_validDataLoader}

    # Construct target encoder
    if model == "AE":
        encoder = AEBase(input_dim=data.shape[1], latent_dim=dim_au_out, h_dims=encoder_hdims)
        loss_function_e = nn.MSELoss()
    elif model == "VAE":
        encoder = VAEBase(input_dim=data.shape[1], latent_dim=dim_au_out, h_dims=encoder_hdims)
        # loss_function_e = encoder.loss_function()

    if torch.cuda.is_available():
        encoder.cuda()

    encoder.to(device)
    optimizer_e = optim.Adam(encoder.parameters(), lr=1e-2)
    exp_lr_scheduler_e = lr_scheduler.ReduceLROnPlateau(optimizer_e)

    # Pretrain target encoder
    if model == "AE":
        pretrain = str(pretrain)
        encoder, loss_report_en = ut.train_extractor_model(
            net=encoder, data_loaders=dataloaders_pretrain, optimizer=optimizer_e,
            loss_function=loss_function_e, n_epochs=epochs,
            scheduler=exp_lr_scheduler_e, save_path=pretrain)
        print("Pretraining finished")

        # Extract features
        embeddings = encoder.encode(X_allTensor).detach().cpu().numpy()
    elif model == "VAE":
        pretrain = str(pretrain)
        encoder, loss_report_en = ut.train_VAE_model(
            net=encoder, data_loaders=dataloaders_pretrain, optimizer=optimizer_e,
            n_epochs=epochs, scheduler=exp_lr_scheduler_e, save_path=pretrain)
        print("Pretraining finished")

        # Extract features; encode_ returns several tensors, the first is used as the embedding
        results = encoder.encode_(X_allTensor)
        embeddings = results[0].detach().cpu().numpy()

    # PCA
    sc.tl.pca(adata, svd_solver='arpack')

    # Add embeddings to the adata object
    adata.obsm["X_AE"] = embeddings

    # Generate neighbor graph
    sc.pp.neighbors(adata, n_neighbors=10, use_rep="X_AE")
    # sc.tl.umap(adata)

    # Use t-SNE
    sc.tl.tsne(adata, use_rep="X_AE")

    # Leiden clustering on the data
    sc.tl.leiden(adata)

    # Plot t-SNE
    sc.pl.tsne(adata, save=export_name, color=["leiden"])

    # Differential expression genes
    sc.tl.rank_genes_groups(adata, 'leiden', method='wilcoxon')
    sc.pl.rank_genes_groups(adata, n_genes=25, sharey=False, save=export_name)

    # Save adata
    adata.write("saved/results" + export_name + now + ".h5ad")
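
# A small optional helper related to the pretraining above: report the
# autoencoder's reconstruction error on a tensor. It assumes that calling the
# AEBase model on a batch returns the reconstructed input, which the MSE-based
# training loop above suggests but this excerpt does not prove.
import torch
import torch.nn as nn


def reconstruction_mse(autoencoder, X):
    """Mean squared reconstruction error of an autoencoder on tensor X."""
    autoencoder.eval()
    with torch.no_grad():
        X_hat = autoencoder(X)  # assumed to return the reconstruction
        return nn.functional.mse_loss(X_hat, X).item()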
def run_main(args):
    # Define parameters
    epochs = args.epochs
    dim_au_out = args.bottleneck  # 8, 16, 32, 64, 128, 256, 512
    dim_dnn_in = dim_au_out
    dim_dnn_out = 1
    na = args.missing_value
    data_path = args.data_path
    test_size = args.test_size
    valid_size = args.valid_size
    g_disperson = args.var_genes_disp
    min_n_genes = args.min_n_genes
    max_n_genes = args.max_n_genes
    model_path = args.model_store_path
    pretrain_path = args.pretrain_path
    log_path = args.logging_file
    batch_size = args.batch_size
    encoder_hdims = args.ft_h_dims.split(",")
    encoder_hdims = list(map(int, encoder_hdims))

    print(args)

    # Misc
    now = time.strftime("%Y-%m-%d-%H-%M-%S")
    export_name = data_path.replace("/", "")
    pretrain_path = "saved/models/ae_" + export_name + now + ".pkl"
    log_path = log_path + now + ".txt"
    log = open(log_path, "w")
    sys.stdout = log

    # Read data
    adata = sc.read_10x_mtx(
        'data/GSE108394/GSM2897334/',  # the directory with the `.mtx` file
        var_names='gene_symbols',      # use gene symbols for the variable names (variables-axis index)
        cache=True)                    # write a cache file for faster subsequent reading

    adata = pp.receipe_my(adata, l_n_genes=min_n_genes, r_n_genes=max_n_genes,
                          filter_mincells=args.min_c, filter_mingenes=args.min_g,
                          normalize=True, log=True)

    # Save QC metrics
    sc.pl.violin(adata, ['n_counts', "percent_mito", 'percent_rps', 'percent_rpl'],
                 jitter=0.4, multi_panel=True, save=export_name)

    # HVG
    sc.pp.highly_variable_genes(adata, min_disp=g_disperson, max_disp=np.inf)
    sc.pl.highly_variable_genes(adata, save=export_name)

    # Extract data
    adata.raw = adata
    adata = adata[:, adata.var.highly_variable]
    data = adata.X

    # Scale and split data
    mmscaler = preprocessing.MinMaxScaler()
    data = mmscaler.fit_transform(data.todense())

    X_train, X_valid = train_test_split(data, test_size=valid_size, random_state=42)
    print(X_train.shape, X_valid.shape)

    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
    # Assuming that we are on a CUDA machine, this should print a CUDA device:
    print(device)
    torch.cuda.set_device(device)

    # Construct datasets and data loaders
    X_trainTensor = torch.FloatTensor(X_train).to(device)
    X_validTensor = torch.FloatTensor(X_valid).to(device)
    X_allTensor = torch.FloatTensor(data).to(device)

    train_dataset = TensorDataset(X_trainTensor, X_trainTensor)
    valid_dataset = TensorDataset(X_validTensor, X_validTensor)
    all_dataset = TensorDataset(X_allTensor, X_allTensor)

    X_trainDataLoader = DataLoader(dataset=train_dataset, batch_size=batch_size, shuffle=True)
    X_validDataLoader = DataLoader(dataset=valid_dataset, batch_size=batch_size, shuffle=True)

    dataloaders_pretrain = {'train': X_trainDataLoader, 'val': X_validDataLoader}

    # Train models
    encoder = AEBase(input_dim=data.shape[1], latent_dim=dim_au_out, h_dims=encoder_hdims)
    # model = VAE(dim_au_in=data_r.shape[1], dim_au_out=128)
    if torch.cuda.is_available():
        encoder.cuda()
    print(encoder)
    encoder.to(device)

    optimizer_e = optim.Adam(encoder.parameters(), lr=1e-2)
    loss_function_e = nn.MSELoss()
    exp_lr_scheduler_e = lr_scheduler.ReduceLROnPlateau(optimizer_e)

    encoder, loss_report_en = ut.train_extractor_model(
        net=encoder, data_loaders=dataloaders_pretrain,
        optimizer=optimizer_e, loss_function=loss_function_e,
        n_epochs=epochs, scheduler=exp_lr_scheduler_e, save_path=pretrain_path)
    print("Pretraining finished")

    # Extract embeddings
    embeddings = encoder.encode(X_allTensor).detach().cpu().numpy()

    # Process embeddings
    sc.tl.pca(adata, svd_solver='arpack')
    adata.obsm["X_AE"] = embeddings

    # Visualize embeddings
    sc.tl.tsne(adata, use_rep="X_AE")

    # Clustering
    sc.pp.neighbors(adata, n_neighbors=10, use_rep="X_AE")
    sc.tl.leiden(adata)

    # Plot t-SNE
    sc.pl.tsne(adata, save=export_name, color=["leiden"])

    # Differential expression genes
    sc.tl.rank_genes_groups(adata, 'leiden', method='wilcoxon')
    sc.pl.rank_genes_groups(adata, n_genes=25, sharey=False, save=export_name)
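
# The Wilcoxon ranking above stores its results in adata.uns['rank_genes_groups'].
# A short helper for exporting the ranked marker table to CSV using scanpy's
# built-in getter sc.get.rank_genes_groups_df (available in scanpy >= 1.5);
# the helper itself and its default output path are illustrative placeholders.
import scanpy as sc


def export_ranked_genes(adata, group, out_csv="saved/results/markers.csv"):
    """Dump ranked marker genes (names, scores, p-values) for one cluster."""
    df = sc.get.rank_genes_groups_df(adata, group=group)
    df.to_csv(out_csv, index=False)
    return df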
def run_main(args):
    ################################################# START SECTION OF LOADING PARAMETERS #################################################
    # Read parameters
    epochs = args.epochs
    dim_au_out = args.bottleneck  # 8, 16, 32, 64, 128, 256, 512
    na = args.missing_value
    data_path = DATA_MAP[args.target_data]
    test_size = args.test_size
    select_drug = args.drug
    freeze = args.freeze_pretrain
    valid_size = args.valid_size
    g_disperson = args.var_genes_disp
    min_n_genes = args.min_n_genes
    max_n_genes = args.max_n_genes
    source_model_path = args.source_model_path
    target_model_path = args.target_model_path
    log_path = args.logging_file
    batch_size = args.batch_size
    encoder_hdims = args.source_h_dims.split(",")
    encoder_hdims = list(map(int, encoder_hdims))
    source_data_path = args.source_data
    pretrain = args.pretrain
    prediction = args.predition
    data_name = args.target_data
    label_path = args.label_path
    reduce_model = args.dimreduce
    predict_hdims = args.p_h_dims.split(",")
    predict_hdims = list(map(int, predict_hdims))
    leiden_res = args.cluster_res
    load_model = bool(args.load_target_model)

    # Misc
    now = time.strftime("%Y-%m-%d-%H-%M-%S")
    # Initialize logging and std out
    out_path = log_path + now + ".err"
    log_path = log_path + now + ".log"
    out = open(out_path, "w")
    sys.stderr = out

    # Logging information
    logging.basicConfig(
        level=logging.INFO,
        filename=log_path,
        filemode='a',
        format='%(asctime)s - %(pathname)s[line:%(lineno)d] - %(levelname)s: %(message)s')
    logging.getLogger('matplotlib.font_manager').disabled = True
    logging.info(args)

    # Save arguments
    args_df = ut.save_arguments(args, now)
    ################################################# END SECTION OF LOADING PARAMETERS #################################################

    ################################################# START SECTION OF SINGLE CELL DATA REPROCESSING #################################################
    # Load data and preprocessing
    adata = pp.read_sc_file(data_path)

    # Dataset-specific preprocessing (behavior identical to the original per-dataset branches)
    if data_name == 'GSE117872':
        adata = ut.specific_process(adata, dataname=data_name, select_origin=args.batch_id)
    elif data_name in ['GSE122843', 'GSE110894', 'GSE112274', 'GSE116237',
                       'GSE108383', 'GSE140440', 'GSE129730', 'GSE149383']:
        adata = ut.specific_process(adata, dataname=data_name)

    sc.pp.filter_cells(adata, min_genes=200)
    sc.pp.filter_genes(adata, min_cells=3)
    adata = pp.cal_ncount_ngenes(adata)

    # Show statistics after QC
    sc.pl.violin(adata, ['n_genes_by_counts', 'total_counts', 'pct_counts_mt-'],
                 jitter=0.4, multi_panel=True, save=data_name, show=False)
    sc.pl.scatter(adata, x='total_counts', y='pct_counts_mt-', show=False)
    sc.pl.scatter(adata, x='total_counts', y='n_genes_by_counts', show=False)

    if args.remove_genes == 0:
        r_genes = []
    else:
        r_genes = REMOVE_GENES

    # Preprocess data by filtering
    if data_name not in ['GSE112274', 'GSE140440']:
        adata = pp.receipe_my(adata, l_n_genes=min_n_genes, r_n_genes=max_n_genes,
                              filter_mincells=args.min_c, filter_mingenes=args.min_g,
                              normalize=True, log=True, remove_genes=r_genes)
    else:
        adata = pp.receipe_my(adata, l_n_genes=min_n_genes, r_n_genes=max_n_genes,
                              filter_mincells=args.min_c, percent_mito=100,
                              filter_mingenes=args.min_g, normalize=True, log=True,
                              remove_genes=r_genes)

    # Select highly variable genes
    sc.pp.highly_variable_genes(adata, min_disp=g_disperson, max_disp=np.inf, max_mean=6)
    sc.pl.highly_variable_genes(adata, save=data_name, show=False)

    adata.raw = adata
    adata = adata[:, adata.var.highly_variable]
    data = adata.X

    # PCA and neighbor graph
    sc.tl.pca(adata, svd_solver='arpack')
    sc.pp.neighbors(adata, n_neighbors=10)

    # Generate cluster labels
    sc.tl.leiden(adata, resolution=leiden_res)
    sc.tl.umap(adata)
    sc.pl.umap(adata, color=['leiden'], save=data_name + 'umap' + now, show=False)
    adata.obs['leiden_origin'] = adata.obs['leiden']
    adata.obsm['X_umap_origin'] = adata.obsm['X_umap']
    data_c = adata.obs['leiden'].astype("long").to_list()
    ################################################# END SECTION OF SINGLE CELL DATA REPROCESSING #################################################

    ################################################# START SECTION OF LOADING SC DATA TO THE TENSORS #################################################
    # Prepare to normalize and split target data
    mmscaler = preprocessing.MinMaxScaler()
    try:
        data = mmscaler.fit_transform(data)
    except:
        logging.warning("Sparse matrix detected, densifying before scaling")
        # Process sparse data
        data = data.todense()
        data = mmscaler.fit_transform(data)

    # Split data into train and valid sets,
    # along with the leiden conditions for the CVAE purpose
    Xtarget_train, Xtarget_valid, Ctarget_train, Ctarget_valid = train_test_split(
        data, data_c, test_size=valid_size, random_state=42)

    # Select the GPU device
    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
    # Assuming that we are on a CUDA machine, this should print a CUDA device:
    logging.info(device)
    try:
        torch.cuda.set_device(device)
    except:
        logging.warning("No GPU detected, will apply cpu to process")

    # Construct datasets and data loaders
    Xtarget_trainTensor = torch.FloatTensor(Xtarget_train).to(device)
    Xtarget_validTensor = torch.FloatTensor(Xtarget_valid).to(device)

    # Use leiden labels if CVAE is applied
    Ctarget_trainTensor = torch.LongTensor(Ctarget_train).to(device)
    Ctarget_validTensor = torch.LongTensor(Ctarget_valid).to(device)

    X_allTensor = torch.FloatTensor(data).to(device)
    C_allTensor = torch.LongTensor(data_c).to(device)

    train_dataset = TensorDataset(Xtarget_trainTensor, Ctarget_trainTensor)
    valid_dataset = TensorDataset(Xtarget_validTensor, Ctarget_validTensor)

    Xtarget_trainDataLoader = DataLoader(dataset=train_dataset, batch_size=batch_size, shuffle=True)
    Xtarget_validDataLoader = DataLoader(dataset=valid_dataset, batch_size=batch_size, shuffle=True)

    dataloaders_pretrain = {'train': Xtarget_trainDataLoader, 'val': Xtarget_validDataLoader}
    ################################################# END SECTION OF LOADING SC DATA TO THE TENSORS #################################################

    ################################################# START SECTION OF LOADING BULK DATA #################################################
    # Read source data
    data_r = pd.read_csv(source_data_path, index_col=0)
    label_r = pd.read_csv(label_path, index_col=0)
    label_r = label_r.fillna(na)

    # Extract labels
    selected_idx = label_r.loc[:, select_drug] != na
    label = label_r.loc[selected_idx, select_drug]
    label = label.values.reshape(-1, 1)

    if prediction == "regression":
        lbscaler = preprocessing.MinMaxScaler()
        label = lbscaler.fit_transform(label)
        dim_model_out = 1
    else:
        le = preprocessing.LabelEncoder()
        label = le.fit_transform(label)
        dim_model_out = 2

    # Process source data
    mmscaler = preprocessing.MinMaxScaler()
    source_data = mmscaler.fit_transform(data_r)

    # Split source data
    Xsource_train_all, Xsource_test, Ysource_train_all, Ysource_test = train_test_split(
        source_data, label, test_size=test_size, random_state=42)
    Xsource_train, Xsource_valid, Ysource_train, Ysource_valid = train_test_split(
        Xsource_train_all, Ysource_train_all, test_size=valid_size, random_state=42)

    # Construct datasets and data loaders for the source domain
    Xsource_trainTensor = torch.FloatTensor(Xsource_train).to(device)
    Xsource_validTensor = torch.FloatTensor(Xsource_valid).to(device)

    if prediction == "regression":
        Ysource_trainTensor = torch.FloatTensor(Ysource_train).to(device)
        Ysource_validTensor = torch.FloatTensor(Ysource_valid).to(device)
    else:
        Ysource_trainTensor = torch.LongTensor(Ysource_train).to(device)
        Ysource_validTensor = torch.LongTensor(Ysource_valid).to(device)

    sourcetrain_dataset = TensorDataset(Xsource_trainTensor, Ysource_trainTensor)
    sourcevalid_dataset = TensorDataset(Xsource_validTensor, Ysource_validTensor)

    Xsource_trainDataLoader = DataLoader(dataset=sourcetrain_dataset, batch_size=batch_size, shuffle=True)
    Xsource_validDataLoader = DataLoader(dataset=sourcevalid_dataset, batch_size=batch_size, shuffle=True)

    dataloaders_source = {'train': Xsource_trainDataLoader, 'val': Xsource_validDataLoader}
    ################################################# END SECTION OF LOADING BULK DATA #################################################

    ################################################# START SECTION OF MODEL CONSTRUCTION #################################################
    # Construct target encoder
    if reduce_model == "AE":
        encoder = AEBase(input_dim=data.shape[1], latent_dim=dim_au_out, h_dims=encoder_hdims)
        loss_function_e = nn.MSELoss()
    elif reduce_model == "VAE":
        encoder = VAEBase(input_dim=data.shape[1], latent_dim=dim_au_out, h_dims=encoder_hdims)
    elif reduce_model == "CVAE":
        # The number of conditions equals the number of clusters
        encoder = CVAEBase(input_dim=data.shape[1], n_conditions=len(set(data_c)),
                           latent_dim=dim_au_out, h_dims=encoder_hdims)

    if torch.cuda.is_available():
        encoder.cuda()

    logging.info("Target encoder structure is: ")
    logging.info(encoder)

    encoder.to(device)
    optimizer_e = optim.Adam(encoder.parameters(), lr=1e-2)
    loss_function_e = nn.MSELoss()
    exp_lr_scheduler_e = lr_scheduler.ReduceLROnPlateau(optimizer_e)

    # Load the source model before transfer
    if prediction == "regression":
        dim_model_out = 1
    else:
        dim_model_out = 2

    # Load AE model
    if reduce_model == "AE":
        source_model = PretrainedPredictor(
            input_dim=Xsource_train.shape[1], latent_dim=dim_au_out, h_dims=encoder_hdims,
            hidden_dims_predictor=predict_hdims, output_dim=dim_model_out,
            pretrained_weights=None, freezed=freeze)
        source_model.load_state_dict(torch.load(source_model_path))
        source_encoder = source_model
    # Load VAE model
    elif reduce_model in ["VAE", "CVAE"]:
        source_model = PretrainedVAEPredictor(
            input_dim=Xsource_train.shape[1], latent_dim=dim_au_out, h_dims=encoder_hdims,
            hidden_dims_predictor=predict_hdims, output_dim=dim_model_out,
            pretrained_weights=None, freezed=freeze, z_reparam=bool(args.VAErepram))
        source_model.load_state_dict(torch.load(source_model_path))
        source_encoder = source_model
    logging.info("Load pretrained source model from: " + source_model_path)

    source_encoder.to(device)
    ################################################# END SECTION OF MODEL CONSTRUCTION #################################################

    ################################################# START SECTION OF SC MODEL PRETRAINING #################################################
    # Pretrain the target encoder
    # Pretrain using the autoencoder if pretrain is not False
    if str(pretrain) != '0':
        # Train the target encoder only if no stored weights exist on disk
        train_flag = True
        pretrain = str(pretrain)

        if os.path.exists(pretrain):
            try:
                encoder.load_state_dict(torch.load(pretrain))
                logging.info("Load pretrained target encoder from " + pretrain)
                train_flag = False
            except:
                logging.warning("Loading failed, proceed to re-train model")

        if train_flag == True:
            if reduce_model == "AE":
                encoder, loss_report_en = t.train_AE_model(
                    net=encoder, data_loaders=dataloaders_pretrain, optimizer=optimizer_e,
                    loss_function=loss_function_e, n_epochs=epochs,
                    scheduler=exp_lr_scheduler_e, save_path=pretrain)
            elif reduce_model == "VAE":
                encoder, loss_report_en = t.train_VAE_model(
                    net=encoder, data_loaders=dataloaders_pretrain, optimizer=optimizer_e,
                    n_epochs=epochs, scheduler=exp_lr_scheduler_e, save_path=pretrain)
            elif reduce_model == "CVAE":
                encoder, loss_report_en = t.train_CVAE_model(
                    net=encoder, data_loaders=dataloaders_pretrain, optimizer=optimizer_e,
                    n_epochs=epochs, scheduler=exp_lr_scheduler_e, save_path=pretrain)

        logging.info("Pretraining finished")

        # Before transfer learning, test the no-transfer baseline performance:
        # use the pretrained embeddings to predict
        if args.dimreduce != "CVAE":
            embeddings_pretrain = encoder.encode(X_allTensor)
        else:
            embeddings_pretrain = encoder.encode(X_allTensor, C_allTensor)

        pretrain_prob_prediction = source_model.predict(embeddings_pretrain).detach().cpu().numpy()
        adata.obs["sens_preds_pret"] = pretrain_prob_prediction[:, 1]
        adata.obs["sens_label_pret"] = pretrain_prob_prediction.argmax(axis=1)

        # # Use umap result to predict
        # ## This section is removed because of the dim problem and the performance problem
        # sc.tl.pca(adata, n_comps=max(50, 2 * dim_au_out), svd_solver='arpack')
        # sc.tl.umap(adata, n_components=dim_au_out)
        # embeddings_umap = torch.FloatTensor(adata.obsm["X_umap"]).to(device)
        # umap_prob_prediction = source_model.predict(embeddings_umap).detach().cpu().numpy()
        # adata.obs["sens_preds_umap"] = umap_prob_prediction[:, 1]
        # adata.obs["sens_label_umap"] = umap_prob_prediction.argmax(axis=1)

        # # Use tsne result to predict
        # # sc.tl.tsne(adata, n_pcs=dim_au_out)
        # X_pca = adata.obsm["X_pca"]
        # # Replace tsne by PCA because TSNE is very slow
        # X_tsne = adata.obsm["X_umap"]
        # # X_tsne = TSNE(n_components=dim_au_out, method='exact').fit_transform(X_pca)
        # embeddings_tsne = torch.FloatTensor(X_tsne).to(device)
        # tsne_prob_prediction = source_model.predict(embeddings_tsne).detach().cpu().numpy()
        # adata.obs["sens_preds_tsne"] = tsne_prob_prediction[:, 1]
        # adata.obs["sens_label_tsne"] = tsne_prob_prediction.argmax(axis=1)
        # adata.obsm["X_tsne_pret"] = X_tsne

        # Add embeddings to the adata object
        embeddings_pretrain = embeddings_pretrain.detach().cpu().numpy()
        adata.obsm["X_pre"] = embeddings_pretrain
    ################################################# END SECTION OF SC MODEL PRETRAINING #################################################

    ################################################# START SECTION OF TRANSFER LEARNING TRAINING #################################################
    # Use ADDA transfer learning
    if args.transfer == 'ADDA':
        # Set discriminator model
        discriminator = Predictor(input_dim=dim_au_out, output_dim=2)
        discriminator.to(device)
        loss_d = nn.CrossEntropyLoss()
        optimizer_d = optim.Adam(encoder.parameters(), lr=1e-2)
        exp_lr_scheduler_d = lr_scheduler.ReduceLROnPlateau(optimizer_d)

        # Adversarial training
        discriminator, encoder, report_, report2_ = t.train_ADDA_model(
            source_encoder, encoder, discriminator,
            dataloaders_source, dataloaders_pretrain,
            loss_d, loss_d,
            # Should these all be optimizer_d?
            optimizer_d, optimizer_d,
            exp_lr_scheduler_d, exp_lr_scheduler_d,
            epochs, device,
            target_model_path)
        logging.info("Transfer ADDA finished")

    # DaNN model
    elif args.transfer == 'DaNN':
        # Set predictor loss
        loss_d = nn.CrossEntropyLoss()
        optimizer_d = optim.Adam(encoder.parameters(), lr=1e-2)
        exp_lr_scheduler_d = lr_scheduler.ReduceLROnPlateau(optimizer_d)

        # Set DaNN model
        DaNN_model = DaNN(source_model=source_encoder, target_model=encoder)
        DaNN_model.to(device)

        def loss(x, y, GAMMA=args.GAMMA_mmd):
            result = mmd.mmd_loss(x, y, GAMMA)
            return result

        loss_distribution = loss

        # Train DaNN model
        DaNN_model, report_ = t.train_DaNN_model(
            DaNN_model,
            dataloaders_source, dataloaders_pretrain,
            # Should this also be optimizer_d?
            optimizer_d, loss_d,
            epochs, exp_lr_scheduler_d,
            dist_loss=loss_distribution,
            load=load_model,
            weight=args.mmd_weight,
            save_path=target_model_path + "_DaNN.pkl")

        encoder = DaNN_model.target_model
        source_model = DaNN_model.source_model
        logging.info("Transfer DaNN finished")

    if load_model == False:
        ut.plot_loss(report_[0], path="figures/train_loss_" + now + ".pdf")
        ut.plot_loss(report_[1], path="figures/mmd_loss_" + now + ".pdf", set_ylim=False)

    if args.dimreduce != 'CVAE':
        # Attribution test using integrated gradients:
        # generate a target model combining the encoder and the predictor
        target_model = TargetModel(source_model, encoder)

        # Allow gradients on the input and process labels
        X_allTensor.requires_grad_()

        # Run the integrated gradients check;
        # returns adata and per-feature integrated gradients
        ytarget_allPred = target_model(X_allTensor).detach().cpu().numpy()
        ytarget_allPred = ytarget_allPred.argmax(axis=1)

        adata, attrp1, senNeu_c0_genes, senNeu_c1_genes = ut.integrated_gradient_differential(
            net=target_model, input=X_allTensor, clip="positive", target=ytarget_allPred,
            adata=adata, ig_fc=1,
            save_name=reduce_model + args.predictor + prediction + select_drug + "sensNeuron" + now)

        adata, attrn1, resNeu_c0_genes, resNeu_c1_genes = ut.integrated_gradient_differential(
            net=target_model, input=X_allTensor, clip="negative", target=ytarget_allPred,
            adata=adata, ig_fc=1,
            save_name=reduce_model + args.predictor + prediction + select_drug + "restNeuron" + now)

        sc.pl.heatmap(attrp1, senNeu_c0_genes, groupby='sensitive', cmap='RdBu_r',
                      save=data_name + args.transfer + args.dimreduce + "_seNc0_" + now, show=False)
        sc.pl.heatmap(attrp1, senNeu_c1_genes, groupby='sensitive', cmap='RdBu_r',
                      save=data_name + args.transfer + args.dimreduce + "_seNc1_" + now, show=False)
        sc.pl.heatmap(attrn1, resNeu_c0_genes, groupby='sensitive', cmap='RdBu_r',
                      save=data_name + args.transfer + args.dimreduce + "_reNc0_" + now, show=False)
        sc.pl.heatmap(attrn1, resNeu_c1_genes, groupby='sensitive', cmap='RdBu_r',
                      save=data_name + args.transfer + args.dimreduce + "_reNc1_" + now, show=False)

        # Chi-squared test on predictive features
        SFD = SelectFdr(chi2)
        SFD.fit(adata.raw.X, ytarget_allPred)
        adata.raw.var['chi2_pval'] = SFD.pvalues_
        adata.raw.var['chi2_score'] = SFD.scores_
        df_chi2_genes = adata.raw.var[(SFD.pvalues_ < 0.05)
                                      & (adata.raw.var.highly_variable == True)
                                      & (adata.raw.var.n_cells > args.min_c)]
        df_chi2_genes.sort_values(by="chi2_pval", ascending=True, inplace=True)
        df_chi2_genes.to_csv("saved/results/chi2_pval_genes" + args.predictor + prediction + select_drug + now + '.csv')
    else:
        print()
    ################################################# END SECTION OF TRANSFER LEARNING TRAINING #################################################

    ################################################# START SECTION OF PREPROCESSING FEATURES #################################################
    # Extract feature embeddings and prediction probabilities
    if args.dimreduce != "CVAE":
        embedding_tensors = encoder.encode(X_allTensor)
    else:
        embedding_tensors = encoder.encode(X_allTensor, C_allTensor)

    prediction_tensors = source_model.predictor(embedding_tensors)
    embeddings = embedding_tensors.detach().cpu().numpy()
    predictions = prediction_tensors.detach().cpu().numpy()

    # Transform prediction probabilities to 0-1 labels
    if prediction == "regression":
        adata.obs["sens_preds"] = predictions
    else:
        adata.obs["sens_preds"] = predictions[:, 1]
        adata.obs["sens_label"] = predictions.argmax(axis=1)
        adata.obs["sens_label"] = adata.obs["sens_label"].astype('category')
        adata.obs["rest_preds"] = predictions[:, 0]

    adata.write("saved/adata/before_ann" + data_name + now + ".h5ad")
    ################################################# END SECTION OF PREPROCESSING FEATURES #################################################

    ################################################# START SECTION OF ANALYSIS AND POST PROCESSING #################################################
    # Scanpy pipeline:
    # add embeddings to the adata object
    adata.obsm["X_Trans"] = embeddings
    # sc.tl.umap(adata)
    sc.pp.neighbors(adata, n_neighbors=10, use_rep="X_Trans")

    # Use t-SNE on the transfer learning features
    sc.tl.tsne(adata, use_rep="X_Trans")

    # Leiden on the data
    # sc.tl.leiden(adata)

    # Plot t-SNE
    sc.pl.tsne(adata, save=data_name + now, color=["leiden"], show=False)

    # Differential expression genes
    sc.tl.rank_genes_groups(adata, 'leiden', method='wilcoxon')
    sc.pl.rank_genes_groups(adata, n_genes=args.n_DE_genes, sharey=False, save=data_name + now, show=False)

    # Differential expression genes across the 0-1 classes
    sc.tl.rank_genes_groups(adata, 'sens_label', method='wilcoxon')
    adata = ut.de_score(adata, clustername='sens_label')

    # Save DE genes between the 0-1 classes
    for label in [0, 1]:
        try:
            df_degs = get_de_dataframe(adata, label)
            df_degs.to_csv("saved/results/DEGs_class_" + str(label) + args.predictor
                           + prediction + select_drug + now + '.csv')
        except:
            logging.warning("Only one class, no two-class critical genes")

    # Generate reports of scores
    report_df = args_df

    # Data-specific benchmarking
    sens_pb_pret = adata.obs['sens_preds_pret']
    lb_pret = adata.obs['sens_label_pret']

    # sens_pb_umap = adata.obs['sens_preds_umap']
    # lb_umap = adata.obs['sens_label_umap']
    # sens_pb_tsne = adata.obs['sens_preds_tsne']
    # lb_tsne = adata.obs['sens_label_tsne']

    if 'sensitive' in adata.obs.keys():
        report_df = report_df.T
        Y_test = adata.obs['sensitive']
        sens_pb_results = adata.obs['sens_preds']
        lb_results = adata.obs['sens_label']

        le_sc = LabelEncoder()
        le_sc.fit(['Resistant', 'Sensitive'])
        label_describe = le_sc.inverse_transform(Y_test)
        adata.obs['sens_truth'] = label_describe

        color_list = ["sens_truth", "sens_label", 'sens_preds']
        color_score_list = ["Sensitive_score", "Resistant_score", "1_score", "0_score"]

        sens_score = pearsonr(adata.obs["sens_preds"], adata.obs["Sensitive_score"])[0]
        resistant_score = pearsonr(adata.obs["rest_preds"], adata.obs["Resistant_score"])[0]
        report_df['prob_sens_pearson'] = sens_score
        report_df['prob_rest_pearson'] = resistant_score

        try:
            cluster_score_sens = pearsonr(adata.obs["1_score"], adata.obs["Sensitive_score"])[0]
            report_df['sens_pearson'] = cluster_score_sens
        except:
            logging.warning("Prediction score 1 not exist, fill adata with 0 values")
            adata.obs["1_score"] = np.zeros(len(adata))

        try:
            cluster_score_resist = pearsonr(adata.obs["0_score"], adata.obs["Resistant_score"])[0]
            report_df['rest_pearson'] = cluster_score_resist
        except:
            logging.warning("Prediction score 0 not exist, fill adata with 0 values")
            adata.obs["0_score"] = np.zeros(len(adata))

        # if (data_name in ['GSE110894','GSE117872']):
        ap_score = average_precision_score(Y_test, sens_pb_results)
        ap_pret = average_precision_score(Y_test, sens_pb_pret)
        # ap_umap = average_precision_score(Y_test, sens_pb_umap)
        # ap_tsne = average_precision_score(Y_test, sens_pb_tsne)

        report_dict = classification_report(Y_test, lb_results, output_dict=True)
        f1score = report_dict['weighted avg']['f1-score']
        report_df['f1_score'] = f1score
        classification_report_df = pd.DataFrame(report_dict).T
        classification_report_df.to_csv("saved/results/clf_report_" + reduce_model + args.predictor
                                        + prediction + select_drug + now + '.csv')

        # report_dict_umap = classification_report(Y_test, lb_umap, output_dict=True)
        # classification_report_umap_df = pd.DataFrame(report_dict_umap).T
        # classification_report_umap_df.to_csv("saved/results/clf_umap_report_" + reduce_model + args.predictor + prediction + select_drug + now + '.csv')

        report_dict_pret = classification_report(Y_test, lb_pret, output_dict=True)
        classification_report_pret_df = pd.DataFrame(report_dict_pret).T
        classification_report_pret_df.to_csv("saved/results/clf_pret_report_" + reduce_model + args.predictor
                                             + prediction + select_drug + now + '.csv')

        # report_dict_tsne = classification_report(Y_test, lb_tsne, output_dict=True)
        # classification_report_tsne_df = pd.DataFrame(report_dict_tsne).T
        # classification_report_tsne_df.to_csv("saved/results/clf_tsne_report_" + reduce_model + args.predictor + prediction + select_drug + now + '.csv')

        try:
            auroc_score = roc_auc_score(Y_test, sens_pb_results)
            auroc_pret = roc_auc_score(Y_test, sens_pb_pret)
            # auroc_umap = average_precision_score(Y_test, sens_pb_umap)
            # auroc_tsne = average_precision_score(Y_test, sens_pb_tsne)
        except:
            logging.warning("Only one class, no ROC")
            auroc_pret = auroc_umap = auroc_tsne = auroc_score = 0

        report_df['auroc_score'] = auroc_score
        report_df['ap_score'] = ap_score
        report_df['auroc_pret'] = auroc_pret
        report_df['ap_pret'] = ap_pret
        # report_df['auroc_umap'] = auroc_umap
        # report_df['ap_umap'] = ap_umap
        # report_df['auroc_tsne'] = auroc_tsne
        # report_df['ap_tsne'] = ap_tsne

        ap_title = "ap: " + str(Decimal(ap_score).quantize(Decimal('0.0000')))
        auroc_title = "roc: " + str(Decimal(auroc_score).quantize(Decimal('0.0000')))
        title_list = ["Ground truth", "Prediction", "Probability"]
    else:
        color_list = ["leiden", "sens_label", 'sens_preds']
        title_list = ['Cluster', "Prediction", "Probability"]
        color_score_list = color_list

    # Simple analysis: compute neighbors in adata using PCA embeddings
    # sc.pp.neighbors(adata)

    # Run UMAP dimension reduction
    sc.pp.neighbors(adata)
    sc.tl.umap(adata)

    # Run leiden clustering
    # sc.tl.leiden(adata, resolution=leiden_res)

    # Plot umap
    # sc.pl.umap(adata, color=[color_list[0], 'sens_label_umap', 'sens_preds_umap'], save=data_name+args.transfer+args.dimreduce+now, show=False, title=title_list)

    # Plot transfer learning results on umap
    sc.pl.umap(adata, color=color_list + color_score_list,
               save=data_name + args.transfer + args.dimreduce + "umap_all" + now, show=False)

    sc.settings.set_figure_params(dpi=100, frameon=False, figsize=(4, 3), facecolor='white')

    sc.pl.umap(adata, color=['sensitivity', 'leiden', 'sens_label', 'sens_preds'],
               title=['Cell sensitivity', 'Cell clusters',
                      'Transfer learning prediction', 'Prediction probability'],
               save=data_name + args.transfer + args.dimreduce + "umap_pred" + now,
               show=False, ncols=4)

    sc.pl.umap(adata, color=color_score_list,
               title=['Sensitive gene score', 'Resistant gene score',
                      'Sensitive gene score (prediction)', 'Resistant gene score (prediction)'],
               save=data_name + args.transfer + args.dimreduce + "umap_scores" + now,
               show=False, ncols=2)

    # sc.pl.umap(adata, color=['Sample name'],
    #            save=data_name+args.transfer+args.dimreduce+"umap_sm"+now, show=False, ncols=4)

    try:
        sc.pl.umap(adata, color=adata.var.sort_values("integrated_gradient_sens_class0").head().index,
                   save=data_name + args.transfer + args.dimreduce + "_cgenes0_" + now, show=False)
        sc.pl.umap(adata, color=adata.var.sort_values("integrated_gradient_sens_class1").head().index,
                   save=data_name + args.transfer + args.dimreduce + "_cgenes1_" + now, show=False)

        # c0_genes = df_11_genes.loc[df_11_genes.pval < 0.05].head().index
        # c1_genes = df_00_genes.loc[df_00_genes.pval < 0.05].head().index
        # sc.pl.umap(adata, color=c0_genes, neighbors_key="Trans", save=data_name+args.transfer+args.dimreduce+"_cgenes0_TL"+now, show=False)
        # sc.pl.umap(adata, color=c1_genes, neighbors_key="Trans", save=data_name+args.transfer+args.dimreduce+"_cgenes1_TL"+now, show=False)
    except:
        logging.warning("IG results not available")

    # Run embeddings using the transferred features
    sc.pp.neighbors(adata, use_rep='X_Trans', key_added="Trans")
    sc.tl.umap(adata, neighbors_key="Trans")
    sc.tl.leiden(adata, neighbors_key="Trans", key_added="leiden_trans", resolution=leiden_res)
    sc.pl.umap(adata, color=color_list, neighbors_key="Trans",
               save=data_name + args.transfer + args.dimreduce + "_TL" + now,
               show=False, title=title_list)

    # Plot cell scores on umap
    sc.pl.umap(adata, color=color_score_list, neighbors_key="Trans",
               save=data_name + args.transfer + args.dimreduce + "_score_TL" + now,
               show=False, title=color_score_list)

    # This t-SNE is based on the transfer learning features
    sc.pl.tsne(adata, color=color_list, neighbors_key="Trans",
               save=data_name + args.transfer + args.dimreduce + "_TL" + now,
               show=False, title=title_list)

    # Use the original t-SNE to visualize
    sc.tl.tsne(adata)
    # This t-SNE is based on the transfer learning features
    # sc.pl.tsne(adata, color=[color_list[0], 'sens_label_tsne', 'sens_preds_tsne'], save=data_name+args.transfer+args.dimreduce+"_original_tsne"+now, show=False, title=title_list)

    # Plot umap of the pretrained (autoencoder) embeddings
    sc.pp.neighbors(adata, use_rep='X_pre', key_added="Pret")
    sc.tl.umap(adata, neighbors_key="Pret")
    sc.tl.leiden(adata, neighbors_key="Pret", key_added="leiden_Pret", resolution=leiden_res)
    sc.pl.umap(adata, color=[color_list[0], 'sens_label_pret', 'sens_preds_pret'],
               neighbors_key="Pret",
               save=data_name + args.transfer + args.dimreduce + "_umap_Pretrain_" + now, show=False)

    # ARI between the transfer learning embeddings and the sensitivity label
    ari_score_trans = adjusted_rand_score(adata.obs['leiden_trans'], adata.obs['sens_label'])
    ari_score = adjusted_rand_score(adata.obs['leiden'], adata.obs['sens_label'])
    pret_ari_score = adjusted_rand_score(adata.obs['leiden_origin'], adata.obs['leiden_Pret'])
    transfer_ari_score = adjusted_rand_score(adata.obs['leiden_origin'], adata.obs['leiden_trans'])

    sc.pl.umap(adata, color=['leiden_origin', 'leiden_trans', 'leiden_Pret'],
               save=data_name + args.transfer + args.dimreduce + "_comp_Pretrain_" + now, show=False)

    # report_df = args_df
    report_df['ari_score'] = ari_score
    report_df['ari_trans_score'] = ari_score_trans
    report_df['ari_pre_umap'] = pret_ari_score
    report_df['ari_trans_umap'] = transfer_ari_score

    # Trajectory of adata
    adata, correlations = trajectory(adata, root_key='sensitive', genes_vis=senNeu_c0_genes[:5],
                                     root=1, now=now, plot=True)

    gene_cor = {}
    # Correlate each sensitive-neuron gene with pseudotime
    for g in np.array(senNeu_c0_genes):
        gene = g
        express_vec = adata[:, gene].X
        corr = pearsonr(np.array(express_vec).ravel(), np.array(adata.obs["dpt_pseudotime"]))[0]
        gene_cor[gene] = corr

    try:
        for k in correlations.keys():
            report_df['cor_dpt_' + k] = correlations[k][0]
            report_df['cor_pvl_' + k] = correlations[k][1]
    except:
        logging.warning("Some of the correlations cannot be retrieved from the dictionary")
    ################################################# END SECTION OF ANALYSIS AND POST PROCESSING #################################################

    ################################################# START SECTION OF ANALYSIS FOR BULK DATA #################################################
    # bdata = sc.AnnData(data_r)
    # bdata.obs = label_r
    # bulk_degs = {}
    # sc.tl.rank_genes_groups(bdata, select_drug, method='wilcoxon')
    # bdata = ut.de_score(bdata, select_drug)
    # for label in set(label_r.loc[:, select_drug]):
    #     try:
    #         df_degs = get_de_dataframe(bdata, label)
    #         bulk_degs[label] = df_degs.iloc[:50, :].names
    #         df_degs.to_csv("saved/results/DEGs_bulk_" + str(label) + args.predictor + prediction + select_drug + now + '.csv')
    #     except:
    #         logging.warning("Only one class, no two-class critical genes")

    # Xsource_allTensor = torch.FloatTensor(data_r.values).to(device)
    # Ysource_preTensor = source_model(Xsource_allTensor)
    # Ysource_prediction = Ysource_preTensor.detach().cpu().numpy()
    # bdata.obs["sens_preds"] = Ysource_prediction[:, 1]
    # bdata.obs["sens_label"] = Ysource_prediction.argmax(axis=1)
    # bdata.obs["sens_label"] = bdata.obs["sens_label"].astype('category')
    # bdata.obs["rest_preds"] = Ysource_prediction[:, 0]
    # sc.tl.score_genes(adata, bulk_degs['sensitive'], score_name="bulk_sens_score")
    # sc.tl.score_genes(adata, bulk_degs['resistant'], score_name="bulk_rest_score")
    # sc.pl.umap(adata, color=['bulk_sens_score', 'bulk_rest_score'], save=data_name+args.transfer+args.dimreduce+"umap_bg_all"+now, show=False)
    # try:
    #     bulk_score_sens = pearsonr(adata.obs["1_score"], adata.obs["bulk_sens_score"])[0]
    #     report_df['bulk_sens_pearson'] = bulk_score_sens
    #     cluster_score_resist = pearsonr(adata.obs["0_score"], adata.obs["bulk_rest_score"])[0]
    #     report_df['bulk_rest_pearson'] = cluster_score_resist
    # except:
    #     logging.warning("Bulk level gene score not exist")

    # Save adata
    adata.write("saved/adata/" + data_name + now + ".h5ad")

    # Save report
    report_df = report_df.T
    report_df.to_csv("saved/results/report" + reduce_model + args.predictor + prediction + select_drug + now + '.csv')
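
# The attribution step above wraps the trained target encoder and the
# bulk-trained predictor head into TargetModel(source_model, encoder). Below is
# a minimal sketch of such a wrapper, assuming the encoder exposes encode() and
# the source model exposes a predictor() head, as both are called elsewhere in
# this file; the repository's real TargetModel class may differ.
import torch.nn as nn


class TargetModelSketch(nn.Module):
    def __init__(self, source_predictor, target_encoder):
        super().__init__()
        self.source_predictor = source_predictor
        self.target_encoder = target_encoder

    def forward(self, x):
        # Embed cells with the target (single-cell) encoder, then score drug
        # response with the predictor head trained on bulk data.
        z = self.target_encoder.encode(x)
        return self.source_predictor.predictor(z)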
dataloaders_pretrain = {'train': Xtarget_trainDataLoader, 'val': Xtarget_validDataLoader}

len(Xtarget_trainDataLoader)

encoder = AEBase(input_dim=data.shape[1], latent_dim=dim_au_out, h_dims=encoder_hdims)
# model = VAE(dim_au_in=data_r.shape[1], dim_au_out=128)
if torch.cuda.is_available():
    encoder.cuda()
print(encoder)
encoder.to(device)

optimizer_e = optim.Adam(encoder.parameters(), lr=1e-2)
loss_function_e = nn.MSELoss()
exp_lr_scheduler_e = lr_scheduler.ReduceLROnPlateau(optimizer_e)

# Read source data
data_r = pd.read_csv(source_data_path, index_col=0)
def run_main(args): ################################################# START SECTION OF LOADING PARAMETERS ################################################# # Read parameters epochs = args.epochs dim_au_out = args.bottleneck #8, 16, 32, 64, 128, 256,512 na = args.missing_value data_path = DATA_MAP[args.target_data] test_size = args.test_size select_drug = args.drug freeze = args.freeze_pretrain valid_size = args.valid_size g_disperson = args.var_genes_disp min_n_genes = args.min_n_genes max_n_genes = args.max_n_genes source_model_path = args.source_model_path target_model_path = args.target_model_path log_path = args.logging_file batch_size = args.batch_size encoder_hdims = args.source_h_dims.split(",") encoder_hdims = list(map(int, encoder_hdims)) source_data_path = args.source_data pretrain = args.pretrain prediction = args.predition data_name = args.target_data label_path = args.label_path reduce_model = args.dimreduce predict_hdims = args.p_h_dims.split(",") predict_hdims = list(map(int, predict_hdims)) leiden_res = args.cluster_res load_model = bool(args.load_target_model) # Misc now = time.strftime("%Y-%m-%d-%H-%M-%S") # Initialize logging and std out out_path = log_path + now + ".err" log_path = log_path + now + ".log" out = open(out_path, "w") sys.stderr = out #Logging infomaion logging.basicConfig( level=logging.INFO, filename=log_path, filemode='a', format= '%(asctime)s - %(pathname)s[line:%(lineno)d] - %(levelname)s: %(message)s' ) logging.getLogger('matplotlib.font_manager').disabled = True logging.info(args) # Save arguments args_df = ut.save_arguments(args, now) AUROC = list() AP = list() Fone = list() ################################################# END SECTION OF LOADING PARAMETERS ################################################# cycletime = 50 while cycletime < 101: adata = pp.read_sc_file(data_path) adata = resample(adata, n_samples=350, random_state=cycletime, replace=False) #replace=True: 可以从a 中反复选取同一个元素。 #replace=False: a 中同一个元素只能被选取一次。 ################################################# START SECTION OF SINGLE CELL DATA REPROCESSING ################################################# # Load data and preprocessing sc.pp.filter_cells(adata, min_genes=200) sc.pp.filter_genes(adata, min_cells=3) adata = pp.cal_ncount_ngenes(adata) # Show statisctic after QX sc.pl.violin(adata, ['n_genes_by_counts', 'total_counts', 'pct_counts_mt'], jitter=0.4, multi_panel=True, save=data_name, show=False) sc.pl.scatter(adata, x='total_counts', y='pct_counts_mt', show=False) sc.pl.scatter(adata, x='total_counts', y='n_genes_by_counts', show=False) if args.remove_genes == 0: r_genes = [] else: r_genes = REMOVE_GENES #Preprocess data by filtering adata = pp.receipe_my(adata, l_n_genes=min_n_genes, r_n_genes=max_n_genes, filter_mincells=args.min_c, filter_mingenes=args.min_g, normalize=True, log=True, remove_genes=r_genes) # Select highly variable genes sc.pp.highly_variable_genes(adata, min_disp=g_disperson, max_disp=np.inf, max_mean=6) sc.pl.highly_variable_genes(adata, save=data_name, show=False) adata.raw = adata adata = adata[:, adata.var.highly_variable] # Preprocess data if spcific process is required if data_name == 'GSE117872': adata = ut.specific_process(adata, dataname=data_name, select_origin=args.batch_id) data = adata.X elif data_name == 'GSE122843': adata = ut.specific_process(adata, dataname=data_name) data = adata.X elif data_name == 'GSE110894': adata = ut.specific_process(adata, dataname=data_name) data = adata.X elif data_name == 'GSE112274': adata = 
        # Preprocess data if a dataset-specific process is required
        if data_name == 'GSE117872':
            adata = ut.specific_process(adata, dataname=data_name, select_origin=args.batch_id)
            data = adata.X
        elif data_name in ['GSE122843', 'GSE110894', 'GSE112274', 'GSE116237', 'GSE108383']:
            adata = ut.specific_process(adata, dataname=data_name)
            data = adata.X
        else:
            data = adata.X

        # PCA and neighbor graph
        sc.tl.pca(adata, svd_solver='arpack')
        sc.pp.neighbors(adata, n_neighbors=10)

        # Generate cluster labels
        sc.tl.leiden(adata, resolution=leiden_res)
        sc.tl.umap(adata)
        # sc.pl.umap(adata,color=['leiden'],save=data_name+'umap'+now,show=False)
        adata.obs['leiden_origin'] = adata.obs['leiden']
        adata.obsm['X_umap_origin'] = adata.obsm['X_umap']
        data_c = adata.obs['leiden'].astype("long").to_list()
        ################################################# END SECTION OF SINGLE CELL DATA REPROCESSING #################################################

        ################################################# START SECTION OF LOADING SC DATA TO THE TENSORS #################################################
        # Prepare to normalize and split the target data
        mmscaler = preprocessing.MinMaxScaler()
        try:
            data = mmscaler.fit_transform(data)
        except Exception:
            logging.warning("Sparse matrix encountered, densifying before scaling")
            # Process sparse data
            data = data.todense()
            data = mmscaler.fit_transform(data)

        # Split data into train and valid sets,
        # along with the leiden conditions for the CVAE purpose
        Xtarget_train, Xtarget_valid, Ctarget_train, Ctarget_valid = train_test_split(
            data, data_c, test_size=valid_size, random_state=42)

        # Select the GPU device
        device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
        # Assuming that we are on a CUDA machine, this should print a CUDA device:
        logging.info(device)
        if torch.cuda.is_available():
            torch.cuda.set_device(device)

        # Construct datasets and data loaders
        Xtarget_trainTensor = torch.FloatTensor(Xtarget_train).to(device)
        Xtarget_validTensor = torch.FloatTensor(Xtarget_valid).to(device)
        # Use leiden labels if CVAE is applied
        Ctarget_trainTensor = torch.LongTensor(Ctarget_train).to(device)
        Ctarget_validTensor = torch.LongTensor(Ctarget_valid).to(device)
        X_allTensor = torch.FloatTensor(data).to(device)
        C_allTensor = torch.LongTensor(data_c).to(device)

        train_dataset = TensorDataset(Xtarget_trainTensor, Ctarget_trainTensor)
        valid_dataset = TensorDataset(Xtarget_validTensor, Ctarget_validTensor)

        Xtarget_trainDataLoader = DataLoader(dataset=train_dataset, batch_size=batch_size, shuffle=True)
        Xtarget_validDataLoader = DataLoader(dataset=valid_dataset, batch_size=batch_size, shuffle=True)

        dataloaders_pretrain = {'train': Xtarget_trainDataLoader, 'val': Xtarget_validDataLoader}
        ################################################# END SECTION OF LOADING SC DATA TO THE TENSORS #################################################

        ################################################# START SECTION OF LOADING BULK DATA #################################################
        # Read source data
        data_r = pd.read_csv(source_data_path, index_col=0)
        label_r = pd.read_csv(label_path, index_col=0)
        label_r = label_r.fillna(na)

        # Extract labels
        selected_idx = label_r.loc[:, select_drug] != na
        label = label_r.loc[selected_idx, select_drug]
        label = label.values.reshape(-1, 1)

        if prediction == "regression":
            lbscaler = preprocessing.MinMaxScaler()
            label = lbscaler.fit_transform(label)
            dim_model_out = 1
        else:
            le = preprocessing.LabelEncoder()
            label = le.fit_transform(label)
            dim_model_out = 2

        # Process source data
        mmscaler = preprocessing.MinMaxScaler()
        source_data = mmscaler.fit_transform(data_r)
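        # The bulk data is split twice: first a held-out test fraction
        # (test_size), then a validation fraction (valid_size) of the
        # remainder. For example, with test_size=0.2 and valid_size=0.2,
        # the final training set is 0.8 * 0.8 = 64% of the source samples.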
        # Split source data
        Xsource_train_all, Xsource_test, Ysource_train_all, Ysource_test = train_test_split(
            source_data, label, test_size=test_size, random_state=42)
        Xsource_train, Xsource_valid, Ysource_train, Ysource_valid = train_test_split(
            Xsource_train_all, Ysource_train_all, test_size=valid_size, random_state=42)

        # Transform source data
        # Construct datasets and data loaders
        Xsource_trainTensor = torch.FloatTensor(Xsource_train).to(device)
        Xsource_validTensor = torch.FloatTensor(Xsource_valid).to(device)

        if prediction == "regression":
            Ysource_trainTensor = torch.FloatTensor(Ysource_train).to(device)
            Ysource_validTensor = torch.FloatTensor(Ysource_valid).to(device)
        else:
            Ysource_trainTensor = torch.LongTensor(Ysource_train).to(device)
            Ysource_validTensor = torch.LongTensor(Ysource_valid).to(device)

        sourcetrain_dataset = TensorDataset(Xsource_trainTensor, Ysource_trainTensor)
        sourcevalid_dataset = TensorDataset(Xsource_validTensor, Ysource_validTensor)

        Xsource_trainDataLoader = DataLoader(dataset=sourcetrain_dataset, batch_size=batch_size, shuffle=True)
        Xsource_validDataLoader = DataLoader(dataset=sourcevalid_dataset, batch_size=batch_size, shuffle=True)

        dataloaders_source = {'train': Xsource_trainDataLoader, 'val': Xsource_validDataLoader}
        ################################################# END SECTION OF LOADING BULK DATA #################################################

        ################################################# START SECTION OF MODEL CONSTRUCTION #################################################
        # Construct the target encoder
        if reduce_model == "AE":
            encoder = AEBase(input_dim=data.shape[1], latent_dim=dim_au_out, h_dims=encoder_hdims)
            loss_function_e = nn.MSELoss()
        elif reduce_model == "VAE":
            encoder = VAEBase(input_dim=data.shape[1], latent_dim=dim_au_out, h_dims=encoder_hdims)
        elif reduce_model == "CVAE":
            # The number of conditions equals the number of leiden clusters
            encoder = CVAEBase(input_dim=data.shape[1], n_conditions=len(set(data_c)),
                               latent_dim=dim_au_out, h_dims=encoder_hdims)

        if torch.cuda.is_available():
            encoder.cuda()

        logging.info("Target encoder structure is: ")
        logging.info(encoder)

        encoder.to(device)
        optimizer_e = optim.Adam(encoder.parameters(), lr=1e-2)
        loss_function_e = nn.MSELoss()
        exp_lr_scheduler_e = lr_scheduler.ReduceLROnPlateau(optimizer_e)

        # Load the source model before transfer
        if prediction == "regression":
            dim_model_out = 1
        else:
            dim_model_out = 2

        # Load AE model
        if reduce_model == "AE":
            source_model = PretrainedPredictor(input_dim=Xsource_train.shape[1], latent_dim=dim_au_out,
                                               h_dims=encoder_hdims, hidden_dims_predictor=predict_hdims,
                                               output_dim=dim_model_out,
                                               pretrained_weights=None, freezed=freeze)
            source_model.load_state_dict(torch.load(source_model_path))
            source_encoder = source_model
        # Load VAE model
        elif reduce_model in ["VAE", "CVAE"]:
            source_model = PretrainedVAEPredictor(input_dim=Xsource_train.shape[1], latent_dim=dim_au_out,
                                                  h_dims=encoder_hdims, hidden_dims_predictor=predict_hdims,
                                                  output_dim=dim_model_out,
                                                  pretrained_weights=None, freezed=freeze,
                                                  z_reparam=bool(args.VAErepram))
            source_model.load_state_dict(torch.load(source_model_path))
            source_encoder = source_model

        logging.info("Load pretrained source model from: " + source_model_path)
        source_encoder.to(device)
        ################################################# END SECTION OF MODEL CONSTRUCTION #################################################
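        # Note on the pretraining step below: the target encoder is first
        # trained as a plain (variational) autoencoder on the single-cell data;
        # if a checkpoint already exists at the `pretrain` path it is loaded
        # instead, so repeated cycles reuse the same initialization.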
        ################################################# START SECTION OF SC MODEL PRETRAINING #################################################
        # Pretrain the target encoder with an autoencoder objective if pretrain is not '0'
        if str(pretrain) != '0':
            # Retrain the target encoder only if no stored checkpoint is found on disk
            train_flag = True
            pretrain = str(pretrain)

            if os.path.exists(pretrain):
                try:
                    encoder.load_state_dict(torch.load(pretrain))
                    logging.info("Load pretrained target encoder from " + pretrain)
                    train_flag = False
                except Exception:
                    logging.warning("Loading failed, proceed to re-train model")

            if train_flag:
                if reduce_model == "AE":
                    encoder, loss_report_en = t.train_AE_model(net=encoder, data_loaders=dataloaders_pretrain,
                                                               optimizer=optimizer_e, loss_function=loss_function_e,
                                                               n_epochs=epochs, scheduler=exp_lr_scheduler_e,
                                                               save_path=pretrain)
                elif reduce_model == "VAE":
                    encoder, loss_report_en = t.train_VAE_model(net=encoder, data_loaders=dataloaders_pretrain,
                                                                optimizer=optimizer_e, n_epochs=epochs,
                                                                scheduler=exp_lr_scheduler_e, save_path=pretrain)
                elif reduce_model == "CVAE":
                    encoder, loss_report_en = t.train_CVAE_model(net=encoder, data_loaders=dataloaders_pretrain,
                                                                 optimizer=optimizer_e, n_epochs=epochs,
                                                                 scheduler=exp_lr_scheduler_e, save_path=pretrain)
            logging.info("Pretraining finished")

        # Before transfer learning, test the no-transfer performance:
        # predict directly from the pretrained embeddings
        if args.dimreduce != "CVAE":
            embeddings_pretrain = encoder.encode(X_allTensor)
        else:
            embeddings_pretrain = encoder.encode(X_allTensor, C_allTensor)

        pretrain_prob_prediction = source_model.predict(embeddings_pretrain).detach().cpu().numpy()
        adata.obs["sens_preds_pret"] = pretrain_prob_prediction[:, 1]
        adata.obs["sens_label_pret"] = pretrain_prob_prediction.argmax(axis=1)

        # Add embeddings to the adata object
        embeddings_pretrain = embeddings_pretrain.detach().cpu().numpy()
        adata.obsm["X_pre"] = embeddings_pretrain
        ################################################# END SECTION OF SC MODEL PRETRAINING #################################################

        ################################################# START SECTION OF TRANSFER LEARNING TRAINING #################################################
        # ADDA transfer learning
        if args.transfer == 'ADDA':
            # Set the discriminator model
            discriminator = Predictor(input_dim=dim_au_out, output_dim=2)
            discriminator.to(device)
            loss_d = nn.CrossEntropyLoss()
            optimizer_d = optim.Adam(encoder.parameters(), lr=1e-2)
            exp_lr_scheduler_d = lr_scheduler.ReduceLROnPlateau(optimizer_d)

            # Adversarial training
            discriminator, encoder, report_, report2_ = t.train_ADDA_model(
                source_encoder, encoder, discriminator,
                dataloaders_source, dataloaders_pretrain,
                loss_d, loss_d,
                # Should these all be optimizer_d?
                optimizer_d, optimizer_d,
                exp_lr_scheduler_d, exp_lr_scheduler_d,
                epochs, device, target_model_path)

            logging.info("Transfer ADDA finished")

        # DaNN model
        elif args.transfer == 'DaNN':
            # Set the predictor loss
            loss_d = nn.CrossEntropyLoss()
            optimizer_d = optim.Adam(encoder.parameters(), lr=1e-2)
            exp_lr_scheduler_d = lr_scheduler.ReduceLROnPlateau(optimizer_d)

            # Set the DaNN model
            DaNN_model = DaNN(source_model=source_encoder, target_model=encoder)
            DaNN_model.to(device)

            def loss(x, y, GAMMA=args.GAMMA_mmd):
                result = mmd.mmd_loss(x, y, GAMMA)
                return result

            loss_distribution = loss
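            # mmd.mmd_loss is assumed here to compute a Gaussian-kernel maximum
            # mean discrepancy between the source and target batch embeddings;
            # a minimal sketch of such a loss (hypothetical helper, not this
            # project's implementation) would be:
            #   def rbf_mmd(x, y, gamma):
            #       xx = torch.exp(-gamma * torch.cdist(x, x) ** 2).mean()
            #       yy = torch.exp(-gamma * torch.cdist(y, y) ** 2).mean()
            #       xy = torch.exp(-gamma * torch.cdist(x, y) ** 2).mean()
            #       return xx + yy - 2 * xy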
            # Train the DaNN model
            DaNN_model, report_ = t.train_DaNN_model(
                DaNN_model,
                dataloaders_source, dataloaders_pretrain,
                # Should this be optimizer_d as well?
                optimizer_d, loss_d,
                epochs, exp_lr_scheduler_d,
                dist_loss=loss_distribution,
                load=load_model,
                weight=args.mmd_weight,
                save_path=target_model_path + "_DaNN.pkl")

            encoder = DaNN_model.target_model
            source_model = DaNN_model.source_model
            logging.info("Transfer DaNN finished")

            if not load_model:
                ut.plot_loss(report_[0], path="figures/train_loss_" + now + ".pdf")
                ut.plot_loss(report_[1], path="figures/mmd_loss_" + now + ".pdf")

        if args.dimreduce != 'CVAE':
            # Attribution test using integrated gradients:
            # generate a target model combining the encoder and the predictor
            target_model = TargetModel(source_model, encoder)
            # Allow gradients and process labels
            Xtarget_validTensor.requires_grad_()
            # Run the integrated gradients check;
            # returns adata and the per-feature integrated gradients
            ytarget_validPred = target_model(Xtarget_validTensor).detach().cpu().numpy()
            ytarget_validPred = ytarget_validPred.argmax(axis=1)

            adata, attr = ut.integrated_gradient_check(net=target_model, input=Xtarget_validTensor,
                                                       target=ytarget_validPred,
                                                       adata=adata, n_genes=args.n_DL_genes,
                                                       save_name=reduce_model + args.predictor + prediction + select_drug + now)
            adata, attr0 = ut.integrated_gradient_check(net=target_model, input=Xtarget_validTensor,
                                                        target=ytarget_validPred, target_class=0,
                                                        adata=adata, n_genes=args.n_DL_genes,
                                                        save_name=reduce_model + args.predictor + prediction + select_drug + now)
        ################################################# END SECTION OF TRANSFER LEARNING TRAINING #################################################

        ################################################# START SECTION OF PREPROCESSING FEATURES #################################################
        # Extract feature embeddings and prediction probabilities
        if args.dimreduce != "CVAE":
            embedding_tensors = encoder.encode(X_allTensor)
        else:
            embedding_tensors = encoder.encode(X_allTensor, C_allTensor)

        prediction_tensors = source_model.predictor(embedding_tensors)
        embeddings = embedding_tensors.detach().cpu().numpy()
        predictions = prediction_tensors.detach().cpu().numpy()

        # Transform prediction probabilities into 0-1 labels
        if prediction == "regression":
            adata.obs["sens_preds"] = predictions
        else:
            adata.obs["sens_preds"] = predictions[:, 1]
            adata.obs["sens_label"] = predictions.argmax(axis=1)
            adata.obs["sens_label"] = adata.obs["sens_label"].astype('category')
            adata.obs["rest_preds"] = predictions[:, 0]
        ################################################# END SECTION OF PREPROCESSING FEATURES #################################################

        ################################################# START SECTION OF ANALYSIS AND POST PROCESSING #################################################
        # Scanpy pipeline: add the transfer embeddings to the adata object
        adata.obsm["X_Trans"] = embeddings
        # sc.tl.umap(adata)
        sc.pp.neighbors(adata, n_neighbors=10, use_rep="X_Trans")
        # Use t-SNE on the transfer learning features
        sc.tl.tsne(adata, use_rep="X_Trans")
        # Leiden on the data
        # sc.tl.leiden(adata)
        # Plot tsne
        # sc.pl.tsne(adata,save=data_name+now,color=["leiden"],show=False)

        # Differentially expressed genes across leiden clusters
        sc.tl.rank_genes_groups(adata, 'leiden', method='wilcoxon')
        # sc.pl.rank_genes_groups(adata, n_genes=args.n_DE_genes, sharey=False,save=data_name+now,show=False)

        # Differentially expressed genes across the 0-1 classes
        sc.tl.rank_genes_groups(adata, 'sens_label', method='wilcoxon')
        adata = ut.de_score(adata, clustername='sens_label')
        # Save DE genes between the 0-1 classes
        # for label in [0, 1]:
        #     try:
        #         df_degs = get_de_dataframe(adata, label)
        #         df_degs.to_csv("saved/results/DEGs_class_" + str(label) + args.predictor + prediction + select_drug + now + '.csv')
        #     except Exception:
        #         logging.warning("Only one class, no two-class critical genes")

        # Generate reports of scores
        report_df = args_df

        # Data-specific benchmarking
        # sens_pb_pret = adata.obs['sens_preds_pret']
        # lb_pret = adata.obs['sens_label_pret']
        # sens_pb_umap = adata.obs['sens_preds_umap']
        # lb_umap = adata.obs['sens_label_umap']
        # sens_pb_tsne = adata.obs['sens_preds_tsne']
        # lb_tsne = adata.obs['sens_label_tsne']

        if data_name == 'GSE117872':
            label = adata.obs['cluster']
            label_no_ho = label
            if len(label[label != "Sensitive"]) > 0:
                # label[label != "Sensitive"] = 'Resistant'
                label_no_ho[label_no_ho != "Resistant"] = 'Sensitive'
            adata.obs['sens_truth'] = label_no_ho

            le_sc = LabelEncoder()
            le_sc.fit(['Resistant', 'Sensitive'])
            sens_pb_results = adata.obs['sens_preds']
            Y_test = le_sc.transform(label_no_ho)
            lb_results = adata.obs['sens_label']

            color_list = ["sens_truth", "sens_label", 'sens_preds']
            color_score_list = ["Sensitive_score", "Resistant_score", "1_score", "0_score"]

            # Pearson correlations between predicted probabilities and signature scores
            sens_score = pearsonr(adata.obs["sens_preds"], adata.obs["Sensitive_score"])[0]
            resistant_score = pearsonr(adata.obs["sens_preds"], adata.obs["Resistant_score"])[0]
            cluster_score_sens = pearsonr(adata.obs["1_score"], adata.obs["Sensitive_score"])[0]
            cluster_score_resist = pearsonr(adata.obs["0_score"], adata.obs["Resistant_score"])[0]

            report_df['sens_pearson'] = sens_score
            report_df['resist_pearson'] = resistant_score
            report_df['1_pearson'] = cluster_score_sens
            report_df['0_pearson'] = cluster_score_resist

        # GSE110894 and GSE108383 share identical benchmarking logic
        elif data_name in ['GSE110894', 'GSE108383']:
            report_df = report_df.T
            Y_test = adata.obs['sensitive']
            sens_pb_results = adata.obs['sens_preds']
            lb_results = adata.obs['sens_label']

            le_sc = LabelEncoder()
            le_sc.fit(['Resistant', 'Sensitive'])
            label_describe = le_sc.inverse_transform(Y_test)
            adata.obs['sens_truth'] = label_describe

            color_list = ["sens_truth", "sens_label", 'sens_preds']
            color_score_list = ["sensitive_score", "resistant_score", "1_score", "0_score"]

            sens_score = pearsonr(adata.obs["sens_preds"], adata.obs["sensitive_score"])[0]
            resistant_score = pearsonr(adata.obs["sens_preds"], adata.obs["resistant_score"])[0]
            report_df['sens_pearson'] = sens_score
            report_df['resist_pearson'] = resistant_score

            cluster_score_sens = pearsonr(adata.obs["1_score"], adata.obs["sensitive_score"])[0]
            cluster_score_resist = pearsonr(adata.obs["0_score"], adata.obs["resistant_score"])[0]
            report_df['1_pearson'] = cluster_score_sens
            report_df['0_pearson'] = cluster_score_resist
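        # For the benchmark datasets with ground-truth sensitivity labels,
        # average precision, AUROC, and a per-class F1 report are computed
        # from the predicted labels and probabilities below.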
        if data_name in ['GSE110894', 'GSE117872', 'GSE108383']:
            ap_score = average_precision_score(Y_test, sens_pb_results)

            report_dict = classification_report(Y_test, lb_results, output_dict=True)
            classification_report_df = pd.DataFrame(report_dict).T
            # classification_report_df.to_csv("saved/results/clf_report_" + reduce_model + args.predictor + prediction + select_drug + now + '.csv')
            sum_classification_report_df = pd.DataFrame()
            sum_classification_report_df = sum_classification_report_df.append(
                classification_report_df, ignore_index=False)

            try:
                auroc_score = roc_auc_score(Y_test, sens_pb_results)
            except ValueError:
                logging.warning("Only one class, no ROC")
                auroc_pret = auroc_umap = auroc_tsne = auroc_score = 0

            report_df['auroc_score'] = auroc_score
            report_df['ap_score'] = ap_score

            ap_title = "ap: " + str(Decimal(ap_score).quantize(Decimal('0.0000')))
            auroc_title = "roc: " + str(Decimal(auroc_score).quantize(Decimal('0.0000')))
            title_list = ["Ground truth", "Prediction", "Probability"]
        else:
            color_list = ["leiden", "sens_label", 'sens_preds']
            title_list = ['Cluster', "Prediction", "Probability"]
            color_score_list = color_list

        # Run UMAP dimension reduction over the PCA neighbor graph
        sc.pp.neighbors(adata)
        sc.tl.umap(adata)
        # Run leiden clustering
        # sc.tl.leiden(adata,resolution=leiden_res)
        # Plot umap
        # sc.pl.umap(adata,color=[color_list[0],'sens_label_umap','sens_preds_umap'],save=data_name+args.transfer+args.dimreduce+now,show=False,title=title_list)

        # Run embeddings using the transferred features
        sc.pp.neighbors(adata, use_rep='X_Trans', key_added="Trans")
        sc.tl.umap(adata, neighbors_key="Trans")
        sc.tl.leiden(adata, neighbors_key="Trans", key_added="leiden_trans", resolution=leiden_res)
        # sc.pl.umap(adata,color=color_list,neighbors_key="Trans",save=data_name+args.transfer+args.dimreduce+"_TL"+now,show=False,title=title_list)
        # sc.pl.umap(adata,color=color_score_list,neighbors_key="Trans",save=data_name+args.transfer+args.dimreduce+"_score_TL"+now,show=False,title=color_score_list)
        # This tsne is based on the transfer learning features
        # sc.pl.tsne(adata,color=color_list,neighbors_key="Trans",save=data_name+args.transfer+args.dimreduce+"_TL"+now,show=False,title=title_list)

        # Use the original t-SNE to visualize
        sc.tl.tsne(adata)
        # sc.pl.tsne(adata,color=[color_list[0],'sens_label_tsne','sens_preds_tsne'],save=data_name+args.transfer+args.dimreduce+"_original_tsne"+now,show=False,title=title_list)

        # Cluster the pretrained (autoencoder) embeddings
        sc.pp.neighbors(adata, use_rep='X_pre', key_added="Pret")
        sc.tl.umap(adata, neighbors_key="Pret")
        sc.tl.leiden(adata, neighbors_key="Pret", key_added="leiden_Pret", resolution=leiden_res)
        # sc.pl.umap(adata,color=[color_list[0],'sens_label_pret','sens_preds_pret'],neighbors_key="Pret",save=data_name+args.transfer+args.dimreduce+"_umap_Pretrain_"+now,show=False)

        # ARI between the transfer learning embeddings and the sensitivity labels
        ari_score_trans = adjusted_rand_score(adata.obs['leiden_trans'], adata.obs['sens_label'])
        ari_score = adjusted_rand_score(adata.obs['leiden'], adata.obs['sens_label'])
        pret_ari_score = adjusted_rand_score(adata.obs['leiden_origin'], adata.obs['leiden_Pret'])
        transfer_ari_score = adjusted_rand_score(adata.obs['leiden_origin'], adata.obs['leiden_trans'])
        # sc.pl.umap(adata,color=['leiden_origin','leiden_trans','leiden_Pret'],save=data_name+args.transfer+args.dimreduce+"_comp_Pretrain_"+now,show=False)
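        # ari_score_trans and ari_score compare clusterings against the
        # predicted sensitivity labels; the *_umap scores compare the
        # pretrained and transferred clusterings against the original leiden
        # clustering, quantifying how much transfer reshapes the embedding.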
        report_df['ari_score'] = ari_score
        report_df['ari_trans_score'] = ari_score_trans
        report_df['ari_pre_umap'] = pret_ari_score
        report_df['ari_trans_umap'] = transfer_ari_score

        # Two classes, sensitive and resistant, among the clustering labels
        cluster_ids = set(adata.obs['leiden'])

        # Trajectory of adata
        # adata = trajectory(adata, now=now)
        # Draw PDF
        # sc.pl.draw_graph(adata, color=['leiden', 'dpt_pseudotime'],save=data_name+args.dimreduce+"leiden+trajectory")
        # sc.pl.draw_graph(adata, color=['sens_preds', 'dpt_pseudotime_leiden_trans','leiden_trans'],save=data_name+args.dimreduce+"sens_preds+trajectory")

        # Save adata
        # adata.write("saved/adata/"+data_name+now+".h5ad")

        # Save the report and collect this cycle's scores
        report_df = report_df.T
        AP.append(ap_score)
        AUROC.append(auroc_score)
        Fone.append(report_dict['weighted avg']['f1-score'])
        cycletime = cycletime + 1

    # Save the per-cycle scores collected across all resampling runs
    AUROC2 = pd.DataFrame(AUROC)
    AP2 = pd.DataFrame(AP)
    Fonefinal = pd.DataFrame(Fone)
    AUROC2.to_csv("saved/results/AUROC2report" + reduce_model + args.predictor + prediction + select_drug + now + '.csv')
    Fonefinal.to_csv("saved/results/clf_report_Fonefinal" + reduce_model + args.predictor + prediction + select_drug + now + '.csv')
    AP2.to_csv("saved/results/clf_umap_report_AP2" + reduce_model + args.predictor + prediction + select_drug + now + '.csv')
    ################################################# END SECTION OF ANALYSIS AND POST PROCESSING #################################################
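# Downstream, the three CSVs written above (one row per resampling cycle) can
# be summarized to report mean and spread across runs, e.g. (hypothetical
# file name, not produced verbatim by this script):
#   scores = pd.read_csv("saved/results/AUROC2report<run_id>.csv", index_col=0)
#   print(scores.mean(), scores.std())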