import os

import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import umap
from sklearn import cluster
from sklearn.cluster import KMeans
from sklearn.manifold import TSNE
from torch.autograd import Variable
from torch.utils.data import DataLoader

import myData
import myEvaluation

# runPams (run parameters) and getVAEPams (model/optimizer/loss factory) are
# provided elsewhere in this project.


def train_test(label, data, Net, device, optimizer, lossFunc, opt):
    trainData, trainLabel, testData, testLabel = myData.separateData(
        label, data, sep=5)
    # ------------------------------- train --------------------------------
    dataSet = myData.MyDataset(trainData, trainLabel)
    dataLoader = DataLoader(dataset=dataSet,
                            batch_size=opt.batch_size,
                            shuffle=True,
                            num_workers=opt.n_cpu,
                            pin_memory=torch.cuda.is_available())
    for epoch in range(opt.n_epochs):
        for step, (x, y) in enumerate(dataLoader):
            b_x = Variable(x.to(device))  # batch data
            b_y = Variable(y.to(device))  # batch labels
            output = Net(b_x)
            loss = lossFunc(output, b_y)
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
    # ------------------------------- test ---------------------------------
    dataSet = myData.MyDataset(testData, testLabel)
    dataLoader = DataLoader(dataset=dataSet,
                            batch_size=opt.batch_size,
                            shuffle=False,
                            num_workers=opt.n_cpu,
                            pin_memory=torch.cuda.is_available())
    rmse = 0.0  # root mean square error
    acc = 0.0   # accuracy
    SE = 0.0    # sensitivity (recall)
    PC = 0.0    # precision
    F1 = 0.0    # F1 score
    JS = 0.0    # Jaccard similarity
    ytrue_ypred = list()
    length = 0
    for (x, y) in dataLoader:
        b_x = Variable(x.to(device))  # batch data
        b_y = Variable(y.to(device))  # batch labels
        outputs = torch.sigmoid(Net(b_x))
        predicted = torch.max(outputs.data, 1)[1].cpu()
        b_y = b_y.cpu()
        ytrue_ypred.append([b_y.numpy(), predicted.numpy()])
        rmse += myEvaluation.get_RMSE(b_y, predicted)
        acc += myEvaluation.get_accuracy(b_y, predicted)
        SE += myEvaluation.get_sensitivity(b_y, predicted)
        PC += myEvaluation.get_precision(b_y, predicted)
        F1 += myEvaluation.get_F1(b_y, predicted)
        JS += myEvaluation.get_JS(b_y, predicted)
        length += 1
    # average the per-batch metrics
    res = [round(r / length, 2) for r in [rmse, acc, SE, PC, F1, JS]]
    return res, ytrue_ypred
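# A minimal, hypothetical wiring of train_test, kept commented out so the
# module stays import-safe. The attribute names on opt (n_epochs, batch_size,
# n_cpu) match what train_test reads; the model class, learning rate, and loss
# choice below are assumptions, not part of this project's code.
#
#   import argparse
#   opt = argparse.Namespace(n_epochs=50, batch_size=32, n_cpu=4)
#   device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
#   net = MyClassifier().to(device)         # hypothetical model class
#   optimizer = torch.optim.Adam(net.parameters(), lr=1e-3)
#   lossFunc = torch.nn.CrossEntropyLoss()  # targets are class indices
#   # label and data are the full label vector and feature matrix
#   res, ytrue_ypred = train_test(label, data, net, device,
#                                 optimizer, lossFunc, opt)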
def fit_VAE(data, labels, net, device, opt):
    zn, xn, yn = data.size()
    # pair each sample with its true label so ytrue_ypred is meaningful
    dataSet = myData.MyDataset(data, labels)
    dataLoader = DataLoader(dataset=dataSet,
                            batch_size=opt.batch_size,
                            shuffle=False,
                            num_workers=opt.n_cpu,
                            pin_memory=torch.cuda.is_available())
    # -------------------------------- fit ---------------------------------
    ytrue_ypred = list()
    for (x, y) in dataLoader:
        b_x = Variable(x.view(-1, xn * yn).float().to(device))  # batch data
        b_y = Variable(y.to(device))                            # batch labels
        _, _, predicted = net(b_x)
        predicted = torch.max(predicted.data, 1)[1].cpu()
        b_y = b_y.cpu()
        ytrue_ypred.append([b_y.numpy(), predicted.numpy()])
    return ytrue_ypred
def train_test_VAE(data, net, device, optimizer, lossFunc, opt):
    zn, xn, yn = data.size()
    dataSet = myData.MyDataset(data, data)  # autoencoder: input is its own target
    dataLoader = DataLoader(dataset=dataSet,
                            batch_size=opt.batch_size,
                            shuffle=False,
                            num_workers=opt.n_cpu,
                            pin_memory=torch.cuda.is_available())
    # ------------------------------- train --------------------------------
    for epoch in range(opt.n_epochs):
        for step, (x, _) in enumerate(dataLoader):
            b_x = Variable(x.view(-1, xn * yn).float().to(device))  # batch data
            _, decoded, _ = net(b_x)
            loss = lossFunc(decoded, b_x)  # reconstruction loss
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            if step % 10 == 9:
                print('Epoch: ', epoch,
                      '| train loss: %.4f' % loss.data.cpu().numpy())
    # ------------------------------- test ---------------------------------
    predLabels = list()
    features = list()
    for (x, y) in dataLoader:
        b_x = Variable(x.view(-1, xn * yn).float().to(device))  # batch data
        feature, _, predicted = net(b_x)
        features.append([feature.cpu().detach().numpy()])
        predicted = torch.max(predicted.data, 1)[1].cpu().numpy()
        predLabels.append(predicted)
    features = np.hstack(features)
    zn, xn, yn = features.shape
    features = np.reshape(features, (xn, yn))
    predLabels = np.concatenate(predLabels)
    return features, predLabels
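# The AE/VAE trainers here compare the decoder output against the flattened
# input, so an element-wise reconstruction criterion fits. A hypothetical
# setup (the VAE class is defined elsewhere in this project; its constructor
# signature below is an assumption):
#
#   net = VAE(xn * yn, latent_dim=64).to(device)  # hypothetical signature
#   optimizer = torch.optim.Adam(net.parameters(), lr=opt.lr)
#   lossFunc = torch.nn.MSELoss()                 # reconstruction loss
#   features, predLabels = train_test_VAE(data, net, device,
#                                         optimizer, lossFunc, opt)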
def train_test_AE(data, net, device, optimizer, lossFunc, opt):
    zn, xn, yn = data.size()
    dataSet = myData.MyDataset(data, data)  # autoencoder: input is its own target
    dataLoader = DataLoader(dataset=dataSet,
                            batch_size=opt.batch_size,
                            shuffle=False,
                            num_workers=opt.n_cpu,
                            pin_memory=torch.cuda.is_available())
    # ------------------------------- train --------------------------------
    for epoch in range(opt.n_epochs):
        for step, (x, _) in enumerate(dataLoader):
            b_x = Variable(x.view(-1, xn * yn).float().to(device))  # batch data
            encoded, decoded, _ = net(b_x)
            loss = lossFunc(decoded, b_x)  # reconstruction loss
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            # if step % 10 == 9:
            #     print('Epoch: ', epoch,
            #           '| train loss: %.4f' % loss.data.cpu().numpy())
    # ------------------------------- test ---------------------------------
    predLabels = list()
    for (x, y) in dataLoader:
        b_x = Variable(x.view(-1, xn * yn).float().to(device))  # batch data
        _, _, label = net(b_x)
        predicted = torch.max(label.data, 1)[1].cpu()
        predLabels.append([predicted.numpy()])
    predLabels = np.concatenate(predLabels, axis=1)
    return predLabels
def train_save_VAE(data, labels, net, device, optimizer, lossFunc, opt,
                   pathName, fileName, saveModel=0):
    # Trains a VAE with LR scheduling on the training split and optionally
    # saves the whole model; there is no test phase, so the name train_save_VAE
    # avoids colliding with train_test_VAE above.
    torch.manual_seed(16)
    scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, 'min')
    zn, xn, yn = data.size()
    trainData, trainLabel, testData, testLabel = myData.separateData(
        labels, data, sep=5)
    dataSet = myData.MyDataset(trainData, trainLabel)
    dataLoader = DataLoader(dataset=dataSet,
                            batch_size=opt.batch_size,
                            shuffle=True,
                            num_workers=opt.n_cpu,
                            pin_memory=torch.cuda.is_available())
    # ------------------------------- train --------------------------------
    for epoch in range(opt.n_epochs):
        c = 0
        loss_total = 0.0
        for step, (x, y) in enumerate(dataLoader):
            b_x = Variable(x.view(-1, xn * yn).float().to(device))  # batch data
            _, decoded, _ = net(b_x)
            loss = lossFunc(decoded, b_x)
            loss_total = loss_total + loss.data.cpu().numpy()
            c = c + 1
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
        loss_total = loss_total / c  # mean batch loss for this epoch
        if epoch % 100 == 99:
            print('Epoch: ', epoch, '| train loss: %.4f' % loss_total)
        scheduler.step(loss_total)  # reduce LR when the loss plateaus
    # save the whole model (module + weights), not just a state_dict
    if saveModel == 1:
        if not os.path.exists(pathName):
            os.makedirs(pathName)
        torch.save(net, pathName + fileName)
    return 1
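# Because train_save_VAE saves the entire nn.Module with torch.save(net, ...),
# reloading it for inference with fit_VAE could look like the sketch below
# (path and data names are placeholders):
#
#   net = torch.load(pathName + fileName, map_location=device)
#   net.eval()
#   ytrue_ypred = fit_VAE(testData, testLabel, net, device, opt)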
def train_test_GAN(data, device, lossFunc, opt, net_G, g_optimizer, net_D,
                   d_optimizer, d_steps=16, g_steps=8):
    zn, xn, yn = data.size()
    dataSet = myData.MyDataset(data, data)
    dataLoader = DataLoader(dataset=dataSet,
                            batch_size=opt.batch_size,
                            shuffle=False,
                            num_workers=opt.n_cpu,
                            pin_memory=torch.cuda.is_available())
    for epoch in range(opt.n_epochs):
        # 1. Train D on real + fake
        for d_index in range(d_steps):
            for i, (x, _) in enumerate(dataLoader):
                net_D.zero_grad()
                # 1A: train D on real data (target = ones)
                d_real_data = Variable(x.view(-1, xn * yn).float().to(device))
                d_real_decision, _ = net_D(d_real_data)
                d_real_loss = lossFunc(
                    d_real_decision,
                    Variable(torch.ones_like(d_real_decision).to(device)))
                d_real_loss.backward()  # accumulate gradients; don't step yet
                # 1B: train D on fake data (target = zeros)
                d_gen_input = Variable(torch.randn(100).to(device))  # 100-dim noise
                # detach so these samples don't train G
                d_fake_data = net_G(d_gen_input).detach()
                d_fake_decision, _ = net_D(d_fake_data)
                d_fake_loss = lossFunc(
                    d_fake_decision,
                    Variable(torch.zeros_like(d_fake_decision).to(device)))
                d_fake_loss.backward()
                # update D only, using the gradients stored by both backward() calls
                d_optimizer.step()
        # 2. Train G on D's response (do NOT update D here)
        for g_index in range(g_steps):
            net_G.zero_grad()
            gen_input = Variable(torch.randn(100).to(device))
            g_fake_data = net_G(gen_input)
            dg_fake_decision, _ = net_D(g_fake_data)
            # target = ones: G tries to make D call its output genuine
            g_loss = lossFunc(
                dg_fake_decision,
                Variable(torch.ones_like(dg_fake_decision).to(device)))
            g_loss.backward()
            g_optimizer.step()  # update G only
    # ------------------------------- test ---------------------------------
    predLabels = list()
    for (x, _) in dataLoader:
        x = Variable(x.view(-1, xn * yn).float().to(device))  # batch data
        _, label = net_D(x)
        predicted = torch.max(label.data, 1)[1].cpu().numpy()
        predLabels.append([predicted])
    predLabels = np.concatenate(predLabels, axis=1)
    return predLabels
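# In train_test_GAN the discriminator targets are all-ones for real batches
# and all-zeros for fake ones, so a binary criterion is the natural lossFunc.
# A hypothetical wiring, assuming net_D's decision output is already
# sigmoid-activated (optimizer choice and learning rate are assumptions):
#
#   lossFunc = torch.nn.BCELoss()
#   d_optimizer = torch.optim.Adam(net_D.parameters(), lr=opt.lr)
#   g_optimizer = torch.optim.Adam(net_G.parameters(), lr=opt.lr)
#   predLabels = train_test_GAN(data, device, lossFunc, opt,
#                               net_G, g_optimizer, net_D, d_optimizer)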
def train_test_FCN(data, labels, net, device, optimizer, lossFunc, opt):
    zn, xn, yn = data.size()
    dataSet = myData.MyDataset(data, labels)
    dataLoader = DataLoader(dataset=dataSet,
                            batch_size=opt.batch_size,
                            shuffle=True,
                            num_workers=opt.n_cpu,
                            pin_memory=torch.cuda.is_available())
    # ------------------------------- train --------------------------------
    for epoch in range(opt.n_epochs):
        for step, (x, y) in enumerate(dataLoader):
            b_x = Variable(x.to(device))  # batch data
            b_y = Variable(y.to(device))  # batch labels
            output = net(b_x)
            loss = lossFunc(output, b_y)
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            if step % 10 == 9:
                print('Epoch: ', epoch,
                      '| train loss: %.4f' % loss.data.cpu().numpy())
    # ------------------------------- test ---------------------------------
    dataSet = myData.MyDataset(data, labels)
    dataLoader = DataLoader(dataset=dataSet,
                            batch_size=opt.batch_size,
                            shuffle=False,
                            num_workers=opt.n_cpu,
                            pin_memory=torch.cuda.is_available())
    acc = 0.0  # accuracy (the only metric kept; see train_test for the others)
    ytrue_ypred = list()
    length = 0
    for (x, y) in dataLoader:
        b_x = Variable(x.to(device))  # batch data
        b_y = Variable(y.to(device))  # batch labels
        outputs = torch.sigmoid(net(b_x))
        predicted = torch.max(outputs.data, 1)[1].cpu()
        b_y = b_y.cpu()
        ytrue_ypred.append([b_y.numpy(), predicted.numpy()])
        acc += myEvaluation.get_accuracy(b_y, predicted)
        # rmse += myEvaluation.get_RMSE(b_y, predicted)
        # SE += myEvaluation.get_sensitivity(b_y, predicted)
        # PC += myEvaluation.get_precision(b_y, predicted)
        # F1 += myEvaluation.get_F1(b_y, predicted)
        # JS += myEvaluation.get_JS(b_y, predicted)
        length += 1
    return acc / length, ytrue_ypred
def main():
    # select CPU or GPU
    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
    torch.manual_seed(16)

    # ----------------------------- preprocess -----------------------------
    # annotation: binarize loh_percent (>= 0.05) and mutations_per_mb (>= 28);
    # NaN entries fall through both branches and stay NaN
    an = pd.read_csv(
        "/N/project/zhangclab/pengtao/myProjectsDataRes/20200619Classification/data/annotation.csv",
        index_col=0)
    tmp = an["loh_percent"].copy()
    for i in range(len(tmp)):
        if tmp.iloc[i] >= 0.05:
            tmp.iloc[i] = 1
        elif tmp.iloc[i] < 0.05:
            tmp.iloc[i] = 0
    an["loh_percent"] = tmp
    tmp2 = an["mutations_per_mb"].copy()
    for i in range(len(tmp2)):
        if tmp2.iloc[i] >= 28:
            tmp2.iloc[i] = 1
        elif tmp2.iloc[i] < 28:
            tmp2.iloc[i] = 0
    an["mutations_per_mb"] = tmp2

    # expression data: log2(counts + 1), keep the most variable genes
    x = pd.read_csv(
        "/N/project/zhangclab/pengtao/myProjectsDataRes/20200619Classification/data/salmonE74cDNA_counts_baseline.csv",
        index_col=0)
    x = x.T
    x = (x + 1).apply(np.log2)
    x_std = np.std(x, axis=0)
    top_gene = runPams.n_top_gene
    top_gene_idx = x_std.argsort()[::-1][0:top_gene]  # most variable genes first
    data = x.iloc[:, top_gene_idx]
    data = data.values.copy()
    top_gene_names = list(x.columns[top_gene_idx])
    top_gene_names = np.insert(top_gene_names, 0, "bias")
    xn, yn = data.shape

    # --------------------------- UMAP + clustering ------------------------
    pams = str(runPams.k) + "_" + str(runPams.n_top_gene)
    pathName = "/N/project/zhangclab/pengtao/myProjectsDataRes/20200619Classification/results/" + pams + "/imgs_UMAP/"
    reducer = umap.UMAP()
    z = reducer.fit_transform(data)
    # k-means
    kmeans = KMeans(n_clusters=4, random_state=0).fit(z)
    imgName = "kmeans.png"
    myData.myDraw(z, kmeans.labels_, pathName, imgName)
    # hierarchical clustering
    clst = cluster.AgglomerativeClustering(n_clusters=4)
    imgName = "Hierarchical_Clustering.png"
    myData.myDraw(z, clst.fit_predict(z), pathName, imgName)
    # color the embedding by each annotation column
    for i in range(1, len(an.columns)):
        a = an.columns[i]
        imgName = str(a) + ".png"
        myData.myDraw(z, an[a], pathName, imgName)

    # --------------------------- t-SNE + clustering -----------------------
    pathName = "/N/project/zhangclab/pengtao/myProjectsDataRes/20200619Classification/results/" + pams + "/imgs_TSNE/"
    z = TSNE(n_components=2).fit_transform(data)
    # k-means
    kmeans = KMeans(n_clusters=4, random_state=0).fit(z)
    imgName = "kmeans.png"
    myData.myDraw(z, kmeans.labels_, pathName, imgName)
    # hierarchical clustering
    clst = cluster.AgglomerativeClustering(n_clusters=4)
    imgName = "Hierarchical_Clustering.png"
    myData.myDraw(z, clst.fit_predict(z), pathName, imgName)
    for i in range(1, len(an.columns)):
        a = an.columns[i]
        imgName = str(a) + ".png"
        myData.myDraw(z, an[a], pathName, imgName)

    # ------------------------------ VAE + DNP -----------------------------
    data = np.reshape(data, (xn, 1, yn))
    data = np.insert(data, 0, 1, axis=2)  # prepend a constant bias feature
    zn, xn, yn = data.shape
    # set S: selected features (starts with only the bias)
    set_s = np.zeros(xn * yn)
    set_s[0] = 1
    # set C: candidate features (everything except the bias)
    set_c = np.ones(xn * yn)
    set_c[0] = 0
    # numpy -> tensor
    data = torch.tensor(data)
    dataSet = myData.MyDataset(data, data)
    dataLoader = DataLoader(dataset=dataSet,
                            batch_size=runPams.batch_size,
                            shuffle=False,
                            num_workers=runPams.n_cpu,
                            pin_memory=torch.cuda.is_available())
    net, optimizer, lossFunc = getVAEPams(xn, yn, device, runPams.lr)
    # numpy -> tensor, on the training device
    set_s = torch.tensor(set_s).float().to(device)
    set_c = torch.tensor(set_c).float().to(device)
    # train: greedily grow set S until k features (plus the bias) are selected
    while torch.sum(set_s == 1).item() < (runPams.k + 1):
        print(torch.sum(set_s == 1).item())
        for _ in range(runPams.epoch):
            for step, (x, _) in enumerate(dataLoader):
                b_x = Variable(x.view(-1, xn * yn).float().to(device))
                b_y = Variable(x.view(-1, xn * yn).float().to(device))
                # zero the input weights of unselected features (set C),
                # keep the weights of selected features (set S)
                net.fc1.weight.data = net.fc1.weight.data * set_s
                _, decoded, _ = net(b_x)
                loss = lossFunc(decoded, b_y)  # mean square error
                optimizer.zero_grad()  # clear gradients for this training step
                loss.backward()        # backpropagation, compute gradients
                optimizer.step()       # apply gradients
        print(net.fc1.weight.grad)
        # pick the next feature J from set C by its gradient magnitude
        newJ = getNewJ(net.fc1.weight.grad.clone(), set_c, device).item()
        print(newJ)
        # initialize the weights of node J with Xavier initialization
        tmpWeight = torch.rand(1, net.fc1.out_features)
        tmpWeight = nn.init.xavier_normal_(tmpWeight)
        net.fc1.weight.data[:, newJ] = tmpWeight
        # move J from set C to set S
        set_s[newJ] = 1
        set_c[newJ] = 0

    # ------------------------------- test ---------------------------------
    predLabelsByVAE = list()
    features = list()
    for (x, _) in dataLoader:
        b_x = Variable(x.view(-1, xn * yn).float().to(device))  # batch data
        feature, _, predicted = net(b_x)
        features.append([feature.cpu().detach().numpy()])
        predicted = torch.max(predicted.data, 1)[1].cpu().numpy()
        predLabelsByVAE.append(predicted)
    features = np.hstack(features)
    zn, xn, yn = features.shape
    features = np.reshape(features, (xn, yn))
    z = features
    pams = str(runPams.k) + "_" + str(runPams.n_top_gene)
    pathName = "/N/project/zhangclab/pengtao/myProjectsDataRes/20200619Classification/results/" + pams + "/imgs_VAE+DNP/"
    # k-means on the learned features
    kmeans = KMeans(n_clusters=4, random_state=0).fit(z)
    imgName = "kmeans.png"
    myData.myDraw(z, kmeans.labels_, pathName, imgName)
    # hierarchical clustering on the learned features
    clst = cluster.AgglomerativeClustering(n_clusters=4)
    imgName = "Hierarchical_Clustering.png"
    myData.myDraw(z, clst.fit_predict(z), pathName, imgName)
    for i in range(1, len(an.columns)):
        a = an.columns[i]
        imgName = str(a) + ".png"
        myData.myDraw(z, an[a], pathName, imgName)

    # save the selected-gene indicator vector under the gene names
    pathName = "/N/project/zhangclab/pengtao/myProjectsDataRes/20200619Classification/results/" + pams + "/genes_selected.csv"
    genes = pd.DataFrame(set_s.cpu().detach().numpy())
    genes = genes.T
    genes.columns = top_gene_names
    genes.to_csv(pathName)
    '''
    kmeans_estimator = KMeans(n_clusters=4, random_state=0).fit(features)
    labelByVAEKmeans = kmeans_estimator.labels_
    # marker codes: 'o' draws a circle, 'r' colors it red, and so on
    mark = ['or', 'ob', 'og', 'ok', '^r', '+r', 'sr', 'dr', '<r', 'pr']
    for i in range(len(labelByVAEKmeans)):
        plt.plot([features[i, 0]], [features[i, 1]],
                 mark[labelByVAEKmeans[i]], markersize=5)
    # save data
    pathName = "/N/project/zhangclab/pengtao/myProjectsDataRes/20200702AE_DNP/results/csv_img_res/"
    fileName = pathName + str(runPams.k) + ".png"
    plt.savefig(fileName)
    fileName = pathName + str(runPams.k) + ".csv"
    setS = pd.DataFrame(set_s.cpu().detach().numpy())
    setS = setS.T
    setS.to_csv(fileName)
    # plt.show()
    '''
    return ()
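# main() relies on getNewJ to pick the next input feature in the DNP-style
# selection loop above. That helper is defined elsewhere in the project; the
# function below is a hypothetical reference sketch of the usual DNP rule:
# among the still-unselected features (set_c == 1), take the column of fc1's
# weight gradient with the largest L2 norm.
def getNewJ_sketch(grad, set_c, device):
    # grad: [out_features, in_features] gradient of the first layer's weights
    # set_c: [in_features] mask; 1 marks a candidate (unselected) feature
    # device is kept only to mirror the project's call signature
    col_norms = torch.norm(grad, p=2, dim=0)  # per-feature gradient magnitude
    col_norms = col_norms * set_c             # ignore features already in set S
    return torch.argmax(col_norms)            # index of the next feature J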