Example #1
# Common imports assumed by the examples below:
import os
import numpy as np
import torch
from torch.autograd import Variable  # legacy wrapper; a no-op since PyTorch 0.4
from torch.utils.data import DataLoader

import myData        # project-local data helpers (not shown)
import myEvaluation  # project-local metrics (not shown)
def train_test(label, data, Net, device, optimizer, lossFunc, opt):
    trainData, trainLabel, testData, testLabel = myData.separateData(label,
                                                                     data,
                                                                     sep=5)
    # -------------------------------train-------------------------------------
    dataSet = myData.MyDataset(trainData, trainLabel)
    dataLoader = DataLoader(dataset=dataSet,
                            batch_size=opt.batch_size,
                            shuffle=True,
                            num_workers=opt.n_cpu,
                            pin_memory=torch.cuda.is_available())
    # train start
    for epoch in range(opt.n_epochs):
        for step, (x, y) in enumerate(dataLoader):
            b_x = Variable(x.to(device))  # batch data
            b_y = Variable(y.to(device))  # batch label
            output = Net(b_x)
            loss = lossFunc(output, b_y)
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
    # train end

    # ----------------------------------------------------------test-----------------------------------------
    # test start
    dataSet = myData.MyDataset(testData, testLabel)
    dataLoader = DataLoader(dataset=dataSet,
                            batch_size=opt.batch_size,
                            shuffle=True,
                            num_workers=opt.n_cpu,
                            pin_memory=torch.cuda.is_available())
    rmse = 0.0  # root mean square error
    acc = 0.0  # Accuracy
    SE = 0.0  # Sensitivity (Recall)
    PC = 0.0  # Precision
    F1 = 0.0  # F1 Score
    JS = 0.0  # Jaccard Similarity
    ytrue_ypred = list()
    length = 0
    for (x, y) in dataLoader:
        b_x = Variable(x.to(device))  # batch x (data)
        b_y = Variable(y.to(device))  # batch y (label)
        outputs = torch.sigmoid(Net(b_x))
        predicted = torch.max(outputs.data, 1)[1].cpu()
        b_y = b_y.cpu()
        ytrue_ypred.append([b_y.numpy(), predicted.numpy()])
        rmse += myEvaluation.get_RMSE(b_y, predicted)
        acc += myEvaluation.get_accuracy(b_y, predicted)
        SE += myEvaluation.get_sensitivity(b_y, predicted)
        PC += myEvaluation.get_precision(b_y, predicted)
        F1 += myEvaluation.get_F1(b_y, predicted)
        JS += myEvaluation.get_JS(b_y, predicted)
        length += 1
    # test end

    res = [rmse, acc, SE, PC, F1, JS]
    res = [round(r / length, 2) for r in res]
    # res = ','.join(str(i) for i in res)
    return (res, ytrue_ypred)
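
The myEvaluation module used above is project-local and not shown. A minimal sketch of what its batch metrics might look like; the signatures and the binary 0/1 label assumption are guesses, not the project's actual code:

def get_accuracy(y_true, y_pred):
    # fraction of matching labels in the batch
    return (y_true == y_pred).float().mean().item()

def get_sensitivity(y_true, y_pred):
    # recall = TP / (TP + FN), assuming binary 0/1 labels
    tp = ((y_pred == 1) & (y_true == 1)).sum().item()
    fn = ((y_pred == 0) & (y_true == 1)).sum().item()
    return tp / (tp + fn + 1e-8)

def get_precision(y_true, y_pred):
    # precision = TP / (TP + FP)
    tp = ((y_pred == 1) & (y_true == 1)).sum().item()
    fp = ((y_pred == 1) & (y_true == 0)).sum().item()
    return tp / (tp + fp + 1e-8)
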
Example #2
def fit_VAE(data, labels, net, device, opt):
    zn, xn, yn = data.size()
    dataSet = myData.MyDataset(data, data)
    dataLoader = DataLoader(dataset=dataSet,
                            batch_size=opt.batch_size,
                            shuffle=False,
                            num_workers=opt.n_cpu,
                            pin_memory=torch.cuda.is_available())
    # ----------------------------------------------------------fit-----------------------------------------
    # fit start
    ytrue_ypred = list()
    for (x, y) in dataLoader:
        b_x = Variable(x.view(-1, xn * yn).float().to(device))  # batch data
        b_y = Variable(y.to(device))  # batch y (label)
        _, _, predicted = net(b_x)
        predicted = torch.max(predicted.data, 1)[1].cpu()
        b_y = b_y.cpu()
        ytrue_ypred.append([b_y.numpy(), predicted.numpy()])
    return (ytrue_ypred)
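
myData.MyDataset is likewise project-local. A minimal sketch of the interface the loaders above rely on (an assumption; the real class may do more):

from torch.utils.data import Dataset

class MyDataset(Dataset):
    def __init__(self, data, labels):
        self.data = data
        self.labels = labels

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        # return one (sample, label) pair
        return self.data[idx], self.labels[idx]
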
Example #3
def train_test_VAE(data, net, device, optimizer, lossFunc, opt):
    zn, xn, yn = data.size()
    dataSet = myData.MyDataset(data, data)
    dataLoader = DataLoader(dataset=dataSet,
                            batch_size=opt.batch_size,
                            shuffle=False,
                            num_workers=opt.n_cpu,
                            pin_memory=torch.cuda.is_available())
    # train start
    for epoch in range(opt.n_epochs):
        for step, (x, _) in enumerate(dataLoader):
            b_x = Variable(x.view(-1, xn * yn).float().to(device))  # batch data
            _, decoded, _ = net(b_x)
            loss = lossFunc(decoded, b_x)
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            if step % 10 == 9:
                print('Epoch: ', epoch,
                      '| train loss: %.4f' % loss.data.cpu().numpy())
    # train end

    # ----------------------------------------------------------test-----------------------------------------
    # test start

    predLabels = list()
    features = list()
    for (x, y) in dataLoader:
        b_x = Variable(x.view(-1, xn * yn).float().to(device))  # batch x (data)
        feature, _, predicted = net(b_x)
        features.append([feature.cpu().detach().numpy()])
        predicted = torch.max(predicted.data, 1)[1].cpu().numpy()
        predLabels.append(predicted)
    # test end

    features = np.hstack(features)
    zn, xn, yn = features.shape
    features = np.reshape(features, (xn, yn))
    predLabels = np.concatenate(predLabels)
    return (features, predLabels)
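
The net passed in above is assumed to return a (feature, decoded, logits) triple from forward(). A minimal autoencoder-shaped sketch of that interface; the dimensions and the plain-AE structure are placeholders, not the project's actual VAE:

import torch.nn as nn

class TinyTripleNet(nn.Module):
    def __init__(self, n_in, n_hidden=64, n_classes=4):
        super().__init__()
        self.fc1 = nn.Linear(n_in, n_hidden)            # encoder
        self.decode = nn.Linear(n_hidden, n_in)         # decoder
        self.classify = nn.Linear(n_hidden, n_classes)  # label head

    def forward(self, x):
        feature = torch.relu(self.fc1(x))
        return feature, self.decode(feature), self.classify(feature)
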
Example #4
def train_test_AE(data, net, device, optimizer, lossFunc, opt):
    zn, xn, yn = data.size()
    dataSet = myData.MyDataset(data, data)
    dataLoader = DataLoader(dataset=dataSet,
                            batch_size=opt.batch_size,
                            shuffle=False,
                            num_workers=opt.n_cpu,
                            pin_memory=torch.cuda.is_available())
    # train start
    for epoch in range(opt.n_epochs):
        for step, (x, _) in enumerate(dataLoader):
            b_x = Variable(x.view(-1, xn * yn).float().to(device))  # batch data
            encoded, decoded, _ = net(b_x)
            loss = lossFunc(decoded, b_x)
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            '''
            if step % 10 == 9:
                print('Epoch: ', epoch, '| train loss: %.4f' % loss.data.numpy())
            '''
    # train end

    # ----------------------------------------------------------test-----------------------------------------
    # test start

    predLabels = list()
    for (x, y) in dataLoader:
        b_x = Variable(x.view(-1, xn * yn).float().to(device))  # batch x (data)
        _, _, label = net(b_x)
        predicted = torch.max(label.data, 1)[1].cpu()
        predLabels.append([predicted.numpy()])

    # test end
    predLabels = np.concatenate(predLabels, axis=1)
    # res = ','.join(str(i) for i in res)
    return (predLabels)
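
A hypothetical call sketch for train_test_AE; the opt fields, the toy data shape, and TinyTripleNet from the sketch above are all assumptions:

from argparse import Namespace

opt = Namespace(batch_size=32, n_cpu=0, n_epochs=10)  # assumed option names
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
data = torch.rand(100, 1, 50)  # toy (zn, xn, yn) input
net = TinyTripleNet(n_in=1 * 50).to(device)
optimizer = torch.optim.Adam(net.parameters(), lr=1e-3)
predLabels = train_test_AE(data, net, device, optimizer, nn.MSELoss(), opt)
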
Example #5
def train_test_VAE(data, labels, net, device, optimizer, lossFunc, opt, pathName, fileName, saveModel=0):
    torch.manual_seed(16)
    scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, 'min')
    zn, xn, yn = data.size()
    trainData, trainLabel, testData, testLabel = myData.separateData(labels, data, sep=5)
    dataSet = myData.MyDataset(trainData, trainLabel)
    dataLoader = DataLoader(dataset=dataSet,
                            batch_size=opt.batch_size,
                            shuffle=True,
                            num_workers=opt.n_cpu,
                            pin_memory=torch.cuda.is_available())
    # train start
    for epoch in range(opt.n_epochs):
        c = 0
        loss_total = 0
        for step, (x, y) in enumerate(dataLoader):
            b_x = Variable(x.view(-1, xn * yn).float().to(device))  # batch data
            _, decoded, _ = net(b_x)
            loss = lossFunc(decoded, b_x)
            loss_total = loss_total + loss.data.cpu().numpy()
            c = c + 1
            optimizer.zero_grad()
            loss.backward()
            optimizer.step() 
        loss_total = loss_total / c
        if epoch % 100 == 99:
            print('Epoch: ', epoch, '| train loss: %.4f' % loss_total)
        scheduler.step(loss_total)
    # train end

    # save net
    if saveModel == 1:
        if not os.path.exists(pathName):
            os.makedirs(pathName)
        fileName = pathName + fileName
        torch.save(net, fileName)
    return (1)
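
Since torch.save above pickles the whole module, reloading it later is a single call. A short sketch (map_location is a convenience for CPU-only machines; on PyTorch >= 2.6 loading a fully pickled module also needs weights_only=False):

# reload the model saved by train_test_VAE; path constructed as in the caller
net = torch.load(pathName + fileName, map_location=device)
net.eval()
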
Example #6
def train_test_GAN(data,
                   device,
                   lossFunc,
                   opt,
                   net_G,
                   g_optimizer,
                   net_D,
                   d_optimizer,
                   d_steps=16,
                   g_steps=8):
    zn, xn, yn = data.size()
    dataSet = myData.MyDataset(data, data)
    dataLoader = DataLoader(dataset=dataSet,
                            batch_size=opt.batch_size,
                            shuffle=False,
                            num_workers=opt.n_cpu,
                            pin_memory=torch.cuda.is_available())

    for epoch in range(opt.n_epochs):
        for d_index in range(d_steps):
            for i, (x, _) in enumerate(dataLoader):
                # 1. Train D on real+fake
                net_D.zero_grad()

                #  1A: Train D on real
                d_real_data = Variable(x.view(-1, xn * yn).float().to(device))
                d_real_decision, _ = net_D(d_real_data)
                d_real_loss = lossFunc(
                    d_real_decision,
                    Variable(torch.ones_like(d_real_decision).to(device)))  # ones = real
                d_real_loss.backward()  # compute/store gradients, but don't change params yet

                #  1B: Train D on fake
                d_gen_input = Variable(torch.randn(100).to(device))
                d_fake_data = net_G(d_gen_input).detach()  # detach to avoid training G on these labels
                d_fake_decision, _ = net_D(d_fake_data)
                d_fake_loss = lossFunc(
                    d_fake_decision,
                    Variable(torch.zeros_like(d_fake_decision)).to(device))  # zeros = fake
                d_fake_loss.backward()
                d_optimizer.step()  # only optimizes D's params, using gradients from both backward() calls

        for g_index in range(g_steps):
            # 2. Train G on D's response (but DO NOT train D on these labels)
            net_G.zero_grad()

            gen_input = Variable(torch.randn(100).to(device))
            g_fake_data = net_G(gen_input)
            dg_fake_decision, _ = net_D(g_fake_data)
            g_loss = lossFunc(
                dg_fake_decision,
                Variable(torch.ones_like(dg_fake_decision).to(device)))  # we want to fool D, so pretend the fakes are genuine
            g_loss.backward()
            g_optimizer.step()  # Only optimizes G's parameters

    # train end

    # ----------------------------------------------------------test-----------------------------------------
    # test start

    predLabels = list()
    for (x, _) in dataLoader:
        x = Variable(x.view(-1, xn * yn).float().to(device))  # batch x (data)
        _, label = net_D(x)
        predicted = torch.max(label.data, 1)[1].cpu().numpy()
        predLabels.append([predicted])
    # test end
    predLabels = np.concatenate(predLabels, axis=1)
    return (predLabels)
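
net_D above is assumed to return a (real/fake decision, class logits) pair, and net_G to map a 100-dim noise vector to a flat sample. Minimal sketches of those interfaces; the sizes and activations are placeholders:

import torch.nn as nn

class TinyG(nn.Module):
    def __init__(self, n_out, n_noise=100):
        super().__init__()
        self.fc = nn.Linear(n_noise, n_out)

    def forward(self, z):
        return torch.tanh(self.fc(z))

class TinyD(nn.Module):
    def __init__(self, n_in, n_classes=4):
        super().__init__()
        self.body = nn.Linear(n_in, 64)
        self.decision = nn.Linear(64, 1)           # real/fake score
        self.classify = nn.Linear(64, n_classes)   # label head

    def forward(self, x):
        h = torch.relu(self.body(x))
        return torch.sigmoid(self.decision(h)), self.classify(h)
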
Example #7
def train_test_FCN(data, labels, net, device, optimizer, lossFunc, opt):
    zn, xn, yn = data.size()
    dataSet = myData.MyDataset(data, labels)
    dataLoader = DataLoader(dataset=dataSet,
                            batch_size=opt.batch_size,
                            shuffle=True,
                            num_workers=opt.n_cpu,
                            pin_memory=torch.cuda.is_available())
    # train start
    for epoch in range(opt.n_epochs):
        for step, (x, y) in enumerate(dataLoader):
            #b_x = Variable(x.view(-1, xn * yn).float().to(device))  # batch data
            b_x = Variable(x.to(device))
            b_y = Variable(y.to(device))  # batch label
            output = net(b_x)
            loss = lossFunc(output, b_y)
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            if step % 10 == 9:
                print('Epoch: ', epoch,
                      '| train loss: %.4f' % loss.data.cpu().numpy())
    # train end

    # ----------------------------------------------------------test-----------------------------------------
    # test start
    dataSet = myData.MyDataset(data, labels)
    dataLoader = DataLoader(dataset=dataSet,
                            batch_size=opt.batch_size,
                            shuffle=False,
                            num_workers=opt.n_cpu,
                            pin_memory=torch.cuda.is_available())

    rmse = 0.0  # root mean square error
    acc = 0.0  # Accuracy
    SE = 0.0  # Sensitivity (Recall)
    PC = 0.0  # Precision
    F1 = 0.0  # F1 Score
    JS = 0.0  # Jaccard Similarity
    ytrue_ypred = list()
    length = 0
    for (x, y) in dataLoader:
        b_x = Variable(x.to(device))  # batch data
        b_y = Variable(y.to(device))  # batch y (label)
        outputs = torch.sigmoid(net(b_x))
        predicted = torch.max(outputs.data, 1)[1].cpu()

        b_y = b_y.cpu()
        ytrue_ypred.append([b_y.numpy(), predicted.numpy()])
        acc += myEvaluation.get_accuracy(b_y, predicted)
        '''
        rmse += myEvaluation.get_RMSE(b_y, predicted)
        
        SE += myEvaluation.get_sensitivity(b_y, predicted)
        PC += myEvaluation.get_precision(b_y, predicted)
        F1 += myEvaluation.get_F1(b_y, predicted)
        JS += myEvaluation.get_JS(b_y, predicted)
        '''
        length += 1
    # test end

    #res = [rmse, acc, SE, PC, F1, JS]
    #res = [round(r / length, 2) for r in res]
    # res = ','.join(str(i) for i in res)
    return (acc / length, ytrue_ypred)

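A small post-processing sketch for the returned ytrue_ypred batches, flattening them into flat arrays; this is a convenience, not part of the original code:

acc, ytrue_ypred = train_test_FCN(data, labels, net, device, optimizer, lossFunc, opt)
y_true = np.concatenate([b[0] for b in ytrue_ypred])
y_pred = np.concatenate([b[1] for b in ytrue_ypred])
print(acc, (y_true == y_pred).mean())
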
Example #8
def main():
    # select CPU or GPU
    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
    torch.manual_seed(16)

    # preprocess data

    # annotation
    an = pd.read_csv(
        "/N/project/zhangclab/pengtao/myProjectsDataRes/20200619Classification/data/annotation.csv",
        index_col=0)
    tmp = an["loh_percent"].copy()
    for i in range(len(tmp)):
        if tmp[i] >= 0.05:
            tmp[i] = 1
        else:
            if tmp[i] < 0.05:
                tmp[i] = 0
    an["loh_percent"] = tmp

    tmp2 = an["mutations_per_mb"].copy()
    for i in range(len(tmp2)):
        if tmp2[i] >= 28:
            tmp2[i] = 1
        else:
            if tmp2[i] < 28:
                tmp2[i] = 0
    an["mutations_per_mb"] = tmp2

    #data
    x = pd.read_csv(
        "/N/project/zhangclab/pengtao/myProjectsDataRes/20200619Classification/data/salmonE74cDNA_counts_baseline.csv",
        index_col=0)
    x = x.T
    x = (x + 1).apply(np.log2)
    #test = np.median(x, axis=0)
    x_std = np.std(x, axis=0)
    top_gene = runPams.n_top_gene
    # positions of the highest-variance genes, in descending order
    top_gene_idx = x_std.values.argsort()[::-1][0:top_gene]
    data = x.iloc[:, top_gene_idx]
    data = data.values.copy()
    top_gene_names = list(x.columns[top_gene_idx])
    top_gene_names = np.insert(top_gene_names, 0, "bias")

    #data = np.random.rand(10, 200)
    xn, yn = data.shape

    # umap + kmeans
    pams = str(runPams.k) + "_" + str(runPams.n_top_gene)
    pathName = "/N/project/zhangclab/pengtao/myProjectsDataRes/20200619Classification/results/" + pams + "/imgs_UMAP/"
    # umap
    reducer = umap.UMAP()
    z = reducer.fit_transform(data)
    # kmeans
    kmeans = KMeans(n_clusters=4, random_state=0).fit(z)

    imgName = "kmeans.png"
    myData.myDraw(z, kmeans.labels_, pathName, imgName)

    # Hierarchical Clustering
    clst = cluster.AgglomerativeClustering(n_clusters=4)

    imgName = "Hierarchical_Clustering.png"
    myData.myDraw(z, clst.fit_predict(z), pathName, imgName)

    for i in range(1, len(an.columns)):
        a = an.columns[i]
        imgName = str(a) + ".png"
        myData.myDraw(z, an[a], pathName, imgName)

    # TSNE
    pathName = "/N/project/zhangclab/pengtao/myProjectsDataRes/20200619Classification/results/" + pams + "/imgs_TSNE/"

    # t-SNE
    z = TSNE(n_components=2).fit_transform(data)
    # kmeans
    kmeans = KMeans(n_clusters=4, random_state=0).fit(z)

    imgName = "kmeans.png"
    myData.myDraw(z, kmeans.labels_, pathName, imgName)

    # Hierarchical Clustering
    clst = cluster.AgglomerativeClustering(n_clusters=4)

    imgName = "Hierarchical_Clustering.png"
    myData.myDraw(z, clst.fit_predict(z), pathName, imgName)

    for i in range(1, len(an.columns)):
        a = an.columns[i]
        imgName = str(a) + ".png"
        myData.myDraw(z, an[a], pathName, imgName)

    # vae+dnp
    #data = np.random.rand(10, 2000)
    #xn, yn = data.shape
    data = np.reshape(data, (xn, 1, yn))
    data = np.insert(data, 0, 1, axis=2)
    #data = data[:,:,:5000]
    zn, xn, yn = data.shape
    # set s: currently selected input features (the bias is preselected)
    set_s = np.zeros(xn * yn)
    set_s[0] = 1

    # set c: remaining candidate features
    set_c = np.ones(xn * yn)
    set_c[0] = 0

    # numpy array -> tensor
    data = torch.tensor(data)
    # dataLoader
    dataSet = myData.MyDataset(data, data)
    dataLoader = DataLoader(dataset=dataSet,
                            batch_size=runPams.batch_size,
                            shuffle=False,
                            num_workers=runPams.n_cpu,
                            pin_memory=torch.cuda.is_available())

    net, optimizer, lossFunc = getVAEPams(xn, yn, device, runPams.lr)

    #np->tensor or gpu
    set_s = torch.tensor(set_s).float().to(device)
    set_c = torch.tensor(set_c).float().to(device)

    # train: greedily grow set s one feature at a time until k features (plus the bias) are selected
    while torch.sum(set_s == 1).item() < (runPams.k + 1):
        print(torch.sum(set_s == 1).item())
        for _ in range(runPams.epoch):
            for step, (x, _) in enumerate(dataLoader):
                b_x = Variable(x.view(-1, xn * yn).float().to(device))
                b_y = Variable(x.view(-1, xn * yn).float().to(device))

                # zero the first-layer weights of features still in set c (mask columns with set_s)
                net.fc1.weight.data = net.fc1.weight.data * (set_s)

                # network
                _, decoded, _ = net(b_x)
                loss = lossFunc(decoded, b_y)  # mean square error
                optimizer.zero_grad()  # clear gradients for this training step
                loss.backward()  # backpropagation, compute gradients
                optimizer.step()  # apply gradients
                print(net.fc1.weight.grad)

        # pick the next feature J to move from set c into set s
        newJ = getNewJ(net.fc1.weight.grad.clone(), set_c, device).item()
        print(newJ)

        # initialize the weights of node J with Xavier initialization
        tmpWeight = torch.rand(1, net.fc1.out_features)
        tmpWeight = nn.init.xavier_normal_(tmpWeight)
        net.fc1.weight.data[:, newJ] = tmpWeight

        # update set s and set c
        set_s[newJ] = torch.tensor(1)
        set_c[newJ] = torch.tensor(0)

    # test
    #sys.exit()
    predLabelsByVAE = list()
    features = list()
    for (x, _) in dataLoader:
        b_x = Variable(x.view(-1,
                              xn * yn).float().to(device))  # batch x (data)
        feature, _, predicted = net(b_x)
        features.append([feature.cpu().detach().numpy()])
        predicted = torch.max(predicted.data, 1)[1].cpu().numpy()
        predLabelsByVAE.append(predicted)
    # test end

    features = np.hstack(features)
    zn, xn, yn = features.shape
    features = np.reshape(features, (xn, yn))
    features = np.array(features)
    z = features
    pams = str(runPams.k) + "_" + str(runPams.n_top_gene)
    pathName = "/N/project/zhangclab/pengtao/myProjectsDataRes/20200619Classification/results/" + pams + "/imgs_VAE+DNP/"

    # kmeans
    kmeans = KMeans(n_clusters=4, random_state=0).fit(z)

    imgName = "kmeans.png"
    myData.myDraw(z, kmeans.labels_, pathName, imgName)

    # Hierarchical Clustering
    clst = cluster.AgglomerativeClustering(n_clusters=4)

    imgName = "Hierarchical_Clustering.png"
    myData.myDraw(z, clst.fit_predict(z), pathName, imgName)

    for i in range(1, len(an.columns)):
        a = an.columns[i]
        imgName = str(a) + ".png"
        myData.myDraw(z, an[a], pathName, imgName)

    # save gene names
    pathName = "/N/project/zhangclab/pengtao/myProjectsDataRes/20200619Classification/results/" + pams + "/genes_selected.csv"
    genes = pd.DataFrame(set_s.cpu().detach().numpy())
    genes = genes.T
    genes.columns = top_gene_names
    genes.to_csv(pathName)
    '''
    kmeans_estimator = KMeans(n_clusters=4, random_state=0).fit(features)
    labelByVAEKmeans = kmeans_estimator.labels_ 
    # get figures
    mark = ['or', 'ob', 'og', 'ok', '^r', '+r', 'sr', 'dr', '<r', 'pr']
    # in 'or', 'o' is the circle marker and 'r' the color red; the rest follow the same pattern
    for i in range(len(labelByVAEKmeans)):
        plt.plot([features[i, 0]], [features[i, 1]], mark[labelByVAEKmeans[i]], markersize=5)
    #save data
    pathName = "/N/project/zhangclab/pengtao/myProjectsDataRes/20200702AE_DNP/results/csv_img_res/"
    fileName = pathName + str(runPams.k) + ".png"
    plt.savefig(fileName)

    fileName = pathName + str(runPams.k) + ".csv"
    setS = pd.DataFrame(set_s.cpu().detach().numpy())
    setS = setS.T
    setS.to_csv(fileName)
    #plt.show()
    '''
    return ()
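
getNewJ is referenced above but not defined in this example. A plausible sketch based on how it is used (DNP-style: pick the candidate feature in set c whose incoming-weight gradient has the largest norm); this is an assumption, not the author's code:

def getNewJ(grad, set_c, device):
    # grad: fc1 weight gradient, shape (out_features, in_features)
    # per-input-feature gradient norm, masked to candidates still in set c
    score = torch.norm(grad, dim=0).to(device) * set_c
    return torch.argmax(score)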