Example #1
def correlations(df: pd.DataFrame,
                 include_categorical: bool = False,
                 plot_type: str = "dendogram",
                 figsize=[10, 5],
                 categorical_cols: List[str] = []):
    corr = None
    cols: List = []
    if include_categorical:
        corr = sr(df).correlation
        cols = df.columns
    else:

        if not len(categorical_cols):
            categorical_cols = df.select_dtypes(include=["object"]).columns

        cols = [c for c in df.columns if c not in categorical_cols]

        corr = df[cols].corr()
        cols = corr.columns

    if plot_type == "dendogram":
        plot_dendogram(corr, cols, figsize=figsize)
    elif plot_type == "matrix":
        plot_matrix(corr, cols, figsize=figsize)
    else:
        raise (f"Variable plot_type not valid. Provided: {plot_type}")

    return corr
Example #2
def show_corre(y_true, y_test):
    plt.figure()
    ax1 = plt.subplot(1, 1, 1)
    plt.title("after: " + format(sr(y_true, y_test.reshape(-1, 1))[0], '.4f'))
    plt.scatter(y_true, y_test.reshape(-1, 1))
    plt.grid()
    x0, x1 = ax1.get_xlim()
    y0, y1 = ax1.get_ylim()
    ax1.set_aspect((x1 - x0) / (y1 - y0))
    plt.ylabel("predicted QoE")
    plt.xlabel("real QoE")
    plt.show()
Example #3
def show_results(X_test, X_test_before_scaling, y_test, regressor_name,
                 feature_labels, answer):

    if cfg.QUALITY_MODEL + "_" + cfg.POOLING_TYPE in feature_labels:
        position_vqa = feature_labels.index(cfg.QUALITY_MODEL + "_" +
                                            cfg.POOLING_TYPE)

    plt.figure()
    ax1 = plt.subplot(1, 1, 1)
    plt.title(
        "before: " +
        format(sr(y_test, X_test[:, position_vqa].reshape(-1, 1))[0], '.4f'))
    plt.scatter(y_test, X_test_before_scaling[:, position_vqa].reshape(-1, 1))
    plt.grid()
    x0, x1 = ax1.get_xlim()
    y0, y1 = ax1.get_ylim()
    ax1.set_aspect((x1 - x0) / (y1 - y0))
    plt.ylabel("predicted QoE")
    plt.xlabel("MOS")
    plt.show()

    plt.figure()
    ax1 = plt.subplot(1, 1, 1)
    plt.title("after: " + format(sr(y_test, answer.reshape(-1, 1))[0], '.4f'))
    plt.scatter(y_test, answer.reshape(-1, 1))
    plt.grid()
    x0, x1 = ax1.get_xlim()
    y0, y1 = ax1.get_ylim()
    ax1.set_aspect((x1 - x0) / (y1 - y0))
    plt.ylabel("predicted QoE")
    plt.xlabel("MOS")
    plt.show()

    print("SROCC before (" + str(cfg.QUALITY_MODEL) + "): " +
          str(sr(y_test, X_test[:, position_vqa].reshape(-1, 1))[0]))
    print("SROCC using DeepQoE (" + str(cfg.QUALITY_MODEL) + " + " +
          regressor_name + "): " + str(sr(y_test, answer.reshape(-1, 1))[0]))
Example #4
def correlations(
        df: pd.DataFrame,
        include_categorical: bool = False,
        plot_type: str = "dendogram",
        plt_kwargs={},
        categorical_cols: List[str] = []):
    """
    Computes the correlations for the columns provided and plots the relevant
    image as requested by the parameters.

    :Example:

    cat_df = xai.balance(
        df, 
        "gender", "loan",
        upsample=0.8,
        downsample=0.8)

    :param df: Pandas Dataframe containing data (inputs and target )
    :type df: pandas.DataFrame
    :param *cross_cols: One or more positional arguments (passed as *args) that 
    are used to split the data into the cross product of their values 
    :type cross_cols: List[str]
    :param upsample: [Default: 0.5] Target upsample for columns lower 
        than percentage.
    :type upsample: float
    :param downsample: [Default: 1] Target downsample for columns higher 
        than percentage.
    :type downsample: float
    :param bins: [Default: 6] Number of bins to be used for numerical cols
    :type bins: int
    :param categorical_cols: [Default: []] Columns within dataframe that are
        categorical. Columns that are not np.objects and are not part explicitly
        provided here will be treated as numeric, and bins will be used.
    :type categorical_cols: List[str]
    :param threshold: [Default: 0.5] Threshold to display in the chart.
    :type threshold: float
    :returns: Returns a dataframe containing the correlation values for the features
    :rtype: pandas.DataFrame

    """

    corr = None
    cols: List = []
    if include_categorical:
        corr = sr(df).correlation 
        cols = df.columns
    else:

        if not len(categorical_cols):
            categorical_cols = df.select_dtypes(include=["object", "bool"]).columns

        cols = [c for c in df.columns if c not in categorical_cols]

        corr = df[cols].corr()
        cols = corr.columns

    if plot_type == "dendogram":
        _plot_correlation_dendogram(corr, cols, plt_kwargs=plt_kwargs)
    elif plot_type == "matrix":
        _plot_correlation_matrix(corr, cols, plt_kwargs=plt_kwargs)
    else:
        raise ValueError(f"Variable plot_type not valid. Provided: {plot_type}")

    return corr
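
A hedged usage sketch for the correlations function above, assuming a small toy DataFrame and that the plotting helpers (_plot_correlation_dendogram / _plot_correlation_matrix) are defined in the same module; the column names and values below are made up for illustration:

import pandas as pd

# Hypothetical toy data: two numeric columns and one categorical column.
df = pd.DataFrame({
    "age": [23, 35, 41, 29, 52],
    "income": [30000, 52000, 61000, 45000, 80000],
    "gender": ["f", "m", "m", "f", "m"],
})

# "gender" has object dtype, so it is detected as categorical and excluded
# from the numeric correlation; the correlation matrix of the remaining
# columns is plotted and returned.
corr = correlations(df, plot_type="matrix")
print(corr)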
Example #5
def calc_spearman(pred, true):
    try:
        r, p_value = sr(np.asarray(pred), np.asarray(true))
    except ValueError:
        r = -1.0
    return r
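
A quick, hypothetical sanity check for calc_spearman above; the values are made up. scipy's spearmanr raises ValueError for inputs it cannot pair up (for example, mismatched lengths), which this helper maps to -1.0:

preds = [2.5, 3.0, 4.1, 1.2]
truth = [3.0, 2.8, 4.5, 1.0]
print(calc_spearman(preds, truth))       # Spearman rho of the ranks, 0.8 here
print(calc_spearman([1, 2], [1, 2, 3]))  # length mismatch -> ValueError -> -1.0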
Example #6
def train_shuffle(min_mse=200, max_corr=0):
    round_max_spea = 0
    round_min_mse = 200
    # random.shuffle(f)
    # train = samples[:100]
    # test = samples[100:]
    trainset = videoDataset(root=args.root,
                            label="./data/train_dataset.txt",
                            suffix=".npy",
                            transform=transform,
                            data=None,
                            pcs=args.pcs)
    trainLoader = torch.utils.data.DataLoader(trainset,
                                              batch_size=128,
                                              shuffle=True,
                                              num_workers=0)
    testset = videoDataset(root=args.root,
                           label="./data/test_dataset.txt",
                           suffix='.npy',
                           transform=transform,
                           data=None,
                           pcs=args.pcs)
    testLoader = torch.utils.data.DataLoader(testset,
                                             batch_size=64,
                                             shuffle=False,
                                             num_workers=0)

    # build the model
    scoring = Scoring(feature_size=4096)
    if torch.cuda.is_available():
        scoring.cuda()  # turn the model into gpu
    # scoring.load_state_dict(torch.load("./models/merge/pcs.pt"))
    total_params = sum(p.numel() for p in scoring.parameters()
                       if p.requires_grad)
    loss_log.write("Total Params: " + str(total_params) + '\n')
    optimizer = optim.Adam(
        params=scoring.parameters(),
        lr=0.0005)  # use the Adam optimizer to optimize the loss function
    scheduler = lr_scheduler.StepLR(optimizer, step_size=70, gamma=0.7)
    for epoch in range(500):  # total 500 epochs
        # scheduler.step()
        print("Epoch:  " + str(epoch) + "Total Params: %d" % total_params)
        total_regr_loss = 0
        total_sample = 0
        for i, (features, scores) in enumerate(trainLoader):  # get mini-batch
            # print("%d batches have done" % i)
            if torch.cuda.is_available():
                features = Variable(features).cuda()
                scores = Variable(scores).cuda()
            # regression, logits = scoring(features)
            logits, penal = scoring(features)
            if penal is None:
                regr_loss = scoring.loss(logits, scores)
            else:
                regr_loss = scoring.loss(logits, scores) + penal
            # the next three lines are back propagation
            optimizer.zero_grad()
            regr_loss.backward()
            # nn.utils.clip_grad_norm(scoring.parameters(), 1.5)
            optimizer.step()
            total_regr_loss += regr_loss.data.item() * scores.shape[0]
            total_sample += scores.shape[0]

        loss_log.write(str(total_regr_loss / total_sample) + '\n')

        print("Classification Loss: " + str(total_regr_loss / total_sample) +
              '\n')
        # the rest is used to evaluate the model with the test dataset
        torch.save(scoring.state_dict(), './models/epoch{}.pt'.format(epoch))
        scoring.eval()
        val_pred = []
        val_sample = 0
        val_loss = 0
        val_truth = []
        for j, (features, scores) in enumerate(testLoader):
            val_truth.append(scores.numpy())
            if torch.cuda.is_available():
                features = Variable(features).cuda()
                scores = Variable(scores).cuda()
            regression, _ = scoring(features)
            val_pred.append(regression.data.cpu().numpy())
            regr_loss = scoring.loss(regression, scores)
            val_loss += (regr_loss.data.item()) * scores.shape[0]
            val_sample += scores.shape[0]
        val_truth = np.concatenate(val_truth)
        val_pred = np.concatenate(val_pred)
        val_sr, _ = sr(val_truth, val_pred)
        if val_loss / val_sample < min_mse:
            torch.save(scoring.state_dict(), './models/merge/tes_40attn.pt')
        min_mse = min(min_mse, val_loss / val_sample)
        max_corr = max(max_corr, val_sr)
        round_min_mse = min(round_min_mse, val_loss / val_sample)
        round_max_spea = max(val_sr, round_max_spea)
        print(
            "Val Loss: %.2f Correlation: %.2f Min Val Loss: %.2f Max Correlation: %.2f"
            % (val_loss / val_sample, val_sr, min_mse, max_corr))
        scoring.train()
    w.write('MSE: %.2f spearman: %.2f' % (round_min_mse, round_max_spea))
    return min_mse, max_corr
    default="movies_accuracy.csv")
parser.add_argument("--out_file",
                    help="name of file to write correlation to, if desired",
                    default="out.txt")
args = parser.parse_args()

# scores: stores modularity scores - outer key = k, inner key = level
scores = {}

# reading in modularity data
with open(args.modularity_file, newline="") as csvfile:
    reader = csv.reader(csvfile)
    rows = [row for row in reader]
    for i in range(len(rows[0])):
        scores[rows[0][i]] = [float(score[i]) for score in rows[1:]]

metric = []

# reading in performance data, assuming 1-column list w/ no header
with open(args.metrics_file, "r") as csvfile:
    reader = csv.reader(csvfile)
    rows = [row for row in reader]
    for i in range(len(rows)):
        metric.append(float(rows[i][0]))

# writes results to a file, separates columns by commas
with open(args.out_file, "w") as o:
    o.write("category,correlation\n")  # column headers
    for cat in scores.keys():
        o.write(cat + "," + str(sr(scores[cat], metric).correlation) + "\n")
Example #8
def train_shuffle(min_mse=200, max_corr=0):
    trainset = videoDataset(root="figure_skating/c3d_feat",
                            label="data/train_dataset.txt",
                            suffix=".npy",
                            transform=transform,
                            data=None)
    trainLoader = torch.utils.data.DataLoader(trainset,
                                              batch_size=128,
                                              shuffle=True,
                                              num_workers=0)
    testset = videoDataset(root="figure_skating/c3d_feat",
                           label="data/test_dataset.txt",
                           suffix='.npy',
                           transform=transform,
                           data=None)
    testLoader = torch.utils.data.DataLoader(testset,
                                             batch_size=64,
                                             shuffle=False,
                                             num_workers=0)

    # build the model
    scoring = Scoring(feature_size=4096)
    if torch.cuda.is_available():
        scoring.cuda()  # turn the model into gpu
    total_params = sum(p.numel() for p in scoring.parameters()
                       if p.requires_grad)
    optimizer = optim.Adam(params=scoring.parameters(), lr=0.0005)  # Adam
    for epoch in range(epoch_num):
        print("Epoch:  " + str(epoch) + "; Total Params: %d" % total_params)
        total_regr_loss = 0
        total_sample = 0
        for i, (features, scores) in enumerate(trainLoader):  # get mini-batch
            print("%d batches have done" % i)
            if torch.cuda.is_available():
                features = Variable(features).cuda()
                scores = Variable(scores).cuda()
            logits, penal = scoring(features)  # features.shape=(128,300,4096)
            if penal is None:
                regr_loss = scoring.loss(logits, scores)
            else:
                regr_loss = scoring.loss(logits, scores) + penal

            # back propagation
            # PyTorch accumulates the gradients, so clear them at every backward step.
            optimizer.zero_grad()
            regr_loss.backward()
            optimizer.step()
            total_regr_loss += regr_loss.data.item() * scores.shape[0]
            total_sample += scores.shape[0]

        loss = total_regr_loss / total_sample
        train_loss.append(loss)
        print("Regression Loss: " + str(loss) + '\n')

        ### the rest is used to evaluate the model with the test dataset ###
        # switch the model into evaluation mode (for batch-normalization / dropout layers)
        scoring.eval()
        val_pred = []
        val_sample = 0
        val_loss = 0
        val_truth = []
        for j, (features, scores) in enumerate(testLoader):
            val_truth.append(scores.numpy())
            if torch.cuda.is_available():
                features = Variable(features).cuda()
                scores = Variable(scores).cuda()
            regression, _ = scoring(features)
            val_pred.append(regression.data.cpu().numpy())
            regr_loss = scoring.loss(regression, scores)
            val_loss += (regr_loss.data.item()) * scores.shape[0]
            val_sample += scores.shape[0]
        val_truth = np.concatenate(val_truth)
        val_pred = np.concatenate(val_pred)
        val_sr, _ = sr(val_truth, val_pred)
        if val_loss / val_sample < min_mse:
            torch.save(scoring.state_dict(), 'S_LSTM+M_LSTM+PCS.pt')
        min_mse = min(min_mse, val_loss / val_sample)
        max_corr = max(max_corr, val_sr)
        loss = val_loss / val_sample
        test_loss.append(loss)
        print(
            "Val Loss: {:.2f} Correlation: {:.2f} Min Val Loss: {:.2f} Max Correlation: {:.2f}"
            .format(loss, val_sr, min_mse, max_corr))

        scoring.train()  # turn back to train mode
Example #9
modularities against wordsim task results. Results printed to console.
'''

# imports
import csv
from scipy.stats import spearmanr as sr
import argparse

# processing command-line arguments
parser = argparse.ArgumentParser()
parser.add_argument("--modularity_file",
                    help="name of file with modularity scores, 1-column .csv",
                    default="data/3_2.csv")
parser.add_argument(
    "--downstream_file",
    help="name of file with downstream performance metrics, 1-column .csv",
    default="data/loss.csv")
args = parser.parse_args()

mods = []
with open(args.modularity_file, newline="") as csvfile:
    reader = csv.reader(csvfile)
    mods = [float(row[0]) for row in reader]

metrics = []
with open(args.downstream_file, newline="") as csvfile:
    reader = csv.reader(csvfile)
    metrics = [float(row[0]) for row in reader]

print(sr(mods, metrics))
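
The call above prints the full result object returned by spearmanr. If the coefficient and p-value are needed separately, the return value can also be unpacked; this small addition is not part of the original script:

rho, p_value = sr(mods, metrics)
print("Spearman rho = %.4f, p-value = %.4g" % (rho, p_value))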
Example #10
    regressor.fit(X_train, np.ravel(y_train))

if hasattr(regressor, 'best_estimator_'):
    answer = regressor.best_estimator_.predict(X_test)
else:
    answer = regressor.predict(X_test)

# locate column of quality model for SROCC without regression
if quality_model + "_" + pooling_type in feature_labels:
    position_vqa = feature_labels.index(quality_model + "_" + pooling_type)

# display results
plt.figure()
ax1 = plt.subplot(1, 1, 1)
plt.title("before: " +
          format(sr(y_test, X_test[:, position_vqa].reshape(-1, 1))[0], '.4f'))
plt.scatter(y_test, X_test_before_scaling[:, position_vqa].reshape(-1, 1))
plt.grid()
x0, x1 = ax1.get_xlim()
y0, y1 = ax1.get_ylim()
ax1.set_aspect((x1 - x0) / (y1 - y0))
plt.ylabel("predicted QoE")
plt.xlabel("MOS")
plt.show()

plt.figure()
ax1 = plt.subplot(1, 1, 1)
plt.title("after: " + format(sr(y_test, answer.reshape(-1, 1))[0], '.4f'))
plt.scatter(y_test, answer.reshape(-1, 1))
plt.grid()
x0, x1 = ax1.get_xlim()
Example #11
def train_shuffle():
    #random.shuffle(f)
    #train = samples[:100]
    #test = samples[100:]
    trainset = videoDataset(
        root="/home/xuchengming/MM18/figure-skating/c3d_feat",
        label="./data/train_dataset.txt",
        suffix=".npy",
        transform=transform,
        data=None)
    trainLoader = torch.utils.data.DataLoader(trainset,
                                              batch_size=128,
                                              shuffle=True,
                                              num_workers=0)
    testset = videoDataset(
        root="/home/xuchengming/MM18/figure-skating/c3d_feat",
        label="./data/test_dataset.txt",
        suffix='.npy',
        transform=transform,
        data=None)
    testLoader = torch.utils.data.DataLoader(testset,
                                             batch_size=64,
                                             shuffle=False,
                                             num_workers=0)

    #build the model
    scoring = Scoring(feature_size=4096)
    if torch.cuda.is_available():
        scoring.cuda()  #turn the model into gpu
    optimizer = optim.Adam(
        params=scoring.parameters(),
        lr=0.0005)  # use the Adam optimizer to optimize the loss function
    scheduler = lr_scheduler.StepLR(optimizer, step_size=70, gamma=0.7)
    min_mse = 200
    max_corr = 0
    for epoch in range(500):  # total 500 epochs
        #scheduler.step()
        print("Epoch: " + str(epoch))
        total_regr_loss = 0
        total_sample = 0
        for i, (features, scores) in enumerate(trainLoader):  # get mini-batch
            #print("%d batches have done" % i)
            if torch.cuda.is_available():
                features = Variable(features).cuda()
                scores = Variable(scores).cuda()
            #regression, logits = scoring(features)
            logits, penal = scoring(features)
            regr_loss = scoring.loss(logits, scores) + penal * 1.0
            # the next three lines are back propagation
            optimizer.zero_grad()
            regr_loss.backward()
            #nn.utils.clip_grad_norm(scoring.parameters(), 1.5)
            optimizer.step()
            total_regr_loss += regr_loss.data.item() * scores.shape[0]
            total_sample += scores.shape[0]

        print("Classification Loss: " + str(total_regr_loss / total_sample))
        # the rest is used to evaluate the model with the test dataset
        torch.save(scoring.state_dict(), './models/epoch{}.pt'.format(epoch))
        scoring.eval()
        val_pred = []
        val_sample = 0
        val_loss = 0
        val_truth = []
        for j, (features, scores) in enumerate(testLoader):
            val_truth.append(scores.numpy())
            if torch.cuda.is_available():
                features = Variable(features).cuda()
                scores = Variable(scores).cuda()
            regression, _ = scoring(features)
            val_pred.append(regression.data.cpu().numpy())
            regr_loss = scoring.loss(regression, scores)
            val_loss += (regr_loss.data.item()) * scores.shape[0]
            val_sample += scores.shape[0]
        val_truth = np.concatenate(val_truth)
        val_pred = np.concatenate(val_pred)
        val_sr, _ = sr(val_truth, val_pred)
        min_mse = min(min_mse, val_loss / val_sample)
        max_corr = max(max_corr, val_sr)
        print(
            "Val Loss: %.2f Correlation: %.2f Min Val Loss: %.2f Max Correlation: %.2f"
            % (val_loss / val_sample, val_sr, min_mse, max_corr))
        scoring.train()
    w.write(str(max_corr) + '\n')