Example #1
def eval_retrieval(scaler, model):
    # This loads the data from disk every time.
    # To speed this up we could load it once and then only make copies;
    # a single shared copy is not enough because the model predictions can differ.
    data = read_pickle(args.data)
    data = sub_data(data, train=False, in_place=True)
    prediction = model.predict(scaler.transform(data["docs"]))

    # keep only the documents which the classifier predicts as relevant
    data = prune_docs(
        data,
        None,
        [i for i, pred in enumerate(prediction) if pred == 1],
        verbose=False,
    )

    acc = acc_ip(data["queries"], data["docs"], data["relevancy"], n=10)
    return acc
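
A minimal usage sketch (not part of the original snippets): eval_retrieval expects an already fitted scaler and classifier, for instance the StandardScaler and LogisticRegression imported in Example #8. The train_x/train_y names below are hypothetical; the real structure of the training data is not shown in these examples.

from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler

def train_and_eval(train_x, train_y):
    # fit the scaler and classifier on hypothetical training features/labels,
    # then measure retrieval accuracy with the function above
    scaler = StandardScaler().fit(train_x)
    model = LogisticRegression(max_iter=1000).fit(scaler.transform(train_x), train_y)
    return eval_retrieval(scaler, model)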
Example #2
import sys
sys.path.append("src")
from misc.load_utils import read_pickle, center_data, norm_data
from misc.retrieval_utils import rprec_l2, rprec_ip
import numpy as np
import torch
import argparse
from sklearn.decomposition import PCA
from sklearn.model_selection import ParameterGrid

parser = argparse.ArgumentParser()
parser.add_argument('--data')
parser.add_argument('--post-cn', action="store_true")
parser.add_argument('--seed', type=int, default=0)
args = parser.parse_args()
data = read_pickle(args.data)


def summary_performance(name, dataReduced, dataReconstructed):
    if args.post_cn:
        dataReduced = center_data(dataReduced)
        dataReduced = norm_data(dataReduced)

    val_ip = rprec_ip(dataReduced["queries"],
                      dataReduced["docs"],
                      data["relevancy"],
                      fast=True)
    val_l2 = rprec_l2(dataReduced["queries"],
                      dataReduced["docs"],
                      data["relevancy"],
                      fast=True)
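
misc.load_utils is not shown in these examples, so the following is only a sketch of what center_data and norm_data are assumed to do: center queries and docs by the document mean and L2-normalize every vector, which is why the scripts can reuse the L2 score for IP after --post-cn.

import numpy as np

def center_data_sketch(data):
    # center both splits by the document mean (assumed behaviour)
    mean = np.mean(data["docs"], axis=0)
    data["docs"] = np.array(data["docs"]) - mean
    data["queries"] = np.array(data["queries"]) - mean
    return data

def norm_data_sketch(data):
    # L2-normalize every vector so that inner product and L2 distance rank identically
    data["docs"] = data["docs"] / np.linalg.norm(data["docs"], axis=1, keepdims=True)
    data["queries"] = data["queries"] / np.linalg.norm(data["queries"], axis=1, keepdims=True)
    return data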
Example #3
def random_projection_performance(components, model_name, runs=3):
    if model_name == "gauss":
        Model = GaussianRandomProjection
    elif model_name == "sparse":
        Model = SparseRandomProjection
    elif model_name == "crop":
        Model = CropRandomProjection
    else:
        raise Exception("Unknown model")

    random.seed(args.seed)
    vals_ip = []
    vals_l2 = []
    for i in range(runs):
        data = read_pickle(args.data)
        # take only dev queries
        data = sub_data(data, train=False, in_place=True)
        # make sure the vectors are np arrays
        data["queries"] = np.array(data["queries"])
        data["docs"] = np.array(data["docs"])

        model = Model(n_components=components,
                      random_state=random.randint(0, 2**8 - 1))
        model.fit(data["docs"])

        dataReduced = {
            "queries": safe_transform(model, data["queries"]),
            "docs": safe_transform(model, data["docs"])
        }
        del data["queries"]
        del data["docs"]

        if args.post_cn:
            dataReduced = center_data(dataReduced)
            dataReduced = norm_data(dataReduced)

        # copy to make it C-contiguous
        # (skipped)
        val_l2 = rprec_a_l2(
            dataReduced["queries"],
            dataReduced["docs"],
            data["relevancy"],
            data["relevancy_articles"],
            data["docs_articles"],
            report=False,
            fast=True,
        )
        vals_l2.append(val_l2)

        # if post-cn was applied, the vectors are normalized and IP equals L2,
        # so the IP computation can be skipped
        if not args.post_cn:
            val_ip = rprec_a_ip(
                dataReduced["queries"],
                dataReduced["docs"],
                data["relevancy"],
                data["relevancy_articles"],
                data["docs_articles"],
                report=False,
                fast=True,
            )
            vals_ip.append(val_ip)
        else:
            vals_ip.append(val_l2)

    logdata.append({
        "dim": components,
        "vals_ip": vals_ip,
        "vals_l2": vals_l2,
        "model": model_name
    })

    # continuously overwrite the log file
    with open(args.logfile, "w") as f:
        f.write(str(logdata))
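
A hypothetical driver for the function above; the dimension grid below is an assumption, not the set of values used in the original experiments.

logdata = []
for model_name in ["gauss", "sparse", "crop"]:
    for components in [32, 64, 128, 256, 512, 768]:
        random_projection_performance(components, model_name)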
Example #4
import copy
import sys
sys.path.append("src")
from misc.load_utils import read_pickle, center_data, norm_data, sub_data
import argparse
import sklearn.metrics
from sklearn.decomposition import PCA

parser = argparse.ArgumentParser(description='PCA with scaled eigenvalues')
parser.add_argument('--data')
parser.add_argument('--data-small', default=None)
parser.add_argument('--logfile', default="computed/tmp.log")
parser.add_argument('--post-cn', action="store_true")
parser.add_argument('--center', action="store_true")
parser.add_argument('--norm', action="store_true")
parser.add_argument('--skip-loss', action="store_true")
parser.add_argument('--dims', default="custom")
parser.add_argument('--seed', type=int, default=0)
args = parser.parse_args()
data = read_pickle(args.data)

if args.data_small is None:
    if args.center:
        data = center_data(data)
    if args.norm:
        data = norm_data(data)
    print(
        "Because args.data_small is not provided, I'm copying the whole structure"
    )
    data_small = copy.deepcopy(data)

    data = sub_data(data, train=False, in_place=True)
    data_small = sub_data(data_small, train=True, in_place=True)
else:
    data_small = read_pickle(args.data_small)
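
The projection itself is cut off above. As a rough, hypothetical sketch of what "PCA with scaled eigenvalues" could look like (not the repository's actual code), one can fit PCA on the training docs and rescale each component by the square root of its eigenvalue:

import numpy as np
from sklearn.decomposition import PCA

def pca_scaled_sketch(data, data_small, n_components=128):
    # fit on the train docs, project both splits, then scale each component
    # by the square root of its eigenvalue (an assumed variant)
    pca = PCA(n_components=n_components, random_state=args.seed).fit(data_small["docs"])
    scale = np.sqrt(pca.explained_variance_)
    return {
        "queries": pca.transform(data["queries"]) * scale,
        "docs": pca.transform(data["docs"]) * scale,
    }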
Example #5
import copy
import random
import sys
sys.path.append("src")
from misc.load_utils import read_pickle, sub_data
from misc.retrieval_utils import rprec_a_l2, rprec_a_ip
import argparse

parser = argparse.ArgumentParser(
    description='Uncompressed irrelevant performance summary')
parser.add_argument('--data')
parser.add_argument('--data-big')
parser.add_argument('--logfile', default="computed/tmp.log")
parser.add_argument('--seed', type=int, default=0)
args = parser.parse_args()
data = read_pickle(args.data)
data_big = read_pickle(args.data_big)


def summary_performance(dataReduced):
    val_l2 = rprec_a_l2(
        dataReduced["queries"],
        dataReduced["docs"],
        data["relevancy"],
        data["relevancy_articles"],
        data["docs_articles"],
        fast=True,
    )
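    # reuse the L2 score for IP: as in the other scripts, the vectors are
    # assumed to be normalized here, so IP and L2 rankings coincide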
    val_ip = val_l2
    return val_ip, val_l2
Example #6
"""
This patch is necessary so that r-precision may use correct information
regarding the articles.
"""

import sys
sys.path.append("src")
from misc.load_utils import save_pickle, read_pickle
import argparse

parser = argparse.ArgumentParser()
parser.add_argument('--data-1', default="/data/hp/full.pkl")
parser.add_argument('--data-2', default="/data/hp/full.embd")
parser.add_argument('--data-out', default="/data/hp/full_fixed.embd")

args = parser.parse_args()
data1 = read_pickle(args.data_1)
data2 = read_pickle(args.data_2)

if "relevancy" not in data2:
    raise Exception("Second data does not have relevancy entry")
if "relevancy_articles" not in data2:
    raise Exception("Second data does not have relevancy_articles entry")
if "docs_articles" not in data2:
    raise Exception("Second data does not have docs_articles entry")
if len(data1["queries"]) != len(data2["queries"]):
    raise Exception("Data lengths (queries) are not matching")
if len(data1["docs"]) != len(data2["docs"]):
    raise Exception("Data lengths (docs) are not matching")

print("query type1:", type(data1["queries"][0]))
print("docs type1:", type(data1["docs"][0]))
Example #7
#!/usr/bin/env python3

import sys
sys.path.append("../../")
from misc.load_utils import read_pickle
import matplotlib.pyplot as plt
from sklearn.manifold import TSNE
import numpy as np

plt.figure(figsize=(10, 7))
data = read_pickle("/data/kilt/hotpot-dpr-c-5000.embd")
tsne = TSNE(2, random_state=0)
data_embd = tsne.fit_transform(np.array(data["queries"] + data["docs"][:5000]))
plt.scatter(data_embd[:5000, 0], data_embd[:5000, 1], alpha=0.3, s=5)
plt.scatter(data_embd[5000:, 0], data_embd[5000:, 1], alpha=0.3, s=5, color="red")
plt.title("t-SNE visualization")

plt.tight_layout()
plt.show()
Example #8
import sys
sys.path.append("src")
from misc.load_utils import read_pickle, save_json, small_data, sub_data
from misc.retrieval_utils import retrieved_ip, acc_ip
from filtering_utils import filter_step, prune_docs
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.dummy import DummyClassifier
import argparse, json

parser = argparse.ArgumentParser()
parser.add_argument('--data', default="/data/hp/dpr-c-pruned.embd_cn")
parser.add_argument('--logfile', default="computed/autofilter_classifier.json")
parser.add_argument('--traindata', default="computed/autofilter_traindata.pkl")
args = parser.parse_args()
data_train = read_pickle(args.traindata)

logdata = []
positives_total = []
negatives_total = []


def eval_retrieval(scaler, model):
    # This loads the data from disk every time.
    # To speed this up we could load it once and then only make copies;
    # a single shared copy is not enough because the model predictions can differ.
    data = read_pickle(args.data)
    data = sub_data(data, train=False, in_place=True)
    prediction = model.predict(scaler.transform(data["docs"]))

    data = prune_docs(
Example #9
import copy
import sys
sys.path.append("src")
from misc.load_utils import read_pickle, sub_data
import numpy as np
import argparse
from sklearn.decomposition import PCA
import sklearn.metrics

parser = argparse.ArgumentParser(description='Main PCA performance experiment')
parser.add_argument('--data')
parser.add_argument('--data-train', default=None)
parser.add_argument('--logfile', default="computed/tmp.log")
parser.add_argument('--center', action="store_true")
parser.add_argument('--norm', action="store_true")
parser.add_argument('--post-cn', action="store_true")
parser.add_argument('--dims', default="custom")
parser.add_argument('--seed', type=int, default=0)
args = parser.parse_args()
data = read_pickle(args.data)

if args.data_train is None:
    print(
        "Because args.data_train is not provided, I'm copying the whole structure"
    )
    data_train = copy.deepcopy(data)
else:
    data_train = read_pickle(args.data_train)

data = sub_data(data, train=False, in_place=True)
data_train = sub_data(data_train, train=True, in_place=True)
data_orig = copy.deepcopy(data)

if args.center:
    # only keep the dev scaler
Example #10
import sys
sys.path.append("src")
from misc.load_utils import read_pickle, save_json, save_pickle, small_data, sub_data
from misc.retrieval_utils import retrieved_ip, acc_ip
from filtering_utils import filter_step
import argparse, json, pickle
import time

parser = argparse.ArgumentParser()
parser.add_argument('--data', default="/data/hp/dpr-c-pruned.embd_cn")
parser.add_argument('--logfile', default="computed/autofilter.json")
parser.add_argument('--traindata', default="computed/autofilter_traindata.pkl")
args = parser.parse_args()

# train split
data = read_pickle(args.data)
data = sub_data(data, train=True, in_place=True)
# dev split (same file, different subset)
data_dev = read_pickle(args.data)
data_dev = sub_data(data_dev, train=False, in_place=True)

print(len(data["queries"]), "train queries")
print(len(data_dev["queries"]), "dev queries")
print(len(data["docs"]), "docs")
print(max([max(x) for x in data["relevancy"]]), "max train relevancy")
print(max([max(x) for x in data_dev["relevancy"]]), "max dev relevancy")
print("", flush=True)

del data["relevancy_articles"]
del data["docs_articles"]
del data_dev["docs_articles"]
del data_dev["relevancy_articles"]