def eval_retrieval(scaler, model):
    # This loads the data from disk every time.
    # To speed this up, we could load this once and then "only" make copies.
    # The reason we can't have a single copy is that the model predictions can differ.
    data = read_pickle(args.data)
    data = sub_data(data, train=False, in_place=True)
    prediction = model.predict(scaler.transform(data["docs"]))
    data = prune_docs(
        data, None,
        [i for i, _x in enumerate(data["docs"]) if prediction[i] == 1],
        verbose=False)
    acc = acc_ip(data["queries"], data["docs"], data["relevancy"], n=10)
    return acc
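# A minimal sketch of the speed-up suggested in the comment above (an assumption,
# not part of the original script): load the pickle once and give each evaluation
# its own deep copy, so per-model pruning never mutates the shared original.
import copy

_DATA_CACHE = None

def eval_retrieval_cached(scaler, model):
    global _DATA_CACHE
    if _DATA_CACHE is None:
        _DATA_CACHE = sub_data(read_pickle(args.data), train=False, in_place=True)
    # each call works on a private copy because predictions (and pruning) differ per model
    data = copy.deepcopy(_DATA_CACHE)
    prediction = model.predict(scaler.transform(data["docs"]))
    data = prune_docs(
        data, None,
        [i for i, _x in enumerate(data["docs"]) if prediction[i] == 1],
        verbose=False)
    return acc_ip(data["queries"], data["docs"], data["relevancy"], n=10)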
import sys
sys.path.append("src")
from misc.load_utils import read_pickle, center_data, norm_data
from misc.retrieval_utils import rprec_l2, rprec_ip
import numpy as np
import torch
import argparse
from sklearn.decomposition import PCA
from sklearn.model_selection import ParameterGrid

parser = argparse.ArgumentParser()
parser.add_argument('--data')
parser.add_argument('--post-cn', action="store_true")
parser.add_argument('--seed', type=int, default=0)
args = parser.parse_args()
data = read_pickle(args.data)


def summary_performance(name, dataReduced, dataReconstructed):
    if args.post_cn:
        dataReduced = center_data(dataReduced)
        dataReduced = norm_data(dataReduced)
    val_ip = rprec_ip(
        dataReduced["queries"], dataReduced["docs"], data["relevancy"],
        fast=True)
    val_l2 = rprec_l2(
        dataReduced["queries"], dataReduced["docs"], data["relevancy"],
        fast=True)
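# A hedged usage sketch (not from the original file): fit PCA on the document
# vectors, project both sides, and score the reduced space with the helper above.
# The dimension value and the reconstruction argument here are illustrative assumptions.
def _pca_example(dim=128):
    model = PCA(n_components=dim, random_state=args.seed)
    model.fit(np.array(data["docs"]))
    dataReduced = {
        "queries": model.transform(np.array(data["queries"])),
        "docs": model.transform(np.array(data["docs"])),
    }
    summary_performance(f"pca-{dim}", dataReduced, None)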
def random_projection_performance(components, model_name, runs=3):
    if model_name == "gauss":
        Model = GaussianRandomProjection
    elif model_name == "sparse":
        Model = SparseRandomProjection
    elif model_name == "crop":
        Model = CropRandomProjection
    else:
        raise Exception("Unknown model")

    random.seed(args.seed)
    vals_ip = []
    vals_l2 = []
    for i in range(runs):
        data = read_pickle(args.data)
        # take only dev queries
        data = sub_data(data, train=False, in_place=True)
        # make sure the vectors are np arrays
        data["queries"] = np.array(data["queries"])
        data["docs"] = np.array(data["docs"])

        model = Model(
            n_components=components,
            random_state=random.randint(0, 2**8 - 1))
        model.fit(data["docs"])
        dataReduced = {
            "queries": safe_transform(model, data["queries"]),
            "docs": safe_transform(model, data["docs"])
        }
        del data["queries"]
        del data["docs"]

        if args.post_cn:
            dataReduced = center_data(dataReduced)
            dataReduced = norm_data(dataReduced)

        # copy to make it C-contiguous
        # (skipped)
        val_l2 = rprec_a_l2(
            dataReduced["queries"],
            dataReduced["docs"],
            data["relevancy"],
            data["relevancy_articles"],
            data["docs_articles"],
            report=False,
            fast=True,
        )
        vals_l2.append(val_l2)

        # skip IP computation because the vectors are normalized
        if not args.post_cn:
            val_ip = rprec_a_ip(
                dataReduced["queries"],
                dataReduced["docs"],
                data["relevancy"],
                data["relevancy_articles"],
                data["docs_articles"],
                report=False,
                fast=True,
            )
            vals_ip.append(val_ip)
        else:
            vals_ip.append(val_l2)

    logdata.append({
        "dim": components,
        "vals_ip": vals_ip,
        "vals_l2": vals_l2,
        "model": model_name
    })
    # continuously overwrite the file
    with open(args.logfile, "w") as f:
        f.write(str(logdata))
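# Illustrative driver (an assumption; the sweep in the original script may differ):
# evaluate each projection type at a few target dimensions. The dimension list
# below is made up.
for model_name in ["gauss", "sparse", "crop"]:
    for dim in [32, 64, 128, 256, 512]:
        random_projection_performance(dim, model_name)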
import copy
import sys
sys.path.append("src")
from misc.load_utils import read_pickle, center_data, norm_data, sub_data
import argparse
import sklearn.metrics
from sklearn.decomposition import PCA

parser = argparse.ArgumentParser(description='PCA with scaled eigenvalues')
parser.add_argument('--data')
parser.add_argument('--data-small', default=None)
parser.add_argument('--logfile', default="computed/tmp.log")
parser.add_argument('--post-cn', action="store_true")
parser.add_argument('--center', action="store_true")
parser.add_argument('--norm', action="store_true")
parser.add_argument('--skip-loss', action="store_true")
parser.add_argument('--dims', default="custom")
parser.add_argument('--seed', type=int, default=0)
args = parser.parse_args()
data = read_pickle(args.data)

if args.data_small is None:
    if args.center:
        data = center_data(data)
    if args.norm:
        data = norm_data(data)
    print(
        "Because args.data_small is not provided, I'm copying the whole structure"
    )
    data_small = copy.deepcopy(data)
    data = sub_data(data, train=False, in_place=True)
    data_small = sub_data(data_small, train=True, in_place=True)
else:
    data_small = read_pickle(args.data_small)
import copy
import random
import sys
sys.path.append("src")
from misc.load_utils import read_pickle, sub_data
from misc.retrieval_utils import rprec_a_l2, rprec_a_ip
import argparse

parser = argparse.ArgumentParser(
    description='Uncompressed irrelevant performance summary')
parser.add_argument('--data')
parser.add_argument('--data-big')
parser.add_argument('--logfile', default="computed/tmp.log")
parser.add_argument('--seed', type=int, default=0)
args = parser.parse_args()

data = read_pickle(args.data)
data_big = read_pickle(args.data_big)


def summary_performance(dataReduced):
    val_l2 = rprec_a_l2(
        dataReduced["queries"],
        dataReduced["docs"],
        data["relevancy"],
        data["relevancy_articles"],
        data["docs_articles"],
        fast=True,
    )
    # IP is reported identically to L2 here; no separate IP computation is done
    val_ip = val_l2
    return val_ip, val_l2
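# A small usage sketch (assumed, not part of the original chunk): score the
# uncompressed vectors directly by passing them through the helper above.
val_ip, val_l2 = summary_performance({
    "queries": data["queries"],
    "docs": data["docs"],
})
print(f"uncompressed: rprec_ip={val_ip:.4f}, rprec_l2={val_l2:.4f}")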
"""
This patch is necessary so that r-precision may use correct information
regarding the articles.
"""

import sys
sys.path.append("src")
from misc.load_utils import save_pickle, read_pickle
import argparse

parser = argparse.ArgumentParser()
parser.add_argument('--data-1', default="/data/hp/full.pkl")
parser.add_argument('--data-2', default="/data/hp/full.embd")
parser.add_argument('--data-out', default="/data/hp/full_fixed.embd")
args = parser.parse_args()

data1 = read_pickle(args.data_1)
data2 = read_pickle(args.data_2)

if "relevancy" not in data2:
    raise Exception("Second data does not have relevancy entry")
if "relevancy_articles" not in data2:
    raise Exception("Second data does not have relevancy_articles entry")
if "docs_articles" not in data2:
    raise Exception("Second data does not have docs_articles entry")
if len(data1["queries"]) != len(data2["queries"]):
    raise Exception("Data lengths (queries) are not matching")
if len(data1["docs"]) != len(data2["docs"]):
    raise Exception("Data lengths (docs) are not matching")

print("query type1:", type(data1["queries"][0]))
print("docs type1:", type(data1["docs"][0]))
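# Hypothetical continuation (pure assumption; the original chunk ends before the
# actual patching step): carry the article-level fields from data1 over into the
# embedding file and save the result to --data-out. Both the copy direction and
# the save_pickle argument order (path, data) are assumptions.
data2["relevancy"] = data1["relevancy"]
data2["relevancy_articles"] = data1["relevancy_articles"]
data2["docs_articles"] = data1["docs_articles"]
save_pickle(args.data_out, data2)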
#!/usr/bin/env python3

import sys
sys.path.append("../../")
from misc.load_utils import read_pickle
import matplotlib.pyplot as plt
from sklearn.manifold import TSNE
import numpy as np

plt.figure(figsize=(10, 7))

data = read_pickle("/data/kilt/hotpot-dpr-c-5000.embd")
tsne = TSNE(2, random_state=0)
tsne = tsne.fit_transform(np.array(data["queries"] + data["docs"][:5000]))

# plot the first 5000 projected points, then the remainder in red
plt.scatter(tsne[:5000, 0], tsne[:5000, 1], alpha=0.3, s=5)
plt.scatter(tsne[5000:, 0], tsne[5000:, 1], alpha=0.3, s=5, color="red")
plt.title("t-SNE visualization")
plt.tight_layout()
plt.show()
import sys
sys.path.append("src")
from misc.load_utils import read_pickle, save_json, small_data, sub_data
from misc.retrieval_utils import retrieved_ip, acc_ip
from filtering_utils import filter_step, prune_docs
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.dummy import DummyClassifier
import argparse, json

parser = argparse.ArgumentParser()
parser.add_argument('--data', default="/data/hp/dpr-c-pruned.embd_cn")
parser.add_argument('--logfile', default="computed/autofilter_classifier.json")
parser.add_argument('--traindata', default="computed/autofilter_traindata.pkl")
args = parser.parse_args()

data_train = read_pickle(args.traindata)
logdata = []
positives_total = []
negatives_total = []


def eval_retrieval(scaler, model):
    # This loads the data from disk every time.
    # To speed this up, we could load this once and then "only" make copies.
    # The reason we can't have a single copy is that the model predictions can differ.
    data = read_pickle(args.data)
    data = sub_data(data, train=False, in_place=True)
    prediction = model.predict(scaler.transform(data["docs"]))
    data = prune_docs(
import copy
import sys
sys.path.append("src")
from misc.load_utils import read_pickle, sub_data
import numpy as np
import argparse
from sklearn.decomposition import PCA
import sklearn.metrics

parser = argparse.ArgumentParser(description='Main PCA performance experiment')
parser.add_argument('--data')
parser.add_argument('--data-train', default=None)
parser.add_argument('--logfile', default="computed/tmp.log")
parser.add_argument('--center', action="store_true")
parser.add_argument('--norm', action="store_true")
parser.add_argument('--post-cn', action="store_true")
parser.add_argument('--dims', default="custom")
parser.add_argument('--seed', type=int, default=0)
args = parser.parse_args()

data = read_pickle(args.data)
if args.data_train is None:
    print(
        "Because args.data_train is not provided, I'm copying the whole structure"
    )
    data_train = copy.deepcopy(data)
else:
    data_train = read_pickle(args.data_train)

data = sub_data(data, train=False, in_place=True)
data_train = sub_data(data_train, train=True, in_place=True)
data_orig = copy.deepcopy(data)

if args.center:
    # only keep the dev scaler
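    # Hypothetical body for the truncated branch above (an assumption, not the
    # original code): subtract the mean computed on the dev docs from both splits,
    # i.e. keep only the dev-fitted centering.
    center = np.mean(data["docs"], axis=0)
    for split in (data, data_train):
        split["docs"] = np.array(split["docs"]) - center
        split["queries"] = np.array(split["queries"]) - center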
import sys
sys.path.append("src")
from misc.load_utils import read_pickle, save_json, save_pickle, small_data, sub_data
from misc.retrieval_utils import retrieved_ip, acc_ip
from filtering_utils import filter_step
import argparse, json, pickle
import time

parser = argparse.ArgumentParser()
parser.add_argument('--data', default="/data/hp/dpr-c-pruned.embd_cn")
parser.add_argument('--logfile', default="computed/autofilter.json")
parser.add_argument('--traindata', default="computed/autofilter_traindata.pkl")
args = parser.parse_args()

data = read_pickle(args.data)
data = sub_data(data, train=False, in_place=True)
data_dev = read_pickle(args.data)
data_dev = sub_data(data_dev, train=False, in_place=True)

print(len(data["queries"]), "train queries")
print(len(data_dev["queries"]), "dev queries")
print(len(data["docs"]), "docs")
print(max([max(x) for x in data["relevancy"]]), "max train relevancy")
print(max([max(x) for x in data_dev["relevancy"]]), "max dev relevancy")
print("", flush=True)

del data["relevancy_articles"]
del data["docs_articles"]
del data_dev["docs_articles"]
del data_dev["relevancy_articles"]