Example #1
def summary_performance(dataReduced, dataReconstructed):
    if args.post_cn:
        dataReduced = center_data(dataReduced)
        dataReduced = norm_data(dataReduced)

    val_l2 = rprec_a_l2(dataReduced["queries"],
                        dataReduced["docs"],
                        data["relevancy"],
                        data["relevancy_articles"],
                        data["docs_articles"],
                        fast=True,
                        report=False)
    if args.post_cn:
        val_ip = val_l2
    else:
        val_ip = rprec_a_ip(dataReduced["queries"],
                            dataReduced["docs"],
                            data["relevancy"],
                            data["relevancy_articles"],
                            data["docs_articles"],
                            fast=True,
                            report=False)

    if not args.skip_loss:
        loss_q = sklearn.metrics.mean_squared_error(
            data["queries"], dataReconstructed["queries"])
        # evaluate the loss on only the first 10k documents, since the arrays have to be copied
        loss_d = sklearn.metrics.mean_squared_error(
            data["docs"][:10000], dataReconstructed["docs"][:10000])
        return val_ip, val_l2, loss_q.item(), loss_d.item()
    else:
        return val_ip, val_l2, None, None
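All of these examples rely on center_data and norm_data helpers that the excerpts never show. A minimal sketch of what they plausibly do, assuming the usual center-and-normalize post-processing (the helper bodies below are a guess, not the repository's code):

import numpy as np

def center_data(data):
    # hypothetical helper: shift queries and docs by the mean document vector
    mean = data["docs"].mean(axis=0)
    return {
        "queries": data["queries"] - mean,
        "docs": data["docs"] - mean,
    }

def norm_data(data):
    # hypothetical helper: rescale every vector to unit L2 norm
    return {
        key: vecs / np.linalg.norm(vecs, axis=1, keepdims=True)
        for key, vecs in data.items()
    }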
Example #2
def summary_performance(dataReduced):
    if args.post_cn:
        dataReduced = center_data(dataReduced)
        dataReduced = norm_data(dataReduced)

    val_l2 = rprec_a_l2(
        dataReduced["queries"],
        dataReduced["docs"],
        data["relevancy"],
        data["relevancy_articles"],
        data["docs_articles"],
        fast=True,
    )
    if args.post_cn:
        val_ip = val_l2
    else:
        val_ip = rprec_a_ip(
            dataReduced["queries"],
            dataReduced["docs"],
            data["relevancy"],
            data["relevancy_articles"],
            data["docs_articles"],
            fast=True,
        )
    return val_ip, val_l2
Example #3
def summary_performance(prefix, data_reduced, data, post_cn):
    if post_cn:
        data_reduced = center_data(data_reduced)
        data_reduced = norm_data(data_reduced)

    val_l2 = rprec_a_l2(
        data_reduced["queries"],
        data_reduced["docs"],
        data["relevancy"],
        data["relevancy_articles"],
        data["docs_articles"],
        fast=True, report=False
    )
    if post_cn:
        val_ip = val_l2
    else:
        val_ip = rprec_a_ip(
            data_reduced["queries"],
            data_reduced["docs"],
            data["relevancy"],
            data["relevancy_articles"],
            data["docs_articles"],
            fast=True, report=False
        )
    print(f'{prefix} rprec_ip: {val_ip:.3f}, rprec_l2: {val_l2:.3f}')
    return val_ip, val_l2
Example #4
def summary_performance(name, dataReduced, dataReconstructed):
    if args.post_cn:
        dataReduced = center_data(dataReduced)
        dataReduced = norm_data(dataReduced)

    val_ip = rprec_a_ip(dataReduced["queries"],
                        dataReduced["docs"],
                        data["relevancy"],
                        data["relevancy_articles"],
                        data["docs_articles"],
                        fast=True)
    val_l2 = rprec_a_l2(dataReduced["queries"],
                        dataReduced["docs"],
                        data["relevancy"],
                        data["relevancy_articles"],
                        data["docs_articles"],
                        fast=True)
    name = name.replace("float", "f")
    print(f"{name:<21} {val_ip:>5.3f} {val_l2:>5.3f}")
    return val_ip, val_l2
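Every example short-circuits val_ip = val_l2 when post_cn is set. The reason: after centering and normalizing, all vectors have unit L2 norm, and for unit vectors the squared L2 distance is a monotone function of the inner product (||q - d||^2 = 2 - 2*<q, d>), so both retrieval metrics induce the same ranking and therefore the same R-Precision. A quick numeric check of that identity:

import numpy as np

rng = np.random.default_rng(0)
q, d = rng.normal(size=128), rng.normal(size=128)
q, d = q / np.linalg.norm(q), d / np.linalg.norm(d)

# for unit vectors: ||q - d||^2 == 2 - 2 * <q, d>
assert np.isclose(np.sum((q - d) ** 2), 2 - 2 * np.dot(q, d))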
Example #5
def random_projection_performance(dim):
    model = DropRandomProjection()

    dataReduced = {
        "queries": model.transform(data["queries"], dim, IMPR_L2),
        "docs": model.transform(data["docs"], dim, IMPR_L2)
    }
    if args.post_cn:
        dataReduced = center_data(dataReduced)
        dataReduced = norm_data(dataReduced)

    # copy to make it C-contiguous
    val_l2 = rprec_a_l2(
        dataReduced["queries"].copy(),
        dataReduced["docs"].copy(),
        data["relevancy"],
        data["relevancy_articles"],
        data["docs_articles"],
        fast=True,
    )
    if not args.post_cn:
        val_ip = rprec_a_ip(
            dataReduced["queries"].copy(),
            dataReduced["docs"].copy(),
            data["relevancy"],
            data["relevancy_articles"],
            data["docs_articles"],
            fast=True,
        )
    else:
        val_ip = val_l2

    data_log.append({"del_dim": dim, "val_ip": val_ip, "val_l2": val_l2})

    # continuously overwrite the file
    with open(args.logfile, "w") as f:
        f.write(str(data_log))

    print(f"Delete {dim} dims: {val_l2:<8.5f}")
Example #6
def summary_performance(dataReduced, dataReconstructed):
    # when scaling was applied, the reconstructed data is not in the original space;
    # undo the transforms in reverse order
    if args.norm:
        dataReconstructed = norm_model.inverse_transform(dataReconstructed)
    if args.center:
        dataReconstructed = center_model.inverse_transform(dataReconstructed)

    if args.post_cn:
        dataReduced = CenterScaler().transform(dataReduced)
        dataReduced = NormScaler().transform(dataReduced)

    val_l2 = rprec_a_l2(
        dataReduced["queries"],
        dataReduced["docs"],
        data["relevancy"],
        data["relevancy_articles"],
        data["docs_articles"],
        fast=True,
    )
    if args.post_cn:
        val_ip = val_l2
    else:
        val_ip = rprec_a_ip(
            dataReduced["queries"],
            dataReduced["docs"],
            data["relevancy"],
            data["relevancy_articles"],
            data["docs_articles"],
            fast=True,
        )
    loss_q = sklearn.metrics.mean_squared_error(data_orig["queries"],
                                                dataReconstructed["queries"])
    # evaluate the loss on only the first 10k documents, since the arrays have to be copied
    loss_d = sklearn.metrics.mean_squared_error(
        data_orig["docs"][:10000], dataReconstructed["docs"][:10000])
    return val_ip, val_l2, loss_q, loss_d
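Example #6 highlights that inverse transforms must run in reverse order: the forward pipeline was center, then norm, so reconstruction has to undo the norm first and the centering second. A compact sketch of scalers following that transform/inverse_transform contract (a hypothetical implementation; only the names CenterScaler and NormScaler come from the example):

import numpy as np

class CenterScaler:
    # hypothetical: subtract the mean document vector, remember it for inversion
    def transform(self, data):
        self.mean = data["docs"].mean(axis=0)
        return {k: v - self.mean for k, v in data.items()}

    def inverse_transform(self, data):
        return {k: v + self.mean for k, v in data.items()}

class NormScaler:
    # hypothetical: rescale to unit L2 norm, keep the norms for inversion
    def transform(self, data):
        self.norms = {k: np.linalg.norm(v, axis=1, keepdims=True)
                      for k, v in data.items()}
        return {k: v / self.norms[k] for k, v in data.items()}

    def inverse_transform(self, data):
        return {k: v * self.norms[k] for k, v in data.items()}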
Example #7
print("Fitting model")
# dataNew = model.fit_transform(similarities)
dataNew = model.fit_transform(dataNew)

dataNew = {
    "docs": dataNew[:len(data["docs"])].copy(),
    "queries": dataNew[len(data["docs"]):].copy(),
}

print(len(dataNew["docs"]))
print(len(dataNew["queries"]))

val_ip_pca = rprec_a_ip(dataNew["queries"],
                        dataNew["docs"],
                        data["relevancy"],
                        data["relevancy_articles"],
                        data["docs_articles"],
                        fast=False)
val_l2_pca = rprec_a_l2(dataNew["queries"],
                        dataNew["docs"],
                        data["relevancy"],
                        data["relevancy_articles"],
                        data["docs_articles"],
                        fast=False)
print(f"ip: {val_ip_pca:.4f}, l2: {val_l2_pca:.4f} (MDS)")

val_ip_pca = rprec_a_ip(data["queries"],
                        data["docs"],
                        data["relevancy"],
                        data["relevancy_articles"],
                        data["docs_articles"],
                        fast=False)
Example #8
def random_projection_performance(components, model_name, runs=3):
    if model_name == "gauss":
        Model = GaussianRandomProjection
    elif model_name == "sparse":
        Model = SparseRandomProjection
    elif model_name == "crop":
        Model = CropRandomProjection
    else:
        raise ValueError(f"Unknown model: {model_name}")

    random.seed(args.seed)
    vals_ip = []
    vals_l2 = []
    for i in range(runs):
        data = read_pickle(args.data)
        # take only dev queries
        data = sub_data(data, train=False, in_place=True)
        # make sure the vectors are np arrays
        data["queries"] = np.array(data["queries"])
        data["docs"] = np.array(data["docs"])

        model = Model(n_components=components,
                      random_state=random.randint(0, 2**8 - 1))
        model.fit(data["docs"])

        dataReduced = {
            "queries": safe_transform(model, data["queries"]),
            "docs": safe_transform(model, data["docs"])
        }
        del data["queries"]
        del data["docs"]

        if args.post_cn:
            dataReduced = center_data(dataReduced)
            dataReduced = norm_data(dataReduced)

        # copy to make it C-contiguous
        # (skipped)
        val_l2 = rprec_a_l2(
            dataReduced["queries"],
            dataReduced["docs"],
            data["relevancy"],
            data["relevancy_articles"],
            data["docs_articles"],
            report=False,
            fast=True,
        )
        vals_l2.append(val_l2)

        # skip IP computation because the vectors are normalized
        if not args.post_cn:
            val_ip = rprec_a_ip(
                dataReduced["queries"],
                dataReduced["docs"],
                data["relevancy"],
                data["relevancy_articles"],
                data["docs_articles"],
                report=False,
                fast=True,
            )
            vals_ip.append(val_ip)
        else:
            vals_ip.append(val_l2)

    logdata.append({
        "dim": components,
        "vals_ip": vals_ip,
        "vals_l2": vals_l2,
        "model": model_name
    })

    # continuously overwrite the file
    with open(args.logfile, "w") as f:
        f.write(str(logdata))
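Because the log is written with str(logdata) rather than json.dump, it can be read back with ast.literal_eval and summarized per dimension, for example as mean and standard deviation over the runs (a hypothetical post-processing step, assuming the stored values are plain Python floats):

import ast
import numpy as np

with open("rproj.log", "r") as f:  # hypothetical logfile path
    logdata = ast.literal_eval(f.read())

for entry in logdata:
    print(f'{entry["model"]} dim={entry["dim"]}: '
          f'l2 {np.mean(entry["vals_l2"]):.3f}+-{np.std(entry["vals_l2"]):.3f}, '
          f'ip {np.mean(entry["vals_ip"]):.3f}+-{np.std(entry["vals_ip"]):.3f}')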