Exemplo n.º 1
0
def test_transform_qid_posdocid_negdocid_with_negdoc(tmpdir, trec_index, dummy_collection_config):
    """EmbedText should map tokens known to stoi to their ids and everything else to 0.

    Uses the same document as both the positive and negative doc, so the
    posdoc/negdoc transformations must be identical.
    """
    collection = Collection(dummy_collection_config)
    pipeline_config = {
        "indexstops": True,
        "maxthreads": 1,
        "stemmer": "anserini",
        "bmax": 0.2,
        "k1max": 0.2,
        "maxqlen": 5,
        "maxdoclen": 10,
        "keepstops": True,
    }
    bm25_run = BM25Grid(trec_index, collection, os.path.join(tmpdir, "searcher"), pipeline_config)
    benchmark = Robust04Benchmark(bm25_run, collection, pipeline_config)
    feature = EmbedText(tmpdir, tmpdir, pipeline_config, index=trec_index, collection=collection, benchmark=benchmark)
    feature.stoi["dummy"] = 1
    feature.itos[1] = "dummy"
    # FIX: the original dict literal listed the key "LA010189-0001" twice;
    # a dict cannot hold duplicate keys, so the second entry silently
    # overwrote the first. One entry expresses the same fixture.
    feature.doc_id_to_doc_toks = {
        "LA010189-0001": ["dummy", "dummy", "dummy", "hello", "world", "greetings", "from", "outer", "space"],
    }

    transformed = feature.transform_qid_posdocid_negdocid("301", "LA010189-0001", "LA010189-0001")

    # stoi only knows about the word 'dummy'. So the transformation of every other word is set as 0
    assert transformed["qid"] == "301"
    assert transformed["posdocid"] == "LA010189-0001"
    assert transformed["negdocid"] == "LA010189-0001"
    assert numpy.array_equal(transformed["query"], [1, 0, 0, 0, 0])
    assert numpy.array_equal(transformed["posdoc"], [1, 1, 1, 0, 0, 0, 0, 0, 0, 0])
    assert numpy.array_equal(transformed["negdoc"], [1, 1, 1, 0, 0, 0, 0, 0, 0, 0])
    assert numpy.array_equal(transformed["query_idf"], [0, 0, 0, 0, 0])
Exemplo n.º 2
0
def test_build_from_benchmark(monkeypatch, tmpdir, trec_index,
                              dummy_collection_config):
    """Smoke test: DeepTileExtractor.build_from_benchmark runs end-to-end and
    produces the expected vocabulary (stoi/itos)."""
    coll = Collection(dummy_collection_config)
    config = {
        "indexstops": True, "maxthreads": 1, "stemmer": "anserini",
        "bmax": 0.2, "k1max": 0.2, "maxqlen": 5, "maxdoclen": 10,
        "keepstops": True, "rundocsonly": False, "datamode": "unigram",
        "passagelen": 3, "slicelen": 20, "tfchannel": True,
    }
    run = BM25Grid(trec_index, coll, os.path.join(tmpdir, "searcher"), config)
    run.create()
    folds = {"s1": {"train_qids": ["301"],
                    "predict": {"dev": ["301"], "test": ["301"]}}}
    benchmark = Robust04Benchmark(run, coll, config)
    benchmark.create_and_store_train_and_pred_pairs(folds)

    extractor = DeepTileExtractor(tmpdir, tmpdir, config, index=trec_index,
                                  collection=coll, benchmark=benchmark)

    # Avoid downloading real embeddings during the test: stub out the getter
    # so it returns an empty Magnitude object instead.
    monkeypatch.setattr(extractor, "get_magnitude_embeddings",
                        lambda *args, **kwargs: Magnitude(None))
    extractor.build_from_benchmark(True)

    vocab = ["<pad>", "dummy", "doc", "hello", "world",
             "greetings", "from", "outer", "space"]
    assert extractor.stoi == {word: idx for idx, word in enumerate(vocab)}
    assert extractor.itos == dict(enumerate(vocab))
Exemplo n.º 3
0
def test_transform_qid_posdocid_negdocid_only_posdoc(tmpdir, trec_index,
                                                     dummy_collection_config):
    """BagOfWords transform with only a posdoc: known tokens count into their
    stoi slots, unknown tokens accumulate in slot 0, and negdoc fields are None.
    """
    collection = Collection(dummy_collection_config)
    pipeline_config = {
        "indexstops": True,
        "maxthreads": 1,
        "stemmer": "anserini",
        "bmax": 0.2,
        "k1max": 0.2,
        "maxqlen": 5,
        "maxdoclen": 10,
        "keepstops": True,
        "datamode": "unigram",
    }
    bm25_run = BM25Grid(trec_index, collection,
                        os.path.join(tmpdir, "searcher"), pipeline_config)
    benchmark = Robust04Benchmark(bm25_run, collection, pipeline_config)
    feature = BagOfWords(tmpdir,
                         tmpdir,
                         pipeline_config,
                         index=trec_index,
                         collection=collection,
                         benchmark=benchmark)
    feature.stoi["dummy"] = 1
    feature.stoi["doc"] = 2
    feature.itos[1] = "dummy"
    feature.itos[2] = "doc"
    # FIX: the original dict literal listed the key "LA010189-0001" twice;
    # duplicate dict keys are silently collapsed, so one entry is equivalent.
    feature.doc_id_to_doc_toks = {
        "LA010189-0001": [
            "dummy", "dummy", "dummy", "hello", "world", "greetings", "from",
            "outer", "space"
        ],
    }
    transformed = feature.transform_qid_posdocid_negdocid(
        "301", "LA010189-0001")
    # stoi only knows about the words 'dummy' and 'doc'. So the transformation
    # of every other word is set as 0
    assert transformed["qid"] == "301"
    assert transformed["posdocid"] == "LA010189-0001"
    assert transformed["negdocid"] is None

    # Right now we have only 3 words in the vocabulary - "<pad>", "dummy" and "doc"
    assert numpy.array_equal(transformed["query"], [0, 1, 1])
    assert numpy.array_equal(transformed["posdoc"], [
        6, 3, 0
    ])  # There  are 6 unknown words in the doc, so all of them is encoded as 0
    assert numpy.array_equal(transformed["query_idf"], [0, 0, 0])

    # Learn another word
    feature.stoi["hello"] = 3
    feature.itos[3] = "hello"
    transformed = feature.transform_qid_posdocid_negdocid(
        "301", "LA010189-0001")
    # The posdoc transformation changes to reflect the new word
    assert numpy.array_equal(transformed["posdoc"], [5, 3, 0, 1])
Exemplo n.º 4
0
def test_cross_validated_ranking(trec_index, dummy_collection_config, tmpdir):
    """crossvalidated_ranking should assign a positive score to both dummy
    collection documents for query 301."""
    coll = Collection(dummy_collection_config)
    config = {
        "indexstops": True, "maxthreads": 1, "stemmer": "anserini",
        "bmax": 0.2, "k1max": 0.2,
    }

    run = BM25Grid(trec_index, coll, os.path.join(tmpdir, "searcher"), config)
    ranking = run.crossvalidated_ranking(["301"], ["301"])

    for doc_id in ("LA010189-0001", "LA010189-0002"):
        assert ranking["301"][doc_id] > 0
Exemplo n.º 5
0
def test_bm25grid_create(trec_index, dummy_collection_config, tmpdir):
    """BM25Grid.create() should produce a 'done' marker file in the searcher dir."""
    collection = Collection(dummy_collection_config)
    pipeline_config = {
        "indexstops": True,
        "maxthreads": 1,
        "stemmer": "anserini",
        "bmax": 0.2,
        "k1max": 0.2
    }

    bm25_run = BM25Grid(trec_index, collection,
                        os.path.join(tmpdir, "searcher"), pipeline_config)
    bm25_run.create()

    # Make sure that the searcher file is generated.
    # FIX: the original called os.path.isfile() and discarded the boolean,
    # so the check was a no-op; the result must be asserted.
    assert os.path.isfile(os.path.join(tmpdir, "searcher", "done"))
Exemplo n.º 6
0
def test_build_from_benchmark(monkeypatch, tmpdir, trec_index, dummy_collection_config):
    """EmbedText.build_from_benchmark should build the vocabulary and an
    embedding matrix with one row per vocab entry (pad row all zeros)."""
    coll = Collection(dummy_collection_config)
    config = {
        "indexstops": True, "maxthreads": 1, "stemmer": "anserini",
        "bmax": 0.2, "k1max": 0.2, "maxqlen": 5, "maxdoclen": 10,
        "keepstops": True, "rundocsonly": False, "reranker": "KNRM",
    }
    run = BM25Grid(trec_index, coll, os.path.join(tmpdir, "searcher"), config)
    run.create()
    folds = {"s1": {"train_qids": ["301"],
                    "predict": {"dev": ["301"], "test": ["301"]}}}
    benchmark = Robust04Benchmark(run, coll, config)
    benchmark.create_and_store_train_and_pred_pairs(folds)

    feature = EmbedText(tmpdir, tmpdir, config, index=trec_index,
                        collection=coll, benchmark=benchmark)
    # Prevent a real embedding download while the unit tests run: stub the
    # getter to return an empty Magnitude object.
    monkeypatch.setattr(feature, "get_magnitude_embeddings",
                        lambda *args, **kwargs: Magnitude(None))

    feature.build_from_benchmark("glove6b", True)
    vocab = ["<pad>", "dummy", "doc", "hello", "world",
             "greetings", "from", "outer", "space"]
    assert feature.stoi == {word: idx for idx, word in enumerate(vocab)}
    assert feature.itos == dict(enumerate(vocab))

    # Row 0 is the <pad> embedding and must be all zeros.
    assert numpy.array_equal(feature.embeddings[0], numpy.zeros(8))
    assert feature.embeddings.shape == (9, 8)
Exemplo n.º 7
0
def test_build_from_benchmark_with_trigram(monkeypatch, tmpdir, trec_index,
                                           dummy_collection_config):
    """In 'trigram' datamode, BagOfWords should build a vocabulary of
    character trigrams ('#' marks word boundaries) in corpus order."""
    coll = Collection(dummy_collection_config)
    config = {
        "indexstops": True, "maxthreads": 1, "stemmer": "anserini",
        "bmax": 0.2, "k1max": 0.2, "maxqlen": 5, "maxdoclen": 10,
        "keepstops": True, "rundocsonly": False, "datamode": "trigram",
    }
    run = BM25Grid(trec_index, coll, os.path.join(tmpdir, "searcher"), config)
    run.create()
    folds = {"s1": {"train_qids": ["301"],
                    "predict": {"dev": ["301"], "test": ["301"]}}}
    benchmark = Robust04Benchmark(run, coll, config)
    benchmark.create_and_store_train_and_pred_pairs(folds)

    feature = BagOfWords(tmpdir, tmpdir, config, index=trec_index,
                         collection=coll, benchmark=benchmark)
    feature.build_from_benchmark(True)

    # Character trigrams for: dummy, doc, hello, world, greetings, from,
    # outer, space — listed in vocabulary insertion order.
    trigrams = [
        "#du", "dum", "umm", "mmy", "my#",
        "#do", "doc", "oc#",
        "#he", "hel", "ell", "llo", "lo#",
        "#wo", "wor", "orl", "rld", "ld#",
        "#gr", "gre", "ree", "eet", "eti", "tin", "ing", "ngs", "gs#",
        "#fr", "fro", "rom", "om#",
        "#ou", "out", "ute", "ter", "er#",
        "#sp", "spa", "pac", "ace", "ce#",
    ]
    expected = {"<pad>": 0}
    expected.update((tri, idx) for idx, tri in enumerate(trigrams, start=1))

    assert feature.stoi == expected
    assert feature.itos == {idx: tri for tri, idx in expected.items()}
Exemplo n.º 8
0
def test_build_from_benchmark(monkeypatch, tmpdir, trec_index,
                              dummy_collection_config):
    """BagOfWords in 'unigram' datamode: stoi, itos and embeddings must all
    reflect the same word->id mapping over the dummy corpus."""
    coll = Collection(dummy_collection_config)
    config = {
        "indexstops": True, "maxthreads": 1, "stemmer": "anserini",
        "bmax": 0.2, "k1max": 0.2, "maxqlen": 5, "maxdoclen": 10,
        "keepstops": True, "rundocsonly": False, "datamode": "unigram",
    }
    run = BM25Grid(trec_index, coll, os.path.join(tmpdir, "searcher"), config)
    run.create()
    folds = {"s1": {"train_qids": ["301"],
                    "predict": {"dev": ["301"], "test": ["301"]}}}
    benchmark = Robust04Benchmark(run, coll, config)
    benchmark.create_and_store_train_and_pred_pairs(folds)

    feature = BagOfWords(tmpdir, tmpdir, config, index=trec_index,
                         collection=coll, benchmark=benchmark)
    feature.build_from_benchmark(True)

    vocab = ["<pad>", "dummy", "doc", "hello", "world",
             "greetings", "from", "outer", "space"]
    expected = {word: idx for idx, word in enumerate(vocab)}

    assert feature.stoi == expected
    assert feature.itos == {idx: word for word, idx in expected.items()}
    # For BagOfWords the "embeddings" attribute is the same word->id map
    # rather than a dense matrix.
    assert feature.embeddings == expected
Exemplo n.º 9
0
def test_transform_qid_posdocid_negdocid(monkeypatch, tmpdir, trec_index,
                                         dummy_collection_config):
    """BertText transform: when the same doc is used as both pos and neg doc,
    every pos* field must equal its neg* counterpart, with the expected
    WordPiece token ids, masks and segment ids."""
    coll = Collection(dummy_collection_config)
    config = {
        "indexstops": True, "maxthreads": 1, "stemmer": "anserini",
        "bmax": 0.2, "k1max": 0.2, "maxqlen": 5, "maxdoclen": 10,
        "keepstops": True, "rundocsonly": False,
    }
    run = BM25Grid(trec_index, coll, os.path.join(tmpdir, "searcher"), config)
    run.create()
    folds = {"s1": {"train_qids": ["301"],
                    "predict": {"dev": ["301"], "test": ["301"]}}}
    benchmark = Robust04Benchmark(run, coll, config)
    benchmark.create_and_store_train_and_pred_pairs(folds)

    feature = BertText(tmpdir, tmpdir, config, index=trec_index,
                       collection=coll, benchmark=benchmark)
    feature.build_from_benchmark()
    transformed = feature.transform_qid_posdocid_negdocid(
        "301", "LA010189-0001", "LA010189-0001")

    # The positive and negative documents are identical, so one set of
    # expectations covers both sides. Layout: [CLS] query(5, padded) [SEP]
    # doc(10) [SEP] = 18 positions.
    expected_toks = [101, 24369, 9986, 0, 0, 0, 102, 24369, 24369, 24369,
                     7592, 2088, 1010, 14806, 2015, 2013, 6058, 102]
    expected_mask = [1, 1, 1, 0, 0, 0] + [1] * 12
    expected_segs = [0] * 7 + [1] * 11
    expected_qmask = [1, 1, 0, 0, 0]
    expected_dmask = [1] * 10

    for side in ("pos", "neg"):
        assert np.array_equal(transformed[side + "toks"], expected_toks)
        assert np.array_equal(transformed[side + "mask"], expected_mask)
        assert np.array_equal(transformed[side + "segs"], expected_segs)
        assert np.array_equal(transformed[side + "qmask"], expected_qmask)
        assert np.array_equal(transformed[side + "dmask"], expected_dmask)

    assert transformed["posdocid"] == "LA010189-0001"
    assert transformed["negdocid"] == "LA010189-0001"
    assert transformed["qid"] == "301"