Example #1
def test_tokenize_text(trec_index, tmpdir):
    toks_list = [["to", "be", "or", "not", "to", "be"]]
    feature = BagOfWords(tmpdir,
                         tmpdir, {"datamode": "unigram"},
                         index=trec_index)
    feature.build_stoi(toks_list, True, False)
    assert feature.stoi == {"<pad>": 0, "to": 1, "be": 2, "or": 3, "not": 4}

    assert feature.idf == {}
Example #2
def test_tokenize_text_with_calculate_idf(dummy_collection_config, trec_index,
                                          tmpdir):
    toks_list = [["to", "be", "or", "not", "to", "be"]]
    feature = BagOfWords(tmpdir,
                         tmpdir, {"datamode": "unigram"},
                         index=trec_index)
    feature.build_stoi(toks_list, True, True)
    assert feature.stoi == {"<pad>": 0, "to": 1, "be": 2, "or": 3, "not": 4}

    assert feature.idf == {
        "be": 1.791759469228055,
        "not": 1.791759469228055,
        "or": 1.791759469228055,
        "to": 1.791759469228055
    }
Example #3
def test_bagofwords_create(monkeypatch, tmpdir, dummy_index):
    benchmark = DummyBenchmark({})
    extractor = BagOfWords(
        {
            "name": "bagofwords",
            "datamode": "unigram",
            "maxqlen": 4,
            "maxdoclen": 800,
            "usecache": False
        },
        provide={
            "index": dummy_index,
            "benchmark": benchmark
        },
    )
    extractor.preprocess(["301"], ["LA010189-0001", "LA010189-0002"],
                         benchmark.topics["title"])
    assert extractor.stoi == {
        "<pad>": 0,
        "dummy": 1,
        "doc": 2,
        "hello": 3,
        "world": 4,
        "greetings": 5,
        "from": 6,
        "outer": 7,
        "space": 8,
        "lessdummy": 9,
    }

    assert extractor.itos == {v: k for k, v in extractor.stoi.items()}
    assert extractor.embeddings == {
        "<pad>": 0,
        "dummy": 1,
        "doc": 2,
        "hello": 3,
        "world": 4,
        "greetings": 5,
        "from": 6,
        "outer": 7,
        "space": 8,
        "lessdummy": 9,
    }
Example #4
def test_tokenize_text_trigram(trec_index, tmpdir):
    toks_list = [["to", "be", "or", "not", "to", "be"]]
    feature = BagOfWords(tmpdir,
                         tmpdir, {"datamode": "trigram"},
                         index=trec_index)
    feature.build_stoi(toks_list, True, False)

    # trigrams would be: ['#to', 'to#', '#be', 'be#', '#or', 'or#', '#no', 'not', 'ot#']
    assert feature.stoi == {
        "<pad>": 0,
        "#to": 1,
        "to#": 2,
        "#be": 3,
        "be#": 4,
        "#or": 5,
        "or#": 6,
        "#no": 7,
        "not": 8,
        "ot#": 9
    }

    assert feature.idf == {}
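
The trigram assertions above follow a simple pattern: each token is wrapped in "#" boundary markers and split into overlapping character trigrams. A minimal sketch of that decomposition (a hypothetical helper for illustration, not part of the BagOfWords API):

def char_trigrams(token):
    # Wrap the token in '#' boundary markers, then slide a window of width 3.
    padded = f"#{token}#"
    return [padded[i:i + 3] for i in range(len(padded) - 2)]

assert char_trigrams("to") == ["#to", "to#"]
assert char_trigrams("dummy") == ["#du", "dum", "umm", "mmy", "my#"]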
Example #5
def test_transform_qid_posdocid_negdocid_with_negdoc(tmpdir, trec_index,
                                                     dummy_collection_config):
    collection = Collection(dummy_collection_config)
    pipeline_config = {
        "indexstops": True,
        "maxthreads": 1,
        "stemmer": "anserini",
        "bmax": 0.2,
        "k1max": 0.2,
        "maxqlen": 5,
        "maxdoclen": 10,
        "keepstops": True,
        "datamode": "unigram",
    }
    bm25_run = BM25Grid(trec_index, collection,
                        os.path.join(tmpdir, "searcher"), pipeline_config)
    benchmark = Robust04Benchmark(bm25_run, collection, pipeline_config)
    feature = BagOfWords(tmpdir,
                         tmpdir,
                         pipeline_config,
                         index=trec_index,
                         collection=collection,
                         benchmark=benchmark)
    feature.stoi["dummy"] = 1
    feature.stoi["doc"] = 2
    feature.itos[1] = "dummy"
    feature.itos[2] = "doc"
    feature.doc_id_to_doc_toks = {
        "LA010189-0001": [
            "dummy", "dummy", "dummy", "hello", "world", "greetings", "from",
            "outer", "space"
        ],
        "LA010189-0002": [
            "dummy", "dummy", "dummy", "hello", "world", "greetings", "from",
            "outer", "space"
        ],
    }
    transformed = feature.transform_qid_posdocid_negdocid(
        "301", "LA010189-0001", "LA010189-0001")
    # stoi only knows the words 'dummy' and 'doc', so every other word maps to index 0

    assert transformed["qid"] == "301"
    assert transformed["posdocid"] == "LA010189-0001"
    assert transformed["negdocid"] == "LA010189-0001"
    assert numpy.array_equal(transformed["query"], [0, 1, 1])
    assert numpy.array_equal(transformed["posdoc"], [6, 3, 0])
    assert numpy.array_equal(transformed["negdoc"], [6, 3, 0])
    assert numpy.array_equal(transformed["query_idf"], [0, 0, 0])
Example #6
def test_bagofwords_caching(dummy_index, monkeypatch):
    def fake_magnitude_embedding(*args, **kwargs):
        return Magnitude(None)

    monkeypatch.setattr(SlowEmbedText, "_load_pretrained_embeddings",
                        fake_magnitude_embedding)

    benchmark = DummyBenchmark()
    extractor_cfg = {
        "name": "bagofwords",
        "datamode": "trigram",
        "maxqlen": 4,
        "maxdoclen": 800,
        "usecache": True
    }
    extractor = BagOfWords(extractor_cfg,
                           provide={
                               "index": dummy_index,
                               "benchmark": benchmark
                           })

    qids = list(benchmark.qrels.keys())  # ["301"]
    qid = qids[0]
    docids = list(benchmark.qrels[qid].keys())

    assert not extractor.is_state_cached(qids, docids)

    extractor.preprocess(qids, docids, benchmark.topics[benchmark.query_type])

    assert extractor.is_state_cached(qids, docids)

    new_extractor = BagOfWords(extractor_cfg,
                               provide={
                                   "index": dummy_index,
                                   "benchmark": benchmark
                               })

    assert new_extractor.is_state_cached(qids, docids)
    new_extractor._build_vocab(qids, docids,
                               benchmark.topics[benchmark.query_type])
Example #7
def test_bagofwords_id2vec_trigram(tmpdir, dummy_index):
    benchmark = DummyBenchmark({})
    tok_cfg = {"name": "anserini", "keepstops": True, "stemmer": "none"}
    tokenizer = AnseriniTokenizer(tok_cfg)
    extractor = BagOfWords(
        {
            "name": "bagofwords",
            "datamode": "trigram",
            "maxqlen": 4,
            "maxdoclen": 800,
            "usecache": False
        },
        provide={
            "index": dummy_index,
            "tokenizer": tokenizer,
            "benchmark": benchmark
        },
    )
    extractor.stoi = {extractor.pad_tok: extractor.pad}
    extractor.itos = {extractor.pad: extractor.pad_tok}
    extractor.idf = defaultdict(lambda: 0)
    # extractor.preprocess(["301"], ["LA010189-0001", "LA010189-0002"], benchmark.topics["title"])

    extractor.qid2toks = {"301": ["dummy", "doc"]}
    extractor.docid2toks = {
        "LA010189-0001": [
            "dummy", "dummy", "dummy", "hello", "world", "greetings", "from",
            "outer", "space"
        ],
        "LA010189-0002": [
            "dummy", "dummy", "dummy", "hello", "world", "greetings", "from",
            "outer", "space"
        ],
    }
    extractor.stoi["#du"] = 1
    extractor.stoi["dum"] = 2
    extractor.stoi["umm"] = 3
    extractor.itos[1] = "#du"
    extractor.itos[2] = "dum"
    extractor.itos[3] = "umm"
    transformed = extractor.id2vec("301", "LA010189-0001")

    # stoi only knows the first three trigrams of 'dummy', so every other trigram maps to index 0
    assert transformed["qid"] == "301"
    assert transformed["posdocid"] == "LA010189-0001"
    assert transformed.get("negdocid") is None

    # Right now the vocabulary contains only "<pad>" and the trigrams "#du", "dum", and "umm"
    assert np.array_equal(transformed["query"], [5, 1, 1, 1])
    # Trigrams that are not in stoi are all counted at index 0 (the <pad> slot)
    assert np.array_equal(transformed["posdoc"], [39, 3, 3, 3])
    assert np.array_equal(transformed["query_idf"], [0, 0, 0, 0])

    # Learn another word
    extractor.stoi["mmy"] = 4
    extractor.stoi["my#"] = 5
    extractor.stoi["#he"] = 6
    extractor.itos[4] = "mmy"
    extractor.itos[5] = "my#"
    extractor.itos[6] = "#he"

    transformed = extractor.id2vec("301", "LA010189-0001")
    # The posdoc transformation changes to reflect the new word
    assert np.array_equal(transformed["posdoc"], [32, 3, 3, 3, 3, 3, 1])
Example #8
def test_bagofwords_id2vec(tmpdir, dummy_index):
    benchmark = DummyBenchmark({})
    tok_cfg = {"name": "anserini", "keepstops": True, "stemmer": "none"}
    tokenizer = AnseriniTokenizer(tok_cfg)
    extractor = BagOfWords(
        {
            "name": "bagofwords",
            "datamode": "unigram",
            "maxqlen": 4,
            "maxdoclen": 800,
            "usecache": False
        },
        provide={
            "index": dummy_index,
            "tokenizer": tokenizer,
            "benchmark": benchmark
        },
    )
    extractor.stoi = {extractor.pad_tok: extractor.pad}
    extractor.itos = {extractor.pad: extractor.pad_tok}
    extractor.idf = defaultdict(lambda: 0)
    # extractor.preprocess(["301"], ["LA010189-0001", "LA010189-0002"], benchmark.topics["title"])

    extractor.qid2toks = {"301": ["dummy", "doc"]}
    extractor.stoi["dummy"] = 1
    extractor.stoi["doc"] = 2
    extractor.itos[1] = "dummy"
    extractor.itos[2] = "doc"
    extractor.docid2toks = {
        "LA010189-0001": [
            "dummy", "dummy", "dummy", "hello", "world", "greetings", "from",
            "outer", "space"
        ],
        "LA010189-0002": [
            "dummy", "dummy", "dummy", "hello", "world", "greetings", "from",
            "outer", "space"
        ],
    }
    transformed = extractor.id2vec("301", "LA010189-0001", "LA010189-0001")
    # stoi only knows the words 'dummy' and 'doc', so every other word maps to index 0

    assert transformed["qid"] == "301"
    assert transformed["posdocid"] == "LA010189-0001"
    assert transformed["negdocid"] == "LA010189-0001"
    assert np.array_equal(transformed["query"], [0, 1, 1])
    assert np.array_equal(transformed["posdoc"], [6, 3, 0])
    assert np.array_equal(transformed["negdoc"], [6, 3, 0])
    assert np.array_equal(transformed["query_idf"], [0, 0, 0])
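
The vectors asserted in the two id2vec examples above can be reproduced by counting tokens per vocabulary index, assuming (as the comments suggest) that tokens missing from stoi are bucketed at index 0, the "<pad>" slot. A minimal sketch of that counting (a hypothetical helper, not the extractor's actual method):

from collections import Counter

def bow_counts(toks, stoi):
    # Map each token to its stoi index (0 for unknown tokens) and count occurrences.
    counts = Counter(stoi.get(tok, 0) for tok in toks)
    return [counts.get(i, 0) for i in range(len(stoi))]

stoi = {"<pad>": 0, "dummy": 1, "doc": 2}
doc_toks = ["dummy", "dummy", "dummy", "hello", "world",
            "greetings", "from", "outer", "space"]
assert bow_counts(["dummy", "doc"], stoi) == [0, 1, 1]  # matches transformed["query"]
assert bow_counts(doc_toks, stoi) == [6, 3, 0]          # matches transformed["posdoc"]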
Example #9
def test_bagofwords_create_trigrams(monkeypatch, tmpdir, dummy_index):
    benchmark = DummyBenchmark({})
    tok_cfg = {"name": "anserini", "keepstops": True, "stemmer": "none"}
    tokenizer = AnseriniTokenizer(tok_cfg)
    extractor = BagOfWords(
        {
            "name": "bagofwords",
            "datamode": "trigram",
            "maxqlen": 4,
            "maxdoclen": 800,
            "usecache": False
        },
        provide={
            "index": dummy_index,
            "tokenizer": tokenizer,
            "benchmark": benchmark
        },
    )
    extractor.preprocess(["301"], ["LA010189-0001", "LA010189-0002"],
                         benchmark.topics["title"])
    assert extractor.stoi == {
        "<pad>": 0,
        "#du": 1,
        "dum": 2,
        "umm": 3,
        "mmy": 4,
        "my#": 5,
        "#do": 6,
        "doc": 7,
        "oc#": 8,
        "#he": 9,
        "hel": 10,
        "ell": 11,
        "llo": 12,
        "lo#": 13,
        "#wo": 14,
        "wor": 15,
        "orl": 16,
        "rld": 17,
        "ld#": 18,
        "#gr": 19,
        "gre": 20,
        "ree": 21,
        "eet": 22,
        "eti": 23,
        "tin": 24,
        "ing": 25,
        "ngs": 26,
        "gs#": 27,
        "#fr": 28,
        "fro": 29,
        "rom": 30,
        "om#": 31,
        "#ou": 32,
        "out": 33,
        "ute": 34,
        "ter": 35,
        "er#": 36,
        "#sp": 37,
        "spa": 38,
        "pac": 39,
        "ace": 40,
        "ce#": 41,
        "#le": 42,
        "les": 43,
        "ess": 44,
        "ssd": 45,
        "sdu": 46,
    }

    assert extractor.itos == {v: k for k, v in extractor.stoi.items()}
Example #10
def test_transform_qid_posdocid_negdocid_only_posdoc(tmpdir, trec_index,
                                                     dummy_collection_config):
    collection = Collection(dummy_collection_config)
    pipeline_config = {
        "indexstops": True,
        "maxthreads": 1,
        "stemmer": "anserini",
        "bmax": 0.2,
        "k1max": 0.2,
        "maxqlen": 5,
        "maxdoclen": 10,
        "keepstops": True,
        "datamode": "unigram",
    }
    bm25_run = BM25Grid(trec_index, collection,
                        os.path.join(tmpdir, "searcher"), pipeline_config)
    benchmark = Robust04Benchmark(bm25_run, collection, pipeline_config)
    feature = BagOfWords(tmpdir,
                         tmpdir,
                         pipeline_config,
                         index=trec_index,
                         collection=collection,
                         benchmark=benchmark)
    feature.stoi["dummy"] = 1
    feature.stoi["doc"] = 2
    feature.itos[1] = "dummy"
    feature.itos[2] = "doc"
    feature.doc_id_to_doc_toks = {
        "LA010189-0001": [
            "dummy", "dummy", "dummy", "hello", "world", "greetings", "from",
            "outer", "space"
        ],
        "LA010189-0002": [
            "dummy", "dummy", "dummy", "hello", "world", "greetings", "from",
            "outer", "space"
        ],
    }
    transformed = feature.transform_qid_posdocid_negdocid(
        "301", "LA010189-0001")
    # stoi only knows the words 'dummy' and 'doc', so every other word maps to index 0
    assert transformed["qid"] == "301"
    assert transformed["posdocid"] == "LA010189-0001"
    assert transformed["negdocid"] is None

    # Right now we have only 3 words in the vocabulary: "<pad>", "dummy", and "doc"
    assert numpy.array_equal(transformed["query"], [0, 1, 1])
    assert numpy.array_equal(transformed["posdoc"], [
        6, 3, 0
    ])  # There  are 6 unknown words in the doc, so all of them is encoded as 0
    assert numpy.array_equal(transformed["query_idf"], [0, 0, 0])

    # Learn another word
    feature.stoi["hello"] = 3
    feature.itos[3] = "hello"
    transformed = feature.transform_qid_posdocid_negdocid(
        "301", "LA010189-0001")
    # The posdoc transformation changes to reflect the new word
    assert numpy.array_equal(transformed["posdoc"], [5, 3, 0, 1])
Example #11
def test_build_from_benchmark_with_trigram(monkeypatch, tmpdir, trec_index,
                                           dummy_collection_config):
    collection = Collection(dummy_collection_config)
    pipeline_config = {
        "indexstops": True,
        "maxthreads": 1,
        "stemmer": "anserini",
        "bmax": 0.2,
        "k1max": 0.2,
        "maxqlen": 5,
        "maxdoclen": 10,
        "keepstops": True,
        "rundocsonly": False,
        "datamode": "trigram",
    }
    bm25_run = BM25Grid(trec_index, collection,
                        os.path.join(tmpdir, "searcher"), pipeline_config)
    bm25_run.create()
    folds = {
        "s1": {
            "train_qids": ["301"],
            "predict": {
                "dev": ["301"],
                "test": ["301"]
            }
        }
    }
    benchmark = Robust04Benchmark(bm25_run, collection, pipeline_config)
    benchmark.create_and_store_train_and_pred_pairs(folds)

    feature = BagOfWords(tmpdir,
                         tmpdir,
                         pipeline_config,
                         index=trec_index,
                         collection=collection,
                         benchmark=benchmark)

    feature.build_from_benchmark(True)
    assert feature.stoi == {
        "<pad>": 0,
        "#du": 1,
        "dum": 2,
        "umm": 3,
        "mmy": 4,
        "my#": 5,
        "#do": 6,
        "doc": 7,
        "oc#": 8,
        "#he": 9,
        "hel": 10,
        "ell": 11,
        "llo": 12,
        "lo#": 13,
        "#wo": 14,
        "wor": 15,
        "orl": 16,
        "rld": 17,
        "ld#": 18,
        "#gr": 19,
        "gre": 20,
        "ree": 21,
        "eet": 22,
        "eti": 23,
        "tin": 24,
        "ing": 25,
        "ngs": 26,
        "gs#": 27,
        "#fr": 28,
        "fro": 29,
        "rom": 30,
        "om#": 31,
        "#ou": 32,
        "out": 33,
        "ute": 34,
        "ter": 35,
        "er#": 36,
        "#sp": 37,
        "spa": 38,
        "pac": 39,
        "ace": 40,
        "ce#": 41,
    }

    assert feature.itos == {v: k for k, v in feature.stoi.items()}
Example #12
def test_build_from_benchmark(monkeypatch, tmpdir, trec_index,
                              dummy_collection_config):
    collection = Collection(dummy_collection_config)
    pipeline_config = {
        "indexstops": True,
        "maxthreads": 1,
        "stemmer": "anserini",
        "bmax": 0.2,
        "k1max": 0.2,
        "maxqlen": 5,
        "maxdoclen": 10,
        "keepstops": True,
        "rundocsonly": False,
        "datamode": "unigram",
    }
    bm25_run = BM25Grid(trec_index, collection,
                        os.path.join(tmpdir, "searcher"), pipeline_config)
    bm25_run.create()
    folds = {
        "s1": {
            "train_qids": ["301"],
            "predict": {
                "dev": ["301"],
                "test": ["301"]
            }
        }
    }
    benchmark = Robust04Benchmark(bm25_run, collection, pipeline_config)
    benchmark.create_and_store_train_and_pred_pairs(folds)

    feature = BagOfWords(tmpdir,
                         tmpdir,
                         pipeline_config,
                         index=trec_index,
                         collection=collection,
                         benchmark=benchmark)

    feature.build_from_benchmark(True)
    assert feature.stoi == {
        "<pad>": 0,
        "dummy": 1,
        "doc": 2,
        "hello": 3,
        "world": 4,
        "greetings": 5,
        "from": 6,
        "outer": 7,
        "space": 8,
    }

    assert feature.itos == {v: k for k, v in feature.stoi.items()}
    assert feature.embeddings == {
        "<pad>": 0,
        "dummy": 1,
        "doc": 2,
        "hello": 3,
        "world": 4,
        "greetings": 5,
        "from": 6,
        "outer": 7,
        "space": 8,
    }