def test_word2vec():
    """Word-centroid retrieval should rank document 0 first for 'dog'."""
    # Train a throwaway embedding on the whitespace-split documents.
    tokenised = [text.split() for text in documents]
    embedding = Word2Vec(tokenised, iter=1, min_count=1)

    # The retrieval model is built from the keyed vectors (``.wv``).
    scorer = WordCentroidDistance(embedding.wv)
    engine = Retrieval(scorer, matching=Matching())
    engine.fit(documents)

    ranking = engine.query('dog')
    assert ranking[0] == 0
def test_retrieval():
    """Querying a Retrieval fitted with custom ids returns those ids."""
    doc_ids = ['fox_example', 'lazy_example']

    engine = Retrieval(Tfidf())
    engine.fit(documents, doc_ids)

    ranking = engine.query('fox')
    # The top two hits are expected in exactly this order.
    assert ranking[0] == 'fox_example'
    assert ranking[1] == 'lazy_example'
def test_doc2vec_inference():
    """Doc2Vec-based retrieval should rank document 1 first."""
    # Tag every preprocessed document with its position in ``documents``.
    corpus = []
    for index, text in enumerate(documents):
        corpus.append(TaggedDocument(simple_preprocess(text), [index]))

    doc_model = Doc2Vec(corpus, epochs=1, min_count=1)
    inference = Doc2VecInference(doc_model, DEFAULT_ANALYZER)

    # ``fit`` is chained, so the test uses whatever object it returns.
    engine = Retrieval(inference, matching=Matching()).fit(documents)

    ranking = engine.query("scientists")
    assert ranking[0] == 1
def test_doc2vec_inference():
    """Check that the query 'scientists' retrieves document 1 via Doc2Vec."""

    def tag(position, text):
        # One TaggedDocument per input text, tagged with its list position.
        return TaggedDocument(simple_preprocess(text), [position])

    training_set = [tag(pos, text) for pos, text in enumerate(documents)]
    trained = Doc2Vec(training_set, epochs=1, min_count=1)
    retrieval_model = Doc2VecInference(trained, DEFAULT_ANALYZER)
    matcher = Matching()

    pipeline = Retrieval(retrieval_model, matching=matcher)
    # Keep the return value of ``fit``: that is the object the test queries.
    pipeline = pipeline.fit(documents)

    hits = pipeline.query("scientists")
    assert hits[0] == 1
def test_word2vec():
    """WordCentroidDistance rejects a full model but works with its ``.wv``.

    Passing the Word2Vec model itself must raise ``ValueError``; passing
    ``model.wv`` must yield a retrieval that ranks document 0 first for
    the query 'dog'.
    """
    sentences = [text.split() for text in documents]
    embedding = Word2Vec(sentences, iter=1, min_count=1)
    matcher = Matching()

    # The constructor is expected to raise, so its result is never bound.
    with pytest.raises(ValueError):
        WordCentroidDistance(embedding)

    scorer = WordCentroidDistance(embedding.wv)
    engine = Retrieval(scorer, matching=matcher)
    engine.fit(documents)

    top_hits = engine.query('dog')
    assert top_hits[0] == 0
def test_reddit_wcd_idf():
    """Interactive query loop over pre-trained word vectors.

    Loads vectors from disk, fits a word-centroid retrieval on
    ``documents`` and then prints scored results for each query typed on
    stdin until the user enters ``exit``.
    """
    # NOTE(review): reads from stdin, so this presumably is meant to be run
    # by hand rather than by an automated test run -- confirm.

    # Point this at your local .vector model file.
    vectors = KeyedVectors.load_word2vec_format("model/reddit.en.text.vector")

    scorer = WordCentroidDistance(vectors.wv)
    engine = Retrieval(scorer)
    engine.fit(documents)

    # iter(callable, sentinel) keeps prompting until the input equals "exit".
    for query in iter(lambda: input("Please enter the query:\n"), "exit"):
        print(engine.query(query, return_scores=True))
def test_expansion_inside_retrieval():
    """Integration test: query expansion inside the full pipeline."""
    sentences = [text.split() for text in DOCUMENTS]
    embedding = Word2Vec(sentences, iter=1, min_count=1)

    # Expand each query with two additional terms from the embedding.
    expander = EmbeddedQueryExpansion(embedding.wv, m=2)

    engine = Retrieval(
        Tfidf(),  # the retrieval model
        matching=Matching(),
        query_expansion=expander,
    )
    engine.fit(DOCUMENTS, ['fox_ex', 'surf_ex'])

    ranking = engine.query('surfing surfers do surf green')
    assert ranking[0] == 'surf_ex'
def test_combined():
    """A combined ``wcd + tfidf ** 2`` model should return the right label."""
    sentences = [text.split() for text in documents]
    embedding = Word2Vec(sentences, iter=1, min_count=1)

    centroid = WordCentroidDistance(embedding.wv)
    term_weights = Tfidf()

    # The two models can operate on different fields, so each one is
    # fitted on its own input.
    centroid.fit(documents)
    term_weights.fit(['fox', 'scientists'])

    matcher = Matching().fit(documents)
    engine = Retrieval(
        centroid + term_weights ** 2,
        matching=matcher,
        labels=[7, 42],
    )

    # Each query must surface the label of its matching document first.
    for query, expected_label in (('fox', 7), ('scientists', 42)):
        ranking = engine.query(query)
        assert ranking[0] == expected_label
def test_expansion_inside_retrieval():
    """Run a tf-idf retrieval with embedded query expansion end to end.

    The query 'surfing surfers do surf green' is expected to rank the
    document labelled 'surf_ex' first.
    """
    expansion_count = 2
    word_model = Word2Vec(
        [line.split() for line in DOCUMENTS],
        iter=1,
        min_count=1,
    )

    base_model = Tfidf()
    matcher = Matching()
    query_expander = EmbeddedQueryExpansion(word_model.wv, m=expansion_count)

    pipeline_options = {
        'matching': matcher,
        'query_expansion': query_expander,
    }
    pipeline = Retrieval(base_model, **pipeline_options)

    labels = ['fox_ex', 'surf_ex']
    pipeline.fit(DOCUMENTS, labels)

    hits = pipeline.query('surfing surfers do surf green')
    assert hits[0] == 'surf_ex'
def test_combined():
    """Combine word-centroid and squared tf-idf models in one retrieval.

    With labels ``[7, 42]``, the query 'fox' must return 7 first and the
    query 'scientists' must return 42 first.
    """
    word_model = Word2Vec(
        [line.split() for line in documents],
        iter=1,
        min_count=1,
    )
    wcd_model = WordCentroidDistance(word_model.wv)
    tfidf_model = Tfidf()

    wcd_model.fit(documents)
    # The component models may be fitted on different fields.
    tfidf_model.fit(['fox', 'scientists'])

    fitted_matching = Matching().fit(documents)

    squared_tfidf = tfidf_model ** 2
    mixture = wcd_model + squared_tfidf
    doc_labels = [7, 42]
    pipeline = Retrieval(mixture, matching=fitted_matching, labels=doc_labels)

    fox_hits = pipeline.query('fox')
    assert fox_hits[0] == 7

    scientist_hits = pipeline.query('scientists')
    assert scientist_hits[0] == 42