Example #1
def test_build_index2(mock_file):
    """ Check if we can build index from string with repeated word. """

    my_index = InvertedIndex()
    my_index.build(mock_file)

    assert my_index.index_data == {'test': [1], 'me': [1], 'first': [1]}
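
These build/query tests pass a mock_file fixture into build() and, in the load tests further down, also assert on mock_file.mock_calls, but the fixture itself is never shown. A minimal sketch of one plausible shape, assuming pytest and unittest.mock.mock_open (the read_data contents here are hypothetical):

import pytest
from unittest import mock

@pytest.fixture
def mock_file():
    # Patch builtins.open so build() and load() read this fake file.
    # Assumed document format: "<doc_id>\t<words ...>" per line.
    fake = mock.mock_open(read_data="1\ttest me first\n")
    with mock.patch("builtins.open", fake):
        yield fake

Because the patched open() records every call, tests can later check len(mock_file.mock_calls) > 1 after load().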
Example #2
def test_add_new_doc_one_word():
    doc_id = 1
    content = 'foo'
    foo_index = InvertedIndex()
    foo_index.add_new_document(doc_id, content)
    assert doc_id in foo_index.inverted_index[content], (
        "added a new document with one word but couldn't find it in the built index")
Example #3
def test_query2(mock_file):
    """ Check if we can query some correct words with repeats. """

    my_index = InvertedIndex()
    my_index.build(mock_file)

    assert my_index.query(['string', 'long']) == {1, 10, 100500}
Example #4
    def test_get_index_data(self):
        data_source = DataSource(self.books_file)
        inverted_index = InvertedIndex(self.index_file)
        inverted_index.get_inverted_index(data_source.read_file())
        # pickle files must be opened in binary mode
        with open(inverted_index.file_name, 'rb') as f:
            keywords_dict = pickle.load(f)
        self.assertEqual(keywords_dict[self.keyword], ['B000UZNREG'])
Example #5
class IndexTrainer(object):

	def __init__(self):
		self.index = InvertedIndex()
		self.bow = Bow()
		self.extractor = Extractor('surf')
		print(self.index.author)
		print(self.index.description)

	def load_feature(self, path='../models/feature.npy'):
		self.features = np.load(path)
		if len(self.features) > 200000:
			self.features = self.features[:200000]
		print "feature shape: ", self.features.shape
		return self.features

	def run(self, path):
		self.bow.load()
		self.index.reset(self.bow.centers)
		images = imutil.get_list_image(path)
		t = imutil.Timer(1)
		t.tic()
		for i,image in enumerate(images):
			descriptors = self.extractor.extract(image)
			self.index.append(image, descriptors)
			if (i+1)%1000 == 0:
				t.toc('finish 1000 images: ')
				t.tic()
Example #6
def test_query_2_intersect_words(words=['bow', 'tfidf']):
    inv_idx = InvertedIndex(TEST_INDEX_TABLE)
    doc_ids = inv_idx.query(words)
    right_answer = {4}
    assert doc_ids == right_answer
Example #7
def test_many_queries_not_in_one_article(query):
    index = InvertedIndex()
    article_id = '12'
    with open(ONE_ARTICLE_PATH, 'r') as fd:
        index.build(fd)
    assert article_id not in index.find_articles(query), \
        "found an article for a query whose words are not in the article"
Example #8
def test_build_index(mock_file):
    """ Check if we can build index from simple string. """

    my_index = InvertedIndex()
    my_index.build(mock_file)

    assert my_index.index_data == {'test': [1], 'me': [1], 'first': [1]}
Example #9
    def test_single_element(self):
        c = [('http://test.net', 'Simple text')]
        ii = II(c)
        ii.create_index()
        d = dd(list)
        d['simple'] = ['http://test.net']
        d['text'] = ['http://test.net']
        assert d == ii.index
Example #10
def test_build_index4(mock_file):
    """ Check if we can build index from file with several lines. """
    my_index = InvertedIndex()
    my_index.build(mock_file)

    # Check the index
    assert len(my_index.index_data) == 12
    assert len(my_index.index_data.values()) == 12
Example #11
def test_query_from_loaded2(mock_file):
    """ Check if we can query some non-existed words. """

    my_index = InvertedIndex()
    my_index.load('test.index')

    assert my_index.query(['me', 'test', 'non existed']) == set()
    assert len(mock_file.mock_calls) > 1
Example #12
def test_load_index(mock_file):
    """ Check if we can load index. """

    my_index = InvertedIndex()
    my_index.load('my_Test.index')

    assert my_index.index_data == {'me': [1], 'test': [1, 2], 'you': [2]}
    assert len(mock_file.mock_calls) > 1
Example #13
def main():
    os.chdir(RELATIVE_PATH_TO_CORPUS)

    docs = os.listdir(os.getcwd())
    index = InvertedIndex(docs)
    index.build()

    QueryHandler(index).loop()
Example #14
def test_query(mock_file):
    """ Check if we can query some correct words. """

    my_index = InvertedIndex()
    my_index.build(mock_file)

    assert my_index.query(['test']) == {1}
Example #15
def test_add_new_doc_multi_word():
    doc_id = 23
    word_1 = '  foo   '
    word_2 = ' \t bar\t'
    foo_index = InvertedIndex()
    foo_index.add_new_document(doc_id, word_1 + word_2)
    assert_mes = ("added a new document with two words and mixed separators "
                  "but couldn't find a word in the built index")
    assert doc_id in foo_index.inverted_index[word_1.strip()], assert_mes
    assert doc_id in foo_index.inverted_index[word_2.strip()], assert_mes
Example #16
def test_query_one_doc_in_index():
    index = InvertedIndex()
    index.inverted_index = defaultdict(set, {
        'foo': {1, 2, 3},
        'bar': {1},
        'foobar': {1, 2}
    })
    assert index.query(['foo', 'bar']) == {1}, \
        "didn't find a doc that is present in the index"
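
Several of the tests above pin down the same small API: inverted_index is a defaultdict(set) mapping each word to the ids of the documents containing it, add_new_document splits content on arbitrary whitespace, and query intersects the posting sets. A minimal sketch consistent with those tests, not any one repository's actual implementation:

from collections import defaultdict

class InvertedIndex:
    def __init__(self):
        self.inverted_index = defaultdict(set)

    def add_new_document(self, doc_id, content):
        # str.split() with no arguments collapses runs of spaces and tabs.
        for word in content.split():
            self.inverted_index[word].add(doc_id)

    def query(self, words):
        # A document matches only if it contains every query word,
        # so intersect the posting sets; an unknown word yields set().
        postings = [self.inverted_index.get(word, set()) for word in words]
        return set.intersection(*postings) if postings else set()

Against the index above, query(['foo', 'bar']) returns {1, 2, 3} & {1} == {1}, matching the assertion.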
Example #17
def test_one_article():
    index = InvertedIndex()
    with open(ONE_ARTICLE_PATH, 'r') as fd:
        index.build(fd)
    with open(ONE_ARTICLE_PATH, 'r') as fd:
        article_id, words = fd.readline().split(maxsplit=1)
    words = words.split()
    assert article_id in index.find_articles(words), \
        "didn't find the article when querying all of its words"
Example #18
    def __init__(self, config):
        # An object representing the inverted index:
        # {term: [df, {tweet_id: list of tweet information...}, ...], ...}
        self.inverted_idx = InvertedIndex()
        # Represents the GloVe vector
        self.document_dict = {}
        self.num_of_docs = 0
        self.global_capitals = {}
        self.entities_dict = Counter()
        self.config = config
        self.glove_dict = {}
Example #19
def test_query_from_loaded(mock_file):
    """ Check if we can query after we load the index. """

    my_index = InvertedIndex()
    my_index.load('test.index')

    assert my_index.query(['me', 'test']) == {1}
    assert len(mock_file.mock_calls) > 1
Example #20
    def test_multiple_elements(self):
        c = [('One', 'one Two three'), ('two', 'three'),
             ('three', 'two Three')]
        d = dd(list)
        d['one'] = ['One']
        d['two'] = ['One', 'three']
        d['three'] = ['One', 'two', 'three']
        ii = II(c)
        ii.create_index()
        assert d == ii.index
Example #21
def main():
    # 'exec' is a reserved keyword in Python, so the entry point cannot
    # be named exec(); the query comes from the command line.
    query_ = sys.argv[1:]
    ii = InvertedIndex()
    for file in [f for f in listdir('../data') if isfile(join('../data', f))]:
        document_ = Document('../data/' + file)
        ii.add_document(document_)
    sim_table = sorted(ii.querry(query_))
    for document in sim_table:
        print(document)
Example #22
def test_unicode_query_two_docs_in_index():
    index = InvertedIndex()
    index.inverted_index = defaultdict(set, {
        'один': {1, 2, 3},
        'bar': {1},
        'два': {1, 2}
    })
    assert index.query(['один', 'два']) == {1, 2}, \
        "didn't find two docs that are present in the index with unicode keys"
Example #23
def test_inv_index_query(load_inverted_index, wiki_docs):
    words = ['after', 'were']
    doc_ind = InvertedIndex.query(load_inverted_index, words)
    assert {25, 290}.issubset(doc_ind)
    words = ['neizvesnie', 'slova']
    doc_ind = InvertedIndex.query(load_inverted_index, words)
    assert len(doc_ind) == 0
    words = ['after', 'were']
    index_dict = build_inverted_index(wiki_docs)
    inv_index = InvertedIndex(index_dict)
    doc_ind = inv_index.query(words)
    assert {25, 290}.issubset(doc_ind)
Example #24
def test_build_index3(mock_file):
    """ Check if we can build index from string with special characters. """

    my_index = InvertedIndex()
    my_index.build(mock_file)

    assert my_index.index_data == {
        'test': [1],
        'test,': [1],
        'me': [1],
        'first': [1]
    }
Example #25
def test_check_compression_good():
    json_inverted_index = InvertedIndex.load(
        "inverted_index/inverted_json.index", JsonStoragePolicy())
    compressed_inverted_index = InvertedIndex.load(
        "inverted_index/inverted_json_zip.index", JsonZipStoragePolicy())
    assert json_inverted_index.query(["two", "words"
                                      ]) == compressed_inverted_index.query([
                                          "two", "words"
                                      ]), "compressin give another answer"

    assert json_inverted_index.get_size(
    ) == compressed_inverted_index.get_size(
    ), "compressed file has diff num of records"
Example #26
def test_parse_queries_three_query():
    expected = [['foo', 'bar'], ['one', 'два', '123'], ['один']]
    arguments = Namespace(query=expected,
                          query_file_utf8=None,
                          query_file_cp1251=None)
    result = InvertedIndex().parse_queries(arguments)
    assert result == expected, 'wrong parsing of three queries passed directly'
Example #27
def test_parse_queries_two_query():
    expected = [['foo', 'bar'], ['one', 'два', '123']]
    with open('test_two_queries.cp1251', 'r', encoding='cp1251') as f:
        arguments = Namespace(query=None,
                              query_file_utf8=None,
                              query_file_cp1251=f)
        result = InvertedIndex().parse_queries(arguments)
    assert result == expected, 'wrong parsing query file with two queries'
Example #28
def test_parse_queries_one_query():
    expected = [['one', 'два', '123']]
    with open('test_one_query.utf8', 'r') as f:
        arguments = Namespace(query=None,
                              query_file_utf8=f,
                              query_file_cp1251=None)
        result = InvertedIndex().parse_queries(arguments)
    assert result == expected, 'wrong parsing query file with one query'
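
The three tests above exercise parse_queries with an argparse.Namespace carrying either an in-memory list of queries or an already-opened query file in UTF-8 or CP1251. A sketch that would satisfy them, assuming one whitespace-separated query per line (the real method may differ):

def parse_queries(self, arguments):
    # Queries supplied directly need no parsing.
    if arguments.query is not None:
        return arguments.query
    # Otherwise read one query per line; the file object was already
    # opened with the correct encoding, so decoding is transparent here.
    query_file = arguments.query_file_utf8 or arguments.query_file_cp1251
    return [line.split() for line in query_file]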
Example #29
def search(dictionary_file, postings_file, query_file, output_file):
    try:
        # Remove previous output file
        os.remove(output_file)
    except OSError:
        pass
    inverted_index = InvertedIndex(dictionary_file, postings_file)
    meta_data = get_meta_data()
    tree = ET.parse(query_file)
    root = tree.getroot()
    title_tokens = []
    description_tokens = []

    raw_tokens = []

    for child in root:
        if child.tag == 'title':
            title_tokens = build_tokens(child.text)
            raw_tokens.extend(word_tokenize(child.text))
        elif child.tag == 'description':
            description_tokens = build_tokens(child.text)
            raw_tokens.extend(word_tokenize(child.text))

    raw_tokens = helper.remove_stop_words_without_normalize(
        helper.filter_invalid_characters(raw_tokens))
    additional_tokens = []
    for token in set(raw_tokens):
        additional_tokens.extend(helper.get_similar_words(token))

    title_tokens = helper.remove_stop_words(
        helper.filter_invalid_characters(title_tokens))
    description_tokens = helper.remove_stop_words(
        helper.filter_invalid_characters(description_tokens))

    # Tight results favour high precision; we use them as a proxy for true positives.
    tight_results = execute_query(title_tokens, description_tokens, [],
                                  inverted_index, meta_data)
    global top_UPC_classes
    global top_IPC_classes
    global top_family_members
    top_UPC_classes = get_top_classes(tight_results, meta_data['UPC_class'], 6)
    top_IPC_classes = get_top_classes(tight_results, meta_data['IPC_class'], 4)
    top_family_members = get_top_members(tight_results,
                                         meta_data['family_members'], 30)
    # supplementary_results = expand_query(tight_results, meta_data['doc_top_terms'], inverted_index, meta_data)

    additional_tokens = helper.normalize_tokens(list(set(additional_tokens)))

    results = execute_query(title_tokens, description_tokens,
                            additional_tokens, inverted_index, meta_data)

    k = int(TOP_X_PERCENT_RESULTS * len(results))
    # j = int(TOP_X_PERCENT_RESULTS * len(supplementary_results))
    # results = list(set(results[:k] + supplementary_results[:j]))
    write_to_output(output_file, results[:k])
Example #30
def test_dump_and_load_index(tmp_path, tiny_sample_document):
    # 'dir' would shadow the builtin, so use a more descriptive name
    index_dir = tmp_path / "tiny_example_dir"
    index_dir.mkdir()
    index_file = index_dir / "tiny_example.index"
    docs = tiny_sample_document
    inv_table = build_inverted_index(docs)
    inv_table.dump(index_file)
    assert inv_table == TINY_SAMPLE_INV_TABLE
    loaded_inv_table = InvertedIndex.load(index_file)
    assert inv_table == loaded_inv_table
Example #31
def test_can_dump_and_load_inverted_index(tmpdir, small_dataset_index):
    index_fio = tmpdir.join('inverted.index')
    small_dataset_index.dump(index_fio)
    load_inverted_index = InvertedIndex.load(index_fio)
    assert small_dataset_index == load_inverted_index, (
        "load should return the same inverted index"
    )
    assert {} != load_inverted_index, (
        "load should not return an empty index"
    )
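
Both round-trip tests rely on dump(), a load() classmethod, and value equality between indexes. A self-contained sketch under those assumptions, with JSON as the (assumed) on-disk format:

import json

class InvertedIndex:
    def __init__(self, word_to_docs=None):
        self.word_to_docs = dict(word_to_docs or {})

    def __eq__(self, other):
        # Value equality, so a dumped-then-loaded index compares equal.
        return (isinstance(other, InvertedIndex)
                and self.word_to_docs == other.word_to_docs)

    def dump(self, filepath):
        # Sets are not JSON-serializable, so store sorted lists instead.
        serializable = {w: sorted(ids) for w, ids in self.word_to_docs.items()}
        with open(filepath, "w") as fout:
            json.dump(serializable, fout)

    @classmethod
    def load(cls, filepath):
        with open(filepath) as fin:
            return cls({w: set(ids) for w, ids in json.load(fin).items()})

open() accepts both the pathlib paths of tmp_path and the py.path objects of tmpdir, so the same sketch serves both tests.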
Example #32
def init_inverted_index():
    idx = 1
    doc_list = []
    inverted_index = InvertedIndex()

    # Read serialized documents 1.dbf, 2.dbf, ... until one is missing.
    while True:
        try:
            document = deserialize(str(idx) + ".dbf")
            doc_list.append(document)
            idx += 1
        except IOError:
            break

    total = len(doc_list)
    inverted_index.n = total

    for document in doc_list:
        lower_doc = str(document).lower()
        tokens = nltk.word_tokenize(lower_doc)
        for pos, tk in enumerate(tokens):
            if tk not in inverted_index:
                inverted_index[tk] = list()

            # Add a posting for this document on first sight of the term.
            term_data = inverted_index[tk]
            if document.id not in (p.doc_id for p in term_data):
                term_data.append(Posting(document.id))

            # Record the token position on the matching posting.
            for posting in term_data:
                if posting.doc_id == document.id:
                    posting.positions.append(pos)
                    break
        print("{0:.2f}% completed...".format(float(document.id) / total * 100))

    serialize(inverted_index, "inverted_index.idx")
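
Example #32 builds a positional index: each term maps to a list of Posting objects carrying a document id plus every token position where the term occurs. The Posting class is not shown; a minimal sketch consistent with how the loop uses it:

class Posting:
    def __init__(self, doc_id):
        self.doc_id = doc_id
        self.positions = []  # token offsets of this term within the document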
Example #33
class IndexTrainer(object):

	def __init__(self, centers):
		self.bow = Bow(centers)
		self.index = InvertedIndex()

	def load_feature(self, path='../models/feature.npy'):
		self.features = np.load(path)
		if len(self.features) > 500000:
			self.features = self.features[:500000]
		print "feature shape: ", self.features.shape
		return self.features

	def train(self):
		self.bow.load('../models/bow.pkl')
		self.index.reset(self.bow.centers)
		self.index.append('img1', self.features[:100])
		self.index.append('img2', self.features[100:200])
		self.index.append('img3', self.features[200:300])
		print(self.index)
Example #34
    def __init__(self, n_terms):
        InvertedIndex.__init__(self, n_terms)
Example #35
	def __init__(self, centers):
		self.bow = Bow(centers)
		self.index = InvertedIndex()
Example #36
'''
Created on 2014-11-27
@author: haoyu
'''
from inverted_index import InvertedIndex
from searcher import Searcher
from tfidf import TF_IDF
from bm25 import BM25

if __name__ == '__main__':
    fdir = "data"
    rankerUse = "TF-IDF"
    rankerAvailable = {"TF-IDF": TF_IDF, "BM25": BM25}

    invertedFile = InvertedIndex()
    invertedFile.makeTextIndexFromFloder(fdir)

    ranker = rankerAvailable[rankerUse](invertedFile)
    searcher = Searcher(invertedFile, ranker)
    # print(invertedFile.times)
    # print( invertedFile.contains )
    while True:
        searcher.search()
Example #37
	def __init__(self):
		self.index = InvertedIndex()
		self.bow = Bow()
		self.extractor = Extractor('surf')
		print(self.index.author)
		print(self.index.description)