def test_context_window_search_sentence_extension_acc(self):
    """Check sentence-extended context windows from the ``_acc`` search.

    Two files are indexed and the token "only" is searched twice with
    different trailing numeric arguments (presumably a document limit and
    a document offset -- confirm against SearchEngine).  The first call
    is expected to return a window for each file, the second an empty
    dictionary.
    """
    # ``with`` guarantees the fixture files are closed even if a write fails.
    with open("text.txt", 'w') as testfile:
        testfile.write("There are only fluffy kittens! Only kittens")
    self.testindexer.index_with_lines("text.txt")
    with open("text2.txt", 'w') as testfile2:
        testfile2.write("only kittens and puppies.")
    self.testindexer.index_with_lines("text2.txt")
    testsearch = search_engine.SearchEngine('database')

    windowsdict = testsearch.several_tokens_search_with_sentence_context_acc(
        "only", 3, -10)
    expectedwindowresult = {
        "text.txt": [
            search_engine.ContextWindow(
                "There are only fluffy kittens! Only kittens",
                [indexer.Position_with_lines(10, 14, 0)],
                search_engine.WindowPosition(0, 30, 0, "text.txt"))
        ],
        "text2.txt": [
            search_engine.ContextWindow(
                "only kittens and puppies.",
                [indexer.Position_with_lines(0, 4, 0)],
                search_engine.WindowPosition(0, 25, 0, "text2.txt"))
        ]
    }
    self.assertEqual(windowsdict, expectedwindowresult)

    windowsdict = testsearch.several_tokens_search_with_sentence_context_acc(
        "only", 1, 8)
    expectedwindowresult = {}
    self.assertEqual(windowsdict, expectedwindowresult)
def test_search_one_token_one_file(self):
    """Index one file, check the stored index, then search for one token.

    The whole content of the 'database' shelf is compared with the
    expected token -> file -> positions mapping, and ``search_by_token``
    is expected to return the file -> positions mapping for "only".
    """
    # ``with`` guarantees the fixture file is closed even if the write fails.
    with open("text.txt", 'w') as testfile:
        testfile.write("There are only kittens!")
    self.testindexer.index_with_lines("text.txt")
    testsearch = search_engine.SearchEngine('database')
    expectedresult = {
        "There": {
            "text.txt": [indexer.Position_with_lines(0, 5, 0)]
        },
        "are": {
            "text.txt": [indexer.Position_with_lines(6, 9, 0)]
        },
        "only": {
            "text.txt": [indexer.Position_with_lines(10, 14, 0)]
        },
        "kittens": {
            "text.txt": [indexer.Position_with_lines(15, 22, 0)]
        }
    }
    # Copy the shelf into a plain dict and close the handle right away so
    # the test does not leak an open database.
    database = shelve.open('database')
    try:
        resulteddictionary = dict(database)
    finally:
        database.close()
    self.assertEqual(resulteddictionary, expectedresult)

    searchresulteddictionary = testsearch.search_by_token("only")
    expectedsearchresult = {
        "text.txt": [indexer.Position_with_lines(10, 14, 0)]
    }
    self.assertIsInstance(searchresulteddictionary, dict)
    self.assertEqual(searchresulteddictionary, expectedsearchresult)
def test_several_tokens_search_acc(self):
    """Check ``several_tokens_search_acc`` with three argument pairs.

    Both files contain "only kittens".  The two trailing numeric
    arguments are presumably a document limit and a document offset
    (confirm against SearchEngine): (0, 0) is expected to give an empty
    result, (1, 0) only "text.txt" and (2, 1) only "text2.txt".
    """
    # ``with`` guarantees the fixture files are closed even if a write fails.
    with open("text.txt", 'w') as testfile:
        testfile.write("There are only kittens!")
    with open("text2.txt", 'w') as testfile2:
        testfile2.write("only kittens and puppies...")
    self.testindexer.index_with_lines("text2.txt")
    self.testindexer.index_with_lines("text.txt")
    testsearch = search_engine.SearchEngine('database')

    searchresulteddictionary = testsearch.several_tokens_search_acc(
        "only kittens", 0, 0)
    expectedsearchresult = {}
    self.assertEqual(searchresulteddictionary, expectedsearchresult)

    searchresulteddictionary = testsearch.several_tokens_search_acc(
        "only kittens", 1, 0)
    expectedsearchresult = {
        "text.txt": [
            indexer.Position_with_lines(10, 14, 0),
            indexer.Position_with_lines(15, 22, 0)
        ]
    }
    self.assertEqual(searchresulteddictionary, expectedsearchresult)

    searchresulteddictionary = testsearch.several_tokens_search_acc(
        "only kittens", 2, 1)
    expectedsearchresult = {
        "text2.txt": [
            indexer.Position_with_lines(0, 4, 0),
            indexer.Position_with_lines(5, 12, 0)
        ]
    }
    self.assertEqual(searchresulteddictionary, expectedsearchresult)
def test_context_window_search_several_tokens_several_files_3_3(self):
    """Check customizable context windows with a context of 3 and 3.

    Two files are indexed and "only kittens" is searched.  For each file
    a single window holding both token positions is expected.
    """
    # ``with`` guarantees the fixture files are closed even if a write fails.
    with open("text.txt", 'w') as testfile:
        testfile.write("There are only fluffy kittens!")
    with open("text2.txt", 'w') as testfile2:
        testfile2.write("only kittens and puppies...")
    self.testindexer.index_with_lines("text2.txt")
    self.testindexer.index_with_lines("text.txt")
    testsearch = search_engine.SearchEngine('database')

    # context '3,3'
    windowsdict = testsearch.several_tokens_search_with_customizable_context(
        "only kittens", 3, 3)
    expectedwindowresult = {
        "text.txt": [
            search_engine.ContextWindow(
                "There are only fluffy kittens",
                [
                    indexer.Position_with_lines(10, 14, 0),
                    indexer.Position_with_lines(22, 29, 0)
                ],
                search_engine.WindowPosition(0, 29, 0, "text.txt"))
        ],
        "text2.txt": [
            search_engine.ContextWindow(
                "only kittens and puppies",
                [
                    indexer.Position_with_lines(0, 4, 0),
                    indexer.Position_with_lines(5, 12, 0)
                ],
                search_engine.WindowPosition(0, 24, 0, "text2.txt"))
        ]
    }
    self.assertEqual(expectedwindowresult, windowsdict)
def test_position_generator(self):
    """Check that ``position_generator`` merges several lists into one.

    Three inputs are used: two pairs of integer lists and one pair of
    ``Position_with_lines`` lists.  In every case the generator output,
    collected into a list, must equal the expected merged sequence.
    """
    # An empty file is written exactly as before; ``with`` guarantees it is
    # closed even if the write fails.
    with open("text.txt", 'w') as testfile:
        testfile.write("")
    testsearch = search_engine.SearchEngine('database')

    lists1 = [[1, 2, 3, 4, 6], [9, 5, 10, 31]]
    list1result = list(testsearch.position_generator(lists1))
    expectedlist1 = [1, 2, 3, 4, 5, 6, 9, 10, 31]
    self.assertEqual(list1result, expectedlist1)

    lists2 = [[-5, 9, 0, 20], [1, 15]]
    list2result = list(testsearch.position_generator(lists2))
    expectedlist2 = [-5, 0, 1, 9, 15, 20]
    self.assertEqual(list2result, expectedlist2)

    lists3 = [
        [
            indexer.Position_with_lines(6, 9, 0),
            indexer.Position_with_lines(2, 4, 1)
        ],
        [
            indexer.Position_with_lines(0, 2, 1),
            indexer.Position_with_lines(4, 10, 0)
        ]
    ]
    expectedlist3 = [
        indexer.Position_with_lines(4, 10, 0),
        indexer.Position_with_lines(6, 9, 0),
        indexer.Position_with_lines(0, 2, 1),
        indexer.Position_with_lines(2, 4, 1)
    ]
    list3result = list(testsearch.position_generator(lists3))
    self.assertEqual(list3result, expectedlist3)
def test_input_two_same_words(self):
    """Index a file holding the same word twice and check the shelf.

    Both occurrences of "sun" are expected under one token entry, as two
    positions for "text.txt".
    """
    # ``with`` guarantees the fixture file is closed even if the write fails.
    with open("text.txt", 'w') as testfile:
        testfile.write("sun sun")
    expectedresult = {
        "sun": {
            "text.txt": [
                indexer.Position_with_lines(0, 3, 0),
                indexer.Position_with_lines(4, 7, 0)
            ]
        }
    }
    self.testindexer.index_with_lines("text.txt")
    # Copy the shelf into a plain dict and close the handle right away so
    # the test does not leak an open database.
    database = shelve.open('database')
    try:
        resulteddictionary = dict(database)
    finally:
        database.close()
    self.assertEqual(resulteddictionary, expectedresult)
def test_search_one_token_several_files(self):
    """Search for one token that occurs in two indexed files.

    ``search_by_token("only")`` is expected to return a dict with the
    position of "only" for each of the two files.
    """
    # ``with`` guarantees the fixture files are closed even if a write fails.
    with open("text.txt", 'w') as testfile:
        testfile.write("There are only kittens!")
    with open("text2.txt", 'w') as testfile2:
        testfile2.write("only...")
    self.testindexer.index_with_lines("text.txt")
    self.testindexer.index_with_lines("text2.txt")
    testsearch = search_engine.SearchEngine('database')

    searchresulteddictionary = testsearch.search_by_token("only")
    expectedsearchresult = {
        "text.txt": [indexer.Position_with_lines(10, 14, 0)],
        "text2.txt": [indexer.Position_with_lines(0, 4, 0)]
    }
    self.assertIsInstance(searchresulteddictionary, dict)
    self.assertEqual(searchresulteddictionary, expectedsearchresult)
def test_context_window_search_sentence_extension(self):
    """Check sentence-extended context windows for three queries.

    One file with the text "There are only fluffy kittens! Only kittens"
    is indexed; the queries "only", "only fluffy" and "kittens" are each
    compared with the expected window dictionary.
    """
    # ``with`` guarantees the fixture file is closed even if the write fails.
    with open("text.txt", 'w') as testfile:
        testfile.write("There are only fluffy kittens! Only kittens")
    self.testindexer.index_with_lines("text.txt")
    testsearch = search_engine.SearchEngine('database')

    windowsdict = testsearch.several_tokens_search_with_sentence_context(
        "only")
    expectedwindowresult = {
        "text.txt": [
            search_engine.ContextWindow(
                "There are only fluffy kittens! Only kittens",
                [indexer.Position_with_lines(10, 14, 0)],
                search_engine.WindowPosition(0, 30, 0, "text.txt"))
        ]
    }
    self.assertEqual(windowsdict, expectedwindowresult)

    windowsdict = testsearch.several_tokens_search_with_sentence_context(
        "only fluffy")
    expectedwindowresult = {
        "text.txt": [
            search_engine.ContextWindow(
                "There are only fluffy kittens!",
                [
                    indexer.Position_with_lines(10, 14, 0),
                    indexer.Position_with_lines(15, 21, 0)
                ],
                search_engine.WindowPosition(0, 30, 0, "text.txt"))
        ]
    }
    self.assertEqual(windowsdict, expectedwindowresult)

    windowsdict = testsearch.several_tokens_search_with_sentence_context(
        "kittens")
    expectedwindowresult = {
        "text.txt": [
            search_engine.ContextWindow(
                "There are only fluffy kittens! Only kittens",
                [
                    indexer.Position_with_lines(22, 29, 0),
                    indexer.Position_with_lines(36, 43, 0)
                ],
                search_engine.WindowPosition(0, 43, 0, "text.txt"))
        ]
    }
    self.assertEqual(windowsdict, expectedwindowresult)
def test_several_tokens_search_gen(self):
    """Check ``several_tokens_search_gen`` for several queries.

    The empty query, the query "?" and two "kittens" calls with other
    numeric arguments are expected to give an empty dictionary.  For the
    remaining "kittens" calls the per-file values are consumed with
    ``list`` and compared with the expected positions.  The trailing
    numeric arguments are presumably a document limit and a document
    offset -- confirm against SearchEngine.
    """
    # ``with`` guarantees the fixture files are closed even if a write fails.
    with open("text.txt", 'w') as testfile:
        testfile.write("There are only fluffy kittens kittens")
    self.testindexer.index_with_lines("text.txt")
    with open("text2.txt", 'w') as testfile2:
        testfile2.write("only kittens and puppies...")
    self.testindexer.index_with_lines("text2.txt")
    testsearch = search_engine.SearchEngine('database')

    searchresult1 = testsearch.several_tokens_search_gen("", 1, 0)
    self.assertEqual(searchresult1, {})

    searchresult2 = testsearch.several_tokens_search_gen("?", 2, 0)
    self.assertEqual(searchresult2, {})

    searchresult3 = testsearch.several_tokens_search_gen("kittens", 2, 0)
    expectedsearchresult3 = {
        "text.txt": [
            indexer.Position_with_lines(22, 29, 0),
            indexer.Position_with_lines(30, 37, 0)
        ],
        "text2.txt": [indexer.Position_with_lines(5, 12, 0)]
    }
    # Compare the file names first: looping over the result alone would
    # pass vacuously if the search returned an empty or partial dict.
    self.assertEqual(sorted(searchresult3), sorted(expectedsearchresult3))
    for filename in expectedsearchresult3:
        self.assertEqual(list(searchresult3[filename]),
                         expectedsearchresult3[filename])

    searchresult4 = testsearch.several_tokens_search_gen("kittens", 1, 0)
    expectedsearchresult4 = {
        "text.txt": [
            indexer.Position_with_lines(22, 29, 0),
            indexer.Position_with_lines(30, 37, 0)
        ]
    }
    self.assertEqual(sorted(searchresult4), sorted(expectedsearchresult4))
    for filename in expectedsearchresult4:
        self.assertEqual(list(searchresult4[filename]),
                         expectedsearchresult4[filename])

    searchresult5 = testsearch.several_tokens_search_gen("kittens", 1, 3)
    self.assertEqual(searchresult5, {})

    searchresult6 = testsearch.several_tokens_search_gen("kittens", -5, 0)
    self.assertEqual(searchresult6, {})
def get_context_window_one_position_one_file(cls, tokenposition, doc, line,
                                             leftcontext, rightcontext):
    """
    This method can construct a context window of customizable size.
    @param tokenposition: position of the token (its wordbeg, wordend and
    line attributes are read)
    @param doc: name of the document to work with
    @param line: text of the line that contains the token
    @param leftcontext: number of words from the left side of the token
    to be added to the context window
    @param rightcontext: number of words from the right side of the token
    to be added to the context window
    @return mycontextwindow: object of the type ContextWindow,
    window for ONE position in ONE document
    """
    tokenizer = Tokenizator()

    # Left context.  The part of the line up to the end of the token is
    # reversed, so the tokenizer yields the token itself first and then the
    # words to its left, nearest first.
    # Start from the token's own boundary so the value is defined even when
    # the tokenizer yields nothing.
    leftstart = tokenposition.wordbeg
    reversedleft = line[:tokenposition.wordend][::-1]
    lefttokens = list(tokenizer.generate_alpha_and_digits(reversedleft))
    lastleft = len(lefttokens) - 1
    for i, token in enumerate(lefttokens):
        if i == 0:
            # Index 0 is the token itself; with no left context requested
            # the window starts at the token.
            if leftcontext == 0:
                break
            continue
        if i == leftcontext or i == lastleft:
            # token.position is the position of the first token's symbol in
            # the reversed text; the end of the reversed word, counted back
            # from wordend, is the start of the word in the original line.
            leftstart = tokenposition.wordend - (
                token.position + len(token.word))
            break

    # Right context.  The part of the line from the start of the token is
    # tokenized, so the token itself again comes first.
    rightend = tokenposition.wordend
    rightline = line[tokenposition.wordbeg:]
    righttokens = list(tokenizer.generate_alpha_and_digits(rightline))
    lastright = len(righttokens) - 1
    for i, token in enumerate(righttokens):
        if i == 0:
            if rightcontext == 0:
                break
            continue
        if i == rightcontext or i == lastright:
            # Positions are relative to wordbeg because the slice starts
            # there.
            rightend = tokenposition.wordbeg + (
                token.position + len(token.word))
            break

    mycontextwindow = cls(
        line,
        [indexer.Position_with_lines(tokenposition.wordbeg,
                                     tokenposition.wordend,
                                     tokenposition.line)],
        WindowPosition(leftstart, rightend, tokenposition.line, doc))
    return mycontextwindow
def test_several_tokens_one_file(self):
    """Check ``several_tokens_search`` on a single indexed file.

    The query "only kittens" is expected to return both token positions
    for "text.txt"; the query "only kittens and" is expected to return
    an empty dictionary ("and" does not occur in the file).
    """
    # ``with`` guarantees the fixture file is closed even if the write fails.
    with open("text.txt", 'w') as testfile:
        testfile.write("There are only kittens!")
    self.testindexer.index_with_lines("text.txt")
    testsearch = search_engine.SearchEngine('database')

    searchresulteddictionary = testsearch.several_tokens_search(
        "only kittens")
    expectedsearchresult = {
        "text.txt": [
            indexer.Position_with_lines(10, 14, 0),
            indexer.Position_with_lines(15, 22, 0)
        ]
    }
    self.assertEqual(searchresulteddictionary, expectedsearchresult)

    searchresulteddictionary = testsearch.several_tokens_search(
        "only kittens and")
    expectedsearchresult = {}
    self.assertEqual(searchresulteddictionary, expectedsearchresult)
def test_context_window_search_several_tokens_several_files_0_0(self):
    """Check customizable context windows with a context of 0 and 0.

    With no context on either side, the query "only kittens" is expected
    to give two separate windows for "text.txt", each spanning exactly
    one of the found tokens.
    """
    # ``with`` guarantees the fixture file is closed even if the write fails.
    with open("text.txt", 'w') as testfile:
        testfile.write("There are only fluffy kittens!")
    self.testindexer.index_with_lines("text.txt")
    testsearch = search_engine.SearchEngine('database')

    # context '0,0'
    windowsdict = testsearch.several_tokens_search_with_customizable_context(
        "only kittens", 0, 0)
    expectedwindowresult = {
        'text.txt': [
            search_engine.ContextWindow(
                "There are only fluffy kittens!",
                [indexer.Position_with_lines(10, 14, 0)],
                search_engine.WindowPosition(10, 14, 0, "text.txt")),
            search_engine.ContextWindow(
                "There are only fluffy kittens!",
                [indexer.Position_with_lines(22, 29, 0)],
                search_engine.WindowPosition(22, 29, 0, "text.txt"))
        ]
    }
    self.assertEqual(expectedwindowresult, windowsdict)
def test_input_sentence(self):
    """Index a two-line text and check the resulting shelf content.

    The word "sentence" occurs on both lines, so it is expected to have
    two positions with line numbers 0 and 1.
    """
    # ``with`` guarantees the fixture file is closed even if the write fails.
    with open("text.txt", 'w') as testfile:
        testfile.write("This is a sentence \nsentence.")
    expectedresult = {
        "This": {
            "text.txt": [indexer.Position_with_lines(0, 4, 0)]
        },
        "is": {
            "text.txt": [indexer.Position_with_lines(5, 7, 0)]
        },
        "a": {
            "text.txt": [indexer.Position_with_lines(8, 9, 0)]
        },
        "sentence": {
            "text.txt": [
                indexer.Position_with_lines(10, 18, 0),
                indexer.Position_with_lines(0, 8, 1)
            ]
        }
    }
    self.testindexer.index_with_lines("text.txt")
    # Copy the shelf into a plain dict and close the handle right away so
    # the test does not leak an open database.
    database = shelve.open('database')
    try:
        resulteddictionary = dict(database)
    finally:
        database.close()
    self.assertEqual(resulteddictionary, expectedresult)
def test_context_window_context_window_one_position_one_file(self):
    """Check single-position windows and the matching search method.

    ``ContextWindow.get_context_window_one_position_one_file`` is called
    directly for one position in each of two lines with a context of 2
    and 1, and ``several_tokens_search_with_customizable_context`` is
    then expected to return the same windows for the query "only".
    """
    # ``with`` guarantees the fixture files are closed even if a write fails.
    with open("text.txt", 'w') as testfile:
        testfile.write("There are only kittens!")
    with open("text2.txt", 'w') as testfile2:
        testfile2.write("only...")
    self.testindexer.index_with_lines("text.txt")
    self.testindexer.index_with_lines("text2.txt")
    testsearch = search_engine.SearchEngine('database')

    window1 = search_engine.ContextWindow.get_context_window_one_position_one_file(
        indexer.Position_with_lines(0, 4, 0), "text2.txt", "only...", 2, 1)
    window2 = search_engine.ContextWindow.get_context_window_one_position_one_file(
        indexer.Position_with_lines(10, 14, 0), "text.txt",
        "There are only kittens!", 2, 1)
    expectedwindow1 = search_engine.ContextWindow(
        "only",
        [indexer.Position_with_lines(0, 4, 0)],
        search_engine.WindowPosition(0, 4, 0, "text2.txt"))
    expectedwindow2 = search_engine.ContextWindow(
        "There are only kittens",
        [indexer.Position_with_lines(10, 14, 0)],
        search_engine.WindowPosition(0, 22, 0, "text.txt"))
    self.assertEqual(expectedwindow1, window1)
    self.assertEqual(expectedwindow2, window2)

    windowsdict = testsearch.several_tokens_search_with_customizable_context(
        "only", 2, 1)
    expectedwindowresult = {
        "text.txt": [
            search_engine.ContextWindow(
                "There are only kittens",
                [indexer.Position_with_lines(10, 14, 0)],
                search_engine.WindowPosition(0, 22, 0, "text.txt"))
        ],
        "text2.txt": [
            search_engine.ContextWindow(
                "only",
                [indexer.Position_with_lines(0, 4, 0)],
                search_engine.WindowPosition(0, 4, 0, "text2.txt"))
        ]
    }
    self.assertEqual(expectedwindowresult, windowsdict)
def test_not_equal(self):
    """Positions built from different arguments must not compare equal."""
    first = indexer.Position_with_lines(2, 5, 1)
    second = indexer.Position_with_lines(4, 6, 1)
    self.assertNotEqual(first, second)
def test_equal(self):
    """Positions built from the same arguments must compare equal."""
    first = indexer.Position_with_lines(1, 6, 3)
    second = indexer.Position_with_lines(1, 6, 3)
    self.assertEqual(first, second)