def find_window(cls, filename, position, size):
    """
    This method creates an instance of class ContextWindow by loading it from a file.
    @param filename: path to the file with the word
    @param position: position of the word the context window is built around
    @param size: size of the context window (in tokens on each side)
    """
    t = Tokenizer()
    with open(filename) as f:
        for i, line in enumerate(f):
            if i == position.line:
                break
    if i != position.line:
        raise ValueError('Inappropriate number')
    line = line.strip("\n")
    positions = [position]
    right = line[position.start:]
    left = line[:position.end][::-1]
    # walk up to `size` tokens to the left of the word to find the window start
    for i, token in enumerate(t.for_index_tokenize(left)):
        if i == size:
            break
        start = position.end - token.position - len(token.text)
    # walk up to `size` tokens to the right of the word to find the window end
    for i, token in enumerate(t.for_index_tokenize(right)):
        if i == size:
            break
        end = position.start + token.position + len(token.text)
    return cls(line, positions, start, end)
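# A usage sketch for find_window(), not part of the original code. ContextWindow
# is the class this classmethod constructs; the position object below (with
# line, start and end attributes) and the file name are hypothetical:
#
#     window = ContextWindow.find_window('example.txt', word_position, size=1)
#     # the window spans one token to the left and one token to the right of
#     # the word described by word_position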
def search_many_limit_offset(self, query, limit=0, offset=0,
                             limits=[1, 1, 1, 1], offsets=[0, 0, 0, 0]):
    """
    This method filters the result of a multi-word search with limit and
    offset parameters (task acc0).
    :param query: multi-word query
    :param limit: maximum number of documents to return
    :param offset: number of documents to skip
    :param limits: per-document limits on the number of positions taken per word
    :param offsets: per-document offsets into each word's position list
    :return: dictionary mapping files to lists of positions
    """
    if not isinstance(query, str):
        raise ValueError
    if not isinstance(limit, int):
        raise ValueError
    if not isinstance(offset, int):
        raise ValueError
    for lim in limits:
        if not isinstance(lim, int):
            raise ValueError
    for of in offsets:
        if not isinstance(of, int):
            raise ValueError
    if query == '':
        return {}
    if offset < 0:
        offset = 0
    if limit < 0:
        limit = 0
    tokenizer = Tokenizer()  # using the tokenizer to extract tokens
    words = list(tokenizer.for_index_tokenize(query))
    results = []  # collecting the posting dictionary of every query word
    for word in words:
        results.append(self.database[word.text])
    files = sorted(set(results[0]))
    # keep only the documents that fall into the [offset, offset + limit) slice
    i = 0
    filtered = set()
    for file in files:
        if (i >= offset) and (i < (offset + limit)):
            filtered.add(file)
        i = i + 1
    files = filtered
    for result in results:
        files &= set(result)  # intersecting the sets of documents
    files = sorted(files)
    positions = {}  # building a dictionary of positions per file
    i = 0
    for file in files:
        for result in results:
            k = i + offset
            positions.setdefault(file, []).extend(
                result[file][offsets[k]:limits[k] + offsets[k]])
        i = i + 1
    return positions
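# A usage sketch for search_many_limit_offset(), not part of the original code;
# the database name, file names and query below are hypothetical:
#
#     engine = SearchEngine('db')
#     engine.indextie('first.txt')
#     engine.indextie('second.txt')
#     # skip the first matching document, return at most one document, and for
#     # the k-th returned document (counting from the global offset) take at
#     # most limits[k] positions per word, starting at offsets[k]
#     engine.search_many_limit_offset('test query', limit=1, offset=1,
#                                     limits=[2, 2], offsets=[0, 0])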
def indextie(self, filename):
    """
    This method indexes the text stored in the given file.
    It opens the file, tokenizes the text and puts every token with its
    positions into the database.
    """
    if not isinstance(filename, str):
        raise TypeError('Inappropriate type')
    tokenizer = Tokenizer()
    with open(filename) as text:
        for word in tokenizer.for_index_tokenize(text.read()):
            self.database.setdefault(word.text, {}).setdefault(
                filename, []).append(
                    Position(word.position, word.position + len(word.text)))
    self.database.sync()
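# A minimal usage sketch for indextie(), not part of the original code; the
# database name and file name are hypothetical:
#
#     engine = SearchEngine('db')
#     engine.indextie('example.txt')
#     # engine.database now maps each token to a dictionary of the form
#     # {filename: [Position(start, end), ...]}, e.g. if example.txt starts
#     # with the word 'test': {'example.txt': [Position(0, 4)]}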
def __init__(self, dbname):
    """
    This method creates an instance of class SearchEngine.
    """
    self.database = shelve.open(dbname, writeback=True)
    self.tokenizer = Tokenizer()
def search_many_limit_offset_gen(self, query, limit=0, offset=0,
                                 limits=[1, 1, 1, 1], offsets=[0, 0, 0, 0]):
    """
    This method works like search_many_limit_offset, but for every matching
    file it passes the sliced position lists to self.generator and stores the
    result in the returned dictionary.
    :param query: multi-word query
    :param limit: maximum number of documents to return
    :param offset: number of documents to skip
    :param limits: per-document limits on the number of positions taken per word
    :param offsets: per-document offsets into each word's position list
    :return: dictionary mapping files to generators of positions
    """
    if not isinstance(query, str):
        raise ValueError
    if not isinstance(limit, int):
        raise ValueError
    if not isinstance(offset, int):
        raise ValueError
    for lim in limits:
        if not isinstance(lim, int):
            raise ValueError
    for of in offsets:
        if not isinstance(of, int):
            raise ValueError
    if query == '':
        return {}
    if offset < 0:
        offset = 0
    if limit < 0:
        limit = 0
    tokenizer = Tokenizer()
    searchlist = []
    # keep only alphabetic ('a') and digit ('d') tokens of the query
    for token in tokenizer.gen_type_tokenize(query):
        if token.typ == 'a' or token.typ == 'd':
            searchlist.append(token.text)
    results = []
    for token in searchlist:
        results.append(set(self.search_one(token)))
    files = results[0]
    for f in results:
        files = files & f  # intersecting the sets of documents
    final_dict = {}
    files = sorted(files)
    i = 0
    for f in files:
        if (i >= offset) and (i < (limit + offset)):
            lists = []
            for token in searchlist:
                lists.append(
                    self.database[token][f][offsets[i]:limits[i] + offsets[i]])
            final_dict[f] = self.generator(lists)
        i = i + 1
    return final_dict
def search_many(self, query):
    """
    This method tokenizes the query, looks every token up in the database and
    returns a dictionary that maps each file containing all of the tokens to
    the positions of those tokens in that file.
    """
    if not isinstance(query, str):
        raise ValueError
    if query == '':
        return {}
    tokenizer = Tokenizer()  # using the tokenizer to extract tokens
    words = list(tokenizer.for_index_tokenize(query))
    results = []  # collecting the posting dictionary of every query word
    for word in words:
        results.append(self.database[word.text])
    files = set(results[0])
    for result in results:
        files &= set(result)  # intersecting the sets of documents
    positions = {}  # building a dictionary of positions per file
    for file in files:
        for result in results:
            positions.setdefault(file, []).extend(result[file])
    return positions
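# A sketch of the expected behaviour of search_many(), not part of the original
# code; the database name, file names and query are hypothetical:
#
#     engine = SearchEngine('db')
#     engine.indextie('first.txt')
#     engine.indextie('second.txt')
#     result = engine.search_many('test text')
#     # result maps every file that contains *all* query tokens to the combined
#     # list of Position objects of those tokens in that file, e.g.
#     # {'first.txt': [Position(0, 4), Position(5, 9)]}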
class Test(unittest.TestCase):

    def setUp(self):
        self.Tokenizer = Tokenizer()

    # unittests for method tokenize
    def test_type_output(self):
        result = self.Tokenizer.tokenize('text')
        self.assertIsInstance(result, list)

    def test_type_input_notlist(self):
        with self.assertRaises(ValueError):
            self.Tokenizer.tokenize(['eto', 'ne', 'spisok'])

    def test_type_input_number(self):
        with self.assertRaises(ValueError):
            self.Tokenizer.tokenize(5)

    def test_result_words(self):
        result = self.Tokenizer.tokenize('we ^&* are testing- *&$^ this thing')
        self.assertEqual(len(result), 5)
        self.assertEqual(result[0].text, 'we')
        self.assertEqual(result[0].position, 0)
        self.assertEqual(result[4].text, 'thing')
        self.assertEqual(result[4].position, 30)

    def test_result_characters_beginning(self):
        result = self.Tokenizer.tokenize(
            '$%$we ^&* are testing- *&$^ this thing')
        self.assertEqual(len(result), 5)
        self.assertEqual(result[0].text, 'we')
        self.assertEqual(result[0].position, 3)
        self.assertEqual(result[4].text, 'thing')
        self.assertEqual(result[4].position, 33)

    def test_result_characters_end(self):
        result = self.Tokenizer.tokenize(
            'we ^&* are testing- *&$^ this thing()(')
        self.assertEqual(len(result), 5)
        self.assertEqual(result[0].text, 'we')
        self.assertEqual(result[0].position, 0)
        self.assertEqual(result[4].text, 'thing')
        self.assertEqual(result[4].position, 30)

    def test_result_characters_begin_end(self):
        result = self.Tokenizer.tokenize(
            '720@!we ^&* are testing- *&$^ this thing*%@3')
        self.assertEqual(len(result), 5)
        self.assertEqual(result[0].text, 'we')
        self.assertEqual(result[0].position, 5)
        self.assertEqual(result[4].text, 'thing')
        self.assertEqual(result[4].position, 35)

    # unittests for method gen_tokenize
    def gen_test_type_input_notlist(self):
        with self.assertRaises(ValueError):
            self.Tokenizer.gen_tokenize(['eto', 'ne', 'spisok'])

    def gen_test_type_input_number(self):
        with self.assertRaises(ValueError):
            self.Tokenizer.gen_tokenize(5)

    def gen_test_result_words(self):
        result = list(self.Tokenizer.gen_tokenize(
            'we ^&* are testing- *&$^ this thing'))
        self.assertEqual(len(result), 5)
        self.assertEqual(result[0].text, 'we')
        self.assertEqual(result[0].position, 0)
        self.assertEqual(result[4].text, 'thing')
        self.assertEqual(result[4].position, 30)

    def gen_test_result_characters_beginning(self):
        result = list(self.Tokenizer.gen_tokenize(
            '$%$we ^&* are testing- *&$^ this thing'))
        self.assertEqual(len(result), 5)
        self.assertEqual(result[0].text, 'we')
        self.assertEqual(result[0].position, 3)
        self.assertEqual(result[4].text, 'thing')
        self.assertEqual(result[4].position, 33)

    def gen_test_result_characters_end(self):
        result = list(self.Tokenizer.gen_tokenize(
            'we ^&* are testing- *&$^ this thing()('))
        self.assertEqual(len(result), 5)
        self.assertEqual(result[0].text, 'we')
        self.assertEqual(result[0].position, 0)
        self.assertEqual(result[4].text, 'thing')
        self.assertEqual(result[4].position, 30)

    def gen_test_result_characters_begin_end(self):
        result = list(self.Tokenizer.gen_tokenize(
            '720@!we ^&* are testing- *&$^ this thing*%@3'))
        self.assertEqual(len(result), 5)
        self.assertEqual(result[0].text, 'we')
        self.assertEqual(result[0].position, 5)
        self.assertEqual(result[4].text, 'thing')
        self.assertEqual(result[4].position, 35)

    # unittests for method gen_type_tokenize
    def gen_type_test_list(self):
        with self.assertRaises(ValueError):
            result = self.Tokenizer.gen_type_tokenize(['eto', 'ne', 'spisok'])

    def gen_type_test_input_number(self):
        with self.assertRaises(ValueError):
            result = self.Tokenizer.gen_type_tokenize(5)

    def test_type(self):
        result = self.Tokenizer.gen_type_tokenize('Test - thats right')
        sequence = list(result)
        self.assertEqual(len(sequence), 7)
        self.assertEqual(sequence[0].text, 'Test')
        self.assertEqual(sequence[0].position, 0)
        self.assertEqual(sequence[0].typ, "a")
        self.assertEqual(sequence[1].text, ' ')
        self.assertEqual(sequence[1].position, 4)
        self.assertEqual(sequence[1].typ, "s")
        self.assertEqual(sequence[2].text, '-')
        self.assertEqual(sequence[2].position, 5)
        self.assertEqual(sequence[2].typ, "p")

    def test_type_notlatin(self):
        result = self.Tokenizer.gen_type_tokenize('大好きです。 Мне это нравится')
        sequence = list(result)
        self.assertEqual(len(sequence), 8)
        self.assertEqual(sequence[0].text, '大好きです')
        self.assertEqual(sequence[0].position, 0)
        self.assertEqual(sequence[0].typ, "a")
        self.assertEqual(sequence[1].text, '。')
        self.assertEqual(sequence[1].position, 5)
        self.assertEqual(sequence[1].typ, "p")
        self.assertEqual(sequence[2].text, ' ')
        self.assertEqual(sequence[2].position, 6)
        self.assertEqual(sequence[2].typ, "s")
        self.assertEqual(sequence[3].text, 'Мне')
        self.assertEqual(sequence[3].position, 7)
        self.assertEqual(sequence[3].typ, "a")

    def test_type_other(self):
        result = self.Tokenizer.gen_type_tokenize('... ой6ой + @')
        sequence = list(result)
        self.assertEqual(len(sequence), 9)
        self.assertEqual(sequence[0].text, '...')
        self.assertEqual(sequence[0].position, 0)
        self.assertEqual(sequence[0].typ, "p")
        self.assertEqual(sequence[3].text, '6')
        self.assertEqual(sequence[3].position, 6)
        self.assertEqual(sequence[3].typ, "d")
        self.assertEqual(sequence[6].text, '+')
        self.assertEqual(sequence[6].position, 10)
        self.assertEqual(sequence[6].typ, "o")
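# Assuming the test module does not already define an entry point, the standard
# unittest runner makes the file executable directly:

if __name__ == '__main__':
    unittest.main()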