def __init__(self, dbname):
    """
    Create an instance of SearchEngine.

    The token database is stored in a shelve file named dbname.
    """
    self.database = shelve.open(dbname, writeback=True)
    self.tokenizer = Tokenizer()
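# Minimal usage sketch (assumption: this __init__ belongs to the SearchEngine
# class defined in this module; 'test_db' is a hypothetical database name):
#
#     engine = SearchEngine('test_db')
#     ...  # index files and run queries, see the methods below
#     engine.database.close()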
def find_window(cls, filename, position, size):
    """
    Create an instance of class ContextWindow loaded from a file.

    @param filename: path to the file that contains the word
    @param position: position of the word the context window is built around
    @param size: size of the context window, in words
    """
    t = Tokenizer()
    with open(filename) as f:
        for i, line in enumerate(f):
            if i == position.line:
                break
    if i != position.line:
        raise ValueError('Inappropriate number')
    line = line.strip("\n")
    positions = [position]
    right = line[position.start:]
    left = line[:position.end][::-1]
    # default to the word's own boundaries so the window is still valid
    # when size == 0 or there are no neighbouring tokens
    start = position.start
    end = position.end
    for i, token in enumerate(t.for_index_tokenize(left)):
        if i == size:
            break
        start = position.end - token.position - len(token.text)
    for i, token in enumerate(t.for_index_tokenize(right)):
        if i == size:
            break
        end = position.start + token.position + len(token.text)
    return cls(line, positions, start, end)
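# Usage sketch (assumption: this is a classmethod of ContextWindow and the
# position object exposes .line, .start and .end as used above; the file name,
# position value and window size are hypothetical):
#
#     window = ContextWindow.find_window('text.txt', some_position, 2)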
def search_many_limit_offset(self, query, limit=0, offset=0,
                             limits=[1, 1, 1, 1], offsets=[0, 0, 0, 0]):
    """
    Filter the result of a multiword search with limit and offset
    parameters (task acc0).

    :param query: multiword query
    :param limit: maximum number of documents to return
    :param offset: number of documents to skip
    :param limits: per-document limits on the number of positions returned
    :param offsets: per-document offsets into the position lists
    :return: dictionary mapping files to position lists
    """
    if not isinstance(query, str):
        raise ValueError
    if not isinstance(limit, int):
        raise ValueError
    if not isinstance(offset, int):
        raise ValueError
    for lim in limits:
        if not isinstance(lim, int):
            raise ValueError
    for of in offsets:
        if not isinstance(of, int):
            raise ValueError
    if query == '':
        return {}
    if offset < 0:
        offset = 0
    if limit < 0:
        limit = 0
    tokenizer = Tokenizer()  # using tokenizer for extracting tokens
    words = list(tokenizer.for_index_tokenize(query))
    results = []
    for word in words:
        results.append(self.database[word.text])
    files = sorted(set(results[0]))
    # keep only the documents that fall into the [offset, offset + limit) window
    i = 0
    filtered = set()
    for file in files:
        if (i >= offset) and (i < (offset + limit)):
            filtered.add(file)
        i = i + 1
    files = filtered
    for result in results:
        files &= set(result)  # intersecting sets of documents
    files = sorted(files)
    positions = {}  # creating a dictionary with positions
    i = 0
    for file in files:
        for result in results:
            k = i + offset
            positions.setdefault(file, []).extend(
                result[file][offsets[k]:limits[k] + offsets[k]])
        i = i + 1
    return positions
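# Usage sketch (assumption: 'test_db' was built with indextie() below; the
# query string and parameter values are hypothetical):
#
#     engine = SearchEngine('test_db')
#     hits = engine.search_many_limit_offset('to be or not to be',
#                                            limit=2, offset=0,
#                                            limits=[3, 3], offsets=[0, 0])
#     # hits maps at most 2 files to up to limits[k] positions
#     # per query word in each file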
def indextie(self, filename):
    """
    Index the text stored in a file.

    The method opens the file, tokenizes the text and puts every token
    with its positions into the database.
    """
    if not isinstance(filename, str):
        raise TypeError('Inappropriate type')
    with open(filename) as text:
        tokenizer = Tokenizer()
        for word in tokenizer.for_index_tokenize(text.read()):
            self.database.setdefault(word.text, {}).setdefault(
                filename, []).append(
                    Position(word.position, word.position + len(word.text)))
    self.database.sync()
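# Usage sketch (assumption: 'test_db' and 'text.txt' are hypothetical names;
# Position is the position class used above):
#
#     engine = SearchEngine('test_db')
#     engine.indextie('text.txt')
#     # the shelve database now maps each token to
#     # {'text.txt': [Position(start, end), ...]}
#     engine.database.close()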
def search_many_limit_offset_gen(self, query, limit=0, offset=0,
                                 limits=[1, 1, 1, 1], offsets=[0, 0, 0, 0]):
    """
    Variant of search_many_limit_offset that wraps each file's position
    lists with self.generator() instead of returning plain lists.
    """
    if not isinstance(query, str):
        raise ValueError
    if not isinstance(limit, int):
        raise ValueError
    if not isinstance(offset, int):
        raise ValueError
    for lim in limits:
        if not isinstance(lim, int):
            raise ValueError
    for of in offsets:
        if not isinstance(of, int):
            raise ValueError
    if query == '':
        return {}
    if offset < 0:
        offset = 0
    if limit < 0:
        limit = 0
    tokenizer = Tokenizer()
    searchlist = []
    for token in tokenizer.gen_type_tokenize(query):
        # keep only tokens of type 'a' or 'd'
        if token.typ == 'a' or token.typ == 'd':
            searchlist.append(token.text)
    results = []
    for token in searchlist:
        results.append(set(self.search_one(token)))
    files = results[0]
    for f in results:
        files = files & f  # intersecting sets of documents
    final_dict = {}
    files = sorted(files)
    i = 0
    for f in files:
        if (i >= offset) and (i < (limit + offset)):
            lists = []
            for token in searchlist:
                lists.append(
                    self.database[token][f][offsets[i]:limits[i] + offsets[i]])
            final_dict[f] = self.generator(lists)
        i = i + 1
    return final_dict
def search_many(self, query):
    """
    Search the database for all tokens of a tokenized query string.

    Returns a dictionary that maps each file containing every query
    token to the positions of those tokens in that file.
    """
    if not isinstance(query, str):
        raise ValueError
    if query == '':
        return {}
    tokenizer = Tokenizer()  # using tokenizer for extracting tokens
    words = list(tokenizer.for_index_tokenize(query))
    results = []
    for word in words:
        results.append(self.database[word.text])
    files = set(results[0])
    for result in results:
        files &= set(result)  # intersecting sets of documents
    positions = {}  # creating a dictionary with positions
    for file in files:
        for result in results:
            positions.setdefault(file, []).extend(result[file])
    return positions
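# Usage sketch (assumption: 'test_db' was filled by indextie(); the query
# string is hypothetical):
#
#     engine = SearchEngine('test_db')
#     positions = engine.search_many('sense and sensibility')
#     # positions == {filename: [Position(...), Position(...), ...], ...}
#     engine.database.close()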
def setUp(self):
    self.Tokenizer = Tokenizer()