def search_mult(self, query, limit, offset):
    """Multiword search.

    Tokenizes ``query`` with ``Tokenizer.alph_tokenize`` and looks every
    token up in ``self.db``.  Only files that appear in the index entry of
    every query word are reported; the sorted names of those files are
    sliced with ``[offset:offset + limit]``.

    :param query: input query (also stored on ``self.query``)
    :param limit: maximum number of files to return
    :param offset: zero-based index, within the sorted list of matching
        file names, of the first file to return
    :return: a dictionary with the names of the files that contain all
        words of the query as the keys and, as the values, the result of
        ``our_sort`` applied to the per-word position collections of that
        file.  An empty dictionary is returned when the query yields no
        tokens or when any query word is missing from the database.
    """
    self.query = query
    db = self.db

    # One index entry (file name -> positions) per distinct query word.
    per_word = []
    for token in Tokenizer().alph_tokenize(query):
        entry = db.get(token.tok)
        if entry is None:
            # A word that is not indexed cannot occur in any file, so no
            # file can contain *all* words of the query.
            return {}
        # Skip repeated query words so their positions are not duplicated.
        if entry not in per_word:
            per_word.append(entry)

    if not per_word:
        # set.intersection() requires at least one set; a query without
        # tokens simply matches nothing.
        return {}

    # Files that contain every word of the query.
    common_files = set.intersection(
        *(set(entry.keys()) for entry in per_word)
    )

    output = {}
    for name in sorted(common_files)[offset:offset + limit]:
        # our_sort is defined elsewhere; per the original comment it sorts
        # the positions by line and start index.
        output[name] = our_sort([entry[name] for entry in per_word])
    return output
def search_mult_stem(self, query, limit, offset):
    """Multiword search with stemming.

    Every token of ``query`` is expanded with ``Stemmer_agent.stem``; the
    index entries in ``self.db`` of all stems/lemmas of one query word are
    merged into a single "file name -> positions" mapping.  Only files that
    appear in the merged mapping of every query word are reported; the
    sorted names of those files are sliced with ``[offset:offset + limit]``.

    :param query: input query
    :param limit: maximum number of files to return
    :param offset: zero-based index, within the sorted list of matching
        file names, of the first file to return
    :return: a dictionary with the names of the files that contain
        stems/lemmas of all query words as the keys and, as the values,
        the result of ``our_sort`` applied to the per-word position lists
        of that file.  An empty dictionary is returned when the query
        yields no tokens.
    """
    tokenizer = Tokenizer()
    stemmer = Stemmer_agent()
    db = self.db

    # One merged mapping (file name -> positions of any stem) per query word.
    per_word = []
    for token in tokenizer.alph_tokenize(query):
        merged = {}
        for stem in stemmer.stem(token.tok):
            if stem not in db:
                continue
            entry = db.get(stem)
            for name in entry.keys():
                # extend() fills a fresh list, so the lists stored in the
                # database are never modified.
                merged.setdefault(name, []).extend(entry[name])
        # A word without any indexed stem contributes an empty mapping,
        # which makes the intersection below empty.
        per_word.append(merged)

    if not per_word:
        # set.intersection() requires at least one set; a query without
        # tokens simply matches nothing.
        return {}

    # Files that contain a stem/lemma of every word of the query.
    common_files = set.intersection(
        *(set(merged.keys()) for merged in per_word)
    )

    output = {}
    for name in sorted(common_files)[offset:offset + limit]:
        # our_sort is defined elsewhere; per the original comment it sorts
        # the positions by line and start index.
        # NOTE(review): positions merged from several stems are passed on
        # in concatenation order - confirm our_sort does not require each
        # inner list to be pre-sorted.
        output[name] = our_sort([merged[name] for merged in per_word])
    return output
def test_symbol(self):
    """Check that tokenizing the one-character string 'b' gives one token.

    The tokens produced by ``alph_tokenize('b')`` must equal a list that
    holds only ``Token('b', 0, 0, "alph")``.
    """
    tokens = list(Tokenizer().alph_tokenize('b'))
    expected = [Token('b', 0, 0, "alph")]
    self.assertEqual(tokens, expected)