예제 #1
0
    def search_mult(self, query, limit, offset):
        """Multiword search.

        Tokenizes ``query`` with ``Tokenizer.alph_tokenize``, looks every
        token up in ``self.db`` and keeps only the files that contain all
        words of the query. The query is also stored in ``self.query``.

        :param query: input query
        :param limit: maximum number of files to return
        :param offset: slice start into the sorted list of matching file
            names (0 selects the first file)
        :return: a dictionary with the names of the files that contain all
            words of the query as the keys and, as the values, the result of
            ``our_sort`` applied to the per-word Position collections of
            that file. An empty dictionary is returned when the query
            yields no tokens or when a query word is not in the database.
        """
        self.query = query
        t = Tokenizer()
        dic = self.db
        res = []  # per-word {file name: positions} dictionaries, no duplicates
        for token in t.alph_tokenize(query):
            postings = dic.get(token.tok)
            if postings is None:
                # A word unknown to the database occurs in no file, so no
                # file can contain all words of the query.
                return {}
            if postings not in res:
                res.append(postings)
        if not res:
            # No tokens at all: set.intersection() needs at least one set.
            return {}
        # names of the files shared by every query word
        common = set.intersection(*(set(postings) for postings in res))
        output = {}
        for name in sorted(common)[offset:offset + limit]:
            # our_sort (defined elsewhere) is expected to order the positions
            # by line and start index.
            output[name] = our_sort([postings[name] for postings in res])
        return output
예제 #2
0
    def search_mult_stem(self, query, limit, offset):
        """Multiword search with stemming.

        Every token of ``query`` is expanded with ``Stemmer_agent.stem``;
        the database entries of all its stems/lemmas found in ``self.db``
        are merged per file. Only files that are matched by every query
        word are reported.

        :param query: input query
        :param limit: maximum number of files to return
        :param offset: slice start into the sorted list of matching file
            names (0 selects the first file)
        :return: a dictionary with the names of the files that contain
            stems/lemmas of all query words as the keys and, as the values,
            the result of ``our_sort`` applied to the per-word Position
            lists of that file. An empty dictionary is returned when the
            query yields no tokens or when some query word has no
            stem/lemma in the database.
        """
        t = Tokenizer()
        stemmer = Stemmer_agent()
        dic = self.db
        res = []  # one {file name: positions} dictionary per query word
        for token in t.alph_tokenize(query):
            stems = {}
            for st in stemmer.stem(token.tok):
                if st not in dic:
                    continue
                # Fetch the entry once per stem instead of once per file.
                postings = dic.get(st)
                for fn, positions in postings.items():
                    stems.setdefault(fn, []).extend(positions)
            res.append(stems)
        if not res:
            # No tokens at all: set.intersection() needs at least one set.
            return {}
        # names of the files shared by every query word
        common = set.intersection(*(set(stems) for stems in res))
        output = {}
        for name in sorted(common)[offset:offset + limit]:
            # our_sort (defined elsewhere) is expected to order the positions
            # by line and start index.
            output[name] = our_sort([stems[name] for stems in res])
        return output
예제 #3
0
 def test_symbol(self):
     # Tokenizing the one-character string 'b' must yield exactly one token
     # equal to Token('b', 0, 0, "alph").
     tokens = list(Tokenizer().alph_tokenize('b'))
     expected = [Token('b', 0, 0, "alph")]
     self.assertEqual(tokens, expected)