Example #1
import os
import shutil


class Indexer:
    def __init__(self, docs_dir, docs_size):
        self.docLoader = DocLoader(docs_dir, docs_size)
        self.tokenizer = Tokenizer()
        self.stemmer = Stemmer()
        self.dictionary = Dictionary(load=False)
        self._clean()
        self._setup(docs_size)

    def _setup(self, docs_size):
        # Tokenize, normalize, and index every document in the collection.
        for doc_id in range(1, docs_size + 1):
            doc = self.docLoader.getDoc(doc_id)
            tokens = self.tokenizer.tokenizeDoc(doc)
            print("tokens: ")  # debug output
            for token in tokens:
                print(token)
            normalized_words = self.stemmer.normalize_list(tokens)
            print("normalized_words: ")  # debug output
            for token in normalized_words:
                print(token)
            # Register each normalized token in the dictionary's posting lists.
            for token in normalized_words:
                self.dictionary.addToken(token, doc_id)

    @staticmethod
    def _clean():
        # Remove any previously built index before re-indexing.
        # (os.path.dirname("./dist") always resolved to ".", so the old check passed unconditionally.)
        if os.path.exists("./dist"):
            try:
                shutil.rmtree("./dist")
            except OSError as e:
                print(f"Failed to remove ./dist: {e}")
class SearchEngine:
    _dictionary: Dictionary
    _tokenizer: Tokenizer
    _stemmer: Stemmer
    _query_result: QueryResult

    def __init__(self):
        # Load the previously built dictionary from disk.
        self._dictionary = Dictionary(load=True)
        self._tokenizer = Tokenizer()
        self._stemmer = Stemmer()
        self._query_result = QueryResult()
        print(self._dictionary)  # debug: show the loaded dictionary

    def _search_for_token(self, token: Token):
        # Look up the token's posting list and record it only if the token occurs in some document.
        pl = self._dictionary.getPostingList(token.getWord())
        print(pl)  # debug output
        if pl is not None:
            self._query_result.addToResults(token, pl)

    def listen(self):
        # Read a query, run it through the same tokenize/normalize pipeline used
        # for indexing, then rank and print the best matching documents.
        inp = input("Enter Your Query: ")
        # inp = "هفته"
        query_tokens = self._tokenizer.tokenizeDoc(inp)
        normalized_query_tokens = self._stemmer.normalize_list(query_tokens)
        for p in normalized_query_tokens:
            self._search_for_token(p)
        self._query_result.buildCandidates()
        self._query_result.printKBestCandidates()
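

# A minimal end-to-end sketch, assuming the Dictionary built by Indexer is
# persisted under ./dist so that Dictionary(load=True) can read it back;
# the "./docs" path and docs_size of 10 below are hypothetical:
if __name__ == "__main__":
    Indexer(docs_dir="./docs", docs_size=10)  # build and persist the index
    engine = SearchEngine()                   # load the persisted dictionary
    engine.listen()                           # read one query and print the top candidates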