# search_engine_3.py

import pandas as pd

import utils
from configuration import ConfigClass
from indexer import Indexer
from parser_module import Parse
from run_configs import RunConfigClass
from searcher import Searcher


# DO NOT CHANGE THE CLASS NAME
class SearchEngine:
    """
    Facade that wires a parser, an indexer and a searcher together.

    The engine can build an index from a parquet file, load a previously
    saved index, and answer queries by delegating to its ``searcher``.
    """
    # DO NOT MODIFY THIS SIGNATURE
    # You can change the internal implementation, but you must have a parser and an indexer.
    # Each attribute is listed exactly once; a repeated name only wastes a slot.
    __slots__ = ['_config', '_indexer', '_parser', '_model', 'searcher', '_run_config']

    def __init__(self, config=None, run_config=None):
        """
        Create the engine and its collaborators.
        Input:
            config - configuration object; a default ConfigClass() is created
                     when a falsy value is given.
            run_config - run configuration object; a default RunConfigClass()
                         is created when a falsy value is given.
        """
        if not config:
            config = ConfigClass()
        if not run_config:
            run_config = RunConfigClass()
        self._run_config = run_config
        self._config = config
        self._parser = Parse(run_config)
        self._indexer = Indexer(run_config)
        # No model is loaded at construction time, so the searcher receives None.
        self._model = None
        self.searcher = Searcher(self._parser, self._indexer, run_config, model=self._model)

    # DO NOT MODIFY THIS SIGNATURE
    # You can change the internal implementation as you see fit.
    def build_index_from_parquet(self, fn):
        """
        Reads parquet file and passes it to the parser, then indexer.
        Input:
            fn - path to parquet file
        Output:
            No output, just modifies the internal _indexer object.
        """
        df = pd.read_parquet(fn, engine="pyarrow")
        # Every row of the frame is one document: parse it, then index it.
        for document in df.values:
            parsed_document = self._parser.parse_doc(document)
            self._indexer.add_new_doc(parsed_document)

    # DO NOT MODIFY THIS SIGNATURE
    # You can change the internal implementation as you see fit.
    def load_index(self, fn):
        """
        Loads a pre-computed index (or indices) so we can answer queries.
        Input:
            fn - file name of pickled index. A trailing '.pkl' extension is
                 removed before the name is handed to the indexer; a name
                 without that extension is passed through unchanged.
        """
        suffix = '.pkl'
        # str.strip('.pkl') treats its argument as a *set of characters* and
        # trims them from both ends (e.g. 'pickle_index.pkl' -> 'ickle_index'),
        # so the extension is removed explicitly, and only when it is present.
        base_name = fn[:-len(suffix)] if fn.endswith(suffix) else fn
        self._indexer.load_index(base_name)

    # DO NOT MODIFY THIS SIGNATURE
    # You can change the internal implementation as you see fit.
    def load_precomputed_model(self, model_dir=None):
        """
        Loads a pre-computed model (or models) so we can answer queries.
        This is where you would load models like word2vec, LSI, LDA, etc. and
        assign to self._model, which is passed on to the searcher at query time.

        This engine currently loads nothing: the method is a no-op and
        model_dir is ignored.
        """
        pass

    # DO NOT MODIFY THIS SIGNATURE
    # You can change the internal implementation as you see fit.
    def search(self, query):
        """
        Executes a query over an existing index and returns the number of
        relevant docs and an ordered list of search results.
        Input:
            query - string.
        Output:
            A tuple containing the number of relevant search results, and
            a list of tweet_ids where the first element is the most relevant
            and the last is the least relevant result.
        """
        # NOTE(review): the meaning of the None and {3} arguments is defined by
        # Searcher.search and is not visible here - confirm against searcher.py.
        return self.searcher.search(query, None, {3})