Example #1
    def get_documents_from_disk(self, doc_ids):
        """
        Does not make use of the index. We use pyserini's disk traversal methods to retrieve documents. This allows
        us to get away with much smaller indexes on disk, since the index no longer has to store the document contents.
        """
        start = time.time()
        logger.info("Starting to get documents from disk")
        document_type = self.collection.config["documents"]["type"]
        if document_type == "trec":
            ctype = "TrecCollection"
        elif document_type == "trecweb":
            ctype = "TrecwebCollection"
        else:
            # For clueweb12, document_type in yaml is the same as anserini - ClueWeb12Collection
            ctype = document_type

        rootdir = self.collection.config["documents"]["path"]
        p = subprocess.run(
            ["python", get_crawl_collection_script(), rootdir, ctype],
            stdout=subprocess.PIPE,
            input=",".join(doc_ids),
            check=True,
            encoding="utf-8",
        )
        with open(
                "{0}/disk_crawl_temp_dump.json".format(
                    os.getenv("CAPREOLUS_CACHE", get_default_cache_dir())),
                "rt") as fp:
            fetched_docs = json.load(fp)

        return [fetched_docs.get(doc_id, []) for doc_id in doc_ids]
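
The method and the crawl script communicate only through the child's stdin (comma-separated doc ids) and a JSON dump in the cache directory. Below is a minimal, self-contained sketch of that handshake; the stub script, doc ids, and dump path are hypothetical stand-ins for the real crawl collection script and cache location.

import json
import os
import subprocess
import sys
import tempfile

# The parent passes comma-separated doc ids on the child's stdin; the child
# writes a {doc_id: [contents]} JSON dump that the parent reads back.
dump_path = os.path.join(tempfile.gettempdir(), "disk_crawl_temp_dump.json")
stub_path = os.path.join(tempfile.gettempdir(), "crawl_stub.py")
with open(stub_path, "w") as f:
    f.write(
        'import json, sys\n'
        'doc_ids = sys.stdin.read().split(",")\n'
        'json.dump({d: ["text of " + d] for d in doc_ids}, open(sys.argv[1], "w"))\n'
    )

# sys.executable avoids relying on a bare "python" being on PATH
subprocess.run([sys.executable, stub_path, dump_path], input="doc1,doc2", check=True, encoding="utf-8")

with open(dump_path, "rt") as fp:
    fetched_docs = json.load(fp)

print([fetched_docs.get(doc_id, []) for doc_id in ["doc1", "doc2"]])
# -> [['text of doc1'], ['text of doc2']]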
Example #2
def crawl():
    """
    Iterates through every document in a collection, looking for the doc ids passed on stdin as a comma-separated list.
    Spawns multiple processes to do this; a ClueWeb12 crawl completes in approximately 42 hours with 8 processes.
    See `get_documents_from_disk()` in anserini.py for how this script is invoked.
    """
    rootdir = sys.argv[1]
    ctype = sys.argv[2]
    doc_ids = set(input().split(","))
    manager = Manager()
    shared_dict = manager.dict()
    multiprocess_start = time.time()
    logger.debug("Start multiprocess")
    args_list = []
    for subdir in os.listdir(rootdir):
        if os.path.isdir(rootdir + "/" + subdir):
            args_list.append({
                "doc_ids": doc_ids,
                "rootdir": rootdir + "/" + subdir,
                "ctype": ctype,
                "shared_dict": shared_dict
            })

    pool = Pool(processes=8)
    pool.map(spawn_child_process_to_read_docs, args_list)

    logger.debug(
        "Getting all documents from disk took: {0}".format(time.time() -
                                                           multiprocess_start))
    # TODO: This will fail if multiple crawls are running at the same time
    with open(
            "{0}/disk_crawl_temp_dump.json".format(
                os.getenv("CAPREOLUS_CACHE", get_default_cache_dir())),
            "w") as fp:
        json.dump(shared_dict.copy(), fp)
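
crawl() only defines the driver; the worker it maps over receives one of the args dicts built above. The following is a hedged, self-contained sketch of that worker contract: the real spawn_child_process_to_read_docs iterates the collection with pyserini's disk traversal (see get_documents_from_disk() above), while plain file reads stand in here so the example runs on its own.

import os
from multiprocessing import Manager, Pool

def spawn_child_process_to_read_docs(args):
    # Each worker fills the Manager-backed shared_dict with the documents it
    # finds under its own subdirectory of the collection.
    doc_ids, rootdir, shared_dict = args["doc_ids"], args["rootdir"], args["shared_dict"]
    for fname in os.listdir(rootdir):
        path = os.path.join(rootdir, fname)
        if fname in doc_ids and os.path.isfile(path):  # pretend the filename is the doc id
            with open(path) as f:
                shared_dict[fname] = [f.read()]

if __name__ == "__main__":
    manager = Manager()
    shared_dict = manager.dict()
    args_list = [{"doc_ids": {"doc1"}, "rootdir": ".", "ctype": "TrecCollection", "shared_dict": shared_dict}]
    with Pool(processes=2) as pool:
        pool.map(spawn_child_process_to_read_docs, args_list)
    print(shared_dict.copy())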
Example #3
    @staticmethod
    def get_available_indices():
        cache_path = os.environ.get("CAPREOLUS_CACHE", get_default_cache_dir())
        index_dirs = search_files_or_folders_in_directory(cache_path, "index")
        # an index directory is considered usable only if a "done" marker file was written inside it
        index_dirs_with_done = [
            index_dir for index_dir in index_dirs if search_files_or_folders_in_directory(index_dir, "done")
        ]

        return index_dirs_with_done
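
A minimal self-contained sketch of the same "done" marker convention, using os.walk instead of the search_files_or_folders_in_directory helper; the cache path fallback here is hypothetical.

import os

def find_complete_index_dirs(cache_path):
    """Return index directories that contain a 'done' marker file."""
    complete = []
    for dirpath, dirnames, filenames in os.walk(cache_path):
        if os.path.basename(dirpath) == "index" and "done" in filenames:
            complete.append(dirpath)
    return complete

print(find_complete_index_dirs(os.environ.get("CAPREOLUS_CACHE", "/tmp/capreolus_cache")))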
Example #4
    def get_paths(self, config):
        """
        Returns a dictionary of various paths
        :param config: A sacred config
        :return: A dict, e.g.:
        {
            "collection_path": "path",
            "base_path": "path",
            "cache_path": "path",
            "index_path": "path",
            "index_key": "path",
            "run_path": "path",
            "model_path": "path",
            "trained_weight_path": "path"
        }
        """
        expid = config["expid"]
        collection_path = self.module2cls["collection"].basepath
        base_path = os.environ.get("CAPREOLUS_RESULTS",
                                   get_default_results_dir())
        cache_path = os.environ.get("CAPREOLUS_CACHE", get_default_cache_dir())
        index_key = os.path.join(cache_path, config["collection"],
                                 self.module_key("index"))
        index_path = os.path.join(index_key, "index")
        run_path = os.path.join(index_key, "searcher",
                                self.module_key("searcher"))
        model_path = os.path.join(
            base_path,
            expid,
            config["collection"],
            self.module_key("index"),
            self.module_key("searcher"),
            self.module_key("benchmark"),
            self.module_key("pipeline"),
            self.module_key("reranker") + "_" + self.module_key("extractor"),
        )
        trained_weight_path = os.path.join(model_path, config["fold"],
                                           "weights", "dev")

        return {
            "collection_path": collection_path,
            "base_path": base_path,
            "cache_path": cache_path,
            "index_path": index_path,
            "index_key": index_key,
            "run_path": run_path,
            "model_path": model_path,
            "trained_weight_path": trained_weight_path,
        }
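
For illustration, a hedged sketch of how these keys nest on disk; the concrete names below ("robust04" and the stemmer/searcher strings) are hypothetical stand-ins for config["collection"] and whatever module_key() returns for a given config.

import os

cache_path = "/tmp/capreolus_cache"
index_key = os.path.join(cache_path, "robust04", "index_anserini_stemmer-porter")
index_path = os.path.join(index_key, "index")
run_path = os.path.join(index_key, "searcher", "searcher_bm25_k1-0.9_b-0.4")

print(index_path)  # /tmp/capreolus_cache/robust04/index_anserini_stemmer-porter/index
print(run_path)    # .../index_anserini_stemmer-porter/searcher/searcher_bm25_k1-0.9_b-0.4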
Example #5
    def __init__(self, embedding_name):
        """
        If the _is_initialized class property is not set, build the benchmark and model (expensive).
        Else, do nothing.
        """
        self.embedding_name = embedding_name
        self.embedding = Magnitude(
            MagnitudeUtils.download_model(
                self.SUPPORTED_EMBEDDINGS[embedding_name],
                download_dir=os.environ.get("CAPREOLUS_CACHE", get_default_cache_dir()),
            ),
            lazy_loading=-1,
            blocking=True,
        )
        self.stoi = {self.PAD: 0}  # string to integer; associates an integer value with every token
        self.itos = {0: self.PAD}  # integer to string; the reverse mapping
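
A hedged sketch of how the stoi/itos maps above typically grow and how the Magnitude object is queried. "glove.magnitude" is a hypothetical local file (the snippet above downloads the real one via MagnitudeUtils.download_model), and the PAD value is assumed.

from pymagnitude import Magnitude

embedding = Magnitude("glove.magnitude", lazy_loading=-1, blocking=True)

PAD = "<pad>"  # hypothetical value of self.PAD
stoi, itos = {PAD: 0}, {0: PAD}
for tok in ["neural", "retrieval"]:
    if tok not in stoi:
        idx = len(stoi)
        stoi[tok], itos[idx] = idx, tok

vector = embedding.query("retrieval")  # pymagnitude returns a vector even for OOV tokens
print(vector.shape, stoi, itos)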
Example #6
    def create(self):
        self.tokenizer = BertTokenizer.from_pretrained(
            self.tokmodel,
            cache_dir=os.environ.get("CAPREOLUS_CACHE", get_default_cache_dir()),
        )
        self.vocab = self.tokenizer.vocab
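
A hedged usage sketch of what create() sets up, assuming the Hugging Face transformers BertTokenizer and "bert-base-uncased" as a stand-in for self.tokmodel.

from transformers import BertTokenizer

tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")  # stand-in for self.tokmodel
tokens = tokenizer.tokenize("neural ranking models")
token_ids = tokenizer.convert_tokens_to_ids(tokens)
print(tokens, token_ids)
print(len(tokenizer.vocab))  # the same mapping exposed as self.vocab above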