Example #1
def extract_segments(limit: t.Optional[int] = None, log_level: str = "INFO"):
    """
    Extracts segments from every conference talk and scripture chapter
    in the database's pages collection, including each segment's
    cleaned text, metadata, and references.
    """
    n_written = 0
    logger.setLevel(log_level)

    # First delete all segments in the collection.
    logger.info("deleting all documents in the segments collection...")
    db.segments.delete_many({})

    for page_dict in db.pages.find({}):
        if limit is not None and n_written >= limit:
            break

        page = Page(**page_dict)
        if page.doc_type == "scriptures":
            segmentable = Chapter(page)
        elif page.doc_type == "general-conference":
            segmentable = ConferenceTalk(page)
        else:
            raise ValueError(f"unsupported doc_type '{page.doc_type}'")

        logger.debug(segmentable)

        write_segments(segmentable.to_segments())
        n_written += 1
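
The `write_segments` helper isn't shown above. A minimal sketch of what it might look like, assuming the segments collection lives in the same `db` handle and that `to_segments()` yields objects exposing a `to_dict()` method (both the body and `to_dict` are assumptions, not the project's actual implementation):

def write_segments(segments) -> None:
    """Insert a batch of segment records into the segments collection."""
    # Hypothetical: serialize each segment and write them in one round trip.
    db.segments.insert_many([s.to_dict() for s in segments])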
Example #2
    def __init__(self, page: Page) -> None:
        logger.info(f"processing {page._id}...")
        self.url = page._id
        self.soup = BeautifulSoup(page.html, features="lxml")

        attrs = parse_scripture_chapter_url(self.url)
        self.id: str = attrs["id"]
        self.volume: str = attrs["volume"]
        self.book_id: str = attrs["book_id"]
        self.ch: int = attrs["ch"]
        self.book_name: str = book_map[self.book_id]["names"][0]

        self._set_segments()
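
`parse_scripture_chapter_url` isn't shown. A hypothetical sketch consistent with the keys used above, assuming chapter URLs end in .../scriptures/<volume>/<book_id>/<chapter> (the URL shape and the `id` format are assumptions):

import re

def parse_scripture_chapter_url(url: str) -> dict:
    """Pull the volume, book, and chapter number out of a chapter URL."""
    match = re.search(r"/scriptures/([^/]+)/([^/]+)/(\d+)", url)
    if match is None:
        raise ValueError(f"unparseable chapter URL: {url}")
    volume, book_id, ch = match.groups()
    return {
        "id": f"{book_id}-{ch}",  # Hypothetical id format.
        "volume": volume,
        "book_id": book_id,
        "ch": int(ch),
    }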
Example #3
    def __init__(self, page: Page) -> None:
        logger.info(f"processing {page._id}...")
        self.url = page._id
        self.soup = BeautifulSoup(page.html, features="lxml")

        attrs = parse_conference_talk_url(self.url)
        self.id: str = attrs["id"]
        # The parser returns generic URL slots: for conference talks the
        # "volume" slot holds the year, "work" the month, and
        # "parent_doc" the talk's URL slug.
        self.year: int = attrs["volume"]
        self.month: int = attrs["work"]
        self.url_name: str = attrs["parent_doc"]

        self.name = self.soup.find(**self.NAME_QUERY).string
        self._set_author()
        self._set_segments()
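
`parse_conference_talk_url` follows the same pattern; the generic slot names suggest both parsers share one URL grammar. A hypothetical sketch matching the attribute types above, assuming talk URLs end in .../general-conference/<year>/<month>/<talk-slug>:

import re

def parse_conference_talk_url(url: str) -> dict:
    """Pull the year, month, and talk slug out of a conference talk URL."""
    match = re.search(r"/general-conference/(\d{4})/(\d{2})/([^/?#]+)", url)
    if match is None:
        raise ValueError(f"unparseable talk URL: {url}")
    year, month, slug = match.groups()
    return {
        "id": f"{year}-{month}-{slug}",  # Hypothetical id format.
        "volume": int(year),
        "work": int(month),
        "parent_doc": slug,
    }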
Example #4
    def __init__(
        self, _id: str, doc_type: str, html: t.Optional[str] = None
    ) -> None:
        """
        `doc_type` is the document type, one of `["general-conference", "scriptures"]`.
        """
        self._id = _id
        self.doc_type = doc_type
        if html is None:
            logger.info(f"pulling '{_id}'...")
            res = requests.get(self._id)
            # Raise an exception for 4xx or 5xx HTTP codes.
            res.raise_for_status()
            res.encoding = "utf-8"
            self.html = res.text
        else:
            self.html = html
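
A usage sketch (the URL is illustrative, not necessarily one from the corpus): omitting `html` triggers an HTTP fetch, while passing it skips the network entirely.

# Fetches the page over HTTP because no html is supplied.
page = Page(
    "https://www.churchofjesuschrist.org/study/scriptures/bofm/alma/5",
    doc_type="scriptures",
)

# Reuses previously stored HTML; no network call is made.
cached = Page(page._id, doc_type="scriptures", html=page.html)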
Example #5
def pull_pages(
    overwrite: bool = False, limit: t.Optional[int] = None, log_level: str = "INFO",
):
    """
    Writes the raw HTML content of all conference talks and scriptures to the database.

    Parameters
    ----------
    overwrite
        If `True`, all pages will be removed, and a fresh write will
        take place. If `False`, pages will only be written to the db
        for documents that are not referenced yet in the db.
    limit
        If supplied, the number of documents written to the database will
        not exceed `limit`. Useful for testing and debugging.
    log_level
        The level to set logging to. One of
        `["DEBUG", "INFO", "WARNING", "ERROR", "CRITICAL"]`.
    """
    logger.setLevel(log_level)

    if overwrite:
        # Delete all HTML pages in the collection.
        logger.info("deleting all documents in the pages collection...")
        db.pages.delete_many({})

    # Split the overall limit evenly between the two document types.
    sub_limit = math.ceil(limit / 2) if limit is not None else None
    existing_doc_ids = set(doc["_id"] for doc in db.pages.find({}, {"_id": 1}))

    pull_pages_for_type(
        sub_limit,
        "general-conference",
        get_all_conference_talk_urls(),
        existing_doc_ids,
    )
    pull_pages_for_type(
        sub_limit, "scriptures", get_all_chapter_urls(), existing_doc_ids
    )
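
`pull_pages_for_type` isn't shown; its call sites imply it takes a per-type limit, a doc_type, an iterable of URLs, and the set of ids already in the db. A minimal sketch under those assumptions (the body is a guess, not the actual implementation):

def pull_pages_for_type(limit, doc_type, urls, existing_doc_ids) -> None:
    """Fetch and store pages of one doc_type, skipping ids already in the db."""
    n_written = 0
    for url in urls:
        if limit is not None and n_written >= limit:
            break
        if url in existing_doc_ids:
            # Already in the db; overwrite handling happened upstream.
            continue
        page = Page(url, doc_type)  # Pulls the HTML over HTTP.
        db.pages.insert_one(
            {"_id": page._id, "doc_type": page.doc_type, "html": page.html}
        )
        n_written += 1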
Example #6
def compute_embeddings(overwrite: bool = False):
    """
    Computes an embedding vector for every segment in the database.
    """
    segments_collection = db.segments
    embedder = TextEmbedder()

    logger.info("computing an embedding vector for each segment...")
    # Load them all into memory at once so our MongoDB connection doesn't
    # drop partway through.
    segments = list(segments_collection.find())
    for segment in tqdm(segments):
        if not overwrite and "embedding" in segment:
            # This segment already has an embedding and we're not replacing it.
            continue
        embedding = embedder.embed_text(segment["text"])
        segments_collection.find_one_and_update(
            {"_id": segment["_id"]},
            # The embedding is stored in the database as a raw list,
            # not a numpy array.
            {"$set": {"embedding": embedding.tolist()}},
        )
    logger.info("embedding complete")
Example #7
def get_segments_by_document(*,
                             include_embeddings: bool = False
                             ) -> t.Dict[str, dict]:
    """
    Collects all segments for each conference talk or scripture chapter into a
    single document, returning all documents as items in a dictionary, mapped
    from their document id.
    """
    segments_collection = db.segments
    documents: t.Dict[str, dict] = defaultdict(lambda: {"segments": []})

    logger.info("preprocessing segments into documents...")
    for segment in tqdm(segments_collection.find(),
                        total=segments_collection.count_documents(filter={})):
        document = documents[segment["parent_id"]]
        if include_embeddings:
            # MongoDB stores the embedding as a raw list, not a numpy array.
            try:
                segment["embedding"] = np.fromiter(segment["embedding"],
                                                   np.float64)
            except Exception:
                logger.error(segment)
                raise
        else:
            segment.pop("embedding", None)

        document["segments"].append(segment)

    for doc_id, document in documents.items():
        # Store the document's id in the document itself so the
        # document can be fully self-contained.
        document["_id"] = doc_id
        # Sort the segments of each document to make sure they're in order.
        document["segments"].sort(key=lambda s: s["num"])

    return documents
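
The returned structure, roughly (the document id is a placeholder; embeddings are dropped unless `include_embeddings=True`):

documents = get_segments_by_document()
doc = documents["some-doc-id"]        # Hypothetical document id.
assert doc["_id"] == "some-doc-id"    # Each document carries its own id.
nums = [s["num"] for s in doc["segments"]]
assert nums == sorted(nums)           # Segments come back in order.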
Example #8
def import_docs(overwrite: bool = True, log_level: str = "INFO"):
    """
    Imports all segments from the Mongo DB into
    ElasticSearch. If `overwrite == True`, the ES
    index will first be wiped out before indexing.
    """
    logger.setLevel(log_level)

    logger.info(
        f"importing '{SEGMENTS}' index from MongoDB to Elasticsearch...")

    if overwrite and es_client.indices.exists(index=SEGMENTS):
        logger.info("deleting all documents in the segments index...")
        es_client.indices.delete(index=SEGMENTS)

    documents = get_segments_by_document()
    for doc in documents.values():
        # Add the index name for Elasticsearch.
        doc["_index"] = SEGMENTS

    logger.info("indexing documents...")
    bulk(es_client, tqdm(documents.values()))
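
A quick sanity check after the import; refresh() makes the newly indexed documents visible before counting:

es_client.indices.refresh(index=SEGMENTS)
print(es_client.count(index=SEGMENTS)["count"])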