Example #1
def get_texts(texts):
    # `utils`, `pool`, `processes`, `process_article`, `IGNORED_NAMESPACES` and `logger`
    # are assumed to be defined at module level (as in gensim.corpora.wikicorpus).
    articles, articles_all = 0, 0
    positions, positions_all = 0, 0
    try:
        # process the corpus in smaller chunks of docs, because multiprocessing.Pool
        # is dumb and would load the entire input into RAM at once...
        for group in utils.chunkize(texts, chunksize=10 * processes,
                                    maxsize=1):
            for title, text, pageid in pool.imap(process_article, group):
                articles_all += 1
                positions_all += len(text)  # character count before any pruning
                # article redirects and short stubs are pruned here
                if any(
                        title.startswith(ignore + ':')
                        for ignore in IGNORED_NAMESPACES):
                    continue
                if len(text) < 500:
                    continue
                articles += 1
                positions += len(text)  # character count of the articles actually yielded
                yield text
    except KeyboardInterrupt:
        logger.warning(
            "user terminated iteration over Wikipedia corpus after %i documents with %i positions "
            "(total %i articles, %i positions before pruning articles shorter than %i words)",
            articles, positions, articles_all, positions_all, 50)
    else:
        logger.info(
            "finished iterating over Wikipedia corpus of %i documents with %i positions "
            "(total %i articles, %i positions before pruning articles shorter than %i words)",
            articles, positions, articles_all, positions_all, 50)
    finally:
        pool.terminate()
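
The snippet above leans on several module-level names (`utils`, `pool`, `processes`, `process_article`, `IGNORED_NAMESPACES`, `logger`) that come from its original home in gensim.corpora.wikicorpus. Below is a minimal, self-contained sketch of how those names could be wired up to exercise `get_texts` on an in-memory stand-in for the dump stream; the `process_article` stub, the abbreviated `IGNORED_NAMESPACES` list and the sample `texts` list are illustrative assumptions, not gensim's real worker or dump parser.

import logging
import multiprocessing

from gensim import utils  # provides utils.chunkize, used inside get_texts

logger = logging.getLogger(__name__)

# Abbreviated stand-in for the namespace prefixes that get pruned.
IGNORED_NAMESPACES = ['Wikipedia', 'Category', 'File', 'Portal', 'Template', 'MediaWiki', 'User', 'Help']


def process_article(args):
    """Hypothetical worker: pass a (title, text, pageid) tuple through unchanged."""
    title, text, pageid = args
    return title, text, pageid


if __name__ == '__main__':
    processes = max(1, multiprocessing.cpu_count() - 1)
    pool = multiprocessing.Pool(processes)

    # `texts` would normally stream (title, text, pageid) tuples parsed from a dump;
    # a tiny in-memory list stands in for it here.
    texts = [
        ('Example page', 'word ' * 200, 1),   # long enough to be yielded
        ('Wikipedia:About', 'meta page', 2),  # pruned by the namespace filter
    ]

    for doc in get_texts(texts):
        print(len(doc))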
Example #2
    def get_texts_with_sections(self):
        """Iterate over the dump, returning titles and text versions of all sections of articles.

        Notes
        -----
        Only articles of sufficient length are returned (short articles, redirects,
        etc. are ignored).

        Note that this iterates over the **texts**; if you want vectors, just use
        the standard corpus interface instead of this function:

        .. sourcecode:: pycon

            >>> for vec in wiki_corpus:
            >>>     print(vec)

        Yields
        ------
        (str, list of (str, str), list of (str, str))
            Structure contains (title, [(section_heading, section_content), ...]) and,
            optionally, a trailing [(interlink_article, interlink_text), ...] element.

        """
        skipped_namespace, skipped_length, skipped_redirect = 0, 0, 0
        total_articles, total_sections = 0, 0
        page_xmls = extract_page_xmls(self.fileobj)
        pool = multiprocessing.Pool(self.processes)
        # process the corpus in smaller chunks of docs, because multiprocessing.Pool
        # is dumb and would load the entire input into RAM at once...
        for group in utils.chunkize(page_xmls, chunksize=10 * self.processes, maxsize=1):
            for article in pool.imap(partial(segment, include_interlinks=self.include_interlinks),
                                     group):
                article_title, sections = article[0], article[1]

                # article redirects are pruned here
                if any(article_title.startswith(ignore + ':') for ignore in IGNORED_NAMESPACES):  # filter non-articles
                    skipped_namespace += 1
                    continue
                if not sections or sections[0][1].lstrip().lower().startswith("#redirect"):  # filter redirect
                    skipped_redirect += 1
                    continue
                if sum(len(body.strip()) for (_, body) in sections) < self.min_article_character:
                    # filter stubs (incomplete, very short articles)
                    skipped_length += 1
                    continue
                total_articles += 1
                total_sections += len(sections)

                if self.include_interlinks:
                    interlinks = article[2]
                    yield (article_title, sections, interlinks)
                else:
                    yield (article_title, sections)

        logger.info(
            "finished processing %i articles with %i sections (skipped %i redirects, %i stubs, %i ignored namespaces)",
            total_articles, total_sections, skipped_redirect, skipped_length, skipped_namespace)
        pool.terminate()
        self.length = total_articles  # cache corpus length
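
To make the yield structure concrete, here is a small, self-contained consumer sketch. The `fake_sections_stream` generator is a stand-in (an assumption, not part of gensim) that mimics what `get_texts_with_sections` yields when `include_interlinks` is enabled; the unpacking pattern is the part that carries over to the real generator.

def fake_sections_stream():
    """Stand-in mimicking get_texts_with_sections() with include_interlinks=True."""
    yield (
        'Anarchism',
        [('Introduction', 'Anarchism is a political philosophy ...'),
         ('History', 'The earliest traces of anarchist thought ...')],
        [('Political philosophy', 'political philosophy')],  # (interlink_article, interlink_text)
    )


for title, sections, interlinks in fake_sections_stream():
    print(title)
    for heading, body in sections:
        print('  section %r: %d chars' % (heading, len(body)))
    for target, anchor in interlinks:
        print('  links to %r shown as %r' % (target, anchor))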
Example #3
    def get_texts_with_sections(self):
        """Iterate over the dump, returning titles and text versions of all sections of articles.

        Notes
        -----
        Only articles of sufficient length are returned (short articles, redirects,
        etc. are ignored).

        Note that this iterates over the **texts**; if you want vectors, just use
        the standard corpus interface instead of this function:

        .. sourcecode:: pycon

            >>> for vec in wiki_corpus:
            >>>     print(vec)

        Yields
        ------
        (str, list of (str, str), dict of (str, str))
            Structure contains (title, [(section_heading, section_content), ...]) and,
            optionally, a trailing {interlink_article: interlink_text} dict.

        """
        skipped_namespace, skipped_length, skipped_redirect = 0, 0, 0
        total_articles, total_sections = 0, 0
        page_xmls = extract_page_xmls(self.fileobj)
        pool = multiprocessing.Pool(self.processes)
        # process the corpus in smaller chunks of docs, because multiprocessing.Pool
        # is dumb and would load the entire input into RAM at once...
        for group in utils.chunkize(page_xmls, chunksize=10 * self.processes, maxsize=1):
            for article in pool.imap(partial(segment, include_interlinks=self.include_interlinks),
                                     group):
                article_title, sections = article[0], article[1]

                # article redirects are pruned here
                if any(article_title.startswith(ignore + ':') for ignore in IGNORED_NAMESPACES):  # filter non-articles
                    skipped_namespace += 1
                    continue
                if not sections or sections[0][1].lstrip().lower().startswith("#redirect"):  # filter redirect
                    skipped_redirect += 1
                    continue
                if sum(len(body.strip()) for (_, body) in sections) < self.min_article_character:
                    # filter stubs (incomplete, very short articles)
                    skipped_length += 1
                    continue
                total_articles += 1
                total_sections += len(sections)

                if self.include_interlinks:
                    interlinks = article[2]
                    yield (article_title, sections, interlinks)
                else:
                    yield (article_title, sections)

        logger.info(
            "finished processing %i articles with %i sections (skipped %i redirects, %i stubs, %i ignored namespaces)",
            total_articles, total_sections, skipped_redirect, skipped_length, skipped_namespace)
        pool.terminate()
        self.length = total_articles  # cache corpus length
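
This variant is nearly identical to the previous one; the practical difference is the documented interlink shape (a dict mapping target article to anchor text, rather than a list of pairs). A downstream consumer that wants to cope with either shape could normalize the interlinks first; `iter_interlinks` below is a defensive sketch of mine, not gensim code.

def iter_interlinks(interlinks):
    """Yield (interlink_article, interlink_text) pairs from either a dict or a list of pairs."""
    if isinstance(interlinks, dict):
        return iter(interlinks.items())
    return iter(interlinks)


# Both shapes produce the same pairs:
print(list(iter_interlinks({'Political philosophy': 'political philosophy'})))
print(list(iter_interlinks([('Political philosophy', 'political philosophy')])))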
Example #4
    def get_texts_with_sections(self):
        """Iterate over the dump, returning titles and text versions of all sections of articles.

        Notes
        -----
        Only articles of sufficient length are returned (short articles, redirects,
        etc. are ignored).

        Note that this iterates over the **texts**; if you want vectors, just use
        the standard corpus interface instead of this function::

            >>> for vec in wiki_corpus:
            >>>     print(vec)

        Yields
        ------
        (str, list of (str, str))
            Structure contains (title, [(section_heading, section_content), ...]).

        """
        articles = 0
        page_xmls = extract_page_xmls(self.fileobj)
        pool = multiprocessing.Pool(self.processes)
        # process the corpus in smaller chunks of docs, because multiprocessing.Pool
        # is dumb and would load the entire input into RAM at once...
        for group in utils.chunkize(page_xmls, chunksize=10 * self.processes, maxsize=1):
            for article_title, sections in pool.imap(segment, group):
                # article redirects are pruned here
                if any(article_title.startswith(ignore + ':') for ignore in IGNORED_NAMESPACES):  # filter non-articles
                    continue
                if not sections or sections[0][1].lstrip().lower().startswith("#redirect"):  # filter redirect
                    continue
                if sum(len(body.strip()) for (_, body) in sections) < self.min_article_character:
                    # filter stubs (incomplete, very short articles)
                    continue

                articles += 1
                yield (article_title, sections)
        pool.terminate()
        self.length = articles  # cache corpus length
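
The stub filter in all four variants reduces to one rule: keep an article only when the combined length of its stripped section bodies reaches `min_article_character`. The helper below is a hypothetical standalone restatement of that predicate, assuming the same `(heading, body)` section shape; the threshold of 200 is used purely as an illustrative default.

def is_long_enough(sections, min_article_character=200):
    """Return True when the stripped section bodies together reach the character threshold."""
    return sum(len(body.strip()) for _, body in sections) >= min_article_character


print(is_long_enough([('Introduction', '   too short   ')]))                 # False
print(is_long_enough([('Introduction', 'x' * 150), ('History', 'y' * 80)]))  # True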