Example #1
    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)

        self.tokens = aggregators.len(self.datasources.tokens)
        "`int` : The number of tokens in the revision"
        self.numbers = aggregators.len(self.datasources.numbers)
        "`int` : The number of number tokens in the revision"
        self.whitespaces = aggregators.len(self.datasources.whitespaces)
        "`int` : The number of whitespace tokens in the revision"
        self.markups = aggregators.len(self.datasources.markups)
        "`int` : The number of markup tokens in the revision"
        self.cjks = aggregators.len(self.datasources.cjks)
        "`int` : The number of Chinese/Japanese/Korean tokens in the revision"
        self.entities = aggregators.len(self.datasources.entities)
        "`int` : The number of HTML entity tokens in the revision"
        self.urls = aggregators.len(self.datasources.urls)
        "`int` : The number of URL tokens in the revision"
        self.words = aggregators.len(self.datasources.words)
        "`int` : The number of word tokens in the revision"
        self.uppercase_words = \
            aggregators.len(self.datasources.uppercase_words)
        "`int` : The number of UPPERCASE word tokens in the revision"
        self.punctuations = aggregators.len(self.datasources.punctuations)
        "`int` : The number of punctuation tokens in the revision"
        self.breaks = aggregators.len(self.datasources.breaks)
        "`int` : The number of break tokens in the revision"
        self.longest_token = aggregators.max(
            mappers.map(len, self.datasources.tokens), returns=int)
        "`int` : The longest single token in the revision"
        self.longest_word = aggregators.max(
            mappers.map(len, self.datasources.words), returns=int)
        "`int` : The longest single word-token in the revision"
Example #2
                         cite_templates,
                         name="enwiki.revision.non_cite_templates")

# Links
category_links = wikitext.revision.wikilink_titles_matching(
    r"Category\:", name="enwiki.revision.category_links")
image_links = wikitext.revision.wikilink_titles_matching(
    r"File|Image\:", name="enwiki.revision.image_links")

# References
revision = Revision(
    "enwiki.revision.revision",
    wikitext.revision.datasources,
)
paragraphs = mappers.map(str,
                         revision.paragraphs_sentences_and_whitespace,
                         name="enwiki.revision.paragraphs")
paragraphs_without_refs = filters.regex_matching(
    r"^(?!\s*$)((?!<ref>)(.|\n))*$",
    paragraphs,
    name="enwiki.revision.paragraphs_without_refs")
paragraphs_without_refs_total_length = aggregators.sum(
    mappers.map(len, paragraphs_without_refs),
    name="enwiki.revision.paragraphs_without_refs_total_length")

local_wiki = [
    image_links,
    image_links / max(wikitext.revision.content_chars, 1),
    category_links,
    category_links / max(wikitext.revision.content_chars, 1),
    cite_templates,
Example #3
    def __init__(self, name, revision_datasources):
        super().__init__(name, revision_datasources)

        self.chars = aggregators.len(self.datasources.text,
                                     name=self._name + ".chars")
        "`int` : The number of characters in the text"
        self.numeric_chars = aggregators.sum(
            mappers.map(len, self.datasources.numbers),
            name=self._name + ".numeric_chars",
            returns=int)
        "`int` : The number of numeric characters in the text"
        self.whitespace_chars = aggregators.sum(
            mappers.map(len, self.datasources.whitespaces),
            name=self._name + ".whitespace_chars",
            returns=int)
        "`int` : The number of whitespace characters in the text"
        self.markup_chars = aggregators.sum(mappers.map(
            len, self.datasources.markups),
                                            name=self._name + ".markup_chars",
                                            returns=int)
        "`int` : The number of wikitext markup characters in the text"
        self.cjk_chars = aggregators.sum(mappers.map(len,
                                                     self.datasources.cjks),
                                         name=self._name + ".cjk_chars",
                                         returns=int)
        "`int` : The number of Chinese/Japanese/Korean characters in the text"
        self.entity_chars = aggregators.sum(mappers.map(
            len, self.datasources.entities),
                                            name=self._name + ".entity_chars",
                                            returns=int)
        "`int` : The number of HTML entity characters in the text"
        self.url_chars = aggregators.sum(mappers.map(len,
                                                     self.datasources.urls),
                                         name=self._name + ".url_chars",
                                         returns=int)
        "`int` : The number of URL characters in the text"
        self.word_chars = aggregators.sum(mappers.map(len,
                                                      self.datasources.words),
                                          name=self._name + ".word_chars",
                                          returns=int)
        "`int` : The number of word characters in the text"
        self.uppercase_word_chars = aggregators.sum(
            mappers.map(len, self.datasources.uppercase_words),
            name=self._name + ".uppercase_word_chars",
            returns=int)
        "`int` : The number of UPPERCASE WORD characters in the text"
        self.punctuation_chars = aggregators.sum(
            mappers.map(len, self.datasources.punctuations),
            name=self._name + ".punctuation_chars",
            returns=int)
        "`int` : The number of punctuation characters in the text"
        self.break_chars = aggregators.sum(mappers.map(
            len, self.datasources.breaks),
                                           name=self._name + ".break_chars",
                                           returns=int)
        "`int` : The number of break characters in the text"

        self.longest_repeated_char = \
            Feature(self._name + ".longest_repeated_char",
                    _process_longest_repeated_char,
                    returns=int, depends_on=[self.datasources.text])
        "`int` : The most repeated character"
Example #4
    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)

        self.chars_added = aggregators.sum(mappers.map(
            len, self.datasources.segments_added),
                                           name=self._name + ".chars_added",
                                           returns=int)
        "`int` : The number of characters added"

        self.chars_removed = aggregators.sum(
            mappers.map(len, self.datasources.segments_removed),
            name=self._name + ".chars_removed",
            returns=int)
        "`int` : The number of characters removed"

        self.numeric_chars_added = aggregators.sum(
            mappers.map(len, self.datasources.numbers_added),
            name=self._name + ".numeric_chars_added",
            returns=int)
        "`int` : The number of numeric characters added"

        self.numeric_chars_removed = aggregators.sum(
            mappers.map(len, self.datasources.numbers_removed),
            name=self._name + ".numeric_chars_removed",
            returns=int)
        "`int` : The number of numeric characters removed"

        self.whitespace_chars_added = aggregators.sum(
            mappers.map(len, self.datasources.whitespaces_added),
            name=self._name + ".whitespace_chars_added",
            returns=int)
        "`int` : The number of whitespace characters added"

        self.whitespace_chars_removed = aggregators.sum(
            mappers.map(len, self.datasources.whitespaces_removed),
            name=self._name + ".whitespace_chars_removed",
            returns=int)
        "`int` : The number of whitespace characters removed"

        self.markup_chars_added = aggregators.sum(
            mappers.map(len, self.datasources.markups_added),
            name=self._name + ".markup_chars_added",
            returns=int)
        "`int` : The number of markup characters added"

        self.markup_chars_removed = aggregators.sum(
            mappers.map(len, self.datasources.markups_removed),
            name=self._name + ".markup_chars_removed",
            returns=int)
        "`int` : The number of markup characters removed"

        self.cjk_chars_added = aggregators.sum(
            mappers.map(len, self.datasources.cjks_added),
            name=self._name + ".cjk_chars_added",
            returns=int)
        "`int` : The number of cjk characters added"

        self.cjk_chars_removed = aggregators.sum(
            mappers.map(len, self.datasources.cjks_removed),
            name=self._name + ".cjk_chars_removed",
            returns=int)
        "`int` : The number of cjk characters removed"

        self.entity_chars_added = aggregators.sum(
            mappers.map(len, self.datasources.entities_added),
            name=self._name + ".entity_chars_added",
            returns=int)
        "`int` : The number of entity characters added"

        self.entity_chars_removed = aggregators.sum(
            mappers.map(len, self.datasources.entities_removed),
            name=self._name + ".entity_chars_removed",
            returns=int)
        "`int` : The number of entity characters removed"

        self.url_chars_added = aggregators.sum(
            mappers.map(len, self.datasources.urls_added),
            name=self._name + ".url_chars_added",
            returns=int)
        "`int` : The number of url characters added"

        self.url_chars_removed = aggregators.sum(
            mappers.map(len, self.datasources.urls_removed),
            name=self._name + ".url_chars_removed",
            returns=int)
        "`int` : The number of url characters removed"

        self.word_chars_added = aggregators.sum(
            mappers.map(len, self.datasources.words_added),
            name=self._name + ".word_chars_added",
            returns=int)
        "`int` : The number of word characters added"

        self.word_chars_removed = aggregators.sum(
            mappers.map(len, self.datasources.words_removed),
            name=self._name + ".word_chars_removed",
            returns=int)
        "`int` : The number of word characters removed"

        self.uppercase_word_chars_added = aggregators.sum(
            mappers.map(len, self.datasources.uppercase_words_added),
            name=self._name + ".uppercase_word_chars_added",
            returns=int)
        "`int` : The number of UPPERCASE word characters added"

        self.uppercase_word_chars_removed = aggregators.sum(
            mappers.map(len, self.datasources.uppercase_words_removed),
            name=self._name + ".uppercase_word_chars_removed",
            returns=int)
        "`int` : The number of UPPERCASE word characters removed"

        self.punctuation_chars_added = aggregators.sum(
            mappers.map(len, self.datasources.punctuations_added),
            name=self._name + ".punctuation_chars_added",
            returns=int)
        "`int` : The number of punctuation characters added"

        self.punctuation_chars_removed = aggregators.sum(
            mappers.map(len, self.datasources.punctuations_removed),
            name=self._name + ".punctuation_chars_removed",
            returns=int)
        "`int` : The number of punctuation characters removed"

        self.break_chars_added = aggregators.sum(
            mappers.map(len, self.datasources.breaks_added),
            name=self._name + ".break_chars_added",
            returns=int)
        "`int` : The number of break characters added"

        self.break_chars_removed = aggregators.sum(
            mappers.map(len, self.datasources.breaks_removed),
            name=self._name + ".break_chars_removed",
            returns=int)
        "`int` : The number of break characters removed"

        self.longest_repeated_char_added = \
            Feature(self._name + ".longest_repeated_char_added",
                    _process_longest_repeated_char_added,
                    returns=int, depends_on=[self.datasources.segments_added])
        "`int` : The most repeated character added"
Example #5
    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)

        self.segments_added = aggregators.len(
            self.datasources.segments_added,
            name=self._name + ".segments_added"
        )
        "`int` : The number of segments added "

        self.segments_removed = aggregators.len(
            self.datasources.segments_removed,
            name=self._name + ".segments_removed"
        )
        "`int` : The number of segments removed "

        self.tokens_added = aggregators.len(
            self.datasources.tokens_added,
            name=self._name + ".tokens_added"
        )
        "`int` : The number of tokens added "

        self.tokens_removed = aggregators.len(
            self.datasources.tokens_removed,
            name=self._name + ".tokens_removed"
        )
        "`int` : The number of tokens removed "

        self.numbers_added = aggregators.len(
            self.datasources.numbers_added,
            name=self._name + ".numbers_added"
        )
        "`int` : The number of number tokens added "

        self.numbers_removed = aggregators.len(
            self.datasources.numbers_removed,
            name=self._name + ".numbers_removed"
        )
        "`int` : The number of number tokens removed "

        self.markups_added = aggregators.len(
            self.datasources.markups_added,
            name=self._name + ".markups_added"
        )
        "`int` : The number of markup tokens added "

        self.markups_removed = aggregators.len(
            self.datasources.markups_removed,
            name=self._name + ".markups_removed"
        )
        "`int` : The number of markup tokens removed "

        self.whitespaces_added = aggregators.len(
            self.datasources.whitespaces_added,
            name=self._name + ".whitespaces_added"
        )
        "`int` : The number of whitespace tokens added "

        self.whitespaces_removed = aggregators.len(
            self.datasources.whitespaces_removed,
            name=self._name + ".whitespaces_removed"
        )
        "`int` : The number of whitespace tokens removed "

        self.cjks_added = aggregators.len(
            self.datasources.cjks_added,
            name=self._name + ".cjks_added"
        )
        "`int` : The number of cjk tokens added "

        self.cjks_removed = aggregators.len(
            self.datasources.cjks_removed,
            name=self._name + ".cjks_removed"
        )
        "`int` : The number of cjk tokens removed "

        self.entities_added = aggregators.len(
            self.datasources.entities_added,
            name=self._name + ".entities_added"
        )
        "`int` : The number of entity tokens added "

        self.entities_removed = aggregators.len(
            self.datasources.entities_removed,
            name=self._name + ".entities_removed"
        )
        "`int` : The number of entity tokens removed "

        self.urls_added = aggregators.len(
            self.datasources.urls_added,
            name=self._name + ".urls_added"
        )
        "`int` : The number of url tokens added "

        self.urls_removed = aggregators.len(
            self.datasources.urls_removed,
            name=self._name + ".urls_removed"
        )
        "`int` : The number of url tokens removed "

        self.words_added = aggregators.len(
            self.datasources.words_added,
            name=self._name + ".words_added"
        )
        "`int` : The number of word tokens added "

        self.words_removed = aggregators.len(
            self.datasources.words_removed,
            name=self._name + ".words_removed"
        )
        "`int` : The number of word tokens removed "

        self.uppercase_words_added = aggregators.len(
            self.datasources.uppercase_words_added,
            name=self._name + ".uppercase_words_added"
        )
        "`int` : The number of word tokens added "

        self.uppercase_words_removed = aggregators.len(
            self.datasources.uppercase_words_removed,
            name=self._name + ".uppercase_words_removed"
        )
        "`int` : The number of word tokens removed "

        self.punctuations_added = aggregators.len(
            self.datasources.punctuations_added,
            name=self._name + ".punctuations_added"
        )
        "`int` : The number of punctuation tokens added "

        self.punctuations_removed = aggregators.len(
            self.datasources.punctuations_removed,
            name=self._name + ".punctuations_removed"
        )
        "`int` : The number of punctuation tokens removed "

        self.breaks_added = aggregators.len(
            self.datasources.breaks_added,
            name=self._name + ".breaks_added"
        )
        "`int` : The number of break tokens added "

        self.breaks_removed = aggregators.len(
            self.datasources.breaks_removed,
            name=self._name + ".breaks_removed"
        )
        "`int` : The number of break tokens removed"

        self.longest_token_added = aggregators.max(
            mappers.map(len, self.datasources.tokens_added),
            name=self._name + '.longest_token_added'
        )
        "`int` : The length of the longest token added"

        self.longest_uppercase_word_added = aggregators.max(
            mappers.map(len, self.datasources.uppercase_words_added),
            name=self._name + '.longest_uppercase_word_added'
        )
        "`int` : The length of the longest UPPERCASE word added"
Example #6
import pickle

from revscoring.datasources.datasource import Datasource
from revscoring.datasources.meta import mappers
from revscoring.dependencies import solve

tokens = Datasource("tokens")
my_ints = Datasource("my_ints")


def extract_first_char(token):
    return token[:1]


first_char = mappers.map(extract_first_char, tokens, name="first_char")

lower_case_tokens = mappers.lower_case(tokens, name="lower_case_tokens")

derepeat_tokens = mappers.derepeat(tokens, name="derepeat_tokens")

de1337_tokens = mappers.de1337(tokens, name="de1337_tokens")

abs_ints = mappers.abs(my_ints)


def test_item_mapper():
    cache = {tokens: ["alpha", "bravo", "charlie", "delta"]}
    assert (solve(first_char, cache=cache) ==
            ["a", "b", "c", "d"])

    assert pickle.loads(pickle.dumps(first_char)) == first_char
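
The module also defines lower_case_tokens and abs_ints but only exercises first_char; companion tests in the same style, assuming mappers.lower_case applies str.lower and mappers.abs applies abs() to each item:

def test_lower_case_mapper():
    cache = {tokens: ["Alpha", "BRAVO", "charlie"]}
    assert (solve(lower_case_tokens, cache=cache) ==
            ["alpha", "bravo", "charlie"])


def test_abs_mapper():
    cache = {my_ints: [-3, 0, 7]}
    assert solve(abs_ints, cache=cache) == [3, 0, 7]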
Example #7
                              returns=int)

image_tags_str = wikitext.revision.datasources.tags_str_matching(
    r"<(gallery|imagemap)", name="ukwiki.revision.image_tags_str")

images_in_tags = Feature("ukwiki.revision.images_in_tags",
                         get_images,
                         depends_on=[image_tags_str],
                         returns=int)
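
# The get_images helper is referenced above but falls outside this
# fragment (in the full module it would be defined before its use).
# A plausible sketch, assuming it counts lines inside the matched
# <gallery>/<imagemap> tag strings that name an image file; the
# extension list is illustrative:
import re

_IMAGE_FILE_RE = re.compile(r"\.(jpe?g|png|gif|svg|tiff?)\b", re.I)


def get_images(tag_strs):
    # Count gallery/imagemap lines that reference an image file.
    return sum(1
               for tag_str in tag_strs
               for line in tag_str.split("\n")
               if _IMAGE_FILE_RE.search(line))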

all_images = (image_links + image_templates +
              images_in_templates + images_in_tags)

# References
paragraphs = mappers.map(
    str, wikitext.revision.datasources.paragraphs_sentences_and_whitespace,
    name="ukwiki.revision.paragraphs"
)
paragraphs_without_refs = filters.regex_matching(
    r"^(?!\s*$)((?!<ref>)(.|\n))*$",
    paragraphs,
    name="ukwiki.revision.paragraphs_without_refs"
)
paragraphs_without_refs_total_length = aggregators.sum(
    mappers.map(len, paragraphs_without_refs),
    name="ukwiki.revision.paragraphs_without_refs_total_length"
)

local_wiki = [
    all_images,
    all_images / max(wikitext.revision.content_chars, 1),
    category_links,
Example #8
    r"sen[ _]referencias|cómpre[ _]páxina|" +
    r"verificar[ _]credibilidade", name="glwiki.revision.cn_templates")

# Links
category_links = wikitext.revision.wikilink_titles_matching(
    r"(Categoría|Category)\:", name="glwiki.revision.category_links")
image_links = wikitext.revision.wikilink_titles_matching(
    r"(File|Image|Ficheiro)\:", name="glwiki.revision.image_links")

# References
revision = Revision(
    "glwiki.revision.revision",
    wikitext.revision.datasources,
)
paragraphs = mappers.map(
    str, revision.paragraphs_sentences_and_whitespace,
    name="glwiki.revision.paragraphs"
)
paragraphs_without_refs = filters.regex_matching(
    r"^(?!\s*$)((?!<ref>)(.|\n))*$",
    paragraphs,
    name="glwiki.revision.paragraphs_without_refs"
)
paragraphs_without_refs_total_length = aggregators.sum(
    mappers.map(len, paragraphs_without_refs),
    name="glwiki.revision.paragraphs_without_refs_total_length"
)

local_wiki = [
    image_links,
    image_links / max(wikitext.revision.content_chars, 1),
    category_links,
Example #9
    def __init__(self, name, revision_datasources):
        super().__init__(name, revision_datasources)

        self.wikicode = Datasource(self._name + ".wikicode",
                                   _process_wikicode,
                                   depends_on=[revision_datasources.text])
        """
        A :class:`mwparserfromhell.wikicode.Wikicode` abstract syntax
        tree representing the structure of the page.
        """

        self.node_class_map = Datasource(self._name + ".node_class_map",
                                         _process_node_class_map,
                                         depends_on=[self.wikicode])
        """
        A map of mwparserfromhell.wikicode.<class> to lists of nodes of
        that type.
        """

        self.content = execute_method("strip_code",
                                      self.wikicode,
                                      name=self._name + ".content")
        """
        The viewable content (no markup or templates) of the revision.
        """

        self.headings = get_key(mwparserfromhell.nodes.Heading,
                                self.node_class_map,
                                default=[],
                                name=self._name + ".headings")
        """
        A list of :class:`mwparserfromhell.nodes.heading.Heading`'s
        """

        self.heading_titles = mappers.map(_extract_heading_title,
                                          self.headings,
                                          name=self._name + ".heading_titles")
        """
        A list of heading titles
        """

        self.external_links = get_key(mwparserfromhell.nodes.ExternalLink,
                                      self.node_class_map,
                                      default=[],
                                      name=self._name + ".external_links")
        """
        A list of :class:`mwparserfromhell.nodes.external_link.ExternalLink`'s
        """

        self.external_link_urls = mappers.map(_extract_external_link_url,
                                              self.external_links,
                                              name=self._name +
                                              ".external_link_url")
        """
        A list of external link urls
        """

        self.wikilinks = get_key(mwparserfromhell.nodes.Wikilink,
                                 self.node_class_map,
                                 default=[],
                                 name=self._name + ".wikilinks")
        """
        A list of :class:`mwparserfromhell.nodes.wikilink.Wikilink`'s
        """

        self.wikilink_titles = mappers.map(_extract_wikilink_title,
                                           self.wikilinks,
                                           name=self._name +
                                           ".wikilink_titles")
        """
        Returns a list of string titles of internal links (aka "targets")
        """

        self.tags = get_key(mwparserfromhell.nodes.Tag,
                            self.node_class_map,
                            default=[],
                            name=self._name + ".tags")
        """
        A list of :class:`mwparserfromhell.nodes.tag.Tag`'s
        """

        self.tag_names = mappers.map(_extract_tag_name,
                                     self.tags,
                                     name=self._name + ".tag_names")
        """
        Returns a list of HTML tag names present in the content of the revision
        """

        self.tags_str = mappers.map(str,
                                    self.tags,
                                    name=self._name + ".tags_str")
        """
        Returns a list of tags present in the content of the revision as strings
        """

        self.templates = get_key(mwparserfromhell.nodes.Template,
                                 self.node_class_map,
                                 default=[],
                                 name=self._name + ".templates")
        """
        A list of :class:`mwparserfromhell.nodes.template.Template`'s
        """

        self.template_names = mappers.map(_extract_template_name,
                                          self.templates,
                                          name=self._name + ".template_names")
        """
        Returns a list of template names present in the content of the revision
        """

        self.templates_str = mappers.map(str,
                                         self.templates,
                                         name=self._name + ".templates_str")
        """
        Returns a list of templates present in the content of the revision as strings
        """
        self.sections = Datasource(self._name + ".section",
                                   _extract_sections,
                                   depends_on=[self.wikicode])
        """
Example #10
    def __init__(self, name, revision_datasources):
        super().__init__(name, revision_datasources)

        self.wikicode = Datasource(
            self._name + ".wikicode",
            _process_wikicode, depends_on=[revision_datasources.text]
        )
        """
        A :class:`mwparserfromhell.wikicode.Wikicode` abstract syntax
        tree representing the structure of the page.
        """

        self.node_class_map = Datasource(
            self._name + ".node_class_map",
            _process_node_class_map, depends_on=[self.wikicode]
        )
        """
        A map of mwparserfromhell.wikicode.<class> to lists of nodes of
        that type.
        """

        self.content = execute_method(
            "strip_code", self.wikicode,
            name=self._name + ".content"
        )
        """
        The viewable content (no markup or templates) of the revision.
        """

        self.headings = get_key(
            mwparserfromhell.nodes.Heading, self.node_class_map,
            default=[],
            name=self._name + ".headings"
        )
        """
        A list of :class:`mwparserfromhell.nodes.heading.Heading`'s
        """

        self.heading_titles = mappers.map(
            _extract_heading_title, self.headings,
            name=self._name + ".heading_titles"
        )
        """
        A list of heading titles
        """

        self.external_links = get_key(
            mwparserfromhell.nodes.ExternalLink, self.node_class_map,
            default=[],
            name=self._name + ".external_links"
        )
        """
        A list of :class:`mwparserfromhell.nodes.external_link.ExternalLink`'s
        """

        self.external_link_urls = mappers.map(
            _extract_external_link_url, self.external_links,
            name=self._name + ".external_link_url"
        )
        """
        A list of external link urls
        """

        self.wikilinks = get_key(
            mwparserfromhell.nodes.Wikilink, self.node_class_map,
            default=[],
            name=self._name + ".wikilinks"
        )
        """
        A list of :class:`mwparserfromhell.nodes.wikilink.Wikilink`'s
        """

        self.wikilink_titles = mappers.map(
            _extract_wikilink_title, self.wikilinks,
            name=self._name + ".wikilink_titles"
        )
        """
        Returns a list of string titles of internal links (aka "targets")
        """

        self.tags = get_key(
            mwparserfromhell.nodes.Tag, self.node_class_map,
            default=[],
            name=self._name + ".tags"
        )
        """
        A list of :class:`mwparserfromhell.nodes.tag.Tag`'s
        """

        self.tag_names = mappers.map(
            _extract_tag_name, self.tags,
            name=self._name + ".tag_names"
        )
        """
        Returns a list of HTML tag names present in the content of the revision
        """

        self.templates = get_key(
            mwparserfromhell.nodes.Template, self.node_class_map,
            default=[],
            name=self._name + ".templates"
        )
        """
        A list of :class:`mwparserfromhell.nodes.template.Template`'s
        """

        self.template_names = mappers.map(
            _extract_template_name, self.templates,
            name=self._name + ".template_names"
        )
        """
Example #11
    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)

        self.segments_added = aggregators.len(self.datasources.segments_added,
                                              name=self._name +
                                              ".segments_added")
        "`int` : The number of segments added "

        self.segments_removed = aggregators.len(
            self.datasources.segments_removed,
            name=self._name + ".segments_removed")
        "`int` : The number of segments removed "

        self.tokens_added = aggregators.len(self.datasources.tokens_added,
                                            name=self._name + ".tokens_added")
        "`int` : The number of tokens added "

        self.tokens_removed = aggregators.len(self.datasources.tokens_removed,
                                              name=self._name +
                                              ".tokens_removed")
        "`int` : The number of tokens removed "

        self.numbers_added = aggregators.len(self.datasources.numbers_added,
                                             name=self._name +
                                             ".numbers_added")
        "`int` : The number of number tokens added "

        self.numbers_removed = aggregators.len(
            self.datasources.numbers_removed,
            name=self._name + ".numbers_removed")
        "`int` : The number of number tokens removed "

        self.markups_added = aggregators.len(self.datasources.markups_added,
                                             name=self._name +
                                             ".markups_added")
        "`int` : The number of markup tokens added "

        self.markups_removed = aggregators.len(
            self.datasources.markups_removed,
            name=self._name + ".markups_removed")
        "`int` : The number of markup tokens removed "

        self.whitespaces_added = aggregators.len(
            self.datasources.whitespaces_added,
            name=self._name + ".whitespaces_added")
        "`int` : The number of whitespace tokens added "

        self.whitespaces_removed = aggregators.len(
            self.datasources.whitespaces_removed,
            name=self._name + ".whitespaces_removed")
        "`int` : The number of whitespace tokens removed "

        self.cjks_added = aggregators.len(self.datasources.cjks_added,
                                          name=self._name + ".cjks_added")
        "`int` : The number of cjk tokens added "

        self.cjks_removed = aggregators.len(self.datasources.cjks_removed,
                                            name=self._name + ".cjks_removed")
        "`int` : The number of cjk tokens removed "

        self.entities_added = aggregators.len(self.datasources.entities_added,
                                              name=self._name +
                                              ".entities_added")
        "`int` : The number of entity tokens added "

        self.entities_removed = aggregators.len(
            self.datasources.entities_removed,
            name=self._name + ".entities_removed")
        "`int` : The number of entity tokens removed "

        self.urls_added = aggregators.len(self.datasources.urls_added,
                                          name=self._name + ".urls_added")
        "`int` : The number of url tokens added "

        self.urls_removed = aggregators.len(self.datasources.urls_removed,
                                            name=self._name + ".urls_removed")
        "`int` : The number of url tokens removed "

        self.words_added = aggregators.len(self.datasources.words_added,
                                           name=self._name + ".words_added")
        "`int` : The number of word tokens added "

        self.words_removed = aggregators.len(self.datasources.words_removed,
                                             name=self._name +
                                             ".words_removed")
        "`int` : The number of word tokens removed "

        self.uppercase_words_added = aggregators.len(
            self.datasources.uppercase_words_added,
            name=self._name + ".uppercase_words_added")
        "`int` : The number of word tokens added "

        self.uppercase_words_removed = aggregators.len(
            self.datasources.uppercase_words_removed,
            name=self._name + ".uppercase_words_removed")
        "`int` : The number of word tokens removed "

        self.punctuations_added = aggregators.len(
            self.datasources.punctuations_added,
            name=self._name + ".punctuations_added")
        "`int` : The number of punctuation tokens added "

        self.punctuations_removed = aggregators.len(
            self.datasources.punctuations_removed,
            name=self._name + ".punctuations_removed")
        "`int` : The number of punctuation tokens removed "

        self.breaks_added = aggregators.len(self.datasources.breaks_added,
                                            name=self._name + ".breaks_added")
        "`int` : The number of break tokens added "

        self.breaks_removed = aggregators.len(self.datasources.breaks_removed,
                                              name=self._name +
                                              ".breaks_removed")
        "`int` : The number of break tokens removed"

        self.longest_token_added = aggregators.max(
            mappers.map(len, self.datasources.tokens_added),
            name=self._name + '.longest_token_added')
        "`int` : The length of the longest token added"

        self.longest_uppercase_word_added = aggregators.max(
            mappers.map(len, self.datasources.uppercase_words_added),
            name=self._name + '.longest_uppercase_word_added')
        "`int` : The length of the longest UPPERCASE word added"
Example #12
def filter_paragraphs_without_ref_tags(segment):
    "Check to see if we have at least 10 words and no refs"
    words = 0
    refs = 0
    for t in segment.tokens():
        words += t.type == "word"
        refs += t.type in ("ref_open", "ref_close", "ref_singleton")
    return words > 10 and refs == 0
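
# A quick check of the predicate with a minimal stand-in for a segment;
# the _Segment and _Token stubs are hypothetical, not the real wikitext
# types:
from collections import namedtuple

_Token = namedtuple("_Token", ["type"])


class _Segment:
    # Hypothetical stub mimicking a segment's tokens() iterator.
    def __init__(self, types):
        self._types = types

    def tokens(self):
        return [_Token(t) for t in self._types]


assert filter_paragraphs_without_ref_tags(_Segment(["word"] * 11))
assert not filter_paragraphs_without_ref_tags(
    _Segment(["word"] * 11 + ["ref_open"]))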


paragraphs_without_refs = filters.filter(
    filter_paragraphs_without_ref_tags,
    wikitext.revision.datasources.paragraphs_sentences_and_whitespace,
    name="ptwiki.revision.paragraphs_without_refs")

paragraphs_without_refs_total_length = aggregators.sum(
    mappers.map(len, mappers.map(str, paragraphs_without_refs)),
    name="ptwiki.revision.paragraphs_without_refs_total_length")

# Wikipedia:Manual of style/Words to watch
words_to_watch_count = portuguese.words_to_watch.revision.matches

local_wiki = [
    all_images, all_images / max(wikitext.revision.content_chars, 1),
    category_links, category_links / max(wikitext.revision.content_chars, 1),
    all_ref_tags, all_ref_tags / max(wikitext.revision.content_chars, 1),
    all_cite_templates,
    all_cite_templates / max(wikitext.revision.content_chars, 1),
    proportion_of_templated_references, non_templated_references,
    non_templated_references / max(wikitext.revision.content_chars, 1),
    non_cite_templates, non_cite_templates /
    max(wikitext.revision.content_chars, 1), infobox_templates,