Example #1
 def headings_by_level(self, level, name=None):
     """
     Constructs a :class:`revscoring.Datasource` that generates a `list` of
     all headers of a level.
     """
     if name is None:
         name = "{0}({1})".format(self._name + ".headings_by_level",
                                  level)
     return filters.filter(HeadingOfLevel(level).filter, self.headings,
                           name=name)
Example #2
 def headings_by_level(self, level, name=None):
     """
     Constructs a :class:`revscoring.Datasource` that generates a `list` of
     all headers of a level.
     """
     if name is None:
         name = "{0}({1})".format(self._name + ".headings_by_level", level)
     return filters.filter(HeadingOfLevel(level).filter,
                           self.headings,
                           name=name)
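
Examples #1 and #2 show the same method with two line-wrapping choices; in both, a HeadingOfLevel predicate is built once and filters.filter wraps it into a named, lazily evaluated datasource over self.headings. A self-contained sketch of that pattern in plain Python follows; Heading, HeadingOfLevel, and lazy_filter here are illustrative stand-ins, not the revscoring or deltas API.

# Minimal stand-in for the headings_by_level pattern above.
from dataclasses import dataclass


@dataclass
class Heading:
    level: int
    title: str


class HeadingOfLevel:
    """Predicate object matching headings of one level."""

    def __init__(self, level):
        self.level = int(level)

    def filter(self, heading):
        return heading.level == self.level


def lazy_filter(predicate, items, name=None):
    """Return a named, callable 'datasource' that filters on demand."""
    def datasource():
        return [item for item in items if predicate(item)]
    datasource.name = name or "filter({0})".format(predicate)
    return datasource


headings = [Heading(2, "History"), Heading(3, "Early years"),
            Heading(2, "Geography")]
level_2 = lazy_filter(HeadingOfLevel(2).filter, headings,
                      name="revision.headings_by_level(2)")
print(level_2.name)                  # revision.headings_by_level(2)
print([h.title for h in level_2()])  # ['History', 'Geography']
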
Example #3
 def tokens_removed_in_types(self, types, name=None):
     """
     Constructs a :class:`revscoring.Datasource` that represents tokens
     removed that are within a set of types.
     """
     types = set(types)
     if name is None:
         name = "{0}({1})".format(self._name + ".tokens_removed_in_types",
                                  types)
     return filters.filter(TokenIsInTypes(types).filter,
                           self.tokens_removed, name=name)
Example #4
 def tokens_removed_in_types(self, types, name=None):
     """
     Constructs a :class:`revscoring.Datasource` that represents tokens
     removed that are within a set of types.
     """
     types = set(types)
     if name is None:
         name = "{0}({1})".format(self._name + ".tokens_removed_in_types",
                                  types)
     return filters.filter(TokenIsInTypes(types).filter,
                           self.tokens_removed,
                           name=name)
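
In examples #3 and #4 the method first normalizes types to a set, then composes the default datasource name from the parent's _name, the method name, and that set. A small illustration of the naming convention follows; the "diff" parent name is made up for this example.

# Illustration of the default-name convention used above;
# the "diff" parent name is hypothetical.
def default_name(parent_name, method, types, name=None):
    types = set(types)
    if name is None:
        name = "{0}({1})".format(parent_name + "." + method, types)
    return name


print(default_name("diff", "tokens_removed_in_types", ["word"]))
# diff.tokens_removed_in_types({'word'})
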
Example #5
    def tokens_in_types(self, types, name=None):
        """
        Constructs a :class:`revscoring.Datasource` that returns all content
        tokens that are within a set of types.
        """
        token_is_in_types = TokenIsInTypes(types)

        if name is None:
            name = "{0}({1})" \
                   .format(self._name + ".tokens_in_types", types)

        return filters.filter(token_is_in_types.filter, self.tokens, name=name)
Example #6
    def tokens_in_types(self, types, name=None):
        """
        Constructs a :class:`revscoring.Datasource` that returns all content
        tokens that are within a set of types.
        """
        token_is_in_types = TokenIsInTypes(types)

        if name is None:
            name = "{0}({1})" \
                   .format(self._name + ".tokens_in_types", types)

        return filters.filter(token_is_in_types.filter,
                              self.tokens, name=name)
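
Examples #5 and #6 differ from the earlier methods in binding the predicate object to a local (token_is_in_types) and passing its bound .filter method as the callback. The standalone sketch below shows the same bound-method-as-callback idea with Python's built-in filter; TokenIsInTypes and Token are hypothetical stand-ins for the revscoring/deltas types.

# Bound-method-as-callback, as with token_is_in_types.filter above.
# TokenIsInTypes and Token stand in for the real library types.
class TokenIsInTypes:
    def __init__(self, types):
        self.types = set(types)

    def filter(self, token):
        return token.type in self.types


class Token:
    def __init__(self, type, text):
        self.type = type
        self.text = text


tokens = [Token("word", "Foo"), Token("number", "42"),
          Token("whitespace", " ")]
token_is_in_types = TokenIsInTypes({"word", "number"})
print([t.text for t in filter(token_is_in_types.filter, tokens)])
# ['Foo', '42']
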
Example #7
    def __init__(self, name, revision_datasources):
        super().__init__(name, revision_datasources)

        self.tokens = tokenized(revision_datasources.text)
        """
        A list of all tokens
        """

        self.paragraphs_sentences_and_whitespace = Datasource(
            self._name + ".paragraphs_sentences_and_whitespace",
            paragraphs_sentences_and_whitespace.segment,
            depends_on=[self.tokens]
        )
        """
        A list of paragraphs, sentences, and whitespaces as segments.  See
        :class:`deltas.segmenters.Segment` and
        :class:`deltas.segmenters.MatchableSegment`.
        """

        self.token_frequency = frequencies.table(
            self.tokens,
            name=self._name + ".token_frequency"
        )
        """
        A frequency table of all tokens.
        """

        self.numbers = self.tokens_in_types(
            {'number'}, name=self._name + ".numbers"
        )
        """
        A list of numeric tokens
        """

        self.number_frequency = frequencies.table(
            self.numbers, name=self._name + ".number_frequency"
        )
        """
        A frequency table of number tokens.
        """

        self.whitespaces = self.tokens_in_types(
            {'whitespace'}, name=self._name + ".whitespaces"
        )
        """
        A list of whitespace tokens
        """

        self.whitespace_frequency = frequencies.table(
            self.whitespaces, name=self._name + ".whitespace_frequency"
        )
        """
        A frequency table of whitespace tokens.
        """

        self.markups = self.tokens_in_types(
            {'dbrack_open', 'dbrack_close', 'brack_open', 'brack_close',
             'tab_open', 'tab_close', 'dcurly_open', 'dcurly_close',
             'curly_open', 'curly_close', 'bold', 'italics', 'equals'},
            name=self._name + ".markups"
        )
        """
        A list of markup tokens
        """

        self.markup_frequency = frequencies.table(
            self.markups, name=self._name + ".markup_frequency"
        )
        """
        A frequency table of markup tokens.
        """

        self.cjks = self.tokens_in_types(
            {'cjk'}, name=self._name + ".cjks"
        )
        """
        A list of Chinese/Japanese/Korean tokens
        """

        self.cjk_frequency = frequencies.table(
            self.cjks, name=self._name + ".cjk_frequency"
        )
        """
        A frequency table of cjk tokens.
        """

        self.entities = self.tokens_in_types(
            {'entity'}, name=self._name + ".entities"
        )
        """
        A list of HTML entity tokens
        """

        self.entity_frequency = frequencies.table(
            self.entities, name=self._name + ".entity_frequency"
        )
        """
        A frequency table of entity tokens.
        """

        self.urls = self.tokens_in_types(
            {'url'}, name=self._name + ".urls"
        )
        """
        A list of URL tokens
        """

        self.url_frequency = frequencies.table(
            self.urls, name=self._name + ".url_frequency"
        )
        """
        A frequency table of url tokens.
        """

        self.words = self.tokens_in_types(
            {'word'}, name=self._name + ".words"
        )
        """
        A list of word tokens
        """

        self.word_frequency = frequencies.table(
            mappers.lower_case(self.words),
            name=self._name + ".word_frequency"
        )
        """
        A frequency table of lower-cased word tokens.
        """

        self.uppercase_words = filters.filter(
            is_uppercase_word, self.words,
            name=self._name + ".uppercase_words"
        )
        """
        A list of uppercase word tokens that are at least two
        characters long.
        """

        self.uppercase_word_frequency = frequencies.table(
            self.uppercase_words,
            name=self._name + ".uppercase_word_frequency"
        )
        """
        A frequency table of uppercase word tokens that are at least two
        characters long.
        """

        self.punctuations = self.tokens_in_types(
            {'period', 'qmark', 'epoint', 'comma', 'colon', 'scolon',
             'japan_punct'},
            name=self._name + ".punctuations"
        )
        """
        A list of punctuation tokens
        """

        self.punctuation_frequency = frequencies.table(
            self.punctuations, name=self._name + ".punctuation_frequency"
        )
        """
        A frequency table of punctuation tokens.
        """

        self.breaks = self.tokens_in_types(
            {'break'}, name=self._name + ".breaks"
        )
        """
        A list of break tokens
        """

        self.break_frequency = frequencies.table(
            self.breaks, name=self._name + ".break_frequency"
        )
        """
Example #8
    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)

        self.operations = Datasource(
            self._name + ".operations", _process_operations,
            depends_on=[
                self.revision.parent.paragraphs_sentences_and_whitespace,
                self.revision.paragraphs_sentences_and_whitespace,
                self.revision.parent.tokens,
                self.revision.tokens]
        )
        """
        Returns a tuple that describes the difference between the parent
        revision text and the current revision's text.

        The tuple contains three fields:

        * operations: `list` of :class:`deltas.Operation`
        * A tokens (parent revision): `list` of `str`
        * B tokens (current revision): `list` of `str`
        """

        self.segments_added = Datasource(
            self._name + ".segments_added", _process_segments_added,
            depends_on=[self.operations]
        )
        """
        Returns a list of all contiguous segments of tokens added in this
        revision.
        """

        self.segments_removed = Datasource(
            self._name + ".segments_removed", _process_segments_removed,
            depends_on=[self.operations]
        )
        """
        Returns a list of all contiguous segments of tokens removed in this
        revision.
        """

        self.tokens_added = Datasource(
            self._name + ".tokens_added", _process_tokens_added,
            depends_on=[self.operations]
        )
        """
        Returns a list of all tokens added in this revision.
        """

        self.tokens_removed = Datasource(
            self._name + ".tokens_removed", _process_tokens_removed,
            depends_on=[self.operations]
        )
        """
        Returns a list of all tokens removed in this revision.
        """

        self.numbers_added = self.tokens_added_in_types(
            {'number'}, name=self._name + ".numbers_added"
        )
        """
        A list of numeric tokens added in the edit
        """

        self.numbers_removed = self.tokens_removed_in_types(
            {'number'}, name=self._name + ".numbers_removed"
        )
        """
        A list of numeric tokens removed in the edit
        """

        self.whitespaces_added = self.tokens_added_in_types(
            {'whitespace'}, name=self._name + ".whitespaces_added"
        )
        """
        A list of whitespace tokens added in the edit
        """

        self.whitespaces_removed = self.tokens_removed_in_types(
            {'whitespace'}, name=self._name + ".whitespaces_removed"
        )
        """
        A list of whitespace tokens removed in the edit
        """

        self.markups_added = self.tokens_added_in_types(
            {'dbrack_open', 'dbrack_close', 'brack_open', 'brack_close',
             'tab_open', 'tab_close', 'dcurly_open', 'dcurly_close',
             'curly_open', 'curly_close', 'bold', 'italics', 'equals'},
            name=self._name + ".markups_added"
        )
        """
        A list of markup tokens added in the edit
        """

        self.markups_removed = self.tokens_removed_in_types(
            {'dbrack_open', 'dbrack_close', 'brack_open', 'brack_close',
             'tab_open', 'tab_close', 'dcurly_open', 'dcurly_close',
             'curly_open', 'curly_close', 'bold', 'italics', 'equals'},
            name=self._name + ".markups_removed"
        )
        """
        A list of markup tokens removed in the edit
        """

        self.cjks_added = self.tokens_added_in_types(
            {'cjk'}, name=self._name + ".cjks_added"
        )
        """
        A list of Chinese/Japanese/Korean tokens added in the edit
        """

        self.cjks_removed = self.tokens_removed_in_types(
            {'cjk'}, name=self._name + ".cjks_removed"
        )
        """
        A list of Chinese/Japanese/Korean tokens removed in the edit
        """

        self.entities_added = self.tokens_added_in_types(
            {'entity'}, name=self._name + ".entities_added"
        )
        """
        A list of HTML entity tokens added in the edit
        """

        self.entities_removed = self.tokens_removed_in_types(
            {'entity'}, name=self._name + ".entities_removed"
        )
        """
        A list of HTML entity tokens removed in the edit
        """

        self.urls_added = self.tokens_added_in_types(
            {'url'}, name=self._name + ".urls_added"
        )
        """
        A list of URL tokens added in the edit
        """

        self.urls_removed = self.tokens_removed_in_types(
            {'url'}, name=self._name + ".urls_removed"
        )
        """
        A list of URL tokens removed in the edit
        """

        self.words_added = self.tokens_added_in_types(
            {'word'}, name=self._name + ".words_added"
        )
        """
        A list of word tokens added in the edit
        """

        self.words_removed = self.tokens_removed_in_types(
            {'word'}, name=self._name + ".words_removed"
        )
        """
        A list of word tokens removed in the edit
        """

        self.uppercase_words_added = filters.filter(
            is_uppercase_word, self.words_added,
            name=self._name + ".uppercase_words_added"
        )
        """
        A list of fully UPPERCASE word tokens added in the edit
        """

        self.uppercase_words_removed = filters.filter(
            is_uppercase_word, self.words_removed,
            name=self._name + ".uppercase_words_removed"
        )
        """
        A list of fully UPPERCASE word tokens removed in the edit
        """

        self.punctuations_added = self.tokens_added_in_types(
            {'period', 'qmark', 'epoint', 'comma', 'colon', 'scolon',
             'japan_punct'},
            name=self._name + ".punctuations_added"
        )
        """
        A list of punctuation tokens added in the edit
        """

        self.punctuations_removed = self.tokens_removed_in_types(
            {'period', 'qmark', 'epoint', 'comma', 'colon', 'scolon',
             'japan_punct'},
            name=self._name + ".punctuations_removed"
        )
        """
        A list of punctuation tokens removed in the edit
        """

        self.breaks_added = self.tokens_added_in_types(
            {'break'},
            name=self._name + ".breaks_added"
        )
        """
        A list of break tokens added in the edit
        """

        self.breaks_removed = self.tokens_removed_in_types(
            {'break'},
            name=self._name + ".breaks_removed"
        )
        """
Example #9
    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)

        self.operations = Datasource(
            self._name + ".operations",
            _process_operations,
            depends_on=[
                self.revision.parent.paragraphs_sentences_and_whitespace,
                self.revision.paragraphs_sentences_and_whitespace,
                self.revision.parent.tokens, self.revision.tokens
            ])
        """
        Returns a tuple that describes the difference between the parent
        revision text and the current revision's text.

        The tuple contains three fields:

        * operations: `list` of :class:`deltas.Operation`
        * A tokens (parent revision): `list` of `str`
        * B tokens (current revision): `list` of `str`
        """

        self.segments_added = Datasource(self._name + ".segments_added",
                                         _process_segments_added,
                                         depends_on=[self.operations])
        """
        Returns a list of all contiguous segments of tokens added in this
        revision.
        """

        self.segments_removed = Datasource(self._name + ".segments_removed",
                                           _process_segments_removed,
                                           depends_on=[self.operations])
        """
        Returns a list of all contiguous segments of tokens removed in this
        revision.
        """

        self.tokens_added = Datasource(self._name + ".tokens_added",
                                       _process_tokens_added,
                                       depends_on=[self.operations])
        """
        Returns a list of all tokens added in this revision.
        """

        self.tokens_removed = Datasource(self._name + ".tokens_removed",
                                         _process_tokens_removed,
                                         depends_on=[self.operations])
        """
        Returns a list of all tokens removed in this revision.
        """

        self.numbers_added = self.tokens_added_in_types({'number'},
                                                        name=self._name +
                                                        ".numbers_added")
        """
        A list of numeric tokens added in the edit
        """

        self.numbers_removed = self.tokens_removed_in_types({'number'},
                                                            name=self._name +
                                                            ".numbers_removed")
        """
        A list of numeric tokens removed in the edit
        """

        self.whitespaces_added = self.tokens_added_in_types(
            {'whitespace'}, name=self._name + ".whitespaces_added")
        """
        A list of whitespace tokens added in the edit
        """

        self.whitespaces_removed = self.tokens_removed_in_types(
            {'whitespace'}, name=self._name + ".whitespaces_removed")
        """
        A list of whitespace tokens removed in the edit
        """

        self.markups_added = self.tokens_added_in_types(
            {
                'dbrack_open', 'dbrack_close', 'brack_open', 'brack_close',
                'tab_open', 'tab_close', 'dcurly_open', 'dcurly_close',
                'curly_open', 'curly_close', 'bold', 'italics', 'equals'
            },
            name=self._name + ".markups_added")
        """
        A list of markup tokens added in the edit
        """

        self.markups_removed = self.tokens_removed_in_types(
            {
                'dbrack_open', 'dbrack_close', 'brack_open', 'brack_close',
                'tab_open', 'tab_close', 'dcurly_open', 'dcurly_close',
                'curly_open', 'curly_close', 'bold', 'italics', 'equals'
            },
            name=self._name + ".markups_removed")
        """
        A list of markup tokens removed in the edit
        """

        self.cjks_added = self.tokens_added_in_types({'cjk'},
                                                     name=self._name +
                                                     ".cjks_added")
        """
        A list of Chinese/Japanese/Korean tokens added in the edit
        """

        self.cjks_removed = self.tokens_removed_in_types({'cjk'},
                                                         name=self._name +
                                                         ".cjks_removed")
        """
        A list of Chinese/Japanese/Korean tokens removed in the edit
        """

        self.entities_added = self.tokens_added_in_types({'entity'},
                                                         name=self._name +
                                                         ".entities_added")
        """
        A list of HTML entity tokens added in the edit
        """

        self.entities_removed = self.tokens_removed_in_types(
            {'entity'}, name=self._name + ".entities_removed")
        """
        A list of HTML entity tokens removed in the edit
        """

        self.urls_added = self.tokens_added_in_types({'url'},
                                                     name=self._name +
                                                     ".urls_added")
        """
        A list of URL tokens added in the edit
        """

        self.urls_removed = self.tokens_removed_in_types({'url'},
                                                         name=self._name +
                                                         ".urls_removed")
        """
        A list of URL tokens removed in the edit
        """

        self.words_added = self.tokens_added_in_types({'word'},
                                                      name=self._name +
                                                      ".words_added")
        """
        A list of word tokens added in the edit
        """

        self.words_removed = self.tokens_removed_in_types({'word'},
                                                          name=self._name +
                                                          ".words_removed")
        """
        A list of word tokens removed in the edit
        """

        self.uppercase_words_added = filters.filter(is_uppercase_word,
                                                    self.words_added,
                                                    name=self._name +
                                                    ".uppercase_words_added")
        """
        A list of fully UPPERCASE word tokens added in the edit
        """

        self.uppercase_words_removed = filters.filter(
            is_uppercase_word,
            self.words_removed,
            name=self._name + ".uppercase_words_removed")
        """
        A list of fully UPPERCASE word tokens removed in the edit
        """

        self.punctuations_added = self.tokens_added_in_types(
            {
                'period', 'qmark', 'epoint', 'comma', 'colon', 'scolon',
                'japan_punct'
            },
            name=self._name + ".punctuations_added")
        """
        A list of punctuation tokens added in the edit
        """

        self.punctuations_removed = self.tokens_removed_in_types(
            {
                'period', 'qmark', 'epoint', 'comma', 'colon', 'scolon',
                'japan_punct'
            },
            name=self._name + ".punctuations_removed")
        """
        A list of punctuation tokens removed in the edit
        """

        self.breaks_added = self.tokens_added_in_types({'break'},
                                                       name=self._name +
                                                       ".breaks_added")
        """
        A list of break tokens added in the edit
        """

        self.breaks_removed = self.tokens_removed_in_types({'break'},
                                                           name=self._name +
                                                           ".breaks_removed")
        """
Example #10
    images_in_tags + infobox_images


# References
def filter_paragraphs_without_ref_tags(segment):
    "Check to see if we have at least 10 words and no refs"
    words = 0
    refs = 0
    for t in segment.tokens():
        words += t.type == "word"
        refs += t.type in ("ref_open", "ref_close", "ref_singleton")
    return words > 10 and refs == 0


paragraphs_without_refs = filters.filter(
    filter_paragraphs_without_ref_tags,
    wikitext.revision.datasources.paragraphs_sentences_and_whitespace,
    name="ptwiki.revision.paragraphs_without_refs")

paragraphs_without_refs_total_length = aggregators.sum(
    mappers.map(len, mappers.map(str, paragraphs_without_refs)),
    name="ptwiki.revision.paragraphs_without_refs_total_length")

# Wikipedia:Manual of style/Words to watch
words_to_watch_count = portuguese.words_to_watch.revision.matches

local_wiki = [
    all_images, all_images / max(wikitext.revision.content_chars, 1),
    category_links, category_links / max(wikitext.revision.content_chars, 1),
    all_ref_tags, all_ref_tags / max(wikitext.revision.content_chars, 1),
    all_cite_templates,
    all_cite_templates / max(wikitext.revision.content_chars, 1),
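
The filter in example #10 fires only on substantial unreferenced paragraphs: it counts word tokens and ref_* tokens in a segment and keeps segments with more than ten words and no refs. A self-contained check of that logic with mock classes follows; Token and Segment here are hypothetical stand-ins for the deltas segment types.

# Mock-segment check of filter_paragraphs_without_ref_tags' logic.
from dataclasses import dataclass


@dataclass
class Token:
    type: str


class Segment:
    def __init__(self, token_types):
        self._tokens = [Token(t) for t in token_types]

    def tokens(self):
        return self._tokens


def filter_paragraphs_without_ref_tags(segment):
    "Check that the segment has more than 10 words and no ref tags"
    words = 0
    refs = 0
    for t in segment.tokens():
        words += t.type == "word"
        refs += t.type in ("ref_open", "ref_close", "ref_singleton")
    return words > 10 and refs == 0


print(filter_paragraphs_without_ref_tags(Segment(["word"] * 11)))  # True
print(filter_paragraphs_without_ref_tags(Segment(["word"] * 10)))  # False
print(filter_paragraphs_without_ref_tags(
    Segment(["word"] * 11 + ["ref_singleton"])))                   # False
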
Example #11
    def __init__(self, name, revision_datasources):
        super().__init__(name, revision_datasources)

        self.tokens = tokenized(revision_datasources.text)
        """
        A list of all tokens
        """

        self.paragraphs_sentences_and_whitespace = Datasource(
            self._name + ".paragraphs_sentences_and_whitespace",
            paragraphs_sentences_and_whitespace.segment,
            depends_on=[self.tokens])
        """
        A list of paragraphs, sentences, and whitespaces as segments.  See
        :class:`deltas.segmenters.Segment` and
        :class:`deltas.segmenters.MatchableSegment`.
        """

        self.token_frequency = frequencies.table(self.tokens,
                                                 name=self._name +
                                                 ".token_frequency")
        """
        A frequency table of all tokens.
        """

        self.numbers = self.tokens_in_types({'number'},
                                            name=self._name + ".numbers")
        """
        A list of numeric tokens
        """

        self.number_frequency = frequencies.table(self.numbers,
                                                  name=self._name +
                                                  ".number_frequency")
        """
        A frequency table of number tokens.
        """

        self.whitespaces = self.tokens_in_types({'whitespace'},
                                                name=self._name +
                                                ".whitespaces")
        """
        A list of whitespace tokens
        """

        self.whitespace_frequency = frequencies.table(self.whitespaces,
                                                      name=self._name +
                                                      ".whitespace_frequency")
        """
        A frequency table of whitespace tokens.
        """

        self.markups = self.tokens_in_types(
            {
                'dbrack_open', 'dbrack_close', 'brack_open', 'brack_close',
                'tab_open', 'tab_close', 'dcurly_open', 'dcurly_close',
                'curly_open', 'curly_close', 'bold', 'italics', 'equals'
            },
            name=self._name + ".markups")
        """
        A list of markup tokens
        """

        self.markup_frequency = frequencies.table(self.markups,
                                                  name=self._name +
                                                  ".markup_frequency")
        """
        A frequency table of markup tokens.
        """

        self.cjks = self.tokens_in_types({'cjk'}, name=self._name + ".cjks")
        """
        A list of Chinese/Japanese/Korean tokens
        """

        self.cjk_frequency = frequencies.table(self.cjks,
                                               name=self._name +
                                               ".cjk_frequency")
        """
        A frequency table of cjk tokens.
        """

        self.entities = self.tokens_in_types({'entity'},
                                             name=self._name + ".entities")
        """
        A list of HTML entity tokens
        """

        self.entity_frequency = frequencies.table(self.entities,
                                                  name=self._name +
                                                  ".entity_frequency")
        """
        A frequency table of entity tokens.
        """

        self.urls = self.tokens_in_types({'url'}, name=self._name + ".urls")
        """
        A list of URL tokens
        """

        self.url_frequency = frequencies.table(self.urls,
                                               name=self._name +
                                               ".url_frequency")
        """
        A frequency table of url tokens.
        """

        self.words = self.tokens_in_types({'word'}, name=self._name + ".words")
        """
        A list of word tokens
        """

        self.word_frequency = frequencies.table(mappers.lower_case(self.words),
                                                name=self._name +
                                                ".word_frequency")
        """
        A frequency table of lower-cased word tokens.
        """

        self.uppercase_words = filters.filter(is_uppercase_word,
                                              self.words,
                                              name=self._name +
                                              ".uppercase_words")
        """
        A list of uppercase word tokens that are at least two
        characters long.
        """

        self.uppercase_word_frequency = frequencies.table(
            self.uppercase_words,
            name=self._name + ".uppercase_word_frequency")
        """
        A frequency table of uppercase word tokens that are at least two
        characters long.
        """

        self.punctuations = self.tokens_in_types(
            {
                'period', 'qmark', 'epoint', 'comma', 'colon', 'scolon',
                'japan_punct'
            },
            name=self._name + ".punctuations")
        """
        A list of punctuation tokens
        """

        self.punctuation_frequency = frequencies.table(
            self.punctuations, name=self._name + ".punctuation_frequency")
        """
        A frequency table of punctuation tokens.
        """

        self.breaks = self.tokens_in_types({'break'},
                                           name=self._name + ".breaks")
        """
        A list of break tokens
        """

        self.break_frequency = frequencies.table(self.breaks,
                                                 name=self._name +
                                                 ".break_frequency")
        """