    def __init__(self, en_kvs_path):
        self.tokenizer = mappers.lower_case(wikitext.revision.datasources.words)
        self.vectorizer = self.load_vectorizer(en_kvs_path)
        self.cols_to_extract = [
            'aft_id',
            'aft_page',
            'aft_page_revision',
            'aft_user',
            'aft_user_text',
            'aft_comment',
            'aft_noaction',
            'aft_inappropriate',
            'aft_helpful',
            'aft_unhelpful'
        ]

    def load_vectorizer(self, enwiki_kvs_path):
        enwiki_kvs = vectorizers.word2vec.load_gensim_kv(
            path=enwiki_kvs_path,
            mmap="r"
        )

        vectorize_words = functools.partial(vectorizers.word2vec.vectorize_words, enwiki_kvs)

        revision_text_vectors = vectorizers.word2vec(
            mappers.lower_case(wikitext.revision.datasources.words),
            vectorize_words,
            name="revision.text.en_vectors")

        w2v = aggregators.mean(
            revision_text_vectors,
            vector=True,
            name="revision.text.en_vectors_mean"
        )

        return w2v
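
A minimal sketch of how the feature built above might be evaluated with revscoring's dependency solver. The wrapper class name EnglishVectorizer and the sample text are assumptions for illustration, not part of the original snippet:

from revscoring.datasources import revision_oriented
from revscoring.dependencies import solve

# Hypothetical wrapper class containing the __init__/load_vectorizer above.
extractor = EnglishVectorizer("enwiki-learned_vectors.kv")

# Solve the mean word2vec feature against an in-memory revision text.
cache = {revision_oriented.revision.text: "An example sentence about science."}
mean_vector = solve(extractor.vectorizer, cache=cache)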
Example #3
from revscoring.datasources.meta import vectorizers, mappers
from revscoring.features import wikitext
from revscoring.features.meta import aggregators

cswiki_kvs = vectorizers.word2vec.load_gensim_kv(
    filename="cswiki-20191201-learned_vectors.50_cell.100k.kv", mmap='r')


def vectorize_words(words):
    return vectorizers.word2vec.vectorize_words(cswiki_kvs, words)


revision_text_vectors = vectorizers.word2vec(
    mappers.lower_case(wikitext.revision.datasources.words),
    vectorize_words,
    name="revision.text.cs_vectors")

w2v = aggregators.mean(revision_text_vectors,
                       vector=True,
                       name="revision.text.cs_vectors_mean")

drafttopic = [w2v]
articletopic = drafttopic
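
The drafttopic/articletopic lists above are the feature sets a topic model would train on. A minimal sketch of solving the mean-vector feature directly, assuming an in-memory Czech text as input:

from revscoring.datasources import revision_oriented
from revscoring.dependencies import solve

# The result is one float per embedding dimension (50 cells here).
cache = {revision_oriented.revision.text: "Praha je hlavní město České republiky."}
vector = solve(w2v, cache=cache)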
Example #4
    def __init__(self, name, revision_datasources):
        super().__init__(name, revision_datasources)

        self.tokens = tokenized(revision_datasources.text)
        """
        A list of all tokens
        """

        self.paragraphs_sentences_and_whitespace = Datasource(
            self._name + ".paragraphs_sentences_and_whitespace",
            paragraphs_sentences_and_whitespace.segment,
            depends_on=[self.tokens]
        )
        """
        A list of paragraphs, sentences, and whitespaces as segments.  See
        :class:`deltas.segmenters.Segment` and
        :class:`deltas.segmenters.MatchableSegment`.
        """

        self.token_frequency = frequencies.table(
            self.tokens,
            name=self._name + ".token_frequency"
        )
        """
        A frequency table of all tokens.
        """

        self.numbers = self.tokens_in_types(
            {'number'}, name=self._name + ".numbers"
        )
        """
        A list of numeric tokens
        """

        self.number_frequency = frequencies.table(
            self.numbers, name=self._name + ".number_frequency"
        )
        """
        A frequency table of number tokens.
        """

        self.whitespaces = self.tokens_in_types(
            {'whitespace'}, name=self._name + ".whitespaces"
        )
        """
        A list of whitespace tokens
        """

        self.whitespace_frequency = frequencies.table(
            self.whitespaces, name=self._name + ".whitespace_frequency"
        )
        """
A frequency table of whitespace tokens.
        """

        self.markups = self.tokens_in_types(
            {'dbrack_open', 'dbrack_close', 'brack_open', 'brack_close',
             'tab_open', 'tab_close', 'dcurly_open', 'dcurly_close',
             'curly_open', 'curly_close', 'bold', 'italics', 'equals'},
            name=self._name + ".markups"
        )
        """
        A list of markup tokens
        """

        self.markup_frequency = frequencies.table(
            self.markups, name=self._name + ".markup_frequency"
        )
        """
        A frequency table of markup tokens.
        """

        self.cjks = self.tokens_in_types(
            {'cjk'}, name=self._name + ".cjks"
        )
        """
        A list of Chinese/Japanese/Korean tokens
        """

        self.cjk_frequency = frequencies.table(
            self.cjks, name=self._name + ".cjk_frequency"
        )
        """
        A frequency table of cjk tokens.
        """

        self.entities = self.tokens_in_types(
            {'entity'}, name=self._name + ".entities"
        )
        """
        A list of HTML entity tokens
        """

        self.entity_frequency = frequencies.table(
            self.entities, name=self._name + ".entity_frequency"
        )
        """
        A frequency table of entity tokens.
        """

        self.urls = self.tokens_in_types(
            {'url'}, name=self._name + ".urls"
        )
        """
        A list of URL tokens
        """

        self.url_frequency = frequencies.table(
            self.urls, name=self._name + ".url_frequency"
        )
        """
        A frequency table of url tokens.
        """

        self.words = self.tokens_in_types(
            {'word'}, name=self._name + ".words"
        )
        """
        A list of word tokens
        """

        self.word_frequency = frequencies.table(
            mappers.lower_case(self.words),
            name=self._name + ".word_frequency"
        )
        """
        A frequency table of lower-cased word tokens.
        """

        self.uppercase_words = filters.filter(
            is_uppercase_word, self.words,
            name=self._name + ".uppercase_words"
        )
        """
        A list of uppercase word tokens that are at least two
        characters long.
        """

        self.uppercase_word_frequency = frequencies.table(
            self.uppercase_words,
            name=self._name + ".uppercase_word_frequency"
        )
        """
        A frequency table of uppercase word tokens that are at least two
        characters long.
        """

        self.punctuations = self.tokens_in_types(
            {'period', 'qmark', 'epoint', 'comma', 'colon', 'scolon',
             'japan_punct'},
            name=self._name + ".punctuations"
        )
        """
        A list of punctuation tokens
        """

        self.punctuation_frequency = frequencies.table(
            self.punctuations, name=self._name + ".punctuation_frequency"
        )
        """
        A frequency table of punctuation tokens.
        """

        self.breaks = self.tokens_in_types(
            {'break'}, name=self._name + ".breaks"
        )
        """
        A list of break tokens
        """

        self.break_frequency = frequencies.table(
            self.breaks, name=self._name + ".break_frequency"
        )
        """
Example #5
import pickle

from revscoring.datasources.datasource import Datasource
from revscoring.datasources.meta import mappers
from revscoring.dependencies import solve

tokens = Datasource("tokens")
my_ints = Datasource("my_ints")


def extract_first_char(token):
    return token[:1]


first_char = mappers.map(extract_first_char, tokens, name="first_char")

lower_case_tokens = mappers.lower_case(tokens, name="lower_case_tokens")

derepeat_tokens = mappers.derepeat(tokens, name="derepeat_tokens")

de1337_tokens = mappers.de1337(tokens, name="de1337_tokens")

abs_ints = mappers.abs(my_ints)


def test_item_mapper():
    cache = {tokens: ["alpha", "bravo", "charlie", "delta"]}
    assert (solve(first_char, cache=cache) ==
            ["a", "b", "c", "d"])

    assert pickle.loads(pickle.dumps(first_char)) == first_char
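
The remaining mappers defined above follow the same solve-with-cache pattern. A short sketch of what they might yield; the example inputs and the commented outputs are illustrative assumptions:

def demo_other_mappers():
    cache = {tokens: ["heeello", "w0rd"], my_ints: [-1, 2, -3]}
    # derepeat collapses runs of repeated characters; de1337 maps
    # leetspeak digits back to letters; abs takes absolute values.
    print(solve(derepeat_tokens, cache=cache))  # e.g. ["heello", "w0rd"]
    print(solve(de1337_tokens, cache=cache))    # e.g. ["heeello", "word"]
    print(solve(abs_ints, cache=cache))         # [1, 2, 3]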
Example #6
from revscoring.datasources.meta import vectorizers, mappers
from revscoring.features import wikitext
from revscoring.features.meta import aggregators


kowiki_kvs = vectorizers.word2vec.load_gensim_kv(
    filename="kowiki-20191201-learned_vectors.50_cell.100k.kv", mmap='r')


def vectorize_words(words):
    return vectorizers.word2vec.vectorize_words(kowiki_kvs, words)


revision_text_vectors = vectorizers.word2vec(
    mappers.lower_case(wikitext.revision.datasources.words),
    vectorize_words,
    name="revision.text.ko_vectors")

w2v = aggregators.mean(
    revision_text_vectors,
    vector=True,
    name="revision.text.ko_vectors_mean"
)

drafttopic = [w2v]
articletopic = drafttopic
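
Beyond an in-memory cache, the same feature can be extracted for a live revision through revscoring's API extractor. A sketch assuming an mwapi session against Korean Wikipedia; the revision id and user agent are placeholders:

import mwapi
from revscoring.extractors import api

session = mwapi.Session("https://ko.wikipedia.org", user_agent="example-agent")
extractor = api.Extractor(session)

# Extract the mean-vector feature for a placeholder revision id.
values = list(extractor.extract(123456, [w2v]))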
Example #7
    def __init__(self, name, revision_datasources):
        super().__init__(name, revision_datasources)

        self.tokens = tokenized(revision_datasources.text)
        """
        A list of all tokens
        """

        self.paragraphs_sentences_and_whitespace = Datasource(
            self._name + ".paragraphs_sentences_and_whitespace",
            paragraphs_sentences_and_whitespace.segment,
            depends_on=[self.tokens])
        """
        A list of paragraphs, sentences, and whitespaces as segments.  See
        :class:`deltas.segmenters.Segment` and
        :class:`deltas.segmenters.MatchableSegment`.
        """

        self.token_frequency = frequencies.table(self.tokens,
                                                 name=self._name +
                                                 ".token_frequency")
        """
        A frequency table of all tokens.
        """

        self.numbers = self.tokens_in_types({'number'},
                                            name=self._name + ".numbers")
        """
        A list of numeric tokens
        """

        self.number_frequency = frequencies.table(self.numbers,
                                                  name=self._name +
                                                  ".number_frequency")
        """
        A frequency table of number tokens.
        """

        self.whitespaces = self.tokens_in_types({'whitespace'},
                                                name=self._name +
                                                ".whitespaces")
        """
        A list of whitespace tokens
        """

        self.whitespace_frequency = frequencies.table(self.whitespaces,
                                                      name=self._name +
                                                      ".whitespace_frequency")
        """
A frequency table of whitespace tokens.
        """

        self.markups = self.tokens_in_types(
            {
                'dbrack_open', 'dbrack_close', 'brack_open', 'brack_close',
                'tab_open', 'tab_close', 'dcurly_open', 'dcurly_close',
                'curly_open', 'curly_close', 'bold', 'italics', 'equals'
            },
            name=self._name + ".markups")
        """
        A list of markup tokens
        """

        self.markup_frequency = frequencies.table(self.markups,
                                                  name=self._name +
                                                  ".markup_frequency")
        """
        A frequency table of markup tokens.
        """

        self.cjks = self.tokens_in_types({'cjk'}, name=self._name + ".cjks")
        """
        A list of Chinese/Japanese/Korean tokens
        """

        self.cjk_frequency = frequencies.table(self.cjks,
                                               name=self._name +
                                               ".cjk_frequency")
        """
        A frequency table of cjk tokens.
        """

        self.entities = self.tokens_in_types({'entity'},
                                             name=self._name + ".entities")
        """
        A list of HTML entity tokens
        """

        self.entity_frequency = frequencies.table(self.entities,
                                                  name=self._name +
                                                  ".entity_frequency")
        """
        A frequency table of entity tokens.
        """

        self.urls = self.tokens_in_types({'url'}, name=self._name + ".urls")
        """
        A list of URL tokens
        """

        self.url_frequency = frequencies.table(self.urls,
                                               name=self._name +
                                               ".url_frequency")
        """
        A frequency table of url tokens.
        """

        self.words = self.tokens_in_types({'word'}, name=self._name + ".words")
        """
        A list of word tokens
        """

        self.word_frequency = frequencies.table(mappers.lower_case(self.words),
                                                name=self._name +
                                                ".word_frequency")
        """
        A frequency table of lower-cased word tokens.
        """

        self.uppercase_words = filters.filter(is_uppercase_word,
                                              self.words,
                                              name=self._name +
                                              ".uppercase_words")
        """
        A list of uppercase word tokens that are at least two
        characters long.
        """

        self.uppercase_word_frequency = frequencies.table(
            self.uppercase_words,
            name=self._name + ".uppercase_word_frequency")
        """
        A frequency table of uppercase word tokens that are at least two
        characters long.
        """

        self.punctuations = self.tokens_in_types(
            {
                'period', 'qmark', 'epoint', 'comma', 'colon', 'scolon',
                'japan_punct'
            },
            name=self._name + ".punctuations")
        """
        A list of punctuation tokens
        """

        self.punctuation_frequency = frequencies.table(
            self.punctuations, name=self._name + ".punctuation_frequency")
        """
        A frequency table of punctuation tokens.
        """

        self.breaks = self.tokens_in_types({'break'},
                                           name=self._name + ".breaks")
        """
        A list of break tokens
        """

        self.break_frequency = frequencies.table(self.breaks,
                                                 name=self._name +
                                                 ".break_frequency")
        """