def __init__(self, en_kvs_path):
    self.tokenizer = mappers.lower_case(wikitext.revision.datasources.words)
    self.vectorizer = self.load_vectorizer(en_kvs_path)
    # Columns to pull from the Article Feedback (AFT) data.
    self.cols_to_extract = [
        'aft_id', 'aft_page', 'aft_page_revision', 'aft_user',
        'aft_user_text', 'aft_comment', 'aft_noaction',
        'aft_inappropriate', 'aft_helpful', 'aft_unhelpful'
    ]
def load_vectorizer(self, enwiki_kvs_path):
    enwiki_kvs = vectorizers.word2vec.load_gensim_kv(
        path=enwiki_kvs_path, mmap="r")
    vectorize_words = functools.partial(
        vectorizers.word2vec.vectorize_words, enwiki_kvs)
    revision_text_vectors = vectorizers.word2vec(
        mappers.lower_case(wikitext.revision.datasources.words),
        vectorize_words,
        name="revision.text.en_vectors")
    w2v = aggregators.mean(
        revision_text_vectors,
        vector=True,
        name="revision.text.en_vectors_mean")
    return w2v
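# A rough usage sketch for load_vectorizer(). Hedged assumptions: `Extractor`
# is a stand-in name for the method's enclosing class (its real name isn't
# shown above), and the .kv path is a placeholder, not a file in this repo.
from revscoring.datasources.revision_oriented import revision
from revscoring.dependencies import solve

extractor = Extractor("/path/to/enwiki-learned_vectors.kv")  # hypothetical
mean_vector = solve(extractor.vectorizer,
                    cache={revision.text: "Example article text."})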
from revscoring.datasources.meta import vectorizers, mappers
from revscoring.features import wikitext
from revscoring.features.meta import aggregators

cswiki_kvs = vectorizers.word2vec.load_gensim_kv(
    filename="cswiki-20191201-learned_vectors.50_cell.100k.kv",
    mmap='r')


def vectorize_words(words):
    return vectorizers.word2vec.vectorize_words(cswiki_kvs, words)


revision_text_vectors = vectorizers.word2vec(
    mappers.lower_case(wikitext.revision.datasources.words),
    vectorize_words,
    name="revision.text.cs_vectors")

w2v = aggregators.mean(
    revision_text_vectors,
    vector=True,
    name="revision.text.cs_vectors_mean")

drafttopic = [w2v]
articletopic = drafttopic
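# A short sketch of how this feature resolves, following revscoring's usual
# solve()/cache pattern. Hedged assumptions: the cswiki .kv file above is on
# disk, and revision.text is the root datasource behind
# wikitext.revision.datasources.words.
from revscoring.datasources.revision_oriented import revision
from revscoring.dependencies import solve

mean_vector = solve(w2v, cache={revision.text: "Text českého článku."})
# drafttopic/articletopic models consume this single mean-vector feature.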
def __init__(self, name, revision_datasources):
    super().__init__(name, revision_datasources)

    self.tokens = tokenized(revision_datasources.text)
    """
    A list of all tokens
    """

    self.paragraphs_sentences_and_whitespace = Datasource(
        self._name + ".paragraphs_sentences_and_whitespace",
        paragraphs_sentences_and_whitespace.segment,
        depends_on=[self.tokens]
    )
    """
    A list of paragraphs, sentences, and whitespaces as segments.  See
    :class:`deltas.segmenters.Segment` and
    :class:`deltas.segmenters.MatchableSegment`.
    """

    self.token_frequency = frequencies.table(
        self.tokens,
        name=self._name + ".token_frequency"
    )
    """
    A frequency table of all tokens.
    """

    self.numbers = self.tokens_in_types(
        {'number'},
        name=self._name + ".numbers"
    )
    """
    A list of numeric tokens
    """

    self.number_frequency = frequencies.table(
        self.numbers,
        name=self._name + ".number_frequency"
    )
    """
    A frequency table of number tokens.
    """

    self.whitespaces = self.tokens_in_types(
        {'whitespace'},
        name=self._name + ".whitespaces"
    )
    """
    A list of whitespace tokens
    """

    self.whitespace_frequency = frequencies.table(
        self.whitespaces,
        name=self._name + ".whitespace_frequency"
    )
    """
    A frequency table of whitespace tokens.
    """

    self.markups = self.tokens_in_types(
        {'dbrack_open', 'dbrack_close', 'brack_open', 'brack_close',
         'tab_open', 'tab_close', 'dcurly_open', 'dcurly_close',
         'curly_open', 'curly_close', 'bold', 'italics', 'equals'},
        name=self._name + ".markups"
    )
    """
    A list of markup tokens
    """

    self.markup_frequency = frequencies.table(
        self.markups,
        name=self._name + ".markup_frequency"
    )
    """
    A frequency table of markup tokens.
    """

    self.cjks = self.tokens_in_types(
        {'cjk'},
        name=self._name + ".cjks"
    )
    """
    A list of Chinese/Japanese/Korean tokens
    """

    self.cjk_frequency = frequencies.table(
        self.cjks,
        name=self._name + ".cjk_frequency"
    )
    """
    A frequency table of cjk tokens.
    """

    self.entities = self.tokens_in_types(
        {'entity'},
        name=self._name + ".entities"
    )
    """
    A list of HTML entity tokens
    """

    self.entity_frequency = frequencies.table(
        self.entities,
        name=self._name + ".entity_frequency"
    )
    """
    A frequency table of entity tokens.
    """

    self.urls = self.tokens_in_types(
        {'url'},
        name=self._name + ".urls"
    )
    """
    A list of URL tokens
    """

    self.url_frequency = frequencies.table(
        self.urls,
        name=self._name + ".url_frequency"
    )
    """
    A frequency table of url tokens.
    """

    self.words = self.tokens_in_types(
        {'word'},
        name=self._name + ".words"
    )
    """
    A list of word tokens
    """

    self.word_frequency = frequencies.table(
        mappers.lower_case(self.words),
        name=self._name + ".word_frequency"
    )
    """
    A frequency table of lower-cased word tokens.
    """

    self.uppercase_words = filters.filter(
        is_uppercase_word, self.words,
        name=self._name + ".uppercase_words"
    )
    """
    A list of uppercase word tokens that are at least two
    characters long.
    """

    self.uppercase_word_frequency = frequencies.table(
        self.uppercase_words,
        name=self._name + ".uppercase_word_frequency"
    )
    """
    A frequency table of uppercase word tokens that are at least two
    characters long.
    """

    self.punctuations = self.tokens_in_types(
        {'period', 'qmark', 'epoint', 'comma', 'colon', 'scolon',
         'japan_punct'},
        name=self._name + ".punctuations"
    )
    """
    A list of punctuation tokens
    """

    self.punctuation_frequency = frequencies.table(
        self.punctuations,
        name=self._name + ".punctuation_frequency"
    )
    """
    A frequency table of punctuation tokens.
    """

    self.breaks = self.tokens_in_types(
        {'break'},
        name=self._name + ".breaks"
    )
    """
    A list of break tokens
    """

    self.break_frequency = frequencies.table(
        self.breaks,
        name=self._name + ".break_frequency"
    )
    """
    A frequency table of break tokens.
    """
import pickle

from revscoring.datasources.datasource import Datasource
from revscoring.datasources.meta import mappers
from revscoring.dependencies import solve

tokens = Datasource("tokens")
my_ints = Datasource("my_ints")


def extract_first_char(token):
    return token[:1]


first_char = mappers.map(extract_first_char, tokens, name="first_char")
lower_case_tokens = mappers.lower_case(tokens, name="lower_case_tokens")
derepeat_tokens = mappers.derepeat(tokens, name="derepeat_tokens")
de1337_tokens = mappers.de1337(tokens, name="de1337_tokens")
abs_ints = mappers.abs(my_ints)


def test_item_mapper():
    cache = {tokens: ["alpha", "bravo", "charlie", "delta"]}
    assert (solve(first_char, cache=cache) ==
            ["a", "b", "c", "d"])

    assert pickle.loads(pickle.dumps(first_char)) == first_char
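# Two companion sketches in the same style as test_item_mapper, exercising
# other mappers defined above. The expected outputs are inferred from the
# mapper names (lower_case lower-cases, abs takes item-wise absolute values),
# not copied from the repo's own tests.
def test_lower_case():
    cache = {tokens: ["Alpha", "BRAVO", "charlie"]}
    assert (solve(lower_case_tokens, cache=cache) ==
            ["alpha", "bravo", "charlie"])


def test_abs():
    cache = {my_ints: [-1, 2, -3]}
    assert solve(abs_ints, cache=cache) == [1, 2, 3]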
from revscoring.datasources.meta import vectorizers, mappers
from revscoring.features import wikitext
from revscoring.features.meta import aggregators

kowiki_kvs = vectorizers.word2vec.load_gensim_kv(
    filename="kowiki-20191201-learned_vectors.50_cell.100k.kv",
    mmap='r')


def vectorize_words(words):
    return vectorizers.word2vec.vectorize_words(kowiki_kvs, words)


revision_text_vectors = vectorizers.word2vec(
    mappers.lower_case(wikitext.revision.datasources.words),
    vectorize_words,
    name="revision.text.ko_vectors")

w2v = aggregators.mean(
    revision_text_vectors,
    vector=True,
    name="revision.text.ko_vectors_mean")

drafttopic = [w2v]
articletopic = drafttopic
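# A sanity-check sketch mirroring the cswiki example. Hedged assumptions: the
# kowiki .kv file above is loadable, and "50_cell" in its filename means the
# vectors (and thus the mean vector) are 50-dimensional.
from revscoring.datasources.revision_oriented import revision
from revscoring.dependencies import solve

mean_vector = solve(w2v, cache={revision.text: "한국어 위키백과 문서 본문"})
assert len(mean_vector) == 50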