def tokenized(text_datasource, name=None, tok_strategy="Latin"):
    """
    Constructs a :class:`revision.Datasource` that generates a list of tokens
    """
    if name is None:
        name = "{0}({1!r}, {2!r})".format("tokenized", text_datasource,
                                          tok_strategy)

    if tok_strategy == "Latin":
        return Datasource(name, _process_tokens,
                          depends_on=[text_datasource])
    elif tok_strategy == "CJK":
        return Datasource(name, _process_tokens_cjk,
                          depends_on=[text_datasource])
    else:
        raise NotImplementedError
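# --- Hedged usage sketch (illustration only, not part of the module above) ---
# Assumes a root text datasource whose value is supplied through the solve()
# cache; the example name and text are made up.  Solving the tokenized
# datasource should then run the underlying tokenizer over the cached text.
from revscoring.datasources import Datasource
from revscoring.dependencies import solve

example_text = Datasource("example.text")  # assumed root datasource
example_tokens = tokenized(example_text, tok_strategy="Latin")
print(solve(example_tokens, cache={example_text: "Some example wiki text."}))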
def __init__(self, name, revision_datasources):
    super().__init__(name, revision_datasources)

    self.sentences = Datasource(
        self._name + ".sentences", psw2sentences,
        depends_on=[self.paragraphs_sentences_and_whitespace]
    )
    """
    A list of sentences in the text
    """
def tokenized(text_datasource, name=None):
    """
    Constructs a :class:`revision.Datasource` that generates a list of tokens
    """
    if name is None:
        name = "{0}({1})".format("tokenized", text_datasource)

    return Datasource(name, _process_tokens, depends_on=[text_datasource])
def __init__(self, prefix, revision_datasources):
    self.bytes = Datasource(prefix + ".bytes", _process_bytes,
                            depends_on=[revision_datasources.text])

    if hasattr(revision_datasources, "parent"):
        self.parent = Revision(prefix + ".parent",
                               revision_datasources.parent)
def test_scoring_context():
    from revscoring.datasources import Datasource
    from revscoring.dependencies import Dependent
    from revscoring.features import Feature

    fake_data = Datasource("fake_data", lambda: "fake")
    len_func = Dependent("len_func")
    literal_fake = Dependent("literal_fake")

    characters = Feature("characters", lambda word, len: len(word),
                         returns=int,
                         depends_on=[fake_data, len_func])
    is_fake = Feature("is_fake", lambda word, fake: word == fake,
                      returns=bool,
                      depends_on=[fake_data, literal_fake])

    FakeExtractor = namedtuple("Extractor", ['extract', 'solve', 'language'])

    def fake_extract(rev_ids, dependents, caches=None):
        caches = caches or defaultdict(dict)
        for rev_id in rev_ids:
            cache = caches[rev_id]
            if rev_id % 5 != 0:
                values = dependencies.solve(dependents,
                                            context={len_func: lambda: len},
                                            cache=cache)
                yield None, list(values)
            else:
                yield RuntimeError("extract"), None

    def fake_solve(dependents, cache=None):
        cache = cache or {}
        cache.update({len_func: len, literal_fake: "fake"})
        return dependencies.solve(dependents, cache=cache)

    extractor = FakeExtractor(fake_extract, fake_solve, None)

    FakeScorerModel = namedtuple("FakeScorerModel",
                                 ['score', 'version', 'language', 'features'])
    scorer_model = FakeScorerModel(lambda fvs: {"prediction": "generated"},
                                   "1", None, [characters, is_fake])

    scoring_context = ScoringContext("fakewiki", {"fake": scorer_model},
                                     extractor)

    rev_ids = [1, 2, 3, 4, 5]
    root_ds_caches = scoring_context.extract_roots("fake", rev_ids)

    eq_(len(root_ds_caches), 5)
    eq_(root_ds_caches[1][1][fake_data], "fake")
    assert root_ds_caches[5][0] is not None

    score, feature_vals = scoring_context.score("fake", {characters: 10,
                                                         is_fake: False})
    eq_(score['prediction'], "generated")
def test_max_vectors():
    my_list = Datasource("my_list")
    my_max = aggregators.max(my_list, vector=True)

    cache = {my_list: [[1, 2, 3], [4, 5, 6]]}
    assert all(a == b for a, b in zip(solve(my_max, cache=cache), [4, 5, 6]))

    cache = {my_list: [[]]}
    assert solve(my_max, cache=cache) == [0]

    cache = {my_list: [None]}
    assert solve(my_max, cache=cache) == [0]

    assert pickle.loads(pickle.dumps(my_max)) == my_max
def test_trim():
    d1 = Datasource("derp1")
    f1 = Feature("foobar1", returns=int)
    f2 = Feature("foobar2", returns=int, depends_on=[d1])
    c = Constant(value=5)
    fv = FeatureVector("foobar3", returns=int, depends_on=[c])

    assert list(trim(f1)) == [f1]
    assert list(trim([f1, f2, fv])) == [f1, f2, fv]
    assert list(trim(log(max(f1 - f2, 1)))) == [f1, f2]
def test_len():
    my_list = Datasource("my_list")
    my_len = aggregators.len(my_list)

    cache = {my_list: [1, 2, 3, 4]}
    assert solve(my_len, cache=cache) == 4

    cache = {my_list: []}
    assert solve(my_len, cache=cache) == 0

    cache = {my_list: None}
    assert solve(my_len, cache=cache) == 0

    assert pickle.loads(pickle.dumps(my_len)) == my_len
def test_sum_vectors():
    my_list = Datasource("my_list")
    my_sum = aggregators.sum(my_list, vector=True)

    cache = {my_list: [[1, 2, 3], [4, 5, 6]]}
    assert all(a == b for a, b in zip(solve(my_sum, cache=cache), [5, 7, 9]))

    cache = {my_list: [[]]}
    assert solve(my_sum, cache=cache) == [0]

    cache = {my_list: [None]}
    assert solve(my_sum, cache=cache) == [0]

    assert str(my_sum) == "feature_vector.sum(<datasource.my_list>)"

    assert pickle.loads(pickle.dumps(my_sum)) == my_sum
def test_sum():
    my_list = Datasource("my_list")
    my_sum = aggregators.sum(my_list)

    cache = {my_list: [1, 2, 3, 4]}
    assert solve(my_sum, cache=cache) == 10

    cache = {my_list: []}
    assert solve(my_sum, cache=cache) == 0

    cache = {my_list: None}
    assert solve(my_sum, cache=cache) == 0

    assert str(my_sum) == "feature.sum(<datasource.my_list>)"

    assert pickle.loads(pickle.dumps(my_sum)) == my_sum
def test_key():
    my_dict = Datasource("my_dict")
    foo = key('foo', my_dict)
    assert solve(foo, cache={my_dict: {'foo': "bar"}}) == 'bar'
    assert repr(foo) == "<datasource.my_dict['foo']>"

    bar = key('bar', my_dict, apply=or_none(int))
    assert solve(bar, cache={my_dict: {'bar': None}}) is None
    assert solve(bar, cache={my_dict: {'bar': "1"}}) == 1

    foobar = key(['foo', 'bar'], my_dict)
    assert solve(foobar, cache={my_dict: {'bar': 1}}) is None
    assert solve(foobar, cache={my_dict: {'foo': {'bar': 1}}}) == 1
    assert repr(foobar) == "<datasource.my_dict[['foo', 'bar']]>"

    assert pickle.loads(pickle.dumps(foo)) == foo
    assert pickle.loads(pickle.dumps(bar)) == bar
    assert pickle.loads(pickle.dumps(foobar)) == foobar
def test_offline_extractor():
    last_two_in_id = Datasource("last_two_in_id", get_last_two,
                                depends_on=[revision_oriented.revision.id])
    extractor = OfflineExtractor()
    assert extractor.extract(345678, last_two_in_id) == 78
    assert (list(extractor.extract([345678, 4634800], last_two_in_id)) ==
            [(None, 78), (None, 0)])

    extraction_profile = {}
    list(extractor.extract([345678, 4634800], last_two_in_id,
                           profile=extraction_profile))
    assert len(extraction_profile) == 1
    assert len(extraction_profile[last_two_in_id]) == 2
def __init__(self, *args, **kwargs):
    super().__init__(*args, **kwargs)

    self.sentences_added_removed = Datasource(
        self._name + ".sentences_added_removed", set_diff,
        depends_on=[self.revision.sentences,
                    self.revision.parent.sentences]
    )
    self.sentences_added = indexable.index(
        0, self.sentences_added_removed,
        name=self._name + ".sentences_added"
    )
    """
    A set of sentences that were added in this edit
    """
    self.sentences_removed = indexable.index(
        1, self.sentences_added_removed,
        name=self._name + ".sentences_removed"
    )
    """
    A set of sentences that were removed in this edit
    """
wikidata_kvs = vectorizers.word2vec.load_gensim_kv(
    filename="wikidata-20200501-learned_vectors.50_cell.10k.kv",
    mmap="r")


def process_claims_to_words(claims):
    words = []
    for pid, value in claims:
        words.append(pid)
        if QID_RE.match(value) is not None:
            words.append(value)
    return words


def vectorize_words(words):
    return vectorizers.word2vec.vectorize_words(wikidata_kvs, words)


claim_words = Datasource(
    "wikidata.revision.claim_words",
    process_claims_to_words,
    depends_on=[wikibase.revision.datasources.claims])

revision_claim_words_vectors = vectorizers.word2vec(
    claim_words,
    vectorize_words,
    name="revision.text.wikidata_vectors")

w2v = aggregators.mean(
    revision_claim_words_vectors,
    vector=True,
    name="revision.text.wikidata_vectors_mean")

articletopic = [w2v]
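# --- Hedged illustration of the claim-to-word conversion above ---
# The claim pairs here are made up, and this assumes QID_RE matches item
# identifiers like "Q5" (so the non-QID date value is dropped while the
# property ids are kept).
example_claims = [("P31", "Q5"), ("P569", "+1952-03-11T00:00:00Z")]
assert process_claims_to_words(example_claims) == ["P31", "Q5", "P569"]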
def __init__(self, name, revision_datasources):
    super().__init__(name, revision_datasources)

    self.tokens = tokenized(revision_datasources.text)
    """
    A list of all tokens
    """
    self.paragraphs_sentences_and_whitespace = Datasource(
        self._name + ".paragraphs_sentences_and_whitespace",
        paragraphs_sentences_and_whitespace.segment,
        depends_on=[self.tokens])
    """
    A list of paragraphs, sentences, and whitespaces as segments.  See
    :class:`deltas.segmenters.Segment` and
    :class:`deltas.segmenters.MatchableSegment`.
    """
    self.token_frequency = frequencies.table(
        self.tokens, name=self._name + ".token_frequency")
    """
    A frequency table of all tokens.
    """
    self.numbers = self.tokens_in_types(
        {'number'}, name=self._name + ".numbers")
    """
    A list of numeric tokens
    """
    self.number_frequency = frequencies.table(
        self.numbers, name=self._name + ".number_frequency")
    """
    A frequency table of number tokens.
    """
    self.whitespaces = self.tokens_in_types(
        {'whitespace'}, name=self._name + ".whitespaces")
    """
    A list of whitespace tokens
    """
    self.whitespace_frequency = frequencies.table(
        self.whitespaces, name=self._name + ".whitespace_frequency")
    """
    A frequency table of whitespace tokens.
    """
    self.markups = self.tokens_in_types(
        {'dbrack_open', 'dbrack_close', 'brack_open', 'brack_close',
         'tab_open', 'tab_close', 'dcurly_open', 'dcurly_close',
         'curly_open', 'curly_close', 'bold', 'italics', 'equals'},
        name=self._name + ".markups")
    """
    A list of markup tokens
    """
    self.markup_frequency = frequencies.table(
        self.markups, name=self._name + ".markup_frequency")
    """
    A frequency table of markup tokens.
    """
    self.cjks = self.tokens_in_types(
        {'cjk'}, name=self._name + ".cjks")
    """
    A list of Chinese/Japanese/Korean tokens
    """
    self.cjk_frequency = frequencies.table(
        self.cjks, name=self._name + ".cjk_frequency")
    """
    A frequency table of cjk tokens.
    """
    self.entities = self.tokens_in_types(
        {'entity'}, name=self._name + ".entities")
    """
    A list of HTML entity tokens
    """
    self.entity_frequency = frequencies.table(
        self.entities, name=self._name + ".entity_frequency")
    """
    A frequency table of entity tokens.
    """
    self.urls = self.tokens_in_types(
        {'url'}, name=self._name + ".urls")
    """
    A list of URL tokens
    """
    self.url_frequency = frequencies.table(
        self.urls, name=self._name + ".url_frequency")
    """
    A frequency table of url tokens.
    """
    self.words = self.tokens_in_types(
        {'word'}, name=self._name + ".words")
    """
    A list of word tokens
    """
    self.word_frequency = frequencies.table(
        mappers.lower_case(self.words),
        name=self._name + ".word_frequency")
    """
    A frequency table of lower-cased word tokens.
    """
    self.uppercase_words = filters.filter(
        is_uppercase_word, self.words,
        name=self._name + ".uppercase_words")
    """
    A list of uppercase word tokens that are at least two characters long.
    """
    self.uppercase_word_frequency = frequencies.table(
        self.uppercase_words,
        name=self._name + ".uppercase_word_frequency")
    """
    A frequency table of uppercase word tokens that are at least two
    characters long.
    """
    self.punctuations = self.tokens_in_types(
        {'period', 'qmark', 'epoint', 'comma', 'colon', 'scolon',
         'japan_punct'},
        name=self._name + ".punctuations")
    """
    A list of punctuation tokens
    """
    self.punctuation_frequency = frequencies.table(
        self.punctuations, name=self._name + ".punctuation_frequency")
    """
    A frequency table of punctuation tokens.
    """
    self.breaks = self.tokens_in_types(
        {'break'}, name=self._name + ".breaks")
    """
    A list of break tokens
    """
    self.break_frequency = frequencies.table(
        self.breaks, name=self._name + ".break_frequency")
    """
    A frequency table of break tokens.
    """
def test_missing_key():
    with raises(RuntimeError):
        my_dict = Datasource("my_dict")
        foobar = key(['foo', 'bar'], my_dict, if_missing=(RuntimeError))
        assert solve(foobar, cache={my_dict: {'bar': 1}}) is None
def test_key_exists():
    my_dict = Datasource("my_dict")
    foo_exists = key_exists('foo', my_dict)

    assert solve(foo_exists, cache={my_dict: {'foo': "bar"}}) is True
    assert solve(foo_exists, cache={my_dict: {'baz': "bar"}}) is False

    assert pickle.loads(pickle.dumps(foo_exists)) == foo_exists
from revscoring.datasources import Datasource

id = Datasource("page.id")
wikiproject_title = Datasource("page.wikiproject_title")
stats = Datasource("page.stats")
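# --- Hedged sketch of how these root datasources are typically consumed ---
# They carry no process function, so their values are expected to come from a
# solve() cache (or an extractor).  The values below are invented for
# illustration only.
from revscoring.dependencies import solve

_example_cache = {id: 12345,
                  wikiproject_title: "WikiProject Example",
                  stats: {"assessments": 3}}
assert solve(id, cache=_example_cache) == 12345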
def __init__(self, name, revision_datasources):
    super().__init__(name, revision_datasources)

    self.wikicode = Datasource(
        self._name + ".wikicode", _process_wikicode,
        depends_on=[revision_datasources.text])
    """
    A :class:`mwparserfromhell.wikicode.Wikicode` abstract syntax tree
    representing the structure of the page.
    """
    self.node_class_map = Datasource(
        self._name + ".node_class_map", _process_node_class_map,
        depends_on=[self.wikicode])
    """
    A map of mwparserfromhell.wikicode.<class> to lists of nodes of that
    type.
    """
    self.content = execute_method(
        "strip_code", self.wikicode, name=self._name + ".content")
    """
    The viewable content (no markup or templates) of the revision.
    """
    self.headings = get_key(
        mwparserfromhell.nodes.Heading, self.node_class_map,
        default=[], name=self._name + ".headings")
    """
    A list of :class:`mwparserfromhell.nodes.Heading`'s
    """
    self.heading_titles = mappers.map(
        _extract_heading_title, self.headings,
        name=self._name + ".heading_titles")
    """
    A list of heading titles
    """
    self.external_links = get_key(
        mwparserfromhell.nodes.ExternalLink, self.node_class_map,
        default=[], name=self._name + ".external_links")
    """
    A list of :class:`mwparserfromhell.nodes.ExternalLink`'s
    """
    self.external_link_urls = mappers.map(
        _extract_external_link_url, self.external_links,
        name=self._name + ".external_link_url")
    """
    A list of external link urls
    """
    self.wikilinks = get_key(
        mwparserfromhell.nodes.Wikilink, self.node_class_map,
        default=[], name=self._name + ".wikilinks")
    """
    A list of :class:`mwparserfromhell.nodes.Wikilink`'s
    """
    self.wikilink_titles = mappers.map(
        _extract_wikilink_title, self.wikilinks,
        name=self._name + ".wikilink_titles")
    """
    A list of string titles of internal links (aka "targets")
    """
    self.tags = get_key(
        mwparserfromhell.nodes.Tag, self.node_class_map,
        default=[], name=self._name + ".tags")
    """
    A list of :class:`mwparserfromhell.nodes.Tag`'s
    """
    self.tag_names = mappers.map(
        _extract_tag_name, self.tags, name=self._name + ".tag_names")
    """
    A list of HTML tag names present in the content of the revision
    """
    self.tags_str = mappers.map(
        str, self.tags, name=self._name + ".tags_str")
    """
    A list of tags present in the content of the revision, as strings
    """
    self.templates = get_key(
        mwparserfromhell.nodes.Template, self.node_class_map,
        default=[], name=self._name + ".templates")
    """
    A list of :class:`mwparserfromhell.nodes.Template`'s
    """
    self.template_names = mappers.map(
        _extract_template_name, self.templates,
        name=self._name + ".template_names")
    """
    A list of template names present in the content of the revision
    """
    self.templates_str = mappers.map(
        str, self.templates, name=self._name + ".templates_str")
    """
    A list of templates present in the content of the revision, as strings
    """
    self.sections = Datasource(
        self._name + ".section", _extract_sections,
        depends_on=[self.wikicode])
    """
    A list of sections
    """
def __init__(self, name, revision_datasources):
    super().__init__(name)
    self.revision_entity = revision_datasources.entity
    self.parent_entity = revision_datasources.parent.entity

    # sitelinks
    self.sitelinks_diff = Datasource(
        name + ".sitelinks_diff", diff_dicts,
        depends_on=[revision_datasources.parent.sitelinks,
                    revision_datasources.sitelinks])
    self.sitelinks_added, self.sitelinks_removed, self.sitelinks_changed = \
        diff_parts(name + ".sitelinks", self.sitelinks_diff)

    # labels
    self.labels_diff = Datasource(
        name + ".labels_diff", diff_dicts,
        depends_on=[revision_datasources.parent.labels,
                    revision_datasources.labels])
    self.labels_added, self.labels_removed, self.labels_changed = \
        diff_parts(name + ".labels", self.labels_diff)

    # aliases
    self.aliases_diff = Datasource(
        name + ".aliases_diff", diff_dicts,
        depends_on=[revision_datasources.parent.aliases,
                    revision_datasources.aliases])
    self.aliases_added, self.aliases_removed, self.aliases_changed = \
        diff_parts(name + ".aliases", self.aliases_diff)

    # descriptions
    self.descriptions_diff = Datasource(
        name + ".descriptions_diff", diff_dicts,
        depends_on=[revision_datasources.parent.descriptions,
                    revision_datasources.descriptions])
    (self.descriptions_added, self.descriptions_removed,
     self.descriptions_changed) = \
        diff_parts(name + ".descriptions", self.descriptions_diff)

    # properties
    self.properties_diff = Datasource(
        name + ".properties_diff", diff_dicts,
        depends_on=[revision_datasources.parent.properties,
                    revision_datasources.properties])
    (self.properties_added, self.properties_removed,
     self.properties_changed) = \
        diff_parts(name + ".properties", self.properties_diff)

    self.statements_added = Datasource(
        name + ".statements_added", _process_statements_added,
        depends_on=[self.properties_diff, self.parent_entity,
                    self.revision_entity])
    self.claims_added = Datasource(  # Backwards compatible
        name + ".claims_added", _identity,
        depends_on=[self.statements_added])
    self.statements_removed = Datasource(
        name + ".statements_removed", _process_statements_removed,
        depends_on=[self.properties_diff, self.parent_entity,
                    self.revision_entity])
    self.claims_removed = Datasource(  # Backwards compatible
        name + ".claims_removed", _identity,
        depends_on=[self.statements_removed])
    self.statements_changed = Datasource(
        name + ".statements_changed", _process_statements_changed,
        depends_on=[self.properties_diff, self.parent_entity,
                    self.revision_entity])
    self.claims_changed = Datasource(  # Backwards compatible
        name + ".claims_changed", _identity,
        depends_on=[self.statements_changed])

    self.sources_added = Datasource(
        name + ".sources_added", _process_sources_added,
        depends_on=[self.claims_changed])
    self.sources_removed = Datasource(
        name + ".sources_removed", _process_sources_removed,
        depends_on=[self.claims_changed])
    self.qualifiers_added = Datasource(
        name + ".qualifiers_added", _process_qualifiers_added,
        depends_on=[self.claims_changed])
    self.qualifiers_removed = Datasource(
        name + ".qualifiers_removed", _process_qualifiers_removed,
        depends_on=[self.claims_changed])

    # badges
    self.badges_diff = Datasource(
        name + ".badges_diff", diff_dicts,
        depends_on=[revision_datasources.parent.badges,
                    revision_datasources.badges])
    self.badges_added, self.badges_removed, self.badges_changed = \
        diff_parts(name + ".badges", self.badges_diff)
def __init__(self, name, revision_datasources):
    super().__init__(name)

    self.entity_doc = Datasource(
        name + ".entity_doc", _process_entity_doc,
        depends_on=[revision_datasources.text])
    """
    A JSONable `dict` of content for the Wikibase entity.
    """
    self.entity = Datasource(
        name + ".entity", _process_entity, depends_on=[self.entity_doc])
    """
    A `~mwbase.Entity` for the Wikibase content
    """
    self.sitelinks = Datasource(
        name + ".sitelinks", _process_sitelinks, depends_on=[self.entity])
    """
    A `dict` of wiki/sitelink pairs in the revision
    """
    self.labels = Datasource(
        name + ".labels", _process_labels, depends_on=[self.entity])
    """
    A `dict` of lang/label pairs in the revision
    """
    self.aliases = Datasource(
        name + ".aliases", _process_aliases, depends_on=[self.entity])
    """
    A `set` of unique aliases in the revision
    """
    self.descriptions = Datasource(
        name + ".descriptions", _process_descriptions,
        depends_on=[self.entity])
    """
    A `dict` of lang/description pairs in the revision
    """
    self.properties = Datasource(
        name + ".properties", _process_properties, depends_on=[self.entity])
    """
    A `set` of properties in the revision
    """
    self.claims = Datasource(
        name + ".claim", _process_claims, depends_on=[self.entity])
    """
    A `set` of unique claims in the revision
    """
    self.sources = Datasource(
        name + ".sources", _process_sources, depends_on=[self.entity])
    """
    A `set` of unique sources in the revision
    """
    self.reference_claims = Datasource(
        name + ".reference_claims", _process_ref_claims,
        depends_on=[self.entity])
    """
    A `set` of unique reference claims in the revision
    """
    self.qualifiers = Datasource(
        name + ".qualifiers", _process_qualifiers, depends_on=[self.entity])
    """
    A `set` of unique qualifiers in the revision
    """
    self.badges = Datasource(
        name + ".badges", _process_badges, depends_on=[self.entity])
    """
    A `set` of unique badges in the revision
    """

    if hasattr(revision_datasources, "parent") and \
       hasattr(revision_datasources.parent, "text"):
        self.parent = Revision(name + ".parent",
                               revision_datasources.parent)

    if hasattr(revision_datasources, "diff"):
        self.diff = Diff(name + ".diff", self)
def test_scoring_context():
    from revscoring.datasources import Datasource
    from revscoring.dependencies import Dependent
    from revscoring.features import Feature

    fake_data = Datasource("fake_data", lambda: "fake")
    len_func = Dependent("len_func")
    literal_fake = Dependent("literal_fake")

    characters = Feature("characters", lambda word, len: len(word),
                         returns=int,
                         depends_on=[fake_data, len_func])
    is_fake = Feature("is_fake", lambda word, fake: word == fake,
                      returns=bool,
                      depends_on=[fake_data, literal_fake])

    FakeExtractor = namedtuple("Extractor", ['extract', 'solve', 'language'])

    def fake_extract(rev_ids, dependents, caches=None):
        caches = caches if caches is not None else {}
        for rev_id in rev_ids:
            if rev_id % 5 != 0:
                cache = caches.get(rev_id, {})
                values = dependencies.solve(dependents,
                                            context={len_func: lambda: len},
                                            cache=cache)
                values = list(values)
                caches[rev_id] = cache
                yield None, values
            else:
                yield RuntimeError("extract"), None

    def fake_solve(dependents, cache=None):
        cache = cache if cache is not None else {}
        cache.update({len_func: len, literal_fake: "fake"})
        return dependencies.solve(dependents, cache=cache)

    extractor = FakeExtractor(fake_extract, fake_solve, None)

    FakeScorerModel = namedtuple("FakeScorerModel",
                                 ['score', 'version', 'language', 'features'])
    scorer_model = FakeScorerModel(lambda fvs: {"prediction": "generated"},
                                   "1", None, [characters, is_fake])

    scoring_context = ScoringContext("fakewiki", {"fake": scorer_model},
                                     extractor)

    rev_ids = [1, 2, 3, 4, 5]
    root_ds_caches, errors = scoring_context.extract_root_dependency_caches(
        ["fake"], rev_ids)
    print(root_ds_caches)
    print(errors)
    assert len(root_ds_caches) == 4
    assert len(errors) == 1
    assert root_ds_caches[1][fake_data] == "fake"
    assert 5 in errors

    score = scoring_context.process_model_scores(["fake"], {characters: 10,
                                                            is_fake: False})
    assert score['fake']['score']['prediction'] == "generated"
def __init__(self, *args, **kwargs):
    super().__init__(*args, **kwargs)

    self.operations = Datasource(
        self._name + ".operations", _process_operations,
        depends_on=[
            self.revision.parent.paragraphs_sentences_and_whitespace,
            self.revision.paragraphs_sentences_and_whitespace,
            self.revision.parent.tokens,
            self.revision.tokens
        ])
    """
    Returns a tuple that describes the difference between the parent
    revision text and the current revision's text.

    The tuple contains three fields:

    * operations: `list` of :class:`deltas.Operation`
    * A tokens: `list` of `str`
    * B tokens: `list` of `str`
    """
    self.segments_added = Datasource(
        self._name + ".segments_added", _process_segments_added,
        depends_on=[self.operations])
    """
    Returns a list of all contiguous segments of tokens added in this
    revision.
    """
    self.segments_removed = Datasource(
        self._name + ".segments_removed", _process_segments_removed,
        depends_on=[self.operations])
    """
    Returns a list of all contiguous segments of tokens removed in this
    revision.
    """
    self.tokens_added = Datasource(
        self._name + ".tokens_added", _process_tokens_added,
        depends_on=[self.operations])
    """
    Constructs a :class:`revscoring.Datasource` that returns a list of all
    tokens added in this revision.
    """
    self.tokens_removed = Datasource(
        self._name + ".tokens_removed", _process_tokens_removed,
        depends_on=[self.operations])
    """
    Constructs a :class:`revscoring.Datasource` that returns a list of all
    tokens removed in this revision.
    """
    self.numbers_added = self.tokens_added_in_types(
        {'number'}, name=self._name + ".numbers_added")
    """
    A list of numeric tokens added in the edit
    """
    self.numbers_removed = self.tokens_removed_in_types(
        {'number'}, name=self._name + ".numbers_removed")
    """
    A list of numeric tokens removed in the edit
    """
    self.whitespaces_added = self.tokens_added_in_types(
        {'whitespace'}, name=self._name + ".whitespaces_added")
    """
    A list of whitespace tokens added in the edit
    """
    self.whitespaces_removed = self.tokens_removed_in_types(
        {'whitespace'}, name=self._name + ".whitespaces_removed")
    """
    A list of whitespace tokens removed in the edit
    """
    self.markups_added = self.tokens_added_in_types(
        {'dbrack_open', 'dbrack_close', 'brack_open', 'brack_close',
         'tab_open', 'tab_close', 'dcurly_open', 'dcurly_close',
         'curly_open', 'curly_close', 'bold', 'italics', 'equals'},
        name=self._name + ".markups_added")
    """
    A list of markup tokens added in the edit
    """
    self.markups_removed = self.tokens_removed_in_types(
        {'dbrack_open', 'dbrack_close', 'brack_open', 'brack_close',
         'tab_open', 'tab_close', 'dcurly_open', 'dcurly_close',
         'curly_open', 'curly_close', 'bold', 'italics', 'equals'},
        name=self._name + ".markups_removed")
    """
    A list of markup tokens removed in the edit
    """
    self.cjks_added = self.tokens_added_in_types(
        {'cjk'}, name=self._name + ".cjks_added")
    """
    A list of Chinese/Japanese/Korean tokens added in the edit
    """
    self.cjks_removed = self.tokens_removed_in_types(
        {'cjk'}, name=self._name + ".cjks_removed")
    """
    A list of Chinese/Japanese/Korean tokens removed in the edit
    """
    self.entities_added = self.tokens_added_in_types(
        {'entity'}, name=self._name + ".entities_added")
    """
    A list of HTML entity tokens added in the edit
    """
    self.entities_removed = self.tokens_removed_in_types(
        {'entity'}, name=self._name + ".entities_removed")
    """
    A list of HTML entity tokens removed in the edit
    """
    self.urls_added = self.tokens_added_in_types(
        {'url'}, name=self._name + ".urls_added")
    """
    A list of URL tokens added in the edit
    """
    self.urls_removed = self.tokens_removed_in_types(
        {'url'}, name=self._name + ".urls_removed")
    """
    A list of URL tokens removed in the edit
    """
    self.words_added = self.tokens_added_in_types(
        {'word'}, name=self._name + ".words_added")
    """
    A list of word tokens added in the edit
    """
    self.words_removed = self.tokens_removed_in_types(
        {'word'}, name=self._name + ".words_removed")
    """
    A list of word tokens removed in the edit
    """
    self.uppercase_words_added = filters.filter(
        is_uppercase_word, self.words_added,
        name=self._name + ".uppercase_words_added")
    """
    A list of fully UPPERCASE word tokens added in the edit
    """
    self.uppercase_words_removed = filters.filter(
        is_uppercase_word, self.words_removed,
        name=self._name + ".uppercase_words_removed")
    """
    A list of fully UPPERCASE word tokens removed in the edit
    """
    self.punctuations_added = self.tokens_added_in_types(
        {'period', 'qmark', 'epoint', 'comma', 'colon', 'scolon',
         'japan_punct'},
        name=self._name + ".punctuations_added")
    """
    A list of punctuation tokens added in the edit
    """
    self.punctuations_removed = self.tokens_removed_in_types(
        {'period', 'qmark', 'epoint', 'comma', 'colon', 'scolon',
         'japan_punct'},
        name=self._name + ".punctuations_removed")
    """
    A list of punctuation tokens removed in the edit
    """
    self.breaks_added = self.tokens_added_in_types(
        {'break'}, name=self._name + ".breaks_added")
    """
    A list of break tokens added in the edit
    """
    self.breaks_removed = self.tokens_removed_in_types(
        {'break'}, name=self._name + ".breaks_removed")
    """
    A list of break tokens removed in the edit
    """
import pickle

from revscoring.datasources import Datasource
from revscoring.dependencies import solve
from revscoring.features.meta import vectorizers

my_dict = Datasource("my_dict")


class KeysDict(Datasource):
    def __init__(self, name, keys):
        super().__init__(name)
        self._keys = keys

    def keys(self):
        return self._keys


my_keys_dict = KeysDict("my_keys_dict", ["a", "b", "c"])


def test_vectorize():
    my_vector = vectorizers.vectorize(my_dict, ["a", "b", "c"], returns=int)

    assert solve(my_vector, cache={my_dict: {"a": 5}}) == [5, 0, 0]
    assert solve(my_vector, cache={my_dict: {"d": 5}}) == [0, 0, 0]
    assert solve(my_vector,
                 cache={my_dict: {"a": 1, "b": 2, "c": 3}}) == [1, 2, 3]
def get_polarity_score(non_stop_tokens):
    """
    Gets the positive and negative polarity of the document using
    SentiWordNet.  Takes the most common sense of each word for efficiency.
    """
    pos, neg = 0.0, 0.0
    for t in non_stop_tokens:
        synsets = list(swn.senti_synsets(t))
        if synsets:
            pos += synsets[0].pos_score()
            neg += synsets[0].neg_score()
    return [pos, neg]


sentiment_score = Datasource(
    "english.sentiment.revision.polarity_score",
    get_polarity_score,
    depends_on=[english.stopwords.revision.datasources.non_stopwords])


def get_positive_score(senti_score):
    return senti_score[0]


def get_negative_score(senti_score):
    return senti_score[1]


positive_polarity = Feature(
    "english.sentiment.revision.positive_polarity",
    get_positive_score,
    depends_on=[sentiment_score],