import pickle

from revscoring.datasources.datasource import Datasource
from revscoring.dependencies import solve


def test_datasource():
    d = Datasource("d")

    assert pickle.loads(pickle.dumps(d)) == d

    assert solve(d, cache={d: "foo"}) == "foo"
    assert solve(d, cache={"datasource.d": "foo"}) == "foo"

    assert str(d) == "datasource.d"
    assert repr(d) == "<datasource.d>"
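# --- Illustrative sketch (not part of the original test file) ---
# The test above only exercises a bare, cache-fed Datasource.  The
# constructor also accepts a process function and a depends_on list (the
# same pattern the wikibase modules further down use); a minimal, hedged
# sketch of how solve() feeds resolved dependency values into that process
# function, reusing the imports above:
words = Datasource("words")
word_count = Datasource("word_count", len, depends_on=[words])


def test_derived_datasource_sketch():
    # solve() looks up `words` in the cache and passes its value to len().
    assert solve(word_count, cache={words: ["foo", "bar"]}) == 2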
import pickle

from revscoring.datasources.datasource import Datasource
from revscoring.datasources.meta import hashing
from revscoring.dependencies import solve

my_tokens = Datasource("my_tokens")
my_hashes = hashing.hash(my_tokens, n=10)


def test_hashing():
    hashes = solve(
        my_hashes,
        cache={my_tokens: [("one", "two"), "two", "three", "four"]})
    assert len(hashes) == 4
    assert max(hashes) <= 10, str(max(hashes))

    hashes_again = solve(
        my_hashes,
        cache={my_tokens: [("one", "two"), "two", "three", "four"]})
    assert hashes == hashes_again

    assert (pickle.loads(pickle.dumps(my_hashes)) == my_hashes)
import pickle

from revscoring.datasources.datasource import Datasource
from revscoring.datasources.meta import mappers
from revscoring.dependencies import solve

tokens = Datasource("tokens")
my_ints = Datasource("my_ints")


def extract_first_char(token):
    return token[:1]


first_char = mappers.map(extract_first_char, tokens, name="first_char")
lower_case_tokens = mappers.lower_case(tokens, name="lower_case_tokens")
derepeat_tokens = mappers.derepeat(tokens, name="derepeat_tokens")
de1337_tokens = mappers.de1337(tokens, name="de1337_tokens")
abs_ints = mappers.abs(my_ints)


def test_item_mapper():
    cache = {tokens: ["alpha", "bravo", "charlie", "delta"]}
    assert (solve(first_char, cache=cache) ==
            ["a", "b", "c", "d"])

    assert pickle.loads(pickle.dumps(first_char)) == first_char
"""
Mapping of English descriptions to item identifiers
"""
HUMAN = 'Q5'


def _process_source_claims(item):
    return [
        source_claim
        for pid, claims in item.claims.items()
        for claim in claims
        for source in claim.sources
        for source_pid, source_claims in source.items()
        for source_claim in source_claims
    ]


source_claims = Datasource(
    name + ".revision.source_claims", _process_source_claims,
    depends_on=[wikibase_.revision.datasources.item])


def _process_wikimedia_sources(source_claims):
    return [
        source_claim
        for source_claim in source_claims
        if isinstance(source_claim.target, pywikibase.ItemPage) and
        source_claim.target.id in wikimedia.PROJECT_QIDS
    ]


wikimedia_sources = Datasource(
    name + ".revision.wikimedia_sources", _process_wikimedia_sources,
    depends_on=[source_claims])
import pickle

from revscoring.datasources.datasource import Datasource
from revscoring.datasources.meta import extractors
from revscoring.dependencies import solve


def return_foo():
    return "foo"


segments = Datasource("segments")
text = Datasource("text")

text_extractor = extractors.regex(["foo bar", "bar foo"], text,
                                  name="text_extractor")
exclusion_text_extractor = extractors.regex(["foo+"], text,
                                            name="text_extractor",
                                            exclusions=['foooo'])
segment_extractor = extractors.regex(["foo bar", "bar foo"], segments,
                                     name="text_extractor")


def test_text_extractor():
    cache = {text: "This is some text foo bar nope bar foo"}
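    # Hedged continuation (assumption -- the original assertions are cut off
    # above): both patterns occur once in the cached text, so the extractor
    # is presumably expected to return those two matches.
    assert set(solve(text_extractor, cache=cache)) == {"foo bar", "bar foo"}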
import pickle

from revscoring.datasources.datasource import Datasource
from revscoring.datasources.meta import dicts
from revscoring.dependencies import solve

my_dict = Datasource("my_dict")
my_keys = dicts.keys(my_dict)
my_values = dicts.values(my_dict)


def test_dict_keys():
    cache = {my_dict: {"foo": 1, "bar": 2}}
    assert set(solve(my_keys, cache=cache)) == {"foo", "bar"}

    cache = {my_dict: None}
    assert set(solve(my_keys, cache=cache)) == set()

    assert pickle.loads(pickle.dumps(my_keys)) == my_keys


def test_dict_values():
    cache = {my_dict: {"foo": 1, "bar": 2}}
    assert set(solve(my_values, cache=cache)) == {1, 2}

    cache = {my_dict: None}
    assert set(solve(my_values, cache=cache)) == set()

    assert pickle.loads(pickle.dumps(my_values)) == my_values
""" Mapping of english descriptions to item idenifiers """ HUMAN = 'Q5' def _process_references(entity): return [reference for pid, statements in entity.properties.items() for statement in statements for pid, references in statement.references.items() for reference in references] references = Datasource( name + ".revision.references", _process_references, depends_on=[wikibase_.revision.datasources.entity]) def _process_wikimedia_references(references): return [reference for reference in references if (reference.datatype == 'wikibase-entityid' and reference.datavalue.id in wikimedia.PROJECT_QIDS)] wikimedia_references = Datasource( name + ".revision.wikimedia_references", _process_wikimedia_references, depends_on=[references])
import pickle

from revscoring.datasources.datasource import Datasource
from revscoring.datasources.meta import frequencies
from revscoring.dependencies import solve

old_tokens = Datasource("old_tokens")
new_tokens = Datasource("new_tokens")

old_ft = frequencies.table(old_tokens, name="old_ft")
new_ft = frequencies.table(new_tokens, name="new_ft")
delta = frequencies.delta(old_ft, new_ft, name="delta")
pos_delta = frequencies.positive(delta, name="pos_delta")
neg_delta = frequencies.negative(delta, name="neg_delta")
neg_abs_delta = frequencies.negative(
    delta, absolute=True, name="neg_abs_delta")
prop_delta = frequencies.prop_delta(old_ft, delta, name="prop_delta")


def test_table():
    cache = {new_tokens: ["a"] * 3 + ["b"] * 2 + ["c"] * 45}
    assert (solve(new_ft, cache=cache) ==
            {'a': 3, 'b': 2, 'c': 45})

    assert (pickle.loads(pickle.dumps(new_ft)) == new_ft)


def test_delta():
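    # Hedged sketch of the truncated body (assumption -- not the original
    # assertions): delta is presumed to report the per-token change in
    # counts between old_ft and new_ft, with removed tokens going negative.
    cache = {old_tokens: ["a", "b", "b"],
             new_tokens: ["a", "a", "c"]}
    assert solve(delta, cache=cache) == {'a': 1, 'b': -2, 'c': 1}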
        result_set.add(label)
    return len(result_set) / 8


def _process_important_translations_descriptions(item_descriptions):
    result_set = set()
    for description in (item_descriptions.keys() & IMPORTANT_LANG_CODES):
        result_set.add(description)
    return len(result_set) / 8


item_doc = Datasource(name + ".item_doc", _process_item_doc,
                      depends_on=[revision_oriented.revision.text])
"""A JSONable `dict` of content for a Wikibase content."""

item = Datasource(name + ".item", _process_item, depends_on=[item_doc])
"""A `~pywikibase.Item` for the Wikibase content"""

item_labels_datasource = Datasource(name + ".labels", _process_labels,
                                    depends_on=[item])
item_descriptions_datasource = Datasource(name + ".descriptions",
                                          _process_descriptions,
                                          depends_on=[item])

complete_translations = Feature(
    name + ".complete_translations", _process_complete_translations,