def test_max():
    """Check that a `max` modifier solves, survives pickling and reprs."""
    feature = modifiers.max(5, 6, 7)
    assert solve(feature) == 7

    # A pickle round trip must yield a feature that solves to the same value.
    restored = pickle.loads(pickle.dumps(feature))
    assert solve(restored) == 7

    expected_repr = "<feature.max(5, 6, 7)>"
    assert repr(feature) == expected_repr
def test_trim():
    """Exercise `trim` on one feature, a list, and a nested modifier."""
    datasource = Datasource("derp1")
    plain = Feature("foobar1", returns=int)
    dependent = Feature("foobar2", returns=int, depends_on=[datasource])
    constant = Constant(value=5)
    vector = FeatureVector("foobar3", returns=int, depends_on=[constant])

    # A single feature trims to itself.
    assert list(trim(plain)) == [plain]

    # A list of features comes back unchanged and in order.
    everything = [plain, dependent, vector]
    assert list(trim(everything)) == [plain, dependent, vector]

    # A modifier expression trims down to the features it is built from.
    nested = log(max(plain - dependent, 1))
    assert list(trim(nested)) == [plain, dependent]
""" Turkish Wikipedia +++++++++++++++++ """ from revscoring.features import wikitext from revscoring.features.modifiers import log, max, sub from . import wikipedia cite_templates = wikitext.revision.template_names_matching( r"Kaynak|.*[ _]kaynağı", name="trwiki.revision.cite_templates") proportion_of_templated_references = \ cite_templates / max(wikitext.revision.ref_tags, 1) non_templated_references = max(wikitext.revision.ref_tags - cite_templates, 0) non_cite_templates = sub( wikitext.revision.templates, cite_templates, name="trwiki.revision.non_cite_templates" ) infobox_templates = wikitext.revision.template_names_matching( r".*[ _]bilgi[ _]kutusu", name="trwiki.revision.infobox_templates") # Copied (2015-10-29) from: # https://fr.wikipedia.org/wiki/Wikip%C3%A9dia:Citez_vos_sources#R.C3.A9clamation_et_contestation_de_sources cn_templates = wikitext.revision.template_names_matching( r"Kaynak[ _]belirt|Olgu|Fact|Delil", name="trwiki.revision.lvl1_cn_templates") main_article_templates = wikitext.revision.template_names_matching(
# Copied (2015-10-29) from: # https://fr.wikipedia.org/wiki/Cat%C3%A9gorie:Mod%C3%A8le_pour_bibliographie CITE_TEMPLATES = [ r"Article", r"Chapitre", r"Jugement", r"Lien[ _]web", r"Loi", r"Ouvrage" ] cite_templates = wikitext.revision.template_names_matching( "|".join(CITE_TEMPLATES), name="frwiki.revision.cite_templates") proportion_of_templated_references = \ cite_templates / max(wikitext.revision.ref_tags, 1) non_templated_references = max(wikitext.revision.ref_tags - cite_templates, 0) non_cite_templates = sub( wikitext.revision.templates, cite_templates, name="frwiki.revision.non_cite_templates" ) infobox_templates = wikitext.revision.template_names_matching( r"^infobox", name="frwiki.revision.infobox_templates") # Copied (2015-10-29) from: # https://fr.wikipedia.org/wiki/Wikip%C3%A9dia:Citez_vos_sources#R.C3.A9clamation_et_contestation_de_sources LVL1_CN_TEMPLATES = [r"Référence[ _]souhaitée", r"Citation[ _]nécessaire", r"Référence[ _]à[ _]confirmer", r"Référence[ _]nécessaire", r"Inédit"] lvl1_cn_templates = wikitext.revision.template_names_matching(
r"Harvard citation text", r"harvtxt", r"Harvcoltxt", r"Harvcol", r"Harvcolnb", r"Harvard citations", r"harvs", r"Harvp", r"Citation" ] cite_templates = wikitext.revision.template_names_matching( "|".join(CITE_TEMPLATES), name="ukwiki.revision.cite_templates") shortened_footnote_templates = wikitext.revision.template_names_matching( "sfn", name="ukwiki.revision.shortened_footnote_templates") all_ref_tags = shortened_footnote_templates + wikitext.revision.ref_tags all_cite_templates = cite_templates + shortened_footnote_templates proportion_of_templates_references = \ all_cite_templates / max(all_ref_tags, 1) non_templated_references = max(all_ref_tags - all_cite_templates, 0) non_cite_templates = sub( wikitext.revision.templates, all_cite_templates, name="ukwiki.revision.non_cite_templates" ) # Links CATEGORY_LINKS = [ r"Категорія", r"Category", r"Категория" ] category_links = wikitext.revision.wikilink_titles_matching( "|".join(CATEGORY_LINKS), name="ukwiki.revision.category_links")
"glwiki.revision.revision", wikitext.revision.datasources, ) paragraphs = mappers.map( str, revision.paragraphs_sentences_and_whitespace, name="glwiki.revision.paragraphs" ) paragraphs_without_refs = filters.regex_matching( r"^(?!\s*$)((?!<ref>)(.|\n))*$", paragraphs, name="glwiki.revision.paragraphs_without_refs" ) paragraphs_without_refs_total_length = aggregators.sum( mappers.map(len, paragraphs_without_refs), name="glwiki.revision.paragraphs_without_refs_total_length" ) local_wiki = [ image_links, image_links / max(wikitext.revision.content_chars, 1), category_links, category_links / max(wikitext.revision.content_chars, 1), cn_templates + 1, cn_templates / max(wikitext.revision.content_chars, 1), log(paragraphs_without_refs_total_length + 1), paragraphs_without_refs_total_length / max(wikitext.revision.content_chars, 1), ] wp10 = wikipedia.article + local_wiki
else: return 1 parent = [ log(wikitext.revision.parent.chars + 1), log(wikitext.revision.parent.tokens + 1), log(wikitext.revision.parent.words + 1), log(wikitext.revision.parent.uppercase_words + 1), log(wikitext.revision.parent.headings + 1), log(wikitext.revision.parent.wikilinks + 1), log(wikitext.revision.parent.external_links + 1), log(wikitext.revision.parent.templates + 1), log(wikitext.revision.parent.ref_tags + 1), div(wikitext.revision.parent.chars, max(wikitext.revision.parent.words, 1), name="revision.parent.chars_per_word"), div(wikitext.revision.parent.words, max(wikitext.revision.parent.tokens, 1), name="revision.parent.words_per_token"), div(wikitext.revision.parent.uppercase_words, max(wikitext.revision.parent.words, 1), name="revision.parent.uppercase_words_per_word"), div(wikitext.revision.parent.markups, max(wikitext.revision.parent.tokens, 1), name="revision.parent.markups_per_token"), ] diff = [ wikitext.revision.diff.markup_delta_sum, wikitext.revision.diff.markup_delta_increase,
wikitext.revision.datasources, ) paragraphs = mappers.map(str, revision.paragraphs_sentences_and_whitespace, name="euwiki.revision.paragraphs") paragraphs_without_refs = filters.regex_matching( r"^(?!\s*$)((?!<ref>)(.|\n))*$", paragraphs, name="euwiki.revision.paragraphs_without_refs") paragraphs_without_refs_total_length = aggregators.sum( mappers.map(len, paragraphs_without_refs), name="euwiki.revision.paragraphs_without_refs_total_length") local_wiki = [ image_links, image_links / max(wikitext.revision.content_chars, 1), # category_links, # category_links / max(wikitext.revision.content_chars, 1), infobox_templates, cn_templates + 1, cn_templates / max(wikitext.revision.content_chars, 1), log(paragraphs_without_refs_total_length + 1), basque.dictionary.revision.dict_words, basque.dictionary.revision.dict_words / max(wikitext.revision.words, 1), english.dictionary.revision.dict_words, english.dictionary.revision.dict_words / max(wikitext.revision.words, 1), spanish.dictionary.revision.dict_words, spanish.dictionary.revision.dict_words / max(wikitext.revision.words, 1), ] wp10 = wikipedia.article + local_wiki
from revscoring.features import diff, page, parent_revision, revision, user from revscoring.features.modifiers import log, max from revscoring.languages import portuguese from . import enwiki proportion_of_badwords_added = portuguese.diff.badwords_added / \ max(portuguese.diff.words_added, 1) proportion_of_badwords_removed = portuguese.diff.badwords_added / \ max(portuguese.diff.words_added, 1) proportion_of_misspellings_added = portuguese.diff.misspellings_added / \ max(portuguese.diff.words_added, 1) proportion_of_misspellings_removed = portuguese.diff.misspellings_added / \ max(portuguese.diff.words_added, 1) proportion_of_informals_added = portuguese.diff.informals_added / \ max(portuguese.diff.words_added, 1) proportion_of_informals_removed = portuguese.diff.informals_added / \ max(portuguese.diff.words_added, 1) proportion_of_badwords = portuguese.parent_revision.badwords / \ max(portuguese.parent_revision.words, 1) proportion_of_misspellings = portuguese.parent_revision.misspellings / \ max(portuguese.parent_revision.words, 1) proportion_of_informals = portuguese.parent_revision.informals / \ max(portuguese.parent_revision.words, 1) added_badwords_ratio = proportion_of_badwords_added / \ max(proportion_of_badwords, 0.01) added_misspellings_ratio = proportion_of_misspellings_added / \ max(proportion_of_misspellings, 0.01) added_informals_ratio = proportion_of_informals_added / \
"""
French Wikisource
+++++++++++++++++
"""
from revscoring.features import wikitext
from revscoring.features.modifiers import max
from revscoring.languages import french

from . import wikisource

# Short aliases for the feature namespaces used below.
_text = wikitext.revision
_stems = french.stemmed.revision
_dictionary = french.dictionary.revision

# French-specific features: raw counts plus ratios whose denominators are
# floored at 1.
local_wiki = [
    _text.chars,
    _stems.stem_chars,
    _stems.stem_chars / max(_text.chars, 1),
    _dictionary.dict_words / max(_text.words, 1),
    _dictionary.dict_words / max(_dictionary.non_dict_words, 1),
]

# Language-specific features come first, then the shared Wikisource ones.
pagelevel = local_wiki + wikisource.page
unique_sources_count = aggregators.len(unique_sources)
"`int` : A count of unique sources in the revision"


def _per_source_claim(count):
    """Divide `count` by `source_claims_count`, floored at 1."""
    return count / modifiers.max(source_claims_count, 1)


# Status
# NOTE(review): `is_human` is named relative to `name`, while the two
# features after it use fixed 'revision.*' names -- confirm this is intended.
is_human = wikibase_.revision.has_property_value(
    properties.INSTANCE_OF,
    items.HUMAN,
    name=name + '.is_human',
)
has_birthday = wikibase_.revision.has_property(
    properties.DATE_OF_BIRTH,
    name='revision.has_birthday',
)
dead = wikibase_.revision.has_property(
    properties.DATE_OF_DEATH,
    name='revision.dead',
)
# Has a date of birth and does not have a date of death.
is_blp = has_birthday.and_(not_(dead))

local_wiki = [
    is_human,
    is_blp,
    aggregators.len(complete_translations),
    aggregators.len(important_label_translations),
    aggregators.len(important_description_translations),
    aggregators.len(important_complete_translations),
    source_claims_count,
    wikimedia_sources_count,
    _per_source_claim(wikimedia_sources_count),
    external_sources_count,
    _per_source_claim(external_sources_count),
    unique_sources_count,
    _per_source_claim(unique_sources_count),
]

# Shared Wikibase item features first, then the features defined above.
item_quality = wikibase.item + local_wiki
return r_longest else: return 1 parent = [ log(wikitext.revision.parent.chars + 1), log(wikitext.revision.parent.tokens + 1), log(wikitext.revision.parent.words + 1), log(wikitext.revision.parent.uppercase_words + 1), log(wikitext.revision.parent.headings + 1), log(wikitext.revision.parent.wikilinks + 1), log(wikitext.revision.parent.external_links + 1), log(wikitext.revision.parent.templates + 1), log(wikitext.revision.parent.ref_tags + 1), div(wikitext.revision.parent.chars, max(wikitext.revision.parent.words, 1), name="revision.parent.chars_per_word"), div(wikitext.revision.parent.words, max(wikitext.revision.parent.tokens, 1), name="revision.parent.words_per_token"), div(wikitext.revision.parent.uppercase_words, max(wikitext.revision.parent.words, 1), name="revision.parent.uppercase_words_per_word"), div(wikitext.revision.parent.markups, max(wikitext.revision.parent.tokens, 1), name="revision.parent.markups_per_token"), ] diff = [ wikitext.revision.diff.markup_delta_sum, wikitext.revision.diff.markup_delta_increase,
] cite_templates = wikitext.revision.template_names_matching( "|".join(CITE_TEMPLATES), name="enwiki.revision.cite_templates") SFN_TEMPLATES = [ r"Shortened footnote template", r"sfn", r"Sfnp", r"Sfnm", r"Sfnmp" ] shortened_footnote_templates = wikitext.revision.template_names_matching( "|".join(SFN_TEMPLATES), name="enwiki.revision.shortened_footnote_templates") all_ref_tags = shortened_footnote_templates + wikitext.revision.ref_tags all_cite_templates = cite_templates + shortened_footnote_templates proportion_of_templated_references = \ all_cite_templates / max(all_ref_tags, 1) non_templated_references = max(all_ref_tags - all_cite_templates, 0) non_cite_templates = sub( wikitext.revision.templates, all_cite_templates, name="enwiki.revision.non_cite_templates" ) # Links category_links = wikitext.revision.wikilink_titles_matching( r"Category\:", name="enwiki.revision.category_links") image_links = wikitext.revision.wikilink_titles_matching( r"File|Image\:", name="enwiki.revision.image_links") # References revision = Revision( "enwiki.revision.revision",
revision_oriented_datasources.revision.page.suggested.properties]) # Status is_human = wikibase_.revision.has_property_value( properties.INSTANCE_OF, items.HUMAN, name=name + '.revision.is_human') has_birthday = wikibase_.revision.has_property( properties.DATE_OF_BIRTH, name=name + '.revision.has_birthday') dead = wikibase_.revision.has_property( properties.DATE_OF_DEATH, name=name + '.revision.dead') is_blp = has_birthday.and_(not_(dead)) local_wiki = [ is_human, is_blp, aggregators.len(complete_translations), aggregators.len(important_label_translations), aggregators.len(important_description_translations), aggregators.len(important_complete_translations), references_count, wikimedia_references_count, wikimedia_references_count / modifiers.max(references_count, 1), external_references_count, external_references_count / modifiers.max(references_count, 1), unique_references_count, unique_references_count / modifiers.max(references_count, 1), item_completeness ] item_quality = wikibase.item + local_wiki
r"Harvard citation", r"harv", r"Harvard citation text", r"harvtxt", r"Harvcoltxt", r"Harvcol", r"Harvcolnb", r"Harvard citations", r"harvs", r"Harvp" ] cite_templates = wikitext.revision.template_names_matching( "|".join(CITE_TEMPLATES), name="enwiki.revision.cite_templates") SFN_TEMPLATES = [ r"Shortened footnote template", r"sfn", r"Sfnp", r"Sfnm", r"Sfnmp" ] shortened_footnote_templates = wikitext.revision.template_names_matching( "|".join(SFN_TEMPLATES), name="enwiki.revision.shortened_footnote_templates") all_ref_tags = shortened_footnote_templates + wikitext.revision.ref_tags all_cite_templates = cite_templates + shortened_footnote_templates proportion_of_templated_references = \ all_cite_templates / max(all_ref_tags, 1) non_templated_references = max(all_ref_tags - all_cite_templates, 0) non_cite_templates = sub(wikitext.revision.templates, all_cite_templates, name="enwiki.revision.non_cite_templates") # Links category_links = wikitext.revision.wikilink_titles_matching( r"Category\:", name="enwiki.revision.category_links") image_links = wikitext.revision.wikilink_titles_matching( r"File|Image\:", name="enwiki.revision.image_links") image_templates = wikitext.revision.template_names_matching( r"((Wide|Tall|scalable) image)|Panorama|Panorama 2", name='enwiki.revision.image_template')
from revscoring.features.modifiers import log, max from revscoring.languages import spanish from . import enwiki, util proportion_of_badwords_added = spanish.diff.badwords_added / max(spanish.diff.words_added, 1) proportion_of_badwords_removed = spanish.diff.badwords_removed / max(spanish.diff.words_removed, 1) proportion_of_misspellings_added = spanish.diff.misspellings_added / max(spanish.diff.words_added, 1) proportion_of_misspellings_removed = spanish.diff.misspellings_removed / max(spanish.diff.words_removed, 1) proportion_of_informals_added = spanish.diff.informals_added / max(spanish.diff.words_added, 1) proportion_of_informals_removed = spanish.diff.informals_removed / max(spanish.diff.words_removed, 1) proportion_of_badwords = spanish.parent_revision.badwords / max(spanish.parent_revision.words, 1) proportion_of_misspellings = spanish.parent_revision.misspellings / max(spanish.parent_revision.words, 1) proportion_of_informals = spanish.parent_revision.informals / max(spanish.parent_revision.words, 1) added_badwords_ratio = proportion_of_badwords_added / max(proportion_of_badwords, 0.01) added_misspellings_ratio = proportion_of_misspellings_added / max(proportion_of_misspellings, 0.01) added_informals_ratio = proportion_of_informals_added / max(proportion_of_informals, 0.01) damaging = ( util.no_lang_damaging + enwiki.badwords + enwiki.informals + [ log(spanish.diff.badwords_added + 1), log(spanish.diff.badwords_removed + 1), log(spanish.diff.informals_added + 1), log(spanish.diff.informals_removed + 1), log(spanish.diff.misspellings_added + 1), log(spanish.diff.misspellings_removed + 1),
from revscoring.features import wikitext, modifiers


def _per_content_char(feature):
    """Divide `feature` by the content character count, floored at 1."""
    return feature / modifiers.max(wikitext.revision.content_chars, 1)


# Each count below (apart from the two character counts) is followed by the
# same count divided by the number of content characters.
article = [
    wikitext.revision.chars,
    wikitext.revision.content_chars,
    wikitext.revision.ref_tags,
    _per_content_char(wikitext.revision.ref_tags),
    wikitext.revision.wikilinks,
    _per_content_char(wikitext.revision.wikilinks),
    wikitext.revision.external_links,
    _per_content_char(wikitext.revision.external_links),
    wikitext.revision.headings_by_level(2),
    _per_content_char(wikitext.revision.headings_by_level(2)),
    wikitext.revision.headings_by_level(3),
    _per_content_char(wikitext.revision.headings_by_level(3)),
]
def _per_content_char(feature):
    """Divide `feature` by the content character count, floored at 1."""
    return feature / max(wikitext.revision.content_chars, 1)


# References
revision = Revision(
    "glwiki.revision.revision",
    wikitext.revision.datasources,
)

paragraphs = mappers.map(
    str,
    revision.paragraphs_sentences_and_whitespace,
    name="glwiki.revision.paragraphs",
)

# The pattern accepts text that is not whitespace-only and that contains no
# literal "<ref>" anywhere.
paragraphs_without_refs = filters.regex_matching(
    r"^(?!\s*$)((?!<ref>)(.|\n))*$",
    paragraphs,
    name="glwiki.revision.paragraphs_without_refs",
)

_unreferenced_lengths = mappers.map(len, paragraphs_without_refs)
paragraphs_without_refs_total_length = aggregators.sum(
    _unreferenced_lengths,
    name="glwiki.revision.paragraphs_without_refs_total_length",
)

local_wiki = [
    image_links,
    _per_content_char(image_links),
    category_links,
    _per_content_char(category_links),
    # NOTE(review): a bare "+ 1" with no log() around it -- confirm intended.
    cn_templates + 1,
    _per_content_char(cn_templates),
    log(paragraphs_without_refs_total_length + 1),
    _per_content_char(paragraphs_without_refs_total_length),
]

# Shared Wikipedia article features first, then the wiki-specific ones.
wp10 = wikipedia.article + local_wiki
English Wikipedia +++++++++++++++++ """ from revscoring.features import revision from revscoring.features.modifiers import log, max from revscoring.languages import english from ..features.revision import templates_that_match cite_templates = templates_that_match( r"cite", name="enwiki.revision.cite_templates") infobox_templates = templates_that_match( r"infobox", name="enwiki.revision.infobox_templates") proportion_of_templated_references = cite_templates / max(revision.ref_tags, 1) CN_TEMPLATES = [ r"Citation needed", r"Cn", r"Fact" ] cn_templates = templates_that_match("|".join(CN_TEMPLATES), name="enwiki.revision.cn_templates") who_templates = templates_that_match("Who", name="enwiki.revision.cn_templates") main_article_templates = templates_that_match( "Main", name="enwiki.main_article_templates")
def _per_content_char(feature):
    """Divide `feature` by the content character count, floored at 1."""
    return feature / max(wikitext.revision.content_chars, 1)


# NOTE(review): these names carry an "enwiki." prefix although the list below
# uses Dutch language features -- confirm the prefix is intended.
paragraphs = mappers.map(
    str,
    wikitext.revision.datasources.paragraphs_sentences_and_whitespace,
    name="enwiki.revision.paragraphs",
)

# The pattern accepts text that is not whitespace-only and that contains no
# literal "<ref>" anywhere.
paragraphs_without_refs = filters.regex_matching(
    r"^(?!\s*$)((?!<ref>)(.|\n))*$",
    paragraphs,
    name="enwiki.revision.paragraphs_without_refs",
)

_unreferenced_lengths = mappers.map(len, paragraphs_without_refs)
paragraphs_without_refs_total_length = aggregators.sum(
    _unreferenced_lengths,
    name="enwiki.revision.paragraphs_without_refs_total_length",
)

local_wiki = [
    dutch.stemmed.revision.stem_chars,
    _per_content_char(dutch.stemmed.revision.stem_chars),
    image_links,
    _per_content_char(image_links),
    category_links,
    _per_content_char(category_links),
    dutch.dictionary.revision.dict_words,
    dutch.dictionary.revision.dict_words / max(wikitext.revision.words, 1),
    paragraphs_without_refs_total_length,
    _per_content_char(paragraphs_without_refs_total_length),
    cn_templates,
    _per_content_char(cn_templates),
]

# Wiki-specific features first, then the shared Wikipedia article features.
wp10 = local_wiki + wikipedia.article
from revscoring.features import wikitext from revscoring.features.modifiers import max, sub from revscoring.languages import portuguese char_based = [ wikitext.revision.chars, wikitext.revision.whitespace_chars, wikitext.revision.markup_chars, wikitext.revision.cjk_chars, wikitext.revision.entity_chars, wikitext.revision.url_chars, wikitext.revision.word_chars, wikitext.revision.uppercase_word_chars, wikitext.revision.punctuation_chars, wikitext.revision.break_chars, wikitext.revision.longest_repeated_char, wikitext.revision.whitespace_chars / max(wikitext.revision.chars, 1), wikitext.revision.markup_chars / max(wikitext.revision.chars, 1), wikitext.revision.cjk_chars / max(wikitext.revision.chars, 1), wikitext.revision.entity_chars / max(wikitext.revision.chars, 1), wikitext.revision.url_chars / max(wikitext.revision.chars, 1), wikitext.revision.word_chars / max(wikitext.revision.chars, 1), wikitext.revision.uppercase_word_chars / max(wikitext.revision.chars, 1), wikitext.revision.punctuation_chars / max(wikitext.revision.chars, 1), wikitext.revision.break_chars / max(wikitext.revision.chars, 1), wikitext.revision.longest_repeated_char / max(wikitext.revision.chars, 1) ] token_based = [ wikitext.revision.tokens, wikitext.revision.numbers, wikitext.revision.whitespaces, wikitext.revision.markups, wikitext.revision.cjks, wikitext.revision.entities, wikitext.revision.urls, wikitext.revision.words, wikitext.revision.uppercase_words, wikitext.revision.punctuations, wikitext.revision.breaks, wikitext.revision.longest_token, wikitext.revision.longest_word, wikitext.revision.numbers / max(wikitext.revision.tokens, 1),
from revscoring.features import diff, page, parent_revision, revision, user from revscoring.features.modifiers import log, max from revscoring.languages import indonesian from . import enwiki proportion_of_badwords_added = indonesian.diff.badwords_added / \ max(indonesian.diff.words_added, 1) proportion_of_badwords_removed = indonesian.diff.badwords_added / \ max(indonesian.diff.words_added, 1) proportion_of_misspellings_added = indonesian.diff.misspellings_added / \ max(indonesian.diff.words_added, 1) proportion_of_misspellings_removed = indonesian.diff.misspellings_added / \ max(indonesian.diff.words_added, 1) proportion_of_informals_added = indonesian.diff.informals_added / \ max(indonesian.diff.words_added, 1) proportion_of_informals_removed = indonesian.diff.informals_added / \ max(indonesian.diff.words_added, 1) proportion_of_badwords = indonesian.parent_revision.badwords / \ max(indonesian.parent_revision.words, 1) proportion_of_misspellings = indonesian.parent_revision.misspellings / \ max(indonesian.parent_revision.words, 1) proportion_of_informals = indonesian.parent_revision.informals / \ max(indonesian.parent_revision.words, 1) added_badwords_ratio = proportion_of_badwords_added / \ max(proportion_of_badwords, 0.01) added_misspellings_ratio = proportion_of_misspellings_added / \ max(proportion_of_misspellings, 0.01) added_informals_ratio = proportion_of_informals_added / \
from revscoring.features import wikitext
from revscoring.features.modifiers import log, max, sub
from revscoring.languages import french

from . import wikipedia

# Copied (2015-10-29) from:
# https://fr.wikipedia.org/wiki/Cat%C3%A9gorie:Mod%C3%A8le_pour_bibliographie
CITE_TEMPLATES = [
    r"Article",
    r"Chapitre",
    r"Jugement",
    r"Lien[ _]web",
    r"Loi",
    r"Ouvrage",
]
# All cite template names joined into one alternation pattern.
_cite_pattern = "|".join(CITE_TEMPLATES)
cite_templates = wikitext.revision.template_names_matching(
    _cite_pattern,
    name="frwiki.revision.cite_templates",
)

# Cite templates per <ref> tag; the denominator is floored at 1.
proportion_of_templated_references = (
    cite_templates / max(wikitext.revision.ref_tags, 1)
)
# <ref> tags beyond the number of cite templates, floored at 0.
non_templated_references = max(
    wikitext.revision.ref_tags - cite_templates, 0)
non_cite_templates = sub(
    wikitext.revision.templates,
    cite_templates,
    name="frwiki.revision.non_cite_templates",
)

infobox_templates = wikitext.revision.template_names_matching(
    r"^infobox",
    name="frwiki.revision.infobox_templates",
)

# Copied (2015-10-29) from:
# https://fr.wikipedia.org/wiki/Wikip%C3%A9dia:Citez_vos_sources#R.C3.A9clamation_et_contestation_de_sources
LVL1_CN_TEMPLATES = [
    r"Référence[ _]souhaitée",
    r"Citation[ _]nécessaire",
    r"Référence[ _]à[ _]confirmer",
    r"Référence[ _]nécessaire",
    r"Inédit",
]
_lvl1_cn_pattern = "|".join(LVL1_CN_TEMPLATES)
lvl1_cn_templates = wikitext.revision.template_names_matching(
    _lvl1_cn_pattern,
    name="frwiki.revision.lvl1_cn_templates",
)
"`int` : A count of all sources which do not come from Wikimedia projects"

unique_sources_count = aggregators.len(unique_sources)
"`int` : A count of unique sources in the revision"

# Status
_item_revision = wikibase_.revision

# NOTE(review): `is_human` is named relative to `name`, while the two
# features after it use fixed 'revision.*' names -- confirm this is intended.
is_human = _item_revision.has_property_value(
    properties.INSTANCE_OF, items.HUMAN, name=name + '.is_human')
has_birthday = _item_revision.has_property(
    properties.DATE_OF_BIRTH, name='revision.has_birthday')
dead = _item_revision.has_property(
    properties.DATE_OF_DEATH, name='revision.dead')

# Has a date of birth and does not have a date of death.
is_blp = has_birthday.and_(not_(dead))

_status_features = [
    is_human,
    is_blp,
]
_translation_features = [
    aggregators.len(complete_translations),
    aggregators.len(important_label_translations),
    aggregators.len(important_description_translations),
    aggregators.len(important_complete_translations),
]
# Each source count is followed by its ratio to the number of source
# claims; the denominator is floored at 1.
_source_features = [
    source_claims_count,
    wikimedia_sources_count,
    wikimedia_sources_count / modifiers.max(source_claims_count, 1),
    external_sources_count,
    external_sources_count / modifiers.max(source_claims_count, 1),
    unique_sources_count,
    unique_sources_count / modifiers.max(source_claims_count, 1),
]
local_wiki = _status_features + _translation_features + _source_features

# Shared Wikibase item features first, then the features defined above.
item_quality = wikibase.item + local_wiki
from revscoring.features.modifiers import log, max from revscoring.languages import turkish from . import enwiki, util proportion_of_badwords_added = turkish.diff.badwords_added / \ max(turkish.diff.words_added, 1) proportion_of_badwords_removed = turkish.diff.badwords_removed / \ max(turkish.diff.words_removed, 1) proportion_of_informals_added = turkish.diff.informals_added / \ max(turkish.diff.words_added, 1) proportion_of_informals_removed = turkish.diff.informals_removed / \ max(turkish.diff.words_removed, 1) proportion_of_badwords = turkish.parent_revision.badwords / \ max(turkish.parent_revision.words, 1) proportion_of_informals = turkish.parent_revision.informals / \ max(turkish.parent_revision.words, 1) added_badwords_ratio = proportion_of_badwords_added / \ max(proportion_of_badwords, 0.01) added_informals_ratio = proportion_of_informals_added / \ max(proportion_of_informals, 0.01) damaging = util.no_lang_damaging + enwiki.badwords + enwiki.informals + [ log(turkish.diff.badwords_added + 1), log(turkish.diff.badwords_removed + 1), log(turkish.diff.informals_added + 1), log(turkish.diff.informals_removed + 1), proportion_of_badwords_added, proportion_of_badwords_removed,
from revscoring.features.modifiers import log, max
from revscoring.languages import english

from . import util


def _proportion(count, total):
    """Divide `count` by `total`, with `total` floored at 1."""
    return count / max(total, 1)


def _ratio(added, baseline):
    """Divide `added` by `baseline`, with `baseline` floored at 0.01."""
    return added / max(baseline, 0.01)


_diff = english.diff
_parent = english.parent_revision

# Proportions of the words added/removed by the edit.
proportion_of_badwords_added = _proportion(
    _diff.badwords_added, _diff.words_added)
proportion_of_badwords_removed = _proportion(
    _diff.badwords_removed, _diff.words_removed)
proportion_of_misspellings_added = _proportion(
    _diff.misspellings_added, _diff.words_added)
proportion_of_misspellings_removed = _proportion(
    _diff.misspellings_removed, _diff.words_removed)
proportion_of_informals_added = _proportion(
    _diff.informals_added, _diff.words_added)
proportion_of_informals_removed = _proportion(
    _diff.informals_removed, _diff.words_removed)

# Proportions of the words in the parent revision.
proportion_of_badwords = _proportion(_parent.badwords, _parent.words)
proportion_of_misspellings = _proportion(_parent.misspellings, _parent.words)
proportion_of_informals = _proportion(_parent.informals, _parent.words)

# Added proportion relative to the parent revision's proportion.
added_badwords_ratio = _ratio(
    proportion_of_badwords_added, proportion_of_badwords)
added_misspellings_ratio = _ratio(
    proportion_of_misspellings_added, proportion_of_misspellings)
added_informals_ratio = _ratio(
    proportion_of_informals_added, proportion_of_informals)
unique_references_count = aggregators.len(unique_references)
"`int` : A count of unique sources in the revision"


def _per_reference(count):
    """Divide `count` by `references_count`, floored at 1."""
    return count / modifiers.max(references_count, 1)


# Status
is_human = wikibase_.revision.has_property_value(
    properties.INSTANCE_OF,
    items.HUMAN,
    name=name + '.revision.is_human',
)
has_birthday = wikibase_.revision.has_property(
    properties.DATE_OF_BIRTH,
    name=name + '.revision.has_birthday',
)
dead = wikibase_.revision.has_property(
    properties.DATE_OF_DEATH,
    name=name + '.revision.dead',
)
# Has a date of birth and does not have a date of death.
is_blp = has_birthday.and_(not_(dead))

local_wiki = [
    is_human,
    is_blp,
    aggregators.len(complete_translations),
    aggregators.len(important_label_translations),
    aggregators.len(important_description_translations),
    aggregators.len(important_complete_translations),
    references_count,
    wikimedia_references_count,
    _per_reference(wikimedia_references_count),
    external_references_count,
    _per_reference(external_references_count),
    unique_references_count,
    _per_reference(unique_references_count),
]

# Shared Wikibase item features first, then the features defined above.
item_quality = wikibase.item + local_wiki
""" from revscoring.features import wikitext from revscoring.features.modifiers import max from revscoring.languages import swedish from . import wikipedia cn_templates = wikitext.revision.template_names_matching( r"Källa[ _]behövs|Kb", name="svwiki.revision.cn_templates") # Links category_links = wikitext.revision.wikilink_titles_matching( r"Category|Kategori\:", name="revision.category_links") image_links = wikitext.revision.wikilink_titles_matching( r"File|Image|Fil\:", name="revision.image_links") local_wiki = [ image_links, image_links / max(wikitext.revision.content_chars, 1), category_links, category_links / max(wikitext.revision.content_chars, 1), swedish.dictionary.revision.dict_words, swedish.dictionary.revision.dict_words / max(wikitext.revision.words, 1), cn_templates, cn_templates / max(wikitext.revision.content_chars, 1), ] wp10 = local_wiki + wikipedia.article
from revscoring.features.modifiers import log, max from revscoring.languages import french from . import enwiki, util proportion_of_badwords_added = french.diff.badwords_added / \ max(french.diff.words_added, 1) proportion_of_badwords_removed = french.diff.badwords_removed / \ max(french.diff.words_removed, 1) proportion_of_misspellings_added = french.diff.misspellings_added / \ max(french.diff.words_added, 1) proportion_of_misspellings_removed = french.diff.misspellings_removed / \ max(french.diff.words_removed, 1) proportion_of_badwords = french.parent_revision.badwords / \ max(french.parent_revision.words, 1) proportion_of_misspellings = french.parent_revision.misspellings / \ max(french.parent_revision.words, 1) added_badwords_ratio = proportion_of_badwords_added / \ max(proportion_of_badwords, 0.01) added_misspellings_ratio = proportion_of_misspellings_added / \ max(proportion_of_misspellings, 0.01) damaging = util.no_lang_damaging + enwiki.badwords + enwiki.informals + [ log(french.diff.badwords_added + 1), log(french.diff.badwords_removed + 1), log(french.diff.misspellings_added + 1), log(french.diff.misspellings_removed + 1), proportion_of_badwords_added, proportion_of_badwords_removed,
# <ref name="derp">...</ref> (in another page # <ref following="derp" name="otherderp">...</ref>) # TODO # <big>,<small>,<center>,<div>,<span>,<b>,<i>,<poem>,<section>,''','' good_tags = wikitext.revision.tag_names_matching( r"big|small|center|div|span|b|i|poem|section", name="wikitext.revision.good_tags") expected_markup = aggregators.len( wikitext.revision.datasources.tokens_matching(r"'''|''"), name="wiktext.revision.expected_markup") page = [ wikitext.revision.chars, wikitext.revision.content_chars, wikitext.revision.content_chars / max(wikitext.revision.chars, 1), wikitext.revision.markup_chars, wikitext.revision.markup_chars / max(wikitext.revision.chars, 1), wikitext.revision.whitespace_chars, wikitext.revision.whitespace_chars / max(wikitext.revision.chars, 1), wikitext.revision.entity_chars, wikitext.revision.entity_chars / max(wikitext.revision.chars, 1), wikitext.revision.punctuation_chars, wikitext.revision.punctuation_chars / max(wikitext.revision.chars, 1), wikitext.revision.longest_repeated_char, wikitext.revision.numbers, wikitext.revision.numbers / max(wikitext.revision.words, 1), wikitext.revision.uppercase_words, wikitext.revision.uppercase_words / max(wikitext.revision.words, 1), wikitext.revision.longest_token, wikitext.revision.longest_word,
from . import wikipedia # Templates infobox_templates = wikitext.revision.template_names_matching( r"infobox", name="enwiki.revision.infobox_templates") CN_TEMPLATES = [r"Citation[_ ]needed", r"Cn", r"Fact"] cn_templates = wikitext.revision.template_names_matching( "|".join(CN_TEMPLATES), name="enwiki.revision.cn_templates") who_templates = wikitext.revision.template_names_matching( "Who", name="enwiki.revision.who_templates") main_article_templates = wikitext.revision.template_names_matching( "Main", name="enwiki.main_article_templates") cite_templates = wikitext.revision.template_names_matching( r"cite", name="enwiki.revision.cite_templates") proportion_of_templated_references = \ cite_templates / max(wikitext.revision.ref_tags, 1) non_templated_references = max(wikitext.revision.ref_tags - cite_templates, 0) non_cite_templates = sub(wikitext.revision.templates, cite_templates, name="enwiki.revision.non_cite_templates") # Links category_links = wikitext.revision.wikilink_titles_matching( r"Category\:", name="enwiki.revision.category_links") image_links = wikitext.revision.wikilink_titles_matching( r"File|Image\:", name="enwiki.revision.image_links") # References revision = Revision( "enwiki.revision.revision", wikitext.revision.datasources,
def vectorize_words(words):
    """Vectorize `words` using the `enwiki_kvs` keyed vectors."""
    vectorize = vectorizers.word2vec.vectorize_words
    return vectorize(enwiki_kvs, words)


_lowercased_words = mappers.lower_case(wikitext.revision.datasources.words)
revision_text_vectors = vectorizers.word2vec(
    _lowercased_words,
    vectorize_words,
    name="revision.text.en_vectors",
)

w2v = aggregators.mean(
    revision_text_vectors,
    vector=True,
    name="revision.text.en_vectors_mean",
)

_tokens_matching = wikitext.revision.datasources.tokens_matching
female_pronouns = _tokens_matching(r"\b(she|her|hers)\b")
male_pronouns = _tokens_matching(r"\b(he|him|his)\b")

female_pronouns_count = aggregators.len(female_pronouns)
male_pronouns_count = aggregators.len(male_pronouns)

# Counts per gender, their sum, and the female share of all matched
# pronouns (denominator floored at 1).
pronoun_features = [
    female_pronouns_count,
    male_pronouns_count,
    female_pronouns_count + male_pronouns_count,
    female_pronouns_count / modifiers.max(
        female_pronouns_count + male_pronouns_count, 1),
]

drafttopic = [w2v] + pronoun_features
# `articletopic` is the same list object as `drafttopic`.
articletopic = drafttopic
from revscoring.features import wikitext, modifiers

_rev = wikitext.revision


def _content_chars_floor():
    """Return the content character count, floored at 1."""
    return modifiers.max(_rev.content_chars, 1)


# Each count below (apart from the two character counts) is followed by the
# same count divided by the number of content characters.
article = [
    _rev.chars,
    _rev.content_chars,
    _rev.ref_tags,
    _rev.ref_tags / _content_chars_floor(),
    _rev.wikilinks,
    _rev.wikilinks / _content_chars_floor(),
    _rev.external_links,
    _rev.external_links / _content_chars_floor(),
    _rev.headings_by_level(2),
    _rev.headings_by_level(2) / _content_chars_floor(),
    _rev.headings_by_level(3),
    _rev.headings_by_level(3) / _content_chars_floor(),
]
_process_all_sources, depends_on=[item]) all_sources = aggregators.len(all_sources_datasource) "`int` : A count of all sources in the revision" all_wikimedia_sources_datasource = Datasource( name + ".all_wikimedia_sources", _process_wikimedia_sources, depends_on=[all_sources_datasource]) all_wikimedia_sources = aggregators.len(all_wikimedia_sources_datasource) "`int` : A count of all sources which come from Wikimedia projects in the revision" all_external_sources = modifiers.sub(all_sources, all_wikimedia_sources) "A count of all sources which do not come from Wikimedia projects in the revision" external_sources_ratio = all_external_sources / modifiers.max( wikibase_features.revision.sources, 1) "A ratio/division between number of external references and number of claims that have references in the revision" unique_sources = Feature(name + ".unique_sources", _process_unique_sources, depends_on=[all_sources_datasource], returns=int) "`int` : A count of unique sources in the revision" # Status is_human = revision.has_property_value(properties.INSTANCE_OF, items.HUMAN, name=name + '.is_human') has_birthday = revision.has_property(properties.DATE_OF_BIRTH, name='revision.has_birthday') dead = revision.has_property(properties.DATE_OF_DEATH, name='revision.dead')