def bot_gen(rev_pages, language, api_url):
    """Yield an ``Edit`` for every revision in *rev_pages* that can be processed.

    A "." is written to stderr for each revision as a progress indicator.
    Revisions whose lookup or extraction fails are reported on stderr (full
    traceback) and skipped, so one bad revision does not stop the run.

    :Parameters:
        rev_pages : iterable
            Pairs of ``(rev_id, page_id)``.
        language
            Passed through to ``APIExtractor`` as its ``language`` argument.
        api_url
            Passed to ``api.Session`` to build the API session.

    :Yields:
        ``Edit(rev_id, added_words, reverted)`` where ``reverted`` is True
        when ``reverts.api.check`` returned something other than ``None``.
    """
    session = api.Session(api_url)
    extractor = APIExtractor(session, language=language)

    for rev_id, page_id in rev_pages:
        # Progress indicator: one dot per revision attempted.
        sys.stderr.write(".")
        sys.stderr.flush()

        try:
            # Detect reverted status
            revert = reverts.api.check(session, rev_id, page_id, radius=3)
            reverted = revert is not None

            # Only one feature is requested, so keep the first extracted value.
            added_words = list(
                extractor.extract(rev_id, [diff.added_words]))[0]
            edit = Edit(rev_id, added_words, reverted)
        except KeyboardInterrupt:
            sys.stderr.write("\n^C Caught. Exiting...")
            break
        except Exception:
            # Deliberately best-effort: log and move on to the next revision.
            # `Exception` (not a bare `except:`) lets GeneratorExit and
            # SystemExit propagate, so closing the generator works.
            sys.stderr.write(traceback.format_exc())
            sys.stderr.write("\n")
        else:
            # Yield outside the `try` so that exceptions raised at the yield
            # point (e.g. GeneratorExit on close()) are never swallowed.
            yield edit

    sys.stderr.write("\n")
def bot_gen(rev_pages, language, api_url):
    """Generate ``Edit`` objects for the revisions listed in *rev_pages*.

    Progress is reported on stderr with one "." per revision.  A failure
    while checking or extracting a revision is logged to stderr with its
    traceback and that revision is skipped.

    :Parameters:
        rev_pages : iterable
            Pairs of ``(rev_id, page_id)``.
        language
            Forwarded to ``APIExtractor`` as ``language``.
        api_url
            Used to construct the ``api.Session``.

    :Yields:
        ``Edit(rev_id, added_words, reverted)``; ``reverted`` is True when
        ``reverts.api.check`` returned a non-``None`` result.
    """
    session = api.Session(api_url)
    extractor = APIExtractor(session, language=language)

    for rev_id, page_id in rev_pages:
        # One dot per revision so long runs show progress.
        sys.stderr.write(".")
        sys.stderr.flush()

        try:
            # Detect reverted status
            revert = reverts.api.check(session, rev_id, page_id, radius=3)
            reverted = revert is not None

            # A single feature is requested; take the first extracted value.
            added_words = list(
                extractor.extract(rev_id, [diff.added_words]))[0]
            edit = Edit(rev_id, added_words, reverted)
        except KeyboardInterrupt:
            sys.stderr.write("\n^C Caught. Exiting...")
            break
        except Exception:
            # Best-effort processing: report the error and continue.
            # Catching `Exception` rather than everything keeps
            # GeneratorExit/SystemExit from being swallowed here.
            sys.stderr.write(traceback.format_exc())
            sys.stderr.write("\n")
        else:
            # The yield lives outside the `try` so that closing the
            # generator (GeneratorExit at this point) terminates it cleanly.
            yield edit

    sys.stderr.write("\n")
# Demo: load a scorer model from disk, extract the features it needs for one
# revision through the MediaWiki API, and print the resulting score.
import mwapi
from revscoring import ScorerModel
from revscoring.extractors import APIExtractor

# Read the model from the opened model file.
with open("models/enwiki.damaging.linear_svc.model") as f:
    scorer_model = ScorerModel.load(f)

# API session against English Wikipedia, wrapped in a feature extractor.
session = mwapi.Session(host="https://en.wikipedia.org",
                        user_agent="revscoring demo")
extractor = APIExtractor(session)

# Extract exactly the features the model declares, then score them.
feature_values = extractor.extract(123456789, scorer_model.features)
score = scorer_model.score(feature_values)
print(score)
# Example script: score revision 123456789 of English Wikipedia with a model
# loaded from a local file.
import mwapi
from revscoring import ScorerModel
from revscoring.extractors import APIExtractor

# The model is loaded from the file object; the file is closed on exit
# from the `with` block.
with open("models/enwiki.damaging.linear_svc.model") as f:
    scorer_model = ScorerModel.load(f)

# Build the API-backed extractor.
api_session = mwapi.Session(host="https://en.wikipedia.org",
                            user_agent="revscoring demo")
extractor = APIExtractor(api_session)

# Feature values are requested in the order given by the model's feature list.
feature_values = extractor.extract(123456789, scorer_model.features)
result = scorer_model.score(feature_values)
print(result)
count += 1 else: break """ Feature to examine. Let FEATURE be one of diff.added_tokens diff.removed_tokens diff.added_segments diff.removed_segments revision.content revision.content_tokens """ FEATURE = diff.added_segments # extract data from selected revisions and write to selected file extr = APIExtractor(mwapi.Session("https://en.wikipedia.org")) for id in rev_ids: data = extr.extract(id, FEATURE) dump_target.write("\n\nBeginning %s of revision %d\n\n" % (FEATURE, id)) if type(data) is str: dump_target.write(data) elif type(data) is list: dump_target.writelines(data) else: print("Unknown Type") exit() dump_target.write("\n\nEnd %s of revision %d" % (FEATURE, id))
import sys from mw.api import Session sys.path.insert(0, ".") from revscoring.extractors import APIExtractor from revscoring.features import (diff, page, parent_revision, previous_user_revision, revision, user) from revscoring.languages import portuguese api_extractor = APIExtractor( Session("https://pt.wikipedia.org/w/api.php"), language=portuguese ) features = [diff.added_badwords_ratio, diff.added_markup_chars_ratio, diff.added_misspellings_ratio, diff.added_number_chars_ratio, diff.added_symbolic_chars_ratio, diff.added_uppercase_chars_ratio, diff.badwords_added, diff.badwords_removed, diff.chars_added, diff.chars_removed, diff.longest_repeated_char_added, diff.longest_token_added, diff.markup_chars_added, diff.markup_chars_removed, diff.misspellings_added, diff.misspellings_removed, diff.numeric_chars_added, diff.numeric_chars_removed, diff.proportion_of_badwords_added, diff.proportion_of_badwords_removed, diff.proportion_of_chars_added, diff.proportion_of_chars_removed, diff.proportion_of_markup_chars_added, diff.proportion_of_misspellings_added, diff.proportion_of_misspellings_removed,
# Demo: extract a handful of feature values for a single English Wikipedia
# revision and print each feature next to its value.
from mw.api import Session
from revscoring.extractors import APIExtractor
from revscoring.features import diff, parent_revision, revision, user

api_extractor = APIExtractor(Session("https://en.wikipedia.org/w/api.php"))

# Features to request, grouped by the module they come from.
features = [
    revision.day_of_week,
    revision.hour_of_day,
    revision.has_custom_comment,
    diff.bytes_changed,
    diff.chars_added,
    user.age,
    user.is_anon,
    user.is_bot,
]

values = api_extractor.extract(624577024, features)

# Pair each feature with the value at the same position and print them.
for pair in zip(features, values):
    print("{0}: {1}".format(*pair))
# Demo: load a trained model from disk and print the score document for a few
# English Wikipedia revisions.
from mw.api import Session
from revscoring.extractors import APIExtractor
from revscoring.scorers import MLScorerModel, Scorer

api_session = Session("https://en.wikipedia.org/w/api.php")

filename = "models/reverts.halfak_mix.trained.model"

# Open the model file (binary mode) in a context manager so the handle is
# closed as soon as loading finishes instead of being left open.
with open(filename, 'rb') as f:
    model = MLScorerModel.load(f)

# The extractor is built with the language carried by the loaded model.
extractor = APIExtractor(api_session, model.language)
scorer = Scorer({'reverted': model}, extractor)

for rev_id in [105, 642215410, 638307884]:
    score_doc = scorer.score(rev_id)
    print("{0}: {1}".format(rev_id, score_doc))
import sys from mw.api import Session sys.path.insert(0, ".") from revscoring.extractors import APIExtractor from revscoring.features import (diff, page, parent_revision, previous_user_revision, revision, user) from revscoring.languages import portuguese api_extractor = APIExtractor(Session("https://pt.wikipedia.org/w/api.php"), language=portuguese) features = [ diff.added_badwords_ratio, diff.added_markup_chars_ratio, diff.added_misspellings_ratio, diff.added_number_chars_ratio, diff.added_symbolic_chars_ratio, diff.added_uppercase_chars_ratio, diff.badwords_added, diff.badwords_removed, diff.chars_added, diff.chars_removed, diff.longest_repeated_char_added, diff.longest_token_added, diff.markup_chars_added, diff.markup_chars_removed, diff.misspellings_added, diff.misspellings_removed, diff.numeric_chars_added, diff.numeric_chars_removed, diff.proportion_of_badwords_added, diff.proportion_of_badwords_removed, diff.proportion_of_chars_added, diff.proportion_of_chars_removed, diff.proportion_of_markup_chars_added, diff.proportion_of_misspellings_added, diff.proportion_of_misspellings_removed, diff.proportion_of_numeric_chars_added, diff.proportion_of_symbolic_chars_added, diff.proportion_of_uppercase_chars_added, diff.removed_badwords_ratio, diff.removed_misspellings_ratio, diff.segments_added,