def bot_gen(rev_pages, language, api_url):
    """
    Generate an `Edit` for each revision in `rev_pages`.

    For every `(rev_id, page_id)` pair this checks, through an API session
    on `api_url`, whether the revision was reverted
    (`reverts.api.check` with `radius=3`) and extracts `diff.added_words`
    for the revision.  A "." is written to stderr for each revision as a
    progress indicator, and a final newline once the loop ends.

    Parameters:
        rev_pages: iterable of `(rev_id, page_id)` pairs.
        language: passed to `APIExtractor` as its `language`.
        api_url: passed to `api.Session`.

    Yields:
        `Edit(rev_id, added_words, reverted)` where `reverted` is True when
        the revert check returned something other than None.

    Revisions whose processing raises an `Exception` are skipped after the
    traceback is written to stderr.  A `KeyboardInterrupt` raised while a
    revision is being processed ends the generator early.
    """
    session = api.Session(api_url)
    extractor = APIExtractor(session, language=language)

    for rev_id, page_id in rev_pages:
        sys.stderr.write(".")
        sys.stderr.flush()
        try:
            # Detect reverted status
            revert = reverts.api.check(session, rev_id, page_id, radius=3)
            reverted = revert is not None
            added_words = list(extractor.extract(rev_id,
                                                 [diff.added_words]))[0]
            edit = Edit(rev_id, added_words, reverted)

        except KeyboardInterrupt:
            sys.stderr.write("\n^C Caught.  Exiting...")
            break

        except Exception:
            # `Exception` (not a bare except) so that GeneratorExit and
            # SystemExit are never swallowed here.
            sys.stderr.write(traceback.format_exc())
            sys.stderr.write("\n")

        else:
            # Yield outside the `try` so that closing the generator, or
            # throwing an exception into it, is not mistaken for a
            # per-revision processing failure.
            yield edit

    sys.stderr.write("\n")
def bot_gen(rev_pages, language, api_url):
    """
    Yield one `Edit` per `(rev_id, page_id)` pair in `rev_pages`.

    An API session is opened on `api_url` and wrapped in an `APIExtractor`
    configured with `language`.  For each revision the generator:

    * writes a "." to stderr (progress indicator),
    * calls `reverts.api.check(session, rev_id, page_id, radius=3)` and
      treats any non-None result as "reverted",
    * extracts `diff.added_words` for the revision,
    * yields `Edit(rev_id, added_words, reverted)`.

    Parameters:
        rev_pages: iterable of `(rev_id, page_id)` pairs.
        language: forwarded to `APIExtractor(..., language=language)`.
        api_url: forwarded to `api.Session`.

    Error handling: an `Exception` raised while processing a revision is
    reported on stderr (full traceback) and that revision is skipped.
    `KeyboardInterrupt` during processing stops the generator.
    """
    session = api.Session(api_url)
    extractor = APIExtractor(session, language=language)

    for rev_id, page_id in rev_pages:
        sys.stderr.write(".")
        sys.stderr.flush()
        try:

            # Detect reverted status
            revert = reverts.api.check(session, rev_id, page_id, radius=3)
            reverted = revert is not None
            added_words = list(
                extractor.extract(rev_id, [diff.added_words]))[0]
            edit = Edit(rev_id, added_words, reverted)

        except KeyboardInterrupt:
            sys.stderr.write("\n^C Caught.  Exiting...")
            break

        except Exception:
            # Deliberately not a bare `except:`; a bare clause would also
            # trap GeneratorExit/SystemExit and keep the loop running.
            sys.stderr.write(traceback.format_exc())
            sys.stderr.write("\n")

        else:
            # The yield lives outside the `try` so that generator close()
            # or throw() propagates instead of being logged and ignored.
            yield edit

    sys.stderr.write("\n")
示例#3
0
import mwapi
from revscoring import ScorerModel
from revscoring.extractors import APIExtractor

# Load the trained model.  The file is opened in binary mode.
# NOTE(review): assumes ScorerModel.load reads a serialized (pickled)
# binary stream, which cannot be read from a text-mode file -- confirm.
with open("models/enwiki.damaging.linear_svc.model", "rb") as f:
    scorer_model = ScorerModel.load(f)

# Extractor that pulls feature data through the English Wikipedia API.
extractor = APIExtractor(mwapi.Session(host="https://en.wikipedia.org",
                                       user_agent="revscoring demo"))

# Extract the model's features for revision 123456789 and score them.
feature_values = extractor.extract(123456789, scorer_model.features)

print(scorer_model.score(feature_values))
示例#4
0
import mwapi
from revscoring import ScorerModel
from revscoring.extractors import APIExtractor

# Read the trained model from disk using a binary file handle.
# NOTE(review): assumes ScorerModel.load expects a binary (pickled)
# stream rather than decoded text -- confirm against the library.
with open("models/enwiki.damaging.linear_svc.model", "rb") as f:
    scorer_model = ScorerModel.load(f)

# Feature extractor backed by an API session on English Wikipedia.
extractor = APIExtractor(
    mwapi.Session(host="https://en.wikipedia.org",
                  user_agent="revscoring demo"))

# Gather the values of the model's features for revision 123456789.
feature_values = extractor.extract(123456789, scorer_model.features)

# Score the revision from the extracted feature values.
print(scorer_model.score(feature_values))
示例#5
0
        count += 1
    else:
        break

"""
Feature to examine.  Let FEATURE be one of
diff.added_tokens
diff.removed_tokens
diff.added_segments
diff.removed_segments
revision.content
revision.content_tokens
"""
FEATURE = diff.added_segments


# extract data from selected revisions and write to selected file
extr = APIExtractor(mwapi.Session("https://en.wikipedia.org"))
for rev_id in rev_ids:  # named `rev_id` so the builtin `id` is not shadowed

    data = extr.extract(rev_id, FEATURE)
    dump_target.write(
        "\n\nBeginning %s of revision %d\n\n" % (FEATURE, rev_id))
    # A str is written as-is; a list is written item by item.
    if isinstance(data, str):
        dump_target.write(data)
    elif isinstance(data, list):
        dump_target.writelines(data)
    else:
        # Anything else cannot be dumped: report it and stop the script.
        print("Unknown Type")
        # Same effect as exit() with no argument, without relying on the
        # `site`-provided helper being installed.
        raise SystemExit()
    dump_target.write("\n\nEnd %s of revision %d" % (FEATURE, rev_id))
示例#6
0
import sys

from mw.api import Session

sys.path.insert(0, ".")
from revscoring.extractors import APIExtractor
from revscoring.features import (diff, page, parent_revision,
                                 previous_user_revision, revision, user)
from revscoring.languages import portuguese

# Feature extractor that talks to the Portuguese Wikipedia API and is
# configured with the `portuguese` language.
ptwiki_session = Session("https://pt.wikipedia.org/w/api.php")
api_extractor = APIExtractor(ptwiki_session, language=portuguese)

features = [diff.added_badwords_ratio, diff.added_markup_chars_ratio,
            diff.added_misspellings_ratio, diff.added_number_chars_ratio,
            diff.added_symbolic_chars_ratio, diff.added_uppercase_chars_ratio,
            diff.badwords_added, diff.badwords_removed, diff.chars_added,
            diff.chars_removed, diff.longest_repeated_char_added,
            diff.longest_token_added, diff.markup_chars_added,
            diff.markup_chars_removed, diff.misspellings_added,
            diff.misspellings_removed, diff.numeric_chars_added,
            diff.numeric_chars_removed,
            diff.proportion_of_badwords_added,
            diff.proportion_of_badwords_removed,
            diff.proportion_of_chars_added,
            diff.proportion_of_chars_removed,
            diff.proportion_of_markup_chars_added,
            diff.proportion_of_misspellings_added,
            diff.proportion_of_misspellings_removed,
示例#7
0
from mw.api import Session
from revscoring.extractors import APIExtractor
from revscoring.features import diff, parent_revision, revision, user

# Feature extractor backed by an English Wikipedia API session.
enwiki_session = Session("https://en.wikipedia.org/w/api.php")
api_extractor = APIExtractor(enwiki_session)

# Features to extract for the revision, in output order.
features = [
    revision.day_of_week,
    revision.hour_of_day,
    revision.has_custom_comment,
    diff.bytes_changed,
    diff.chars_added,
    user.age,
    user.is_anon,
    user.is_bot,
]

values = api_extractor.extract(624577024, features)

# Print each feature next to the value extracted for it.
for pair in zip(features, values):
    print("{0}: {1}".format(*pair))
示例#8
0
from mw.api import Session

from revscoring.extractors import APIExtractor
from revscoring.scorers import MLScorerModel, Scorer

# API session used by the extractor to fetch revision data.
api_session = Session("https://en.wikipedia.org/w/api.php")

# Load the trained model; the context manager closes the file handle as
# soon as loading finishes instead of leaving it open.
filename = "models/reverts.halfak_mix.trained.model"
with open(filename, 'rb') as f:
    model = MLScorerModel.load(f)

# The extractor is given the model's language; the scorer publishes the
# model's output under the 'reverted' key.
extractor = APIExtractor(api_session, model.language)
scorer = Scorer({'reverted': model}, extractor)

# Score a few revisions and print each result.
for rev_id in [105, 642215410, 638307884]:
    score_doc = scorer.score(rev_id)
    print("{0}: {1}".format(rev_id, score_doc))
示例#9
0
import sys

from mw.api import Session

sys.path.insert(0, ".")
from revscoring.extractors import APIExtractor
from revscoring.features import (diff, page, parent_revision,
                                 previous_user_revision, revision, user)
from revscoring.languages import portuguese

# Build the API session first, then wrap it in an extractor configured
# with the `portuguese` language.
session = Session("https://pt.wikipedia.org/w/api.php")
api_extractor = APIExtractor(session, language=portuguese)

features = [
    diff.added_badwords_ratio, diff.added_markup_chars_ratio,
    diff.added_misspellings_ratio, diff.added_number_chars_ratio,
    diff.added_symbolic_chars_ratio, diff.added_uppercase_chars_ratio,
    diff.badwords_added, diff.badwords_removed, diff.chars_added,
    diff.chars_removed, diff.longest_repeated_char_added,
    diff.longest_token_added, diff.markup_chars_added,
    diff.markup_chars_removed, diff.misspellings_added,
    diff.misspellings_removed, diff.numeric_chars_added,
    diff.numeric_chars_removed, diff.proportion_of_badwords_added,
    diff.proportion_of_badwords_removed, diff.proportion_of_chars_added,
    diff.proportion_of_chars_removed, diff.proportion_of_markup_chars_added,
    diff.proportion_of_misspellings_added,
    diff.proportion_of_misspellings_removed,
    diff.proportion_of_numeric_chars_added,
    diff.proportion_of_symbolic_chars_added,
    diff.proportion_of_uppercase_chars_added, diff.removed_badwords_ratio,
    diff.removed_misspellings_ratio, diff.segments_added,