예제 #1
0
def test_extract_cite_history():
    """Check the citations ``extract_cite_history`` reports for a fake page.

    The fake page yields five revisions whose text is a space-separated list
    of identifiers; the fake extractor turns every piece of text into an
    ``Identifier`` of type ``'fake'``.
    """
    FakeRevision = namedtuple("Revision", ['id', 'timestamp', 'text'])
    FakeExtractor = namedtuple("Extractor", ['extract'])

    revision_texts = ["id1 id2", "id1 id3", "id1 id2 id3", "id1 id2 id4",
                      "id1 id2 id4"]

    class FakePage:
        def __init__(self, page_id, title):
            self.id = page_id
            self.title = title

        def __iter__(self):
            # Revision ids and timestamps both count up from 1.
            return iter([FakeRevision(n, Timestamp(n), text)
                         for n, text in enumerate(revision_texts, start=1)])

    def extract(text):
        return (Identifier('fake', piece) for piece in text.split(" "))

    page = FakePage(1, "Title")
    fake_extractor = FakeExtractor(extract)

    expected = [(1, "Title", 1, Timestamp(1), "fake", "id1"),
                (1, "Title", 1, Timestamp(1), "fake", "id2"),
                (1, "Title", 4, Timestamp(4), "fake", "id4")]

    citations = list(extract_cite_history(page, [fake_extractor]))
    eq_(len(citations), len(expected))
    for citation in extract_cite_history(page, [fake_extractor]):
        assert citation in expected
예제 #2
0
def test_age():
    """``age`` solves to 10 for a revision at t=10 and page creation at t=0."""
    Metadata = namedtuple("FakeRevisionMetadata", ['timestamp'])

    cache = {}
    cache[revision.metadata] = Metadata(Timestamp(10))
    cache[page_creation.metadata] = Metadata(Timestamp(0))

    eq_(solve(age, cache=cache), 10)
예제 #3
0
def test_seconds_since():
    """``seconds_since`` solves to 9 for a revision at t=10, parent at t=1."""
    Metadata = namedtuple("FakeRevisionMetadata", ['timestamp'])

    cache = {}
    cache[revision.metadata] = Metadata(Timestamp(10))
    cache[parent_revision.metadata] = Metadata(Timestamp(1))

    eq_(solve(seconds_since, cache=cache), 9)
예제 #4
0
 def __iter__(self):
     """Iterate over five fake revisions with ids and timestamps 1..5."""
     texts = ("id1 id2", "id1 id3", "id1 id2 id3", "id1 id2 id4",
              "id1 id2 id4")
     revisions = [FakeRevision(n, Timestamp(n), text)
                  for n, text in enumerate(texts, start=1)]
     return iter(revisions)
예제 #5
0
def test_user_info_from_doc():
    """Exercise ``APIExtractor.user_info_from_doc`` on three inputs.

    Covers a blocked user's full document, a document without the
    registration and block fields, and a ``None`` document.
    """
    blocked_user_doc = {
        "userid": 24278012,
        "name": "Hoardablehotsauce",
        "editcount": 5,
        "registration": "2015-02-28T22:25:37Z",
        "groups": ["*", "user"],
        "implicitgroups": ["*", "user"],
        "blockid": "5752570",
        "blockedby": "Cryptic",
        "blockedbyid": "295294",
        "blockedtimestamp": "2015-02-28T22:43:23Z",
        "blockreason": "{{uw-softerblock}} <!-- Promotional username, "
        "soft block -->",
        "blockexpiry": "infinity",
        "gender": "unknown"
    }
    blocked_info = api.APIExtractor.user_info_from_doc(blocked_user_doc)

    # (attribute, expected value) pairs, checked in order.
    expectations = [
        ("name", "Hoardablehotsauce"),
        ("groups", ['*', "user"]),
        ("implicitgroups", ['*', "user"]),
        ("registration", Timestamp("2015-02-28T22:25:37Z")),
        ("block_id", 5752570),
        ("blocked_by", "Cryptic"),
        ("blocked_by_id", 295294),
        ("blocked_timestamp", Timestamp("2015-02-28T22:43:23Z")),
        ("block_reason",
         "{{uw-softerblock}} <!-- Promotional username, soft block -->"),
        ("block_expiry", "infinity"),
        ("gender", "unknown"),
    ]
    for attribute, value in expectations:
        eq_(getattr(blocked_info, attribute), value)

    # Without registration/block fields the timestamps come back as None.
    minimal_doc = {
        "userid": 24278012,
        "name": "Hoardablehotsauce",
        "editcount": 5,
        "groups": ["*", "user"],
        "implicitgroups": ["*", "user"],
        "gender": "unknown"
    }
    minimal_info = api.APIExtractor.user_info_from_doc(minimal_doc)
    eq_(minimal_info.registration, None)
    eq_(minimal_info.blocked_timestamp, None)

    # A missing document yields no info at all.
    eq_(api.APIExtractor.user_info_from_doc(None), None)
예제 #6
0
def test_seconds_since():
    """Check ``seconds_since`` with and without a previous user revision."""
    Metadata = namedtuple("FakeRevisionMetadata", ['timestamp'])

    def solve_with(previous_metadata):
        return solve(seconds_since, cache={
            revision.metadata: Metadata(Timestamp(10)),
            previous_user_revision.metadata: previous_metadata
        })

    eq_(solve_with(Metadata(Timestamp(1))), 9)

    # Makes sure we don't crash when there was no previous user revision
    eq_(solve_with(None), 0)
예제 #7
0
def test_seconds_since():
    """Check ``seconds_since`` with and without a parent revision."""
    Metadata = namedtuple("FakeRevisionMetadata", ['timestamp'])

    def solve_with(parent_metadata):
        return solve(seconds_since, cache={
            revision.metadata: Metadata(Timestamp(10)),
            parent_revision.metadata: parent_metadata
        })

    eq_(solve_with(Metadata(Timestamp(1))), 9)

    # Make sure we don't error when there is no parent revision
    eq_(solve_with(None), 0)
예제 #8
0
def test_revision_metadata_from_doc():
    """Check that ``revision_metadata_from_doc`` maps doc fields to attributes.

    Builds a revision document with a nested ``page`` entry and verifies
    each attribute of the resulting metadata object.
    """
    doc = {
        "revid": 3456789,
        "parentid": 54678,
        "comment": "Wat?",
        # Must match the ``user_text`` assertion below; the fixture previously
        # held a masked placeholder ("******"), which made the test fail.
        "user": "EpochFail",
        "userid": 34567890,
        "timestamp": "2015-01-07T12:23:57Z",
        "page": {
            "pageid": 347,
            "title": "Hats",
            "ns": 0
        }
    }

    metadata = api.APIExtractor.revision_metadata_from_doc(doc)

    eq_(metadata.rev_id, 3456789)
    eq_(metadata.parent_id, 54678)
    eq_(metadata.user_id, 34567890)
    eq_(metadata.user_text, "EpochFail")
    eq_(metadata.timestamp, Timestamp("2015-01-07T12:23:57Z"))
    eq_(metadata.comment, "Wat?")
    eq_(metadata.page_id, 347)
    eq_(metadata.page_namespace, 0)
    eq_(metadata.page_title, "Hats")
예제 #9
0
def test_hour_of_day():
    """``hour_of_day`` solves to 19 for a revision saved at 19:55:00Z."""
    Metadata = namedtuple("FakeRevisionMetadata", ['timestamp'])
    saved_at = Timestamp('2014-09-07T19:55:00Z')

    cache = {}
    cache[revision.metadata] = Metadata(saved_at)

    eq_(solve(hour_of_day, cache=cache), 19)
예제 #10
0
def main(argv=None):
    """Parse the command line and pass the options on to ``run``.

    :Parameters:
        argv
            Argument list handed to docopt (None is passed through as-is).
    """
    args = docopt.docopt(__doc__, argv=argv)

    window_size = int(args['--window'])
    revert_radius = int(args['--revert-radius'])

    # "<now>" asks for the current wall-clock time as the sunset.
    sunset_option = args['--sunset']
    sunset = Timestamp(time.time()) if sunset_option == "<now>" \
        else Timestamp(sunset_option)

    keep_diff, verbose = bool(args['--keep-diff']), bool(args['--verbose'])

    run(read_docs(sys.stdin), window_size, revert_radius, sunset, keep_diff,
        verbose)
예제 #11
0
def test_age():
    """Check the user ``age`` feature against four metadata/info pairings."""
    Metadata = namedtuple("FakeRevisionMetadata",
                          ['user_id', 'timestamp'])
    Info = namedtuple("FakeUserInfo", ['registration'])

    def solve_age(metadata, info):
        return solve(age, cache={
            revision.metadata: metadata,
            user.info: info
        })

    # User 10 edits ten seconds after registering.
    eq_(solve_age(Metadata(10, Timestamp(10)), Info(Timestamp(0))), 10)

    # No user id on the revision: age is zero.
    eq_(solve_age(Metadata(None, Timestamp(10)), Info(Timestamp(0))), 0)

    # Makes sure that old users with no registration are counted appropriately.
    assert solve_age(Metadata(10, Timestamp("20140101010101")),
                     Info(None)) > 0

    # Makes sure that imports (revisions made before registration) don't return
    # negative values.
    eq_(solve_age(Metadata(10, Timestamp(0)), Info(Timestamp(1))), 0)
예제 #12
0
    def invisible_at(self, timestamp):
        """Stop the visibility clock at ``timestamp``.

        Adds the time elapsed since ``visible_since`` (never less than zero)
        to ``self.visible`` and then clears ``visible_since``.
        """
        timestamp = Timestamp(timestamp)
        started = self.visible_since

        # ``started`` can already be None with diff algorithms that will
        # detect content duplication; there is nothing to accumulate then.
        if started is not None:
            elapsed = timestamp - started
            self.visible += max(elapsed, 0)

        self.visible_since = None
예제 #13
0
 def __init__(self, id, name, editcount, registration, groups,
              implicitgroups, emailable, gender, block_id, blocked_by,
              blocked_by_id, blocked_timestamp, block_reason, block_expiry):
     """Store user fields, coercing each non-None value to its type.

     Integer, string and ``Timestamp`` fields keep ``None`` as ``None``;
     ``groups`` and ``implicitgroups`` fall back to an empty list when
     falsy; ``emailable`` is always coerced to ``bool``.
     """
     def coerce(cast, value):
         # None is passed through untouched rather than being converted.
         return None if value is None else cast(value)

     self.id = coerce(int, id)
     self.name = coerce(str, name)
     self.editcount = coerce(int, editcount)
     self.registration = coerce(Timestamp, registration)
     self.groups = groups or []
     self.implicitgroups = implicitgroups or []
     self.emailable = bool(emailable)
     self.gender = coerce(str, gender)
     self.block_id = coerce(int, block_id)
     self.blocked_by = coerce(str, blocked_by)
     self.blocked_by_id = coerce(int, blocked_by_id)
     self.blocked_timestamp = coerce(Timestamp, blocked_timestamp)
     self.block_reason = coerce(str, block_reason)
     self.block_expiry = coerce(str, block_expiry)
예제 #14
0
def generate_stats(doc, tokens_added, window, sunset):
    """Yield one persistence-statistics dict per token added by ``doc``.

    :Parameters:
        doc
            Revision document; its 'timestamp' and 'contributor' entries
            are read.
        tokens_added
            Iterable of tokens. Each must provide ``revisions``,
            ``seconds_visible(sunset)`` and a ``str()`` form.
        window
            Sized, indexable collection of ``(revision_doc, x)`` pairs for
            the revisions processed after ``doc``. Only the document of
            each pair is read here.
        sunset
            Point in time up to which visibility is measured. When None,
            the 'timestamp' of the last document in ``window`` is used.

    :Returns:
        A generator of dicts with the keys "token", "persisted",
        "processed", "non_self_persisted", "non_self_processed",
        "seconds_visible" and "seconds_possible".
    """
    revisions_processed = len(window)

    if sunset is None:
        # Use the last revision in the window
        sunset = window[-1][0]['timestamp']

    seconds_possible = max(Timestamp(sunset) - Timestamp(doc['timestamp']), 0)

    # This count depends only on ``doc`` and ``window``, so it is computed
    # once here instead of re-scanning the whole window for every token.
    non_self_processed = sum(doc['contributor'] != d['contributor']
                             for d, ts in window)

    for token in tokens_added:
        non_self_persisted = sum(doc['contributor'] != c
                                 for c in token.revisions)
        yield {
            "token": str(token),
            # The first entry of ``token.revisions`` is excluded from the count.
            "persisted": len(token.revisions[1:]),
            "processed": revisions_processed,
            "non_self_persisted": non_self_persisted,
            "non_self_processed": non_self_processed,
            "seconds_visible": token.seconds_visible(sunset),
            "seconds_possible": seconds_possible
        }
예제 #15
0
파일: api.py 프로젝트: 1ec5/revscoring
    def revision_metadata_from_doc(cls, rev_doc):
        """Build a ``RevisionMetadata`` from a revision document.

        Returns None when ``rev_doc`` is None. If ``Timestamp`` raises
        ``ValueError`` for the document's 'timestamp' value, the timestamp
        is passed on as None. Page fields are read from the nested 'page'
        entry, and ``minor`` is True when the key 'minor' is present.
        """
        if rev_doc is None:
            return None

        try:
            timestamp = Timestamp(rev_doc.get('timestamp'))
        except ValueError:
            timestamp = None

        # A document without a 'page' entry yields None for all page fields.
        page_doc = rev_doc.get('page', {})
        is_minor = 'minor' in rev_doc

        return RevisionMetadata(
            rev_doc.get('revid'),
            rev_doc.get('parentid'),
            rev_doc.get('user'),
            rev_doc.get('userid'),
            timestamp,
            rev_doc.get('comment'),
            page_doc.get('pageid'),
            page_doc.get('ns'),
            page_doc.get('title'),
            rev_doc.get('size'),
            is_minor
        )
예제 #16
0
 def __init__(self, rev_id, parent_id, user_text, user_id, timestamp,
              comment, page_id, page_namespace, page_title, bytes, minor):
     """Store revision fields, coercing each non-None value to its type.

     Integer, string and ``Timestamp`` fields keep ``None`` as ``None``;
     ``minor`` is always coerced to ``bool``.
     """
     def coerce(cast, value):
         # None is passed through untouched rather than being converted.
         return None if value is None else cast(value)

     self.rev_id = coerce(int, rev_id)
     self.parent_id = coerce(int, parent_id)
     self.user_text = coerce(str, user_text)
     self.user_id = coerce(int, user_id)
     self.timestamp = coerce(Timestamp, timestamp)
     self.comment = coerce(str, comment)
     self.page_id = coerce(int, page_id)
     self.page_namespace = coerce(int, page_namespace)
     self.page_title = coerce(str, page_title)
     self.bytes = coerce(int, bytes)
     self.minor = bool(minor)
예제 #17
0
파일: api.py 프로젝트: 1ec5/revscoring
    def user_info_from_doc(cls, user_doc):
        """Build a ``UserInfo`` from a user document.

        Returns None when ``user_doc`` is None. If ``Timestamp`` raises
        ``ValueError`` for the document's 'registration' value, the
        registration is passed on as None. ``emailable`` is True when the
        key "emailable" is present in the document.
        """
        if user_doc is None:
            return None

        try:
            registration = Timestamp(user_doc.get('registration'))
        except ValueError:
            registration = None

        field = user_doc.get
        return UserInfo(
            field('userid'),
            field('name'),
            field('editcount'),
            registration,
            field('groups', []),
            field('implicitgroups', []),
            "emailable" in user_doc,
            field('gender'),
            field('blockid'),
            field('blockedby'),
            field('blockedbyid'),
            field('blockedtimestamp'),
            field('blockreason'),
            field('blockexpiry')
        )
예제 #18
0
 def visible_at(self, timestamp):
     """Start the visibility clock at ``timestamp`` unless already running."""
     if self.visible_since is not None:
         # Already visible: keep the earlier start time.
         return
     self.visible_since = Timestamp(timestamp)
예제 #19
0
 def seconds_visible(self, sunset):
     """Return the accumulated visible time as of ``sunset``.

     ``sunset`` is converted with ``Timestamp``. While the clock is running
     (``visible_since`` is set), the time from ``visible_since`` to
     ``sunset`` is added to the stored ``self.visible`` total; otherwise
     the stored total is returned as-is.
     """
     sunset = Timestamp(sunset)
     # Identity test: None is a singleton, and ``!=`` would go through the
     # timestamp's own comparison method.
     if self.visible_since is not None:
         return self.visible + (sunset - self.visible_since)
     else:
         return self.visible
예제 #20
0
def run(db, user_ids, revert_radius, revert_window, session_cutoff):
    """Write one TSV row of first-week activity metrics per user to stdout.

    A header row built from ``HEADERS`` is printed first. For each user id,
    the revisions saved before the end of the first week after registration
    are counted by namespace group and by first-day/first-week, checked for
    reverts (main namespaces only) and fed through a session cache. Users
    without a recorded registration are skipped. Progress markers are
    written to stderr: "r" reverted, "." not reverted, "_" outside the main
    namespaces.

    :Parameters:
        db
            Database handle providing ``users.get`` and ``revisions.query``.
        user_ids
            Iterable of user ids to process.
        revert_radius
            Passed to ``reverts.database.check_row`` as ``radius``.
        revert_window
            Passed to ``reverts.database.check_row`` as ``window``.
        session_cutoff
            Passed to ``sessions.Cache`` as ``cutoff``.
    """
    print(tsv.encode_row(HEADERS))

    for user_id in user_ids:
        sys.stderr.write("{0}: ".format(user_id))
        row = defaultdict(int)  # Every counter starts at zero
        row['user_id'] = user_id
        row['surviving'] = False  # Preliminary value

        user = db.users.get(user_id)
        if user['user_registration'] is None:
            sys.stderr.write("<no registration>\n")
            continue
        registration = Timestamp(user['user_registration'])
        row['user_registration'] = registration.short_format()

        end_of_first_day = registration + 60 * 60 * 24  # One day
        end_of_first_week = registration + 60 * 60 * 24 * 7  # One week

        first_week_revisions = db.revisions.query(user_id=user_id,
                                                  direction="newer",
                                                  before=end_of_first_week,
                                                  include_page=True)

        # The registration itself is the first event of the first session.
        session_cache = sessions.Cache(cutoff=session_cutoff)
        session_cache.process(user_id, registration,
                              ("registration", registration))

        for rev in first_week_revisions:
            rev_timestamp = Timestamp(rev['rev_timestamp'])
            ns = rev['page_namespace']

            first_day = rev_timestamp <= end_of_first_day

            row['week_revisions'] += 1
            row['day_revisions'] += 1 if first_day else 0

            # Any edit at or after the trial period marks the user surviving.
            if rev_timestamp >= registration + TRIAL_PERIOD:
                row['surviving'] = True

            if ns in MAIN_NAMESPACES:
                row['week_main_revisions'] += 1
                row['day_main_revisions'] += 1 if first_day else 0

                revert = reverts.database.check_row(db,
                                                    rev,
                                                    radius=revert_radius,
                                                    window=revert_window)

                if revert is not None:  # Reverted edit!
                    # NOTE(review): this writes the raw revision to stdout,
                    # in between the TSV rows. It looks like leftover debug
                    # output; confirm before removing.
                    print(rev)
                    row['week_reverted_main_revisions'] += 1
                    row['day_reverted_main_revisions'] += 1 if first_day else 0
                    sys.stderr.write("r")
                    sys.stderr.flush()
                else:
                    sys.stderr.write(".")
                    sys.stderr.flush()
            else:
                row['week_wp_revisions'] += 1 if ns in WP_NAMESPACES else 0
                row['day_wp_revisions'] += 1 if first_day and \
                                                ns in WP_NAMESPACES else 0
                row['week_user_revisions'] += 1 if ns in USER_NAMESPACES else 0
                row['day_user_revisions'] += 1 if first_day and \
                                                ns in USER_NAMESPACES else 0
                row['week_talk_revisions'] += 1 if ns in TALK_NAMESPACES else 0
                row['day_talk_revisions'] += 1 if first_day and \
                                                ns in TALK_NAMESPACES else 0
                sys.stderr.write("_")
                sys.stderr.flush()

            user_sessions = session_cache.process(user_id, rev_timestamp,
                                                  ("edit", rev_timestamp))
            update_row_with_session_metrics(row, user_sessions)

        # Account for sessions still open after the last revision.
        user_sessions = session_cache.get_active_sessions()
        update_row_with_session_metrics(row, user_sessions)

        sys.stderr.write("\n")
        sys.stdout.write(tsv.encode_row(row, headers=HEADERS))
        sys.stdout.write("\n")
예제 #21
0
"""
Demonstrates some simple Timestamp operations.

Each expression below is followed by a ``# >`` comment showing the value it
evaluates to.
"""
from mw import Timestamp

# Seconds since Unix Epoch, rendered as a string
str(Timestamp(1234567890))
# > '20090213233130'

# Database format, converted back to seconds since Unix Epoch
int(Timestamp("20090213233130"))
# > 1234567890

# API format, converted back to seconds since Unix Epoch
int(Timestamp("2009-02-13T23:31:30Z"))
# > 1234567890

# Difference in seconds between two Timestamps
Timestamp("2009-02-13T23:31:31Z") - Timestamp(1234567890)
# > 1

# strptime and strftime: literal text in the format string is passed through
Timestamp(1234567890).strftime("%Y foobar")
# > '2009 foobar'

str(Timestamp.strptime("2009 derp 10", "%Y derp %m"))
# > '20091001000000'
예제 #22
0
파일: user.py 프로젝트: SPQRobin/revscoring
import re

from mw import Timestamp

from ..datasources import revision, user
from .feature import Feature

# Date that registrations started being recorded in MediaWiki.  Used by
# process_age() as the registration time when a user has none recorded.
USER_REGISTRATION_EPOCH = Timestamp("20050101000000")


def process_age(user_info, revision_metadata):
    """Return the time from the user's registration to the revision.

    Returns 0 when there is no user info or when ``process_is_anon`` reports
    the revision as anonymous. A falsy registration is replaced by
    ``USER_REGISTRATION_EPOCH``, and the result is never negative.
    """
    if user_info is None:
        return 0
    if process_is_anon(revision_metadata):  # Anonymous so age == zero
        return 0

    saved_at = revision_metadata.timestamp
    registered_at = user_info.registration or USER_REGISTRATION_EPOCH
    # Clamp at zero for revisions dated before the registration.
    return max(saved_at - registered_at, 0)


# Feature named "user.age": computed by process_age() from the user.info and
# revision.metadata datasources, declared as returning an int.
age = Feature("user.age",
              process_age,
              returns=int,
              depends_on=[user.info, revision.metadata])
"""
Represents age of user when the edit was made in seconds.

:Returns:
    int