def test_extract_cite_history():
    """extract_cite_history() should emit one row per identifier at the
    first revision in which that identifier appeared."""
    FakeRevision = namedtuple("Revision", ['id', 'timestamp', 'text'])
    FakeExtractor = namedtuple("Extractor", ['extract'])

    history = ["id1 id2",
               "id1 id3",
               "id1 id2 id3",
               "id1 id2 id4",
               "id1 id2 id4"]

    class FakePage:
        def __init__(self, id, title):
            self.id = id
            self.title = title

        def __iter__(self):
            # Revision id and timestamp both track the 1-based position.
            return iter([FakeRevision(i, Timestamp(i), text)
                         for i, text in enumerate(history, start=1)])

    fake_page = FakePage(1, "Title")

    def extract(text):
        # One fake Identifier per whitespace-separated token.
        return (Identifier('fake', ident) for ident in text.split(" "))
    extractor = FakeExtractor(extract)

    expected = [(1, "Title", 1, Timestamp(1), "fake", "id1"),
                (1, "Title", 1, Timestamp(1), "fake", "id2"),
                (1, "Title", 4, Timestamp(4), "fake", "id4")]

    citations = list(extract_cite_history(fake_page, [extractor]))
    eq_(len(citations), len(expected))
    for cite in extract_cite_history(fake_page, [extractor]):
        assert cite in expected
def test_age():
    """age should be the seconds between page creation and the revision
    being scored (10 - 0 == 10 here)."""
    FakeRevisionMetadata = namedtuple("FakeRevisionMetadata", ['timestamp'])
    fake_cache = {
        revision.metadata: FakeRevisionMetadata(Timestamp(10)),
        page_creation.metadata: FakeRevisionMetadata(Timestamp(0)),
    }
    eq_(solve(age, cache=fake_cache), 10)
def test_seconds_since():
    """seconds_since should be the revision timestamp minus the parent
    revision's timestamp (10 - 1 == 9)."""
    FakeRevisionMetadata = namedtuple("FakeRevisionMetadata", ['timestamp'])
    fake_cache = {
        revision.metadata: FakeRevisionMetadata(Timestamp(10)),
        parent_revision.metadata: FakeRevisionMetadata(Timestamp(1)),
    }
    eq_(solve(seconds_since, cache=fake_cache), 9)
def __iter__(self):
    """Iterate over a canned, five-revision page history."""
    texts = ["id1 id2",
             "id1 id3",
             "id1 id2 id3",
             "id1 id2 id4",
             "id1 id2 id4"]
    # Revision id and timestamp both track the 1-based position.
    return iter([FakeRevision(i, Timestamp(i), text)
                 for i, text in enumerate(texts, start=1)])
def test_user_info_from_doc():
    """user_info_from_doc() should map API user docs onto UserInfo fields,
    leave missing timestamps as None, and pass a None doc straight
    through."""
    block_reason = ("{{uw-softerblock}} <!-- Promotional username, "
                    "soft block -->")
    full_doc = {
        "userid": 24278012,
        "name": "Hoardablehotsauce",
        "editcount": 5,
        "registration": "2015-02-28T22:25:37Z",
        "groups": ["*", "user"],
        "implicitgroups": ["*", "user"],
        "blockid": "5752570",
        "blockedby": "Cryptic",
        "blockedbyid": "295294",
        "blockedtimestamp": "2015-02-28T22:43:23Z",
        "blockreason": block_reason,
        "blockexpiry": "infinity",
        "gender": "unknown"
    }
    info = api.APIExtractor.user_info_from_doc(full_doc)
    eq_(info.name, "Hoardablehotsauce")
    eq_(info.groups, ['*', "user"])
    eq_(info.implicitgroups, ['*', "user"])
    eq_(info.registration, Timestamp("2015-02-28T22:25:37Z"))
    # Numeric block fields arrive as strings and should be converted.
    eq_(info.block_id, 5752570)
    eq_(info.blocked_by, "Cryptic")
    eq_(info.blocked_by_id, 295294)
    eq_(info.blocked_timestamp, Timestamp("2015-02-28T22:43:23Z"))
    eq_(info.block_reason, block_reason)
    eq_(info.block_expiry, "infinity")
    eq_(info.gender, "unknown")

    # A doc without registration/block info should produce None timestamps.
    minimal_doc = {
        "userid": 24278012,
        "name": "Hoardablehotsauce",
        "editcount": 5,
        "groups": ["*", "user"],
        "implicitgroups": ["*", "user"],
        "gender": "unknown"
    }
    info = api.APIExtractor.user_info_from_doc(minimal_doc)
    eq_(info.registration, None)
    eq_(info.blocked_timestamp, None)

    # None in, None out.
    eq_(api.APIExtractor.user_info_from_doc(None), None)
def test_seconds_since():
    """seconds_since relative to the previous revision by the same user,
    including the case where no such revision exists."""
    FakeRevisionMetadata = namedtuple("FakeRevisionMetadata", ['timestamp'])

    with_previous = {
        revision.metadata: FakeRevisionMetadata(Timestamp(10)),
        previous_user_revision.metadata: FakeRevisionMetadata(Timestamp(1)),
    }
    eq_(solve(seconds_since, cache=with_previous), 9)

    # Makes sure we don't crash when there was no previous user revision
    without_previous = {
        revision.metadata: FakeRevisionMetadata(Timestamp(10)),
        previous_user_revision.metadata: None,
    }
    eq_(solve(seconds_since, cache=without_previous), 0)
def test_seconds_since():
    """seconds_since relative to the parent revision, including the case
    where there is no parent (first revision of a page)."""
    FakeRevisionMetadata = namedtuple("FakeRevisionMetadata", ['timestamp'])

    with_parent = {
        revision.metadata: FakeRevisionMetadata(Timestamp(10)),
        parent_revision.metadata: FakeRevisionMetadata(Timestamp(1)),
    }
    eq_(solve(seconds_since, cache=with_parent), 9)

    # Make sure we don't error when there is no parent revision
    without_parent = {
        revision.metadata: FakeRevisionMetadata(Timestamp(10)),
        parent_revision.metadata: None,
    }
    eq_(solve(seconds_since, cache=without_parent), 0)
def test_revision_metadata_from_doc():
    """revision_metadata_from_doc() should map an API revision doc onto
    RevisionMetadata fields, including the nested page info."""
    doc = {
        "revid": 3456789,
        "parentid": 54678,
        "comment": "Wat?",
        # Fixed: this value was "******" (an apparent redaction), which
        # contradicted the user_text assertion below and made the test fail.
        "user": "EpochFail",
        "userid": 34567890,
        "timestamp": "2015-01-07T12:23:57Z",
        "page": {
            "pageid": 347,
            "title": "Hats",
            "ns": 0
        }
    }
    metadata = api.APIExtractor.revision_metadata_from_doc(doc)
    eq_(metadata.rev_id, 3456789)
    eq_(metadata.parent_id, 54678)
    eq_(metadata.user_id, 34567890)
    eq_(metadata.user_text, "EpochFail")
    eq_(metadata.timestamp, Timestamp("2015-01-07T12:23:57Z"))
    eq_(metadata.comment, "Wat?")
    eq_(metadata.page_id, 347)
    eq_(metadata.page_namespace, 0)
    eq_(metadata.page_title, "Hats")
def test_hour_of_day():
    """hour_of_day should extract the hour (19) from the revision
    timestamp."""
    FakeRevisionMetadata = namedtuple("FakeRevisionMetadata", ['timestamp'])
    when = Timestamp('2014-09-07T19:55:00Z')
    fake_cache = {revision.metadata: FakeRevisionMetadata(when)}
    eq_(solve(hour_of_day, cache=fake_cache), 19)
def main(argv=None):
    """Parse command-line arguments (via docopt) and hand off to run()."""
    args = docopt.docopt(__doc__, argv=argv)

    window_size = int(args['--window'])
    revert_radius = int(args['--revert-radius'])

    # The literal string "<now>" means "use the current time as sunset".
    raw_sunset = args['--sunset']
    sunset = (Timestamp(time.time()) if raw_sunset == "<now>"
              else Timestamp(raw_sunset))

    keep_diff = bool(args['--keep-diff'])
    verbose = bool(args['--verbose'])

    run(read_docs(sys.stdin), window_size, revert_radius, sunset,
        keep_diff, verbose)
def test_age():
    """user age at edit time: registration-to-revision delta, zero for
    anons, clamped at zero, and non-zero for old users lacking a recorded
    registration."""
    FakeRevisionMetadata = namedtuple("FakeRevisionMetadata",
                                      ['user_id', 'timestamp'])
    FakeUserInfo = namedtuple("FakeUserInfo", ['registration'])

    registered = {
        revision.metadata: FakeRevisionMetadata(10, Timestamp(10)),
        user.info: FakeUserInfo(Timestamp(0)),
    }
    eq_(solve(age, cache=registered), 10)

    # No user_id -> anonymous editor -> age is zero.
    anon = {
        revision.metadata: FakeRevisionMetadata(None, Timestamp(10)),
        user.info: FakeUserInfo(Timestamp(0)),
    }
    eq_(solve(age, cache=anon), 0)

    # Makes sure that old users with no registration are counted
    # appropriately.
    no_registration = {
        revision.metadata:
            FakeRevisionMetadata(10, Timestamp("20140101010101")),
        user.info: FakeUserInfo(None),
    }
    assert solve(age, cache=no_registration) > 0

    # Makes sure that imports (revisions made before registration) don't
    # return negative values.
    imported = {
        revision.metadata: FakeRevisionMetadata(10, Timestamp(0)),
        user.info: FakeUserInfo(Timestamp(1)),
    }
    eq_(solve(age, cache=imported), 0)
def invisible_at(self, timestamp):
    """Close the current visibility span at `timestamp`, accumulating the
    elapsed (non-negative) seconds into self.visible."""
    when = Timestamp(timestamp)
    # visible_since can be None with diff algorithms that detect content
    # duplication; in that case there is nothing to accumulate.
    if self.visible_since is not None:
        elapsed = when - self.visible_since
        self.visible += max(elapsed, 0)
    self.visible_since = None
def __init__(self, id, name, editcount, registration, groups,
             implicitgroups, emailable, gender, block_id, blocked_by,
             blocked_by_id, blocked_timestamp, block_reason, block_expiry):
    """Normalize raw API user fields onto typed attributes.

    Missing (None) values stay None; group lists default to empty lists;
    emailable is coerced to bool.
    """
    def opt(convert, value):
        # Apply `convert` unless the value is missing.
        return convert(value) if value is not None else None

    self.id = opt(int, id)
    self.name = opt(str, name)
    self.editcount = opt(int, editcount)
    self.registration = opt(Timestamp, registration)
    self.groups = groups or []
    self.implicitgroups = implicitgroups or []
    self.emailable = bool(emailable)
    self.gender = opt(str, gender)
    self.block_id = opt(int, block_id)
    self.blocked_by = opt(str, blocked_by)
    self.blocked_by_id = opt(int, blocked_by_id)
    self.blocked_timestamp = opt(Timestamp, blocked_timestamp)
    self.block_reason = opt(str, block_reason)
    self.block_expiry = opt(str, block_expiry)
def generate_stats(doc, tokens_added, window, sunset):
    """Yield one persistence-stats dict per token added by `doc`.

    `window` is a sequence of (doc, tokens) pairs; `sunset` bounds the
    visibility window (falling back to the last revision in the window).
    """
    revisions_processed = len(window)
    if sunset is None:
        # Use the last revision in the window
        sunset = window[-1][0]['timestamp']
    seconds_possible = max(
        Timestamp(sunset) - Timestamp(doc['timestamp']), 0)

    for token in tokens_added:
        non_self_persisted = sum(doc['contributor'] != rev_contributor
                                 for rev_contributor in token.revisions)
        non_self_processed = sum(doc['contributor'] != w_doc['contributor']
                                 for w_doc, _ in window)
        yield {
            "token": str(token),
            "persisted": len(token.revisions[1:]),
            "processed": revisions_processed,
            "non_self_persisted": non_self_persisted,
            "non_self_processed": non_self_processed,
            "seconds_visible": token.seconds_visible(sunset),
            "seconds_possible": seconds_possible
        }
def revision_metadata_from_doc(cls, rev_doc):
    """Build a RevisionMetadata from an API revision doc; None → None.

    A timestamp that fails to parse is recorded as None rather than
    raising.
    """
    if rev_doc is None:
        return None

    try:
        timestamp = Timestamp(rev_doc.get('timestamp'))
    except ValueError:
        timestamp = None

    page_doc = rev_doc.get('page', {})
    return RevisionMetadata(
        rev_doc.get('revid'),
        rev_doc.get('parentid'),
        rev_doc.get('user'),
        rev_doc.get('userid'),
        timestamp,
        rev_doc.get('comment'),
        page_doc.get('pageid'),
        page_doc.get('ns'),
        page_doc.get('title'),
        rev_doc.get('size'),
        'minor' in rev_doc
    )
def __init__(self, rev_id, parent_id, user_text, user_id, timestamp,
             comment, page_id, page_namespace, page_title, bytes, minor):
    """Normalize raw revision fields onto typed attributes.

    Missing (None) values stay None; `minor` is coerced to bool.
    """
    def opt(convert, value):
        # Apply `convert` unless the value is missing.
        return convert(value) if value is not None else None

    self.rev_id = opt(int, rev_id)
    self.parent_id = opt(int, parent_id)
    self.user_text = opt(str, user_text)
    self.user_id = opt(int, user_id)
    self.timestamp = opt(Timestamp, timestamp)
    self.comment = opt(str, comment)
    self.page_id = opt(int, page_id)
    self.page_namespace = opt(int, page_namespace)
    self.page_title = opt(str, page_title)
    self.bytes = opt(int, bytes)
    self.minor = bool(minor)
def user_info_from_doc(cls, user_doc):
    """Build a UserInfo from an API user doc; None → None.

    A registration timestamp that fails to parse is recorded as None
    rather than raising.
    """
    if user_doc is None:
        return None

    get = user_doc.get
    try:
        registration = Timestamp(get('registration'))
    except ValueError:
        registration = None

    return UserInfo(get('userid'), get('name'), get('editcount'),
                    registration, get('groups', []),
                    get('implicitgroups', []), "emailable" in user_doc,
                    get('gender'), get('blockid'), get('blockedby'),
                    get('blockedbyid'), get('blockedtimestamp'),
                    get('blockreason'), get('blockexpiry'))
def visible_at(self, timestamp):
    """Open a visibility span at `timestamp` unless one is already open."""
    if self.visible_since is not None:
        return
    self.visible_since = Timestamp(timestamp)
def seconds_visible(self, sunset):
    """Return the total seconds this token has been visible as of `sunset`.

    Includes the still-open visibility span (if any) up to `sunset`.
    """
    sunset = Timestamp(sunset)
    # `is not None` (identity check) rather than `!= None` — the Pythonic
    # idiom, and consistent with invisible_at()/visible_at().
    if self.visible_since is not None:
        return self.visible + (sunset - self.visible_since)
    else:
        return self.visible
def run(db, user_ids, revert_radius, revert_window, session_cutoff):
    """For each user, tally first-day/first-week editing activity and
    session metrics, writing one TSV row per user to stdout.

    Progress indicators go to stderr ("r" reverted main edit, "." clean
    main edit, "_" non-main edit); the TSV stream on stdout must stay
    clean.
    """
    print(tsv.encode_row(HEADERS))
    for user_id in user_ids:
        sys.stderr.write("{0}: ".format(user_id))

        row = defaultdict(int)  # idiomatic equivalent of lambda: 0
        row['user_id'] = user_id
        row['surviving'] = False  # Preliminary value

        user = db.users.get(user_id)
        if user['user_registration'] is None:
            sys.stderr.write("<no registration>\n")
            continue

        registration = Timestamp(user['user_registration'])
        row['user_registration'] = registration.short_format()
        end_of_first_day = registration + 60 * 60 * 24  # One day
        end_of_first_week = registration + 60 * 60 * 24 * 7  # One week

        first_week_revisions = db.revisions.query(user_id=user_id,
                                                  direction="newer",
                                                  before=end_of_first_week,
                                                  include_page=True)

        session_cache = sessions.Cache(cutoff=session_cutoff)
        session_cache.process(user_id, registration,
                              ("registration", registration))

        for rev in first_week_revisions:
            rev_timestamp = Timestamp(rev['rev_timestamp'])
            ns = rev['page_namespace']
            first_day = rev_timestamp <= end_of_first_day

            row['week_revisions'] += 1
            row['day_revisions'] += 1 if first_day else 0

            if rev_timestamp >= registration + TRIAL_PERIOD:
                row['surviving'] = True

            if ns in MAIN_NAMESPACES:
                row['week_main_revisions'] += 1
                row['day_main_revisions'] += 1 if first_day else 0

                revert = reverts.database.check_row(db, rev,
                                                    radius=revert_radius,
                                                    window=revert_window)
                if revert is not None:
                    # Reverted edit!
                    # BUGFIX: removed a leftover debug `print(rev)` here —
                    # it wrote raw revision dicts to stdout, corrupting the
                    # TSV output stream.
                    row['week_reverted_main_revisions'] += 1
                    row['day_reverted_main_revisions'] += \
                        1 if first_day else 0
                    sys.stderr.write("r")
                    sys.stderr.flush()
                else:
                    sys.stderr.write(".")
                    sys.stderr.flush()
            else:
                row['week_wp_revisions'] += 1 if ns in WP_NAMESPACES else 0
                row['day_wp_revisions'] += 1 if first_day and \
                                                ns in WP_NAMESPACES else 0
                row['week_user_revisions'] += \
                    1 if ns in USER_NAMESPACES else 0
                row['day_user_revisions'] += 1 if first_day and \
                                                  ns in USER_NAMESPACES else 0
                row['week_talk_revisions'] += \
                    1 if ns in TALK_NAMESPACES else 0
                row['day_talk_revisions'] += 1 if first_day and \
                                                  ns in TALK_NAMESPACES else 0
                sys.stderr.write("_")
                sys.stderr.flush()

            user_sessions = session_cache.process(user_id, rev_timestamp,
                                                  ("edit", rev_timestamp))
            update_row_with_session_metrics(row, user_sessions)

        user_sessions = session_cache.get_active_sessions()
        update_row_with_session_metrics(row, user_sessions)

        sys.stderr.write("\n")
        sys.stdout.write(tsv.encode_row(row, headers=HEADERS))
        sys.stdout.write("\n")
"""
Demonstrates some simple Timestamp operations
"""
from mw import Timestamp

# Construct from seconds since Unix Epoch
str(Timestamp(1234567890))
# > '20090213233130'

# Construct from database format
int(Timestamp("20090213233130"))
# > 1234567890

# Construct from API format
int(Timestamp("2009-02-13T23:31:30Z"))
# > 1234567890

# Subtracting two Timestamps gives the difference in seconds
Timestamp("2009-02-13T23:31:31Z") - Timestamp(1234567890)
# > 1

# strptime and strftime work with arbitrary format strings
Timestamp(1234567890).strftime("%Y foobar")
# > '2009 foobar'
str(Timestamp.strptime("2009 derp 10", "%Y derp %m"))
# > '20091001000000'
import re

from mw import Timestamp

from ..datasources import revision, user
from .feature import Feature

# Date that registrations started being recorded in MediaWiki
USER_REGISTRATION_EPOCH = Timestamp("20050101000000")


def process_age(user_info, revision_metadata):
    # No user info available -> report zero age.
    if user_info is None:
        return 0

    if process_is_anon(revision_metadata):
        # Anonymous so age == zero
        return 0
    else:
        # Seconds between registration and the revision.  Users with no
        # recorded registration fall back to the epoch above; the result is
        # clamped at zero because imported revisions can predate
        # registration.
        registration_delta = revision_metadata.timestamp - \
            (user_info.registration or USER_REGISTRATION_EPOCH)
        return max(registration_delta, 0)

age = Feature("user.age", process_age,
              returns=int,
              depends_on=[user.info, revision.metadata])
"""
Represents age of user when the edit was made in seconds.

:Returns:
    int