def test_extract_cite_history():
    """extract_cite_history() should yield one record per identifier, keyed to
    the first revision in which that identifier appeared, and only for
    identifiers still present in the latest revision (id3 disappears, so it is
    not expected)."""
    FakeRevision = namedtuple("Revision", ['id', 'timestamp', 'text'])
    FakeExtractor = namedtuple("Extractor", ['extract'])

    class FakePage:
        def __init__(self, id, title):
            self.id = id
            self.title = title

        def __iter__(self):
            return iter([
                FakeRevision(1, Timestamp(1), "id1 id2"),
                FakeRevision(2, Timestamp(2), "id1 id3"),
                FakeRevision(3, Timestamp(3), "id1 id2 id3"),
                FakeRevision(4, Timestamp(4), "id1 id2 id4"),
                FakeRevision(5, Timestamp(5), "id1 id2 id4"),
            ])

    fake_page = FakePage(1, "Title")

    def extract(text):
        # `ident`, not `id` -- avoid shadowing the builtin
        return (Identifier('fake', ident) for ident in text.split(" "))
    extractor = FakeExtractor(extract)

    expected = [(1, "Title", 1, Timestamp(1), "fake", "id1"),
                (1, "Title", 1, Timestamp(1), "fake", "id2"),
                (1, "Title", 4, Timestamp(4), "fake", "id4")]

    citations = list(extract_cite_history(fake_page, [extractor]))
    eq_(len(citations), len(expected))
    # Reuse the list we already materialized rather than re-running the
    # (potentially expensive) extraction a second time.
    for cite in citations:
        assert cite in expected
def test_age():
    """age solves to the revision timestamp minus the page-creation
    timestamp (10 - 0)."""
    FakeRevisionMetadata = namedtuple("FakeRevisionMetadata", ['timestamp'])

    eq_(solve(age, cache={
        revision.metadata: FakeRevisionMetadata(Timestamp(10)),
        page_creation.metadata: FakeRevisionMetadata(Timestamp(0)),
    }), 10)
def test_seconds_since():
    """seconds_since solves to the revision timestamp minus the parent
    revision's timestamp (10 - 1)."""
    FakeRevisionMetadata = namedtuple("FakeRevisionMetadata", ['timestamp'])

    eq_(solve(seconds_since, cache={
        revision.metadata: FakeRevisionMetadata(Timestamp(10)),
        parent_revision.metadata: FakeRevisionMetadata(Timestamp(1)),
    }), 9)
def __iter__(self):
    """Yield the fixed sequence of fake revisions, ids/timestamps 1..5."""
    texts = [
        "id1 id2",
        "id1 id3",
        "id1 id2 id3",
        "id1 id2 id4",
        "id1 id2 id4",
    ]
    return iter([FakeRevision(i, Timestamp(i), text)
                 for i, text in enumerate(texts, start=1)])
def test_user_info_from_doc():
    """user_info_from_doc: a fully-populated (blocked) user document, a
    minimal document without timestamps, and a missing document."""
    # Fully-populated, blocked user
    doc = {
        "userid": 24278012,
        "name": "Hoardablehotsauce",
        "editcount": 5,
        "registration": "2015-02-28T22:25:37Z",
        "groups": ["*", "user"],
        "implicitgroups": ["*", "user"],
        "blockid": "5752570",
        "blockedby": "Cryptic",
        "blockedbyid": "295294",
        "blockedtimestamp": "2015-02-28T22:43:23Z",
        "blockreason": "{{uw-softerblock}} <!-- Promotional username, "
                       "soft block -->",
        "blockexpiry": "infinity",
        "gender": "unknown"
    }
    info = api.APIExtractor.user_info_from_doc(doc)
    eq_(info.name, "Hoardablehotsauce")
    eq_(info.groups, ['*', "user"])
    eq_(info.implicitgroups, ['*', "user"])
    eq_(info.registration, Timestamp("2015-02-28T22:25:37Z"))
    eq_(info.block_id, 5752570)
    eq_(info.blocked_by, "Cryptic")
    eq_(info.blocked_by_id, 295294)
    eq_(info.blocked_timestamp, Timestamp("2015-02-28T22:43:23Z"))
    eq_(info.block_reason,
        "{{uw-softerblock}} <!-- Promotional username, soft block -->")
    eq_(info.block_expiry, "infinity")
    eq_(info.gender, "unknown")

    # Timestamps absent from the document come back as None
    doc = {
        "userid": 24278012,
        "name": "Hoardablehotsauce",
        "editcount": 5,
        "groups": ["*", "user"],
        "implicitgroups": ["*", "user"],
        "gender": "unknown"
    }
    info = api.APIExtractor.user_info_from_doc(doc)
    eq_(info.registration, None)
    eq_(info.blocked_timestamp, None)

    # A missing document maps to None
    eq_(api.APIExtractor.user_info_from_doc(None), None)
def test_seconds_since():
    """seconds_since against the user's previous revision; falls back to
    zero when there is no previous user revision."""
    FakeRevisionMetadata = namedtuple("FakeRevisionMetadata", ['timestamp'])

    # Normal case: 10 - 1 seconds since the previous user revision
    eq_(solve(seconds_since, cache={
        revision.metadata: FakeRevisionMetadata(Timestamp(10)),
        previous_user_revision.metadata: FakeRevisionMetadata(Timestamp(1)),
    }), 9)

    # Makes sure we don't crash when there was no previous user revision
    eq_(solve(seconds_since, cache={
        revision.metadata: FakeRevisionMetadata(Timestamp(10)),
        previous_user_revision.metadata: None,
    }), 0)
def test_revision_metadata_from_doc():
    """revision_metadata_from_doc maps an API revision document's fields onto
    RevisionMetadata attributes (revid -> rev_id, user -> user_text, and the
    nested page doc -> page_* fields)."""
    doc = {
        "revid": 3456789,
        "parentid": 54678,
        "comment": "Wat?",
        # Was "******" (apparently redacted), which contradicted the
        # user_text assertion below and made this test fail.
        "user": "EpochFail",
        "userid": 34567890,
        "timestamp": "2015-01-07T12:23:57Z",
        "page": {
            "pageid": 347,
            "title": "Hats",
            "ns": 0
        }
    }
    metadata = api.APIExtractor.revision_metadata_from_doc(doc)
    eq_(metadata.rev_id, 3456789)
    eq_(metadata.parent_id, 54678)
    eq_(metadata.user_id, 34567890)
    eq_(metadata.user_text, "EpochFail")
    eq_(metadata.timestamp, Timestamp("2015-01-07T12:23:57Z"))
    eq_(metadata.comment, "Wat?")
    eq_(metadata.page_id, 347)
    eq_(metadata.page_namespace, 0)
    eq_(metadata.page_title, "Hats")
def test_hour_of_day():
    """hour_of_day extracts the hour (19) from the revision timestamp."""
    FakeRevisionMetadata = namedtuple("FakeRevisionMetadata", ['timestamp'])
    when = Timestamp('2014-09-07T19:55:00Z')

    eq_(solve(hour_of_day,
              cache={revision.metadata: FakeRevisionMetadata(when)}),
        19)
def test_seconds_since():
    """seconds_since against the parent revision; falls back to zero when
    there is no parent revision."""
    FakeRevisionMetadata = namedtuple("FakeRevisionMetadata", ['timestamp'])

    # Normal case: 10 - 1 seconds since the parent revision
    eq_(solve(seconds_since, cache={
        revision.metadata: FakeRevisionMetadata(Timestamp(10)),
        parent_revision.metadata: FakeRevisionMetadata(Timestamp(1)),
    }), 9)

    # Make sure we don't error when there is no parent revision
    eq_(solve(seconds_since, cache={
        revision.metadata: FakeRevisionMetadata(Timestamp(10)),
        parent_revision.metadata: None,
    }), 0)
def main(argv=None):
    """Parse command-line arguments and hand off to run()."""
    args = docopt.docopt(__doc__, argv=argv)

    window_size = int(args['--window'])
    revert_radius = int(args['--revert-radius'])

    # "<now>" is the sentinel for "use the current time as the sunset".
    sunset = (Timestamp(time.time())
              if args['--sunset'] == "<now>"
              else Timestamp(args['--sunset']))

    keep_diff = bool(args['--keep-diff'])
    verbose = bool(args['--verbose'])

    run(read_docs(sys.stdin), window_size, revert_radius, sunset,
        keep_diff, verbose)
def test_age():
    """user.age across registered, anonymous, unrecorded-registration and
    pre-registration (import) revisions."""
    FakeRevisionMetadata = namedtuple("FakeRevisionMetadata",
                                      ['user_id', 'timestamp'])
    FakeUserInfo = namedtuple("FakeUserInfo", ['registration'])

    def build_cache(user_id, rev_timestamp, registration):
        return {
            revision.metadata: FakeRevisionMetadata(user_id, rev_timestamp),
            user.info: FakeUserInfo(registration),
        }

    # Registered user: revision time minus registration time.
    eq_(solve(age, cache=build_cache(10, Timestamp(10), Timestamp(0))), 10)

    # Anonymous user (no user_id): age is zero.
    eq_(solve(age, cache=build_cache(None, Timestamp(10), Timestamp(0))), 0)

    # Makes sure that old users with no registration are counted
    # appropriately.
    assert solve(age,
                 cache=build_cache(10, Timestamp("20140101010101"), None)) > 0

    # Makes sure that imports (revisions made before registration) don't
    # return negative values.
    eq_(solve(age, cache=build_cache(10, Timestamp(0), Timestamp(1))), 0)
def invisible_at(self, timestamp):
    """Record that this token stopped being visible at `timestamp`,
    accumulating the (non-negative) span since it became visible."""
    timestamp = Timestamp(timestamp)
    if self.visible_since is not None:
        delta = timestamp - self.visible_since
        # Clamp at zero so out-of-order timestamps can't subtract time.
        if delta > 0:
            self.visible += delta
    # else: no visible-since mark to close.  This happens with diff
    # algorithms that will detect content duplication.
    self.visible_since = None
def __init__(self, id, name, editcount, registration, groups,
             implicitgroups, emailable, gender, block_id, blocked_by,
             blocked_by_id, blocked_timestamp, block_reason, block_expiry):
    """
    Store user-info fields, coercing each scalar to its canonical type
    (int/str/Timestamp) while preserving None for missing values.
    """
    # Scalar fields: coerce when present, keep None when absent.
    self.id = int(id) if id is not None else None
    self.name = str(name) if name is not None else None
    self.editcount = int(editcount) if editcount is not None else None
    # Registration is converted via Timestamp when present.
    self.registration = Timestamp(registration) \
        if registration is not None else None
    # Group lists default to [] rather than None.
    self.groups = groups or []
    self.implicitgroups = implicitgroups or []
    self.emailable = bool(emailable)
    self.gender = str(gender) if gender is not None else None
    # Block-related fields remain None for unblocked users.
    self.block_id = int(block_id) if block_id is not None else None
    self.blocked_by = str(blocked_by) if blocked_by is not None else None
    self.blocked_by_id = int(blocked_by_id) \
        if blocked_by_id is not None else None
    self.blocked_timestamp = Timestamp(blocked_timestamp) \
        if blocked_timestamp is not None else None
    self.block_reason = str(block_reason) \
        if block_reason is not None else None
    self.block_expiry = str(block_expiry) \
        if block_expiry is not None else None
def generate_stats(doc, tokens_added, window, sunset):
    """Yield one persistence-statistics dict per token added by `doc`,
    measured against the revisions in `window` up to `sunset`."""
    revisions_processed = len(window)

    if sunset is None:
        # Use the last revision in the window
        sunset = window[-1][0]['timestamp']

    seconds_possible = max(
        Timestamp(sunset) - Timestamp(doc['timestamp']), 0)

    contributor = doc['contributor']
    for token in tokens_added:
        non_self_persisted = sum(contributor != c
                                 for c in token.revisions)
        non_self_processed = sum(contributor != d['contributor']
                                 for d, ts in window)
        yield {
            "token": str(token),
            "persisted": len(token.revisions[1:]),
            "processed": revisions_processed,
            "non_self_persisted": non_self_persisted,
            "non_self_processed": non_self_processed,
            "seconds_visible": token.seconds_visible(sunset),
            "seconds_possible": seconds_possible,
        }
def revision_metadata_from_doc(cls, rev_doc):
    """Construct a RevisionMetadata from an API revision document.

    :Parameters:
        rev_doc : dict or None
            An API revision document; None yields None.
    :Returns:
        RevisionMetadata or None
    """
    if rev_doc is None:
        return None

    try:
        timestamp = Timestamp(rev_doc.get('timestamp'))
    except (TypeError, ValueError):
        # Missing 'timestamp' key makes .get() return None, and an
        # unparseable value raises -- either way, record no timestamp
        # rather than crash.  (TypeError added defensively for the None
        # case; previously only ValueError was caught.)
        timestamp = None

    # Look up the nested page document once instead of three times.
    page_doc = rev_doc.get('page', {})

    return RevisionMetadata(rev_doc.get('revid'),
                            rev_doc.get('parentid'),
                            rev_doc.get('user'),
                            rev_doc.get('userid'),
                            timestamp,
                            rev_doc.get('comment'),
                            page_doc.get('pageid'),
                            page_doc.get('ns'),
                            page_doc.get('title'),
                            rev_doc.get('size'),
                            'minor' in rev_doc)
def __init__(self, rev_id, parent_id, user_text, user_id, timestamp,
             comment, page_id, page_namespace, page_title, bytes, minor):
    """
    Store revision-metadata fields, coercing each scalar to its canonical
    type (int/str/Timestamp) while preserving None for missing values.
    """
    self.rev_id = int(rev_id) if rev_id is not None else None
    self.parent_id = int(parent_id) if parent_id is not None else None
    self.user_text = str(user_text) if user_text is not None else None
    self.user_id = int(user_id) if user_id is not None else None
    # Timestamp is converted via mw's Timestamp when present.
    self.timestamp = Timestamp(timestamp) \
        if timestamp is not None else None
    self.comment = str(comment) if comment is not None else None
    self.page_id = int(page_id) if page_id is not None else None
    self.page_namespace = int(page_namespace) \
        if page_namespace is not None else None
    self.page_title = str(page_title) if page_title is not None else None
    # NOTE: `bytes` shadows the builtin; kept for interface compatibility.
    self.bytes = int(bytes) if bytes is not None else None
    self.minor = bool(minor)
def read_osm_changes(f, format):
    """Parse tab-separated (user, timestamp[, change_id]) lines from `f`.

    The sentinel format "<unix timestamp>" means the second column is a
    float of seconds since the epoch; otherwise it is parsed with
    Timestamp.strptime using `format`.  Yields (user, timestamp, change_id)
    with change_id None when the column is absent.
    """
    for line in f:
        fields = line.strip().split("\t")
        user = fields[0]

        if format == "<unix timestamp>":
            timestamp = Timestamp(float(fields[1]))
        else:
            timestamp = Timestamp.strptime(fields[1], format)

        change_id = fields[2] if len(fields) >= 3 else None

        yield user, timestamp, change_id
def read_user_actions(f, format):
    """Parse tab-separated (user, timestamp[, action]) lines from `f`.

    The sentinel format "<unix timestamp>" means the second column is a
    float of seconds since the epoch; otherwise it is parsed with
    Timestamp.strptime using `format`.  Yields (user, timestamp, action)
    with action None when the column is absent.
    """
    for line in f:
        fields = line.strip().split("\t")
        user = fields[0]

        if format == "<unix timestamp>":
            timestamp = Timestamp(float(fields[1]))
        else:
            timestamp = Timestamp.strptime(fields[1], format)

        action = fields[2] if len(fields) >= 3 else None

        yield user, timestamp, action
def user_info_from_doc(cls, user_doc):
    """Construct a UserInfo from an API user document.

    :Parameters:
        user_doc : dict or None
            An API user document; None yields None.
    :Returns:
        UserInfo or None
    """
    if user_doc is None:
        return None

    try:
        registration = Timestamp(user_doc.get('registration'))
    except (TypeError, ValueError):
        # Missing 'registration' key makes .get() return None, and an
        # unparseable value raises -- either way, record no registration
        # rather than crash.  (TypeError added defensively for the None
        # case; previously only ValueError was caught.)
        registration = None

    # 'blockedtimestamp' is passed through raw; UserInfo's constructor
    # performs the Timestamp conversion.
    return UserInfo(user_doc.get('userid'),
                    user_doc.get('name'),
                    user_doc.get('editcount'),
                    registration,
                    user_doc.get('groups', []),
                    user_doc.get('implicitgroups', []),
                    "emailable" in user_doc,
                    user_doc.get('gender'),
                    user_doc.get('blockid'),
                    user_doc.get('blockedby'),
                    user_doc.get('blockedbyid'),
                    user_doc.get('blockedtimestamp'),
                    user_doc.get('blockreason'),
                    user_doc.get('blockexpiry'))
def visible_at(self, timestamp):
    """Mark the moment this token became visible; a later call while a
    visible-since mark is already set is a no-op."""
    if self.visible_since is not None:
        return
    self.visible_since = Timestamp(timestamp)
def seconds_visible(self, sunset):
    """Return the total seconds this token has been visible as of `sunset`:
    the accumulated `visible` total plus, if a visible-since mark is open,
    the span from that mark up to `sunset`."""
    sunset = Timestamp(sunset)
    # `is not None` -- identity check for None per PEP 8 (was `!= None`).
    if self.visible_since is not None:
        return self.visible + (sunset - self.visible_since)
    else:
        return self.visible
import os
import sys

# Make the package importable when run from the repository root.
sys.path.insert(0, os.path.abspath(os.getcwd()))

from mw import Timestamp

# Construct from seconds since the Unix epoch; str() gives database format.
str(Timestamp(1234567890))
# > '20090213233130'

# Construct from the database format; int() gives Unix time.
int(Timestamp("20090213233130"))
# > 1234567890

# Construct from the API (ISO 8601) format.
int(Timestamp("2009-02-13T23:31:30Z"))
# > 1234567890

# Subtracting two timestamps yields the difference in seconds.
Timestamp("2009-02-13T23:31:31Z") - Timestamp(1234567890)
# > 1

# strftime and strptime support custom formats.
Timestamp(1234567890).strftime("%Y foobar")
# > '2009 foobar'
str(Timestamp.strptime("2009 derp 10", "%Y derp %m"))
# > '20091001000000'
def run(db, user_ids, revert_radius, revert_window, session_cutoff):
    """For each user id, compute first-day/first-week activity, revert and
    session metrics, writing one TSV row per user to stdout and progress
    marks to stderr."""
    print(tsv.encode_row(HEADERS))

    for user_id in user_ids:
        sys.stderr.write("{0}: ".format(user_id))

        row = defaultdict(int)  # was defaultdict(lambda: 0) -- same behavior
        row['user_id'] = user_id
        row['surviving'] = False  # Preliminary value

        user = db.users.get(user_id)
        if user['user_registration'] is None:
            sys.stderr.write("<no registration>\n")
            continue

        registration = Timestamp(user['user_registration'])
        row['user_registration'] = registration.short_format()
        end_of_first_day = registration + 60 * 60 * 24  # One day
        end_of_first_week = registration + 60 * 60 * 24 * 7  # One week

        first_week_revisions = db.revisions.query(
            user_id=user_id,
            direction="newer",
            before=end_of_first_week,
            include_page=True
        )

        session_cache = sessions.Cache(cutoff=session_cutoff)
        session_cache.process(user_id, registration,
                              ("registration", registration))
        for rev in first_week_revisions:
            rev_timestamp = Timestamp(rev['rev_timestamp'])
            ns = rev['page_namespace']
            first_day = rev_timestamp <= end_of_first_day

            row['week_revisions'] += 1
            row['day_revisions'] += 1 if first_day else 0

            if rev_timestamp >= registration + TRIAL_PERIOD:
                row['surviving'] = True

            if ns in MAIN_NAMESPACES:
                row['week_main_revisions'] += 1
                row['day_main_revisions'] += 1 if first_day else 0

                revert = reverts.database.check_row(db, rev,
                                                    radius=revert_radius,
                                                    window=revert_window)
                if revert is not None:
                    # Reverted edit!  NOTE: a stray debug `print(rev)` used
                    # to go to stdout here, corrupting the TSV stream this
                    # function prints -- removed.
                    row['week_reverted_main_revisions'] += 1
                    row['day_reverted_main_revisions'] += \
                        1 if first_day else 0
                    sys.stderr.write("r")
                    sys.stderr.flush()
                else:
                    sys.stderr.write(".")
                    sys.stderr.flush()
            else:
                row['week_wp_revisions'] += 1 if ns in WP_NAMESPACES else 0
                row['day_wp_revisions'] += \
                    1 if first_day and ns in WP_NAMESPACES else 0
                row['week_user_revisions'] += \
                    1 if ns in USER_NAMESPACES else 0
                row['day_user_revisions'] += \
                    1 if first_day and ns in USER_NAMESPACES else 0
                row['week_talk_revisions'] += \
                    1 if ns in TALK_NAMESPACES else 0
                row['day_talk_revisions'] += \
                    1 if first_day and ns in TALK_NAMESPACES else 0
                sys.stderr.write("_")
                sys.stderr.flush()

            user_sessions = session_cache.process(user_id, rev_timestamp,
                                                  ("edit", rev_timestamp))
            update_row_with_session_metrics(row, user_sessions)

        user_sessions = session_cache.get_active_sessions()
        update_row_with_session_metrics(row, user_sessions)

        sys.stderr.write("\n")
        sys.stdout.write(tsv.encode_row(row, headers=HEADERS))
        sys.stdout.write("\n")
import re

from mw import Timestamp

from ..datasources import revision, user
from .feature import Feature

# Date that registrations started being recorded in MediaWiki
USER_REGISTRATION_EPOCH = Timestamp("20050101000000")


def process_age(user_info, revision_metadata):
    # No user info available: report zero age.
    if user_info is None:
        return 0
    if process_is_anon(revision_metadata):
        # Anonymous so age == zero
        return 0
    else:
        # Users with no recorded registration (pre-2005 accounts) fall back
        # to USER_REGISTRATION_EPOCH; clamp at zero so imported revisions
        # that predate registration don't yield negative ages.
        registration_delta = revision_metadata.timestamp - \
            (user_info.registration or USER_REGISTRATION_EPOCH)
        return max(registration_delta, 0)


age = Feature("user.age", process_age,
              returns=int,
              depends_on=[user.info, revision.metadata])
"""
Represents age of user when the edit was made in seconds.

:Returns:
    int
def run(db, user_ids, revert_radius, revert_window, session_cutoff):
    """For each user id, compute first-day/first-week activity, revert and
    session metrics, writing one TSV row per user to stdout and progress
    marks to stderr."""
    print(tsv.encode_row(HEADERS))

    for user_id in user_ids:
        sys.stderr.write("{0}: ".format(user_id))

        row = defaultdict(int)  # was defaultdict(lambda: 0) -- same behavior
        row['user_id'] = user_id
        row['surviving'] = False  # Preliminary value

        user = db.users.get(user_id)
        if user['user_registration'] is None:
            sys.stderr.write("<no registration>\n")
            continue

        registration = Timestamp(user['user_registration'])
        row['user_registration'] = registration.short_format()
        end_of_first_day = registration + 60 * 60 * 24  # One day
        end_of_first_week = registration + 60 * 60 * 24 * 7  # One week

        first_week_revisions = db.revisions.query(user_id=user_id,
                                                  direction="newer",
                                                  before=end_of_first_week,
                                                  include_page=True)

        session_cache = sessions.Cache(cutoff=session_cutoff)
        session_cache.process(user_id, registration,
                              ("registration", registration))
        for rev in first_week_revisions:
            rev_timestamp = Timestamp(rev['rev_timestamp'])
            ns = rev['page_namespace']
            first_day = rev_timestamp <= end_of_first_day

            row['week_revisions'] += 1
            row['day_revisions'] += 1 if first_day else 0

            if rev_timestamp >= registration + TRIAL_PERIOD:
                row['surviving'] = True

            if ns in MAIN_NAMESPACES:
                row['week_main_revisions'] += 1
                row['day_main_revisions'] += 1 if first_day else 0

                revert = reverts.database.check_row(db, rev,
                                                    radius=revert_radius,
                                                    window=revert_window)
                if revert is not None:
                    # Reverted edit!  NOTE: a stray debug `print(rev)` used
                    # to go to stdout here, corrupting the TSV stream this
                    # function prints -- removed.
                    row['week_reverted_main_revisions'] += 1
                    row['day_reverted_main_revisions'] += \
                        1 if first_day else 0
                    sys.stderr.write("r")
                    sys.stderr.flush()
                else:
                    sys.stderr.write(".")
                    sys.stderr.flush()
            else:
                row['week_wp_revisions'] += 1 if ns in WP_NAMESPACES else 0
                row['day_wp_revisions'] += \
                    1 if first_day and ns in WP_NAMESPACES else 0
                row['week_user_revisions'] += \
                    1 if ns in USER_NAMESPACES else 0
                row['day_user_revisions'] += \
                    1 if first_day and ns in USER_NAMESPACES else 0
                row['week_talk_revisions'] += \
                    1 if ns in TALK_NAMESPACES else 0
                row['day_talk_revisions'] += \
                    1 if first_day and ns in TALK_NAMESPACES else 0
                sys.stderr.write("_")
                sys.stderr.flush()

            user_sessions = session_cache.process(user_id, rev_timestamp,
                                                  ("edit", rev_timestamp))
            update_row_with_session_metrics(row, user_sessions)

        user_sessions = session_cache.get_active_sessions()
        update_row_with_session_metrics(row, user_sessions)

        sys.stderr.write("\n")
        sys.stdout.write(tsv.encode_row(row, headers=HEADERS))
        sys.stdout.write("\n")
def run(db, start_date, end_date, n, t, debug):
    """For each user registered in [start_date, end_date], count productive
    (non-reverted, main-namespace) edits within `t` days of registration and
    print a TSV row: productive means at least `n` such edits; censored means
    the observation window has not fully elapsed yet."""
    # Print some headers
    print(
        "\t".join([
            "user_id",
            "user_name",
            "user_registration",
            "productive",
            "censored"
        ])
    )

    # Convert days to seconds so that we can do some math
    t_seconds = DAY_SECONDS * t

    # Get relevant users
    users = db.users.query(
        registered_after=start_date,
        registered_before=end_date
    )
    for user_row in users:
        logger.debug(
            "Processing {0}:".format(str(user_row['user_name'], "utf-8")))

        # Convert user_registration to a useful type
        user_registration = Timestamp(user_row['user_registration'])

        # Get all the revisions the user made within time "t" days of
        # registration.  (Reuses t_seconds instead of recomputing
        # DAY_SECONDS*t.)
        revisions = db.revisions.query(
            user_id=user_row['user_id'],
            before=user_registration + t_seconds,
            include_page=True
        )

        # Count up the productive edits
        productive_edits = 0
        for rev_row in revisions:
            # Convert revision timestamp to a useful type
            rev_timestamp = Timestamp(rev_row['rev_timestamp'])

            # Must be a content edit
            if rev_row['page_namespace'] == 0:
                # If the revert doesn't happen in 48 hours, it doesn't count
                revert_end_of_life = rev_timestamp + DAY_SECONDS * 2
                revert = reverts.database.check_row(
                    db, rev_row,
                    radius=15,  # Reverts can't cross more than 15 revisions
                    before=revert_end_of_life
                )
                if revert is None:
                    # Not reverted
                    productive_edits += 1

                    if productive_edits >= n:
                        # We're done here
                        break

        print(
            "\t".join([
                str(user_row['user_id']),
                escape(str(user_row['user_name'], 'utf-8')),
                escape(str(user_row['user_registration'], 'utf-8')),
                str(productive_edits >= n),
                str(time.time() - user_registration.unix() <
                    (2 + t) * DAY_SECONDS)
            ])
        )
""" Demonstrates some simple Timestamp operations """ from mw import Timestamp # Seconds since Unix Epoch str(Timestamp(1234567890)) # > '20090213233130' # Database format int(Timestamp("20090213233130")) # > 1234567890 # API format int(Timestamp("2009-02-13T23:31:30Z")) # > 1234567890 # Difference in seconds Timestamp("2009-02-13T23:31:31Z") - Timestamp(1234567890) # > 1 # strptime and strftime Timestamp(1234567890).strftime("%Y foobar") # > '2009 foobar' str(Timestamp.strptime("2009 derp 10", "%Y derp %m")) # > '20091001000000'