def process(dump, page):
    """Detect identity reverts within one page's revision history.

    A revert is a revision whose text checksum matches an earlier revision
    in the recent-history window.  Yields one ('revert', ...) tuple per
    detected revert and one ('reverted', ...) tuple per revision it undid.
    `dump` is accepted for interface compatibility but is not used here.
    """
    # Sliding window of the 15 most recent (checksum -> revision) entries.
    history = LimitedDictLists(maxsize=15)
    for rev in page.readRevisions():
        digest = hashlib.md5(rev.getText().encode("utf-8")).hexdigest()
        if digest in history:
            # This exact text was seen before: `rev` restores an old state.
            target = history[digest]
            target_id = target.getId()
            # Revisions newer than the restored one were undone by `rev`.
            undone = [r for (_, r) in reversed(history.getQueue())
                      if r.getId() > target_id]
            vandalism = wmf.isVandalismByComment(rev.getComment())
            n_undone = len(undone)
            yield ('revert', rev.getId(), target_id, vandalism, n_undone)
            for u in undone:
                yield ('reverted', u.getId(), rev.getId(), target_id,
                       vandalism, n_undone)
        history.insert(digest, rev)
def main(args):
    """Read a MediaWiki dump and write revert, reverted, and per-revision
    meta statistics as TSV files under args.out.

    `args` is expected to provide `out` (output directory) and `dump`
    (path to the dump file) — TODO confirm against the arg parser.
    """
    LOGGING_STREAM = sys.stderr
    logging.basicConfig(
        level=logging.DEBUG,
        stream=LOGGING_STREAM,
        format='%(asctime)s %(levelname)-8s %(message)s',
        datefmt='%b-%d %H:%M:%S'
    )
    logging.info("Setting up output files in %s" % args.out)
    # BUG FIX: all three writers previously opened "revert.tsv" and
    # clobbered one another; each table now gets its own file.
    reverts = open(os.path.join(args.out, "revert.tsv"), "w")
    revertHeaders = ['rev_id', 'to_id', 'revs_reverted']
    #reverts.write("\t".join(revertHeaders) + "\n")
    reverted = open(os.path.join(args.out, "reverted.tsv"), "w")
    revertedHeaders = ['rev_id', 'rvtg_id', 'rvtto_id', 'revs_reverted']
    #reverted.write("\t".join(revertedHeaders) + "\n")
    meta = open(os.path.join(args.out, "meta.tsv"), "w")
    metaHeaders = [
        'rev_id', 'checksum', 'tokens',
        'cs_added', 'cs_removed',
        'ts_added', 'ts_removed',
        'ws_added', 'ws_removed',
        'ms_added', 'ms_removed'
    ]
    #meta.write("\t".join(metaHeaders) + "\n")
    try:
        logging.info("Reading from dump file.")
        for page in wp.dump.Iterator(args.dump).readPages():
            logging.debug("Processing %s:%s..." %
                          (page.getId(), page.getTitle()))
            # Sliding window of recently seen (checksum, revision) pairs.
            recentRevs = LimitedDictLists(maxsize=15)
            lastTokens = []
            for revision in page.readRevisions():
                checksum = hashlib.md5(
                    revision.getText().encode("utf-8")).hexdigest()
                if checksum in recentRevs:
                    LOGGING_STREAM.write("r")
                    # Found a revert: this text matches an earlier revision.
                    revertedToRev = recentRevs[checksum]
                    # Revisions newer than the restored one were undone.
                    # (Walk the queue the same way the sibling process()
                    # functions do, for consistent ordering.)
                    revertedRevs = [
                        r for (c, r) in reversed(recentRevs.getQueue())
                        if r.getId() > revertedToRev.getId()
                    ]
                    # BUG FIX: was `revert.write(...)` — a NameError; the
                    # open file handle is named `reverts`.
                    reverts.write("\t".join(
                        clean(v) for v in [
                            revision.getId(),
                            revertedToRev.getId(),
                            len(revertedRevs)
                        ]) + "\n")
                    LOGGING_STREAM.write(str(len(revertedRevs)))
                    for rev in revertedRevs:
                        reverted.write("\t".join(
                            clean(v) for v in [
                                rev.getId(),
                                revision.getId(),
                                revertedToRev.getId(),
                                len(revertedRevs)
                            ]) + "\n")
                else:
                    LOGGING_STREAM.write("-")
                # Per-revision diff statistics (emitted for every revision).
                tokens = tokenize(revision.getText())
                tokensAdded, tokensRemoved = simpleDiff(lastTokens, tokens)
                row = {
                    'rev_id': revision.getId(),
                    'checksum': checksum,
                    'tokens': len(revision.getText()),
                    'cs_added': 0, 'cs_removed': 0,
                    'ts_added': 0, 'ts_removed': 0,
                    'ws_added': 0, 'ws_removed': 0,
                    'ms_added': 0, 'ms_removed': 0
                }
                for token in tokensAdded:
                    row['ts_added'] += 1
                    row['cs_added'] += len(token)
                    if token.strip() == '':
                        # BUG FIX: was a no-op `pass`, which let whitespace
                        # tokens leak into the word count below.
                        continue
                    if token in MARKUP:
                        row['ms_added'] += 1
                    elif token not in STOP_WORDS:
                        row['ws_added'] += 1
                for token in tokensRemoved:
                    row['ts_removed'] += 1
                    row['cs_removed'] += len(token)
                    if token.strip() == '':
                        continue  # BUG FIX: see the added-token loop above.
                    if token in MARKUP:
                        row['ms_removed'] += 1
                    elif token not in STOP_WORDS:
                        row['ws_removed'] += 1
                # BUG FIX: meta rows were being written to the `reverted`
                # file; they belong in the meta table.
                meta.write("\t".join(
                    [clean(row[h]) for h in metaHeaders]) + "\n")
                lastTokens = tokens
                # BUG FIX: revisions were never inserted into the window, so
                # `checksum in recentRevs` could never be true and no revert
                # was ever detected.  Track this revision for later matches.
                recentRevs.insert(checksum, revision)
            LOGGING_STREAM.write("\n")
    finally:
        # Ensure output is flushed/closed even if dump iteration fails.
        reverts.close()
        reverted.close()
        meta.close()
def process(dump, page):
    """Detect reverts and emit per-revision diff statistics for one page.

    Yields three row types:
      ('revert', rev_id, to_id, revs_reverted)
      ('reverted', rev_id, rvtg_id, rvtto_id, revs_reverted)
      ('meta', <values in metaHeaders order>)
    `dump` is accepted for interface compatibility but is not used here.
    """
    # Sliding window of the 15 most recent (checksum -> revision) entries.
    recentRevs = LimitedDictLists(maxsize=15)
    lastTokens = []
    metaHeaders = [
        'rev_id', 'checksum', 'tokens',
        'cs_added', 'cs_removed',
        'ts_added', 'ts_removed',
        'ws_added', 'ws_removed',
        'ms_added', 'ms_removed'
    ]
    for revision in page.readRevisions():
        checksum = hashlib.md5(
            revision.getText().encode("utf-8")).hexdigest()
        if checksum in recentRevs:
            # Found a revert: this text matches an earlier revision.
            revertedToRev = recentRevs[checksum]
            # Revisions newer than the restored one were undone.
            revertedRevs = [r for (c, r) in reversed(recentRevs.getQueue())
                            if r.getId() > revertedToRev.getId()]
            yield ('revert', revision.getId(), revertedToRev.getId(),
                   len(revertedRevs))
            for rev in revertedRevs:
                yield ('reverted', rev.getId(), revision.getId(),
                       revertedToRev.getId(), len(revertedRevs))
        # Per-revision diff statistics (emitted for every revision).
        tokens = tokenize(revision.getText())
        tokensAdded, tokensRemoved = simpleDiff(lastTokens, tokens)
        row = {
            'rev_id': revision.getId(),
            'checksum': checksum,
            'tokens': len(revision.getText()),
            'cs_added': 0, 'cs_removed': 0,
            'ts_added': 0, 'ts_removed': 0,
            'ws_added': 0, 'ws_removed': 0,
            'ms_added': 0, 'ms_removed': 0
        }
        _tallyTokens(row, tokensAdded, 'added')
        _tallyTokens(row, tokensRemoved, 'removed')
        yield tuple(['meta'] + [row[h] for h in metaHeaders])
        lastTokens = tokens
        recentRevs.insert(checksum, revision)


def _tallyTokens(row, tokens, suffix):
    """Accumulate token/char/word/markup counts for one side of a diff.

    `suffix` is 'added' or 'removed', selecting which row columns to bump.
    """
    for token in tokens:
        row['ts_' + suffix] += 1
        row['cs_' + suffix] += len(token)
        if token.strip() == '':
            # BUG FIX: was a no-op `pass`, which let whitespace tokens be
            # counted as words below.
            continue
        if token in MARKUP:
            row['ms_' + suffix] += 1
        elif token not in STOP_WORDS:
            row['ws_' + suffix] += 1