Exemplo n.º 1
0
def process(dump, page):
    """Detect identity reverts in a page's revision history.

    Scans revisions in order, keeping the MD5 checksums of the last 15
    revision texts.  A revision whose text matches a recent checksum is a
    revert back to the matching revision; every intermediate revision with
    a higher id than the reverted-to revision counts as "reverted".

    Parameters
    ----------
    dump : unused here; kept for the dump-processor callback interface
    page : page object exposing readRevisions()

    Yields
    ------
    ('revert', rev_id, to_id, is_vandalism, n_reverted)
        one row for the reverting revision, followed by
    ('reverted', rev_id, reverting_id, to_id, is_vandalism, n_reverted)
        one row per reverted revision.
    """
    recentRevs = LimitedDictLists(maxsize=15)
    for revision in page.readRevisions():
        checksum = hashlib.md5(revision.getText().encode("utf-8")).hexdigest()
        if checksum in recentRevs:
            # Found a revert: this text is identical to a recent revision.
            revertedToRev = recentRevs[checksum]

            # Revisions newer than the reverted-to revision were undone.
            revertedRevs = [
                r for (c, r) in reversed(recentRevs.getQueue())
                if r.getId() > revertedToRev.getId()
            ]

            isVandalism = wmf.isVandalismByComment(revision.getComment())

            # Revert row for the reverting revision itself.
            yield ('revert', revision.getId(), revertedToRev.getId(),
                   isVandalism, len(revertedRevs))

            # One row per undone revision.
            for rev in revertedRevs:
                yield ('reverted', rev.getId(), revision.getId(),
                       revertedToRev.getId(), isVandalism, len(revertedRevs))

        recentRevs.insert(checksum, revision)
Exemplo n.º 2
0
def main(args):
    """Process a wiki XML dump: detect identity reverts and compute
    per-revision token-diff statistics, writing three TSV files.

    Parameters
    ----------
    args : argparse-style namespace providing
           args.out  -- output directory for the TSV files
           args.dump -- path to the XML dump file
    """
    LOGGING_STREAM = sys.stderr
    logging.basicConfig(level=logging.DEBUG,
                        stream=LOGGING_STREAM,
                        format='%(asctime)s %(levelname)-8s %(message)s',
                        datefmt='%b-%d %H:%M:%S')

    logging.info("Setting up output files in %s" % args.out)
    revertHeaders = ['rev_id', 'to_id', 'revs_reverted']
    revertedHeaders = ['rev_id', 'rvtg_id', 'rvtto_id', 'revs_reverted']
    metaHeaders = [
        'rev_id', 'checksum', 'tokens', 'cs_added', 'cs_removed', 'ts_added',
        'ts_removed', 'ws_added', 'ws_removed', 'ms_added', 'ms_removed'
    ]

    # BUG FIX: all three writers previously opened the same "revert.tsv",
    # clobbering one another.  Each stream now gets its own file, and the
    # context managers guarantee the handles are flushed and closed.
    with open(os.path.join(args.out, "revert.tsv"), "w") as reverts, \
         open(os.path.join(args.out, "reverted.tsv"), "w") as reverted, \
         open(os.path.join(args.out, "meta.tsv"), "w") as meta:
        #reverts.write("\t".join(revertHeaders) + "\n")
        #reverted.write("\t".join(revertedHeaders) + "\n")
        #meta.write("\t".join(metaHeaders) + "\n")

        logging.info("Reading from dump file.")
        for page in wp.dump.Iterator(args.dump).readPages():
            logging.debug("Processing %s:%s..." %
                          (page.getId(), page.getTitle()))
            recentRevs = LimitedDictLists(maxsize=15)
            lastTokens = []
            for revision in page.readRevisions():
                checksum = hashlib.md5(
                    revision.getText().encode("utf-8")).hexdigest()
                if checksum in recentRevs:
                    LOGGING_STREAM.write("r")
                    # Found a revert: text identical to a recent revision.
                    revertedToRev = recentRevs[checksum]

                    # Revisions newer than the reverted-to one were undone.
                    # (Iterate the queue as the sibling process() functions
                    # do, newest first.)
                    revertedRevs = [
                        r for (c, r) in reversed(recentRevs.getQueue())
                        if r.getId() > revertedToRev.getId()
                    ]

                    # BUG FIX: was `revert.write` (NameError) -- the file
                    # handle is named `reverts`.
                    reverts.write("\t".join(
                        clean(v) for v in [
                            revision.getId(),
                            revertedToRev.getId(),
                            len(revertedRevs)
                        ]) + "\n")

                    LOGGING_STREAM.write(str(len(revertedRevs)))
                    for rev in revertedRevs:
                        reverted.write("\t".join(
                            clean(v) for v in [
                                rev.getId(),
                                revision.getId(),
                                revertedToRev.getId(),
                                len(revertedRevs)
                            ]) + "\n")
                else:
                    LOGGING_STREAM.write("-")

                tokens = tokenize(revision.getText())

                tokensAdded, tokensRemoved = simpleDiff(lastTokens, tokens)

                row = {
                    'rev_id': revision.getId(),
                    'checksum': checksum,
                    # NOTE(review): counts characters, not tokens --
                    # preserved as-is; confirm against downstream readers.
                    'tokens': len(revision.getText()),
                    'cs_added': 0,
                    'cs_removed': 0,
                    'ts_added': 0,
                    'ts_removed': 0,
                    'ws_added': 0,
                    'ws_removed': 0,
                    'ms_added': 0,
                    'ms_removed': 0
                }
                for token in tokensAdded:
                    row['ts_added'] += 1
                    row['cs_added'] += len(token)
                    # NOTE(review): this whitespace check is a no-op (pass
                    # without continue); preserved to keep counts identical.
                    if token.strip() == '': pass
                    if token in MARKUP: row['ms_added'] += 1
                    elif token not in STOP_WORDS: row['ws_added'] += 1
                for token in tokensRemoved:
                    row['ts_removed'] += 1
                    row['cs_removed'] += len(token)
                    if token.strip() == '': pass
                    if token in MARKUP: row['ms_removed'] += 1
                    elif token not in STOP_WORDS: row['ws_removed'] += 1

                # BUG FIX: the metadata row was written to `reverted`; it
                # uses metaHeaders and belongs in the meta stream.
                meta.write("\t".join([clean(row[h])
                                      for h in metaHeaders]) + "\n")

                lastTokens = tokens
                # BUG FIX: without recording the checksum, the
                # `checksum in recentRevs` test above could never be true
                # and no revert would ever be detected (cf. the sibling
                # process() functions, which insert here).
                recentRevs.insert(checksum, revision)

            LOGGING_STREAM.write("\n")
Exemplo n.º 3
0
def process(dump, page):
    """Detect identity reverts and emit per-revision token-diff metadata.

    Keeps the MD5 checksums of the last 15 revision texts; a revision
    whose text matches a recent checksum is a revert back to that
    revision.  Each revision's tokens are also diffed against the
    previous revision's to count characters / tokens / words / markup
    added and removed.

    Parameters
    ----------
    dump : unused here; kept for the dump-processor callback interface
    page : page object exposing readRevisions()

    Yields
    ------
    ('revert', rev_id, to_id, n_reverted)                  per revert
    ('reverted', rev_id, reverting_id, to_id, n_reverted)  per undone rev
    ('meta', <values in metaHeaders order>)                per revision
    """
    recentRevs = LimitedDictLists(maxsize=15)
    lastTokens = []
    metaHeaders = [
        'rev_id',
        'checksum',
        'tokens',
        'cs_added',
        'cs_removed',
        'ts_added',
        'ts_removed',
        'ws_added',
        'ws_removed',
        'ms_added',
        'ms_removed'
    ]
    for revision in page.readRevisions():
        checksum = hashlib.md5(revision.getText().encode("utf-8")).hexdigest()
        if checksum in recentRevs:
            # Found a revert: text identical to a recent revision.
            revertedToRev = recentRevs[checksum]

            # Revisions newer than the reverted-to revision were undone.
            revertedRevs = [
                r for (c, r) in reversed(recentRevs.getQueue())
                if r.getId() > revertedToRev.getId()
            ]

            # Revert row for the reverting revision itself.
            yield (
                'revert',
                revision.getId(),
                revertedToRev.getId(),
                len(revertedRevs)
            )

            # One row per undone revision.
            for rev in revertedRevs:
                yield (
                    'reverted',
                    rev.getId(),
                    revision.getId(),
                    revertedToRev.getId(),
                    len(revertedRevs)
                )

        tokens = tokenize(revision.getText())

        tokensAdded, tokensRemoved = simpleDiff(lastTokens, tokens)

        row = {
            'rev_id':     revision.getId(),
            'checksum':   checksum,
            # NOTE(review): counts characters, not tokens -- preserved
            # as-is; confirm against downstream consumers.
            'tokens':     len(revision.getText()),
            'cs_added':   0,
            'cs_removed': 0,
            'ts_added':   0,
            'ts_removed': 0,
            'ws_added':   0,
            'ws_removed': 0,
            'ms_added':   0,
            'ms_removed': 0
        }
        for token in tokensAdded:
            row['ts_added'] += 1
            row['cs_added'] += len(token)
            # NOTE(review): this whitespace check is a no-op (pass without
            # continue); preserved to keep counts identical.
            if token.strip() == '': pass
            if token in MARKUP: row['ms_added'] += 1
            elif token not in STOP_WORDS: row['ws_added'] += 1
        for token in tokensRemoved:
            row['ts_removed'] += 1
            row['cs_removed'] += len(token)
            if token.strip() == '': pass
            if token in MARKUP: row['ms_removed'] += 1
            elif token not in STOP_WORDS: row['ws_removed'] += 1

        yield tuple(['meta'] + [row[h] for h in metaHeaders])

        lastTokens = tokens
        recentRevs.insert(checksum, revision)