def main(argv=None):
    """CLI entry point: build a feature extractor from args and run it."""
    cli = docopt.docopt(__doc__, argv=argv)

    features = import_from_path(cli['<features>'])

    # --language is optional; only import it when one was given.
    language = (import_from_path(cli['--language'])
                if cli['--language'] is not None else None)

    session = api.Session(cli['--api'],
                          user_agent="Revscoring feature extractor utility")
    extractor = APIExtractor(session, language=language)

    # "<stdin>" / "<stdout>" are sentinel values selecting the std streams.
    if cli['--rev-labels'] == "<stdin>":
        rev_labels = read_rev_labels(sys.stdin)
    else:
        rev_labels = read_rev_labels(open(cli['--rev-labels']))

    value_labels = (sys.stdout if cli['--value-labels'] == "<stdout>"
                    else open(cli['--value-labels'], 'w'))

    run(rev_labels, value_labels, features, extractor, cli['--verbose'])
def bot_gen(rev_pages, language, api_url):
    """Yield an Edit(rev_id, added_words, reverted) per (rev_id, page_id).

    Progress is shown as one dot per revision on stderr.  A failure while
    processing a single revision is reported to stderr and that revision
    is skipped; Ctrl-C stops the generator cleanly.
    """
    session = api.Session(api_url)
    extractor = APIExtractor(session, language=language)
    for rev_id, page_id in rev_pages:
        sys.stderr.write(".")
        sys.stderr.flush()
        try:
            # Detect reverted status (was this edit undone within 3 edits?)
            revert = reverts.api.check(session, rev_id, page_id, radius=3)
            reverted = revert is not None
            added_words = list(extractor.extract(rev_id,
                                                 [diff.added_words]))[0]
            yield Edit(rev_id, added_words, reverted)
        except KeyboardInterrupt:
            sys.stderr.write("\n^C Caught. Exiting...")
            break
        except Exception:
            # FIX: was a bare `except:`, which inside a generator also traps
            # GeneratorExit (raised by .close()) and SystemExit, breaking
            # generator cleanup.  Exception still covers all API/extraction
            # errors we want to log-and-skip.
            sys.stderr.write(traceback.format_exc())
            sys.stderr.write("\n")
    # Terminate the progress-dot line.
    sys.stderr.write("\n")
def from_config(cls, config, name, section_key="extractors"):
    """Build an extractor from the named section of a config mapping.

    Looks up ``config[section_key][name]`` for the API url, user agent,
    and language name, then constructs the class from those parts.
    """
    params = config[section_key][name]
    return cls(
        api.Session(params['url'], user_agent=params['user_agent']),
        Language.from_config(config, params['language'])
    )
def main(argv=None):
    """CLI entry point: stream diff docs from stdin through a DiffEngine.

    Loads the YAML config named by --config, builds the DiffEngine it
    describes, and hands everything to run().
    """
    args = docopt.docopt(__doc__, argv=argv)
    diff_docs = read_docs(sys.stdin)
    session = api.Session(args['--api'])
    # FIX: the config file handle was opened inline and never closed;
    # use a context manager so it is released promptly.
    with open(args['--config']) as config_file:
        config_doc = yamlconf.load(config_file)
    diff_engine = DiffEngine.from_config(config_doc, config_doc["diff_engine"])
    run(diff_docs, session, diff_engine)
def main(argv=None):
    """CLI entry point: score the given revision IDs with a stored model."""
    args = docopt.docopt(__doc__, argv=argv)
    # FIX: the model file handle was opened inline and never closed;
    # use a context manager so it is released promptly.
    with open(args['<model-file>'], 'rb') as model_file:
        model = MLScorerModel.load(model_file)
    extractor = APIExtractor(api.Session(args['--api']),
                             language=model.language)
    rev_ids = [int(rev_id) for rev_id in args['<rev_id>']]
    verbose = args['--verbose']
    run(model, extractor, rev_ids, verbose)
def update_revs():
    """Fetch the full revision history of the 'climate change' page and
    cache it (pickled) at ``revcache_path``.

    Returns the list of revision docs from the MediaWiki API.
    """
    api_session = api.Session("https://en.wikipedia.org/w/api.php")
    # Map of output field name -> API "rvprop" value.
    # NOTE(review): '******' looks like a redacted value for the user
    # property — confirm the real rvprop name before relying on it.
    rv_props = {
        'revid': 'ids',
        'timestamp': 'timestamp',
        'user': '******',
        'userid': 'userid',
        'size': 'size',
        'sha1': 'sha1',
        'contentmodel': 'contentmodel',
        'tags': 'tags',
        'flags': 'flags',
        'comment': 'comment',
        'content': 'content'
    }
    revs = api_session.revisions.query(properties=rv_props.values(),
                                       titles={'climate change'},
                                       direction="newer")
    all_revs = list(revs)
    # FIX: the cache file handle was opened inline and never closed;
    # a context manager guarantees the pickle is flushed to disk.
    with open(revcache_path, 'wb') as cache_file:
        pickle.dump(all_revs, cache_file)
    return all_revs
# Token rules for a ply lexer.  ply discovers rules by the exact module-level
# t_* names, so these identifiers must not be renamed.
t_qmark = r'\?'
t_epoint = r'!'
t_comma = r','
t_colon = r':'
t_scolon = r';'
# A paragraph break: two or more newlines (any \n / \n\r / \r\n flavor)
# separated by optional whitespace.
t_break = r'(\n|\n\r|\r\n)\s*(\n|\n\r|\r\n)+'
t_whitespace = r'[\n\r\s]+'
# Catch-all: any single character not matched above.
t_etc = r"."


def t_error(t):
    # ply error hook: report the offending character and skip past it.
    print("Illegal character '%s'" % t.value[0])
    t.lexer.skip(1)


lexer = lex()
session = api.Session("https://en.wikipedia.org/w/api.php")
# Fetch the text of one fixed revision to use as benchmark input.
common1 = session.revisions.get(638029546, properties={"content"})['*']
start = time.time()
# Tokenize the same text 50 times; report mean seconds per pass.
for i in range(50):
    lexer.input(common1)
    while True:
        token = lexer.token()
        #print(token)
        if token is None:
            break
print("Tokenizing (text_split):", (time.time() - start) / 50)
def main():
    """Export all revisions of a list of Wikipedia articles.

    Writes one JSON line per revision and a TSV summary (selected fields
    only) to date-stamped files in the configured output folder.  Each
    article's fetch is retried up to 10 times with a 60-second pause.
    """
    args = parse_args()
    output_path = args.output_folder
    article_filename = args.article_file

    #handle -W
    if args.logging_destination:
        logging.basicConfig(filename=args.logging_destination, filemode='a',
                            level=args.logging_level)
    else:
        logging.basicConfig(level=args.logging_level)

    export_time = str(datetime.datetime.now())
    export_date = datetime.datetime.today().strftime("%Y%m%d")

    logging.info(f"Starting run at {export_time}")
    logging.info(f"Last commit: {digobs.git_hash()}")

    json_output_filename = os.path.join(
        output_path,
        f"digobs_covid19-wikipedia-enwiki_revisions-{export_date}.json")
    tsv_output_filename = os.path.join(
        output_path,
        f"digobs_covid19-wikipedia-enwiki_revisions-{export_date}.tsv")

    api_session = api.Session("https://en.wikipedia.org/w/api.php")

    # list of properties from the API we want to gather (basically all of
    # them supported by mediawik-utilities)
    rv_props = {
        'revid': 'ids',
        'timestamp': 'timestamp',
        'user': '******',
        'userid': 'userid',
        'size': 'size',
        'sha1': 'sha1',
        'contentmodel': 'contentmodel',
        'tags': 'tags',
        'flags': 'flags',
        'comment': 'comment',
        'content': 'content'
    }

    # these bulky fields go only to the JSON output, not the TSV
    exclude_from_tsv = ['tags', 'comment', 'content', 'flags']

    # load the list of articles
    with open(article_filename, 'r') as infile:
        article_list = list(map(str.strip, infile))

    def get_revisions_for_page(title):
        return api_session.revisions.query(properties=rv_props.values(),
                                           titles={title},
                                           direction="newer")

    tsv_fields = ['title', 'pageid', 'namespace']
    tsv_fields = tsv_fields + list(rv_props.keys())

    # drop fields that we identified for exclusion
    tsv_fields = [e for e in tsv_fields if e not in exclude_from_tsv]

    # add special export fields
    tsv_fields = tsv_fields + [
        'anon', 'minor', 'url', 'export_timestamp', 'export_commit'
    ]

    export_info = {'git_commit': digobs.git_hash(), 'timestamp': export_time}

    with open(json_output_filename, 'w') as json_output, \
         open(tsv_output_filename, 'w') as tsv_output:
        tsv_writer = DictWriter(tsv_output, fieldnames=tsv_fields,
                                delimiter="\t")
        tsv_writer.writeheader()

        for article in article_list:
            logging.info(f"pulling revisions for: {article}")

            # try to grab the code 10 times, sleeping for one minute each time
            tries = 0
            while True:
                try:
                    rev_rows = []
                    for rev in get_revisions_for_page(article):
                        logging.debug(f"processing raw revision: {rev}")

                        # add export metadata
                        rev['exported'] = export_info

                        # save the json version of the code
                        print(json.dumps(rev), file=json_output)

                        # handle missing data
                        if "sha1" not in rev:
                            rev["sha1"] = ""
                        if "userhidden" in rev:
                            rev["user"] = ""
                            rev["userid"] = ""

                        # recode anon so it's true or false instead of
                        # present/missing
                        if "anon" in rev:
                            rev["anon"] = True
                        else:
                            rev["anon"] = False

                        # let's recode "minor" in the same way
                        if "minor" in rev:
                            rev["minor"] = True
                        else:
                            rev["minor"] = False

                        # add page title information
                        rev['title'] = rev['page']['title']
                        rev['pageid'] = rev['page']['pageid']
                        rev['namespace'] = rev['page']['ns']

                        # construct a URL
                        rev['url'] = Request(
                            'GET', 'https://en.wikipedia.org/w/index.php',
                            params={
                                'title': rev['title'].replace(" ", "_"),
                                'oldid': rev['revid']
                            }).prepare().url

                        rev['export_timestamp'] = export_time
                        rev['export_commit'] = digobs.git_hash(short=True)

                        rev_rows.append(rev)

                    logging.debug(
                        f"successfully received revisions for: {article}")
                    break
                except Exception:
                    # FIX: was a bare `except:`, which also trapped
                    # KeyboardInterrupt/SystemExit and retried for a minute
                    # on Ctrl-C.  Exception still covers socket timeouts and
                    # API errors, which is what the retry loop is for.
                    if tries > 10:
                        logging.critical(
                            f"giving up after 10 tries to get {article}")
                        raise
                    else:
                        logging.warning(f"socket.timeout from {article}")
                        logging.warning("sleeping 60 seconds before retrying")
                        tries = tries + 1
                        time.sleep(60)
                        continue

            # print out each of the revisions once we know we have it all
            for rev in rev_rows:
                tsv_writer.writerow({k: rev[k] for k in tsv_fields})
import time
from mw import api
from mwcites.extractors import doi

# Benchmark: compare the regex-based (doi.extract) and the
# mwparserfromhell-based (doi.extract_mwp) DOI extractors on one
# citation-heavy article and one citation-light article.

session = api.Session("https://en.wikipedia.org/w/api.php",
                      user_agent="Demo doi extractor")

# "Psychotherapy": a long article used as the many-DOIs input.
revisions = session.revisions.query(titles={"Psychotherapy"},
                                    properties={'content'})
lots = next(revisions)['*']
print("Text with lots of DOIs has {0} characters".format(len(lots)))

# "Waffle": a short article used as the few-DOIs input.
revisions = session.revisions.query(titles={"Waffle"},
                                    properties={'content'})
few = next(revisions)['*']
print("Text with few DOIs has {0} characters".format(len(few)))

# 50 passes over both texts per strategy; prints total elapsed seconds.
start = time.time()
for i in range(50):
    ids = set(doi.extract(lots))
    ids = set(doi.extract(few))
print("Regex strategy: {0}".format(time.time() - start))

start = time.time()
for i in range(50):
    ids = set(doi.extract_mwp(lots))
    ids = set(doi.extract_mwp(few))
print("MWP strategy: {0}".format(time.time() - start))

# Timer restarted for a further benchmark that continues past this chunk.
start = time.time()
def main():
    """Find new talk-page discussions, post them to each WikiProject's
    /Discussions page, and queue notifications.

    Reads wiki credentials from ~/.wiki.ini, polls recentchanges since the
    last recorded run, filters out reverted posts, and writes a WPX-template
    digest per subscribed WikiProject.
    """
    # This is used for Aaron Halfaker's API wrapper...
    loginfile = configparser.ConfigParser()
    loginfile.read([os.path.expanduser('~/.wiki.ini')])
    username = loginfile.get('wiki', 'username')
    password = loginfile.get('wiki', 'password')

    # ...And this is for Pywikibot
    bot = pywikibot.Site('en', 'wikipedia')

    wptools = WikiProjectTools()

    now = datetime.datetime.utcnow()
    now = now.strftime(
        '%Y%m%d%H%M%S')  # converts timestamp to MediaWiki format

    # Pulling timestamp of the last time the script was run
    query = wptools.query(
        'index',
        'select lu_timestamp from lastupdated where lu_key = "new_discussions";',
        None)
    lastupdated = query[0][0]

    # Polling for newest talk page posts in the last thirty minutes
    query = wptools.query(
        'wiki',
        'select distinct recentchanges.rc_this_oldid, page.page_id, recentchanges.rc_title, recentchanges.rc_comment, recentchanges.rc_timestamp, page.page_namespace from recentchanges join page on recentchanges.rc_namespace = page.page_namespace and recentchanges.rc_title = page.page_title join categorylinks on page.page_id=categorylinks.cl_from where rc_timestamp >= {0} and rc_timestamp < {1} and rc_comment like "% new section" and rc_deleted = 0 and cl_to like "%_articles" and page_namespace not in (0, 2, 6, 8, 10, 12, 14, 100, 108, 118) order by rc_timestamp asc;'
        .format(lastupdated, now), None)

    # Cleaning up output
    # Map of namespace number -> page-title prefix.
    namespace = {
        1: 'Talk:',
        3: 'User_talk:',
        4: 'Wikipedia:',
        5: 'Wikipedia_talk:',
        7: 'File_talk:',
        9: 'MediaWiki_talk:',
        11: 'Template_talk:',
        13: 'Help_talk:',
        15: 'Category_talk:',
        101: 'Portal_talk:',
        109: 'Book_talk:',
        119: 'Draft_talk:',
        447: 'Education_Program_talk:',
        711: 'TimedText_talk:',
        829: 'Module_talk:',
        2600: 'Topic:'
    }

    output = []
    for row in query:
        rc_id = row[0]
        page_id = row[1]
        rc_title = row[2].decode('utf-8')
        rc_comment = row[3].decode('utf-8')
        rc_comment = rc_comment[
            3:]  # Truncate beginning part of the edit summary
        rc_comment = rc_comment[:-15]  # Truncate end of the edit summary
        rc_timestamp = row[4].decode('utf-8')
        rc_timestamp = datetime.datetime.strptime(rc_timestamp,
                                                  '%Y%m%d%H%M%S')
        rc_timestamp = rc_timestamp.strftime('%H:%M, %d %B %Y (UTC)')
        page_namespace = row[5]
        page_namespace = namespace[page_namespace]

        # NOTE(review): a new API session is created and logged in once per
        # row; hoisting this above the loop looks safe and much cheaper —
        # confirm and refactor separately.
        session = api.Session("https://en.wikipedia.org/w/api.php",
                              user_agent='WPX Revert Checker')
        session.login(username, password)

        # Check if revision has been reverted
        # (radius 3, within a 172800-second / 48-hour window)
        reverted = reverts.api.check(session, rc_id, page_id, 3, None,
                                     172800, None)
        if reverted is None:
            entry = {
                'title': (page_namespace + rc_title),
                'section': rc_comment,
                'timestamp': rc_timestamp
            }
            output.append(entry)

    # Loading list of WikiProjects signed up to get lists of new discussions
    config = json.loads(
        wptools.query('index', 'select json from config;', None)[0][0])
    if config['defaults'][
            'new_discussions'] == False:  # i.e. if New Discussions is an opt-in system
        whitelist = []  # Whitelisted WikiProjects for new discussion lists
        for project in config['projects']:
            try:
                project['new_discussions']
            except KeyError:
                continue
            else:
                if project['new_discussions'] == True:
                    whitelist.append(project['name'])
    else:
        whitelist = None
    # A whitelist of [] is one where there is a whitelist, but it's just empty.
    # A whitelist of None is for situations where the need for a whitelist has been obviated.

    # Generating list of WikiProjects for each thread
    for thread in output:
        query = wptools.query(
            'index',
            'select distinct pi_project from projectindex where pi_page = %s;',
            (thread['title']))
        thread['wikiprojects'] = []
        for row in query:
            wikiproject = row[0].replace('_', ' ')
            if (whitelist is None) or (wikiproject in whitelist):
                thread['wikiprojects'].append(wikiproject)
        for wikiproject in thread['wikiprojects']:
            saveto = wikiproject + '/Discussions'
            page = pywikibot.Page(bot, saveto)
            # Build the boilerplate header for the /Discussions page.
            # NOTE(review): "§ion=new" below looks like HTML-entity mangling
            # of "&section=new" — confirm against the deployed page source.
            intro_garbage = '{{WPX header|Discussions|color={{{1|#37f}}}}}\n'
            intro_garbage += '{{{{WPX action box|color={{{{{{2|#086}}}}}}|title=Have a question?|content={{{{Clickable button 2|url=//en.wikipedia.org/wiki/Wikipedia_talk:{0}?action=edit§ion=new|Ask the WikiProject|class=mw-ui-progressive mw-ui-block}}}}\n\n{{{{Clickable button 2|Wikipedia talk:{0}|View Other Discussions|class=mw-ui-block}}}}}}}}\n'.format(
                wikiproject[10:].replace(' ', '_'))
            intro_garbage += '{{{{WPX list start|intro={{{{WPX last updated|{0}}}}}}}}}\n\n'.format(
                saveto)
            draft = '<noinclude><div style="padding-bottom:1em;">{{{{Clickable button 2|{0}|Return to WikiProject|class=mw-ui-neutral}}}}</div>\n</noinclude>'.format(
                wikiproject) + intro_garbage
            submission = '{{{{WPX new discussion|color={{{{{{1|#37f}}}}}}|title={0}|section={1}|timestamp={2}}}}}\n'.format(
                thread['title'].replace('_', ' '), thread['section'],
                thread['timestamp'])
            # NOTE(review): the bold markup below opens ''' but never closes
            # it — confirm whether the trailing ''' was lost.
            notification = "* '''[[{0}#{1}|{1}]] on {0}".format(
                thread['title'].replace('_', ' '), thread['section'])
            queue_notification(wikiproject[10:].replace(' ', '_'),
                               notification)
            # Keep only the 14 most recent discussion templates already on
            # the page; older threads are dropped.
            index = mwparserfromhell.parse(page.text)
            index = index.filter_templates()
            templatelist = []
            for i in index:
                if i.name == "WPX new discussion":
                    templatelist.append(str(i))
            templatelist = templatelist[:14]  # Sayonara, old threads!
            page.text = draft + submission
            if len(templatelist) > 3:
                templatelist[
                    2] += "<noinclude>"  # Anything after the third item will not be transcluded
                templatelist[len(templatelist) - 1] += "</noinclude>"
            for i in templatelist:
                page.text += i + "\n"
            page.text += "{{{{WPX list end|more={0}}}}}".format(
                saveto.replace(' ', '_'))
            page.save('New discussion on [[{0}]]'.format(
                thread['title'].replace('_', ' ')),
                      minor=False)

    # Update the Last Updated field with new timestamp
    wptools.query(
        'index',
        'update lastupdated set lu_timestamp = {0} where lu_key = "new_discussions";'
        .format(now), None)
from mw.types.timestamp import Timestamp from itertools import chain, islice from functools import partial, cache import re import pandas as pd from dataclasses import dataclass from multiprocessing import Pool from wikidata.client import Client def parse_wikimedia_timestamp(timestamp): return datetime.fromtimestamp(Timestamp(timestamp).serialize()) apiurl = "https://en.wikipedia.org/w/api.php" session = api.Session(apiurl) wdclient = Client() title = "Sustainable energy" wikilink_re = re.compile( r'''\[\[ # Match two opening brackets (?P<link> # <link>: [^\n\|\]\[\#\<\>\{\}]{0,256} # Text inside link group # everything not illegal, non-greedy # can be empty or up to 256 chars ) (?: # Non-capturing group \| # Match a pipe (?P<anchor> # <anchor>: [^\[]*? # Test inside anchor group: # match everything not an open braket