def get_change_object(change): """ Retrieves a Change from a given change API object, getting user data from the API and saving the User if it doesn't already exist in the DB. """ try: return Change.get(Change.change_id == change['rcid']) except DoesNotExist: size_diff = change['newlen'] - change['oldlen'] timestamp_dt = parse_iso_dt(change['timestamp']) return Change.create(change_id=change['rcid'], change_type=change['type'], user=get_user_object(change['user']), timestamp=timestamp_dt, page=get_page_object(change['pageid']), comment=change['comment'], size_diff=size_diff)
def scrape_mediawiki(): """Scrape MediaWiki for new changes and save them to the database.""" # Create DB tables if they don't already exist. User.create_table(fail_silently=True) Page.create_table(fail_silently=True) Change.create_table(fail_silently=True) # Parse changes from the MediaWiki API. print 'Parsing recent changes from MediaWiki API.' # Properties to request from the MediaWiki API. recent_changes_props = ['user', 'ids', 'title', 'comment', 'sizes', 'timestamp'] # `rctype` specifies that we only want new page and edit page events. recent_changes_action = {'action': 'query', 'list': 'recentchanges', 'rctype': 'new|edit', 'rcprop': '|'.join(recent_changes_props)} new_changes = [] req_num = 1 while True: print 'Request %s' % req_num # Make the API request. resp = request(recent_changes_action) # Parse the response data. resp_data = resp['query']['recentchanges'] req_num += 1 # `changes_exist` is a flag that lets us break out of the double loop. changes_exist = False for resp_item in resp_data: change_id = resp_item['rcid'] try: # `Change.get` either returns a `Change` object if one exists # for `change_id` or raises a `DoesNotExist` exception if a # `Change` object does not exist. Change.get(Change.change_id == change_id) # If we've gotten this far, a `DoesNotExist` exception has not # been raised, so the `Change` object exists for `change_id`. # Set the breakout flag and break out of the inner loop. changes_exist = True break # We're expecting the exception if a `Change` object does not # already exist in the DB for the given change. Ignore it. except DoesNotExist: # Append the response data to the existing data. new_changes.append(resp_item) # Two conditions break the continuing `recentchanges` request loop: # # * Reaching changes that exist in the DB # * Reaching the end of data, indicated by receiving data that doesn't # contain the `query-continue` key if changes_exist: print 'Reached changes that already exist in DB.' break if not 'query-continue' in resp: print 'No more query-continue; reached end of data.' break # The request loop hasn't been broken, and the data contains a # `query-continue` key. Use it to request the next page of data. next_start_point = resp['query-continue']['recentchanges']['rcstart'] recent_changes_action['rcstart'] = next_start_point print ' query-continue: %s' % recent_changes_action['rcstart'] # Done! print 'Done scraping.' # Add all User objects to the DB if they don't already exist. usernames = {change['user'] for change in new_changes} print('Verifying %s users...' % len(usernames)), for username in usernames: get_user_object(username) print 'done.' # Add all Page objects to the DB if they don't already exist. page_ids = {change['pageid'] for change in new_changes} print('Verifying %s pages...' % len(page_ids)), for page_id in page_ids: get_page_object(page_id) print 'done.' # Add all new Change objects to the DB. print('Verifying %s changes...' % len(new_changes)), for change in new_changes: get_change_object(change) print 'done.' print 'Finished!'