def init_source(self):
    response = self.with_requests(events_url + '/events')
    self.source = EventSource(response)

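# A minimal sketch of what with_requests() might look like -- it is not shown in
# this excerpt. It assumes the sseclient-py flavor of EventSource, which wraps an
# already-open streaming requests response rather than a URL.
import requests

def with_requests(self, url):
    # Open a streaming GET so the SSE client can iterate over events as they arrive.
    return requests.get(
        url,
        stream=True,
        headers={'Accept': 'text/event-stream'},
    )
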
# any downtime, we always maintain 100% data coverage (up to the 7-31 days
# that the EventStream historical data is kept).
latest_datetime = db.get_latest_datetime()
if latest_datetime[0]:
    latest_date_formatted = latest_datetime[0].strftime('%Y-%m-%dT%H:%M:%SZ')
    url = base_stream_url + '?since={date}'.format(date=latest_date_formatted)
else:
    url = base_stream_url

if len(sys.argv) > 1 and sys.argv[1] == 'nohistorical':
    url = base_stream_url

for event in EventSource(url):
    if event.event == 'message':
        try:
            change = json.loads(event.data)
        except ValueError:
            continue
        hashtag_matches = hashtag_match(change['comment'])
        if hashtag_matches and valid_edit(change):
            for hashtag in hashtag_matches:
                if db.is_duplicate(hashtag, change['id']):
                    print("Skipped duplicate {hashtag} ({id})".format(
                        hashtag=hashtag, id=change['id']))
                elif valid_hashtag(hashtag):
                    # Check edit_summary length, truncate if necessary

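# hashtag_match(), valid_edit() and valid_hashtag() are defined elsewhere in this
# project and are not shown here. For illustration only, a hypothetical regex-based
# hashtag_match() could look like this:
import re

HASHTAG_RE = re.compile(r'#(\w+)')

def hashtag_match(comment):
    # Return every hashtag-like token in the edit summary, or [] if there are none.
    if not comment:
        return []
    return HASHTAG_RE.findall(comment)
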
import json

from sseclient import SSEClient as EventSource

# rc_url = 'https://stream.wikimedia.org/v2/stream/recentchange'
ref_url = 'https://stream.wikimedia.org/v2/stream/page-links-change'

for event in EventSource(ref_url):
    if event.event == 'message':
        try:
            change = json.loads(event.data)
        except ValueError:
            continue
        # print(change.keys())
        if 'added_links' in change:
            added_ext_links = [l for l in change['added_links'] if '.cn' in l['link']]
            if added_ext_links:
                print('{0} ({1}): {2}'.format(change['page_title'],
                                              change['database'],
                                              added_ext_links))

import json
import time

from sseclient import SSEClient as EventSource


def wiki_stream():
    url = 'https://stream.wikimedia.org/v2/stream/revision-create'
    domain = {}
    user = {}
    start_time = time.time()
    limit = 60
    # Truncate the scratch file, then collect raw events into it for one minute.
    open("file1", "w").close()
    for event in EventSource(url):
        if (time.time() - start_time) > limit:
            print("*********************** Last 1 Min Report ***********************")
            print("\n")
            break
        if event.event == 'message':
            try:
                out_file = open("file1", "a+")
                out_file.write(event.data + '\n')
                out_file.close()
            except ValueError:
                pass

    # Count how many updates each wiki domain received.
    with open('file1') as f:
        content = f.readlines()
    for i in content:
        if not i.strip():
            continue
        data = json.loads(i)
        key = data['meta']['domain']
        if key in domain:
            domain[key] = domain[key] + 1
        else:
            domain[key] = 1
    domain = dict(sorted(domain.items(), key=lambda item: item[1]))
    print(" Domain Report")
    print("\n")
    print("Total number of Wikipedia Domains Updated:", len(domain))
    for d in domain:
        print(d, " : ", domain[d], " Pages updated")
    print("\n")

    # Track the highest reported edit count per user on en.wikipedia.org.
    with open('file1') as f:
        content = f.readlines()
    for i in content:
        if not i.strip():
            continue
        data = json.loads(i)
        if data['meta']['domain'] != 'en.wikipedia.org':
            continue
        usr = data['performer']['user_text']
        edit_count = data['performer'].get('user_edit_count')
        if edit_count is None:
            continue
        if usr in user:
            user[usr] = max(user[usr], edit_count)
        else:
            user[usr] = edit_count
    user = dict(sorted(user.items(), key=lambda item: item[1]))
    print(" User Report")
    print("\n")
    print("Users who made changes to en.wikipedia.org")
    print("\n")
    for key, value in user.items():
        print(key, " : ", value)

for row in batch:
    cache.add(row.meta_id)
rs.close()
print(
    f"Initial cache size is: {len(cache)}, in memory: {getsizeof(cache)/1024/1024} MBytes"
)

# get last date
max_date = session.query(func.max(Event.meta_dt)).scalar()
date_from = max_date.strftime('%Y-%m-%dT%H:%M:%SZ')
print(f"Reload from date: {date_from}")

url = f'https://stream.wikimedia.org/v2/stream/recentchange?since={date_from}'
print(f"Using SSE URL {url}")
for event in EventSource(url, retry=1000, chunk_size=81920000):
    try:
        change = json.loads(event.data)
    except ValueError as ex:
        print(ex, event.data)
    else:
        if change['meta']['id'] not in cache:
            buffer.append(
                Event(meta_id=change['meta']['id'],
                      meta_dt=change['meta']['dt'],
                      data=change))
            cache.add(change['meta']['id'])
        else:
            print(
                f"ID {change['meta']['id']} {change['meta']['dt']} already in cache"
            )

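# The logic that writes the buffered Event rows to the database is not part of this
# excerpt. A minimal sketch of one way to flush the buffer, assuming `session` is the
# same SQLAlchemy session used above; the batch size is an illustrative choice.
FLUSH_EVERY = 1000  # hypothetical threshold, not taken from the original code

def flush_buffer(session, buffer):
    if len(buffer) < FLUSH_EVERY:
        return
    session.bulk_save_objects(buffer)  # insert all buffered Event rows in one batch
    session.commit()
    buffer.clear()
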
    latest_date_formatted = latest_datetime[0].strftime('%Y-%m-%dT%H:%M:%SZ')
    url = base_stream_url + '?since={date}'.format(date=latest_date_formatted)
else:
    url = base_stream_url

if len(sys.argv) > 1 and sys.argv[1] == 'nohistorical':
    url = base_stream_url

for event in EventSource(
        url,
        # The retry argument sets the delay between retries in milliseconds.
        # We're setting this to 5 minutes.
        # There's no way to set the max_retries value with this library,
        # but since it depends upon requests, which in turn uses urllib3
        # by default, we get a default max_retries value of 3.
        retry=300000,
        # The timeout argument gets passed to requests.get.
        # An integer value sets connect (socket connect) and
        # read (time to first byte / since last byte) timeout values.
        # A tuple value sets each respective value independently.
        # https://requests.readthedocs.io/en/latest/user/advanced/#timeouts
        timeout=(3.05, 30)):
    if event.event == 'message':
        try:
            change = json.loads(event.data)
        except ValueError:
            continue
        hashtag_matches = hashtag_match(change['comment'])
        if hashtag_matches and valid_edit(change):
            for hashtag in hashtag_matches:

def streaming(self):
    # construct file and fout
    print("Working on a new file..")
    print("user_text,wikiproject,type,title,ns,timestamp,minor", file=self.fout)
    for event in EventSource(self.url):
        if event.event == 'message':
            try:
                change = json.loads(event.data)
            except ValueError:
                pass
            else:
                if change['wiki'] != "enwiki":
                    continue
                if change['type'] != 'edit':
                    continue
                username = change.get('user').encode('utf-8')
                title = change.get('title').encode('utf-8')
                timestamp = change.get('timestamp')
                ns = change.get('namespace')
                minor = 1 if change.get('minor') else 0
                if username in self.users_bonds:
                    type = 'bonds'
                elif username in self.users_topics:
                    type = 'topics'
                elif username in self.users_rule:
                    type = 'rule'
                elif username in self.users_cf:
                    type = 'cf'
                elif username in self.newcomers:
                    type = 'newcomer'
                elif username in self.organizers:
                    type = 'organizer'
                else:
                    # not recommended users...
                    continue
                wikiproject = None
                if username in self.user_project:
                    wikiproject = self.user_project[username]
                else:
                    # TODO: something is wrong here...
                    pass
                print("{}**{}**{}**{}**{}**{}**{}".format(
                    username, wikiproject, type, title, ns, timestamp, minor),
                    file=self.fout)
                print("{}**{}**{}**{}**{}**{}**{}".format(
                    username, wikiproject, type, title, ns, timestamp, minor))
                self.fout.flush()

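# The rows above are written with a '**' delimiter while the header line is
# comma-separated, so reading the file back needs a custom separator. A hypothetical
# reader with pandas (the filename is assumed here, since self.fout is opened elsewhere):
import pandas as pd

df = pd.read_csv(
    "stream_output.csv",          # assumed filename
    sep=r"\*\*",                  # regex separators require the python engine
    engine="python",
    skiprows=1,                   # skip the comma-separated header line
    names=["user_text", "wikiproject", "type", "title", "ns", "timestamp", "minor"],
)
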
def run(self):
    '''
    Grab events from the stream until shutdown.
    '''

    ## SQL query to identify a redirect, disambiguation page, and list
    page_check_query = '''SELECT ap.page_is_redirect,
                                 IFNULL(c1.cl_from, 0) AS page_is_disambig,
                                 IFNULL(c2.cl_from, 0) AS page_is_list
                          FROM revision r
                          JOIN page ap
                          ON r.rev_page=ap.page_id
                          LEFT JOIN page tp
                          ON (ap.page_title=tp.page_title
                              AND tp.page_namespace=1)
                          LEFT JOIN (
                              SELECT cl_from
                              FROM categorylinks
                              WHERE cl_to='All_article_disambiguation_pages') AS c1
                          ON c1.cl_from=ap.page_id
                          LEFT JOIN (
                              SELECT cl_from
                              FROM categorylinks
                              WHERE cl_to REGEXP "^List-Class.*") AS c2
                          ON c2.cl_from=tp.page_id
                          WHERE r.rev_id=%(rev_id)s
                          LIMIT 1'''

    ## SQL query to insert predictions for a given revision into
    ## our local database table, formatted so that the two groups
    ## of prediction results are easy to spot.
    insert_query = '''INSERT INTO page_predictions
                      VALUES (%s, %s,
                              %s, %s, %s, %s, %s,
                              %s, %s, %s, %s, %s, %s, %s)'''

    # Set up a signal handler for SIGUSR1
    signal.signal(signal.SIGUSR1, self.handle_signal)

    # Create the ORES session variable
    ores_session = ORESSession(self.ores_url, self.ores_user_agent)

    ## Connect to the database
    wiki_db_conn = db.connect("{}.labsdb".format(self.wiki),
                              "{}_p".format(self.wiki),
                              self.db_conf)
    if not wiki_db_conn:
        logging.error("unable to connect to Wiki database")
        return()

    local_db_conn = db.connect(self.local_db_host,
                               self.local_db_name,
                               self.db_conf)
    if not local_db_conn:
        logging.error("unable to connect to tools database")
        return()

    logging.info("Running...")
    for event in EventSource(self.rc_url):
        if self.shutdown:
            break

        data = self.filter_event(event)
        if not data:
            continue

        ## Turn the timestamp into a datetime object
        data['timestamp'] = dt.datetime.fromtimestamp(
            data['timestamp'], tz=dt.timezone.utc)

        logging.info('{user} created {title}'.format_map(data))

        ## Check that it's not a redirect, not a disambiguation page,
        ## and not a list page.
        page_is_redirect = 0
        page_is_disambig = 0
        page_is_list = 0

        try:
            with db.cursor(wiki_db_conn) as db_cursor:
                db_cursor.execute('SELECT * FROM page LIMIT 1')
        except MySQLdb.OperationalError as e:
            wiki_db_conn = db.connect("{}.labsdb".format(self.wiki),
                                      "{}_p".format(self.wiki),
                                      self.db_conf)

        with db.cursor(wiki_db_conn, 'dict') as db_cursor:
            db_cursor.execute(page_check_query,
                              {'rev_id': data['revision']['new']})
            for row in db_cursor:
                page_is_redirect = row['page_is_redirect']
                page_is_disambig = row['page_is_disambig']
                page_is_list = row['page_is_list']

        if page_is_redirect or page_is_disambig or page_is_list:
            continue

        logging.info("{user} created {title} which is not a redirect, not a disambiguation page, and not a list".format_map(data))

        try:
            with db.cursor(local_db_conn) as db_cursor:
                db_cursor.execute('SELECT * FROM page_predictions LIMIT 1')
        except MySQLdb.OperationalError as e:
            local_db_conn = db.connect(self.local_db_host,
                                       self.local_db_name,
                                       self.db_conf)

        with db.cursor(local_db_conn, 'dict') as db_cursor:
            ## Grab the wp10 and draftquality predictions from ORES
            ## for the given revision and store it in the database:
            for prediction in ores_session.score(data['wiki'],
                                                 self.ores_models,
                                                 [data['revision']['new']]):
                try:
                    draft_res = prediction['draftquality']['score']
                    wp10_res = prediction['wp10']['score']
                except KeyError:
                    logging.warning('unexpected ORES result')
                    continue

                try:
                    db_cursor.execute(
                        insert_query,
                        (data['revision']['new'],
                         data['timestamp'],
                         draft_res['prediction'],
                         draft_res['probability']['spam'],
                         draft_res['probability']['vandalism'],
                         draft_res['probability']['attack'],
                         draft_res['probability']['OK'],
                         wp10_res['prediction'],
                         wp10_res['probability']['Stub'],
                         wp10_res['probability']['Start'],
                         wp10_res['probability']['C'],
                         wp10_res['probability']['B'],
                         wp10_res['probability']['GA'],
                         wp10_res['probability']['FA']))
                    local_db_conn.commit()
                    print("inserted {}, created by {}, draftquality prediction {}, wp10 prediction {}".format(
                        data['title'], data['user'],
                        draft_res['prediction'], wp10_res['prediction']))
                except Exception as e:
                    print(e)

    ## ok, done
    local_db_conn.close()
    wiki_db_conn.close()
    return()

def stream(self, start_date):
    # construct file and fout
    print("Working on a new file..")
    user_cnt = 0
    register_cnt = 0
    unit = 0
    if self.time_gap >= 60 * 60 * 24:
        unit = start_date.day
    elif self.time_gap >= 60 * 60:
        unit = start_date.hour
    elif self.time_gap >= 60:
        unit = start_date.minute
    self.fout_newcomers = open(
        self.output_dir + "/" + self.newcomers_file + str(unit) + ".csv", "w")
    self.fout_newreg = open(
        self.output_dir + "/" + self.new_registered_file + str(unit) + ".csv", "w")
    print("user_cnt**user_text**article**timestamp", file=self.fout_newcomers)
    print("user_cnt**user_text**timestamp", file=self.fout_newreg)
    for event in EventSource(self.url):
        # check if times up for the next file (over a day)
        if self.times_up(start_date):
            break
        if event.event == 'message':
            try:
                change = json.loads(event.data)
            except ValueError:
                pass
            else:
                if change['wiki'] != "enwiki":
                    continue
                if (change['type'] == "log"
                        and change['log_type'] == "newusers"
                        and change['log_action'] == "create"
                        and 'user' in change
                        and change['user'] is not None):
                    self.NEW_USERS[change['user']] = 0
                    # TODO: add on edit hour, day, and month
                    register_cnt += 1
                    print("Registered {}. {}".format(
                        register_cnt, change.get('user').encode('utf8')))
                    print("{}**{}**{}".format(register_cnt,
                                              change['user'],
                                              change['timestamp']),
                          file=self.fout_newreg)
                    self.fout_newreg.flush()
                elif change['type'] in ('edit', 'new'):
                    username = change.get('user')
                    if username in self.NEW_USERS:
                        if self.NEW_USERS[username] == 0 and change['namespace'] == 0:
                            self.NEW_USERS[username] += 1
                            # TODO: add on edit hour, day, and month
                            user_cnt += 1
                            print("Edited {}. {} edited {}.".format(
                                user_cnt,
                                username.encode('utf8'),
                                change['title'].encode('utf8')))
                            # Write all four columns declared in the header line.
                            print("{}**{}**{}**{}".format(user_cnt,
                                                          username,
                                                          change['title'],
                                                          change['timestamp']),
                                  file=self.fout_newcomers)
                            self.fout_newcomers.flush()

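# times_up() is referenced above but not shown in this excerpt. A hypothetical
# implementation, assuming self.time_gap is the window length in seconds and
# start_date is a naive UTC datetime:
import datetime as dt

def times_up(self, start_date):
    elapsed = (dt.datetime.utcnow() - start_date).total_seconds()
    return elapsed >= self.time_gap
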