def iter_project_pages() -> Iterator[pywikibot.Page]:
    query = """
        SELECT page_namespace - 1, page_title
        FROM templatelinks
        JOIN page ON tl_from = page_id
        WHERE tl_title = "WikiProject_Essays"
        and tl_namespace = 10
        and page_namespace in (3, 5, 13)
    """
    conn = toolforge.connect("enwiki_p")
    with conn.cursor() as cur:
        rows = cur.execute(query)
        logger.info(f"{rows} pages found")
        data = cast(Iterable[Tuple[int, bytes]], cur.fetchall())

    # XXX: Work around pywikibot bug T67262
    # (The query selects talk namespaces 3, 5, 13 and subtracts 1, so the
    # subject-namespace keys here are 2, 4, 12.)
    namespaces = {2: "User:", 4: "Wikipedia:", 12: "Help:"}
    progress = -1
    for i, (ns, title) in enumerate(data):
        percent = math.floor(i / rows * 100)
        if (percent > progress) and (percent % 5 == 0):
            logger.info(f"Analyzing pages: {percent}% complete")
            progress = percent
        yield pywikibot.Page(
            site, title=namespaces[ns] + str(title, encoding="utf-8"))
    logger.info("Analyzing pages: 100% complete")
def iter_pages_and_targets() -> Iterator[Tuple[pywikibot.Page, str]]:
    conn = toolforge.connect("enwiki_p")
    query = """
        SELECT CONCAT("Talk:", com.page_title) as title,
               CONCAT("Talk:", target.page_title) as target
        FROM page com
        JOIN page target
            ON target.page_title = REPLACE(com.page_title, "/Comments", "")
        WHERE com.page_namespace = 1
        AND target.page_namespace = 1
        AND com.page_title LIKE "%/Comments"
        AND com.page_is_redirect = 0
        AND com.page_len = 0
        AND target.page_is_redirect = 0
    """
    with conn.cursor() as cur:
        cur.execute(query)
        data = cur.fetchall()
    for title, target in data:
        yield (
            pywikibot.Page(site, str(title, encoding="utf-8")),
            str(target, encoding="utf-8").replace("_", " "),
        )
def db_get_usage(cat: pywikibot.Category, depth: int) -> UsageResult:
    query = """
        SELECT page_title, count(*)
        FROM categorylinks
        JOIN page ON cl_from = page_id
        LEFT JOIN globalimagelinks ON page_title = gil_to
        JOIN image ON img_name = page_title
        WHERE cl_to IN %(cats)s
        AND img_major_mime = "image"
        AND img_minor_mime != "svg+xml"
        GROUP BY page_title
        ORDER BY count(*) DESC
    """
    conn = toolforge.connect("commonswiki")
    with conn.cursor() as cur:
        total = cur.execute(
            query,
            args={
                "cats": [
                    # renamed from `cat` to avoid shadowing the parameter
                    subcat.title(with_ns=False, underscore=True)
                    for subcat in list_cats(cat, depth)
                ]
            },
        )
        data = cast(List[Tuple[bytes, int]], cur.fetchall())
    return UsageResult(
        [
            FileUsage(f"File:{str(page, encoding='utf-8')}", count)
            for page, count in data
        ][:200],
        total,
        [],
    )
def users():
    conn = toolforge.connect('commonswiki')
    with conn.cursor() as cur:
        sql = ('SELECT actor_name, count(*) FROM actor_revision JOIN revision '
               'ON rev_actor=actor_id JOIN change_tag ON ct_rev_id=rev_id '
               'join change_tag_def ON ct_tag_id=ctd_id WHERE '
               'ctd_name="OAuth CID: 821" and actor_user>0 group'
               ' by actor_user order by count(*) desc')
        cur.execute(sql)
        data = cur.fetchall()
    users = []
    for user in data:
        rowres = []
        for item in user:
            if isinstance(item, bytes):
                rowres.append(item.decode('utf-8'))
            else:
                rowres.append(item)
        users.append(rowres)
    with conn.cursor() as cur:
        sql = ('SELECT count(*) FROM actor_revision JOIN revision ON '
               'rev_actor=actor_id JOIN change_tag ON ct_rev_id=rev_id'
               ' join change_tag_def ON ct_tag_id=ctd_id WHERE '
               'ctd_name="OAuth CID: 821" AND actor_id>0')
        cur.execute(sql)
        data = cur.fetchall()
    total = data[0][0]
    return flask.render_template('users.html', users=users, total=total)
def frwiki_query(countryname, country, language):
    printInfo('fr', countryname, country, language)
    if cached:
        query_res = get_quarry('30043', '0')
        frwikidata = {}
        for line in query_res:
            iws, fr, en, wd = line
            if en is None:
                frwikidata.update({wd: [iws, ['fr', fr.replace('_', ' ')]]})
            else:
                frwikidata.update({wd: [iws, ['en', en]]})
        final2 = sort_list(frwikidata)
        saveToDB(final2, language, country, 'fr')
        return frwikidata
    else:
        query_res = run_query(SQL_frwiki.format(language, country),
                              toolforge.connect('frwiki_p', 'analytics'))
        result_json = [encode_all_items(f) for f in query_res]
        frwikidata = {}
        for line in result_json:
            iws, fr, en, wd = line
            if en is None:
                frwikidata.update({wd: [iws, ['fr', fr.replace('_', ' ')]]})
            else:
                frwikidata.update({wd: [iws, ['en', en]]})
        final2 = sort_list(frwikidata)
        saveToDB(final2, language, countryname, 'fr')
        return frwikidata
def enwiki_query(countryname, country, language):
    printInfo('en', countryname, country, language)
    if cached:
        query_res = get_quarry('30043', '1')
        results = {
            f[2]: [f[0], ['en', f[1].replace('_', ' ')]]
            for f in query_res
        }
        final2 = sort_list(results)
        saveToDB(final2, language, country, 'en')
        return results
    else:
        query_res = run_query(SQL_enwiki.format(country, language),
                              toolforge.connect('enwiki_p', 'analytics'))
        result_json = [encode_all_items(f) for f in query_res]
        results = {
            f[2]: [f[0], ['en', f[1].replace('_', ' ')]]
            for f in result_json
        }
        final2 = sort_list(results)
        saveToDB(final2, language, countryname, 'en')
        return results
def test_connect(self):
    tests = [
        (['enwiki_p'], {
            'database': 'enwiki_p',
            'host': 'enwiki.web.db.svc.eqiad.wmflabs',
        }),
        (['enwiki'], {
            'database': 'enwiki_p',
            'host': 'enwiki.web.db.svc.eqiad.wmflabs',
        }),
        (['enwiki', 'analytics'], {
            'database': 'enwiki_p',
            'host': 'enwiki.analytics.db.svc.eqiad.wmflabs',
        }),
        (['enwiki_p', 'labsdb'], {
            'database': 'enwiki_p',
            'host': 'enwiki.labsdb',
        }),
    ]
    common_expects = {
        'read_default_file': os.path.expanduser("~/replica.my.cnf"),
        'charset': 'utf8mb4',
    }
    for args, expects in tests:
        with unittest.mock.patch('toolforge._connect') as mm:
            mm.return_value = None
            exp = common_expects.copy()
            exp.update(expects)
            conn = toolforge.connect(*args)
            assert conn is None
            mm.assert_called_once_with(**exp)
def suggest_articles(wiki, limit):
    w = Wiki.query.filter_by(dbname=wiki).first()
    conn = toolforge.connect(wiki)
    with conn.cursor() as cur:
        if len(w.excluded_articles_json) > 0:
            regexp = r'^(%s)' % "|".join(w.excluded_articles_json)
        else:
            regexp = r'^$'
        cur.execute(
            '''select page_id, page_title, page_len/count(*) as bytes_per_link
            from pagelinks join page on page_id=pl_from
            where page_len>%s and page_namespace=0
            and page_id not in
                (select tl_from from templatelinks where tl_title=%s)
            and page_id not in
                (select pp_page from page_props
                 where pp_propname="disambiguation")
            and page_title not rlike %s
            group by page_id having bytes_per_link>%s limit %s;''',
            (w.minimum_length, w.template, regexp, w.treshold, limit))
        data = cur.fetchall()
    for row in data:
        if SuggestedArticle.query.filter_by(
                page_id=row[0]).first() is not None:
            continue
        s = SuggestedArticle(page_id=row[0],
                             bytes_per_link=row[2],
                             wiki_id=w.id)
        db.session.add(s)
    db.session.commit()
def check_user_exists(username):
    message = 'User with username: ' + str(username) + ' exists'
    success = True
    try:
        conn = toolforge.connect('enwiki')
        with conn.cursor() as cursor:
            # finding how many users with the given username exist
            cursor.execute(
                'SELECT count(*) FROM user WHERE user_name'
                ' = %s ', [str(username)])
            result = cursor.fetchone()
            if result[0] == 0:
                success = False
                message = ('User with username: ' + str(username) +
                           ' does not exist')
    except Exception:
        message = 'Error fetching data from database for user: ' +\
            str(username)
        success = False
    finally:
        # NB: cursor/conn are unbound here if toolforge.connect() raised
        cursor.close()
        conn.close()
    logger.info(message)
    return success
def run_query():
    query = """
        SELECT
            actor_name as `username`,
            user_editcount as `edit_count`,
            user_registration as `registration_date`,
            NOT(ug_group IS NULL) as `ext_conf`,
            COUNT(*) as `redwarn_edits`,
            COUNT(*)/user_editcount*100 as `redwarn_pct`,
            ipb_sitewide as `blocked`
        FROM revision_userindex
        JOIN change_tag ON ct_rev_id = rev_id
        JOIN actor_revision ON rev_actor = actor_id
        JOIN `user` ON actor_user = user_id
        LEFT JOIN user_groups
            ON ug_user = user_id AND ug_group = "extendedconfirmed"
        LEFT JOIN ipblocks ON user_id = ipb_user
        WHERE ct_tag_id = 577 -- RedWarn
        GROUP BY actor_name
        ORDER BY user_registration DESC
    """
    conn = toolforge.connect("enwiki_p")
    with conn.cursor() as cur:
        cur.execute(query)
        data = cur.fetchall()
    for line in data:
        yield Row(
            username=f"[[User:{str(line[0], encoding='utf-8')}]]",
            edit_count=line[1],
            registration_date=datetime.datetime.strptime(
                str(line[2], encoding="utf-8"), "%Y%m%d%H%M%S"),
            ext_conf=bool(line[3]),
            redwarn_edits=line[4],
            redwarn_pct=line[5],
            blocked=bool(line[6]),
        )
def get_dump(database=DEFAULT_DATABASE, start=None, end=None, mode=None):
    """Returns result object for bulk aggregation

    Args:
        database: name of database to use
        start: Start hour as "2018-10-10T01" or a duration like "1d"
        end: End hour as "2018-10-10T01" or None
        mode: How to manipulate results
            views: (default) Report raw views in ``views`` field
            logprobs: Estimate log probabilities in ``logprobs`` and
                ``default_logprob`` field
    """
    logger = logging.getLogger(__name__)
    with toolforge.connect(database, cluster="tools") as cursor:
        (start, end) = convert_start_and_end(cursor, start, end)
        logger.info(f"start={start}, end={end}")
        hours = get_hours(cursor, start, end)
        logger.info(f"{len(hours)} hours")
        (max_qid, total_views) = get_summary(cursor, start, end)
        summary = dict(
            start=start,
            end=end,
            hours=hours,
            max_qid=max_qid,
            total_views=total_views,
        )
        data = list(aggregate_by_qid(cursor, start, end))
    return (summary, data)
def iter_files():
    conn = toolforge.connect("commonswiki")
    query = """
        SELECT CONCAT("File:", page_title)
        FROM page
        JOIN categorylinks ON cl_from = page_id
        WHERE cl_to IN (
            SELECT page_title
            FROM page
            JOIN categorylinks ON cl_from = page_id
            WHERE cl_to = "Images_from_Wiki_Loves_Africa_2020"
            and cl_type = "subcat"
            and cl_sortkey != " "
        )
        and page_id NOT IN (
            SELECT cl_from
            FROM categorylinks
            WHERE cl_to = "Images_from_Wiki_Loves_Africa_2020"
        )
        and page_namespace = 6
    """
    with conn.cursor() as cur:
        cur.execute(query)
        for title in cur.fetchall():
            yield pywikibot.Page(site, str(title[0], encoding="utf-8"))
def recentclaims(user, prop_name, limit):
    prop = "%%%s%%" % prop_name
    limit = int(limit)
    conn = toolforge.connect('wikidatawiki')
    with conn.cursor() as cur:
        sql = '''
            select rev_page, page_title, rev_id, rev_comment
            from revision_userindex
            join page on page_id=rev_page
        '''
        if user:
            sql += '''
                where rev_user_text=%s
                and rev_comment like "%%claim-%%"
                and rev_comment like %s
                order by rev_timestamp desc limit %s;
            '''
            parameters = (user, prop, limit)
        else:
            sql += ('where rev_comment like "%%claim-%%" '
                    'and rev_comment like %s '
                    'order by rev_timestamp desc limit %s;')
            parameters = (prop, limit)
        cur.execute(sql, parameters)
        data = cur.fetchall()
    result = []
    for row in data:
        mid_result = re.search(r"Property:P18\]\]: (.*)$",
                               row[3].decode('utf-8'))
        if not mid_result:
            continue
        value = mid_result.groups()[0]
        result.append({
            "page_id": row[0],
            "qid": row[1],
            "rev_id": row[2],
            "property": request.args.get('property'),
            "value": value
        })
    return result
def main():
    parser = argparse.ArgumentParser(parents=[config.logging_cli()])
    parser.add_argument('--progress',
                        help='log progress every N suspects',
                        type=int,
                        metavar='N')
    parser.add_argument('--list_features',
                        action='store_true',
                        help='print all available features and exit')
    parser.add_argument('--feature',
                        action='append',
                        dest='features',
                        help='''Feature to evaluate.  May be repeated.
                        Features are evaluated in the order given; it is the
                        responsibility of the user to include the required
                        dependencies, and in the right order.  If no features
                        are specified, defaults to all features.  See
                        --list_features to get a list of available
                        features.''')
    args = parser.parse_args()

    if args.list_features:
        print_features()
        return

    features = args.features or ['reg_time',
                                 'first_contrib_time',
                                 'first_contrib_interval',
                                 'live_edit_count',
                                 'deleted_edit_count',
                                 'block_count']
    check_valid_features(features)

    config.configure_logging(args)
    logger = logging.getLogger('get_features')
    logger.info("Starting work")
    logger.info("job-name = %s", args.job_name)
    logger.info("Using features: %s", features)
    start_time = datetime.datetime.now()

    db_connection = toolforge.connect('enwiki')
    count = 0
    for line in sys.stdin:
        initial_data = json.loads(line)
        suspect = Suspect(db_connection, initial_data)
        suspect.add_features(features)
        print(json.dumps(suspect.clean_data()))
        count += 1
        if args.progress and (count % args.progress == 0):
            logger.info("Processed %s suspects", count)

    finish_time = datetime.datetime.now()
    elapsed_time = finish_time - start_time
    logger.info("Processed %d suspects in %s", count, elapsed_time)
def users():
    conn = toolforge.connect('commonswiki')
    with conn.cursor() as cur:
        sql = ('select rev_user_text, count(*) from change_tag join revision '
               'on ct_rev_id=rev_id where ct_tag="OAuth CID: 821" and '
               'rev_user>0 group by rev_user order by count(*) desc;')
        cur.execute(sql)
        data = cur.fetchall()
    users = []
    for user in data:
        rowres = []
        for item in user:
            # isinstance() needs a type, not the bytes literal b'a'
            if isinstance(item, bytes):
                rowres.append(item.decode('utf-8'))
            else:
                rowres.append(item)
        users.append(rowres)
    with conn.cursor() as cur:
        sql = ('select count(*) from change_tag join revision on ct_rev_id='
               'rev_id where ct_tag="OAuth CID: 821" and rev_user>0;')
        cur.execute(sql)
        data = cur.fetchall()
    total = data[0][0]
    return flask.render_template('users.html',
                                 users=users,
                                 total=total,
                                 logged=logged(),
                                 username=getusername())
def connect_to_replicas_database(db_name, replicas_port=None):
    """
    Establishes connection to Wikimedia replicas database in Toolforge.

    :param db_name: name of the database.
    :param replicas_port: port for connecting to db through ssh tunneling,
        if used.
    :return: pymysql.connection to the database.
    """
    try:
        if replicas_port:
            conn = pymysql.connect(
                host="127.0.0.1",
                port=replicas_port,
                user=cfg['user_credits']['user'],
                password=cfg['user_credits']['password'],
                connect_timeout=1000,
            )
            if db_name[-2:] != "_p":
                db_name = db_name + "_p"
            with conn.cursor() as cur:
                cur.execute("use " + db_name)
            conn.commit()
        else:
            conn = toolforge.connect(
                dbname=db_name, connect_timeout=1000, cluster="analytics"
            )
        return conn
    except pymysql.err.OperationalError as err:
        print("Failure: Please establish connection to Toolforge")
        print("Error: ", err)
        exit(1)
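# Illustrative usage sketch for connect_to_replicas_database (not part of the
# original source; the database name and port are example values). The
# tunnel-port variant assumes an SSH tunnel to the replicas is already
# listening on the local port, e.g. something like:
#   ssh -N -L 3306:enwiki.analytics.db.svc.wikimedia.cloud:3306 login.toolforge.org
conn = connect_to_replicas_database("enwiki")           # running on Toolforge
# conn = connect_to_replicas_database("enwiki", 3306)   # via local SSH tunnel
with conn.cursor() as cur:
    cur.execute("SELECT COUNT(*) FROM page WHERE page_namespace = 0")
    print(cur.fetchone()[0])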
def calculateUserContributionRank(username):
    results = {'success': True, 'user_rank': -1, 'complete_edit_count': -1}
    message = "User's contribution rank calculated successfully. Results:"
    try:
        conn = toolforge.connect('enwiki')
        with conn.cursor() as cursor:
            # finding how many users have greater editcount
            # than that of given username
            cursor.execute(
                'SELECT count(*), useredits.user_editcount FROM user, '
                '(SELECT * FROM user WHERE user_name = %s ) AS useredits '
                'WHERE user.user_editcount > useredits.user_editcount',
                [username])
            result = cursor.fetchone()
    except Exception:
        results['success'] = False
        message = 'Error occurred while calculating contribution ' \
                  'distribution for all users. Results:'
    finally:
        cursor.close()
        conn.close()
    if results['success']:
        results['user_rank'] = result[0] + 1
        results['complete_edit_count'] = result[1]
    logger.info("{0}{1}".format(message, str(results)))
    return results
def get_user_properties(user: str, dbname: str) -> UserProps:
    """Get signature and fancysig values for a user from the replica db"""
    logger.info("Getting user properties")
    conn = toolforge.connect(f"{dbname}_p")
    with conn.cursor() as cur:
        cur.execute(
            """
            SELECT up_property, up_value
            FROM user_properties
            WHERE up_user = (SELECT user_id
                             FROM `user`
                             WHERE user_name = %s)
            """,
            [user],
        )
        resultset = cast(List[Tuple[bytes, bytes]], cur.fetchall())
    logger.debug(resultset)
    data: Dict[str, str] = {
        key.decode("utf-8"): value.decode("utf-8") for key, value in resultset
    }
    return UserProps(
        nickname=data.get("nickname", ""),
        fancysig=bool(int(data.get("fancysig", "0"))),
    )
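# Minimal caller sketch for get_user_properties (the username is an assumed
# example). In MediaWiki's preferences, "nickname" only carries custom
# signature wikitext when the "fancysig" flag is set, so callers should
# branch on that flag before treating the nickname as raw markup.
props = get_user_properties("Example", "enwiki")
if props.fancysig:
    print(f"Custom signature wikitext: {props.nickname}")
else:
    print("No custom signature; MediaWiki renders the default one")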
def suggest_articles(wiki, limit):
    w = Wiki.query.filter_by(dbname=wiki).first()
    conn = toolforge.connect(wiki)
    with conn.cursor() as cur:
        # NB: a two-argument floor() is presumably a project-local helper;
        # math.floor() takes a single argument
        treshold = floor(w.bytes_per_link_avg, 2)
        cur.execute(
            '''select page_id, page_title, page_len/count(*) as bytes_per_link
            from pagelinks join page on page_id=pl_from
            where page_len>%s and page_namespace=0
            group by page_id having bytes_per_link>%s limit %s;''',
            (w.minimum_length, treshold, limit))
        data = cur.fetchall()
    for row in data:
        bpl_min = w.bytes_per_link_avg
        bpl_max = w.bytes_per_link_max + w.tolerance
        if row[2] < bpl_min:
            probability = 0
        elif row[2] > bpl_max:
            probability = 100
        else:
            probability = (row[2] - bpl_min) / bpl_max * 100
        s = SuggestedArticle(page_id=row[0],
                             bytes_per_link=row[2],
                             probability=probability,
                             wiki_id=w.id)
        db.session.add(s)
    db.session.commit()
def revert():
    conn = toolforge.connect('wikidatawiki')
    rev_id = request.args.get('rev_id')
    with conn.cursor() as cur:
        cur.execute('select rev_page from revision where rev_id=%s', rev_id)
        rev_page = cur.fetchall()[0][0]
    payload = {
        "action": "query",
        "format": "json",
        "meta": "tokens",
        "type": "csrf"
    }
    r = requests.get(app.config['API_MWURI'], params=payload, auth=get_auth())
    token = r.json()['query']['tokens']['csrftoken']
    payload = {
        "action": "edit",
        "format": "json",
        "pageid": rev_page,
        "undo": request.args.get('rev_id'),
        "token": token
    }
    r = requests.post(app.config['API_MWURI'], data=payload, auth=get_auth())
    return jsonify({
        'status': 'ok',
        'message': 'reverted',
        'rev_id': rev_id,
        "response": r.json(),
    })
def iter_files_and_users(
        days, delay_mins=30) -> Iterator[Tuple[pywikibot.Page, pywikibot.Page]]:
    start_ts = ((datetime.datetime.utcnow() -
                 datetime.timedelta(days=days)).replace(
                     hour=0, minute=0, second=0).strftime("%Y%m%d%H%M%S"))
    end_ts = (datetime.datetime.utcnow() -
              datetime.timedelta(minutes=delay_mins)).strftime("%Y%m%d%H%M%S")
    query = """
        SELECT p0.page_namespace, p0.page_title,
               CONCAT("User talk:", actor_name)
        FROM categorylinks
        JOIN page p0 ON cl_from = p0.page_id
        JOIN logging_logindex ON log_page = p0.page_id AND log_type = "upload"
        JOIN actor_logging ON log_actor = actor_id
        WHERE cl_to = "Files_with_no_machine-readable_license"
        AND log_timestamp > %(start_ts)s
        AND log_timestamp < %(end_ts)s
        AND "Deletion_template_tag" NOT IN (
            SELECT tl_title
            FROM templatelinks
            WHERE tl_namespace = 10 AND tl_from = p0.page_id
        )
        AND (
            log_action = "upload"
            OR actor_id IN (
                SELECT log_actor
                FROM logging_logindex
                WHERE log_page = p0.page_id
                AND log_type = "upload"
                AND log_action = "upload"
            )
        )
        GROUP BY page_id
        ORDER BY actor_id
    """
    conn = toolforge.connect("commonswiki_p", cluster=cluster)
    with conn.cursor() as cur:
        cur.execute(query, args={"start_ts": start_ts, "end_ts": end_ts})
        data = cast(Iterator[Tuple[int, bytes, bytes]], cur.fetchall())
    for ns, title, user in data:
        page = pywikibot.Page(site, title=str(title, encoding="utf-8"), ns=ns)
        user_talk = pywikibot.Page(site, title=str(user, encoding="utf-8"))
        if not page.exists():
            continue
        if user_talk.isRedirectPage():
            user_talk = user_talk.getRedirectTarget()
        yield (page, user_talk)
def getAllWikis():
    conn_meta = toolforge.connect('meta_p')
    results = run_query(
        "SELECT dbname FROM wiki WHERE family='wikipedia' and is_closed=0",
        conn_meta)
    wikipediaList = [encode_if_necessary(f[0]) for f in results]
    return set(wikipediaList)
def get_cat_files(category: str) -> int:
    conn = toolforge.connect("commonswiki_p")
    query = """
        SELECT cat_files
        FROM category
        WHERE cat_title = %s
    """
    with conn.cursor() as cur:
        cur.execute(query, args=[category])
        # fetchall() returns a tuple of rows; take the first column of the
        # first row so the declared int return type actually holds
        return cur.fetchall()[0][0]
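# Example call (hypothetical category name): cat_title is stored with
# underscores and without the "Category:" prefix, so pass it in that form.
print(get_cat_files("Quality_images"))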
def do_db_query(db_name: str, query: str, **kwargs) -> Any:
    """Uses the toolforge library to query the replica databases"""
    if not wmcs():
        raise ConnectionError("Not running on Toolforge, database unavailable")
    conn = toolforge.connect(db_name)
    with conn.cursor() as cur:
        cur.execute(query, kwargs)
        res = cur.fetchall()
    return res
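# Hedged usage sketch for do_db_query (the query itself is illustrative).
# The keyword arguments are forwarded to cur.execute() as a dict, so the
# query should use pymysql's mapping-style %(name)s placeholders; values are
# then escaped by the driver rather than interpolated by hand.
rows = do_db_query(
    "enwiki_p",
    "SELECT page_title FROM page WHERE page_namespace = %(ns)s LIMIT 5",
    ns=0,
)
for (title,) in rows:
    # replica text columns come back as bytes and need decoding
    print(title.decode("utf-8"))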
def _connect(self):
    """ connect to DB """
    orig_conv = pymysql.converters.conversions
    # Adding support for bit data type: map BIT(1) bytes to Python booleans
    orig_conv[FIELD_TYPE.BIT] = lambda data: data == b'\x01'
    args = {'host': self._HOST, 'conv': orig_conv}
    self.ctx = toolforge.connect(self._get_db_name(), **args)
def getAllCurrentWikidataItemsInWikipedia():
    # renamed from conn_lvwiki: this variant connects to etwiki_p
    conn_etwiki = toolforge.connect('etwiki_p', 'analytics')
    currWDitemsQuery = run_query(
        "select pp_value from page_props where pp_propname='wikibase_item'",
        conn_etwiki)
    currWDitems = [
        int(encode_if_necessary(b[0])[1:]) for b in currWDitemsQuery
    ]
    return set(currWDitems)
def get_page_links(self) -> None:
    page = self.page
    query = """
        SELECT COUNT(pl_from)
        FROM pagelinks
        WHERE pl_title = %s and pl_namespace = %s"""
    conn = toolforge.connect("enwiki_p")
    with conn.cursor() as cur:
        cur.execute(query, (page.title(underscore=True, with_ns=False),
                            page.namespace().id))
        self.links = cast(Tuple[Tuple[int]], cur.fetchall())[0][0]
def getAllCurrentWikidataItemsInWikipedia():
    conn_lvwiki = toolforge.connect('lvwiki_p', 'analytics')
    currWDitemsQuery = run_query(
        "select pp_value from page_props where pp_propname='wikibase_item'",
        conn_lvwiki)
    currWDitems = [
        int(encode_if_necessary(b[0])[1:]) for b in currWDitemsQuery
    ]
    currWDitems.append(24575438)  # some bug with "Module:Wikidata2" item
    return set(currWDitems)
def getDataFromDB(wikiLang):
    conn = toolforge.connect(wikiLang + 'wiki_p', 'analytics')
    queryRES = run_query(sqlQuery, conn)
    queryRES = [[encode_if_necessary(f) for f in fileSaver]
                for fileSaver in queryRES]
    with open('comcat_ex_' + wikiLang + '.txt', 'w',
              encoding='utf-8') as fileS:
        fileS.write(str(queryRES))
    return queryRES
def __init__(self, db="dewiki_p", revert_detector=None):
    self.logger = logging.getLogger("kokolores.creator")
    self.start = self.START_DEFAULT
    self.stop = self.START_DEFAULT
    if revert_detector is None:
        self.revert_detector = RevertDetector(db)
    else:
        self.revert_detector = revert_detector
    self.conn = toolforge.connect(db,
                                  cursorclass=pymysql.cursors.DictCursor)