def get_project_rating_by_type(wp10db,
                               project_name,
                               quality=None,
                               importance=None,
                               project_b_name=None,
                               quality_b=None,
                               importance_b=None,
                               pattern=None,
                               page=None,
                               limit=100):
    """Fetch the ratings of a project, optionally paired with a second project.

    Args:
      wp10db: Database connection; its cursor must yield dict-like rows.
      project_name: Project whose ratings are selected.
      quality: Optional quality value bound as ``r_quality``.
      importance: Optional importance value bound as ``r_importance``.
      project_b_name: Optional second project. When given, every result row
        is split into a pair of ratings.
      quality_b: Optional quality value for the second project.
      importance_b: Optional importance value for the second project.
      pattern: Optional article-name fragment. It is wrapped in ``%``
        wildcards before being bound (presumably for a LIKE clause built by
        ``_project_rating_query``).
      page: Paging value forwarded unchanged to ``_project_rating_query``.
      limit: Maximum number of rows. Values that cannot be converted to an
        int, and negative values, fall back to 100; values above 500 are
        capped at 500.

    Returns:
      A list of ``Rating`` objects when ``project_b_name`` is None, otherwise
      a list of ``(rating_a, rating_b)`` tuples.
    """
    try:
        limit = int(limit)
    except (TypeError, ValueError, OverflowError):
        # TypeError covers limit=None (e.g. a missing request argument), which
        # previously escaped as an unhandled exception.
        limit = 100
    if limit < 0:
        limit = 100
    if limit > 500:
        limit = 500

    query = _project_rating_query(project_name,
                                  quality=quality,
                                  importance=importance,
                                  project_b_name=project_b_name,
                                  quality_b=quality_b,
                                  importance_b=importance_b,
                                  pattern=pattern,
                                  page=page,
                                  limit=limit)

    params = {
        'r_project': project_name,
        'r_quality': quality,
        'r_importance': importance,
    }
    # Optional parameters are only bound when the caller supplied them.
    if pattern is not None:
        params['article_pattern_compiled'] = '%' + pattern + '%'
    if project_b_name is not None:
        params['r_project_b'] = project_b_name
    if quality_b is not None:
        params['r_quality_b'] = quality_b
    if importance_b is not None:
        params['r_importance_b'] = importance_b

    with wp10db.cursor() as cursor:
        cursor.execute(query, params)

        if project_b_name is None:
            return [Rating(**db_rating) for db_rating in cursor.fetchall()]

        results = []
        for res in cursor.fetchall():
            # Columns of the second project arrive prefixed with "rating_b.";
            # pop them so the remaining keys describe the first rating only.
            rating_b = Rating(r_project=res.pop('rating_b.r_project'),
                              r_article=res.pop('rating_b.r_article'),
                              r_namespace=res.pop('rating_b.r_namespace'),
                              r_quality=res.pop('rating_b.r_quality'),
                              r_importance=res.pop('rating_b.r_importance'))
            rating_a = Rating(**res)
            results.append((rating_a, rating_b))
        return results
def setUp(self):
    """Create the Rating fixture shared by the tests of this class."""
    super().setUp()
    # Show complete diffs when an assertion fails.
    self.maxDiff = None
    fixture_fields = {
        'r_project': b'Test Project',
        'r_namespace': 4,
        'r_article': b'Test article pages',
        'r_importance': b'NotAClass',
        'r_importance_timestamp': b'2020-04-04T15:55:55Z',
        'r_quality': b'NotAClass',
        'r_quality_timestamp': b'2020-01-13T08:04:20Z',
    }
    self.rating = Rating(**fixture_fields)
def test_add_log_for_importance_rating(self):
    """An importance log row records the project, article, old and new values."""
    article = b'Testing Stuff'
    rating = Rating(r_project=b'Test Project',
                    r_namespace=0,
                    r_article=article,
                    r_importance=b'Mid-Class',
                    r_importance_timestamp=b'2018-04-01T12:30:00Z')

    logic_rating.add_log_for_rating(self.wp10db, rating,
                                    AssessmentKind.IMPORTANCE, b'NotA-Class')

    # Read the row back by article title.
    select_log = ' SELECT * FROM ' + Log.table_name + ' WHERE l_article = %s '
    with self.wp10db.cursor() as cursor:
        cursor.execute(select_log, (article,))
        db_log = cursor.fetchone()

    self.assertIsNotNone(db_log)
    log = Log(**db_log)
    self.assertEqual(b'Test Project', log.l_project)
    self.assertEqual(0, log.l_namespace)
    self.assertEqual(b'Testing Stuff', log.l_article)
    self.assertEqual(b'Mid-Class', log.l_new)
    self.assertEqual(b'NotA-Class', log.l_old)
    self.assertEqual(b'importance', log.l_action)
def get_project_ratings(wp10db, project_name):
    """Return every stored rating of ``project_name`` as ``Rating`` objects.

    Args:
      wp10db: Database connection; its cursor must yield dict-like rows.
      project_name: Value matched against the ``r_project`` column.

    Returns:
      A list with one ``Rating`` per matching row (empty when none match).
    """
    select_sql = ('SELECT * FROM ' + Rating.table_name +
                  ' WHERE r_project = %(r_project)s ')
    bound = {'r_project': project_name}

    ratings = []
    with wp10db.cursor() as cursor:
        cursor.execute(select_sql, bound)
        for row in cursor.fetchall():
            ratings.append(Rating(**row))
    return ratings
def _insert_ratings(self):
    """Insert one ratings row per entry of ``self.ratings``.

    Each entry is indexed as (article, quality, importance). Every row is
    stored under b'Test Project' in namespace 0 with the global wiki
    timestamp for both assessments.
    """
    insert_sql = (
        ' INSERT INTO ratings (r_project, r_namespace, r_article, r_score,'
        ' r_quality, r_quality_timestamp, r_importance,'
        ' r_importance_timestamp) VALUES (%(r_project)s, %(r_namespace)s,'
        ' %(r_article)s, %(r_score)s, %(r_quality)s,'
        ' %(r_quality_timestamp)s, %(r_importance)s,'
        ' %(r_importance_timestamp)s) ')

    for entry in self.ratings:
        article, quality, importance = entry[0], entry[1], entry[2]

        rating = Rating(r_project=b'Test Project',
                        r_namespace=0,
                        r_article=article)
        rating.r_quality = quality
        rating.r_quality_timestamp = GLOBAL_TIMESTAMP_WIKI
        rating.r_importance = importance
        rating.r_importance_timestamp = GLOBAL_TIMESTAMP_WIKI

        row = attr.asdict(rating)
        with self.wp10db.cursor() as cursor:
            cursor.execute(insert_sql, row)
        self.wp10db.commit()
def _rating_is_unset(value):
    """Return True when ``value`` is None or the NotA-Class marker.

    Rating values elsewhere in this module are bytes while NOT_A_CLASS is a
    str, so the marker is compared in the representation of ``value``.
    """
    if value is None:
        return True
    if isinstance(value, bytes):
        return value == NOT_A_CLASS.encode('utf-8')
    return value == NOT_A_CLASS


def process_unseen_articles(wikidb, wp10db, project, old_ratings, seen):
    """Mark previously rated articles that were not seen in this update.

    For every entry of ``old_ratings`` whose key is not in ``seen``, look up
    move data for the page, record the move if one is found, then store a
    NotA-Class rating for the affected assessment kind(s) and log the change.

    Args:
      wikidb: Wiki database connection, passed to the move-data lookup.
      wp10db: WP 1.0 database connection; pinged and committed in batches.
      project: Project being updated; ``p_project`` and ``timestamp_dt`` are
        read from it.
      old_ratings: Mapping of article ref (bytes ``b'<ns>:<title>'``) to the
        previously stored Rating.
      seen: Container of article refs that were found during this update.
    """
    denom = len(old_ratings)
    ratio = len(seen) / denom if denom != 0 else 'NaN'
    logger.debug('Looking for unseen articles, ratio was: %s', ratio)

    not_a_class = NOT_A_CLASS.encode('utf-8')
    in_seen = 0
    skipped = 0
    processed = 0
    n = 0
    for ref, old_rating in old_ratings.items():
        if ref in seen:
            in_seen += 1
            continue

        # By default, we evaluate both assessment kinds.
        kind = AssessmentKind.BOTH
        if _rating_is_unset(old_rating.r_quality):
            # The quality rating is not set, so just evaluate importance.
            kind = AssessmentKind.IMPORTANCE
            if _rating_is_unset(old_rating.r_importance):
                # The importance rating is also not set, so don't do anything.
                skipped += 1
                continue

        ref_text = ref.decode('utf-8')
        logger.debug('Processing unseen article %s', ref_text)
        processed += 1

        ns, title = ref_text.split(':', 1)
        ns = int(ns)
        title = title.encode('utf-8')

        move_data = logic_page.get_move_data(wp10db, wikidb, ns, title,
                                             project.timestamp_dt)
        if move_data is not None:
            logic_page.update_page_moved(wp10db, project, ns, title,
                                         move_data['dest_ns'],
                                         move_data['dest_title'],
                                         move_data['timestamp_dt'])

        # Mark this article as having NOT_A_CLASS for its quality or
        # importance. This probably means the article was deleted, but could in
        # fact mean that we just failed to find its move data. Either way, the
        # new article would have already been picked up by the assessment
        # updater, assuming it was tagged correctly.
        rating = Rating(r_project=project.p_project,
                        r_namespace=ns,
                        r_article=title,
                        r_score=0)
        if kind in (AssessmentKind.QUALITY, AssessmentKind.BOTH):
            # Must target the r_quality field: assigning to "quality" left the
            # stored and logged value unset.
            rating.r_quality = not_a_class
            if move_data:
                rating.set_quality_timestamp_dt(move_data['timestamp_dt'])
            else:
                rating.r_quality_timestamp = GLOBAL_TIMESTAMP_WIKI
        if kind in (AssessmentKind.IMPORTANCE, AssessmentKind.BOTH):
            rating.r_importance = not_a_class
            if move_data:
                rating.set_importance_timestamp_dt(move_data['timestamp_dt'])
            else:
                rating.r_importance_timestamp = GLOBAL_TIMESTAMP_WIKI

        logic_rating.insert_or_update(wp10db, rating, kind)

        if kind in (AssessmentKind.QUALITY, AssessmentKind.BOTH):
            logic_rating.add_log_for_rating(wp10db, rating,
                                            AssessmentKind.QUALITY,
                                            old_rating.r_quality)
        if kind in (AssessmentKind.IMPORTANCE, AssessmentKind.BOTH):
            logic_rating.add_log_for_rating(wp10db, rating,
                                            AssessmentKind.IMPORTANCE,
                                            old_rating.r_importance)

        n += 1
        if n >= MAX_ARTICLES_BEFORE_COMMIT:
            wp10db.ping()
            wp10db.commit()
            # Start a new batch; without the reset every later article would
            # trigger its own commit.
            n = 0

    logger.info('End, committing db')
    wp10db.ping()
    wp10db.commit()
    logger.debug('SEEN REPORT:\nin seen: %s\nskipped: %s\nprocessed: %s',
                 in_seen, skipped, processed)
def update_project_assessments_by_kind(wikidb, wp10db, project,
                                       extra_assessments, kind, old_ratings,
                                       seen):
    """Collect the new ratings of one assessment kind for a project.

    Updates the project's categories for ``kind``, then walks the pages of
    every rating category and builds an updated Rating for each acceptable
    page.

    Args:
      wikidb: Wiki database connection used to list category pages.
      wp10db: WP 1.0 database connection; pinged and committed in batches.
      project: Project being updated; ``p_project`` is read from it.
      extra_assessments: Forwarded to ``update_project_categories_by_kind``.
      kind: ``AssessmentKind.QUALITY`` or ``AssessmentKind.IMPORTANCE``.
      old_ratings: Mapping of article ref (bytes ``b'<ns>:<title>'``) to the
        previously stored Rating.
      seen: Set of article refs; every processed ref is added to it.

    Returns:
      A tuple ``(new_ratings, rating_to_category)`` where ``new_ratings`` maps
      each article ref to a list of ``(rating, kind, old_rating_value)``.

    Raises:
      ValueError: If ``kind`` is neither QUALITY nor IMPORTANCE.
    """
    if kind not in (AssessmentKind.QUALITY, AssessmentKind.IMPORTANCE):
        raise ValueError(
            'Parameter "kind" was not one of QUALITY or IMPORTANCE')

    # Arguments follow the placeholders: project first, then the kind.
    logger.info('Updating project %s assessments for %s', project.p_project,
                kind)
    rating_to_category = update_project_categories_by_kind(
        wikidb, wp10db, project, extra_assessments, kind)

    n = 0
    new_ratings = defaultdict(list)
    for rating_name, (category, _ranking) in rating_to_category.items():
        logger.info('Fetching article list for %r', category.decode('utf-8'))
        current_rating = rating_name.encode('utf-8')

        for page in logic_page.get_pages_by_category(wikidb, category):
            # Talk pages are tagged, we want the NS of the article itself.
            namespace = page.page_namespace - 1
            if not logic_util.is_namespace_acceptable(namespace):
                logger.debug('Skipping %s with namespace=%s', page.page_title,
                             namespace)
                continue

            article_ref = (str(namespace).encode('utf-8') + b':' +
                           page.page_title)
            seen.add(article_ref)

            old_rating = old_ratings.get(article_ref)
            old_rating_value = None
            if old_rating:
                # Work on a copy so the caller's old rating stays untouched.
                rating = Rating(**attr.asdict(old_rating))
                if kind == AssessmentKind.QUALITY:
                    old_rating_value = rating.r_quality
                elif kind == AssessmentKind.IMPORTANCE:
                    old_rating_value = rating.r_importance
            else:
                rating = Rating(r_project=project.p_project,
                                r_namespace=namespace,
                                r_article=page.page_title,
                                r_score=0)
                old_rating_value = NOT_A_CLASS.encode('utf-8')

            if kind == AssessmentKind.QUALITY:
                rating.r_quality = current_rating
                rating.set_quality_timestamp_dt(page.cl_timestamp)
            elif kind == AssessmentKind.IMPORTANCE:
                rating.r_importance = current_rating
                rating.set_importance_timestamp_dt(page.cl_timestamp)

            new_ratings[article_ref].append((rating, kind, old_rating_value))

            n += 1
            if n >= MAX_ARTICLES_BEFORE_COMMIT:
                wp10db.ping()
                wp10db.commit()
                # Start a new batch; without the reset every later page would
                # trigger its own commit.
                n = 0

    logger.info('End, committing db')
    wp10db.ping()
    wp10db.commit()
    return (new_ratings, rating_to_category)
class RatingModelTest(BaseWpOneDbTest):
    """Checks the web dictionary produced by ``Rating.to_web_dict``."""

    def setUp(self):
        super().setUp()
        # Show complete dict diffs when an assertion fails.
        self.maxDiff = None
        fixture_fields = {
            'r_project': b'Test Project',
            'r_namespace': 4,
            'r_article': b'Test article pages',
            'r_importance': b'NotAClass',
            'r_importance_timestamp': b'2020-04-04T15:55:55Z',
            'r_quality': b'NotAClass',
            'r_quality_timestamp': b'2020-01-13T08:04:20Z',
        }
        self.rating = Rating(**fixture_fields)

    def test_to_web_dict_namespace(self):
        # Namespace 4 is expected to render with a "Wikipedia:" prefix.
        expected = dict(
            article='Wikipedia:Test article pages',
            article_link=
            'https://en.wikipedia.org/w/index.php?title=Wikipedia:Test%20article%20pages',
            article_talk='Wikipedia talk:Test article pages',
            article_talk_link=
            'https://en.wikipedia.org/w/index.php?title=Wikipedia talk:Test%20article%20pages',
            article_history_link=
            'https://en.wikipedia.org/w/index.php?title=Wikipedia:Test%20article%20pages&action=history',
            importance='NotAClass',
            importance_updated='2020-04-04T15:55:55Z',
            quality='NotAClass',
            quality_updated='2020-01-13T08:04:20Z',
        )

        self.assertEqual(expected, self.rating.to_web_dict(self.wp10db))

    def test_to_web_dict_no_namespace(self):
        # Namespace 0 is expected to render without any prefix.
        self.rating.r_namespace = 0
        expected = dict(
            article='Test article pages',
            article_link=
            'https://en.wikipedia.org/w/index.php?title=Test%20article%20pages',
            article_talk='Talk:Test article pages',
            article_talk_link=
            'https://en.wikipedia.org/w/index.php?title=Talk:Test%20article%20pages',
            article_history_link=
            'https://en.wikipedia.org/w/index.php?title=Test%20article%20pages&action=history',
            importance='NotAClass',
            importance_updated='2020-04-04T15:55:55Z',
            quality='NotAClass',
            quality_updated='2020-01-13T08:04:20Z',
        )

        self.assertEqual(expected, self.rating.to_web_dict(self.wp10db))