def update_top_image_reposts(uowm: UnitOfWorkManager, reddit: Reddit) -> NoReturn:
    """
    Rebuild the stats_top_image_repost table for several trailing windows.

    For each window (1/7/30/365 days) the most-reposted images are pulled
    from image_reposts, existing stat rows get their counts refreshed in
    place, and new rows are created after resolving NSFW status via the
    Reddit API.

    :param uowm: UnitOfWorkManager for database access
    :param reddit: Praw Reddit instance used to look up NSFW flags
    """
    days = [1, 7, 30, 365]
    with uowm.start() as uow:
        # Full rebuild: clear all existing stats before re-populating
        uow.session.execute('TRUNCATE `stats_top_image_repost`')
        for day in days:
            result = uow.session.execute(
                'SELECT repost_of, COUNT(*) c FROM image_reposts WHERE detected_at > NOW() - INTERVAL :days DAY GROUP BY repost_of HAVING c > 1 ORDER BY c DESC LIMIT 2000',
                {'days': day})
            # Chunk the lookups so each reddit.info() call stays a reasonable size
            for chunk in chunk_list(result.fetchall(), 100):
                reddit_ids_to_lookup = []
                for post in chunk:
                    existing = uow.stats_top_image_repost.get_by_post_id_and_days(post[0], day)
                    if existing:
                        # Row already exists for this window; just refresh the count
                        existing.repost_count = post[1]
                        continue
                    reddit_ids_to_lookup.append(f't3_{post[0]}')
                for submission in reddit.info(reddit_ids_to_lookup):
                    # BUG FIX: next() without a default raises StopIteration when
                    # no row matches, so the guard below never ran.  Supply None
                    # so unmatched submissions are skipped as intended.
                    count_data = next((x for x in chunk if x[0] == submission.id), None)
                    if not count_data:
                        continue
                    uow.stats_top_image_repost.add(
                        StatsTopImageRepost(post_id=count_data[0],
                                            repost_count=count_data[1],
                                            days=day,
                                            nsfw=submission.over_18))
                uow.commit()
def update_banned_sub_wiki(uowm: UnitOfWorkManager, reddit: Reddit) -> NoReturn:
    """
    Update the banned sub wiki page with the most recent list of banned subs
    :param uowm: UnitOfWorkManager
    :param reddit: Praw Reddit instance
    """
    print('[Scheduled Job] Update Ban Wiki Start')
    # Template is expected alongside the working directory; bail out loudly if missing
    wiki_template_file = os.path.join(os.getcwd(), 'banned-subs.md')
    if not os.path.isfile(wiki_template_file):
        log.critical('Unable to locate banned sub wiki file at %s', wiki_template_file)
        return
    with open(wiki_template_file, 'r') as f:
        template = f.read()
    with uowm.start() as uow:
        banned = uow.banned_subreddit.get_all()
        results = [[f'r/{sub.subreddit}', sub.detected_at, sub.last_checked] for sub in banned]
        table_data = build_markdown_table(results, ['Subreddit', 'Detected At', 'Last Checked'])
        wiki = reddit.subreddit('RepostSleuthBot').wiki['published-data/banned-subreddits']
        wiki.edit(template.format(banned_subs=table_data, total=len(banned)))
        # Typo fix: log previously read 'Fished update'
        log.info('[Banned Sub Wiki Update] Finished update')
    print('[Scheduled Job] Update Ban Wiki End')
def send_reports_to_meme_voting(uowm: UnitOfWorkManager) -> NoReturn:
    """
    Promote eligible user reports into the meme-template voting queue.

    :param uowm: UnitOfWorkManager
    """
    with uowm.start() as uow:
        for report in uow.user_report.get_reports_for_voting(7):
            # Skip posts that already have a template or a pending voting entry
            if uow.meme_template.get_by_post_id(report.post_id):
                continue
            if uow.meme_template_potential.get_by_post_id(report.post_id):
                continue
            post = uow.posts.get_by_post_id(report.post_id)
            if not post:
                continue
            # Best effort reachability check; unreachable images are skipped
            try:
                if requests.head(post.searched_url).status_code != 200:
                    continue
            except Exception:
                continue
            uow.meme_template_potential.add(
                MemeTemplatePotential(post_id=report.post_id,
                                      submitted_by='background',
                                      vote_total=0))
            report.sent_for_voting = True
            uow.commit()
def update_mod_status(uowm: UnitOfWorkManager, reddit: Reddit) -> NoReturn:
    """
    Go through all registered subs and check if they're a mod and what level of permissions they have
    :param uowm: UnitOfWorkManager
    :param reddit: Praw Reddit instance
    """
    # NOTE: removed the unused (and internally duplicated) ignore_no_mod list
    # ['CouldYouDeleteThat', 'CouldYouDeleteThat'] — nothing referenced it.
    print('[Scheduled Job] Checking Mod Status Start')
    with uowm.start() as uow:
        monitored_subs: List[MonitoredSub] = uow.monitored_sub.get_all()
        for sub in monitored_subs:
            if not is_sub_mod_praw(sub.name, 'RepostSleuthBot', reddit):
                log.info('[Mod Check] Bot is not a mod on %s', sub.name)
                sub.is_mod = False
                uow.commit()
                continue
            sub.is_mod = True
            sub.post_permission = bot_has_permission(sub.name, 'posts', reddit)
            sub.wiki_permission = bot_has_permission(sub.name, 'wiki', reddit)
            log.info('[Mod Check] %s | Post Perm: %s | Wiki Perm: %s',
                     sub.name, sub.post_permission, sub.wiki_permission)
            uow.commit()
    print('[Scheduled Job] Checking Mod Status End')
def update_ban_list(uowm: UnitOfWorkManager, reddit: Reddit, notification_svc: NotificationService = None) -> NoReturn:
    """
    Go through banned subs and see if we're still banned
    :rtype: NoReturn
    :param uowm: UnitOfWorkManager
    :param reddit: Reddit
    """
    log.info('Starting Job: Update Subreddit Bans')
    with uowm.start() as uow:
        for ban in uow.banned_subreddit.get_all():
            # Only re-check each ban at most once per day
            days_since_check = (datetime.utcnow() - ban.last_checked).days
            if days_since_check < 1:
                log.debug('Banned sub %s last checked %s days ago. Skipping', ban.subreddit, days_since_check)
                continue
            if is_bot_banned(ban.subreddit, reddit):
                log.info('[Subreddit Ban Check] Still banned on %s', ban.subreddit)
                ban.last_checked = func.utc_timestamp()
            else:
                log.info('[Subreddit Ban Check] No longer banned on %s', ban.subreddit)
                uow.banned_subreddit.remove(ban)
                if notification_svc:
                    notification_svc.send_notification(
                        f'Removed {ban.subreddit} from ban list',
                        subject='Subreddit Removed From Ban List!')
            uow.commit()
def get_link_reposts(
        url: Text,
        uowm: UnitOfWorkManager,
        search_settings: SearchSettings,
        post: Post = None,
        get_total: bool = False,
) -> LinkSearchResults:
    """
    Find all link posts in the database that share this URL's hash.

    :param url: URL to search for
    :param uowm: UnitOfWorkManager
    :param search_settings: Settings attached to the result object
    :param post: Optional post that triggered the search
    :param get_total: Also count how many link posts were searched
    """
    url_hash = md5(url.encode('utf-8')).hexdigest()
    with uowm.start() as uow:
        search_results = LinkSearchResults(url,
                                           search_settings,
                                           checked_post=post,
                                           search_times=LinkSearchTimes())
        # Track DB query time separately from the overall search time
        search_results.search_times.start_timer('query_time')
        search_results.search_times.start_timer('total_search_time')
        raw_results = uow.posts.find_all_by_url_hash(url_hash)
        search_results.search_times.stop_timer('query_time')
        log.debug('Query time: %s', search_results.search_times.query_time)
        search_results.matches = [SearchMatch(url, hit) for hit in raw_results]
        if get_total:
            search_results.total_searched = uow.posts.count_by_type('link')
    return search_results
def check_meme_template_potential_votes(uowm: UnitOfWorkManager) -> NoReturn:
    """
    Promote or discard potential meme templates based on vote totals.

    Templates reaching +10 votes are hashed and saved as real meme templates;
    templates at -10 or below are removed from the voting queue.

    :param uowm: UnitOfWorkManager
    """
    with uowm.start() as uow:
        potential_templates = uow.meme_template_potential.get_all()
        for potential_template in potential_templates:
            if potential_template.vote_total >= 10:
                existing_template = uow.meme_template.get_by_post_id(potential_template.post_id)
                if existing_template:
                    log.info('Meme template already exists for %s. Removing', potential_template.post_id)
                    uow.meme_template_potential.remove(potential_template)
                    uow.commit()
                    # BUG FIX: was 'return', which aborted processing of all
                    # remaining potential templates.
                    continue
                log.info('Post %s received %s votes. Creating meme template',
                         potential_template.post_id, potential_template.vote_total)
                post = uow.posts.get_by_post_id(potential_template.post_id)
                try:
                    meme_hashes = get_image_hashes(post.searched_url, hash_size=32)
                except Exception as e:
                    log.error('Failed to get meme hash for %s', post.post_id)
                    # BUG FIX: was 'return'; skip just this template and keep going.
                    continue
                meme_template = MemeTemplate(
                    dhash_h=post.dhash_h,
                    dhash_256=meme_hashes['dhash_h'],
                    post_id=post.post_id
                )
                uow.meme_template.add(meme_template)
                uow.meme_template_potential.remove(potential_template)
            elif potential_template.vote_total <= -10:
                log.info('Removing potential template with at least 10 negative votes')
                uow.meme_template_potential.remove(potential_template)
            else:
                continue
            uow.commit()
def save_image_repost_result(
        search_results: ImageSearchResults,
        uowm: UnitOfWorkManager,
        high_match_check: bool = False,
        source: Text = 'unknown') -> NoReturn:
    """
    Take a found repost and save to the database
    :param source: What triggered this search
    :rtype: NoReturn
    :param high_match_check: Perform a high match meme check.
    :param search_results: Set of search results
    :param uowm: Unit of Work Manager
    :return: None
    """
    with uowm.start() as uow:
        search_results.checked_post.checked_repost = True
        if not search_results.matches:
            log.debug('Post %s has no matches', search_results.checked_post.post_id)
            uow.posts.update(search_results.checked_post)
            uow.commit()
            return
        # Used during ingest repost checking.  If a meme template gets created,
        # check_for_high_match_meme intentionally raises IngestHighMatchMeme so
        # celery retries the task and the new template is used on the re-run.
        if high_match_check:
            check_for_high_match_meme(search_results, uowm)
        top_match = search_results.matches[0]
        log.info('Creating repost. Post %s is a repost of %s',
                 search_results.checked_post.url, top_match.post.url)
        new_repost = ImageRepost(
            post_id=search_results.checked_post.post_id,
            repost_of=top_match.post.post_id,
            hamming_distance=top_match.hamming_distance,
            annoy_distance=top_match.annoy_distance,
            author=search_results.checked_post.author,
            search_id=search_results.logged_search.id if search_results.logged_search else None,
            subreddit=search_results.checked_post.subreddit,
            source=source)
        uow.image_repost.add(new_repost)
        uow.posts.update(search_results.checked_post)
        try:
            uow.commit()
        except Exception as e:
            log.exception('Failed to save image repost', exc_info=True)
def remove_expired_bans(uowm: UnitOfWorkManager, notification_svc: NotificationService = None) -> NoReturn:
    """
    Drop user bans whose expiry has passed, optionally announcing each removal.

    :param uowm: UnitOfWorkManager
    :param notification_svc: Optional notification service
    """
    print('[Scheduled Job] Removed Expired Bans Start')
    with uowm.start() as uow:
        for ban in uow.banned_user.get_expired_bans():
            if notification_svc:
                # NOTE(review): other jobs call send_notification(); confirm
                # .send() is the intended method on NotificationService.
                notification_svc.send(
                    f'Removing expired ban for user {ban.name}',
                    subject='**Expired Ban Removed**'
                )
            log.info('[Ban Remover] Removing %s from ban list', ban.name)
            uow.banned_user.remove(ban)
            uow.commit()
def check_for_post_watch(matches: List[SearchMatch], uowm: UnitOfWorkManager) -> List[Dict]:
    """
    Pair each search match with any active repost watches on the matched post.

    :param matches: Search matches to check
    :param uowm: UnitOfWorkManager
    :return: List of {'match': ..., 'watch': ...} dicts
    """
    results = []
    with uowm.start() as uow:
        for match in matches:
            watches = uow.repostwatch.get_all_active_by_post_id(match.post.post_id)
            if not watches:
                continue
            log.info('Found %s active watch requests for post %s', len(watches), match.post.post_id)
            results.extend({'match': match, 'watch': watch} for watch in watches)
    return results
def queue_post_watch_cleanup(uowm: UnitOfWorkManager, config: Config) -> NoReturn:
    """
    Send all watches to celery to check if the post has been deleted
    :param uowm: Unit of work manager
    """
    print('[Scheduled Job] Queue Deleted Watch Check')
    redis = get_redis_client(config)
    # Don't re-queue while the previous batch is still being worked
    if len(redis.lrange('watch_remove_deleted', 0, 20000)) > 0:
        log.info('Deleted watchqueue still has pending jobs. Skipping update queueing ')
        return
    with uowm.start() as uow:
        all_watches = uow.repostwatch.get_all()
    for batch in chunk_list(all_watches, 30):
        check_if_watched_post_is_active.apply_async((batch,))
def queue_config_updates(uowm: UnitOfWorkManager, config: Config) -> NoReturn:
    """
    Queue a config-update check task for every monitored subreddit.

    :param uowm: UnitOfWorkManager
    :param config: Bot config used to reach redis
    """
    print('[Scheduled Job] Queue config update check')
    redis = get_redis_client(config)
    # Skip if the previous batch of checks hasn't drained yet
    if len(redis.lrange('config_update_check', 0, 20000)) > 0:
        log.info('Config update queue still has pending jobs. Skipping update queueing ')
        return
    with uowm.start() as uow:
        for monitored_sub in uow.monitored_sub.get_all():
            check_for_subreddit_config_update_task.apply_async((monitored_sub,))
    print('[Scheduled Job Complete] Queue config update check')
def save_link_repost(post: Post, repost_of: Post, uowm: UnitOfWorkManager, source: Text) -> None:
    """
    Persist a link repost record and mark the checked post as processed.

    :param post: The post that was checked
    :param repost_of: The original post it duplicates
    :param uowm: UnitOfWorkManager
    :param source: What triggered this search
    """
    with uowm.start() as uow:
        repost_record = LinkRepost(post_id=post.post_id,
                                   repost_of=repost_of.post_id,
                                   author=post.author,
                                   subreddit=post.subreddit,
                                   source=source)
        post.checked_repost = True
        uow.posts.update(post)
        uow.link_repost.add(repost_record)
        try:
            uow.commit()
        except IntegrityError:
            # Duplicate repost rows are expected occasionally; not fatal
            log.error('Failed to save link repost, it already exists')
        except Exception as e:
            log.exception('Failed to save link repost', exc_info=True)
def pre_process_post(post: Post, uowm: UnitOfWorkManager, hash_api) -> Post:
    """
    Prepare an ingested post for storage and persist it.

    Image posts are hashed via the hash API; link posts get a URL hash.
    Returns the saved post, or None if processing or saving failed.

    :param post: Post to process
    :param uowm: UnitOfWorkManager
    :param hash_api: Hashing backend handed to process_image_post
    """
    log.debug(post)
    with uowm.start() as uow:
        if post.post_type == 'image':
            log.debug('Post %s: Is an image', post.post_id)
            try:
                post, image_post, image_post_current = process_image_post(post, hash_api)
            except (ImageRemovedException, ImageConversioinException, InvalidImageUrlException, ConnectionError):
                return
            if image_post is None or image_post_current is None:
                log.error('Post %s: Failed to save image post. One of the post objects is null', post.post_id)
                log.error('Image Post: %s - Image Post Current: %s', image_post, image_post_current)
                return
            if not post.dhash_h:
                log.error('Post %s: is missing dhash', post.post_id)
                return
            uow.image_post.add(image_post)
            uow.image_post_current.add(image_post_current)
        elif post.post_type == 'link':
            post.url_hash = md5(post.url.encode('utf-8')).hexdigest()
            log.debug('Set URL hash for post %s', post.post_id)
        elif post.post_type == 'hosted:video':
            # No pre-processing for hosted video yet
            pass
        try:
            uow.posts.add(post)
            uow.commit()
            log.debug('Post %s: Commited post to database', post.post_id)
        except IntegrityError as e:
            log.exception('Post %s: Database save failed', post.post_id, exc_info=False)
            return
    return post
def check_for_high_match_meme(search_results: ImageSearchResults, uowm: UnitOfWorkManager) -> NoReturn:
    """
    Create a meme template for a heavily-matched post in a meme subreddit.

    Raises IngestHighMatchMeme after creating a template so celery retries
    the originating task with the new template in place.

    :param search_results: Results of the image search
    :param uowm: UnitOfWorkManager
    """
    if search_results.meme_template is not None:
        # Already matched a meme template; nothing to create
        return
    with uowm.start() as uow:
        meme_template = None
        # TODO - 1/12/2021 - Should probably remember the meme in subreddit check and generate more templates
        is_meme_sub = 'meme' in search_results.checked_post.subreddit.lower()
        if len(search_results.matches) > 5 and is_meme_sub:
            try:
                meme_hashes = get_image_hashes(search_results.checked_post.url, hash_size=32)
            except Exception as e:
                log.error('Failed to get meme hash for %s', search_results.checked_post.post_id)
                return
            try:
                meme_template = MemeTemplate(
                    dhash_h=search_results.checked_post.dhash_h,
                    dhash_256=meme_hashes['dhash_h'],
                    post_id=search_results.checked_post.post_id)
                uow.meme_template.add(meme_template)
                uow.commit()
            except IntegrityError as e:
                log.exception(
                    f'Failed to create meme template. Template already exists for post {search_results.checked_post.post_id}',
                    exc_info=True)
                meme_template = None
        if meme_template:
            log.info('Saved new meme template for post %s in %s',
                     search_results.checked_post.post_id,
                     search_results.checked_post.subreddit)
            # Raise exception so celery will retry the task and use the new meme template
            raise IngestHighMatchMeme('Created meme template. Post needs to be rechecked')
def build_msg_values_from_search(search_results: 'SearchResults', uowm: UnitOfWorkManager = None, **kwargs) -> Dict:
    """
    Take a ImageRepostWrapper object and return a dict of values for use in a message template
    :param search_results: ImageRepostWrapper
    :param uowm: UnitOfWorkManager
    """
    match_count = len(search_results.matches)
    base_values = {
        'total_searched': f'{search_results.total_searched:,}',
        'total_posts': 0,
        'match_count': match_count,
        'post_type': search_results.checked_post.post_type,
        'this_subreddit': search_results.checked_post.subreddit,
        'times_word': 'times' if match_count > 1 else 'time',
        'stats_searched_post_str': searched_post_str(search_results.checked_post, search_results.total_searched),
        'post_shortlink': f'https://redd.it/{search_results.checked_post.post_id}',
        'post_author': search_results.checked_post.author,
        'report_post_link': ''
    }
    if search_results.search_times:
        base_values['search_time'] = search_results.search_times.total_search_time
    if search_results.matches:
        # Matches are ordered; first is the oldest hit, last the newest
        oldest = search_results.matches[0].post
        newest = search_results.matches[-1].post
        base_values.update({
            'oldest_created_at': oldest.created_at,
            'oldest_url': oldest.url,
            'oldest_shortlink': f'https://redd.it/{oldest.post_id}',
            'oldest_sub': oldest.subreddit,
            'newest_created_at': newest.created_at,
            'newest_url': newest.url,
            'newest_shortlink': f'https://redd.it/{newest.post_id}',
            'newest_sub': newest.subreddit,
            'first_seen': f"First Seen [Here](https://redd.it/{oldest.post_id}) on {oldest.created_at.strftime('%Y-%m-%d')}",
            'last_seen': f"Last Seen [Here](https://redd.it/{newest.post_id}) on {newest.created_at.strftime('%Y-%m-%d')}",
        })
    if uowm:
        with uowm.start() as uow:
            base_values['total_posts'] = f'{uow.posts.get_newest_post().id:,}'
    return {**base_values, **search_results.search_settings.to_dict(), **kwargs}
def update_monitored_sub_data(uowm: UnitOfWorkManager) -> NoReturn:
    """
    Queue a stats-refresh task for every active monitored subreddit.

    :param uowm: UnitOfWorkManager
    """
    print('[Scheduled Job] Update Monitored Sub Data')
    with uowm.start() as uow:
        for monitored_sub in uow.monitored_sub.get_all_active():
            update_monitored_sub_stats.apply_async((monitored_sub.name,))