def sub_monitor_check_post(self, submission: Dict, monitored_sub: MonitoredSub):
    """Run a monitored-sub repost check for one submission from Reddit's JSON feed.

    Skips submissions already checked, queues posts missing from the database
    for ingest, applies the sub's post-type/keyword filters, then hands the
    post to the sub monitor for the actual check.

    :param submission: Raw submission dict from Reddit JSON; only 'id' is read here
    :param monitored_sub: MonitoredSub row holding this sub's check settings
    """
    if self.sub_monitor.has_post_been_checked(submission['id']):
        log.debug('Post %s has already been checked', submission['id'])
        return
    start = time.perf_counter()
    with self.uowm.start() as uow:
        post = uow.posts.get_by_post_id(submission['id'])
        if not post:
            # Fixed log message: previously said 'does exist' for a missing post
            log.info('Post %s does not exist, sending to ingest queue', submission['id'])
            post = pushshift_to_post(submission, source='reddit_json')
            celery.send_task(
                'redditrepostsleuth.core.celery.ingesttasks.save_new_post',
                args=[post],
                queue='postingest')
            return
        title_keywords = []
        if monitored_sub.title_ignore_keywords:
            title_keywords = monitored_sub.title_ignore_keywords.split(',')
        if not self.sub_monitor.should_check_post(
                post,
                monitored_sub.check_image_posts,
                monitored_sub.check_link_posts,
                title_keyword_filter=title_keywords):
            return
        self.sub_monitor.check_submission(monitored_sub, post)
    # Fixed: timing was emitted with a bare print(); route it through the logger
    log.debug('Total time: %s', round(time.perf_counter() - start, 5))
def _process_user_report(self, msg: Message):
    """Persist a user-submitted report received via private message.

    Skips messages already recorded, remembers unparseable message ids in
    self.failed_checks (capped at 10k entries), then stores a UserReport row
    and replies to the sender.
    """
    with self.uowm.start() as uow:
        if uow.user_report.get_first_by_message_id(msg.id):
            log.debug('Report %s has already been saved', msg.id)
            return
    parsed = self._load_msg_body_data(msg.body)
    if not parsed:
        log.info('Failed to get report data from message %s. Not saving', msg.id)
        # Reset the failed-id cache when it grows too large
        if len(self.failed_checks) > 10000:
            self.failed_checks = []
        if msg.id not in self.failed_checks:
            self.failed_checks.append(msg.id)
        return
    new_report = UserReport(
        post_id=parsed['post_id'],
        reported_by=msg.author.name,
        report_type=msg.subject,
        meme_template=parsed['meme_template'],
        msg_body=msg.body,
        message_id=msg.id,
        sent_for_voting=False)
    with self.uowm.start() as uow:
        uow.user_report.add(new_report)
        uow.commit()
    self.response_handler.reply_to_private_message(msg, REPORT_RESPONSE)
def sub_monitor_check_post_old(self, submission, monitored_sub):
    """PRAW-object variant of sub_monitor_check_post (appears superseded).

    Same flow as sub_monitor_check_post but takes a praw Submission and uses
    submission_to_post for ingest conversion.

    :param submission: praw Submission object
    :param monitored_sub: MonitoredSub row holding this sub's check settings
    """
    if self.sub_monitor.has_post_been_checked(submission.id):
        log.debug('Post %s has already been checked', submission.id)
        return
    with self.uowm.start() as uow:
        post = uow.posts.get_by_post_id(submission.id)
        if not post:
            # Fixed log message: previously said 'does exist' for a missing post
            log.info('Post %s does not exist, sending to ingest queue', submission.id)
            post = submission_to_post(submission)
            celery.send_task(
                'redditrepostsleuth.core.celery.ingesttasks.save_new_post',
                args=[post],
                queue='postingest')
            return
        title_keywords = []
        if monitored_sub.title_ignore_keywords:
            title_keywords = monitored_sub.title_ignore_keywords.split(',')
        if not self.sub_monitor.should_check_post(
                post, title_keyword_filter=title_keywords):
            return
        self.sub_monitor.check_submission(submission, monitored_sub, post)
def _final_meme_filter(self, searched_hash: Text, matches: List[ImageSearchMatch], target_hamming) -> List[ImageSearchMatch]:
    """Re-hash candidate meme matches and keep those within target_hamming.

    Each surviving match gets its hamming_distance and hash_size updated to
    reflect the meme-hash comparison.

    :param searched_hash: Hash of the searched image
    :param matches: Candidate matches to re-check
    :param target_hamming: Maximum allowed hamming distance
    :return: Matches whose meme hash is within target_hamming of searched_hash
    """
    results = []
    log.debug('MEME FILTER - Filtering %s matches', len(matches))
    if len(matches) == 0:
        return matches
    for match in matches:
        try:
            match_hash = self._get_meme_hash(match.post.url)
        except Exception:
            # Fixed: the exception was bound but never logged; log.exception
            # records the traceback so hash failures are debuggable
            log.exception('Failed to get meme hash for %s', match.post.id)
            continue
        h_distance = hamming(searched_hash, match_hash)
        if h_distance > target_hamming:
            log.info(
                'Meme Hamming Filter Reject - Target: %s Actual: %s - %s',
                target_hamming, h_distance, f'https://redd.it/{match.post.post_id}')
            continue
        log.debug('Match found: %s - H:%s', f'https://redd.it/{match.post.post_id}', h_distance)
        match.hamming_distance = h_distance
        match.hash_size = len(searched_hash)
        results.append(match)
    return results
def _reply_to_comment(self, response: SummonsResponse) -> SummonsResponse:
    """Reply to the summons comment and record the reply id on the response.

    Known non-fatal API failures (deleted comment, locked thread, too old)
    replace response.message with a marker string; rate limits and unknown
    API errors are logged and re-raised.
    """
    log.debug('Sending response to summons comment %s. MESSAGE: %s',
              response.summons.comment_id, response.message)
    try:
        reply = self.response_handler.reply_to_comment(
            response.summons.comment_id, response.message)
        response.comment_reply_id = reply.id
    except APIException as e:
        # Non-fatal error types: (marker message, log template, log level)
        handled = {
            'DELETED_COMMENT': ('DELETED COMMENT', 'Comment %s has been deleted', log.debug),
            'THREAD_LOCKED': ('THREAD LOCKED', 'Comment %s is in a locked thread', log.info),
            'TOO_OLD': ('TOO OLD', 'Comment %s is too old to reply to', log.info),
        }
        if e.error_type in handled:
            marker, template, emit = handled[e.error_type]
            emit(template, response.summons.comment_id)
            response.message = marker
        elif e.error_type == 'RATELIMIT':
            log.exception('PRAW Ratelimit exception', exc_info=False)
            raise
        else:
            log.exception('APIException without error_type', exc_info=True)
            raise
    except Exception:
        log.exception('Problem leaving response', exc_info=True)
        raise
    return response
def update_ban_list(uowm: UnitOfWorkManager, reddit: Reddit, notification_svc: NotificationService = None) -> NoReturn:
    """Re-verify each banned subreddit and drop entries where the ban is lifted.

    Subreddits checked within the last day are skipped. Still-banned entries
    get their last_checked timestamp refreshed; unbanned entries are removed
    and optionally announced via the notification service.

    :param uowm: UnitOfWorkManager
    :param reddit: Reddit client
    :param notification_svc: Optional service used to announce removals
    """
    log.info('Starting Job: Update Subreddit Bans')
    with uowm.start() as uow:
        for ban in uow.banned_subreddit.get_all():
            days_since_check = (datetime.utcnow() - ban.last_checked).days
            if days_since_check < 1:
                log.debug('Banned sub %s last checked %s days ago. Skipping',
                          ban.subreddit, days_since_check)
                continue
            if is_bot_banned(ban.subreddit, reddit):
                log.info('[Subreddit Ban Check] Still banned on %s', ban.subreddit)
                ban.last_checked = func.utc_timestamp()
            else:
                log.info('[Subreddit Ban Check] No longer banned on %s', ban.subreddit)
                uow.banned_subreddit.remove(ban)
                if notification_svc:
                    notification_svc.send_notification(
                        f'Removed {ban.subreddit} from ban list',
                        subject='Subreddit Removed From Ban List!')
        uow.commit()
def filter_title(match: SearchMatch):
    """Reject a match whose title contains any of the ignored keywords.

    NOTE(review): keywords are tested against the lower-cased title but are
    not lower-cased or stripped themselves - confirm config values are stored
    lower-case and comma-separated without spaces.
    """
    title_lower = match.post.title.lower()
    for kw in keywords:
        log.info('Title: %s - KW: %s', match.post.title, kw)
        if kw in title_lower:
            log.debug('Title Filter Reject. Title contains %s', kw)
            return False
    return True
def sub_filter(match: SearchMatch):
    """Keep only matches posted in the same subreddit as the searched post."""
    if match.post.subreddit == subreddit:
        return True
    log.debug('Same Sub Reject: Orig sub: %s - Match Sub: %s - %s',
              subreddit, match.post.subreddit,
              f'https://redd.it/{match.post.post_id}')
    return False
def _update_wiki_page(self, wiki_page: WikiPage, new_config: Dict) -> NoReturn:
    """Serialize new_config to JSON and write it to the subreddit's wiki page."""
    sub_name = wiki_page.subreddit.display_name
    log.info('Writing new config to %s', sub_name)
    log.debug('New Config For %s: %s', sub_name, new_config)
    # TODO - Check what exceptions can be thrown here
    wiki_page.edit(json.dumps(new_config))
def _offer_watch(self, submission: Submission) -> NoReturn:
    """Offer the author of a top OC post a repost watch via private message.

    No-op when the feature is disabled or the author was already messaged for
    this post. NOT_WHITELISTED API errors are logged and swallowed; other API
    errors are logged with traceback.
    """
    if not self.config.top_post_offer_watch:
        log.debug('Top Post Offer Watch Disabled')
        return
    author_name = submission.author.name
    log.info('Offer watch to %s on post %s', author_name, submission.id)
    with self.uowm.start() as uow:
        already_messaged = uow.bot_private_message.get_by_user_source_and_post(
            author_name, 'toppost', submission.id)
        if already_messaged:
            log.info('Already sent a message to %s', author_name)
            return
        try:
            self.response_handler.send_private_message(
                submission.author,
                TOP_POST_WATCH_BODY.format(shortlink=f'https://redd.it/{submission.id}'),
                subject=TOP_POST_WATCH_SUBJECT,
                source='toppost',
                post_id=submission.id)
        except APIException as e:
            if e.error_type == 'NOT_WHITELISTED_BY_USER_MESSAGE':
                log.error('Not whitelisted API error')
            else:
                log.exception('Unknown error sending PM to %s', author_name, exc_info=True)
def hamming_filter(match: ImageSearchMatch):
    """Keep only matches at or below the target hamming distance."""
    within_limit = match.hamming_distance <= target_hamming_distance
    if not within_limit:
        log.debug('Hamming Filter Reject - Target: %s Actual: %s - %s',
                  target_hamming_distance, match.hamming_distance,
                  f'https://redd.it/{match.post.post_id}')
    return within_limit
def get_link_reposts(
        url: Text,
        uowm: UnitOfWorkManager,
        search_settings: SearchSettings,
        post: Post = None,
        get_total: bool = False,
) -> LinkSearchResults:
    """Find all posts sharing the same URL hash as the given URL.

    :param url: URL to search for
    :param uowm: UnitOfWorkManager for database access
    :param search_settings: Settings attached to the result set
    :param post: Optional post being checked (recorded on the results)
    :param get_total: When True, also count the total link posts searched
    :return: LinkSearchResults with matches and timing data
    """
    digest = md5(url.encode('utf-8')).hexdigest()
    with uowm.start() as uow:
        results = LinkSearchResults(url, search_settings,
                                    checked_post=post,
                                    search_times=LinkSearchTimes())
        results.search_times.start_timer('query_time')
        results.search_times.start_timer('total_search_time')
        raw_matches = uow.posts.find_all_by_url_hash(digest)
        results.search_times.stop_timer('query_time')
        log.debug('Query time: %s', results.search_times.query_time)
        results.matches = [SearchMatch(url, raw) for raw in raw_matches]
        if get_total:
            results.total_searched = uow.posts.count_by_type('link')
    return results
def cross_post_filter(match: SearchMatch) -> bool:
    """Reject any match that is itself a crosspost."""
    if not match.post.crosspost_parent:
        return True
    log.debug('Crosspost Filter Reject - %s',
              f'https://redd.it/{match.post.post_id}')
    return False
def _reply_to_submission(self, submission_id: str, comment_body) -> Optional[Comment]:
    """Leave a top-level comment on the given submission.

    Records an API-usage event with the call duration. Rate limits raise
    RateLimitException; a Forbidden response records the subreddit as banned
    and returns None; other failures are logged and re-raised.
    """
    submission = self.reddit.submission(submission_id)
    if not submission:
        log.error('Failed to get submission %s', submission_id)
        return
    try:
        started = perf_counter()
        comment = submission.reply(comment_body)
        elapsed = float(round(perf_counter() - started, 2))
        self._record_api_event(
            elapsed,
            'reply_to_submission',
            self.reddit.reddit.auth.limits['remaining']
        )
        log.info('Left comment at: https://reddit.com%s', comment.permalink)
        log.debug(comment_body)
        self._log_response(comment)
        return comment
    except APIException as e:
        if e.error_type != 'RATELIMIT':
            log.exception('Unknown error type of APIException', exc_info=True)
            raise
        log.exception('Reddit rate limit')
        raise RateLimitException('Hit rate limit')
    except Forbidden:
        # Bot is banned from this sub; remember it and return None
        self._save_banned_sub(submission.subreddit.display_name)
    except Exception:
        log.exception('Unknown exception leaving comment on post https://redd.it/%s',
                      submission_id, exc_info=True)
        raise
def days_filter(match: SearchMatch):
    """Reject matches older than the cutoff_days closure variable."""
    # Compute the age once instead of twice: avoids duplicate datetime work
    # and a rare inconsistency if the clock ticks between the two calls
    age_days = (datetime.utcnow() - match.post.created_at).days
    if age_days > cutoff_days:
        log.debug('Date Cutoff Reject: Target: %s Actual: %s - %s',
                  cutoff_days, age_days,
                  f'https://redd.it/{match.post.post_id}')
        return False
    return True
def date_filter(match: SearchMatch):
    """Reject matches created at or after the cutoff_date closure variable."""
    if match.post.created_at >= cutoff_date:
        # Fixed log timestamp format: was '%Y-%d-%m' (day and month swapped)
        log.debug('Date Filter Reject: Target: %s Actual: %s - %s',
                  cutoff_date.strftime('%Y-%m-%d %H:%M:%S'),
                  match.post.created_at.strftime('%Y-%m-%d %H:%M:%S'),
                  f'https://redd.it/{match.post.post_id}')
        return False
    return True
def save_event(self, event: InfluxEvent):
    """Write an event to InfluxDB, buffering it when saving is disabled.

    When writes are enabled the event is written immediately and any buffered
    events are flushed; otherwise the event is queued until the retry time.
    """
    log.debug('Unsaved events %s', len(self._unsaved_events))
    if self.can_save():
        self._write_to_influx(event)
        self._flush_unsaved()
        return
    log.info('Event logging disabled until %s', self._retry_time)
    self._unsaved_events.append(event)
def send_notification(self, msg: Text, **kwargs) -> NoReturn:
    """Fan the message out to every configured notification agent.

    Agent failures are logged with traceback and never propagated, so one
    broken agent cannot block the others.
    """
    for agent in self.notification_agents:
        log.info('Sending notification to %s', agent.name)
        log.debug(msg)
        try:
            agent.send(msg, **kwargs)
        except Exception:
            log.exception('Failed to send notification', exc_info=True)
def save_new_post(self, post):
    """Ingest a newly scraped post and queue it for a repost check.

    Posts already present in the database are dropped. Posts that fail
    pre-processing (returns falsy) are skipped without queuing.
    """
    with self.uowm.start() as uow:
        if uow.posts.get_by_post_id(post.post_id):
            return
        log.debug('Post %s: Ingesting', post.post_id)
        post = pre_process_post(post, self.uowm, self.config.image_hash_api)
        if post:
            ingest_repost_check.apply_async((post, self.config), queue='repost')
            log.debug('Post %s: Sent post to repost queue', post.post_id)
def _return_redditor(self, username: Text) -> Redditor:
    """Return the cached Redditor for username, fetching and caching on miss."""
    cached = next((r for r in self._redditors if r.name == username), None)
    if cached is not None:
        log.debug('Returning cached redditor %s', cached.name)
        return cached
    fetched = self.reddit.redditor(username)
    if fetched:
        self._redditors.append(fetched)
        log.debug('Returning new redditor %s', username)
    return fetched
def _return_submission(self, submission_id: Text) -> Submission:
    """Return the cached Submission for submission_id, fetching and caching on miss."""
    cached = next((s for s in self._submissions if s.id == submission_id), None)
    if cached is not None:
        log.debug('Returning cached submission %s', submission_id)
        return cached
    fetched = self.reddit.submission(submission_id)
    if fetched:
        self._submissions.append(fetched)
        log.debug('Returning new submission %s', submission_id)
    return fetched
def _return_comment(self, comment_id: Text) -> Comment:
    """Return the cached Comment for comment_id, fetching and caching on miss."""
    cached = next((c for c in self._comments if c.id == comment_id), None)
    if cached is not None:
        log.debug('Returning cached comment %s', comment_id)
        return cached
    fetched = self.reddit.comment(comment_id)
    log.debug('Returning new comment %s', comment_id)
    if fetched:
        self._comments.append(fetched)
    return fetched
def _return_subreddit(self, sub_name: Text) -> Subreddit:
    """Return the cached Subreddit for sub_name, fetching and caching on miss."""
    cached = next((s for s in self._subreddits if s.display_name == sub_name), None)
    if cached is not None:
        log.debug('Returning cached sub %s', sub_name)
        return cached
    fetched = self.reddit.subreddit(sub_name)
    if fetched:
        log.debug('Returning new subreddit %s', sub_name)
        self._subreddits.append(fetched)
    return fetched
def save_pushshift_results_archive(self, data):
    """Queue archived pushshift submissions for ingest, skipping known posts.

    :param data: Iterable of raw pushshift submission dicts
    """
    with self.uowm.start() as uow:
        for submission in data:
            sub_id = submission['id']
            if uow.posts.get_by_post_id(sub_id):
                log.debug('Skipping pushshift post: %s', sub_id)
                continue
            post = pushshift_to_post(submission)
            log.debug('Saving pushshift post: %s', sub_id)
            save_new_post.apply_async((post,), queue='pushshift_ingest')
def monitor_for_mentions(self):
    """Poll the bot's inbox mentions forever and persist new summons.

    Each pass scans inbox mentions, skipping mentions that are older than 24
    hours, from known bot accounts, previously failed to save, or already
    recorded; the rest are stored as Summons rows. Rate-limit errors back off
    for 60 seconds; any other error is logged and the loop resumes after a
    20 second sleep.
    """
    # Comment ids that failed to save (bad data); skipped on later passes
    bad_mentions = []
    while True:
        try:
            for comment in self.reddit.inbox.mentions():
                # Ignore mentions created more than 24h (86400s) ago
                if comment.created_utc < datetime.utcnow().timestamp() - 86400:
                    log.debug('Skipping old mention. Created at %s',
                              datetime.fromtimestamp(comment.created_utc))
                    continue
                # Ignore mentions from well-known bot accounts
                if comment.author.name.lower() in [
                        'sneakpeekbot', 'automoderator']:
                    continue
                if comment.id in bad_mentions:
                    continue
                with self.uowm.start() as uow:
                    existing_summons = uow.summons.get_by_comment_id(comment.id)
                    if existing_summons:
                        log.debug('Skipping existing mention %s', comment.id)
                        continue
                    summons = Summons(
                        post_id=comment.submission.id,
                        comment_id=comment.id,
                        # Strip escape backslashes from the comment body
                        comment_body=comment.body.replace('\\', ''),
                        summons_received_at=datetime.fromtimestamp(comment.created_utc),
                        requestor=comment.author.name,
                        subreddit=comment.subreddit.display_name)
                    uow.summons.add(summons)
                    try:
                        uow.commit()
                    except DataError as e:
                        # Row data rejected by the DB; remember the id so we
                        # don't retry this mention forever
                        log.error('SQLAlchemy Data error saving comment')
                        bad_mentions.append(comment.id)
                        continue
        except ResponseException as e:
            # HTTP-level rate limit from Reddit
            if e.response.status_code == 429:
                log.error('IP Rate limit hit. Waiting')
                time.sleep(60)
                continue
        except AssertionError as e:
            # PRAW can surface 429s as assertion errors
            if 'code: 429' in str(e):
                log.error('Too many requests from IP. Waiting')
                time.sleep(60)
                # NOTE(review): this path exits the monitor loop entirely while
                # the ResponseException 429 path continues - confirm intended
                return
        except Exception as e:
            log.exception('Mention monitor failed', exc_info=True)
        time.sleep(20)
def link_repost_check(self, posts):
    """Check a batch of posts for link reposts and persist any found.

    For each post: skips blacklisted URL hashes, searches for posts sharing
    the same URL hash, filters the results, and records a LinkRepost against
    the first (oldest) match. URL hashes shared more than 10k times are added
    to the blacklist to avoid pathological result sets. Repost and batch
    events are emitted to the event logger.

    :param posts: Iterable of Post rows to check
    """
    with self.uowm.start() as uow:
        for post in posts:
            if post.url_hash in self.link_blacklist:
                log.info('Skipping blacklisted URL hash %s', post.url_hash)
                continue
            log.debug('Checking URL for repost: %s', post.url_hash)
            search_results = get_link_reposts(
                post.url,
                self.uowm,
                get_default_link_search_settings(self.config),
                post=post)
            if len(search_results.matches) > 10000:
                log.info('Link hash %s shared %s times. Adding to blacklist',
                         post.url_hash, len(search_results.matches))
                self.link_blacklist.append(post.url_hash)
                self.notification_svc.send_notification(
                    f'URL has been shared {len(search_results.matches)} times. Adding to blacklist. \n\n {post.url}')
            # NOTE(review): 'uitl_api' typo is the actual keyword name of
            # filter_search_results - do not "fix" it here
            search_results = filter_search_results(
                search_results,
                uitl_api=f'{self.config.util_api}/maintenance/removed'
            )
            search_results.search_times.stop_timer('total_search_time')
            log.info('Link Query Time: %s', search_results.search_times.query_time)
            if not search_results.matches:
                # Fixed log typo: was 'Not matching linkes'
                log.debug('No matching links for post %s', post.post_id)
                post.checked_repost = True
                uow.posts.update(post)
                uow.commit()
                continue
            log.info('Found %s matching links', len(search_results.matches))
            # Fixed: this log call's string literal was broken across lines
            log.info('Creating Link Repost. Post %s is a repost of %s',
                     post.post_id, search_results.matches[0].post.post_id)
            repost_of = search_results.matches[0].post
            new_repost = LinkRepost(
                post_id=post.post_id,
                repost_of=repost_of.post_id,
                author=post.author,
                source='ingest',
                subreddit=post.subreddit)
            repost_of.repost_count += 1
            post.checked_repost = True
            uow.posts.update(post)
            uow.link_repost.add(new_repost)
            try:
                uow.commit()
                self.event_logger.save_event(
                    RepostEvent(event_type='repost_found',
                                status='success',
                                repost_of=search_results.matches[0].post.post_id,
                                post_type=post.post_type))
            except IntegrityError:
                uow.rollback()
                log.exception('Error saving link repost', exc_info=True)
                self.event_logger.save_event(
                    RepostEvent(event_type='repost_found',
                                status='error',
                                repost_of=search_results.matches[0].post.post_id,
                                post_type=post.post_type))
    self.event_logger.save_event(
        BatchedEvent(event_type='repost_check',
                     status='success',
                     count=len(posts),
                     post_type='link'))
def reply_to_comment(self, comment_id: Text, comment_body: Text, subreddit: Text = None) -> Optional[Comment]:
    """Reply to a comment, or return a placeholder when live responses are off.

    With live_response disabled no API call is made and a dummy Comment with
    id '1111' is returned so callers still get a Comment-shaped object.
    """
    if not self.live_response:
        log.debug('Live response disabled')
        # TODO - 1/12/2021 - Sketchy at best
        return Comment(self.reddit.reddit, id='1111')
    return self._reply_to_comment(comment_id, comment_body, subreddit=subreddit)
def _remove_duplicates(
        self, matches: List[ImageSearchMatch]) -> List[ImageSearchMatch]:
    """Drop matches whose post.id has already been seen, preserving order.

    :param matches: Candidate matches, possibly containing duplicates
    :return: Matches with only the first occurrence of each post.id kept
    """
    log.debug('Remove duplicates from %s matches', len(matches))
    # O(n) seen-set replaces the previous O(n^2) linear rescan of results
    seen_ids = set()
    results = []
    for match in matches:
        if match.post.id in seen_ids:
            continue
        seen_ids.add(match.post.id)
        results.append(match)
    log.debug('%s matches after duplicate removal', len(results))
    return results
def create_image_posts(
        post: Post) -> Tuple[Post, RedditImagePost, RedditImagePostCurrent]:
    """Build the per-index image-post records for a post.

    Multiple copies of an image post are stored, one per index, so this
    creates all of them in one shot.

    :param post: Post obj
    :return: Tuple of (post, image_post, image_post_current)
    """
    records = (post, post_to_image_post(post), post_to_image_post_current(post))
    log.debug('Post %s: Created image_post and image_post_current', post.post_id)
    return records
def filter_dead_urls(match: SearchMatch) -> bool:
    """Keep only matches whose URL still answers a HEAD request with 200.

    Connection/SSL/timeout failures count as dead; any non-200 status is
    logged and rejected.
    """
    request_headers = {'User-Agent': random.choice(USER_AGENTS)}
    try:
        resp = requests.head(match.post.url, timeout=3, headers=request_headers)
    except (ConnectionError, SSLError, ReadTimeout):
        return False
    if resp.status_code != 200:
        log.debug('Active URL Reject: https://redd.it/%s', match.post.post_id)
        return False
    return True