def process_comment_urls(udb, ulimit=100000, number_of_processes=4):
    print('---EXTRACTING COMMENT URLS')
    totalcompleted = 0
    if ulimit == 0:
        ulimit = None
    total_to_process = Comment.select().where(
        Comment.number_urls.is_null()).count()
    if ulimit is not None and total_to_process > ulimit:
        total_to_process = ulimit
    with tqdm(total=total_to_process) as pbar:
        while totalcompleted < total_to_process:
            with udb.atomic():
                queue_tasks = [(comment.id, comment.body)
                               for comment in Comment.select().where(
                                   Comment.number_urls.is_null()).limit(ulimit)]

            # Create queues
            task_queue = Queue()  # ctx.Queue()
            done_queue = Queue()  # ctx.Queue()

            # Submit tasks
            for task in queue_tasks:
                task_queue.put(task)

            # Start worker processes
            for i in range(number_of_processes):
                Process(target=url_worker,
                        args=(task_queue, done_queue)).start()

            # Collect results and persist URL counts and comment-url links
            for i in range(len(queue_tasks)):
                comment_id, url_set = done_queue.get()
                try:
                    with udb.atomic():
                        Comment.update(number_urls=len(url_set)).where(
                            Comment.id == comment_id).execute()
                        for url in url_set:
                            url, urlcreated = Url.get_or_create(link=url)
                            try:
                                CommentLinks.insert(
                                    comment=comment_id,
                                    url=url.id).on_conflict_ignore().execute()
                            except SQLError:
                                print(comment_id, url.id)
                                raise
                except KeyboardInterrupt:
                    quit()
                pbar.update(1)
                totalcompleted += 1

            # Tell child processes to stop
            for i in range(number_of_processes):
                task_queue.put('STOP')
"""
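
# url_worker is defined elsewhere in this project; the function below is a
# minimal, hypothetical sketch of the contract process_comment_urls relies on:
# consume (comment_id, body) tuples from task_queue until the 'STOP' sentinel,
# then reply with (comment_id, url_set) on done_queue. The regex is an
# assumption for illustration, not necessarily the project's extraction logic.
def example_url_worker(task_queue, done_queue):
    import re
    url_pattern = re.compile(r'https?://[^\s\)\]>"\']+')
    # iter() with a sentinel keeps pulling tasks until 'STOP' arrives
    for comment_id, body in iter(task_queue.get, 'STOP'):
        done_queue.put((comment_id, set(url_pattern.findall(body or ''))))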
def reddit_comment_update(appcfg, update_length=604800):
    print(' ---UPDATING COMMENTS WITH DATA FROM THE REDDIT API')
    totalnumber = Comment.select().where(
        (Comment.retrieved_on - Comment.created_utc) < update_length).count()
    needs_update_list = list()
    needs_update = Comment.select().where(
        (Comment.retrieved_on - Comment.created_utc) < update_length)
    print(' ---Building Task List. This could take a while for large subreddits')
    with tqdm(total=totalnumber, ncols=100, dynamic_ncols=False) as nbar:
        for dbcomment in needs_update:
            fullname = "t1_{}".format(dbcomment.comment_id)
            needs_update_list.append(fullname)
            nbar.update(1)
    needs_update_list = list(chunks(needs_update_list, 100))
    print(' ---Accessing data from Reddit API and entering into database')
    with tqdm(total=totalnumber, ncols=100, dynamic_ncols=False) as pbar:
        for nlist in needs_update_list:
            try:
                rd_comments = list(r.info(nlist))
            except RequestException:
                print("Connection Error to Reddit API. Exiting...")
                # quit()
                return
            with appcfg.database.atomic():
                for rdcomment in rd_comments:
                    updatedtime = arrow.now().timestamp
                    if rdcomment.author is None and rdcomment.body == '[deleted]':
                        Comment.update(
                            score=rdcomment.score,
                            retrieved_on=updatedtime,
                            deleted=True).where(
                                Comment.comment_id == rdcomment.id).execute()
                    # elif rdcomment.body == '[deleted]':
                    #     Comment.update(score=rdcomment.score,
                    #                    retrieved_on=updatedtime,
                    #                    deleted=False).where(
                    #                        Comment.comment_id == rdcomment.id).execute()
                    # elif rdcomment.author is None:
                    #     Comment.update(score=rdcomment.score,
                    #                    # body=rdcomment.body_html,
                    #                    retrieved_on=updatedtime,
                    #                    deleted=True).where(
                    #                        Comment.comment_id == rdcomment.id).execute()
                    else:
                        Comment.update(
                            score=rdcomment.score,
                            # body=rdcomment.body_html,
                            retrieved_on=updatedtime,
                            deleted=False).where(
                                Comment.comment_id == rdcomment.id).execute()
                    pbar.update(1)
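
# chunks() is a helper defined elsewhere in this project; reddit's /api/info
# endpoint accepts at most 100 fullnames per request, which is why the task
# list above is split into batches of 100. A minimal sketch of the assumed
# behaviour, as a simple slicing generator:
def example_chunks(seq, size):
    # Yield successive size-length slices of seq; the last may be shorter.
    for start in range(0, len(seq), size):
        yield seq[start:start + size]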
def process_comments(appcfg):
    # Get newest comments with two week overlap
    print(' PROCESSING NEWEST PUSHSHIFT.IO COMMENTS FOR', appcfg.subreddit)
    try:
        newest_utc = int(
            Comment.select(fn.MAX(Comment.created_utc)).scalar().timestamp())
    except (TypeError, AttributeError):
        newest_utc = None
    if newest_utc is not None:
        oldestdate = newest_utc  # - 1209600  # two weeks overlap, in seconds
    else:
        oldestdate = appcfg.oldestdate
    try:
        comment_id_set = get_push_comments(appcfg, appcfg.newestdate, oldestdate)
    except (ConnectionError, SSLError, ChunkedEncodingError):
        comment_id_set = None
        print(" Connection Error for Pushshift API. Quitting...")
        # quit()
        return comment_id_set

    # Get oldest comments in case progress was interrupted, with two week overlap
    try:
        oldest_utc = int(
            Comment.select(fn.MIN(Comment.created_utc)).scalar().timestamp())
    except (TypeError, AttributeError):
        oldest_utc = None
    if oldest_utc is not None:
        newestdate = oldest_utc  # + 1209600  # two weeks overlap, in seconds
    else:
        newestdate = appcfg.newestdate
    print(' PROCESSING OLDEST PUSHSHIFT.IO COMMENTS FOR', appcfg.subreddit)
    try:
        old_comment_id_set = get_push_comments(appcfg, newestdate,
                                               appcfg.oldestdate)
    except (ConnectionError, SSLError, ChunkedEncodingError):
        old_comment_id_set = None
        print(" Connection Error for Pushshift API. Quitting...")
        # quit()
        return old_comment_id_set
    comment_id_set |= old_comment_id_set

    filedate = arrow.now().timestamp
    basedir = "/rpa" if os.environ.get('DOCKER', '0') == '1' else '.'
    coutput_file_path = "{basedir}/{subreddit}_comments_{timestamp}.txt".format(
        basedir=basedir, subreddit=appcfg.subreddit, timestamp=filedate)
    # with open(coutput_file_path, 'w', encoding='UTF-8') as comment_file:
    #     comment_file.writelines(comment_id_set)
    print(" Total comments submitted to", appcfg.subreddit, "in set:",
          len(comment_id_set))

    deleted = Author.get_or_none(name='[deleted]')
    if deleted is not None:
        cupdatet = Comment.update(deleted=True).where(
            (Comment.author == deleted.id)
            # peewee needs bitwise | (not Python `or`) to emit SQL OR
            & (Comment.deleted.is_null() | (Comment.deleted == 0))).execute()
        print(' Updated deleted field in comments. Set deleted = True for',
              cupdatet, 'records.')
        cupdatef = Comment.update(deleted=False).where(
            (Comment.author != deleted.id)
            & (Comment.deleted.is_null())).execute()
        print(' Updated deleted field in comments. Set deleted = False for',
              cupdatef, 'records.')
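
# Why the deleted-flag filter above uses `|`: peewee builds SQL from
# overloaded operators, so compound predicates need the bitwise & and | with
# explicit parentheses. Python's `and`/`or` short-circuit on the truthiness
# of the first expression object and silently drop the second condition from
# the query. Illustration only:
#
#     Comment.deleted.is_null() | (Comment.deleted == 0)   # "... IS NULL OR ... = 0"
#     Comment.deleted.is_null() or (Comment.deleted == 0)  # just the first term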
def get_push_comments(appcfg, newestdate, oldestdate):
    subnumber = 1
    sub, subcreated = Subreddit.get_or_create(name=appcfg.subreddit)
    sub_id = sub.id
    totalsubnumber = 0
    push_comment_id_set = set()
    # Ask pushshift how many comments exist in the window (size=0 returns
    # only the aggregation, no documents)
    total_available = "https://api.pushshift.io/reddit/search/comment/?subreddit={subreddit}" \
                      "&after={oldestdate}&before={newestdate}&aggs=subreddit&size=0"
    turl = total_available.format(subreddit=appcfg.subreddit,
                                  oldestdate=oldestdate,
                                  newestdate=newestdate)
    # newestdate = appcfg.newestdate
    with requests.get(turl) as tp:
        if tp.status_code != 200:
            print("Connection Error for Pushshift API, quitting...")
            # quit()
            return push_comment_id_set
        tpush = tp.json()
    try:
        total_comments = tpush['aggs']['subreddit'][0]['doc_count']
    except (IndexError, KeyError):
        print(" No new comments to process from pushshift API for",
              appcfg.subreddit)
        return push_comment_id_set
    linktemplate = "https://api.pushshift.io/reddit/search/comment/?subreddit={subreddit}" \
                   "&after={oldestdate}&before={newestdate}&sort=desc&size=500"
    with tqdm(total=total_comments, ncols=100, dynamic_ncols=False) as pbar:
        # Page backwards in time: each pass shrinks newestdate (the before=
        # bound) to the oldest created_utc seen, until a page comes back empty
        while subnumber > 0:
            url = linktemplate.format(subreddit=appcfg.subreddit,
                                      oldestdate=oldestdate,
                                      newestdate=newestdate)
            with requests.get(url) as rp:
                try:
                    push = rp.json()
                except JSONDecodeError:
                    print(" JSON DECODE ERROR on Pushshift API Comments", url)
                    time.sleep(10)
                    continue  # return push_comment_id_set
            subnumber = len(push['data'])
            totalsubnumber += subnumber
            commentlinktemplate = 'https://www.reddit.com/comments/{link_id}/_/{comment_id}/.json\n'
            with appcfg.database.atomic():
                for item in push['data']:
                    if 'id' not in item:
                        print('The following item has no primary comment ID:',
                              item)
                        continue
                    item['comment_id'] = item.pop('id')
                    try:
                        item['link_id'] = item['link_id'].replace('t3_', '')
                        commentlink = commentlinktemplate.format(
                            link_id=item['link_id'],
                            comment_id=item['comment_id'])
                        push_comment_id_set.add(commentlink)
                    except KeyError:
                        print('The following item has no submission link ID:',
                              item)
                        continue
                    if item['created_utc'] < newestdate:
                        newestdate = item['created_utc']
                    item['subreddit'] = sub_id
                    if item.get('author_flair_text') is not None:
                        author_flair, author_flaircreated = AuthorFlair.get_or_create(
                            text=item['author_flair_text'])
                        item['author_flair'] = author_flair.id
                    else:
                        item['author_flair'] = None
                    author, author_created = Author.get_or_create(
                        name=item['author'])
                    item['author'] = author.id
                    # Keep only the keys that map to Comment model fields
                    itemfields = Comment._meta.fields.keys()
                    insertdict = {key: value for key, value in item.items()
                                  if key in itemfields}
                    Comment.insert(insertdict).on_conflict_ignore().execute()
            pbar.update(subnumber)
    return push_comment_id_set
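
# Hypothetical usage sketch for the paginator above. Both bounds are Unix
# epoch seconds: pushshift's after= takes oldestdate and before= takes
# newestdate, and the loop walks backwards until it receives an empty page.
# Commented out so the module has no side effects on import; appcfg is
# assumed to be configured as in process_comments above.
#
# ids = get_push_comments(appcfg,
#                         newestdate=arrow.utcnow().timestamp,
#                         oldestdate=arrow.utcnow().shift(days=-30).timestamp)
# print(len(ids), 'comment permalinks collected')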