# dictionary format with the filter attributes as keys
sub_results = [
    post.d_ for post in list(
        api.search_submissions(after=start_epoch,
                               subreddit=subreddit,
                               filter=['title', 'selftext', 'id'],
                               limit=1000))
]
#print(sub_results)

# map posts to comments
post_to_comment_dict = dict()
for post in sub_results:
    sub_id = post['id']
    # fetch comment ids
    comment_id_results = list(api._get_submission_comment_ids(sub_id))
    #print("sub_id: ", sub_id, " comments: ", comment_id_results)
    post_to_comment_dict[sub_id] = comment_id_results

# fetch comment data
for comment_list in post_to_comment_dict.values():
    # parent ids: a t3_ prefix points at the original submission, t1_ at another comment;
    # is_submitter = True means the comment is a reply from the OP
    comment_results = [
        comment.d_ for comment in list(
            api.search_comments(ids=comment_list,
                                filter=['id', 'body', 'is_submitter', 'score']))
    ]
    #print(comment_results)
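# The t3_/t1_ parent-id prefixes noted above are enough to rebuild the reply
# tree. A minimal sketch, assuming 'parent_id' is added to the filter list in
# the query above (the query as written does not fetch it):
def build_reply_tree(comment_results):
    """Group comments by parent: top-level replies land under the t3_ key of
    the submission, nested replies under the t1_ key of their parent."""
    tree = {}
    for comment in comment_results:
        tree.setdefault(comment['parent_id'], []).append(comment)
    return tree

# e.g. tree['t3_' + sub_id] lists the top-level comments of that submission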
import time
from datetime import datetime
from time import mktime

from psaw import PushshiftAPI


def fetch_links(subreddit=None, date_start=None, date_stop=None, limit=None,
                score=None, self_only=False):
    if subreddit is None or date_start is None or date_stop is None:
        print('ERROR: missing required arguments')
        exit()

    api = PushshiftAPI(rate_limit_per_minute=pushshift_rate_limit_per_minute,
                       detect_local_tz=False)

    # get links
    links = []
    print('fetching submissions %s to %s...' %
          (time.strftime('%Y-%m-%d', date_start),
           time.strftime('%Y-%m-%d', date_stop)))
    params = {
        'after': int(mktime(date_start)) - 86400,  # make dates inclusive, adjust for UTC
        'before': int(mktime(date_stop)) + 86400,
        'subreddit': subreddit,
        'filter': link_fields,
        'sort': 'asc',
        'sort_type': 'created_utc',
    }
    if limit:
        params['limit'] = int(limit)
    if score:
        params['score'] = score
    if self_only:
        params['is_self'] = True
    link_results = list(api.search_submissions(**params))
    print('processing %s links' % len(link_results))

    for s in link_results:
        # print('%s %s' % (datetime.utcfromtimestamp(int(s.d_['created_utc'])), s.d_['title']))
        # pprint(s)

        # get comment ids
        comments = []
        if s.d_['num_comments'] > 0 and not comment_data_exists(
                subreddit, s.d_['created_utc'], s.d_['id']):
            comment_ids = list(api._get_submission_comment_ids(s.d_['id']))
            # print('%s comment_ids: %s' % (data['id'], comment_ids))

            # get comments, in chunks so each request URL stays short
            if len(comment_ids) > 0:
                if len(comment_ids) > max_comments_per_query:
                    mychunks = chunks(comment_ids, max_comments_per_query)
                else:
                    mychunks = [comment_ids]
                for chunk in mychunks:
                    comment_params = {
                        'filter': comment_fields,
                        'ids': ','.join(chunk),
                        'limit': max_comments_per_query,
                    }
                    comments_results = list(api.search_comments(**comment_params))
                    print('%s fetch link %s comments %s/%s' %
                          (datetime.utcfromtimestamp(int(s.d_['created_utc'])),
                           s.d_['id'], len(comments_results), len(comment_ids)))
                    for c in comments_results:
                        comments.append(c.d_)
        s.d_['comments'] = comments
        links.append(s.d_)

        # write results
        if len(links) >= write_every:
            success = write_links(subreddit, links)
            if success:
                links = []

    # write remaining results
    if len(links):
        write_links(subreddit, links)
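# fetch_links() relies on a chunks() helper that is not shown in this excerpt.
# A minimal sketch of what it is assumed to do (split a list into fixed-size
# slices so each Pushshift request stays within max_comments_per_query ids):
def chunks(lst, n):
    """Yield successive n-sized slices of lst."""
    for i in range(0, len(lst), n):
        yield lst[i:i + n]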
logger.warning(f"Found more than 1000 ({bin_submissions}) submissions in one bin, try lowering the bin size: {before}-{after}") out_dir = data_dir.joinpath(subreddit) os.makedirs(out_dir, exist_ok=True) if len(list(out_dir.glob("*.text"))) > args.number_of_threads: print(f"stopping at {args.number_of_threads} threads") break for submission in submissions: submission = psaw_to_dict(submission) submission_id = get_id_for_comments(submission) out_file = out_dir.joinpath(submission_id + ".pickle") if not out_file.is_file(): # Get comments submission_comment_ids = api._get_submission_comment_ids( submission["id"] ) if len(submission_comment_ids) > 3000: logger.debug(f"Skipping thread with large amount of commments {submission['id']}") continue # because it's too slow to parse these large trees with the current code comment_dict = collections.defaultdict(list) # Batch to avoid 414: Url too long batch_size = 400 # We can do 1000 at a time for i in range(0, len(submission_comment_ids), batch_size): batch_ids = submission_comment_ids[i : i + batch_size] # Use psaw try: comments = api.search_comments(ids=batch_ids) # It will just repeat unless we set a limit