def __init__(self, credentials_file='credentials.json'):
    try:
        with open(credentials_file) as f:
            params = json.load(f)
        self.filename = None
        self.reddit = praw.Reddit(client_id=params['client_id'],
                                  client_secret=params['api_key'],
                                  user_agent='Sentiment Analyzer')
        self.api = PushshiftAPI()
    # If credentials.json does not exist, prompt the user for authentication
    # values, create the file, then proceed.
    except FileNotFoundError:
        auth_keys = {}
        auth_keys['client_id'] = prompt_for_auth_val('client_id')
        auth_keys['api_key'] = prompt_for_auth_val('api_key')
        auth_keys['username'] = prompt_for_auth_val('username')
        auth_keys['password'] = prompt_for_auth_val('password')
        with open(credentials_file, 'w') as outf:
            json.dump(auth_keys, outf)
        # auth_keys is already a plain dict; no need to round-trip it
        # through json.dumps/json.loads.
        params = auth_keys
        self.filename = None
        self.reddit = praw.Reddit(client_id=params['client_id'],
                                  client_secret=params['api_key'],
                                  user_agent='Sentiment Analyzer')
        self.api = PushshiftAPI(backoff=10, max_retries=20)
def subreddit_data(subreddit_name, post_amount=None):
    if post_amount:
        post_amount = int(post_amount)
    else:
        post_amount = config['display'].getint('subreddit_max_posts')
    subreddit = reddit.subreddit(subreddit_name)
    print(f'subreddit data, name: {subreddit_name}, amount: {post_amount}')
    if not config['modes'].getboolean('slow_mode'):
        # Materialize the listing so it can be sorted and sliced below.
        submissions = list(subreddit.top('day', limit=post_amount))
    else:
        api = PushshiftAPI(reddit)
        submissions = list(api.search_submissions(subreddit=subreddit_name,
                                                  after='56h', before='24h'))
    # Sanity checks: flag duplicate IDs and submissions from the wrong subreddit.
    id_set = set()
    for submission in submissions:
        if submission.id in id_set:
            print('{}: "{}" is duplicated'.format(submission.id, submission.title))
        else:
            id_set.add(submission.id)
        if submission.subreddit.display_name.lower() != subreddit_name.lower():
            print('submission {} of {} != subreddit {}'.format(
                submission.id, submission.subreddit.display_name, subreddit_name))
    submissions.sort(key=lambda item: item.score, reverse=True)
    submissions = submissions[:post_amount]
    print(id_set)
    print(subreddit.display_name, submissions)
    posts = get_posts(submissions)
    return jsonify(posts)
def getRedditSubsList(startEpoch, endEpoch, prawUserAgent='alpha'):
    """
    Returns a list of Reddit submissions from the bachelor subreddit

    Args:
        startEpoch: An int of POSIX time. Indicates the start time of Reddit
            submissions.
        endEpoch: An int of POSIX time. Indicates the end time of Reddit
            submissions.
        prawUserAgent: The name of the Reddit app. This should be specified
            in a praw.ini text file; see:
            https://praw.readthedocs.io/en/latest/getting_started/configuration/prawini.html

    Returns:
        subList: A list of Reddit submission objects from PRAW
    """
    import praw
    from psaw import PushshiftAPI

    r = praw.Reddit(prawUserAgent)
    api = PushshiftAPI(r)
    subList = list(api.search_submissions(before=endEpoch,
                                          after=startEpoch,
                                          subreddit='thebachelor'))
    return subList
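# Illustrative usage of getRedditSubsList. 'demo_bachelor_window' is a
# hypothetical helper, not part of the original code; it assumes a praw.ini
# site named 'alpha' exists.
def demo_bachelor_window():
    import datetime as dt
    start_epoch = int(dt.datetime(2020, 1, 6).timestamp())
    end_epoch = int(dt.datetime(2020, 1, 13).timestamp())
    subs = getRedditSubsList(start_epoch, end_epoch)
    print(f'{len(subs)} submissions in the window')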
def get_reddit_comments(search_terms: list, subreddits: list):
    import praw
    from psaw import PushshiftAPI
    import pandas as pd
    import numpy as np
    import sys

    # NOTE: hardcoded credentials; prefer loading these from a config file
    # or environment variables.
    reddit = praw.Reddit(client_id="t6zhrU4Kfc2nRA",
                         client_secret="hosJ2fHU1z47MVfxRF-onhwxqpQ",
                         user_agent="sentiment_analysis")
    api = PushshiftAPI(reddit)
    body, timestamp, subreddit_name = [], [], []
    for query in search_terms:
        for subreddit in subreddits:
            print('Searching ' + subreddit + ' for:', query)
            gen = api.search_submissions(q=query, subreddit=subreddit)
            comment_counter = 0
            submission_counter = 0
            for submission in list(gen):
                submission.comments.replace_more(limit=None)
                submission_counter += 1
                sys.stdout.write("\033[F")  # back to previous line
                sys.stdout.write("\033[K")  # clear line
                print(str(submission_counter) + ' posts found')
                for comment in list(submission.comments):
                    body += [comment.body]
                    timestamp += [
                        pd.to_datetime(int(comment.created_utc),
                                       unit='s').tz_localize('UTC')
                    ]
                    subreddit_name += [comment.subreddit.display_name]
                    comment_counter += 1
                    sys.stdout.write("\033[F")  # back to previous line
                    sys.stdout.write("\033[K")  # clear line
                    print(str(comment_counter) + ' comments found')
                    # Keep the three lists the same length; pad the shortest
                    # with NaN if an append was skipped.
                    if len(body) < len(timestamp) or len(body) < len(subreddit_name):
                        body += [np.nan]
                    elif len(timestamp) < len(body) or len(timestamp) < len(subreddit_name):
                        timestamp += [np.nan]
                    elif len(subreddit_name) < len(body) or len(subreddit_name) < len(timestamp):
                        subreddit_name += [np.nan]
    df = pd.DataFrame({
        'Timestamp': timestamp,
        'Body': body,
        'Subreddit': subreddit_name
    }).dropna()
    df.set_index('Timestamp', inplace=True)
    df.sort_index(inplace=True)
    df = df.drop_duplicates()
    return df
def grab_data(subreddit, reddit):
    """
    Grabs at most 40000 submissions from subreddit over a 2-year period

    Parameters
    ----------
    subreddit : string
        the subreddit of interest
    reddit : Reddit Instance
        an instance of the PRAW Reddit class

    Returns
    ----------
    list of submissions
    """
    api = PushshiftAPI(reddit)
    start_epoch = int(dt.datetime(2017, 3, 30).timestamp())
    end_epoch = int(dt.datetime(2019, 3, 30).timestamp())
    submissions = list(api.search_submissions(before=end_epoch,
                                              after=start_epoch,
                                              subreddit=subreddit,
                                              limit=40000))
    print(f'grabbed {len(submissions)} submissions')
    np.save(subreddit + '_submissions', submissions)
    return submissions
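# Reading the archive back (a usage sketch, not part of the original code):
# np.save pickles the submission objects, so np.load must opt in to
# pickling via allow_pickle=True.
def load_saved_submissions(subreddit):
    return list(np.load(subreddit + '_submissions.npy', allow_pickle=True))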
def extract_submissions(subr, lim, bef, aft, srt='desc'):
    subm_lst = []
    time.sleep(3)
    try:
        api = PushshiftAPI()
        corona_generator = api.search_submissions(
            subreddit=subr, limit=lim, before=bef, after=aft,
            sort=srt, lang='en',
            filter=['id', 'title', 'subreddit', 'author', 'url', 'domain',
                    'is_self', 'is_video', 'is_crosspostable', 'post_hint',
                    'num_comments', 'score', 'removed_by_category',
                    'selftext', 'link_flair_text', 'full_link'])
        subm_lst = list(corona_generator)
        # SC_subm.debug('Success!')
    except StopIteration:
        SC_subm.error(subr + ': StopIterationError')
    except RuntimeError:
        SC_subm.error(subr + ': RuntimeError')
    except timeout_decorator.timeout_decorator.TimeoutError:
        SC_subm.error(subr + ': TimeoutError')
    except Exception:
        SC_subm.error(subr + ': OtherError')
    return subm_lst
def handler_name(event, context):
    # This is the entry point AWS Lambda invokes.
    password = urllib.parse.quote("p@sswordw!thsp&ci@lch@r@ct&rs")
    # The URI placeholder is expected to contain a %s slot for the
    # escaped password.
    mng_client = pymongo.MongoClient("mongodb-uri-string" % password)
    mng_db = mng_client['db_name']  # Replace with the Mongo db name
    collection_name = 'collection_name'  # Replace with the Mongo collection name
    db_cm = mng_db[collection_name]
    # PRAW is Reddit's API wrapper
    reddit = praw.Reddit(client_id='client_id',
                         client_secret='client_secret',
                         user_agent='user_agent',
                         username='******',
                         password='******')
    # Wrap PRAW with the Pushshift API wrapper
    api = PushshiftAPI(reddit)
    # Run function to get end and start date strings
    end_epoch, start_epoch = get_date_string(db_cm)
    # Query Pushshift for comments
    results = list(api.search_comments(after=start_epoch,
                                       before=end_epoch,
                                       subreddit='subreddit',
                                       filter=['url', 'author', 'title', 'subreddit']))
    # Iterate over comments in the results and save each to the database
    for comment in results:
        save_comm(comment, db_cm)
def get_submission_psaw(n, sub_dict):
    """
    Returns two dicts of submission results, keyed by subreddit:
    recent: results from n hours ago until now
    prev: results from 2n hours ago until n hours ago
    """
    api = PushshiftAPI()
    mid_interval = datetime.today() - timedelta(hours=n)
    timestamp_mid = int(mid_interval.timestamp())
    timestamp_start = int((mid_interval - timedelta(hours=n)).timestamp())
    timestamp_end = int(datetime.today().timestamp())
    recent = {}
    prev = {}
    for key in sub_dict:
        # results from the last n hours
        recent[key] = api.search_submissions(
            after=timestamp_mid,
            before=timestamp_end,
            subreddit=key,
            filter=['title', 'link_flair_text', 'selftext', 'score'])
        # results from the last 2n hours until n hours ago
        prev[key] = api.search_submissions(
            after=timestamp_start,
            before=timestamp_mid,
            subreddit=key,
            filter=['title', 'link_flair_text', 'selftext', 'score'])
    return recent, prev
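# Hypothetical usage of get_submission_psaw: compare post volume across the
# two windows. The returned values are generators, so count each only once.
def demo_volume_change(n=24):
    recent, prev = get_submission_psaw(n, {'wallstreetbets': None})
    for key in recent:
        n_recent = sum(1 for _ in recent[key])
        n_prev = sum(1 for _ in prev[key])
        print(f'{key}: {n_prev} -> {n_recent} submissions')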
def wallstreetbettor():
    api = PushshiftAPI()
    start_time = int(datetime.datetime(2021, 2, 3).timestamp())
    submissions = api.search_submissions(
        after=start_time,
        subreddit='wallstreetbets',
        filter=['url', 'author', 'title', 'subreddit'],
        limit=15000)
    stock_tracker = {}
    for submission in submissions:
        words = submission.title.split()
        # Unique words in the title that start with '$'
        cashtag = list(set(filter(lambda word: word.lower().startswith('$'), words)))
        if len(cashtag) > 0:
            for item in cashtag:
                tag = item
                # Strip a single trailing punctuation mark.
                if item[-1] == ',' or item[-1] == ')' or item[-1] == '?':
                    tag = item[:-1]
                if tag[1:].isalpha():
                    if tag in stock_tracker:
                        stock_tracker[tag] += 1
                    else:
                        stock_tracker[tag] = 1
    stock_tracker = sorted(stock_tracker.items(),
                           key=operator.itemgetter(1),
                           reverse=True)
    return stock_tracker
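# Quick check (hypothetical helper): wallstreetbettor() returns
# (cashtag, count) pairs sorted by count, so slicing the head of the list
# gives the most-mentioned tickers.
def demo_top_cashtags(k=10):
    for tag, count in wallstreetbettor()[:k]:
        print(f'{tag}: {count}')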
def scrape_data(subreddit):
    # Initialization
    api = PushshiftAPI()
    # Build a list from the scrape
    scrape_list = list(api.search_submissions(
        subreddit=subreddit,
        filter=['title', 'subreddit', 'num_comments', 'author',
                'subreddit_subscribers', 'score', 'domain', 'created_utc'],
        limit=15000))
    # Pull the relevant fields out of each result
    # (indices follow the alphabetical field order of the returned records)
    clean_scrape_lst = []
    for i in range(len(scrape_list)):
        scrape_dict = {}
        scrape_dict['subreddit'] = scrape_list[i][5]
        scrape_dict['author'] = scrape_list[i][0]
        scrape_dict['domain'] = scrape_list[i][2]
        scrape_dict['title'] = scrape_list[i][7]
        scrape_dict['num_comments'] = scrape_list[i][3]
        scrape_dict['score'] = scrape_list[i][4]
        scrape_dict['timestamp'] = scrape_list[i][1]
        clean_scrape_lst.append(scrape_dict)
    # Show the number of subscribers
    print(subreddit, 'subscribers:', scrape_list[1][6])
    # Return the list of scraped data
    return clean_scrape_lst
def when_pressed(event=None):
    global x, tags, urls
    api = PushshiftAPI()
    now = datetime.now() - timedelta(days=3)
    start = datetime(hour=1, month=now.month, year=now.year, day=now.day)
    # Convert to an epoch timestamp for Pushshift.
    posts = list(api.search_submissions(after=int(start.timestamp()),
                                        subreddit='wallstreetbets',
                                        filter=['url', 'author', 'title', 'subreddit'],
                                        limit=1000))
    for post in posts:
        words = post.title.split()
        cashtags = list(set(filter(lambda word: word.lower().startswith('$'), words)))
        if len(cashtags) > 0:
            for cashtag in cashtags:
                if ("$" + e.get().upper()) in cashtag:
                    print(post.url)
                    urls["tag" + str(x)] = post.url
                    text.insert('1.0', post.title + "\n")
                    text.tag_add("tag" + str(x), '1.0', '1.end')
                    text.tag_config("tag" + str(x), foreground='blue', underline=True)
                    text.tag_bind("tag" + str(x), '<Enter>', show_hand_cursor)
                    text.tag_bind("tag" + str(x), '<Leave>', show_xterm_cursor)
                    tag = "tag" + str(x)
                    # Bind the click handler; the default-arg trick captures
                    # the current tag for each binding.
                    callback = lambda event, tag=tag: do(event, tag)
                    text.tag_bind("tag" + str(x), '<Button-1>', callback)
                    x = x + 1
                    break
def create_annotation_file(annotations, filename):
    gen = PushshiftAPI().search_submissions(
        ids=[annotations[k]["ID"] for k in annotations.keys()],
        subreddit='Denmark',
        filter=['id', 'title', 'selftext', 'full_link', 'created_utc'])
    annotations = process_submissions(gen, annotations)
    with open(filename, 'w') as f, open('./logs.txt', 'a') as fp:
        for _, v in annotations.items():
            state, code = validate_post(v)
            if not state:
                print(f"\033[1;33;40m WARNING: \033[0m The submission {v['ID']} "
                      f"or annotations have changed since producing this data set. "
                      f"To learn more check the logs afterward (logs.txt)")
                write_logs(fp, v, code)
                continue
            try:
                for t, l1, l2 in zip(v['text'],
                                     v['Layer1'].split(' '),
                                     v['Layer2'].split(' ')):
                    f.write(f'{t}\t{l1}\t{l2}\n')
                f.write('\n')
            except KeyError:
                print(f"\033[1;33;40m WARNING:\033[0m Pulling the submission {v['ID']} "
                      f"produced an error leading to this KeyError. "
                      f"To learn more check the logs afterward (logs.txt)")
                write_logs(fp, v, 4)
def get_tickers():
    tickers = pd.read_csv('data_scrappers\\tickers.csv', header=None,
                          names=['Tickers'])
    tickers['Tickers'] = tickers['Tickers'].str[:-2]
    tickers = tickers['Tickers'].values
    api = PushshiftAPI()
    # Pushshift expects an epoch timestamp, so build one for 7 days ago.
    start_epoch = int((dt.datetime.now() - timedelta(days=7)).timestamp())
    subs = api.search_submissions(
        after=start_epoch,
        subreddit='wallstreetbets',
        filter=['url', 'author', 'title', 'subreddit'],
        limit=2000)
    cash_tags = {}
    for sub in subs:
        for word in sub.title.split(' '):
            if (word.isupper() or '$' in word) and word in tickers and word.upper() != 'GME':
                word = re.sub("[^a-zA-Z]+", "", word)
                if word.upper() not in cash_tags:
                    cash_tags[word.upper()] = 1
                else:
                    cash_tags[word.upper()] += 1
    cash_tags = pd.Series(cash_tags, name='tickers')
    return cash_tags
def __init__(self, subreddit_name, limit):
    print("API parameters:", subreddit_name, limit)
    # Month-long (day, month, year) windows from Jan 2019 through Mar 2021.
    ranges = [(1, 1, 2019, 1, 2, 2019), (1, 2, 2019, 1, 3, 2019),
              (1, 3, 2019, 1, 4, 2019), (1, 4, 2019, 1, 5, 2019),
              (1, 5, 2019, 1, 6, 2019), (1, 6, 2019, 1, 7, 2019),
              (1, 7, 2019, 1, 8, 2019), (1, 8, 2019, 1, 9, 2019),
              (1, 9, 2019, 1, 10, 2019), (1, 10, 2019, 1, 11, 2019),
              (1, 11, 2019, 1, 12, 2019), (1, 12, 2019, 1, 1, 2020),
              (1, 1, 2020, 1, 2, 2020), (1, 2, 2020, 1, 3, 2020),
              (1, 3, 2020, 1, 4, 2020), (1, 4, 2020, 1, 5, 2020),
              (1, 5, 2020, 1, 6, 2020), (1, 6, 2020, 1, 7, 2020),
              (1, 7, 2020, 1, 8, 2020), (1, 8, 2020, 1, 9, 2020),
              (1, 9, 2020, 1, 10, 2020), (1, 10, 2020, 1, 11, 2020),
              (1, 11, 2020, 1, 12, 2020), (1, 12, 2020, 1, 1, 2021),
              (1, 1, 2021, 1, 2, 2021), (1, 2, 2021, 1, 3, 2021),
              (1, 3, 2021, 1, 4, 2021)]
    for d1, m1, y1, d2, m2, y2 in ranges:
        posted_after = int(datetime.datetime(y1, m1, d1).timestamp())
        posted_before = int(datetime.datetime(y2, m2, d2).timestamp())
        self.api = PushshiftAPI()
        self.comBatchNo = 0
        self.outputPath = './{0}/{1}/'.format(subreddit_name, posted_after)
        Path(self.outputPath).mkdir(parents=True, exist_ok=True)
        self.getComments(subreddit_name, None,
                         ['created_utc', 'score', 'selftext', 'title',
                          'upvote_ratio', 'body'],
                         posted_after, posted_before, limit)
def get_submission_detail(submission_id: str) -> dict:
    r = praw.Reddit(client_id=reddit_client_id,
                    client_secret=reddit_client_secret,
                    user_agent=reddit_user_agent)
    api = PushshiftAPI(r)
    # TODO: fetch the submission and return its details as a dict.
    pass
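# One possible completion of the stub above, sketched under the assumption
# that looking the ID up via Pushshift and returning a few fields is the
# intent; this is not the original author's implementation.
def get_submission_detail_sketch(submission_id: str) -> dict:
    r = praw.Reddit(client_id=reddit_client_id,
                    client_secret=reddit_client_secret,
                    user_agent=reddit_user_agent)
    api = PushshiftAPI(r)
    results = list(api.search_submissions(ids=[submission_id], limit=1))
    if not results:
        return {}
    s = results[0]
    return {'id': s.id, 'title': s.title, 'score': s.score,
            'num_comments': s.num_comments}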
def create_report_body(subreddit_name):
    reddit = praw.Reddit(
        username=USERNAME,
        password=PASSWORD,
        client_id=CLIENT_ID,
        client_secret=CLIENT_SECRET,
        user_agent=USER_AGENT
    )
    pushshift = PushshiftAPI(r=reddit)
    time_now = dt.datetime.now()
    time_one_week_ago = time_now - dt.timedelta(days=7)
    time_one_week_ago_epoch = int(time_one_week_ago.timestamp())
    submissions = get_submissions(subreddit_name, reddit, pushshift,
                                  time_one_week_ago_epoch)
    for submission in submissions:
        submission.data = submission.title
        submission.link = submission.shortlink
    submission_report = process_data(submissions, "Post Title", "Submission")
    comments = get_comments(subreddit_name, reddit, pushshift,
                            time_one_week_ago_epoch)
    for comment in comments:
        # Truncate each comment at the first newline or backtick,
        # capped at 150 characters.
        ind = next((i for i, ch in enumerate(comment.body)
                    if ch in {'\n', '`'}), len(comment.body))
        ind = min(ind, 150)
        comment.data = comment.body[:ind]
        if ind < len(comment.body) - 2:
            comment.data += " ...\\[trimmed\\]"
        comment.link = comment.permalink
    comment_report = process_data(comments, "Comment Link", "Comment")
    # (The report template string continues beyond this excerpt.)
    return f"""
def main(args):
    api = PushshiftAPI()
    folder = "Subreddit"
    Path(folder).mkdir(parents=True, exist_ok=True)
    if args.subreddit:
        subreddit = [x.strip() for x in args.subreddit.split(",")]
    else:
        logger.error("Use -s to set the subreddit")
        exit()
    for i in subreddit:
        try:
            df = fetch_posts(api, i)
            df["date_utc"] = pd.to_datetime(df["created_utc"], unit="s")
            df["date"] = pd.to_datetime(df["created"], unit="s")
            df["permalink"] = "https://old.reddit.com" + df["permalink"].astype(str)
            df = df[df.columns.intersection(COLUMNS)]
            filename = f"{folder}/posts_{i}_{int(time.time())}"
            if args.export_format == "xlsx":
                writer = pd.ExcelWriter(
                    f"{filename}.xlsx",
                    engine="xlsxwriter",
                    options={"strings_to_urls": False},
                )
                df.to_excel(writer, sheet_name="Sheet1")
                writer.save()
            else:
                df.to_csv(f"{filename}.csv", index=False, sep="\t")
        except Exception as e:
            logger.error("Complete error : %s", e)
    logger.info("Runtime : %.2f seconds" % (time.time() - temps_debut))
def main(argv):
    reddit = None
    try:
        # -u takes an argument; -h and -l are flags.
        opts, _ = getopt.getopt(argv, "u:hl", ["user=", "help", "log"])
    except getopt.GetoptError:
        print("see: scraper.py --help")
        sys.exit(2)
    for opt, val in opts:
        if opt in ("-l", "--log"):
            enablelogging()
        elif opt in ("-u", "--user"):
            reddit = praw.Reddit(val)
        elif opt in ("-h", "--help"):
            help_msg = """
            run with '-u'/'--user' and a valid praw agent argument from praw.ini
            run with '-l'/'--log' to enable logging of API calls
            """
            sys.exit(help_msg)
    if not reddit:
        sys.exit("Reddit instance could not be obtained!\n"
                 "See '--help' for more information")
    pushAPI = PushshiftAPI(reddit)
    datafolder = "submissions/"
    submission_ids = "submission_ids/"
    process_queries('queries.csv', pushAPI, submission_ids)
    for info_file in os.listdir(submission_ids):
        process_submissions(reddit, datafolder,
                            os.path.join(submission_ids, info_file))
def main(args):
    api = PushshiftAPI()
    folder = "Search"
    Path(folder).mkdir(parents=True, exist_ok=True)
    if not args.search_terms:
        logger.error("Use -s to set search terms")
        exit()
    df = fetch_comments(api, args.search_terms, args.subreddit)
    df["date_utc"] = pd.to_datetime(df["created_utc"], unit="s")
    df["date"] = pd.to_datetime(df["created"], unit="s")
    df["permalink"] = "https://old.reddit.com" + df["permalink"].astype(str)
    df = df[df.columns.intersection(COLUMNS)]
    filename = f"{folder}/comments_{int(time.time())}_{args.search_terms}"
    if args.export_format == "xlsx":
        writer = pd.ExcelWriter(
            f"{filename}.xlsx",
            engine="xlsxwriter",
            options={"strings_to_urls": False},
        )
        df.to_excel(writer, sheet_name="Sheet1")
        writer.save()
    else:
        df.to_csv(f"{filename}.csv", index=False, sep="\t")
    logger.info("Runtime : %.2f seconds" % (time.time() - temps_debut))
def get_subreddit_info(subreddit: str, date: dt.date = None) -> dict:
    '''Gets a list of all submissions for a given subreddit and date.'''
    # Resolve the default at call time; a dt.date.today() default in the
    # signature would be evaluated once at import time.
    if date is None:
        date = dt.date.today()
    r = praw.Reddit(client_id=reddit_client_id,
                    client_secret=reddit_client_secret,
                    user_agent=reddit_user_agent)
    api = PushshiftAPI(r)
    # One-day window ending at midnight of the requested date.
    end = dt.datetime.combine(date, dt.datetime.min.time())
    start = end - dt.timedelta(days=1)
    results = api.search_submissions(after=int(start.timestamp()),
                                     before=int(end.timestamp()),
                                     subreddit=subreddit,
                                     stickied=False,
                                     limit=500)
    # build json
    sub_info = {
        'subreddit': subreddit,
        'date': date.strftime('%Y-%m-%d'),
        'submissions': [],
    }
    for entry in results:
        record = {
            'id': entry.id,
            'score': entry.score,
            'title': entry.title,
            'author': (entry.author.name if entry.author is not None else None),
            'comment_count': entry.num_comments
        }
        sub_info['submissions'].append(record)
    sub_info['post_count'] = len(sub_info['submissions'])
    return sub_info
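# Hypothetical usage: persist one day's snapshot of get_subreddit_info to
# disk as JSON.
def demo_save_snapshot(subreddit='python'):
    import json
    info = get_subreddit_info(subreddit)
    with open(f"{subreddit}_{info['date']}.json", 'w') as f:
        json.dump(info, f, indent=2)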
def scrape_chunk(id_chunk, keep_columns_comm, subm_limit):
    time.sleep(3)
    com_df = pd.DataFrame({}, columns=keep_columns_comm)
    comments_all = []
    try:
        api = PushshiftAPI()
        comments_all = list(
            api.search_comments(link_id=','.join(id_chunk),
                                limit=subm_limit,
                                filter=keep_columns_comm))
        if len(comments_all) > 0:
            # Keep top-level comments and their direct replies.
            comments_level1 = [
                c.d_ for c in comments_all if c.parent_id[:3] == 't3_'
            ]
            comments_level2 = [
                c.d_ for c in comments_all
                if c.parent_id in ['t1_' + r['id'] for r in comments_level1]
            ]
            com_df = pd.DataFrame(comments_level1 + comments_level2)
    except StopIteration:
        SC_comm.error('StopIteration error')
    except RuntimeError:
        SC_comm.error('Runtime error')
    except timeout_decorator.timeout_decorator.TimeoutError:
        SC_comm.error('TimeoutError')
    except Exception:
        SC_comm.error('Other error')
    return com_df
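# Hypothetical driver for scrape_chunk: split submission IDs into chunks so
# each Pushshift link_id query stays a manageable size. The column list is
# an assumption; it must at least include 'id' and 'parent_id' for the
# level-1/level-2 filtering above to work.
def demo_scrape_all(ids, chunk_size=100):
    keep_columns_comm = ['id', 'parent_id', 'link_id', 'body', 'score']
    frames = [scrape_chunk(ids[i:i + chunk_size], keep_columns_comm,
                           subm_limit=10000)
              for i in range(0, len(ids), chunk_size)]
    return pd.concat(frames, ignore_index=True)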
def get_posts(subreddit, year, day, month, iteration, end_year, end_day, end_month):
    evn_path = '/disk/data/share/s1690903/pandemic_anxiety/evn/'
    evn = load_experiment(evn_path + 'experiment.yaml')
    reddit = praw.Reddit(client_id=evn['reddit_api_3']['client_id'],
                         client_secret=evn['reddit_api_3']['client_secret'],
                         user_agent=evn['reddit_api_3']['user_agent'],
                         username=evn['reddit_api_3']['username'],
                         password=evn['reddit_api_3']['password'])
    # Use Pushshift to fetch post IDs
    api = PushshiftAPI(reddit)
    count = 0
    while count < iteration:
        # Get post IDs
        c = CollectPostids(year, month, day, subreddit)
        c.save_postids()
        # Get posts, then continue with the next iteration
        cp = CollectPost()
        last_day = cp.collect_posts('postids/{}_postids.csv'.format(subreddit))
        print(last_day)
        year = int(last_day.split('/')[2])
        month = int(last_day.split('/')[0])
        day = int(last_day.split('/')[1])
        print(month, day)
        if (year == end_year and month == end_month and day == end_day):
            break
        count = count + 1
        time.sleep(30)
def generate_submissions_psaw(month_num, subreddit):
    """
    Gets submissions between start/end epochs for the requested subreddit

    Parameters
    ----------
    month_num: int
        The month number to be passed to epoch_generate()
        (the year is fixed at 2020)
    subreddit: string
        The name of the subreddit to be scraped

    Returns
    -------
    generator
        A generator object that will be used to loop through submissions
    """
    # init api
    api = PushshiftAPI()
    start_epoch, end_epoch = epoch_generate(month_num, 2020)
    return api.search_submissions(after=start_epoch,
                                  before=end_epoch,
                                  subreddit=subreddit,
                                  size=1000)
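# Hypothetical usage: the generator returned by generate_submissions_psaw
# is lazy, so a month of submissions can be streamed without holding them
# all in memory.
def demo_count_month(subreddit='AskReddit', month_num=4):
    gen = generate_submissions_psaw(month_num, subreddit)
    total = sum(1 for _ in gen)
    print(f'{subreddit}, month {month_num}: {total} submissions')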
def update_user(target_user, target_sub, r, flair_queue, perm_queue, sub_list):
    # Check existing data
    check_data = ProcessComment.check_user(target_user, target_sub)
    update_flair = True
    # Does the user's data need to be updated or inserted?
    user_in_accnt_info = check_data[2]
    user_in_sub_info = check_data[3]
    # Collect new data
    try:
        # PushShift instance
        ps = PushshiftAPI(r)
        DataCollector.load_data(user_in_accnt_info, user_in_sub_info,
                                update_flair, target_user, target_sub,
                                sub_list, ps)
    except Exception:
        # format_exc() returns the traceback string (print_exc() would
        # return None and log nothing useful).
        logging.warning("PM: User " + str(target_user) +
                        " was not able to have their data and flair updated"
                        "\nStacktrace: " + traceback.format_exc())
    # Update flair with new data
    prog_flair_enabled = target_sub.main_config.getboolean("progression tier")
    new_accnt_flair_enabled = target_sub.main_config.getboolean("young account tag")
    activity_flair_enabled = target_sub.main_config.getboolean("activity tag")
    FlairManager.update_flair(flair_queue, perm_queue, target_user, target_sub,
                              prog_flair_enabled, new_accnt_flair_enabled,
                              activity_flair_enabled)
    logging.debug("PM: User " + str(target_user) +
                  " has had their data and flair updated")
def __init__(self, subreddit, output_path):
    self.subreddit = subreddit
    self.output_path = output_path
    self.images_links = []
    self.api = PushshiftAPI()
    self.new_memes = self.api.search_submissions(subreddit=self.subreddit)
def get_elements(self):
    ps = PushshiftAPI()
    for user in self.data['users'].split(','):
        user = user.replace('/u/', '', 1).strip()
        _params = {'author': user}
        if self.data['limit']:
            _params['limit'] = self.data['limit']
        if self.data['scan_submissions']:
            for post in ps.search_submissions(**_params):
                p = RedditElement(post)
                if self.check_filters(p):
                    yield p
        if self.data['scan_comments']:
            for post in ps.search_comments(**_params):
                # Look up the parent submission for each comment.
                parents = list(ps.search_submissions(
                    ids=post.link_id.replace('t3_', '', 1), limit=1))
                if not len(parents):
                    print("PushShift Warning: Unable to locate parent Submission:",
                          post.link_id)
                    continue
                submission = parents[0]
                p = RedditElement(post, ext_submission_obj=submission)
                if self.check_filters(p):
                    yield p
def get_elements(self):
    url = self.data['url']
    submission = re.search(r'\/comments\/([a-zA-Z0-9]+)\/?', url)
    comment = re.search(r'\/comments\/.+?\/.+?\/([a-zA-Z0-9]+)\/?', url)
    ps = PushshiftAPI()
    if comment:
        for post in ps.search_comments(ids=[comment.group(1)]):
            parents = list(ps.search_submissions(
                ids=post.link_id.replace('t3_', '', 1), limit=1))
            if not len(parents):
                raise AssertionError(
                    "PushShift Warning: Unable to locate direct parent Submission:",
                    post.link_id)
            submission = parents[0]
            p = RedditElement(post, ext_submission_obj=submission)
            if self.check_filters(p):
                yield p
    elif submission:
        for post in ps.search_submissions(
                ids=[submission.group(1).replace('t3_', '', 1)], limit=1):
            p = RedditElement(post)
            if self.check_filters(p):
                yield p
    else:
        raise TypeError('Invalid Reddit URL provided! "%s"' % url)
def scrape_data(subreddit):
    print("cp1")
    api = PushshiftAPI()
    print("cp2")
    # Create list of scraped data
    scrape_list = list(api.search_submissions(
        subreddit=subreddit,
        filter=['title', 'subreddit', 'num_comments', 'author',
                'subreddit_subscribers', 'score', 'domain', 'created_utc'],
        limit=15000))
    print("cp3")
    clean_scrape_lst = []
    for i in range(len(scrape_list)):
        scrape_dict = {}
        scrape_dict['subreddit'] = scrape_list[i][5]  # Name of subreddit
        scrape_dict['author'] = scrape_list[i][0]
        scrape_dict['domain'] = scrape_list[i][2]  # Publishing house
        scrape_dict['title'] = scrape_list[i][7]
        scrape_dict['num_comments'] = scrape_list[i][3]
        scrape_dict['score'] = scrape_list[i][4]  # upvotes - downvotes
        scrape_dict['timestamp'] = scrape_list[i][1]  # time in epoch format
        clean_scrape_lst.append(scrape_dict)
    print("cp4")
    # Show number of subscribers
    print(subreddit, 'subscribers:', scrape_list[1][6])
    # Return list of scraped data
    return clean_scrape_lst
def compile(self, progress_cb=None):
    # def compile(self, compile_params):
    #     client_id = compile_params["client_id"]
    #     client_secret = compile_params["client_secret"]
    if self.compiled:
        return
    reddit = praw.Reddit(
        client_id=os.environ["CLIENT_ID"],
        client_secret=os.environ["CLIENT_SECRET"],
        user_agent="linux:org.reddit-nlp.reddit-nlp:v0.1.0 (by /u/YeetoCalrissian)",
    )
    api = PushshiftAPI(reddit)
    comments = []
    start_epoch = int(self.start_time.timestamp())
    end_epoch = int(self.end_time.timestamp())
    n = 0
    for subreddit in self.subreddits:
        print(subreddit, start_epoch, end_epoch)
        for comment in api.search_comments(after=start_epoch,
                                           before=end_epoch,
                                           subreddit=subreddit):
            print(comment.body)
            comments.append(comment)
            n += 1
            if progress_cb is not None:
                progress_cb(n)
    with open(self.comments_pickle_path, "wb") as pickle_file:
        pickle.dump(comments, pickle_file)
    self.compiled = True
    self.write()
def get_api_instance(src_path):
    '''
    Returns an instance of the psaw object initialized using praw

    parameters:
    --src_path: pathlib.Path object pointing to the src directory or the
      directory containing api_credentials.txt

    returns:
    --s_api: an instance of Pushshift's psaw API
    '''
    # Retrieve API credentials from a .gitignore'd text file.
    secrets_path = src_path / 'api_credentials.txt'
    with open(secrets_path, 'r') as secrets_txt:
        my_id = secrets_txt.readline().split('=')[1].rstrip()
        my_secret = secrets_txt.readline().split('=')[1].rstrip()
        my_agent = secrets_txt.readline().split('=')[1].rstrip()
    # Create the praw and pushshift instances.
    reddit = praw.Reddit(client_id=my_id,
                         client_secret=my_secret,
                         user_agent=my_agent)
    s_api = PushshiftAPI(reddit)
    return s_api
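# Hypothetical usage of get_api_instance: build the client from the
# credentials file under a local 'src' directory, then run a one-off query.
def demo_api_instance():
    from pathlib import Path
    s_api = get_api_instance(Path('src'))
    first = next(s_api.search_submissions(subreddit='python', limit=1))
    print(first.title)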