def test_submission_praw_mem_safe():
    api_praw = PushshiftAPI(file_checkpoint=1, praw=reddit)
    posts = api_praw.search_submissions(subreddit="science", limit=1000, mem_safe=True, before=1629990795)
    assert (len(posts) == 1000)
def test_asc_sort():
    with pytest.raises(NotImplementedError):
        api = PushshiftAPI()
        comments = api.search_comments(subreddit="science", limit=100, before=1629990795, sort='asc')
def test_comment_praw_mem_safe():
    api_praw = PushshiftAPI(file_checkpoint=1, praw=reddit)
    comments = api_praw.search_comments(subreddit="science", limit=1000, mem_safe=True, before=1629990795)
    assert (len(comments) == 1000)
def get_comments_from_wallstreetbets(before, after):
    """
    Returns a dataframe of comments for a given time horizon.

    :param before: pd.Timestamp()
    :param after: pd.Timestamp()
    :return: pd.DataFrame()
    """
    # Use a multiple of the available CPU count as the worker pool size
    max_threads = os.cpu_count() * 5

    # Scrape comments from r/wallstreetbets
    api = PushshiftAPI()
    subreddit = "wallstreetbets"
    comments = api.search_comments(
        # PMAW parameters
        mem_safe=True,
        num_workers=max_threads,
        # Pushshift.io parameters
        subreddit=subreddit,
        after=int(after.timestamp()),
        before=int(before.timestamp())
    )

    # Clean the dataframe of comments
    comments_df = pd.DataFrame(comments)
    if not comments_df.empty:
        comments_df = comments_df[['id', 'author', 'body', 'created_utc']].drop_duplicates()
        comments_df.created_utc = pd.to_datetime(comments_df.created_utc, unit='s')
        comments_df = comments_df[~comments_df.body.isin(['[removed]', '[deleted]'])]
        comments_df = comments_df.sort_values('created_utc').reset_index(drop=True)
    return comments_df
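# Usage sketch (not from the original source): a minimal example of calling the helper
# above with pd.Timestamp bounds. The dates and variable names here are illustrative
# assumptions, not values taken from the original code.
if __name__ == "__main__":
    after_ts = pd.Timestamp("2021-01-27", tz="UTC")
    before_ts = pd.Timestamp("2021-01-28", tz="UTC")
    wsb_comments = get_comments_from_wallstreetbets(before=before_ts, after=after_ts)
    print(wsb_comments.head())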
def test_comment_praw_query():
    api_praw = PushshiftAPI(file_checkpoint=1, praw=reddit)
    comments = api_praw.search_comments(q="quantum", subreddit="science", limit=100, before=1629990795)
    assert (len(comments) == 100)
def __init__(self, list_filters, parallelize, log) -> None:
    super().__init__()
    self._list_filters = list_filters
    self._parallelize = parallelize
    self._log = log
    self._dict_df_posts = {}
    self._api = PushshiftAPI()
def test_praw_ids_filter():
    def fxn(item):
        return item['ups'] > 2

    api_praw = PushshiftAPI(praw=reddit)
    comments = api_praw.search_comments(ids=comment_ids, filter_fn=fxn)
    assert (len(comments) == 4)
def test_submission_praw_query():
    api_praw = PushshiftAPI(file_checkpoint=1, praw=reddit)
    posts = api_praw.search_submissions(q="quantum", subreddit="science", limit=100, before=1629990795)
    assert (len(posts) == 100)
def test_submission_comment_id_exception():
    with pytest.raises(ValueError):
        api = PushshiftAPI()

        def fxn(item):
            return item['score'] > 2

        posts = api.search_submission_comment_ids(ids=post_ids, filter_fn=fxn)
def test_filter_key_exception():
    with pytest.raises(KeyError):
        api = PushshiftAPI()

        def fxn(item):
            return item['badkeydoesntexist'] > 2

        posts = api.search_submissions(ids=post_ids, filter_fn=fxn)
def test_filter_param_exception():
    with pytest.raises(TypeError):
        api = PushshiftAPI()

        def fxn():
            return True

        posts = api.search_submissions(ids=post_ids, filter_fn=fxn)
def test_search_ids_filter():
    api = PushshiftAPI()

    def fxn(item):
        return item['score'] > 2

    posts = api.search_submissions(ids=post_ids, filter_fn=fxn)
    assert (len(posts) == 2)
def __init__(self, start: dt.datetime, end: dt.datetime, subreddit: str):
    self.start = start
    self.end = end
    self.subreddit = subreddit
    # initializing PMAW wrapper
    self.api = PushshiftAPI()
    # slices the columns
    self.peadf = pd.DataFrame(columns=['created_utc', 'author', 'body'])
def __init__(self, subreddit: str = None, start: dt.datetime = None, path: str = os.getcwd(), limit: int = None):
    self.api = PushshiftAPI()
    self.subreddit = subreddit
    self.start = start
    self.path = path
    self.limit = limit
    self.fpath = os.path.join(self.path, self.subreddit)
    self.fpathComments = os.path.join(self.fpath, "comments")
    self.fpathPosts = os.path.join(self.fpath, "posts")
    self.dfComments_list = None
    self.dfPosts_list = None
def main():
    faces = get_commentfaces()
    api = PushshiftAPI()
    # start_epoch = int(dt.datetime(year=2018, month=7, day=6, tzinfo=dt.timezone.utc).timestamp())  # all cdfs
    start_epoch = int(
        dt.datetime(year=2022, month=1, day=7, tzinfo=dt.timezone.utc).timestamp())  # most recent cdf as of time of writing (2022/1/7)
    print('epoch', start_epoch)
    comments = get_all_comments_using_commentfaces(api, start_epoch, faces)
    commentators = get_commentators_by_commentface(faces, comments)
    cdfs = get_cdfs(api, start_epoch)
    cdf_commentators = get_cdf_commentators(api, start_epoch, cdfs)
    analysis_and_visualization(faces, commentators, cdf_commentators)
    commentators_by_commentfaces_csv(commentators, cdf_commentators, faces)
class pea():
    def __init__(self, start: dt.datetime, end: dt.datetime, subreddit: str):
        self.start = start
        self.end = end
        self.subreddit = subreddit
        # initializing PMAW wrapper
        self.api = PushshiftAPI()
        # slices the columns
        self.peadf = pd.DataFrame(columns=['created_utc', 'author', 'body'])

    def __str__(self, limit=None):
        return "start: {}, end: {}, subreddit: {}".format(
            self.start, self.end, self.subreddit)

    # get dataframe with time in utc, author, and body of text
    def getdf(self, limit=None):
        # prints out basic info
        print(self)
        # uses PMAW to gather the info from Pushshift
        comments = self.api.search_comments(subreddit=self.subreddit,
                                            rate_limit=30,
                                            limit=limit,
                                            before=int(self.end.timestamp()),
                                            after=int(self.start.timestamp()))
        print(f'Retrieved {len(comments)} comments from Pushshift')
        comments_df = pd.DataFrame(comments)
        comments_df = comments_df.loc[:, ['created_utc', 'author', 'body']]
        self.peadf = comments_df.sort_values(by=['created_utc']).set_index(
            ['created_utc'])
        return self.peadf

    # adds another column with tickers that are present in the body
    def analyze_df(self):
        self.peadf['tickers'] = self.peadf.apply(
            lambda row: analyze(row['body']), axis=1)
        return self.peadf
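# Usage sketch (not part of the original class): a minimal example, assuming dt is the
# datetime module and that an external analyze() helper exists for analyze_df(). The
# date range, limit, and subreddit below are illustrative assumptions.
scraper = pea(start=dt.datetime(2021, 1, 1), end=dt.datetime(2021, 1, 2), subreddit="wallstreetbets")
df = scraper.getdf(limit=500)   # pull up to 500 comments for the window
# df = scraper.analyze_df()     # requires the external analyze() function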
def test_submission_search_limit():
    api = PushshiftAPI(file_checkpoint=1)
    posts = api.search_submissions(subreddit="science", limit=100, before=1629990795)
    assert (len(posts) == 100)
class redditdb:
    def __init__(self, subreddit: str = None, start: dt.datetime = None, path: str = os.getcwd(), limit: int = None):
        self.api = PushshiftAPI()
        self.subreddit = subreddit
        self.start = start
        self.path = path
        self.limit = limit
        self.fpath = os.path.join(self.path, self.subreddit)
        self.fpathComments = os.path.join(self.fpath, "comments")
        self.fpathPosts = os.path.join(self.fpath, "posts")
        self.dfComments_list = None
        self.dfPosts_list = None

    # refresh the list of comment files already on disk
    def updateListComments(self):
        self.dfComments_list = [f for f in listdir(self.fpathComments) if isfile(join(self.fpathComments, f))]

    # refresh the list of post files already on disk
    def updateListPosts(self):
        self.dfPosts_list = [f for f in listdir(self.fpathPosts) if isfile(join(self.fpathPosts, f))]

    # updates comments and posts
    def updateAll(self, date: dt.datetime):
        if not os.path.isdir(self.fpath):
            os.makedirs(self.fpath)
        self.updateComments(date)
        self.updatePosts(date)

    # update the set of comment dataframes up to the given date
    def updateComments(self, date: dt.datetime):
        # makes the directory if it doesn't already exist
        if not os.path.isdir(self.fpathComments):
            os.makedirs(self.fpathComments)
        # starts downloading
        self.getallComments(date)

    def updatePosts(self, date: dt.datetime):
        # makes the directory if it doesn't already exist
        if not os.path.isdir(self.fpathPosts):
            os.makedirs(self.fpathPosts)
        # starts downloading
        self.getallPosts(date)

    # save all comment data in a time range
    def getallComments(self, end: dt.datetime):
        self.updateListComments()
        print("Retrieving comment data from {}: {} to {}".format(self.subreddit, self.start, end))
        dset = set(self.dfComments_list)
        for i in range(int((end - self.start).days) + 1):
            day = self.start + dt.timedelta(days=i)
            # check if the day is already accounted for; if not, download the comment dataframe
            if not '{}.csv'.format(day.date()) in dset:
                print('\033[32m{}\033[37m comments have not been downloaded for {}/comments. Downloading...'.format(day.date(), self.subreddit))
                self.savedayComments(day=day)
            else:
                print('\033[31m{}\033[37m comments already exist in {}/comments'.format(day.date(), self.subreddit))

    # save all post data in a time range
    def getallPosts(self, end: dt.datetime):
        self.updateListPosts()
        print("Retrieving post data from {}: {} to {}".format(self.subreddit, self.start, end))
        dset = set(self.dfPosts_list)
        for i in range(int((end - self.start).days) + 1):
            day = self.start + dt.timedelta(days=i)
            # check if the day is already accounted for; if not, download the post dataframe
            if not '{}.csv'.format(day.date()) in dset:
                print('\033[32m{}\033[37m posts have not been downloaded in {}/posts. Downloading...'.format(day.date(), self.subreddit))
                self.savedayPosts(day=day)
            else:
                print('\033[31m{}\033[37m posts already exist in {}/posts'.format(day.date(), self.subreddit))

    # helper function to save comment data by day
    def savedayComments(self, day: dt.datetime):
        # gets comments from Pushshift using the PMAW wrapper
        comments = self.api.search_comments(subreddit=self.subreddit,
                                            rate_limit=20,
                                            limit=self.limit,
                                            before=int((day + dt.timedelta(days=1)).timestamp()),
                                            after=int(day.timestamp()))
        print(f'Retrieved {len(comments)} comments from Pushshift')
        # converts into a dataframe with utc as index
        comments_df = pd.DataFrame(comments)
        # extra check: only save non-empty days
        if not comments_df.empty:
            comments_df = comments_df.sort_values(by=['created_utc']).set_index(['created_utc'])
            # calls download function
            self.downloadComments(comments_df, day)

    # helper function to save post data by day
    def savedayPosts(self, day: dt.datetime):
        # gets posts from Pushshift using the PMAW wrapper
        posts = self.api.search_submissions(subreddit=self.subreddit,
                                            rate_limit=20,
                                            limit=self.limit,
                                            before=int((day + dt.timedelta(days=1)).timestamp()),
                                            after=int(day.timestamp()))
        print(f'Retrieved {len(posts)} posts from Pushshift')
        # converts into a dataframe with utc as index
        posts_df = pd.DataFrame(posts)
        # extra check: only save non-empty days
        if not posts_df.empty:
            posts_df = posts_df.sort_values(by=['created_utc']).set_index(['created_utc'])
            # calls download function
            self.downloadPosts(posts_df, day)

    # save the comment dataframe to path/subreddit/comments/<date>.csv
    def downloadComments(self, df: pd.DataFrame, day: dt.datetime):
        # checks whether the folder for the subreddit already exists
        if not os.path.isdir(self.fpathComments):
            os.makedirs(self.fpathComments)
        # names the file after the current day
        fname = '{}.csv'.format(day.date())
        # save the file
        df.to_csv(os.path.join(self.fpathComments, fname))

    # save the post dataframe to path/subreddit/posts/<date>.csv
    def downloadPosts(self, df: pd.DataFrame, day: dt.datetime):
        # checks whether the folder for the subreddit already exists
        if not os.path.isdir(self.fpathPosts):
            os.makedirs(self.fpathPosts)
        # names the file after the current day
        fname = '{}.csv'.format(day.date())
        # save the file
        df.to_csv(os.path.join(self.fpathPosts, fname))

    # load a single day's comments into memory
    def loadDayComments(self, day: dt.datetime):
        # check if the folder exists
        if not os.path.isdir(self.fpathComments):
            print('The folder for {}/comments does not exist'.format(self.subreddit))
            return pd.DataFrame()
        else:
            # get the name of the file for the date
            fname = '{}.csv'.format(day.date())
            loadpath = os.path.join(self.fpathComments, fname)
            # check for the file
            if os.path.exists(loadpath):
                return pd.read_csv(loadpath)
            else:
                print("\033[31m{}\033[37m does not exist in {}/comments".format(day.date(), self.subreddit))
                return pd.DataFrame()

    # load a single day's posts into memory
    def loadDayPosts(self, day: dt.datetime):
        # check if the folder exists
        if not os.path.isdir(self.fpathPosts):
            print('The folder for {}/posts does not exist'.format(self.subreddit))
            return pd.DataFrame()
        else:
            # get the name of the file for the date
            fname = '{}.csv'.format(day.date())
            loadpath = os.path.join(self.fpathPosts, fname)
            # check for the file
            if os.path.exists(loadpath):
                return pd.read_csv(loadpath)
            else:
                print("\033[31m{}\033[37m does not exist in {}/posts".format(day.date(), self.subreddit))
                return pd.DataFrame()

    def loadRangeAll(self, start: dt.datetime, end: dt.datetime):
        return self.loadRangeComments(start, end), self.loadRangePosts(start, end)

    # return the combined comment dataframe for a date range
    def loadRangeComments(self, start: dt.datetime, end: dt.datetime):
        dfout = pd.DataFrame()
        for i in range(int((end - start).days) + 1):
            day = start + dt.timedelta(days=i)
            df = self.loadDayComments(day)
            if not df.empty:
                dfout = pd.concat([dfout, df])  # DataFrame.append was removed in pandas 2.0
        return dfout

    # return the combined post dataframe for a date range
    def loadRangePosts(self, start: dt.datetime, end: dt.datetime):
        dfout = pd.DataFrame()
        for i in range(int((end - start).days) + 1):
            day = start + dt.timedelta(days=i)
            df = self.loadDayPosts(day)
            if not df.empty:
                dfout = pd.concat([dfout, df])  # DataFrame.append was removed in pandas 2.0
        return dfout
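# Usage sketch (not from the original source): a minimal example of the redditdb
# workflow above, assuming dt is the datetime module. The subreddit, limit, and dates
# are illustrative; updateAll() writes one CSV per day, loadRangeComments() reloads them.
db = redditdb(subreddit="wallstreetbets", start=dt.datetime(2021, 1, 1), limit=1000)
db.updateAll(dt.datetime(2021, 1, 7))
comments = db.loadRangeComments(dt.datetime(2021, 1, 1), dt.datetime(2021, 1, 7))
print(len(comments))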
def test_comment_praw_ids():
    api_praw = PushshiftAPI(file_checkpoint=1, praw=reddit)
    comments = api_praw.search_comments(ids=comment_ids)
    assert (len(comments) == len(comment_ids))
class QueryRedditPostsV2(QueryPostsInterface):
    def __init__(self, list_filters, parallelize, log) -> None:
        super().__init__()
        self._list_filters = list_filters
        self._parallelize = parallelize
        self._log = log
        self._dict_df_posts = {}
        self._api = PushshiftAPI()

    @property
    def dict_df_posts(self):
        return self._dict_df_posts

    def set_dict_df_posts(self, key, df) -> None:
        if len(df) > 0:
            now = dt.datetime.now()
            dt_string = now.strftime("%d_%m_%Y_%H_%M_%S")
            k = 'reddit_pmaw_' + key + '_' + dt_string
            self._dict_df_posts[k] = df

    def query(self, reddit_filter, subreddit) -> pd.DataFrame:
        df_posts = pd.DataFrame()
        try:
            posts = self._api.search_submissions(subreddit=subreddit,
                                                 limit=reddit_filter.items,
                                                 **reddit_filter.query_params)
            # create the dataframe
            df_posts = pd.DataFrame(posts)
            # standardize the name of the text column
            df_posts.rename(columns={'selftext': 'text'}, inplace=True)  # change 'selftext' to 'text' for preprocessing
            # format the date
            df_posts['created_utc'] = df_posts['created_utc'].apply(dt.datetime.fromtimestamp)
            # create a column with the value from the 'label' filter parameter
            if reddit_filter.label is not None:
                df_posts['label'] = reddit_filter.label
        except Exception:
            self._log.exception('Failed to query Reddit posts.')
        # return the dataframe
        return df_posts

    def query_par(self, reddit_filter, queue, subreddit) -> None:
        # call the query function to query posts and create a dataframe
        df_posts = self.query(reddit_filter, subreddit)
        # put the pandas dataframe in the queue
        queue.put(df_posts)

    def query_manager(self) -> None:
        self._log.timer_message('Collecting Reddit data with the pmaw package.')
        # select only the Reddit filters
        list_reddit_filters = list(
            filter(lambda x: (x.key == 'Reddit' and x.library == 'pmaw'), self._list_filters))
        # both methods perform the same task using a parallel or sequential strategy
        if self._parallelize:
            # query posts in parallel
            self.query_parallel(list_reddit_filters)
        else:
            # query posts sequentially
            self.query_sequential(list_reddit_filters)

    def query_sequential(self, list_filters) -> None:
        start_time_seq = time.time()
        # separate filters by type
        search_filters = list(
            filter(lambda x: (x.filter_type == 'search'), list_filters))
        # for each subreddit from each filter, create a query of posts
        # and concatenate all dataframes of post information
        df_search_posts = pd.DataFrame()
        for sf in search_filters:
            for subreddit in sf.subreddits:
                df_posts = self.query(sf, subreddit)
                df_search_posts = pd.concat([df_search_posts, df_posts])
        self.set_dict_df_posts('search_posts', df_search_posts)
        self._log.user_message('Reddit posts\' query finished.')
        final_time_seq = time.time() - start_time_seq
        self._log.timer_message('Sequential Query Time: ' + str(final_time_seq) + ' seconds.')

    def query_parallel(self, list_filters) -> None:
        start_time_par = time.time()
        # separate filters by type
        search_filters = list(
            filter(lambda x: (x.filter_type == 'search'), list_filters))
        # configure the queue
        queue_search = Queue()
        # for each subreddit from each filter, create a process that queries posts
        processes_search = []
        for sf in search_filters:
            processes_search.extend([
                Process(target=self.query_par, args=(sf, queue_search, sub))
                for sub in sf.subreddits
            ])
        # start the processes
        for p in processes_search:
            p.start()
        # concatenate all dataframes of search information
        df_search_posts = pd.DataFrame()
        for _ in processes_search:
            df_process_posts = queue_search.get()
            df_search_posts = pd.concat([df_search_posts, df_process_posts])
        self.set_dict_df_posts('search_posts', df_search_posts)
        self._log.user_message('Reddit posts\' query finished.')
        # wait for the processes to finish
        for p in processes_search:
            p.join()
        final_time_par = time.time() - start_time_par
        self._log.timer_message('Parallelized Query Time: ' + str(final_time_par) + ' seconds.')
import requests
import datetime as dt
from pmaw import PushshiftAPI
import pandas as pd

api = PushshiftAPI()

# before = int(dt.datetime(2021, 1, 1).timestamp())
# after = int(dt.datetime(2020, 12, 1).timestamp())
before = int(dt.datetime(2021, 2, 1).timestamp())
after = int(dt.datetime(2021, 1, 1).timestamp())
subreddit = "wallstreetbets"
limit = 100

comments = api.search_comments(subreddit=subreddit, limit=limit, before=before, after=after)
print(f'Retrieved {len(comments)} comments from Pushshift')

# preview the comments data
comments_df = pd.DataFrame(comments)
print(comments_df.index)
print(comments_df.head(10).loc[:, ['created_utc', 'author', 'body']])
# comments_df.to_csv('./wsb_comments.csv', header=True, index=False, columns=list(comments_df.axes[1]))
def test_safe_exit_praw():
    with pytest.raises(NotImplementedError):
        api_praw = PushshiftAPI(praw=reddit)
        comments = api_praw.search_comments(ids=comment_ids, safe_exit=True)
def test_submission_comment_ids_praw_mem_safe():
    api_praw = PushshiftAPI(file_checkpoint=1, praw=reddit)
    comments = api_praw.search_submission_comment_ids(ids=post_ids, mem_safe=True)
    assert (len(comments) == 66)
def redditconnect_PMAW():
    # PMAW object: size the worker pool as a multiple of the available CPU count
    temp_threadcount = os.cpu_count()
    api = PushshiftAPI(num_workers=temp_threadcount * 5)
    return api
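# Usage sketch (assumption, not in the original snippet): the handle returned by
# redditconnect_PMAW() is a regular PushshiftAPI instance; the subreddit, limit, and
# timestamp below are illustrative values reused from other snippets in this section.
api = redditconnect_PMAW()
posts = api.search_submissions(subreddit="science", limit=100, before=1629990795)
print(f'Retrieved {len(posts)} submissions from Pushshift')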
def test_submission_comment_ids_search():
    api = PushshiftAPI(file_checkpoint=1)
    comments = api.search_submission_comment_ids(ids=post_ids)
    assert (len(comments) == 66)
def test_response_load_cache():
    api = PushshiftAPI(file_checkpoint=1)
    comments = api.search_submission_comment_ids(ids=post_ids, mem_safe=True)
    resp = Response.load_cache(key=comments._cache.key)
    assert (len(comments) == len(resp) and len(comments) == 66)
def test_response_generator():
    api = PushshiftAPI(file_checkpoint=1)
    comments = api.search_submission_comment_ids(ids=post_ids, mem_safe=True)
    all_c = [c for c in comments]
    assert (len(all_c) == 66)
# Required package:
# pip install pmaw
import datetime as dt
import pandas as pd
import numpy as np
from tqdm import tqdm
from pmaw import PushshiftAPI

# Create time filter
before = int(dt.datetime(2021, 1, 12, 0, 0).timestamp())  # timestamp in seconds

# Create subreddit and parameters
subreddit = "wallstreetbets"
size = 500  # maximum submission size = 500
api = PushshiftAPI()
limit = 1000000
test_list_data = []
total_results_df = []

# ------------------------------------PARAMETERS EDIT------------------------------------#
min_score = '>40'  # '>' here behaves as >=; submission score 40, comment score 10
time_range = 365 * 2
download_per_loop = 10  # days

IS_SUBMISSION = True

# Download in smaller steps to avoid incomplete results
# (too many results could be lost when Pushshift shards are down)
for i in tqdm(range(int(time_range / download_per_loop) + 2)):
    after = before - 60 * 60 * 24 * download_per_loop  # step the window back by download_per_loop days
    if IS_SUBMISSION:
def test_submission_praw_ids():
    api_praw = PushshiftAPI(file_checkpoint=1, praw=reddit)
    posts = api_praw.search_submissions(ids=post_ids)
    assert (len(posts) == len(post_ids))
def test_submission_search_ids():
    api = PushshiftAPI(file_checkpoint=1)
    posts = api.search_submissions(ids=post_ids)
    assert (len(posts) == len(post_ids))