Example #1
def test_submission_praw_mem_safe():
    api_praw = PushshiftAPI(file_checkpoint=1, praw=reddit)
    posts = api_praw.search_submissions(subreddit="science",
                                        limit=1000,
                                        mem_safe=True,
                                        before=1629990795)
    assert (len(posts) == 1000)
Example #2
def test_asc_sort():
    with pytest.raises(NotImplementedError):
        api = PushshiftAPI()
        comments = api.search_comments(subreddit="science",
                                       limit=100,
                                       before=1629990795,
                                       sort='asc')
Example #3
def test_comment_praw_mem_safe():
    api_praw = PushshiftAPI(file_checkpoint=1, praw=reddit)
    comments = api_praw.search_comments(subreddit="science",
                                        limit=1000,
                                        mem_safe=True,
                                        before=1629990795)
    assert (len(comments) == 1000)
Example #4
def get_comments_from_wallstreetbets(before, after):
    """
    Returns a dataframe of comments within the given time horizon.
    :param before: pd.Timestamp()
    :param after: pd.Timestamp()
    :return: pd.DataFrame()
    """

    # Size the worker pool from the available CPU count (5 workers per core)
    max_threads = os.cpu_count() * 5

    # Scrape comments from r/wallstreetbets
    api = PushshiftAPI()
    subreddit = "wallstreetbets"
    comments = api.search_comments(
        # PMAW parameters
        mem_safe=True,
        num_workers=max_threads,
        # Pushshift.io parameters
        subreddit=subreddit,
        after=int(after.timestamp()),
        before=int(before.timestamp())
    )

    # Clean dataframe with comments
    comments_df = pd.DataFrame(comments)
    if not comments_df.empty:
        comments_df = comments_df[['id', 'author', 'body', 'created_utc']].drop_duplicates()
        comments_df.created_utc = pd.to_datetime(comments_df.created_utc, unit='s')
        comments_df = comments_df[~comments_df.body.isin(['[removed]', '[deleted]'])]
        comments_df = comments_df.sort_values('created_utc').reset_index(drop=True)

    return comments_df
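A minimal usage sketch for the function above (illustrative only; the timestamps and variable names are assumptions, not part of the source):

import pandas as pd

# Hypothetical call: fetch one day of r/wallstreetbets comments.
after_ts = pd.Timestamp("2021-01-01", tz="UTC")
before_ts = pd.Timestamp("2021-01-02", tz="UTC")
wsb_comments = get_comments_from_wallstreetbets(before=before_ts, after=after_ts)
print(wsb_comments.head())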
Example #5
def test_comment_praw_query():
    api_praw = PushshiftAPI(file_checkpoint=1, praw=reddit)
    comments = api_praw.search_comments(q="quantum",
                                        subreddit="science",
                                        limit=100,
                                        before=1629990795)
    assert (len(comments) == 100)
Example #6
    def __init__(self, list_filters, parallelize, log) -> None:
        super().__init__()
        self._list_filters = list_filters
        self._parallelize = parallelize
        self._log = log
        self._dict_df_posts = {}
        self._api = PushshiftAPI()
Example #7
def test_praw_ids_filter():
    def fxn(item):
        return item['ups'] > 2

    api_praw = PushshiftAPI(praw=reddit)
    comments = api_praw.search_comments(ids=comment_ids, filter_fn=fxn)
    assert (len(comments) == 4)
Example #8
def test_submission_praw_query():
    api_praw = PushshiftAPI(file_checkpoint=1, praw=reddit)
    posts = api_praw.search_submissions(q="quantum",
                                        subreddit="science",
                                        limit=100,
                                        before=1629990795)
    assert (len(posts) == 100)
Example #9
def test_submission_comment_id_exception():
    with pytest.raises(ValueError):
        api = PushshiftAPI()

        def fxn(item):
            return item['score'] > 2

        posts = api.search_submission_comment_ids(ids=post_ids, filter_fn=fxn)
Example #10
def test_filter_key_exception():
    with pytest.raises(KeyError):
        api = PushshiftAPI()

        def fxn(item):
            return item['badkeydoesntexist'] > 2

        posts = api.search_submissions(ids=post_ids, filter_fn=fxn)
Example #11
def test_filter_param_exception():
    with pytest.raises(TypeError):
        api = PushshiftAPI()

        def fxn():
            return True

        posts = api.search_submissions(ids=post_ids, filter_fn=fxn)
Example #12
def test_search_ids_filter():
    api = PushshiftAPI()

    def fxn(item):
        return item['score'] > 2

    posts = api.search_submissions(ids=post_ids, filter_fn=fxn)
    assert (len(posts) == 2)
Example #13
File: pea.py  Project: quantsnd/reddit
    def __init__(self, start: dt.datetime, end: dt.datetime, subreddit: str):
        self.start = start
        self.end = end
        self.subreddit = subreddit

        # initializing PMAW wrapper
        self.api = PushshiftAPI()
        # slices the columns
        self.peadf = pd.DataFrame(columns=['created_utc', 'author', 'body'])
Example #14
File: rdb.py  Project: quantsnd/reddit
    def __init__(self, subreddit: str = None, start: dt.datetime = None, path: str = os.getcwd(), limit: int = None):
        self.api = PushshiftAPI() 
        self.subreddit = subreddit
        self.start = start
        self.path = path
        self.limit = limit
        self.fpath = os.path.join(self.path, self.subreddit)
        self.fpathComments = os.path.join(self.fpath, "comments")
        self.fpathPosts = os.path.join(self.fpath, "posts")

        self.dfComments_list = None
        self.dfPosts_list = None
Example #15
def main():
    faces = get_commentfaces()

    api = PushshiftAPI()

    # start_epoch = int(dt.datetime(year=2018, month=7, day=6, tzinfo=dt.timezone.utc).timestamp()) # all cdfs
    start_epoch = int(
        dt.datetime(year=2022, month=1, day=7, tzinfo=dt.timezone.utc).
        timestamp())  # most recent cdf as of time of writing (2022/1/7)
    print('epoch', start_epoch)

    comments = get_all_comments_using_commentfaces(api, start_epoch, faces)
    commentators = get_commentators_by_commentface(faces, comments)

    cdfs = get_cdfs(api, start_epoch)
    cdf_commentators = get_cdf_commentators(api, start_epoch, cdfs)

    analysis_and_visualization(faces, commentators, cdf_commentators)
    commentators_by_commentfaces_csv(commentators, cdf_commentators, faces)
Example #16
File: pea.py  Project: quantsnd/reddit
class pea():
    def __init__(self, start: dt.datetime, end: dt.datetime, subreddit: str):
        self.start = start
        self.end = end
        self.subreddit = subreddit

        # initializing PMAW wrapper
        self.api = PushshiftAPI()
        # slices the columns
        self.peadf = pd.DataFrame(columns=['created_utc', 'author', 'body'])

    def __str__(self, limit=None):
        return "start: {}, end: {}, subreddit: {}".format(
            self.start, self.end, self.subreddit)

    # get dataframe with time in utc, author, and body of text
    def getdf(self, limit=None):
        # prints out basic info
        print(self)

        # uses PMAW to gather the info from Pushshift
        comments = self.api.search_comments(subreddit=self.subreddit,
                                            rate_limit=30,
                                            limit=limit,
                                            before=int(self.end.timestamp()),
                                            after=int(self.start.timestamp()))
        print(f'Retrieved {len(comments)} comments from Pushshift')
        comments_df = pd.DataFrame(comments)
        comments_df = comments_df.loc[:, ['created_utc', 'author', 'body']]
        self.peadf = comments_df.sort_values(by=['created_utc']).set_index(
            ['created_utc'])

        return self.peadf

    # adds another column with tickers that are present in the body
    def analyze_df(self):
        self.peadf['tickers'] = self.peadf.apply(
            lambda row: analyze(row['body']), axis=1)
        return self.peadf
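A minimal usage sketch for the pea class above, assuming the file's own imports (datetime as dt, pandas as pd, pmaw) and that the analyze() helper referenced by analyze_df() is available; the dates, subreddit, and limit are illustrative:

# Hypothetical usage: collect one day of r/wallstreetbets comments and tag tickers.
start = dt.datetime(2021, 1, 1, tzinfo=dt.timezone.utc)
end = dt.datetime(2021, 1, 2, tzinfo=dt.timezone.utc)
p = pea(start=start, end=end, subreddit="wallstreetbets")
comments_df = p.getdf(limit=500)   # pulls comments via PMAW and indexes by created_utc
tagged_df = p.analyze_df()         # adds a 'tickers' column using analyze()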
Example #17
def test_submission_search_limit():
    api = PushshiftAPI(file_checkpoint=1)
    posts = api.search_submissions(subreddit="science",
                                   limit=100,
                                   before=1629990795)
    assert (len(posts) == 100)
Example #18
File: rdb.py  Project: quantsnd/reddit
class redditdb:
    def __init__(self, subreddit: str = None, start: dt.datetime = None, path: str = os.getcwd(), limit: int = None):
        self.api = PushshiftAPI() 
        self.subreddit = subreddit
        self.start = start
        self.path = path
        self.limit = limit
        self.fpath = os.path.join(self.path, self.subreddit)
        self.fpathComments = os.path.join(self.fpath, "comments")
        self.fpathPosts = os.path.join(self.fpath, "posts")

        self.dfComments_list = None
        self.dfPosts_list = None

    # get list of dataframes
    def updateListComments(self):
        self.dfComments_list = [f for f in listdir(self.fpathComments) if isfile(join(self.fpathComments, f))]

    def updateListPosts(self):
        self.dfPosts_list = [f for f in listdir(self.fpathPosts) if isfile(join(self.fpathPosts, f))]

    # updates comments and posts
    def updateAll(self, date: dt.datetime):
        if not os.path.isdir(self.fpath):
            os.makedirs(self.fpath)

        self.updateComments(date)
        self.updatePosts(date)

    # update set of comment dataframes to yesterday
    def updateComments(self, date: dt.datetime):
        # makes the directory if it doesn't already exist
        if not os.path.isdir(self.fpathComments):
            os.makedirs(self.fpathComments)
        # starts downloading
        self.getallComments(date)
    
    def updatePosts(self, date: dt.datetime):
        # makes the directory if it doesn't already exist
        if not os.path.isdir(self.fpathPosts):
            os.makedirs(self.fpathPosts)
        # starts downloading
        self.getallPosts(date)
        
    # save all data in a time range
    def getallComments(self, end: dt.datetime):
        self.updateListComments()
        print("Retrieving comment data from {}: {} to {}".format(self.subreddit, self.start, end))

        dset = set(self.dfComments_list)

        for i in range( int( (end-self.start).days ) + 1):
            day = self.start + dt.timedelta(days = i)

            # check if day is already accounted for, if not download the comment dataframe
            if not '{}.csv'.format(day.date()) in dset:
                print('\033[32m{}\033[37m comments has not been downloaded for {}/comments. Downloading...'. format(day.date(), self.subreddit))
                self.savedayComments(day = day)
            else:
                print('\033[31m{}\033[37m comments already exists in {}/comments'. format(day.date(), self.subreddit))

    def getallPosts(self, end: dt.datetime):
        self.updateListPosts()
        print("Retrieving post data from {}: {} to {}".format(self.subreddit, self.start, end))

        dset = set(self.dfPosts_list)

        for i in range( int( (end-self.start).days ) + 1):
            day = self.start + dt.timedelta(days = i)

            # check if day is already accounted for, if not download the post dataframe
            if not '{}.csv'.format(day.date()) in dset:
                print('\033[32m{}\033[37m posts has not been downloaded in {}/posts. Downloading...'. format(day.date(), self.subreddit))
                self.savedayPosts(day = day)
            else:
                print('\033[31m{}\033[37m posts already exists in {}/posts'. format(day.date(), self.subreddit))

    
    # helper function to save data by day
    def savedayComments(self, day: dt.datetime):
        # gets comments from PushShift using PMAW wrapper
        comments = self.api.search_comments(subreddit=self.subreddit, rate_limit = 20, limit=self.limit, before=int((day+dt.timedelta(days=1)).timestamp()), after=int(day.timestamp()))
        print(f'Retrieved {len(comments)} comments from Pushshift')

        # converts into a dataframe with utc as index
        comments_df = pd.DataFrame(comments)
        # extra check
        if not comments_df.empty:
            comments_df = comments_df.sort_values(by=['created_utc']).set_index(['created_utc'])
        
        # calls download function
        self.downloadComments(comments_df, day)
    
    # helper function to save data by day
    def savedayPosts(self, day: dt.datetime):
        # gets posts from PushShift using PMAW wrapper
        posts = self.api.search_submissions(subreddit=self.subreddit, rate_limit = 20, limit=self.limit, before=int((day+dt.timedelta(days=1)).timestamp()), after=int(day.timestamp()))
        print(f'Retrieved {len(posts)} posts from Pushshift')

        # converts into a dataframe with utc as index
        posts_df = pd.DataFrame(posts)
        # extra check
        if not posts_df.empty:
            posts_df = posts_df.sort_values(by=['created_utc']).set_index(['created_utc'])
        
        # calls download function
        self.downloadPosts(posts_df, day)


    # download the info into a dataframe
    # path/subreddit/date
    def downloadComments(self, df: pd.DataFrame, day: dt.datetime):
        # checks to see if a folder with the name of the subreddit already exists
        if not os.path.isdir(self.fpathComments):
            os.makedirs(self.fpathComments)
        
        # names the file the current day
        fname = '{}.csv'.format(day.date())

        #save the file
        df.to_csv(os.path.join(self.fpathComments, fname))
        
     # download the info into a dataframe
    # path/subreddit/date
    def downloadPosts(self, df: pd.DataFrame, day: dt.datetime):
        # checks to see if a folder with the name of the subreddit already exists
        if not os.path.isdir(self.fpathPosts):
            os.makedirs(self.fpathPosts)
        
        # names the file the current day
        fname = '{}.csv'.format(day.date())

        #save the file
        df.to_csv(os.path.join(self.fpathPosts, fname))

    # load dataframe into memory
    def loadDayComments(self, day: dt.datetime):
        # check if folder exists
 
        if not os.path.isdir(self.fpathComments):
            print('The folder for {}/comments does not exist'.format(self.subreddit))
            return pd.DataFrame()
        else:
            # get name of file for the date
            fname = '{}.csv'.format(day.date())
            loadpath = os.path.join(self.fpathComments, fname)
            # check for file
            if os.path.exists(loadpath):
                return pd.read_csv(loadpath) 
            else:
                print("\033[31m{}\033[37m does not exist in {}/posts".format(day.date(), self.subreddit))
                return pd.DataFrame()
    
    # load dataframe into memory
    def loadDayPosts(self, day: dt.datetime):
        # check if folder exists
 
        if not os.path.isdir(self.fpathPosts):
            print('The folder for {}/posts does not exist'.format(self.subreddit))
            return pd.DataFrame()
        else:
            # get name of file for the date
            fname = '{}.csv'.format(day.date())
            loadpath = os.path.join(self.fpathPosts, fname)
            # check for file
            if os.path.exists(loadpath):
                return pd.read_csv(loadpath) 
            else:
                print("\033[31m{}\033[37m does not exist in {}/posts".format(day.date(), self.subreddit))
                return pd.DataFrame()


    def loadRangeAll(self, start: dt.datetime, end: dt.datetime):
        return self.loadRangeComments(start, end), self.loadRangePosts(start, end)

    # return the combined dataframe
    def loadRangeComments(self, start: dt.datetime, end: dt.datetime):
        dfout = pd.DataFrame()
        for i in range( int( (end-start).days ) + 1):
            day = start + dt.timedelta(days = i)
            df = self.loadDayComments(day)
            
            if not df.empty:
                dfout = pd.concat([dfout, df])
        
        return dfout

    # return the combined dataframe
    def loadRangePosts(self, start: dt.datetime, end: dt.datetime):
        dfout = pd.DataFrame()
        for i in range( int( (end-start).days ) + 1):
            day = start + dt.timedelta(days = i)
            df = self.loadDayPosts(day)
            
            if not df.empty:
                dfout = pd.concat([dfout, df])
        
        return dfout
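A minimal usage sketch for the redditdb class above, assuming its own imports (os, datetime as dt, pandas as pd, listdir/isfile/join); the subreddit, dates, and limit are illustrative:

# Hypothetical usage: mirror a few days of r/wallstreetbets to per-day CSVs, then reload them.
db = redditdb(subreddit="wallstreetbets",
              start=dt.datetime(2021, 1, 1, tzinfo=dt.timezone.utc),
              limit=1000)
end = dt.datetime(2021, 1, 3, tzinfo=dt.timezone.utc)
db.updateAll(end)                                        # downloads comments and posts day by day
comments_df, posts_df = db.loadRangeAll(db.start, end)   # concatenates the saved daily CSVs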
Example #19
def test_comment_praw_ids():
    api_praw = PushshiftAPI(file_checkpoint=1, praw=reddit)
    comments = api_praw.search_comments(ids=comment_ids)
    assert (len(comments) == len(comment_ids))
Example #20
class QueryRedditPostsV2(QueryPostsInterface):
    def __init__(self, list_filters, parallelize, log) -> None:
        super().__init__()
        self._list_filters = list_filters
        self._parallelize = parallelize
        self._log = log
        self._dict_df_posts = {}
        self._api = PushshiftAPI()

    @property
    def dict_df_posts(self):
        return self._dict_df_posts

    def set_dict_df_posts(self, key, df) -> None:
        if (len(df) > 0):
            now = dt.datetime.now()
            dt_string = now.strftime("%d_%m_%Y_%H_%M_%S")
            k = 'reddit_pmaw_' + key + '_' + dt_string
            self._dict_df_posts[k] = df

    def query(self, reddit_filter, subreddit) -> pd.DataFrame:
        df_posts = pd.DataFrame()

        try:
            posts = self._api.search_submissions(subreddit=subreddit,
                                                 limit=reddit_filter.items,
                                                 **reddit_filter.query_params)

            # create the dataframe
            df_posts = pd.DataFrame(posts)

            # standardize the name of the text column
            df_posts.rename(
                columns={'selftext': 'text'},
                inplace=True)  # change 'selftext' to 'text' for preprocessing

            # format the date
            df_posts['created_utc'] = df_posts['created_utc'].apply(
                dt.datetime.fromtimestamp)

            # create a column with the value from the 'label' filter parameter
            if (reddit_filter.label is not None):
                df_posts['label'] = reddit_filter.label

        except:
            self._log.exception('Fail to query Reddit posts.')

        # return the dataframe
        return df_posts

    def query_par(self, reddit_filter, queue, subreddit) -> None:
        # call query function to query posts and create a dataframe
        df_posts = self.query(reddit_filter, subreddit)

        # put the pandas dataframe in the queue
        queue.put(df_posts)

    def query_manager(self) -> None:
        self._log.timer_message(
            'Collecting Reddit data with the pmaw package.')

        # select only the Reddit filters
        list_reddit_filters = list(
            filter(lambda x: (x.key == 'Reddit' and x.library == 'pmaw'),
                   self._list_filters))

        # both methods perform the same task using a parallel or sequential strategy
        if (self._parallelize):
            # query posts parallelized
            self.query_parallel(list_reddit_filters)
        else:
            # query posts sequentially
            self.query_sequential(list_reddit_filters)

    def query_sequential(self, list_filters) -> None:
        start_time_seq = time.time()

        # separate filters by type
        search_filters = list(
            filter(lambda x: (x.filter_type == 'search'), list_filters))

        # for each subreddit from each filter, create a query of posts
        # concatenate all dataframes of posts information
        df_search_posts = pd.DataFrame()
        for sf in search_filters:
            for subreddit in sf.subreddits:
                df_posts = self.query(sf, subreddit)
                df_search_posts = pd.concat([df_search_posts, df_posts])

        self.set_dict_df_posts('search_posts', df_search_posts)
        self._log.user_message('Reddit posts\' query finished.')

        final_time_seq = time.time() - start_time_seq
        self._log.timer_message('Sequential Query Time: ' +
                                str(final_time_seq) + ' seconds.')

    def query_parallel(self, list_filters) -> None:
        start_time_par = time.time()

        # separate filters by type
        search_filters = list(
            filter(lambda x: (x.filter_type == 'search'), list_filters))

        # configure the queue
        queue_search = Queue()

        # for each subreddit from each filter, create a query of posts
        # concatenate all dataframes of posts information
        processes_search = []
        for sf in search_filters:
            processes_search.extend([
                Process(target=self.query_par, args=(sf, queue_search, sub))
                for sub in sf.subreddits
            ])

        # start the processes
        for p in processes_search:
            p.start()

        # concatenate all dataframes of search information
        df_search_posts = pd.DataFrame()
        for _ in processes_search:
            df_process_posts = queue_search.get()
            df_search_posts = pd.concat([df_search_posts, df_process_posts])

        self.set_dict_df_posts('search_posts', df_search_posts)
        self._log.user_message('Reddit posts\' query finished.')

        # wait the processes
        for p in processes_search:
            p.join()

        final_time_par = time.time() - start_time_par
        self._log.timer_message('Parallelized Query Time: ' +
                                str(final_time_par) + ' seconds.')
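A minimal sketch of how QueryRedditPostsV2 might be driven. The filter and logger objects below are hypothetical stand-ins built with SimpleNamespace: they only mimic the attributes and methods the class reads (key, library, filter_type, subreddits, items, query_params, label, and the log calls) and are not part of the source project:

from types import SimpleNamespace

# Hypothetical 'search' filter covering two subreddits.
reddit_filter = SimpleNamespace(key='Reddit', library='pmaw', filter_type='search',
                                subreddits=['science', 'askscience'], items=100,
                                query_params={'q': 'quantum', 'before': 1629990795},
                                label='quantum')
# Hypothetical logger that just prints.
log = SimpleNamespace(exception=print, timer_message=print, user_message=print)

querier = QueryRedditPostsV2(list_filters=[reddit_filter], parallelize=False, log=log)
querier.query_manager()              # runs query_sequential() in this configuration
print(list(querier.dict_df_posts))   # one key per stored dataframe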
Example #21
File: PS.py  Project: quantsnd/reddit
import requests
import datetime as dt
from pmaw import PushshiftAPI
import pandas as pd

api = PushshiftAPI() 

# before = int(dt.datetime(2021,1,1).timestamp())
# after = int(dt.datetime(2020,12,1).timestamp())

before = int(dt.datetime(2021,2,1).timestamp())
after = int(dt.datetime(2021,1,1).timestamp())

subreddit="wallstreetbets"
limit=100

comments = api.search_comments(subreddit=subreddit, limit=limit, before=before, after=after)
print(f'Retrieved {len(comments)} comments from Pushshift')

# preview the comments data
comments_df = pd.DataFrame(comments)
print(comments_df.index)
print(comments_df.head(10).loc[:,['created_utc', 'author', 'body']])

# comments_df.to_csv('./wsb_comments.csv', header=True, index=False, columns=list(comments_df.axes[1]))
Example #22
def test_safe_exit_praw():
    with pytest.raises(NotImplementedError):
        api_praw = PushshiftAPI(praw=reddit)
        comments = api_praw.search_comments(ids=comment_ids, safe_exit=True)
Example #23
def test_submission_comment_ids_praw_mem_safe():
    api_praw = PushshiftAPI(file_checkpoint=1, praw=reddit)
    comments = api_praw.search_submission_comment_ids(ids=post_ids,
                                                      mem_safe=True)
    assert (len(comments) == 66)
Example #24
def redditconnect_PMAW():
    # Pmaw Object
    temp_threadcount = os.cpu_count()
    api = PushshiftAPI(num_workers=temp_threadcount * 5)

    return api
Example #25
def test_submission_comment_ids_search():
    api = PushshiftAPI(file_checkpoint=1)
    comments = api.search_submission_comment_ids(ids=post_ids)
    assert (len(comments) == 66)
Example #26
def test_response_load_cache():
    api = PushshiftAPI(file_checkpoint=1)
    comments = api.search_submission_comment_ids(ids=post_ids, mem_safe=True)
    resp = Response.load_cache(key=comments._cache.key)
    assert (len(comments) == len(resp) and len(comments) == 66)
Example #27
def test_response_generator():
    api = PushshiftAPI(file_checkpoint=1)
    comments = api.search_submission_comment_ids(ids=post_ids, mem_safe=True)
    all_c = [c for c in comments]
    assert (len(all_c) == 66)
Example #28
# Required package:
# pip install pmaw

import datetime as dt
import pandas as pd
import numpy as np
from tqdm import tqdm
from pmaw import PushshiftAPI

# Create time filter
before = int(dt.datetime(2021, 1, 12, 0,
                         0).timestamp())  # timestamp in seconds
# Create subreddit and parameters
subreddit = "wallstreetbets"
size = 500  # maximum submission size = 500
api = PushshiftAPI()
limit = 1000000
test_list_data = []
total_results_df = []

#------------------------------------PARAMETERS EDIT------------------------------------#
min_score = '>40'  # '>' acts as >= here; submission score 40, comment score 10
time_range = 365 * 2
download_per_loop = 10  # days
IS_SUBMISSION = True

# Download in smaller steps to avoid incomplete results (too many results could be lost when Pushshift shards are down)
for i in tqdm(range(int(time_range / download_per_loop) + 2)):
    after = before - 60 * 60 * 24 * download_per_loop  # step the window back by download_per_loop days

    if IS_SUBMISSION:
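        # The original example is truncated here. The lines below are a hypothetical
        # continuation only: they assume Pushshift's score filter is forwarded as the
        # keyword score=min_score, which is not shown in the source snippet.
        results = api.search_submissions(subreddit=subreddit, limit=limit,
                                         after=after, before=before,
                                         score=min_score, mem_safe=True)
    else:
        results = api.search_comments(subreddit=subreddit, limit=limit,
                                      after=after, before=before,
                                      score=min_score, mem_safe=True)

    total_results_df.append(pd.DataFrame(results))
    before = after  # slide the window back for the next iteration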
Example #29
def test_submission_praw_ids():
    api_praw = PushshiftAPI(file_checkpoint=1, praw=reddit)
    posts = api_praw.search_submissions(ids=post_ids)
    assert (len(posts) == len(post_ids))
Example #30
def test_submission_search_ids():
    api = PushshiftAPI(file_checkpoint=1)
    posts = api.search_submissions(ids=post_ids)
    assert (len(posts) == len(post_ids))