Example #1
    def __init__(self, credentials_file='credentials.json'):
        try:
            with open(credentials_file) as f:
                params = json.load(f)
            self.filename = None
            self.reddit = praw.Reddit(client_id=params['client_id'],
                                      client_secret=params['api_key'],
                                      user_agent='Sentiment Analyzer')
            self.api = PushshiftAPI()

        # if credentials.json does not exist, prompt the user for authentication and create the file, then proceed
        except FileNotFoundError:

            auth_keys = {}

            auth_keys['client_id'] = prompt_for_auth_val('client_id')
            auth_keys['api_key'] = prompt_for_auth_val('api_key')
            auth_keys['username'] = prompt_for_auth_val('username')
            auth_keys['password'] = prompt_for_auth_val('password')

            with open(credentials_file, 'w') as outf:
                json.dump(auth_keys, outf)
            # the newly collected keys double as the params for this session
            params = auth_keys
            self.filename = None
            self.reddit = praw.Reddit(client_id=params['client_id'],
                                      client_secret=params['api_key'],
                                      user_agent='Sentiment Analyzer')
            self.api = PushshiftAPI(backoff=10, max_retries=20)
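The constructor reads client_id and api_key from credentials.json and, when it has to create the file, also stores username and password. A minimal sketch of what that file ends up containing (all values below are hypothetical placeholders):

import json

auth_keys = {
    'client_id': 'YOUR_CLIENT_ID',
    'api_key': 'YOUR_CLIENT_SECRET',
    'username': 'YOUR_REDDIT_USERNAME',
    'password': 'YOUR_REDDIT_PASSWORD',
}
with open('credentials.json', 'w') as outf:
    json.dump(auth_keys, outf)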
Example #2
def subreddit_data(subreddit_name, post_amount=None):
    if post_amount:
        post_amount = int(post_amount)
    else:
        post_amount = config['display'].getint('subreddit_max_posts')
    subreddit = reddit.subreddit(subreddit_name)

    print(f'subreddit data, name: {subreddit_name}, amount: {post_amount}')

    if not config['modes'].getboolean('slow_mode'):
        submissions = subreddit.top('day', limit=post_amount)
    else:
        api = PushshiftAPI(reddit)
        submissions = list(api.search_submissions(subreddit=subreddit_name, after='56h', before='24h'))
        id_set = set()
        for submission in submissions:
            if submission.id in id_set:
                print('{}: "{}" is duplicated'.format(submission.id, submission.title))
            else:
                id_set.add(submission.id)
            if submission.subreddit.display_name.lower() != subreddit_name.lower():
                print('submission {} of {} != subreddit {}'.format(submission.id, submission.subreddit.display_name,
                                                                   subreddit_name))
        submissions.sort(key=lambda item: item.score, reverse=True)
        submissions = submissions[:post_amount]
        print(id_set)
        print(subreddit.display_name, submissions)
    posts = get_posts(submissions)
    return jsonify(posts)
Example #3
def getRedditSubsList(startEpoch, endEpoch, prawUserAgent='alpha'):
    """
    Returns a list of Reddit submissions from the bachelor subreddit
    
    Args :
        startEpoch : An int variable of posix time. Inidcates start time of
        reddit submissions.
        
        endEpoch : An int variable of posix time. Indicates end time of reddit
        submissions.
        
        prawUserAgent : The name of the reddit app. This should be specified in
        a praw.ini text file see: 
        https://praw.readthedocs.io/en/latest/getting_started/configuration/prawini.html
    
    Returns :
        subList : A list of reddit submission objects from PRAW
    """
    import praw
    from psaw import PushshiftAPI

    r = praw.Reddit(prawUserAgent)
    api = PushshiftAPI(r)
    subList = list(
        api.search_submissions(before=endEpoch,
                               after=startEpoch,
                               subreddit='thebachelor'))
    return subList
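A minimal usage sketch, assuming a site named 'alpha' is configured in praw.ini as described in the docstring; the date range is illustrative:

import datetime as dt

start_epoch = int(dt.datetime(2020, 1, 1).timestamp())
end_epoch = int(dt.datetime(2020, 1, 8).timestamp())

# fetch one week of r/thebachelor submissions via Pushshift, hydrated by PRAW
submissions = getRedditSubsList(start_epoch, end_epoch, prawUserAgent='alpha')
print(len(submissions), 'submissions found')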
Example #4
def get_reddit_comments(search_terms: list, subreddits: list):
    import praw
    from psaw import PushshiftAPI
    import pandas as pd
    import numpy as np
    import sys

    reddit = praw.Reddit(client_id="t6zhrU4Kfc2nRA",
                         client_secret="hosJ2fHU1z47MVfxRF-onhwxqpQ",
                         user_agent="sentiment_analysis")

    api = PushshiftAPI(reddit)
    body, timestamp, subreddit_name = [], [], []

    for query in search_terms:

        for subreddit in subreddits:
            print('Searching ' + subreddit + ' for:', query)
            gen = api.search_submissions(q=query, subreddit=subreddit)
            comment_counter = 0
            submission_counter = 0

            for submission in list(gen):
                submission.comments.replace_more(limit=None)
                submission_counter += 1
                sys.stdout.write("\033[F")  # back to previous line
                sys.stdout.write("\033[K")  # clear line
                print(str(submission_counter) + ' posts found')

                for comment in list(submission.comments):
                    body += [comment.body]
                    timestamp += [
                        pd.to_datetime(int(comment.created_utc),
                                       unit='s').tz_localize('UTC')
                    ]
                    subreddit_name += [comment.subreddit.display_name]
                    comment_counter += 1
                    sys.stdout.write("\033[F")  # back to previous line
                    sys.stdout.write("\033[K")  # clear line
                    print(str(comment_counter) + ' comments found')
                    # Check that all are same length, otherwise just add a nan
                    if len(body) < len(timestamp) or len(body) < len(
                            subreddit_name):
                        body += [np.nan]
                    elif len(timestamp) < len(body) or len(timestamp) < len(
                            subreddit_name):
                        timestamp += [np.nan]
                    elif len(subreddit_name) < len(body) or len(
                            subreddit_name) < len(timestamp):
                        subreddit_name += [np.nan]

    df = pd.DataFrame({
        'Timestamp': timestamp,
        'Body': body,
        'Subreddit': subreddit_name
    }).dropna()
    df.set_index('Timestamp', inplace=True)
    df.sort_index(inplace=True)
    df = df.drop_duplicates()
    return df
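A usage sketch for the function above; the search terms and subreddits are illustrative, and the hard-coded Reddit credentials would normally be replaced with your own:

df = get_reddit_comments(search_terms=['vaccine'],
                         subreddits=['science', 'worldnews'])
print(df.head())
df.to_csv('reddit_comments.csv')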
Example #5
def grab_data(subreddit, reddit):
    """ Grabs at most 400000 submissions from subreddit for 2 year period
    
    Parameters
    ----------
    subreddit : string
        the subreddit of interest
    
    reddit : Reddit Instance 
        an instance of the PRAW Reddit class
    Returns
    ----------
    list of submissions
    """
    api = PushshiftAPI(reddit)
    
    start_epoch = int(dt.datetime(2017, 3, 30).timestamp())
    end_epoch = int(dt.datetime(2019, 3, 30).timestamp())

    submissions = list(api.search_submissions(before=end_epoch, 
                                after=start_epoch,
                                subreddit=subreddit,
                                limit=40000))
                                
    print(f'grabbed {len(submissions)} submissions')
    np.save(subreddit+'_submissions', submissions)
    return submissions
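A usage sketch, assuming the module already imports datetime as dt, numpy as np and psaw's PushshiftAPI; the Reddit credentials are placeholders:

import praw

reddit = praw.Reddit(client_id='YOUR_CLIENT_ID',
                     client_secret='YOUR_CLIENT_SECRET',
                     user_agent='submission-grabber')
submissions = grab_data('AskReddit', reddit)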
Example #6
def extract_submissions(subr, lim, bef, aft, srt='desc'):
    subm_lst = []
    time.sleep(3)
    try:
        api = PushshiftAPI()
        corona_generator =  \
             api.search_submissions(subreddit = subr, limit = lim,
                                    before = bef, after = aft,
                                    sort = srt, lang = 'en',
                                    filter=['id','title', 'subreddit',
                                            'author', 'url','domain',
                                            'is_self','is_video','is_crosspostable','post_hint',
                                            'num_comments','score','removed_by_category',
                                            'selftext','link_flair_text','full_link'])
        subm_lst = list(corona_generator)
        #SC_subm.debug('Success!')
    except StopIteration:
        SC_subm.error(subr + ': StopIterationError')
    except RuntimeError:
        SC_subm.error(subr + ': RuntimeError')
    except timeout_decorator.timeout_decorator.TimeoutError:
        SC_subm.error(subr + ': TimeoutError')
    except Exception:
        SC_subm.error(subr + ': OtherError')
    return subm_lst
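A usage sketch; SC_subm (a logger) and timeout_decorator are assumed to be set up elsewhere in the module, and the subreddit and epochs are illustrative:

import datetime as dt

after_epoch = int(dt.datetime(2020, 3, 1).timestamp())
before_epoch = int(dt.datetime(2020, 4, 1).timestamp())

posts = extract_submissions('Coronavirus', lim=500, bef=before_epoch, aft=after_epoch)
print(len(posts), 'submissions fetched')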
Example #7
def handler_name(event, context):
    # This is the entry point that AWS Lambda invokes
    # URL-escape the password so its special characters are safe in the URI
    password = urllib.parse.quote("p@sswordw!thsp&ci@lch@r@ct&rs")
    # the connection-string template is expected to contain a %s placeholder
    # for the escaped password
    mng_client = pymongo.MongoClient(
        "mongodb-uri-string" % password)
    mng_db = mng_client['db_name']  # Replace mongo db name
    collection_name = 'collection_name'  # Replace mongo db collection name
    db_cm = mng_db[collection_name]

    # praw is reddit's api wrapper
    reddit = praw.Reddit(client_id='client_id',
                         client_secret='client_secret',
                         user_agent='user_agent',
                         username='******',
                         password='******')

    # wrap the PRAW instance with the Pushshift API wrapper
    api = PushshiftAPI(reddit)

    # run function to get end and start datestrings
    end_epoch, start_epoch = get_date_string(db_cm)

    # query reddit's api for comments
    results = list(api.search_comments(after=start_epoch,
                                          before=end_epoch,
                                          subreddit='subreddit',
                                          filter=['url', 'author', 'title', 'subreddit']))

    # iterate over comments in result and save to database
    for comment in results:
        save_comm(comment, db_cm)
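get_date_string and save_comm are defined elsewhere in the original module. Purely as a hypothetical sketch of what save_comm might do (the field names and the upsert strategy are assumptions, not the author's code):

def save_comm(comment, db_cm):
    # persist a handful of comment fields as one MongoDB document
    doc = {
        '_id': comment.id,
        'author': str(comment.author),
        'body': comment.body,
        'created_utc': int(comment.created_utc),
        'subreddit': comment.subreddit.display_name,
    }
    # upsert so re-running the Lambda does not create duplicates
    db_cm.replace_one({'_id': doc['_id']}, doc, upsert=True)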
Example #8
def get_submission_psaw(n, sub_dict):
    """
    Returns a list of results for submission in past:
    1st list: current result from n hours ago until now
    2nd list: prev result from 2n hours ago until n hours ago
     """
    api = PushshiftAPI()

    mid_interval = datetime.today() - timedelta(hours=n)
    timestamp_mid = int(mid_interval.timestamp())
    timestamp_start = int((mid_interval - timedelta(hours=n)).timestamp())
    timestamp_end = int(datetime.today().timestamp())

    recent = {}
    prev = {}
    for key in sub_dict:
        # results from the last n hours
        recent[key] = api.search_submissions(
            after=timestamp_mid,
            before=timestamp_end,
            subreddit=key,
            filter=['title', 'link_flair_text', 'selftext', 'score'])

        # results from the last 2n hours until n hours ago
        prev[key] = api.search_submissions(
            after=timestamp_start,
            before=timestamp_mid,
            subreddit=key,
            filter=['title', 'link_flair_text', 'selftext', 'score'])

    return recent, prev
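A usage sketch; the keys of sub_dict are the subreddit names to query (the values are not used by the function), and both returned dicts hold lazy generators:

recent, prev = get_submission_psaw(24, {'stocks': None, 'wallstreetbets': None})
for submission in recent['stocks']:
    print(submission.title)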
Example #9
def wallstreetbettor():
    api = PushshiftAPI()
    start_time = int(datetime.datetime(2021, 2, 3).timestamp())
    submissions = api.search_submissions(
        after=start_time,
        subreddit='wallstreetbets',
        filter=['url', 'author', 'title', 'subreddit'],
        limit=15000)
    stock_tracker = {}
    for submission in submissions:
        words = submission.title.split()
        cashtag = list(
            set(filter(lambda word: word.lower().startswith('$'), words)))

        if len(cashtag) > 0:
            for item in cashtag:
                tag = item
                if item[-1] == ',' or item[-1] == ')' or item[-1] == '?':
                    tag = item[:-1]
                if tag[1:].isalpha():
                    if tag in stock_tracker:
                        stock_tracker[tag] = stock_tracker[tag] + 1
                    else:
                        stock_tracker[tag] = 1
    stock_tracker = sorted(stock_tracker.items(),
                           key=operator.itemgetter(1),
                           reverse=True)
    return stock_tracker
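wallstreetbettor returns a list of (cashtag, count) pairs sorted by frequency; a short usage sketch:

top_tags = wallstreetbettor()
for tag, count in top_tags[:10]:
    print(tag, count)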
Example #10
def scrape_data(subreddit):
    
    # Initialization
    api = PushshiftAPI()

    # Build a list with the scraped submissions
    scrape_list = list(api.search_submissions(subreddit=subreddit,
                                filter=['title', 'subreddit', 'num_comments', 'author', 'subreddit_subscribers', 'score', 'domain', 'created_utc'],
                                limit=15000))

    # Keep the author, title and other requested fields for each result
    clean_scrape_lst = []
    for i in range(len(scrape_list)):
        scrape_dict = {}
        scrape_dict['subreddit'] = scrape_list[i][5]
        scrape_dict['author'] = scrape_list[i][0]
        scrape_dict['domain'] = scrape_list[i][2]
        scrape_dict['title'] = scrape_list[i][7]
        scrape_dict['num_comments'] = scrape_list[i][3]
        scrape_dict['score'] = scrape_list[i][4]
        scrape_dict['timestamp'] = scrape_list[i][1]
        clean_scrape_lst.append(scrape_dict)

    # Show the number of subscribers
    print(subreddit, 'subscribers:',scrape_list[1][6])
    
    # Return the list of scraped data
    return clean_scrape_lst
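A usage sketch that turns the result into a DataFrame; the subreddit name is illustrative:

import pandas as pd

posts = scrape_data('datascience')
df = pd.DataFrame(posts)
print(df.head())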
Example #11
def when_pressed(event=None):
    global x, tags, urls

    api = PushshiftAPI()
    now = datetime.now() - timedelta(days=3)
    start = datetime(hour=1, month=now.month, year=now.year, day=now.day)
    posts = list(
        api.search_submissions(after=start,
                               subreddit='wallstreetbets',
                               filter=['url', 'author', 'title', 'subreddit'],
                               limit=1000))

    for post in posts:
        words = post.title.split()
        cashtags = list(
            set(filter(lambda word: word.lower().startswith('$'), words)))
        if len(cashtags) > 0:
            for cashtag in cashtags:
                if ("$" + e.get().upper()) in cashtag:
                    print(post.url)
                    urls["tag" + str(x)] = post.url
                    text.insert('1.0', post.title + "\n")
                    text.tag_add("tag" + str(x), '1.0', '1.end')
                    text.tag_config("tag" + str(x),
                                    foreground='blue',
                                    underline=True)
                    text.tag_bind("tag" + str(x), '<Enter>', show_hand_cursor)
                    text.tag_bind("tag" + str(x), '<Leave>', show_xterm_cursor)
                    tag = "tag" + str(x)
                    callback = lambda event, tag=tag: do(event, tag)
                    text.tag_bind("tag" + str(x), '<Button-1>', callback)
                    x = x + 1
                    break
Example #12
def create_annotation_file(annotations, filename):
    gen = PushshiftAPI().search_submissions(
        ids=[annotations[k]["ID"] for k in annotations.keys()],
        subreddit='Denmark',
        filter=['id', 'title', 'selftext', 'full_link', 'created_utc'])

    annotations = process_submissions(gen, annotations)

    with open(filename, 'w') as f, open('./logs.txt', 'a') as fp:
        for _, v in annotations.items():
            state, code = validate_post(v)
            if not state:
                print(
                    f"\033[1;33;40m WARNING: \033[0m The submission {v['ID']} or annotations have changed since producing this data set. To learn more check the logs afterward (logs.txt)"
                )
                write_logs(fp, v, code)
                continue
            try:
                for t, l1, l2 in zip(v['text'], v['Layer1'].split(' '),
                                     v['Layer2'].split(' ')):
                    f.write(f'{t}\t{l1}\t{l2}\n')
                f.write('\n')
            except KeyError:
                print(
                    f"\033[1;33;40m WARNING:\033[0m Pulling the submission {v['ID']} produced an error leading to this KeyError. To learn more check the logs afterward (logs.txt)"
                )
                write_logs(fp, v, 4)
Example #13
def get_tickers():
    tickers = pd.read_csv('data_scrappers\\tickers.csv',
                          header=None,
                          names=['Tickers'])
    tickers['Tickers'] = tickers['Tickers'].str[:-2]
    tickers = tickers['Tickers'].values

    api = PushshiftAPI()
    # psaw expects an integer epoch timestamp for 'after'
    start_epoch = int((dt.datetime.now() - timedelta(days=7)).timestamp())

    subs = api.search_submissions(
        after=start_epoch,
        subreddit='wallstreetbets',
        filter=['url', 'author', 'title', 'subreddit'],
        limit=2000)

    cash_tags = {}

    for sub in subs:
        for word in sub.title.split(' '):
            if (word.isupper() or '$'
                    in word) and word in tickers and word.upper() != 'GME':
                word = re.sub("[^a-zA-Z]+", "", word)
                if word.upper() not in cash_tags:
                    cash_tags[word.upper()] = 1
                else:
                    cash_tags[word.upper()] += 1

    cash_tags = pd.Series(cash_tags, name='tickers')
    return cash_tags
Example #14
    def __init__(self, subreddit_name, limit):
        print("API parameters:", subreddit_name, limit)

        ranges = [(1, 1, 2019, 1, 2, 2019), (1, 2, 2019, 1, 3, 2019),
                  (1, 3, 2019, 1, 4, 2019), (1, 4, 2019, 1, 5, 2019),
                  (1, 5, 2019, 1, 6, 2019), (1, 6, 2019, 1, 7, 2019),
                  (1, 7, 2019, 1, 8, 2019), (1, 8, 2019, 1, 9, 2019),
                  (1, 9, 2019, 1, 10, 2019), (1, 10, 2019, 1, 11, 2019),
                  (1, 11, 2019, 1, 12, 2019), (1, 12, 2019, 1, 1, 2020),
                  (1, 1, 2020, 1, 2, 2020), (1, 2, 2020, 1, 3, 2020),
                  (1, 3, 2020, 1, 4, 2020), (1, 4, 2020, 1, 5, 2020),
                  (1, 5, 2020, 1, 6, 2020), (1, 6, 2020, 1, 7, 2020),
                  (1, 7, 2020, 1, 8, 2020), (1, 8, 2020, 1, 9, 2020),
                  (1, 9, 2020, 1, 10, 2020), (1, 10, 2020, 1, 11, 2020),
                  (1, 11, 2020, 1, 12, 2020), (1, 12, 2020, 1, 1, 2021),
                  (1, 1, 2021, 1, 2, 2021), (1, 2, 2021, 1, 3, 2021),
                  (1, 3, 2021, 1, 4, 2021)]

        for d1, m1, y1, d2, m2, y2 in ranges:
            posted_after = int(datetime.datetime(y1, m1, d1).timestamp())
            posted_before = int(datetime.datetime(y2, m2, d2).timestamp())

            self.api = PushshiftAPI()
            self.comBatchNo = 0
            self.outputPath = './{0}/{1}/'.format(subreddit_name, posted_after)

            Path(self.outputPath).mkdir(parents=True, exist_ok=True)

            self.getComments(subreddit_name, None, [
                'created_utc', 'score', 'selftext', 'title', 'upvote_ratio',
                'body'
            ], posted_after, posted_before, limit)
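The hard-coded list of month boundaries above could also be generated. A small sketch of an equivalent helper (the name month_ranges is hypothetical):

def month_ranges(start_year, start_month, end_year, end_month):
    # yield (d1, m1, y1, d2, m2, y2) tuples for consecutive month boundaries
    y, m = start_year, start_month
    while (y, m) < (end_year, end_month):
        ny, nm = (y + 1, 1) if m == 12 else (y, m + 1)
        yield (1, m, y, 1, nm, ny)
        y, m = ny, nm

ranges = list(month_ranges(2019, 1, 2021, 4))  # Jan 2019 through Mar 2021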
Example #15
def get_submission_detail(submission_id: str) -> dict:
    r = praw.Reddit(client_id=reddit_client_id,
                    client_secret=reddit_client_secret,
                    user_agent=reddit_user_agent)
    api = PushshiftAPI(r)

    pass
Example #16
def create_report_body(subreddit_name):
    reddit = praw.Reddit(
        username=USERNAME,
        password=PASSWORD,
        client_id=CLIENT_ID,
        client_secret=CLIENT_SECRET,
        user_agent=USER_AGENT
    )
    pushshift = PushshiftAPI(r=reddit)

    time_now = dt.datetime.now()
    time_one_week_ago = time_now - dt.timedelta(days=7)
    time_one_week_ago_epoch = int(time_one_week_ago.timestamp())

    submissions = get_submissions(subreddit_name, reddit, pushshift, time_one_week_ago_epoch)
    for submission in submissions:
        submission.data = submission.title
        submission.link = submission.shortlink
    submission_report = process_data(submissions, "Post Title", "Submission")

    comments = get_comments(subreddit_name, reddit, pushshift, time_one_week_ago_epoch)
    for comment in comments:
        ind = next((i for i, ch in enumerate(comment.body) if ch in {'\n', '`'}), len(comment.body))
        ind = min(ind, 150)
        comment.data = comment.body[:ind]
        if ind < len(comment.body) - 2:
            comment.data += " ...\\[trimmed\\]"
        comment.link = comment.permalink
    comment_report = process_data(comments, "Comment Link", "Comment")


    return f"""
Example #17
def main(args):
    api = PushshiftAPI()
    folder = "Subreddit"
    Path(folder).mkdir(parents=True, exist_ok=True)

    if args.subreddit:
        subreddit = [x.strip() for x in args.subreddit.split(",")]
    else:
        logger.error("Use -s to set the subreddit")
        exit()

    for i in subreddit:
        try:
            df = fetch_posts(api, i)
            df["date_utc"] = pd.to_datetime(df["created_utc"], unit="s")
            df["date"] = pd.to_datetime(df["created"], unit="s")
            df["permalink"] = "https://old.reddit.com" + df[
                "permalink"].astype(str)
            df = df[df.columns.intersection(COLUMNS)]
            filename = f"{folder}/posts_{i}_{int(time.time())}"
            if args.export_format == "xlsx":
                writer = pd.ExcelWriter(
                    f"{filename}.xlsx",
                    engine="xlsxwriter",
                    options={"strings_to_urls": False},
                )
                df.to_excel(writer, sheet_name="Sheet1")
                writer.save()
            else:
                df.to_csv(f"{filename}.csv", index=False, sep="\t")
        except Exception as e:
            logger.error("Complete error : %s", e)

    logger.info("Runtime : %.2f seconds" % (time.time() - temps_debut))
Example #18
def main(argv):
    reddit = None
    try:
        opts, _ = getopt.getopt(argv, "u:h:l", ["user=", "help", "log"])
    except getopt.GetoptError:
        print("see: scraper.py -help")
        sys.exit(2)
    for opt, val in opts:
        if opt in ("-l", "-log"):
            enablelogging()
        elif opt in ("-u", "-user"):
            reddit = praw.Reddit(val)
        elif opt in ("-h", "-help"):
            help_msg = """ 
            run with '-u'/'-user' and valid praw agent argument from praw.ini\n
            run with '-l'/'-log' to enable logging of API calls
            """
            sys.exit(help_msg)

    if not reddit:
        sys.exit(
            "Reddit instance could not be obtained!\nSee '-help' for more information"
        )

    pushAPI = PushshiftAPI(reddit)

    datafolder = "submissions/"
    submission_ids = "submission_ids/"

    process_queries('queries.csv', pushAPI, submission_ids)

    for info_file in os.listdir(submission_ids):
        process_submissions(reddit, datafolder,
                            os.path.join(submission_ids, info_file))
Example #19
def main(args):
    api = PushshiftAPI()
    folder = "Search"
    Path(folder).mkdir(parents=True, exist_ok=True)

    if not args.search_terms:
        logger.error("Use -s to set search terms")
        exit()
    df = fetch_comments(api, args.search_terms, args.subreddit)
    df["date_utc"] = pd.to_datetime(df["created_utc"], unit="s")
    df["date"] = pd.to_datetime(df["created"], unit="s")
    df["permalink"] = "https://old.reddit.com" + df["permalink"].astype(str)
    df = df[df.columns.intersection(COLUMNS)]
    filename = f"{folder}/comments_{int(time.time())}_{args.search_terms}"
    if args.export_format == "xlsx":
        writer = pd.ExcelWriter(
            f"{filename}.xlsx",
            engine="xlsxwriter",
            options={"strings_to_urls": False},
        )
        df.to_excel(writer, sheet_name="Sheet1")
        writer.save()
    else:
        df.to_csv(f"{filename}.csv", index=False, sep="\t")

    logger.info("Runtime : %.2f seconds" % (time.time() - temps_debut))
Example #20
def get_subreddit_info(
    subreddit: str, date: dt.date = None) -> dict:
    '''Gets a list of all submissions for a given subreddit and date.'''
    # A default of dt.date.today() would be evaluated once at definition time,
    # so resolve the default at call time instead.
    if date is None:
        date = dt.date.today()
    r = praw.Reddit(client_id=reddit_client_id,
                    client_secret=reddit_client_secret,
                    user_agent=reddit_user_agent)
    api = PushshiftAPI(r)

    end = dt.datetime.combine(date, dt.datetime.min.time())
    start = end - dt.timedelta(days=1)
    results = api.search_submissions(after=int(start.timestamp()),
                                     before=int(end.timestamp()),
                                     subreddit=subreddit,
                                     stickied=False,
                                     limit=500)

    # build json
    sub_info = {
        'subreddit': subreddit,
        'date': date.strftime('%Y-%m-%d'),
        'submissions': [],
    }

    for entry in results:
        record = {
            'id': entry.id,
            'score': entry.score,
            'title': entry.title,
            'author':
            (entry.author.name if entry.author is not None else None),
            'comment_count': entry.num_comments
        }
        sub_info['submissions'].append(record)
    sub_info['post_count'] = len(sub_info['submissions'])
    return sub_info
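A usage sketch, assuming the reddit_client_id / reddit_client_secret / reddit_user_agent values are defined at module level; the subreddit and date are illustrative:

import datetime as dt
import json

info = get_subreddit_info('python', dt.date(2021, 6, 1))
print(info['post_count'], 'submissions')
print(json.dumps(info['submissions'][:3], indent=2))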
Example #21
def scrape_chunk(id_chunk, keep_columns_comm, subm_limit):
    time.sleep(3)
    com_df = pd.DataFrame({}, columns=keep_columns_comm)
    comments_all = []
    try:
        api = PushshiftAPI()
        comments_all = list(
            api.search_comments(link_id=','.join(id_chunk),
                                limit=subm_limit,
                                filter=keep_columns_comm))
        if len(comments_all) > 0:
            comments_level1 = [
                c.d_ for c in comments_all if c.parent_id[:3] == 't3_'
            ]
            comments_level2 = [
                c.d_ for c in comments_all
                if c.parent_id in ['t1_' + r['id'] for r in comments_level1]
            ]
            com_df = pd.DataFrame(comments_level1 + comments_level2)
    except StopIteration:
        SC_comm.error('StopIteration error')
    except RuntimeError:
        SC_comm.error('Runtime error')
    except timeout_decorator.timeout_decorator.TimeoutError:
        SC_comm.error('TimeoutError')
    except Exception:
        SC_comm.error('Other error')
    return com_df
Example #22
def get_posts(subreddit, year, day, month, iteration, end_year, end_day, end_month):
    evn_path = '/disk/data/share/s1690903/pandemic_anxiety/evn/'
    evn = load_experiment(evn_path + 'experiment.yaml')

    reddit = praw.Reddit(client_id=evn['reddit_api_3']['client_id'],
                         client_secret=evn['reddit_api_3']['client_secret'],
                         user_agent=evn['reddit_api_3']['user_agent'],
                         username=evn['reddit_api_3']['username'],
                         password=evn['reddit_api_3']['password'])

    # Using Pushshift to fetch post IDs
    api = PushshiftAPI(reddit)

    count = 0
    while count < iteration:
        # get post ids
        c = CollectPostids(year, month, day, subreddit)
        c.save_postids()
        
        # get posts and continue the next iteration 
        cp = CollectPost()
        last_day = cp.collect_posts('postids/{}_postids.csv'.format(subreddit))
        print(last_day)
        
        year = int(last_day.split('/')[2])
        month = int(last_day.split('/')[0])
        day = int(last_day.split('/')[1])
        print(month, day)

        if (year == end_year and month == end_month and day == end_day):
            break

        count = count + 1
        time.sleep(30)
Example #23
def generate_submissions_psaw(month_num, subreddit):
    """
    Gets submissions between start/end epochs for requested
    subreddit

    Parameters
    ----------
    month_num: int
        The month number to be passed to epoch_generate()
    subreddit: string
        The name of the subreddit to be scraped

    Returns
    -------
    generator
        A generator object that will be used to loop through
        submissions
    """

    # init api
    api = PushshiftAPI()

    epoch_tuple = epoch_generate(month_num, 2020)
    start_epoch = epoch_tuple[0]
    end_epoch = epoch_tuple[1]

    return api.search_submissions(after=start_epoch,
                                  before=end_epoch,
                                  subreddit=subreddit,
                                  size=1000)
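A usage sketch; epoch_generate is assumed to return a (start_epoch, end_epoch) tuple for the given month of 2020, and the subreddit is illustrative:

for submission in generate_submissions_psaw(4, 'AskDocs'):
    print(submission.created_utc, submission.title)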
Example #24
def update_user(target_user, target_sub, r, flair_queue, perm_queue, sub_list):
    # Check existing data
    check_data = ProcessComment.check_user(target_user, target_sub)
    update_flair = True
    # Does the user's data need to be updated or inserted
    user_in_accnt_info = check_data[2]
    user_in_sub_info = check_data[3]

    # Collect new data
    try:
        # PushShift Instance
        ps = PushshiftAPI(r)
        DataCollector.load_data(user_in_accnt_info, user_in_sub_info,
                                update_flair, target_user, target_sub,
                                sub_list, ps)
    except Exception:
        logging.warning("PM: User " + str(target_user) +
                        " was not able to have their data and flair updated"
                        "\nStacktrace: " + traceback.format_exc())

    # Update flair with new data
    prog_flair_enabled = target_sub.main_config.getboolean("progression tier")
    new_accnt_flair_enabled = target_sub.main_config.getboolean(
        "young account tag")
    activity_flair_enabled = target_sub.main_config.getboolean("activity tag")
    FlairManager.update_flair(flair_queue, perm_queue, target_user, target_sub,
                              prog_flair_enabled, new_accnt_flair_enabled,
                              activity_flair_enabled)

    logging.debug("PM: User " + str(target_user) +
                  " has had their data and flair updated")
Example #25
    def __init__(self, subreddit, output_path):

        self.subreddit = subreddit
        self.output_path = output_path
        self.images_links = []
        self.api = PushshiftAPI()
        self.new_memes = self.api.search_submissions(subreddit=self.subreddit)
Example #26
    def get_elements(self):
        ps = PushshiftAPI()
        for user in self.data['users'].split(','):
            user = user.replace('/u/', '', 1).strip()
            _params = {'author': user}
            if self.data['limit']:
                _params['limit'] = self.data['limit']
            if self.data['scan_submissions']:
                for post in ps.search_submissions(**_params):
                    p = RedditElement(post)
                    if self.check_filters(p):
                        yield p
            if self.data['scan_comments']:
                for post in ps.search_comments(**_params):
                    parents = list(
                        ps.search_submissions(ids=post.link_id.replace(
                            't3_', '', 1),
                                              limit=1))
                    if not len(parents):
                        print(
                            "PushShift Warning: Unable to locate parent Submission:",
                            post.link_id)
                        continue
                    submission = parents[0]
                    p = RedditElement(post, ext_submission_obj=submission)
                    if self.check_filters(p):
                        yield p
Example #27
    def get_elements(self):
        url = self.data['url']
        submission = re.search(r'\/comments\/([a-zA-Z0-9]+)\/?', url)
        comment = re.search(r'\/comments\/.+?\/.+?\/([a-zA-Z0-9]+)\/?', url)
        ps = PushshiftAPI()

        if comment:
            for post in ps.search_comments(ids=[comment.group(1)]):
                parents = list(
                    ps.search_submissions(ids=post.link_id.replace(
                        't3_', '', 1),
                                          limit=1))
                if not len(parents):
                    raise AssertionError(
                        "PushShift Warning: Unable to locate direct parent Submission:",
                        post.link_id)
                submission = parents[0]
                p = RedditElement(post, ext_submission_obj=submission)
                if self.check_filters(p):
                    yield p
        elif submission:
            for post in ps.search_submissions(
                    ids=[submission.group(1).replace('t3_', '', 1)], limit=1):
                p = RedditElement(post)
                if self.check_filters(p):
                    yield p
        else:
            raise TypeError('Invalid Reddit URL provided! "%s"' % url)
Example #28
def scrape_data(subreddit):
    print("cp1")
    api = PushshiftAPI()
    print("cp2")
    # Create list of scraped data
    scrape_list = list(
        api.search_submissions(subreddit=subreddit,
                               filter=[
                                   'title', 'subreddit', 'num_comments',
                                   'author', 'subreddit_subscribers', 'score',
                                   'domain', 'created_utc'
                               ],
                               limit=15000))
    print("cp3")
    clean_scrape_lst = []
    for i in range(len(scrape_list)):
        scrape_dict = {}
        scrape_dict['subreddit'] = scrape_list[i][5]  #Name of subreddit
        scrape_dict['author'] = scrape_list[i][0]
        scrape_dict['domain'] = scrape_list[i][2]  #Publishing House
        scrape_dict['title'] = scrape_list[i][7]
        scrape_dict['num_comments'] = scrape_list[i][3]
        scrape_dict['score'] = scrape_list[i][4]  #upvotes-downvotes
        scrape_dict['timestamp'] = scrape_list[i][1]  #time in epoch format
        clean_scrape_lst.append(scrape_dict)
    print("cp4")
    # Show number of subscribers
    print(subreddit, 'subscribers:', scrape_list[1][6])

    # Return list of scraped data
    return clean_scrape_lst
Example #29
    def compile(self, progress_cb=None):
        # def compile(self, compile_params):
        # client_id = compile_params["client_id"]
        # client_secret = compile_params["client_secret"]
        if self.compiled:
            return

        reddit = praw.Reddit(
            client_id=os.environ["CLIENT_ID"],
            client_secret=os.environ["CLIENT_SECRET"],
            user_agent="linux:org.reddit-nlp.reddit-nlp:v0.1.0 (by /u/YeetoCalrissian)",
        )
        api = PushshiftAPI(reddit)
        comments = []

        start_epoch = int(self.start_time.timestamp())
        end_epoch = int(self.end_time.timestamp())

        n = 0
        for subreddit in self.subreddits:
            print(subreddit, start_epoch, end_epoch)
            for comment in api.search_comments(
                after=start_epoch, before=end_epoch, subreddit=subreddit
            ):
                print(comment.body)
                comments.append(comment)
                n += 1
                if progress_cb is not None:
                    progress_cb(n)

        with open(self.comments_pickle_path, "wb") as pickle_file:
            pickle.dump(comments, pickle_file)

        self.compiled = True
        self.write()
Example #30
def get_api_instance(src_path):
    '''
    Returns an instance of the psaw PushshiftAPI object initialized using praw.

    parameters:
    --src_path: pathlib.Path object pointing to the src directory or the directory containing api_credentials.txt

    returns:
    --s_api: an instance of Pushshift's psaw API
    '''
    # retrieve api credentials from .gitignore'd text file
    secrets_path = src_path / 'api_credentials.txt'
    secrets_txt = open(secrets_path, 'r')

    my_id = secrets_txt.readline().split('=')[1].rstrip()
    my_secret = secrets_txt.readline().split('=')[1].rstrip()
    my_agent = secrets_txt.readline().split('=')[1].rstrip()

    secrets_txt.close()

    # create PRAW and Pushshift instances
    reddit = praw.Reddit(client_id=my_id,
                         client_secret=my_secret,
                         user_agent=my_agent)

    s_api = PushshiftAPI(reddit)

    return s_api
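A usage sketch for get_api_instance; the api_credentials.txt contents shown in the comment are hypothetical placeholders, read in exactly this order:

from pathlib import Path

# api_credentials.txt is parsed line by line, in this order:
#   client_id=YOUR_CLIENT_ID
#   client_secret=YOUR_CLIENT_SECRET
#   user_agent=my_reddit_app by /u/your_username

api = get_api_instance(Path(__file__).resolve().parent)
for submission in api.search_submissions(subreddit='learnpython', limit=10):
    print(submission.title)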