Example #1
def update_top_image_reposts(uowm: UnitOfWorkManager,
                             reddit: Reddit) -> None:
    days = [1, 7, 30, 365]
    with uowm.start() as uow:
        uow.session.execute('TRUNCATE `stats_top_image_repost`')
        for day in days:
            result = uow.session.execute(
                'SELECT repost_of, COUNT(*) c FROM image_reposts WHERE detected_at > NOW() - INTERVAL :days DAY GROUP BY repost_of HAVING c > 1 ORDER BY c DESC LIMIT 2000',
                {'days': day})
            for chunk in chunk_list(result.fetchall(), 100):
                reddit_ids_to_lookup = []
                for post in chunk:
                    existing = uow.stats_top_image_repost.get_by_post_id_and_days(
                        post[0], day)
                    if existing:
                        existing.repost_count = post[1]
                        continue
                    reddit_ids_to_lookup.append(f't3_{post[0]}')
                for submission in reddit.info(reddit_ids_to_lookup):
                    count_data = next(
                        (x for x in chunk if x[0] == submission.id), None)
                    if not count_data:
                        continue
                    uow.stats_top_image_repost.add(
                        StatsTopImageRepost(post_id=count_data[0],
                                            repost_count=count_data[1],
                                            days=day,
                                            nsfw=submission.over_18))
            uow.commit()
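chunk_list is a helper from the host project and is not shown above; a minimal sketch, assuming it only yields fixed-size slices of a sequence:

def chunk_list(items, chunk_size):
    # Assumed behaviour: yield successive chunk_size-sized slices of items.
    for i in range(0, len(items), chunk_size):
        yield items[i:i + chunk_size]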
Example #2
def filter_removed_posts(reddit: Reddit, matches: List[SearchMatch]) -> List[SearchMatch]:
    """
    Take a list of SearchMatches, get the submission from Reddit and see if they have been removed
    :param reddit: Praw Reddit instance
    :param matches: List of matches
    :return: List of filtered matches
    """
    if not matches:
        return matches
    if len(matches) > 100:
        log.info('Skipping removed post check due to > 100 matches (%s)', len(matches))
        return matches
    post_ids = [f't3_{match.post.post_id}' for match in matches]
    submissions = reddit.info(post_ids)
    removed_ids = set()
    for sub in submissions:
        if sub.__dict__.get('removed', None):
            log.debug('Removed Post Filter Reject - %s', sub.id)
            removed_ids.add(sub.id)
    return [match for match in matches if match.post.post_id not in removed_ids]
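The function above skips the check entirely past 100 matches, since Reddit's /api/info endpoint accepts at most 100 fullnames per request. A hedged alternative sketch that batches the lookups instead of skipping; the function name and chunking are assumptions, not part of the original project:

def filter_removed_posts_batched(reddit: Reddit, matches: List[SearchMatch]) -> List[SearchMatch]:
    # Sketch: check removal status in chunks of 100 fullnames instead of bailing out.
    removed_ids = set()
    for i in range(0, len(matches), 100):
        batch = [f't3_{match.post.post_id}' for match in matches[i:i + 100]]
        for sub in reddit.info(batch):
            if sub.__dict__.get('removed', None):
                removed_ids.add(sub.id)
    return [match for match in matches if match.post.post_id not in removed_ids]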
Example #3
def defuzzed_submissions_scores(connection: Reddit, submissions: List[Submission], iterations: int) -> Mapping[Submission, List[int]]:
    """"De-fuzzes" multiple submissions' scores by batch-requesting each score
        from Reddit multiple times and collecting the samples so an average
        can be computed for each submission."""

    def t3_(id: str) -> str:
        if id.startswith('t3_'):
            return id
        else:
            return f't3_{id}'

    # scores is a dict mapping submission ids to lists of scores
    ids = [t3_(submission.id) for submission in submissions]
    scores = {i: [] for i in ids}

    for _ in range(iterations):
        for submission in connection.info(ids):
            scores[t3_(submission.id)].append(submission.score)

    # map given submissions to submission ids
    idmap = {t3_(submission.id): submission for submission in submissions}
    return {idmap[i]: scores[i] for i in ids}
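The function returns the raw per-submission samples rather than the averages themselves; a hypothetical usage that finishes the averaging step (connection and submissions are assumed to already exist):

# Hypothetical usage: average the sampled scores per submission.
samples = defuzzed_submissions_scores(connection, submissions, iterations=5)
averages = {sub: sum(vals) / len(vals) for sub, vals in samples.items() if vals}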
Example #4
def get_request_params(client_id, redirect_uri, thing):
    scopes = ["*"]

    reddit = Reddit(
        client_id=client_id,
        client_secret=None,
        redirect_uri=redirect_uri,
        user_agent="Award fetcher by u/Lil_SpazJoekp",
    )
    state = str(random.randint(0, 65000))
    url = reddit.auth.url(scopes, state, "temporary")
    print(f"Open this url in your browser: {url}")
    sys.stdout.flush()

    client = receive_connection()
    data = client.recv(1024).decode("utf-8")
    param_tokens = data.split(" ", 2)[1].split("?", 1)[1].split("&")
    params = {
        key: value
        for (key, value) in [token.split("=") for token in param_tokens]
    }

    if state != params["state"]:
        send_message(
            client,
            f"State mismatch. Expected: {state} Received: {params['state']}",
        )
        return
    elif "error" in params:
        send_message(client, params["error"])
        return

    reddit.auth.authorize(params["code"])
    thing = list(reddit.info([thing]))[0]
    subreddit = thing.subreddit_id
    return reddit._authorized_core._authorizer.access_token, thing.fullname, subreddit
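receive_connection and send_message are not defined in this example; a minimal sketch in the spirit of PRAW's OAuth code-flow documentation, assuming the redirect URI points at http://localhost:8080:

import socket


def receive_connection():
    # Wait for a single HTTP connection on localhost:8080 and return the client socket.
    server = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
    server.setsockopt(socket.SOL_SOCKET, socket.SO_REUSEADDR, 1)
    server.bind(('localhost', 8080))
    server.listen(1)
    client = server.accept()[0]
    server.close()
    return client


def send_message(client, message):
    # Send a bare HTTP response containing message and close the connection.
    client.send(f'HTTP/1.1 200 OK\r\n\r\n{message}'.encode('utf-8'))
    client.close()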
Example #5
class Praw(Loader):
    def __init__(self, root, config, subreddit):
        Loader.__init__(self, 'praw', root, config, subreddit)

        self.endpoint = {
            'user_agent': Env.USER_AGENT(),
            'client_id': Env.REDDIT_CLIENT_ID(),
            'client_secret': Env.REDDIT_CLIENT_SECRET()
        }
        self.reddit = Reddit(**self.endpoint)

        # config parameters
        self.types = self.config['praw']['types']
        self.periode = self.config['praw']['periode']
        self.retrospect_time = self.config['praw']['retrospect_time']

        # initial run variables
        self.last_run = {}
        for file_type in self.types:
            self.last_run[file_type] = 0

        # saved run variables
        for file_type in self.types:
            meta = self.read_meta(file_type)
            if 'last_run' in meta:
                self.last_run[file_type] = meta['last_run']

    def run(self):
        self.runevent.set()

        try:
            # download reddit data
            while not self.stopped():
                stores = [
                    Store('crawler', self.root, self.config, self.subreddit),
                    Store('pushshift', self.root, self.config, self.subreddit)
                ]
                for file_type in self.types:
                    self.download(file_type, stores)

                # periodic run
                if self.alive():
                    self.log(f'sleep for {self.periode} seconds')
                    self.time.sleep(self.periode)
                else:
                    break

        except KeyboardInterrupt:
            self.runevent.clear()
            raise
        except Exception as e:
            self.log(f'...run error {repr(e)}')

        self.runevent.clear()

    def download(self, file_type, stores):
        now = int(datetime.now(timezone.utc).timestamp())

        # set last run from now
        self.last_run[file_type] = now

        # define columns
        columns = {
            'submission': [
                'submission', 'subreddit', 'author', 'created', 'retrieved',
                'edited', 'pinned', 'archived', 'locked', 'removed', 'deleted',
                'is_self', 'is_video', 'is_original_content', 'title',
                'link_flair_text', 'upvote_ratio', 'score', 'gilded',
                'total_awards_received', 'num_comments', 'num_crossposts',
                'selftext', 'thumbnail', 'shortlink'
            ],
            'comment': [
                'submission',
                'subreddit',
                'comment',
                'author',
                'created',
                'retrieved'  # TODO fetch comments
            ]
        }[file_type]

        # read existing data
        df = self.read_data(file_type)
        if df.empty:
            df = pd.DataFrame(columns=columns).set_index(file_type)
        df = df.sort_values(by=['created', 'retrieved'])

        # load metadata
        idxs = list(df.index)
        for store in stores:
            df_store = store.read_data(file_type)

            # validate dataset
            if df_store.empty:
                continue
            df_store = df_store.sort_values(by=['created', 'retrieved'])

            # obtain existing items
            df_store_existing = df_store[df_store.index.isin(idxs)]
            df_store_existing = df_store_existing.sort_values(
                by=['created', 'retrieved'])

            # update last x hours based on retrospect time sliding window
            last_time = (df_store.iloc[0]['created']
                         if df_store_existing.empty
                         else df_store_existing.iloc[-1]['created'])
            update_time = last_time - (60 * 60 * self.retrospect_time)

            self.log(
                f'update data after {datetime.fromtimestamp(update_time)} from {store.name}'
            )

            # obtain fetch ids
            prefix = {'submission': 't3_', 'comment': 't1_'}
            df_store_update = df_store[df_store['created'] >= update_time]
            ids = list(prefix[file_type] + df_store_update.index)

            # process submissions
            if file_type == 'submission':

                # fetch data
                data = self.fetch(file_type, ids)

                # update submission data
                df_update = pd.DataFrame(data,
                                         columns=columns).set_index(file_type)
                df = df.combine_first(df_update)
                df.update(df_update)

                # updated data
                self.log(f'updated {df_update.shape[0]} {file_type}s')

        # convert datatypes
        df = df.convert_dtypes()
        df = df.sort_values(by=['created', 'retrieved'])

        # write data
        self.write_data(file_type,
                        df,
                        overwrite=True,
                        last_run=self.last_run[file_type])
        self.log(f'exported {df.shape[0]} {file_type}s')

        # export data
        file_path = os.path.join(self.root, 'data', 'export', self.subreddit,
                                 f'{file_type}.csv')
        os.makedirs(os.path.dirname(file_path), exist_ok=True)
        df.to_csv(file_path,
                  header=True,
                  index=True,
                  doublequote=True,
                  quoting=csv.QUOTE_NONNUMERIC,
                  sep=',',
                  encoding='utf-8')

    def fetch(self, file_type, ids):
        try:
            data = []

            # chunk id's into batches of size 100
            self.log(f'download {len(ids)} {file_type}s')
            batches = [ids[i:i + 100] for i in range(0, len(ids), 100)]
            for fullnames in tqdm(batches,
                                  desc=self.text() + 'fetching',
                                  unit_scale=100):
                now = datetime.now(timezone.utc).timestamp()

                # process submissions
                if file_type == 'submission':

                    # request data
                    submissions = self.reddit.info(fullnames=fullnames)

                    # parse submissions
                    data += [[
                        str(x.id),
                        str(self.subreddit),
                        str(x.author.name if x.author else '[deleted]'),
                        int(x.created_utc),
                        int(now),
                        int(x.edited),
                        int(x.pinned),
                        int(x.archived),
                        int(x.locked),
                        int(x.selftext == '[removed]'
                            or x.removed_by_category is not None),
                        int(x.selftext == '[deleted]'),
                        int(x.is_self),
                        int(x.is_video),
                        int(x.is_original_content),
                        str(x.title),
                        str(x.link_flair_text),
                        float(x.upvote_ratio),
                        int(x.score),
                        int(x.gilded),
                        int(x.total_awards_received),
                        int(x.num_comments),
                        int(x.num_crossposts),
                        str(x.selftext),
                        str(x.thumbnail),
                        str(x.shortlink)
                    ] for x in submissions]

                # wait for next request
                Sleep(0.35)

            return data

        except Exception as e:
            self.log(f'...request error {repr(e)}, retry')
            Sleep(1)

        return []
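The loader reads its parameters from a config['praw'] section (types, periode, retrospect_time); a hypothetical example of that configuration, with key names taken from the code above and values chosen purely for illustration:

# Hypothetical configuration for the Praw loader; values are assumptions.
config = {
    'praw': {
        'types': ['submission', 'comment'],  # file types to download
        'periode': 3600,                     # seconds to sleep between runs
        'retrospect_time': 24,               # hours covered by the sliding update window
    }
}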