def get_new(self):
    """Spider reddit's /reddits/new listing and store unseen subreddits.

    Follows the listing's `after` pagination token until it runs out,
    creating a Subreddit row for every id not already in the database.
    Logs a scanned/new summary on both success and abort.

    Returns None in all cases; progress is reported via ``self.logger``.
    """
    new_reddits = 0
    first_uri = 'http://www.reddit.com/reddits/new/.json'
    current_uri = first_uri
    counter = 0
    while True:
        page = self._get_json(current_uri)
        if not page:
            # A failed fetch aborts the whole crawl; report progress so far.
            self.logger.error(
                'ERROR retrieving page %s. Spidering aborted.\n'
                '%s reddits scanned.\n%s new reddits found.'
                % (current_uri, counter, new_reddits))
            return
        reddits = page['data']['children']
        for reddit in reddits:
            reddit = reddit['data']
            # Renamed from `id` to avoid shadowing the builtin.
            subreddit_id = reddit['id']
            s = session.query(Subreddit).filter_by(id=subreddit_id).first()
            if not s:
                self.logger.info('new subreddit: %s' % reddit['url'])
                new_reddits += 1
                s = Subreddit()
                s.name = reddit['name']
                s.created = unix_string(int(reddit['created']))
                # Strips '/r/' prefix and trailing '/' from e.g. '/r/python/'.
                s.url = reddit['url'][3:-1]
                s.title = reddit['title']
                s.over18 = reddit['over18']
                s.subscribers = reddit['subscribers']
                s.id = reddit['id']
                s.description = reddit['description']
                # NOTE(review): no session.add(s) is visible here — presumably
                # the Subreddit constructor or session setup handles it; confirm.
                session.commit()
        counter += len(reddits)
        after = page['data']['after']
        current_uri = '%s?count=%s&after=%s' % (first_uri, counter, after)
        if not after:
            # No pagination token: we've reached the end of the listing.
            self.logger.info('Finished spidering.\n'
                             '%s reddits scanned.\n%s new reddits found.'
                             % (counter, new_reddits))
            return
def get_new(self):
    """Crawl the new-subreddits listing, persisting any subreddit not yet known.

    Pages through the JSON listing via the `after` token; every page's
    children are checked against the database and unknown ones are stored.
    Progress and completion are reported through ``self.logger``.
    """
    base = 'http://www.reddit.com/reddits/new/.json'
    uri = base
    scanned = 0
    found = 0
    while True:
        page = self._get_json(uri)
        if not page:
            self.logger.error(
                'ERROR retrieving page %s. Spidering aborted.\n'
                '%s reddits scanned.\n%s new reddits found.'
                % (uri, scanned, found))
            return
        children = page['data']['children']
        for child in children:
            data = child['data']
            known = session.query(Subreddit).filter_by(id=data['id']).first()
            if known:
                continue
            self.logger.info('new subreddit: %s' % data['url'])
            found += 1
            sub = Subreddit()
            sub.name = data['name']
            sub.created = unix_string(int(data['created']))
            sub.url = data['url'][3:-1]
            sub.title = data['title']
            sub.over18 = data['over18']
            sub.subscribers = data['subscribers']
            sub.id = data['id']
            sub.description = data['description']
            session.commit()
        scanned += len(children)
        after = page['data']['after']
        uri = '%s?count=%s&after=%s' % (base, scanned, after)
        if not after:
            self.logger.info('Finished spidering.\n'
                             '%s reddits scanned.\n%s new reddits found.'
                             % (scanned, found))
            return
def _scan_posts(self, url, newest):
    """Scan one page of a post listing for keyword mentions.

    Walks the posts newest-first, stopping at the first post whose id is
    <= ``newest`` (i.e. already processed on a previous pass).  For every
    unseen post, records a Mention row per matched keyword in the title
    or selftext.

    Returns a tuple ``(seen, after, next_newest)``:
      seen        -- count of already-seen posts on this page
      after       -- the listing's pagination token
      next_newest -- id of the newest post on this page (high-water mark
                     for the next scan); falls back to ``newest`` when
                     the page is empty
    """
    seen = 0
    # Bug fix: next_newest was unbound when the page had no posts,
    # raising UnboundLocalError at the return.  Default to the caller's
    # existing high-water mark instead.
    next_newest = newest
    data = self.spider._get_json(url)
    posts = data['data']['children']
    after = data['data']['after']
    for i, c in enumerate(posts):
        post = c['data']
        if i == 0:
            next_newest = post['id']
        if post['id'] <= newest:
            # Everything from here onward was seen on a previous pass.
            # NOTE(review): _scan_comments uses len(...) - i - 1 here —
            # one of the two is likely off by one; confirm intent.
            seen = len(posts) - i
            break
        title = post['title'].lower()
        selftext = post['selftext'].lower()
        for k in self._mentioned_keywords(title, text2=selftext):
            mention = Mention()
            mention.keyword_uid = k.uid
            mention.thread_id = post['id']
            mention.author = post['author']
            mention.subreddit = post['subreddit']
            mention.created = unix_string(int(post['created_utc']))
            session.commit()
    return (seen, after, next_newest)
def _scan_comments(self, url, newest):
    """Scan one page of a comment listing for keyword mentions.

    Mirrors ``_scan_posts`` but matches keywords against the comment
    body, and records the parent thread id (``link_id`` with its 't3_'
    type prefix stripped) alongside the comment id.

    Returns a tuple ``(seen, after, next_newest)``:
      seen        -- count of already-seen comments on this page
      after       -- the listing's pagination token
      next_newest -- id of the newest comment on this page; falls back
                     to ``newest`` when the page is empty
    """
    seen = 0
    # Bug fix: next_newest was unbound when the page had no comments,
    # raising UnboundLocalError at the return.  Default to the caller's
    # existing high-water mark instead.
    next_newest = newest
    data = self.spider._get_json(url)
    comments = data['data']['children']
    after = data['data']['after']
    for i, c in enumerate(comments):
        comment = c['data']
        if i == 0:
            next_newest = comment['id']
        if comment['id'] <= newest:
            # NOTE(review): _scan_posts uses len(...) - i (no -1) here —
            # one of the two is likely off by one; confirm intent.
            seen = len(comments) - i - 1
            break
        body = comment['body'].lower()
        for k in self._mentioned_keywords(body):
            mention = Mention()
            mention.keyword_uid = k.uid
            # link_id is 't3_<thread id>'; drop the 3-char type prefix.
            mention.thread_id = comment['link_id'][3:]
            mention.comment_id = comment['id']
            mention.author = comment['author']
            mention.subreddit = comment['subreddit']
            mention.created = unix_string(int(comment['created_utc']))
            session.commit()
    return (seen, after, next_newest)