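# The class below is an excerpt and is shown without its module header. A minimal
# sketch of the imports it relies on is given here; the project-specific helpers
# (GeneralUtils, RedditData, StaticTemplates, ExternalDownload) come from the
# project's own modules, and the module-level `config` ConfigParser is an
# assumption about how load_scrape_config() is wired up.
import os
import socket
import threading
import traceback
import configparser
from queue import Queue

import praw  # PRAW 3.x API; praw.helpers.submission_stream is not available in PRAW 4+

# Assumed module-level parser used by load_scrape_config()
config = configparser.ConfigParser()
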
class RedditScraper(GeneralUtils):

    def __init__(self, reddit_data, save_path, num_threads, is_just_json):
        super().__init__('root')
        self.base_dir = self.norm_path(save_path)

        # Do we only want the json files?
        self.just_json = is_just_json

        # Thread life
        self.num_threads = num_threads
        self.q = Queue(maxsize=0)

        scraper_name = socket.gethostname()  # Name of scraper to put in the user agent
        self.reddit = RedditData(reddit_data, scraper_name)
        self.reddit.login()

        # We only need the static files if we are downloading the content as well
        if self.just_json is False:
            # Create a temp downloads folder
            self.download_path = self.create_save_path("temp", "downloads")

            # Add static templates to use
            self.static = StaticTemplates()

            # Create/update static assets
            self.gen_static_files()

            # Setup external scraper
            self.ed = ExternalDownload(self.base_dir, self.download_path, 'root')

            # Path of the failed-domains log file
            self.failed_domain_file = os.path.join(self.base_dir, 'logs', 'failed_domains.csv')

        # Dict of users and subreddits to scrape
        self.scrape = {}

        # load content into self.scrape
        self.load_scrape_config()

        # Run parser
        self.main()

        # Clean up
        self.cleanup()

    def main(self):
        ###
        # Spin up worker threads to process each post pulled from the stream
        ###
        for i in range(self.num_threads):
            worker = threading.Thread(target=self.post_worker)
            worker.daemon = True
            worker.start()

        try:
            stream = praw.helpers.submission_stream(self.reddit.r, 'all', None, 0)
            for item in stream:
                self.q.put(item)
            self.q.join()
        except KeyboardInterrupt:
            return

    def post_worker(self):
        """
        Thread worker: pull posts off the queue and parse them
        """
        while True:
            raw_post = self.q.get()
            try:
                self.parse_post(raw_post)
            except Exception as e:
                self.log("Exception while parsing post: " + str(e) + "\n" + str(traceback.format_exc()), level='critical')
            finally:
                self.q.task_done()

    def load_scrape_config(self):
        """
        Load scrape.ini config file into self.scrape
        This will run every n seconds to get any updates to the config in its own thread
        """
        # Read the scrape config file
        scrape_config_file = './configs/scrape.ini'
        if not os.path.isfile(scrape_config_file):
            self.cprint("\nScrape config file not found: " + scrape_config_file, log=True)
        config.read(scrape_config_file)

        temp_scrape = {'subreddits': [], 'users': [], 'content': {}}

        # Break down the params in the user and subreddit lists
        for feed in ['users', 'subreddits']:
            for entry in config['scrape'][feed].split("\n"):
                option = entry.lower().split(',')
                temp_scrape[feed].append(option[0].strip())
                if len(option) > 1:
                    temp_scrape['content'][option[0].strip()] = option[1].strip().lower()

        # Copy temp_scrape to self.scrape
        self.scrape = temp_scrape.copy()

        self.log("Reloaded scape config: " + str(self.scrape['subreddits']), level='debug')

        # Check to see if both the subreddit and user lists are blank
        #   If so, warn the user as there is nothing to scrape
        if (len(temp_scrape['users']) == 1 and temp_scrape['users'][0] == '') and \
           (len(temp_scrape['subreddits']) == 1 and temp_scrape['subreddits'][0] == ''):
            self.cprint("You have no users or subreddits in ./configs/scrape.ini", log=True)
        else:
            self.cprint("Searching for posts", log=True)

        # Reload again in n seconds
        t_reload = threading.Timer(10, self.load_scrape_config)
        t_reload.daemon = True
        t_reload.start()

    def parse_post(self, raw_post):
        """
        Process post
        """
        post = vars(raw_post)
        # Convert objects to strings
        if raw_post.author:
            post['author'] = raw_post.author.name
        else:
            post['author'] = '[deleted]'
        post['subreddit'] = str(raw_post.subreddit).lower()

        # Check if we even want this post
        if 'all' not in self.scrape['subreddits']:
            if post['subreddit'] not in self.scrape['subreddits'] and \
               post['author'].lower() not in self.scrape['users']:
                # This is not the post we are looking for, move along
                return

        # Check if we want only sfw or nsfw content from this subreddit
        if 'all' not in self.scrape['content']:
            if post['subreddit'] in self.scrape['content']:
                if self.scrape['content'][post['subreddit']] == 'nsfw' and post['over_18'] is False:
                    return
                elif self.scrape['content'][post['subreddit']] == 'sfw' and post['over_18'] is True:
                    return
        else:
            if self.scrape['content']['all'] == 'nsfw' and post['over_18'] is False:
                return
            elif self.scrape['content']['all'] == 'sfw' and post['over_18'] is True:
                return

        # Remove, we do not need this
        post.pop('reddit_session')

        self.cprint("Checking post: " + post['id'])

        created = self.get_datetime(post['created_utc'])
        y = str(created.year)
        m = str(created.month)
        d = str(created.day)
        utc_str = str(int(post['created_utc']))

        # Check here if we just want the json
        #   If we do save `post` to json file and move on
        if self.just_json:
            # The save path uses both the 3-letter prefix and the full subreddit
            # name as directories, so both need to be checked against the
            # reserved folder names in bad_folders

            sub = post['subreddit'][0:3]
            sub_dir = sub
            # If either the prefix or the full name is reserved, mangle the prefix directory
            if sub in self.bad_folders or post['subreddit'] in self.bad_folders:
                sub_dir = sub + "_r_" + sub
            # If the full name is reserved, swap it for the mangled version and keep the original
            if post['subreddit'] in self.bad_folders:
                post['subreddit_original'] = post['subreddit']
                post['subreddit'] = sub_dir

            # Create the .json save path; the filename will be <created_utc>_<id>.json
            # Directories are nested 3 letters deep (the minimum length of a subreddit name)
            self.log("Saving just json for subreddit: " + post['subreddit'], level='info')
            # Make sure the subreddit cannot create a folder like `con` (a reserved name on Windows)
            jjson_save_path = self.create_base_path('subreddits',
                                                    post['subreddit'][0:1],
                                                    post['subreddit'][0:2],
                                                    sub_dir,
                                                    post['subreddit'],
                                                    y, m, d
                                                    )
            # Save json data
            jjson_save_file = os.path.join(jjson_save_path, utc_str + "_" + post['id'] + ".json")
            try:
                self.save_file(jjson_save_file, post, content_type='json')
            except Exception as e:
                self.log("Exception [just_json]: " + post['subreddit'] + "\n" + str(e) + " " + post['id'] + "\n" + str(traceback.format_exc()), level='critical')
            # We are done here
            return

        # Check for bad folder names, only care about authors if we are saving content
        if post['author'] in self.bad_folders:
            post['author_original'] = post['author']
            post['author'] = post['author'] + "_u_" + post['author']

        if post['subreddit'] in self.bad_folders:
            post['subreddit_original'] = post['subreddit']
            post['subreddit'] = post['subreddit'] + "_r_" + post['subreddit']

        ###
        # Used for linking on other pages
        ###
        post['user_web_path'] = self.create_web_path(post['author'], path_type="user")
        post['post_web_path'] = self.create_web_path(post['author'], y, m, d, utc_str, path_type="post")
        ###
        # Used to save files/content
        ###
        post['user_save_path'] = self.create_base_path(post['user_web_path'])
        post['post_save_path'] = self.create_base_path(post['post_web_path'])

        post_json_file = os.path.join(post['post_save_path'], "post.json")

        ###
        # If we already have the post then skip it
        ###
        if os.path.isfile(post_json_file):
            return True

        ###
        # If there is no user json file, create new user
        ###
        if not os.path.isfile(os.path.join(post['user_save_path'], "user.json")):
            self.add_new_user(post)

        self.cprint("Getting post " + post['id'] + " by: " + post['author'])

        ###
        # Download thumbnail if there is one
        ###
        if len(post['thumbnail']) > 0 and post['thumbnail'].startswith('http'):
            post['thumbnail_original'] = post['thumbnail']
            download_response = self.ed.download(post['thumbnail_original'], post['user_save_path'])
            # If the thumbnail does not download then download_response will have length 0
            if len(download_response) > 0:
                thumbnail_download = download_response[0]
                post['thumbnail'] = self.save_to_web_path(thumbnail_download)

        ###
        # Process post data and download any media needed
        ###
        if post['is_self'] is False:
            # Try to save the content
            post = self.download_content(post)

        ###
        # Now save post data to json
        ###
        self.save_file(post_json_file, post, content_type='json')

        ###
        # Create post html file
        ###
        self.save_file(os.path.join(post['post_save_path'], "index.html"), self.static.gen_frame('post_viewer'), content_type='html')

        url_appends = []
        ###
        # Add post to user urls
        ###
        user_post_base = self.create_base_path('user', post['author'][0], post['author'], 'posts')
        url_appends.append(self.create_joined_path(user_post_base, y))
        url_appends.append(self.create_joined_path(user_post_base, y, m))
        url_appends.append(self.create_joined_path(user_post_base, y, m, d))

        ###
        # Add post to subreddit urls
        ###
        subreddit_post_base = self.create_base_path('subreddit', post['subreddit'][0], post['subreddit'])
        url_appends.append(self.create_joined_path(subreddit_post_base, y))
        url_appends.append(self.create_joined_path(subreddit_post_base, y, m))
        url_appends.append(self.create_joined_path(subreddit_post_base, y, m, d))

        ###
        # Append urls to correct urls.csv files
        ###
        for path in url_appends:
            self.append_file(os.path.join(path, 'urls.csv'), post['post_web_path'])
            self.check_view_index(path)
            # self.log("Added " + post['post_web_path'] + " to " + path, level='debug')

        # Done doing things here
        return True

    def add_new_user(self, post):
        """
        Add new user to the system
        """
        # self.log("Adding new user: "******"index.html"), self.static.gen_redirect("./posts"), content_type='html')

    def check_view_index(self, path):
        """
        Check if there is an index.html in each of year, month, and day directories
        If not, create one
        """
        index_view_file = os.path.join(path, 'index.html')
        if not os.path.isfile(index_view_file):
            # self.log("Creating view index at: " + index_view_file, level='debug')
            self.save_file(index_view_file, self.static.gen_frame('csv_viewer'), content_type='html')

    def create_web_path(self, base, *args, path_type=''):
        """
        Creates absolute path that will be used on the web server
        """
        path = ''
        if path_type == 'user' or path_type == 'post':
            path = "/user/" + base[0] + "/" + base + "/"
            if path_type == 'post':
                path += "posts/" + "/".join(args) + "/"
        else:
            path = "/" + "/".join(args)

        return path

    def gen_static_files(self):
        """
        Every run, create/update the static files
        """
        save_path_js = self.create_base_path("assets", "js")
        self.copy_file("./static_assets/js/jquery.js", os.path.join(save_path_js, "jquery.js"))
        self.copy_file("./static_assets/js/csvToArray.js", os.path.join(save_path_js, "csvToArray.js"))
        self.copy_file("./static_assets/js/functions.js", os.path.join(save_path_js, "functions.js"))

        save_path_css = self.create_base_path("assets", "css")
        self.copy_file("./static_assets/css/styles.css", os.path.join(save_path_css, "styles.css"))

        save_path_templates = self.create_base_path("assets", "templates")
        self.copy_file("./static_assets/templates/csv_viewer.html", os.path.join(save_path_templates, "csv_viewer.html"))
        self.copy_file("./static_assets/templates/post_viewer.html", os.path.join(save_path_templates, "post_viewer.html"))

    def cleanup(self):
        self.reddit.close()
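
# Minimal usage sketch: the constructor does all of the work (login, config
# loading, streaming, cleanup), so instantiating the class is enough to run the
# scraper. The argument values below, including the shape of `reddit_data`
# passed through to RedditData, are assumptions for illustration only.
if __name__ == '__main__':
    RedditScraper(
        reddit_data='./configs/reddit.ini',  # assumed: credentials source consumed by RedditData
        save_path='./reddit_archive',
        num_threads=4,
        is_just_json=True,
    )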