Python CandidatesList.difference示例

编程语言: Python
命名空间/包名称: reddit_scraper.data_types
类/类型: CandidatesList
方法/功能: difference
hotexamples.com的示例: 1
Python CandidatesList.difference - 已找到1个示例。这些是从开源项目中提取的最受好评的reddit_scraper.data_types.CandidatesList.difference现实Python示例。您可以评价示例，以帮助我们提高示例质量。
常用方法
显示隐藏
difference(1)
update(1)
示例#1
显示文件
文件： plugin_interface.py 项目： oscillot/reddit-scraper
class PluginInterface():
    def __init__(self, database, candidates, output, filter_type='all',
                 categorize=False, nsfw=False):
        """
        The PluginInterface takes care of reading the plugins, determining
        which ones are valid, and iterating through them. It is a wrapper
        around the plugins that makes sure that they are valid subclasses of
        the BasePlugin class.

        :param str database: the prefix for the database filename.
        :param list candidates: a list of candidate json objects from the
        `class` RedditConnect class
        :param str output: the location on disk to store your images (note
        that this can be changed and the database data will handle all
        duplicate filtering
        """
        self.database = database
        self.candidates = candidates
        self.output = output
        self.type = filter_type
        self.categorize = categorize
        self.nsfw = nsfw
        if self.nsfw:
            nsfw_flag = 'enabled'
        else:
            nsfw_flag = 'disabled'
        print '\nFetching NSFW Images is %s.\n' % nsfw_flag
        self.config = get_config()
        #set up some class variables
        self.handled_posts = {}
        self.unhandled_posts = set()
        self.image_urls_already_fetched = None
        self.candidates_backup = CandidatesList(set())

    def hand_off_to_plugins(self):
        """Calls each plugin module and hand the CandidateList off to it.
        """
        for plugin in loaded_plugins:
            print 'Loading plugin: %s.\n' % plugin.__name__
            plug_inst = plugin(self.database, self.candidates, self.output,
                               self.type, self.config, self.categorize,
                               self.nsfw)
            self.handled_posts.update(plug_inst.handled_posts)

            #lazy instantiation so we only get it on the first loop
            if len(self.candidates_backup) == 0:
                # candidates backup is the original list of candidates
                self.candidates_backup.update(plug_inst.candidates_backup)

            #trim down the candidates from what got parsed
            self.candidates = plug_inst.revised

            #this shouldn't(?) change so assigning them each time is fine
            self.image_urls_already_fetched = \
                plug_inst.image_urls_already_fetched

            print '%s handled the following posts:\n' % plugin.__name__
            if len(plug_inst.handled_posts):
                for post in plug_inst.handled_posts:
                    print '%s (%s)' % \
                          (ensure_ascii(post.title, post.url))

                    print '\n\t...which provided the following image urls:\n'
                    for link in plug_inst.handled_posts[post]:
                        if link.duplicate:
                            print '\t%s (Duplicate)\n' % ensure_ascii(link.url)
                        elif link.skipped:
                            print '\t%s (Skipped)\n' % ensure_ascii(link.url)
                        else:
                            print '\t%s\n' % ensure_ascii(link.url)
            else:
                print 'None.'
            print '\n'

    def check_unhandled_posts(self):
        """
        Create a list of unhandled posts along with domains for those
        links which we output at the end to help target plugin
        development/maintenance
        """
        handled_posts = self.handled_posts.keys()
        unhandled_posts = self.candidates_backup.difference(handled_posts)

        for each in unhandled_posts:
            self.unhandled_posts.add(
                (extract_domain(each.url), '%s (%s)' %
                    (ensure_ascii(each.title, each.url))))

    def acquire(self):
        """
        Handles the calls to the database and the requests out to the world
        for the image candidates. This handles images returned directly as
        well as gzipped images and does all of the reads and writes to and
        from the database. Images that are already in the database are
        skipped. This is done on a filename only basis and is not very smart if
         say there are two different images called "image1.jpg" for example.

        :param list candidates: The list of dictionaries that is returned
        from :func: get_upvoted_wallpapers, where each 'url' key s a direct
        link to an image.
        :param str output: The location to save the downloaded images to as a
         string
        """
        if not os.path.exists(self.output):
            os.makedirs(self.output)

        #parse through links once and try to remove any unneeded plugins
        # (saves time once you have a lot of plugins)
        self.remove_unneeded_plugins()

        #parse links through plugins
        print '\nProcessing: parse links through plugins...'
        self.hand_off_to_plugins()

        print 'The following posts had links that were unhandled:'
        self.check_unhandled_posts()
        if len(self.unhandled_posts) > 0:
            #iterating through these sorted puts them in alpha order by domain
            #so you should be able to see which domains you want or need to
            # target
            for uh in sorted(self.unhandled_posts):
                print uh[0], '\t', ensure_ascii(uh[1])
            print '\n'
        else:
            print 'None\n'

        def filter_dupes(handled):
            unduped = set()
            for h in handled:
                if not h.duplicate:
                    unduped.add(h)
            return unduped

        def filter_new(handled):
            duped = set()
            for h in handled:
                if h.duplicate:
                    duped.add(h)
            return duped

        len_posts = len(self.handled_posts)
        len_urls = sum([len(self.handled_posts[p]) for p in self.handled_posts])
        len_new = sum([len(filter_dupes(self.handled_posts[p]))
                      for p in self.handled_posts])
        len_dupes = sum([len(filter_new(self.handled_posts[p]))
                        for p in self.handled_posts])
        c = PluginExceptionCounter.Instance()
        len_bad = c.get_count()
        print '\nComplete.' \
              '\n%d posts were processed.' \
              '\n%d urls were attempted.' \
              '\n%d new images were acquired this run.' \
              '\n%d were duplicate images.' \
              '\n%d were not handled or invalid.\n' \
              % (len_posts, len_urls, len_new, len_dupes, len_bad)
        #emit the exit code based on whether everything went well or not
        exit(len_bad)

    def remove_unneeded_plugins(self):
        plugins_count = {}
        for plugin in loaded_plugins:
            plugins_count[plugin] = 0
            for candidate in self.candidates:
                if plugin.url_matches(candidate['data']['url']):
                    plugins_count[plugin] += 1

        for plugin in plugins_count.keys():
            if plugins_count[plugin] == 0:
                print 'No matches for %s, unloading.' % plugin.__name__
                loaded_plugins.remove(plugin)