Example No. 1
    def _fetch_css(self):
        logger.debug("Fetching css using {} threads".format(self.workers))
        workpool = WorkerPool(size=self.workers)

        if not os.path.exists(self.cache_dir):
            os.makedirs(self.cache_dir)

        for subreddit in self.subreddits:
            css_path = os.path.sep.join([self.cache_dir, subreddit + ".css"])
            if self.prefer_cache and os.path.exists(css_path):
                with open(css_path) as css_file:
                    # Python 2: decode the cached bytes to unicode.
                    css = css_file.read().decode("utf8")
                    self._handle_css(css, subreddit)
            else:
                workpool.put(
                    DownloadJob(
                        self._requests,
                        "http://www.reddit.com/r/{}/stylesheet".format(subreddit),
                        retry=5,
                        rate_limit_lock=self.rate_limit_lock,
                        callback=self._callback_fetch_stylesheet,
                        **{"subreddit": subreddit}
                    )
                )

        workpool.shutdown()
        workpool.join()
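All of the scraper examples on this page hand DownloadJob instances to the pool, but the job class itself never appears here. Below is a minimal sketch of what such a job might look like, assuming the workerpool Job interface (a run() method) plus the constructor arguments used above; the retry, rate-limit, and callback handling is a hypothetical reconstruction, not the original implementation.

    import time

    from workerpool import Job


    class DownloadJob(Job):
        # Hypothetical reconstruction of the DownloadJob used in these examples.

        def __init__(self, session, url, retry=1, rate_limit_lock=None,
                     callback=None, **callback_kwargs):
            self.session = session                  # a shared requests.Session
            self.url = url
            self.retry = retry                      # maximum number of attempts
            self.rate_limit_lock = rate_limit_lock  # throttles request frequency
            self.callback = callback                # invoked with the response
            self.callback_kwargs = callback_kwargs  # e.g. subreddit=..., image_path=...

        def run(self):
            response = None
            for attempt in range(self.retry):
                if self.rate_limit_lock is not None:
                    # Assumed protocol: acquire/release brackets each request.
                    self.rate_limit_lock.acquire()
                    try:
                        response = self.session.get(self.url)
                    finally:
                        self.rate_limit_lock.release()
                else:
                    response = self.session.get(self.url)
                if response.status_code == 200:
                    break
                time.sleep(2 ** attempt)  # back off before retrying
            if self.callback is not None:
                self.callback(response, **self.callback_kwargs)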
Example No. 2
    def _download_images(self):
        logger.debug("Downloading images using {} threads".format(self.workers))
        workpool = WorkerPool(size=self.workers)

        # cache emotes
        key_func = lambda e: e["background-image"]
        with self.mutex:
            for image_url, group in itertools.groupby(sorted(self.emotes, key=key_func), key_func):
                if not image_url:
                    continue

                file_path = self.get_file_path(image_url, rootdir=self.cache_dir)
                if not os.path.isfile(file_path):
                    # Normalize protocol-relative ("//host/...") URLs to https.
                    image_url = re.sub(r"^(https?:)?//", "https://", image_url)
                    workpool.put(
                        DownloadJob(
                            self._requests,
                            image_url,
                            retry=5,
                            rate_limit_lock=self.rate_limit_lock,
                            callback=self._callback_download_image,
                            **{"image_path": file_path}
                        )
                    )

        workpool.shutdown()
        workpool.join()
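Every download example here also passes a shared rate_limit_lock into DownloadJob, and its implementation is not shown either. One plausible way to build such a lock, sketched under the assumption that jobs bracket each request with acquire()/release() as in the DownloadJob sketch above; the class name and minimum interval are hypothetical.

    import threading
    import time


    class RateLimitLock(object):
        # Hypothetical: serializes requests and enforces a minimum gap between them.

        def __init__(self, min_interval=1.0):
            self._lock = threading.Lock()
            self._min_interval = min_interval
            self._last_request = 0.0

        def acquire(self):
            self._lock.acquire()
            # Sleep until at least min_interval has passed since the last request.
            wait = self._last_request + self._min_interval - time.time()
            if wait > 0:
                time.sleep(wait)

        def release(self):
            self._last_request = time.time()
            self._lock.release()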
Example No. 3
    def _download_images(self):
        logger.debug("Downloading images using {} threads".format(
            self.workers))
        workpool = WorkerPool(size=self.workers)

        # cache emotes
        key_func = lambda e: e['background-image']
        with self.mutex:
            for image_url, group in itertools.groupby(
                    sorted(self.emotes, key=key_func), key_func):
                if not image_url:
                    continue

                file_path = self.get_file_path(image_url,
                                               rootdir=self.cache_dir)
                if not os.path.isfile(file_path):
                    # Temp workaround: download APNGs straight from Amazon
                    # instead of broken ones from CloudFlare.
                    image_url = re.sub(r'^(https?:)?//',
                                       'https://s3.amazonaws.com/', image_url)
                    workpool.put(
                        DownloadJob(self._requests,
                                    image_url,
                                    retry=5,
                                    rate_limit_lock=self.rate_limit_lock,
                                    callback=self._callback_download_image,
                                    **{'image_path': file_path}))

        workpool.shutdown()
        workpool.join()
Example No. 4
    def fetch_css(self):
        logger.info('Beginning fetch_css()')

        logger.debug("Fetching css using {} threads".format(self.workers))
        workpool = WorkerPool(size=self.workers)

        for subreddit in self.subreddits:
            try:
                # Skip subreddits whose stylesheet is already in the session cache.
                css_subreddit_path = path.join(self.session_cache,
                                               subreddit.lower()) + '.css'
                with open(css_subreddit_path, 'r'):
                    pass
            except IOError:
                workpool.put(
                    DownloadJob(
                        self._requests,
                        'https://pay.reddit.com/r/{}/stylesheet'.format(
                            subreddit),
                        retry=5,
                        rate_limit_lock=self.rate_limit_lock,
                        callback=self._callback_fetch_stylesheet,
                        **{'subreddit': subreddit}))

        workpool.shutdown()
        workpool.join()
Example No. 5
    def download_images(self):
        logger.info('Beginning download_images()')
        logger.debug("Downloading images using {} threads".format(self.workers))
        workpool = WorkerPool(size=self.workers)

        def create_download_jobs(key_func):
            for image_url, group in itertools.groupby(sorted(self.emotes, key=key_func), key_func):
                if not image_url:
                    continue

                file_path = get_file_path(image_url, rootdir=self.reddit_cache)
                if not path.isfile(file_path):
                    workpool.put(DownloadJob(self._requests,
                                             urlparse.urljoin('https://s3.amazonaws.com/', image_url),
                                             retry=5,
                                             rate_limit_lock=self.rate_limit_lock,
                                             callback=self._callback_download_image,
                                             **{'image_path': file_path}))

        with self.mutex:
            create_download_jobs(lambda e: e['background-image'])
            create_download_jobs(lambda e: e.get('hover-background-image'))

        workpool.shutdown()
        workpool.join()
Example No. 6
    def _fetch_css(self):
        logger.debug("Fetching css using {} threads".format(self.workers))
        workpool = WorkerPool(size=self.workers)

        if not os.path.exists(self.cache_dir):
            os.makedirs(self.cache_dir)

        for subreddit in self.subreddits:
            if subreddit in self.legacy_subreddits:
                legacy_file = '{}/../legacy_css/{}.css'.format(
                    os.path.dirname(__file__), subreddit)
                if os.path.exists(legacy_file):
                    with open(legacy_file) as fh:
                        css = fh.read()
                        self._process_stylesheet_response(
                            200, css, "text/css", subreddit)
                else:
                    logger.error(
                        "No css file found for legacy subreddit {}".format(
                            subreddit))
            else:
                workpool.put(
                    DownloadJob(
                        self._requests,
                        'https://old.reddit.com/r/{}/stylesheet'.format(
                            subreddit),
                        retry=5,
                        rate_limit_lock=self.rate_limit_lock,
                        callback=self._callback_fetch_stylesheet,
                        **{'subreddit': subreddit}))

        workpool.shutdown()
        workpool.join()
Example No. 7
    def _download_images(self):
        logger.debug("Downloading images using {} threads".format(
            self.workers))
        workpool = WorkerPool(size=self.workers)

        # cache emotes
        key_func = lambda e: e['background-image']
        with self.mutex:
            for image_url, group in itertools.groupby(
                    sorted(self.emotes, key=key_func), key_func):
                if not image_url:
                    continue

                file_path = self.get_file_path(image_url,
                                               rootdir=self.cache_dir)
                if not os.path.isfile(file_path):
                    workpool.put(
                        DownloadJob(self._requests,
                                    image_url,
                                    retry=5,
                                    rate_limit_lock=self.rate_limit_lock,
                                    callback=self._callback_download_image,
                                    **{'image_path': file_path}))

        workpool.shutdown()
        workpool.join()
Example No. 8
    def _fetch_css(self):
        logger.debug("Fetching css using {} threads".format(self.workers))
        workpool = WorkerPool(size=self.workers)

        for subreddit in self.subreddits:
            workpool.put(DownloadJob(self._requests,
                                     'http://www.reddit.com/r/{}/stylesheet'.format(subreddit),
                                     retry=5,
                                     rate_limit_lock=self.rate_limit_lock,
                                     callback=self._callback_fetch_stylesheet,
                                     **{'subreddit': subreddit}))

        workpool.shutdown()
        workpool.join()
Example No. 9
    def _process_emotes(self):
        logger.debug("Processing emotes using {} threads".format(self.workers))
        workpool = WorkerPool(self.workers)

        key_func = lambda e: e['background-image']
        with self.mutex:
            for image_url, group in itertools.groupby(sorted(self.emotes, key=key_func), key_func):
                if not image_url:
                    continue

                workpool.put(
                    self.processor_factory.new_processor(scraper=self,
                                                         image_url=image_url,
                                                         group=list(group)))

        workpool.shutdown()
        workpool.join()
Example No. 10
def main():
    DOMAIN = "benchmark"
    conn = boto.connect_sdb()
    domain = conn.get_domain(DOMAIN)

    # Prepare item list
    items = []
    now = time.time()
    for i in domain:
        items.append(i)
    elapsed = time.time() - now

    if not items:
        print "No items found."
        return

    msg = "Fetched manifest of %d items in %f seconds, proceeding."
    print msg % (len(items), elapsed)

    # THE REAL MEAT:

    # Prepare the pool
    print "Initializing pool."

    def toolbox_factory():
        return SDBToolBox(DOMAIN)

    def worker_factory(job_queue):
        return EquippedWorker(job_queue, toolbox_factory)

    pool = WorkerPool(size=20, worker_factory=worker_factory)

    print "Starting to fetch items..."
    now = time.time()

    # Insert jobs
    results_queue = Queue()
    for i in items:
        j = SdbJob(results_queue, boto.sdb.domain.Domain.get_item, [i])
        pool.put(j)

    # Fetch results
    r = [results_queue.get() for i in items]
    elapsed = time.time() - now

    print "Fetched %d items paralleled in %f seconds." % (len(r), elapsed)

    pool.shutdown()
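Example No. 10 uses a different workerpool feature: a worker_factory that builds EquippedWorker instances, each carrying a per-thread toolbox created by toolbox_factory, so every worker reuses its own SimpleDB connection across jobs. Assuming EquippedWorker invokes each job as run(toolbox=...), the SDBToolBox/SdbJob pair it implies might look like the following; the class bodies are hypothetical reconstructions.

    import boto

    from workerpool import Job


    class SDBToolBox(object):
        # Per-worker state: one SimpleDB connection reused across many jobs.

        def __init__(self, domain_name):
            self.conn = boto.connect_sdb()
            self.domain = self.conn.get_domain(domain_name)


    class SdbJob(Job):
        # Calls `method(domain, *args)` and reports the result on a queue.

        def __init__(self, results_queue, method, args):
            self.results_queue = results_queue
            self.method = method
            self.args = args

        def run(self, toolbox):
            # EquippedWorker hands each job the worker's own toolbox, so the
            # unbound Domain.get_item above becomes get_item(toolbox.domain, item).
            result = self.method(toolbox.domain, *self.args)
            self.results_queue.put(result)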
Example No. 11
    def _download_images(self):
        logger.debug("Downloading images using {} threads".format(self.workers))
        workpool = WorkerPool(size=self.workers)

        # cache emotes
        key_func = lambda e: e['background-image']
        with self.mutex:
            for image_url, group in itertools.groupby(sorted(self.emotes, key=key_func), key_func):
                if not image_url:
                    continue

                file_path = self.get_file_path(image_url, rootdir=self.cache_dir)
                if not os.path.isfile(file_path):
                    # Temp workaround: download APNGs straight from Amazon
                    # instead of broken ones from CloudFlare.
                    image_url = image_url.replace('http://', 'https://s3.amazonaws.com/')
                    workpool.put(DownloadJob(self._requests,
                                             image_url,
                                             retry=5,
                                             rate_limit_lock=self.rate_limit_lock,
                                             callback=self._callback_download_image,
                                             **{'image_path': file_path}))

        workpool.shutdown()
        workpool.join()
Example No. 12
class Send(Job):
    def __init__(self, user, message):
        self.user = user
        self.message = message

    def run(self):
        api.CreateChatWith(self.user).SendMessage(self.message)


if __name__ == "__main__":
    pool = WorkerPool(size=pool_size)  # create new pool
    with open(message) as f:  # 'message' starts out as the path to the message file
        message = f.read()

    print "Sending message..."
    total = api.Friends.Count
    print "Total: %s" % total

    current = 0
    for user in list(api.Friends):
        job = Send(user.Handle, message)
        pool.put(job)
        current += 1  # count first so the progress readout ends at 100%
        print "Sending...  %3.2f" % (current * 100 / float(total))

    print "Shutting down..."
    pool.shutdown()  # close pool
    pool.wait()  # wait to finish

    print "Done."