Example #1
    def download_images(self):
        logger.info('Beginning download_images()')
        logger.debug("Downloading images using {} threads".format(self.workers))
        workpool = WorkerPool(size=self.workers)

        def create_download_jobs(key_func):
            for image_url, group in itertools.groupby(sorted(self.emotes, key=key_func), key_func):
                if not image_url:
                    continue

                file_path = get_file_path(image_url, rootdir=self.reddit_cache)
                if not path.isfile(file_path):
                    workpool.put(DownloadJob(self._requests,
                                         urlparse.urljoin('https://s3.amazonaws.com/', image_url),
                                             retry=5,
                                             rate_limit_lock=self.rate_limit_lock,
                                             callback=self._callback_download_image,
                                             **{'image_path': file_path}))

        with self.mutex:
            create_download_jobs(lambda e: e['background-image'])
            create_download_jobs(lambda e: e.get('hover-background-image'))

        workpool.shutdown()
        workpool.join()
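Most of the examples on this page share one lifecycle: build a pool with a worker count, put() job objects that expose a run() method, then shutdown() and join(). The sketch below is a minimal standard-library illustration of that interface, not the actual workerpool package; the class name and everything beyond the put()/shutdown()/join() surface are assumptions.

import threading

try:
    import queue           # Python 3
except ImportError:
    import Queue as queue  # Python 2, as several snippets below assume

class SimpleWorkerPool(object):
    """Bare-bones stand-in for WorkerPool: put()/shutdown()/join()."""

    _SHUTDOWN = object()  # sentinel telling a worker thread to exit

    def __init__(self, size=4):
        self._jobs = queue.Queue()
        self._threads = [threading.Thread(target=self._worker)
                         for _ in range(size)]
        for t in self._threads:
            t.start()

    def _worker(self):
        while True:
            job = self._jobs.get()
            if job is self._SHUTDOWN:
                break
            try:
                job.run()  # jobs are objects with run(), like DownloadJob
            except Exception:
                pass       # a real pool would log or retry here

    def put(self, job):
        self._jobs.put(job)

    def shutdown(self):
        # One sentinel per worker; workers exit after draining queued jobs.
        for _ in self._threads:
            self._jobs.put(self._SHUTDOWN)

    def join(self):
        for t in self._threads:
            t.join()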
Example #2
    def contract(self, jobs, result):
        """
        Perform a contract on a number of jobs and block until a result is
        retrieved for each job.
        """
        for j in jobs:
            WorkerPool.put(self, j)

        return [result.get() for _ in jobs]
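A hypothetical driver for the contract() method above: each job reports into a shared results queue, and contract() blocks until one result per job has arrived. SquareJob, the queue wiring, and the `pool` variable are illustrative and not part of the original code.

try:
    import queue
except ImportError:
    import Queue as queue

class SquareJob(object):
    """Toy job: computes n ** 2 and reports it to a shared results queue."""
    def __init__(self, results, n):
        self.results = results
        self.n = n

    def run(self):
        self.results.put(self.n ** 2)

results = queue.Queue()
jobs = [SquareJob(results, n) for n in range(10)]
# `pool` is any WorkerPool exposing contract() as defined above.
squares = pool.contract(jobs, results)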
Example #3
    def _fetch_css(self):
        logger.debug("Fetching css using {} threads".format(self.workers))
        workpool = WorkerPool(size=self.workers)

        if not os.path.exists(self.cache_dir):
            os.makedirs(self.cache_dir)

        for subreddit in self.subreddits:
            if subreddit in self.legacy_subreddits:
                legacy_file = '{}/../legacy_css/{}.css'.format(
                    os.path.dirname(__file__), subreddit)
                if os.path.exists(legacy_file):
                    with open(legacy_file) as fh:
                        css = fh.read()
                        self._process_stylesheet_response(
                            200, css, "text/css", subreddit)
                else:
                    logger.error(
                        "No css file found for legacy subreddit {}".format(
                            subreddit))
            else:
                workpool.put(
                    DownloadJob(
                        self._requests,
                        'https://old.reddit.com/r/{}/stylesheet'.format(
                            subreddit),
                        retry=5,
                        rate_limit_lock=self.rate_limit_lock,
                        callback=self._callback_fetch_stylesheet,
                        **{'subreddit': subreddit}))

        workpool.shutdown()
        workpool.join()
Example #4
    def fetch_css(self):
        logger.info('Beginning fetch_css()')

        logger.debug("Fetching css using {} threads".format(self.workers))
        workpool = WorkerPool(size=self.workers)

        for subreddit in self.subreddits:
            try:
                css_subreddit_path = path.join(self.session_cache,
                                               subreddit.lower()) + '.css'
                with open(css_subreddit_path, 'r'):
                    pass
            except IOError:
                # stylesheet not cached yet; queue a download
                workpool.put(
                    DownloadJob(
                        self._requests,
                        'https://pay.reddit.com/r/{}/stylesheet'.format(
                            subreddit),
                        retry=5,
                        rate_limit_lock=self.rate_limit_lock,
                        callback=self._callback_fetch_stylesheet,
                        **{'subreddit': subreddit}))

        workpool.shutdown()
        workpool.join()
Example #5
    def _download_images(self):
        logger.debug("Downloading images using {} threads".format(
            self.workers))
        workpool = WorkerPool(size=self.workers)

        # cache emotes
        key_func = lambda e: e['background-image']
        with self.mutex:
            for image_url, group in itertools.groupby(
                    sorted(self.emotes, key=key_func), key_func):
                if not image_url:
                    continue

                file_path = self.get_file_path(image_url,
                                               rootdir=self.cache_dir)
                if not os.path.isfile(file_path):
                    workpool.put(
                        DownloadJob(self._requests,
                                    image_url,
                                    retry=5,
                                    rate_limit_lock=self.rate_limit_lock,
                                    callback=self._callback_download_image,
                                    **{'image_path': file_path}))

        workpool.shutdown()
        workpool.join()
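The sorted-then-groupby idiom here is doing de-duplication: emotes that share a background image collapse to a single download job. In miniature:

import itertools

emotes = [{'background-image': 'a.png'},
          {'background-image': 'b.png'},
          {'background-image': 'a.png'}]
key_func = lambda e: e['background-image']
urls = [url for url, group in
        itertools.groupby(sorted(emotes, key=key_func), key_func)]
print(urls)  # ['a.png', 'b.png']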
Example #6
    def _download_images(self):
        logger.debug("Downloading images using {} threads".format(
            self.workers))
        workpool = WorkerPool(size=self.workers)

        # cache emotes
        key_func = lambda e: e['background-image']
        with self.mutex:
            for image_url, group in itertools.groupby(
                    sorted(self.emotes, key=key_func), key_func):
                if not image_url:
                    continue

                file_path = self.get_file_path(image_url,
                                               rootdir=self.cache_dir)
                if not os.path.isfile(file_path):
                    # Temporary workaround: fetch APNGs directly from Amazon
                    # S3, since the copies served through CloudFlare are broken
                    image_url = re.sub(r'^(https?:)?//',
                                       'https://s3.amazonaws.com/', image_url)
                    workpool.put(
                        DownloadJob(self._requests,
                                    image_url,
                                    retry=5,
                                    rate_limit_lock=self.rate_limit_lock,
                                    callback=self._callback_download_image,
                                    **{'image_path': file_path}))

        workpool.shutdown()
        workpool.join()
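The re.sub above rewrites both absolute and protocol-relative URLs onto the S3 host. A quick demonstration with made-up URLs:

import re

def to_s3(url):
    # Matches 'http://', 'https://', and bare '//' prefixes.
    return re.sub(r'^(https?:)?//', 'https://s3.amazonaws.com/', url)

print(to_s3('//a.thumbs.example.net/e.png'))
# https://s3.amazonaws.com/a.thumbs.example.net/e.png
print(to_s3('http://a.thumbs.example.net/e.png'))
# https://s3.amazonaws.com/a.thumbs.example.net/e.png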
Example #7
    @defer.inlineCallbacks
    def test_start(self):
        """Test the WorkerPool.start method."""

        @defer.inlineCallbacks
        def squaring_processor(n):
            """A fake processor that squares."""
            d = defer.succeed(n ** 2)
            result = yield d
            results.append(result)

        wp = WorkerPool(3)
        results = []

        yield wp.start(squaring_processor, range(10))
        self.assertEqual(results, [n ** 2 for n in range(10)])
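The class under test is not shown here. One plausible minimal implementation consistent with this test, assuming Twisted's DeferredSemaphore for bounding concurrency (an illustration, not the tested code):

from twisted.internet import defer

class WorkerPoolSketch(object):
    """Run `processor` over `iterable` with at most `size` concurrent calls."""
    def __init__(self, size):
        self._sem = defer.DeferredSemaphore(size)

    def start(self, processor, iterable):
        # Fires once every processor call has completed.
        return defer.gatherResults(
            [self._sem.run(processor, item) for item in iterable])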
Example #8
def main():
    DOMAIN = "benchmark"
    conn = boto.connect_sdb()
    domain = conn.get_domain(DOMAIN)

    # Prepare item list
    items = []
    now = time.time()
    for i in domain:
        items.append(i)
    elapsed = time.time() - now

    if not items:
        print "No items found."
        return

    msg = "Fetched manifest of %d items in %f seconds, proceeding."
    print msg % (len(items), elapsed)

    # THE REAL MEAT:

    # Prepare the pool
    print "Initializing pool."

    def toolbox_factory():
        return SDBToolBox(DOMAIN)

    def worker_factory(job_queue):
        return EquippedWorker(job_queue, toolbox_factory)

    pool = WorkerPool(size=20, worker_factory=worker_factory)

    print "Starting to fetch items..."
    now = time.time()

    # Insert jobs
    results_queue = Queue()
    for i in items:
        j = SdbJob(results_queue, boto.sdb.domain.Domain.get_item, [i])
        pool.put(j)

    # Fetch results
    r = [results_queue.get() for _ in items]
    elapsed = time.time() - now

    print "Fetched %d items paralleled in %f seconds." % (len(r), elapsed)

    pool.shutdown()
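The job-plus-results-queue pattern above predates concurrent.futures. On a modern interpreter the same fan-out/fan-in can be sketched as below; get_item stands for any callable that fetches one item, and this version does not replicate the per-thread SDB connection that EquippedWorker/SDBToolBox provide.

from concurrent.futures import ThreadPoolExecutor

def fetch_all(get_item, items, size=20):
    # Fan one fetch per item out across `size` threads; map() preserves order.
    with ThreadPoolExecutor(max_workers=size) as pool:
        return list(pool.map(get_item, items))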
Example #9
    def _fetch_css(self):
        logger.debug("Fetching css using {} threads".format(self.workers))
        workpool = WorkerPool(size=self.workers)

        if not os.path.exists(self.cache_dir):
            os.makedirs(self.cache_dir)

        for subreddit in self.subreddits:
            css_path = os.path.join(self.cache_dir, subreddit + ".css")
            if self.prefer_cache and os.path.exists(css_path):
                with open(css_path) as css_file:
                    css = css_file.read().decode("utf8")
                    self._handle_css(css, subreddit)
            else:
                workpool.put(
                    DownloadJob(
                        self._requests,
                        "http://www.reddit.com/r/{}/stylesheet".format(subreddit),
                        retry=5,
                        rate_limit_lock=self.rate_limit_lock,
                        callback=self._callback_fetch_stylesheet,
                        **{"subreddit": subreddit}
                    )
                )

        workpool.shutdown()
        workpool.join()
Example #10
    def _download_images(self):
        logger.debug("Downloading images using {} threads".format(self.workers))
        workpool = WorkerPool(size=self.workers)

        # cache emotes
        key_func = lambda e: e["background-image"]
        with self.mutex:
            for image_url, group in itertools.groupby(sorted(self.emotes, key=key_func), key_func):
                if not image_url:
                    continue

                file_path = self.get_file_path(image_url, rootdir=self.cache_dir)
                if not os.path.isfile(file_path):
                    image_url = re.sub(r"^(https?:)?//", "https://", image_url)
                    workpool.put(
                        DownloadJob(
                            self._requests,
                            image_url,
                            retry=5,
                            rate_limit_lock=self.rate_limit_lock,
                            callback=self._callback_download_image,
                            **{"image_path": file_path}
                        )
                    )

        workpool.shutdown()
        workpool.join()
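Note that this rewrite is subtler than the one in Example #6: it only forces https (and resolves protocol-relative URLs) rather than redirecting to S3. For example:

import re
print(re.sub(r"^(https?:)?//", "https://", "//cdn.example.net/e.png"))
# https://cdn.example.net/e.png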
Example #11
    def _fetch_css(self):
        logger.debug("Fetching css using {} threads".format(self.workers))
        workpool = WorkerPool(size=self.workers)

        for subreddit in self.subreddits:
            workpool.put(
                DownloadJob(
                    self._requests,
                    'http://www.reddit.com/r/{}/stylesheet'.format(subreddit),
                    retry=5,
                    rate_limit_lock=self.rate_limit_lock,
                    callback=self._callback_fetch_stylesheet,
                    **{'subreddit': subreddit}))

        workpool.shutdown()
        workpool.join()
Example #12
    def _process_emotes(self):
        logger.debug("Processing emotes using {} threads".format(self.workers))
        workpool = WorkerPool(self.workers)

        key_func = lambda e: e['background-image']
        with self.mutex:
            for image_url, group in itertools.groupby(sorted(self.emotes, key=key_func), key_func):
                if not image_url:
                    continue

                workpool.put(self.processor_factory.new_processor(scraper=self, image_url=image_url, group=list(group)))

        workpool.shutdown()
        workpool.join()
Example #13
    def _download_images(self):
        logger.debug("Downloading images using {} threads".format(self.workers))
        workpool = WorkerPool(size=self.workers)

        # cache emotes
        key_func = lambda e: e['background-image']
        with self.mutex:
            for image_url, group in itertools.groupby(sorted(self.emotes, key=key_func), key_func):
                if not image_url:
                    continue

                file_path = self.get_file_path(image_url, rootdir=self.cache_dir)
                if not os.path.isfile(file_path):
                    # Temporary workaround: fetch APNGs directly from Amazon
                    # S3, since the copies served through CloudFlare are broken
                    image_url = image_url.replace('http://', 'https://s3.amazonaws.com/')
                    workpool.put(DownloadJob(self._requests,
                                             image_url,
                                             retry=5,
                                             rate_limit_lock=self.rate_limit_lock,
                                             callback=self._callback_download_image,
                                             **{'image_path': file_path}))

        workpool.shutdown()
        workpool.join()
Example #14
            try:
                loc = GEO.gazetteer.get_locInfo(country=evt['Country'],
                                                admin=evt['State'],
                                                city=evt["City"])
                evt['expanded_loc'].append(loc)
            except Exception as e:
                try:
                    print("exception for event {}: {}".format(
                        evt['City'].encode('utf-8'), str(e)))
                except Exception:
                    pass

    return data


def annotate(ln):
    data = json.loads(ln)
    data = disambiguate_event_loc(data)
    data = GEO.annotate(data)
    return data


if __name__ == "__main__":
    import gzip
    with gzip.open("DME_eventsGeoExpanded_Oct24.mjson.gz", "w") as outf:
        with gzip.open(
                "/home/sathap1/workspace/eventClf/data/DME_langTimeGeoEnriched_events.mjson.gz"
        ) as inf:
            wp = WorkerPool(inf, outf, annotate, 300, maxcnt=None)
            wp.run()
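This WorkerPool has a different shape from the earlier ones: it takes an input stream, an output stream, a per-line function, and a worker count, and run() maps the function over every input line. A minimal illustration of that contract (StreamPool and its details are assumptions, not the original implementation):

import itertools
import json
from concurrent.futures import ThreadPoolExecutor

class StreamPool(object):
    """Map `func` over lines of `inf`, writing JSON results to `outf`."""
    def __init__(self, inf, outf, func, size, maxcnt=None):
        self.inf, self.outf = inf, outf
        self.func, self.size, self.maxcnt = func, size, maxcnt

    def run(self):
        lines = itertools.islice(self.inf, self.maxcnt)  # maxcnt=None -> all
        with ThreadPoolExecutor(max_workers=self.size) as pool:
            for result in pool.map(self.func, lines):
                if result is not None:
                    # assumes a text-mode output stream
                    self.outf.write(json.dumps(result) + "\n")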
Example #15
        msg = json.loads(doc)
    except ValueError:
        return None

    try:
        msg.pop('embersGeoCode', "")
        msg.pop('esGeo', "")
        msg = GEO.evaluate(msg)
        #msg['esGeo'] = msg['embersGeoCode']
        return msg
    except Exception:
        error.write(doc)

    return None


with gzip.open("Egypt_evaluated_embers4_tp5.txt.gz", "w") as outf:
    #with gzip.open("Colombia_evaluated_embers2.txt.gz") as inf:
    #with open("./Egypt_embers.txt") as inf:
    #with gzip.open("Egypt_evaluated_embers4_tt.txt.gz") as inf:
    with open("./twrong2.txt") as inf:
        #with gzip.open("/home/sathappan/shared/datasets/tweets_geocoded_byCountry/Colombia.txt.gz") as inf:
        #with gzip.open("/home/sathappan/shared/datasets/tweets_geocoded_byCountry/United_States.txt.gz") as inf:
        wp = WorkerPool(inf, outf, geotest, 10)
        #wp = WorkerPool(inf, outf, embersgeo, 1)
        wp.run()
        print(wp._true, wp._false, wp._nogeo, wp._notruth)
        #for l in inf:
        #    msg = geotest(l)
        #    outf.write(json.dumps(msg, ensure_ascii=False).encode("utf-8") + "\n")
Example #16
import json
import os
import signal

from tokenstore import TokenStore
from workerpool import WorkerPool

if __name__ == '__main__':
    with open("../WEBPAGES_RAW/bookkeeping.json", 'r') as fh:
        bookkeeping = json.load(fh)
    store = TokenStore()
    store.store_bookkeeping(bookkeeping)
    pool = WorkerPool(TokenStore(),
                      worker_num=32,
                      bookkeeping=bookkeeping,
                      mode=os.environ.get("INDEXER_MODE", "SERVER"))
    # capture signal.SIGINT and handle it with safe termination
    signal.signal(signal.SIGINT, lambda _s, _f: pool.safe_terminate())
    pool.execute()
Example #17
print "Connecting..."
api = Skype()  # create a Skype API instance
api.Attach()  # connect to Skype


class Send(Job):
    def __init__(self, user, message):
        self.user = user
        self.message = message

    def run(self):
        api.CreateChatWith(self.user).SendMessage(self.message)


if __name__ == "__main__":
    pool = WorkerPool(size=pool_size)  # create new pool
    message = open(message).read()

    print "Sending message..."
    total = api.Friends.Count
    print "Total: %s" % total

    current = 0
    for user in list(api.Friends):
        job = Send(user.Handle, message)
        pool.put(job)
        current += 1
        print "Sending...  %3.2f%%" % (current * 100 / float(total))

    print "Shutting down..."
    pool.shutdown()  # close pool
Example #18
    if args.cat:
        infile = sys.stdin
        outfile = sys.stdout
    else:
        infile = smart_open(args.infile)
        outfile = smart_open(args.outfile, "wb")

    lno = 0
    t1 = time.time()
    if args.parallel:

        ################
        ### Method-1 ###
        ################
        wp = WorkerPool(infile, outfile, tmpfun, 200)
        wp.run()

        # ################
        # ### Method-2 ###
        # ################
        # articles = Parallel(n_jobs=1, verbose=10)(delayed(tmpfun)(ln) for ln in infile)
        # with io.open(args.outfile, 'wb', encoding='utf8') as outfile:
        #     for ln in articles:
        #         # Convert Python Object (Dict) to JSON
        #         str_ = json.dumps(ln, sort_keys=True, ensure_ascii=False)
        #         outfile.write(to_unicode(str_) + "\n")


    else: