Example #1
class DouBanPipeline(object):
    pool = ThreadPool(1)
    mongo_client = MongoDBApi()

    @classmethod
    def save_item(cls, item):
        cls.pool.callInThread(cls.__save_item, item)

    @classmethod
    def __save_item(cls, item):
        try:
            comments = []
            if 'comments' in item:
                comments = item.pop('comments')
            insert_id = cls.mongo_client.insert_one(item)
            if insert_id:
                insert_id = ObjectId(insert_id)
                for index, comment in enumerate(comments):
                    comment['movie_id'] = insert_id
                    comments[index] = comment
                if comments:
                    insert_ids = cls.mongo_client.insert_many(
                        comments, 'movie_comments')
                logging.warning('======== saved one record =======\n')
            else:
                logging.warning('======== failed to save record =======\n')
        except Exception:
            logging.error(traceback.format_exc())
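
The pipeline above hands each Mongo write to a pool thread so the caller never blocks on the database. The same deferred-save pattern can be sketched with the standard library's concurrent.futures instead of an explicit ThreadPool; this is only a rough, runnable illustration, and save_func stands in for the MongoDBApi wrapper used above:

from concurrent.futures import ThreadPoolExecutor
import logging
import traceback

# A single worker mirrors ThreadPool(1) above and keeps writes ordered.
_executor = ThreadPoolExecutor(max_workers=1)


def save_item(item, save_func):
    """Queue the blocking save without stalling the caller."""
    _executor.submit(_save_item, item, save_func)


def _save_item(item, save_func):
    try:
        comments = item.pop('comments', [])   # same comments/movie split as above
        movie_id = save_func(item)            # e.g. an insert that returns the new id
        for comment in comments:
            comment['movie_id'] = movie_id
    except Exception:
        logging.error(traceback.format_exc())
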
Example #2
def jp2_to_jpeg(_threads, _app, _source, _destination, _broken, _jpegs,
                _verbose):
    testApp(_app)

    t = ThreadPool(_threads)

    for (root, dirs, files) in os.walk(_destination):
        subpath = root.replace(_destination, '').lstrip('/')
        if _broken not in subpath:
            if any(".jp2" in s for s in files):
                print('Converting contents of ' + subpath + ' from JP2 to JPEG', file=emaillog)
            for (output_file, size) in _jpegs:
                for file in files:
                    if file.endswith('.jp2'):
                        jp2 = os.path.join(root, file)
                        newfile = os.path.join(root,
                                  os.path.splitext(file)[0]) + '_' \
                                  + output_file
                        command = _app + ' -size ' + size + " " + jp2 \
                                  + ' -resize ' + size + ' ' + newfile
                        if _verbose:
                            print('Creating ' + newfile)
                        t.add_task(executeConversion, command, None, jp2,
                                   _source, _broken, file, newfile)
                t.await_completion()
Example #3
    def __init__(self, thread_count: int, host: str, port: str, db_name: str,
                 user: str, channel_name: str) -> None:
        """
        Class constructor.

        Initializes:
         - the number of threads
         - the database connection
         - a single thread pool

        :param thread_count: number of threads in the thread pool
        :param host: hostname where the database is deployed
        :param port: database connection port
        :param db_name: database name
        :param user: role used to connect to the database
        :param channel_name: name of the channel that receives messages from the database
        """
        self._host = host
        self._port = port
        self._db_name = db_name
        self._user = user
        self._thread_count = thread_count
        self._channel_name = channel_name
        self._e = self.connect()
        self.pool_task = ThreadPool(self._thread_count)
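
For context, constructing such a listener might look like the snippet below; the class name NotificationListener and every connection value are illustrative assumptions, since the excerpt shows only the constructor:

# Hypothetical instantiation of the constructor above; the class name and
# all connection values are assumptions, not taken from the project.
listener = NotificationListener(
    thread_count=4,
    host="localhost",
    port="5432",
    db_name="appdb",
    user="listener_role",
    channel_name="events",
)
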
Example #4
def test():
    print('start testing')
    wm = ThreadPool(10)
    for i in range(1):
        wm.add_job(test_job, i, i * 0.001)
    wm.wait_for_complete()
    print('end testing')
Example #5
def main(event, lambdacontext):
    starttime = time.time()    
    queue_url = event.get(c.KEY_SQS_QUEUE_URL, None)        
    print("Started consumer with queue url '{}'".format(queue_url))
    context = event.get("context", {})        
    context[c.KEY_SQS_QUEUE_URL] = queue_url        
    context[c.KEY_LAMBDA_FUNCTION] = lambdacontext.function_name if hasattr(lambdacontext, 'function_name') else None
    context[c.KEY_REQUEST_ID] = lambdacontext.aws_request_id if hasattr(lambdacontext, 'aws_request_id') else None
    context[c.KEY_IS_LAMBDA_ENV] = context[c.KEY_REQUEST_ID] is not None
      
    prefix = util.get_stack_name_from_arn(os.environ[c.ENV_DEPLOYMENT_STACK_ARN])    

    context[c.KEY_STACK_PREFIX] = prefix
    context[c.KEY_SQS] = Sqs(context, "{0}_".format(prefix))
    context[c.KEY_SQS_AMOEBA] = Sqs(context, "{0}{1}_".format(prefix, c.KEY_SQS_AMOEBA_SUFFIX))
    context[c.KEY_SQS_AMOEBA].set_queue_url(lowest_load_queue=True)    
    context[c.KEY_LAMBDA] = Lambda(context)
    context[c.KEY_CLOUDWATCH] = CloudWatch(context)
    
    context[c.KEY_THREAD_POOL] = ThreadPool(context, 8)               
    context[c.KEY_METRIC_BUCKET] = os.environ[c.RES_S3_STORAGE]            
    
    context[c.KEY_START_TIME] = starttime
    context[c.CW_ATTR_SAVE_DURATION] = context[c.KEY_CLOUDWATCH].avg_save_duration(util.get_cloudwatch_namespace(os.environ[c.ENV_DEPLOYMENT_STACK_ARN]))
    context[c.CW_ATTR_DELETE_DURATION] = context[c.KEY_CLOUDWATCH].avg_delete_duration(util.get_cloudwatch_namespace(os.environ[c.ENV_DEPLOYMENT_STACK_ARN]))    
          
    context[c.KEY_SUCCEEDED_MSG_IDS] = []
    process(context)    
    del context
    gc.collect()
    return {        
        'StatusCode': 200        
    }
Example #6
def pool_time(thread_num):
    start = time.perf_counter()
    tp = ThreadPool(thread_num)
    for i in range(5):
        tp.add_task(time.sleep, i)
    tp.wait_completion()
    return time.perf_counter() - start
Example #7
def download():
    lines = ["Topics"]
    thread_pool = ThreadPool()
    d = False

    if enable_proxie[0]:
        refresh_proxie()

    filename = datetime.datetime.now().strftime("%d-%m-%Y %H-%M-%S") + '.txt'
    for i, enable in enumerate(download_enables):
        if enable:
            thread_pool.give_task(download_concrete_page,
                                  args=(download_hrefs[i], lines))
            d = True

    thread_pool.join()
    if d:
        with open(filename, 'w') as file:
            file.write('\n'.join(lines))

        print(f'All chosen topics are saved to {filename}')
        to_main_menu()
    else:
        print("Nothing is chosen")
    input("Press <Enter> to continue")

    return True
Example #8
def tif_to_jp2(_threads, _app, _source, _destination, _broken, _options,
               _verbose):

    testApp(_app)

    t = ThreadPool(_threads)

    for (root, dirs, files) in os.walk(_source):
        subpath = root.replace(_source, '').lstrip('/')
        if _broken not in subpath:
            jp2Path = os.path.join(_destination, subpath)
            makeDir(jp2Path)
            if any(".tif" in s for s in files):
                print('Converting contents of ' + subpath + ' from TIF to JP2', file=emaillog)
            for file in files:
                if file.endswith('.tif'):
                    tiff = os.path.join(root, file)
                    jp2 = os.path.join(_destination, subpath,
                                       os.path.splitext(file)[0] + '.jp2')
                    tiffcopy = os.path.join(_destination, subpath, file)
                    command = _app + ' -i ' + tiff + ' -o ' + jp2 + ' ' \
                        + _options
                    command_post = 'shutil.move(\'' + tiff + '\',\'' + tiffcopy + '\')'
                    if _verbose:
                        print('Creating ' + jp2)
                    t.add_task(executeConversion, command, command_post, tiff,
                               _destination, _broken, file, jp2)
        t.await_completion()
Example #9
    def start(self):
        if not self.db_oper.is_enabled():
            return

        repo_list = self.db_oper.get_repo_list()
        if repo_list is None:
            self.db_oper.close_db()
            return

        thread_pool = ThreadPool(self.scan_virus, self.settings.threads)
        thread_pool.start()

        for row in repo_list:
            repo_id, head_commit_id, scan_commit_id = row

            if head_commit_id == scan_commit_id:
                logger.debug('No change occur for repo %.8s, skip virus scan.',
                             repo_id)
                continue

            thread_pool.put_task(
                ScanTask(repo_id, head_commit_id, scan_commit_id))

        thread_pool.join()

        self.db_oper.close_db()
Example #10
    def testcase_ThreadPool_init_thread_pool_success(self):
        """Test case 2: the constructor initializes the thread pool."""
        jobs = [str(i) for i in xrange(2)]
        pool = ThreadPool(3, test_function, jobs, 0)
        thread_count = len(pool.threads)
        self.assertEqual(3, thread_count)
        pool.wait_allcomplete()
Example #11
def iterate(_source, _ignore, _patron, _patron_zip, _threads):
    print('Descend into ' + _source)

    t = ThreadPool(_threads)

    for (root, dirs, files) in os.walk(_source):
        t.add_task(patron_bundle, _patron, _patron_zip, root)
    t.await_completion()
Example #12
    def __init__(self, config_name="config.json"):
        self.__config = ServerConfig(config_name)
        logging.basicConfig(filename=self.__config.log_file,
                            level=logging.DEBUG,
                            format='%(asctime)s %(message)s')
        self.thread_pool = ThreadPool()
        cache_dir = Path.cwd() / self.__config.cache_dir
        self.cache = CacheStorage(cache_dir)
        self.request_handler = RequestHandler(self.cache)
Example #13
    def test_results(self):
        def my_add(a, b):
            return a + b

        tp = ThreadPool(5)
        for i in range(5):
            tp.add_task(my_add, i, i)
        d = tp.wait_completion()
        vals = sorted(d.values())
        assert vals == [0, 2, 4, 6, 8]
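
Several examples here rely on a small ThreadPool exposing add_task() and a wait_completion() that returns the collected results, as in the test just shown, but none of the excerpts include the class itself. The following is only a minimal sketch of what such a pool could look like, built on the standard library's queue and threading modules; the actual implementations in these projects will differ:

import queue
import threading


class ThreadPool(object):
    """Illustrative pool only: add_task() enqueues work and
    wait_completion() blocks, then returns {task_index: result}."""

    def __init__(self, num_threads):
        self.tasks = queue.Queue()
        self.results = {}
        self._lock = threading.Lock()
        self._count = 0
        for _ in range(num_threads):
            threading.Thread(target=self._worker, daemon=True).start()

    def _worker(self):
        while True:
            index, func, args, kwargs = self.tasks.get()
            try:
                value = func(*args, **kwargs)
            except Exception as exc:  # a failing task must not kill the worker
                value = exc
            with self._lock:
                self.results[index] = value
            self.tasks.task_done()

    def add_task(self, func, *args, **kwargs):
        with self._lock:
            index = self._count
            self._count += 1
        self.tasks.put((index, func, args, kwargs))

    def wait_completion(self):
        self.tasks.join()
        return self.results
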
Example #14
    def threadstart(self):
        self.tp = ThreadPool(10)
        self.tpool = Thread(target=self.startThreadPool, args=())
        self.tpool.daemon = True
        self.tpool.start()

        self.tdetect = Thread(target=self.detectConnect, args=())
        self.tdetect.daemon = True
        self.tdetect.start()

        self.ttimeout = Thread(target=self.detectConnectTimeOut, args=())
        self.ttimeout.daemon = True
        self.ttimeout.start()
Example #15
def main():
    store_list = load_stores()
    thread_pool = ThreadPool(size=20)
    pos = 0
    total = len(store_list)
    for store in store_list:
        pos += 1
        task = SlotStateFetchTask(store, pos=pos, total=total)
        thread_pool.push_task(task)
    thread_pool.init_pool()
    thread_pool.start()
    print('Waiting for tasks exit!!!')
    thread_pool.join()
Example #16
def start_tasks():
    stores = load_stores()
    thread_pool = ThreadPool(size=20)
    total = len(stores)
    pos = 0
    for store in stores:
        pos += 1
        task = UnderLoadSlotZeroTask(store=store, total=total, pos=pos)
        thread_pool.push_task(task)
    thread_pool.init_pool()
    thread_pool.start()
    print('Waiting for task exit!')
    thread_pool.join()
Example #17
    def testcase_ThreadPool_set_work_queue_success(self):
        """Test case 1: the constructor sets up the work queue successfully."""
        jobs = [str(i) for i in xrange(2)]
        pool = ThreadPool(2, test_function, jobs, 0)
        while True:
            try:
                func, param = pool.work_queue.get(block=False)
                res = func(param)
                self.assertEqual(str(0), res)
            except Queue.Empty as e:
                self.logging.info(e)
                break
        pool.wait_allcomplete()
Example #18
def start_tasks():
    thread_pool = ThreadPool(size=20)
    store_list = load_stores()
    total_count = len(store_list)
    count = 0
    for store in store_list:
        count += 1
        task = FetcherTask(store=store, num=count, total=total_count)
        thread_pool.push_task(task)
    thread_pool.init_pool()
    thread_pool.start()
    print('Waiting Task Finished......')
    thread_pool.join()
Example #19
def main():
    logging.basicConfig(filename='debug.log', filemode='w')
    with Imap(url) as imap:
        imap.login(address, password)
        logging.info('Logged in')
        folders = imap.get_folders()
        # print('Found folders:', len(folders))
        # all_uids = get_uids_and_count(imap, folders)
    ui = UI()
    # 15 - imap simultaneous connections limit

    tasks = [(process_messages, (folder, ui)) for folder in folders]
    pool = ThreadPool(max=15)
    pool.run(tasks, delay=1)
Example #20
def main():
    store_list = load_stores()
    thread_pool = ThreadPool(size=20)
    index = 0
    total = len(store_list)
    for store in store_list:
        index += 1
        task = CompensationDisableTask(store=store, index=index, total=total)
        thread_pool.push_task(task)
    thread_pool.init_pool()
    print('Starting tasks...')
    thread_pool.start()
    print('Waiting for task exit!')
    thread_pool.join()
Example #21
    def testcase_ThreadPool_get_result_success(self):
        """Test case 3: get_result; after all tasks finish the results sum to 1."""
        jobs = [i for i in xrange(2)]
        pool = ThreadPool(3, test_function, jobs, 0)
        pool.wait_allcomplete()
        total = 0
        while True:
            try:
                res = pool.get_result()
                arr_res = json.loads(res)
                total += int(arr_res['url'])
            except Queue.Empty as e:
                self.logging.info(e)
                break
        self.assertEqual(1, total)
Example #22
def test_thread_pool():
    """
    thread pool should be able to handle task processing
    """
    thread_pool = ThreadPool()
    result = []

    def populate_result_task():
        result.extend([i for i in range(0, 10)])
        return

    thread_pool.add_task(populate_result_task)
    thread_pool.tasks.join()
    thread_pool.terminate_all_workers()
    assert result == [0, 1, 2, 3, 4, 5, 6, 7, 8, 9]
Example #23
def generate_threads(functionid, threads_count, iterations_per_thread, events_per_iteration, sleep_duration, use_lambda, event_type, sensitivity_type, compression_mode):
    start = time.time()       
    context = {}            
    threadpool = ThreadPool(context, threads_count)  
    context=dict({})        
    db = DynamoDb(context) 
    print("Sleep durations: ", sleep_duration)
    print("Number of threads: ", threads_count)
    print("Number of iterations per thread: ", iterations_per_thread)
    print("Number of events per iteration: ", events_per_iteration)
    print("Using event type: ", event_type)
    print("Using sensitivity type: ", sensitivity_type)
    print("Using compression mode: ", compression_mode)
    for i in range(0, threads_count):          
        threadpool.add(thread_job, functionid, iterations_per_thread, events_per_iteration, use_lambda, context, sleep_duration, event_type, sensitivity_type, compression_mode)                                                    
    threadpool.wait()      
    print("A total of {} metrics have been sent to the FIFO queues.".format((iterations_per_thread * events_per_iteration) * threads_count))
    print("The overall process took {} seconds.".format(time.time() - start))
Example #24
def main(event, request):
    context = dict({})
    context[c.KEY_LAMBDA_FUNCTION] = request.function_name if hasattr(
        request, 'function_name') else None
    context[c.KEY_REQUEST_ID] = request.aws_request_id if hasattr(
        request, 'aws_request_id') else None
    stackid = os.environ[c.ENV_DEPLOYMENT_STACK_ARN]

    context[c.KEY_DB] = DynamoDb(context)
    context[c.KEY_ATHENA_QUERY] = Query(stackid)
    context[c.KEY_GLUE_CRAWLER] = Glue()
    thread_pool = ThreadPool(size=3)
    crawler_name = context[c.KEY_GLUE_CRAWLER].get_crawler_name(stackid)
    crawler = Crawler(context, os.environ[c.ENV_S3_STORAGE])
    glue = Glue()
    events = glue.get_events()

    start = datetime.datetime.utcnow() - datetime.timedelta(hours=2)
    now = datetime.datetime.utcnow()

    found = False
    for type in events:
        dt = start
        while dt <= now:
            prefix = metric_schema.s3_key_format().format(
                context[c.KEY_SEPERATOR_PARTITION], dt.year, dt.month, dt.day,
                dt.hour, type, dt.strftime(util.partition_date_format()))
            found = crawler.exists(prefix)
            if found:
                print("FOUND new events=>", prefix)
                break
            dt += timedelta(hours=1)
        if found:
            break

    if found:
        thread_pool.add(crawl, context, crawler_name,
                        context[c.KEY_ATHENA_QUERY].execute_with_format)
        thread_pool.wait()

    return custom_resource_response.success_response({}, "*")
Example #25
def launch(event, lambdacontext):
    print("Start")
    hours_delta = 36
    context = dict({})
    context[c.KEY_LAMBDA_FUNCTION] = lambdacontext.function_name if hasattr(
        lambdacontext, 'function_name') else None
    context[c.KEY_REQUEST_ID] = lambdacontext.aws_request_id if hasattr(
        lambdacontext, 'aws_request_id') else None
    global threadpool
    global is_lambda
    threadpool = ThreadPool(context, 8)
    is_lambda = context[c.KEY_REQUEST_ID] is not None
    available_amoeba_lambdas = []
    available_amoeba_lambdas.append(c.ENV_AMOEBA_1)
    available_amoeba_lambdas.append(c.ENV_AMOEBA_2)
    available_amoeba_lambdas.append(c.ENV_AMOEBA_3)
    available_amoeba_lambdas.append(c.ENV_AMOEBA_4)
    available_amoeba_lambdas.append(c.ENV_AMOEBA_5)
    db = DynamoDb(context)
    crawler = Crawler(context, os.environ[c.ENV_S3_STORAGE])
    glue = Glue()

    events = glue.get_events()
    #TODO: adjust the amoeba tree depth so that we have fully utilized all available amoebas; len(available_amoeba_lambdas) * 1000
    #since the number of leaf nodes for the metric partitions can quickly get very large we use a 5 lambda pool to ensure we don't hit the 1000 invocation limit.

    start = datetime.datetime.utcnow() - datetime.timedelta(hours=hours_delta)
    now = datetime.datetime.utcnow()

    for type in events:
        dt = start
        while dt <= now:
            prefix = metric_schema.s3_key_format().format(
                context[c.KEY_SEPERATOR_PARTITION], dt.year, dt.month, dt.day,
                dt.hour, type, dt.strftime(util.partition_date_format()))
            threadpool.add(crawler.crawl, prefix, available_amoeba_lambdas,
                           invoke_lambda)
            dt += timedelta(hours=1)

    threadpool.wait()
    return custom_resource_response.success_response({"StatusCode": 200}, "*")
Example #26
def test_thread_pool_with_exception():
    """
    thread pool should be able to handle task processing
    even if there were exceptions in some tasks
    """
    thread_pool = ThreadPool()
    result = []

    def throw_ex_task():
        raise Exception()

    def populate_result_task():
        result.extend([i for i in range(0, 10)])
        return

    thread_pool.add_task(throw_ex_task)
    thread_pool.add_task(populate_result_task)

    thread_pool.tasks.join()
    thread_pool.terminate_all_workers()

    assert result == [0, 1, 2, 3, 4, 5, 6, 7, 8, 9]
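
The property this test pins down, that one failing task must not stop the remaining tasks from completing, is also what concurrent.futures provides out of the box, since each worker records the exception on its Future instead of dying. A small stand-alone sketch, separate from the ThreadPool class used in these examples:

from concurrent.futures import ThreadPoolExecutor


def throw_ex_task():
    raise Exception("boom")


def populate_result_task():
    return list(range(10))


with ThreadPoolExecutor(max_workers=2) as pool:
    failing = pool.submit(throw_ex_task)
    working = pool.submit(populate_result_task)

# The exception is captured on the future; the other task still finishes.
assert isinstance(failing.exception(), Exception)
assert working.result() == [0, 1, 2, 3, 4, 5, 6, 7, 8, 9]
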
Example #27
            for i in range(0, len(item)):
                c = item[i].decode("gb2312")
                if i == 0:
                    l.append(c)
                else:
                    if c[0] == "&":
                        l.append(0)
                    else:
                        l.append(1)
            rooms.append(l)
        with open(
                "data/" + campus + "." + building + "." + week + "." +
                week_day + ".json", "w") as f:
            f.write(json.dumps(rooms))
        print("finish: week:" + week + " week_day:" + week_day)
        return "success"


if __name__ == "__main__":
    s = Spider()
    s.cookies = {"JSESSIONID": "8B7DA565F71772D37B04170241A757A8.TAB2;"}
    pool = ThreadPool(size=20)
    pool.start()

    for week in range(1, 21):
        for week_day in range(1, 8):
            print("start week:" + str(week) + " week_day:" + str(week_day))
            # Make sure the campus id and building id in info.py are correct,
            # then adjust the campus and building ids to match the data in info.py
            pool.append_job(s.craw, "1709", "1783", str(week), str(week_day))
    pool.join()
Example #28
    # In range, start, end(plus 1 to include end) and steps
    pool_size = [x for x in range(min_thread, max_thread, thread_step)]
    # Create dict with thread sizes to keep track of time
    for thread_count in pool_size:
        times[thread_count] = []

    for i in pool_size:
        if ovrld.overloaded:
            i = ovrld.opt_work_threads
        if need_count and i == pool_size[-1] \
                or need_count and ovrld.overloaded and i == ovrld.opt_work_threads:
            clients = i * calculate_needed()
        else:
            clients = i * count

        pool = ThreadPool(i)

        # Clients is the final goal, it'll run the thread count for "count" iterations
        # count is from config
        sched_clients += clients
        while clients:
            # Change to your desired function...
            pool.add_task(time_event, xmlrpc_call, i)
            clients -= 1
            total_clients += 1
            if errors.error_count > errors_threshold:
                quit()
        pool.wait_completion()
        avg_time = sum(times[i]) / len(times[i])
        ovrld.calc_time(avg_time, i)
Example #29
#!/usr/bin/env python
# coding:utf-8
from thread_pool import ThreadPool
import hackhttp
import re
import os

hh = hackhttp.hackhttp(hackhttp.httpconpool(500))
tp = ThreadPool(500)
package = "wooyun"

if not os.path.exists(package):
    os.mkdir(package)


def vlun(wid):
    print("[+]%s" % wid)
    if os.path.isfile(wid + ".html"):
        return
    _, _, html, _, _ = hh.http(url="http://wooyun.org/bugs/%s" % wid,
                               cookcookie=False)
    with open(package + "/" + wid + '.html', 'wb') as f:
        f.write(html)


def catalog(page):
    _, _, html, _, _ = hh.http(
        url="http://wooyun.org/bugs/new_public/page/%d" % page,
        cookcookie=False)
    for wid in re.findall(r'href="/bugs/(wooyun-\d+-\d+)">', html):
        tp.add_task(vlun, wid)
    if page > 0:
Example #30
def start_crawler(event, context):
    glue = Glue()
    crawler_id_1 = glue.get_crawler_name(event)
    thread_pool = ThreadPool()
    thread_pool.add(glue.start_crawler, crawler_id_1)
    thread_pool.wait()