Example #1
    def webdir_crawler_schedu(self):
        '''
        Schedule the web directory brute-force tasks
        :return:
        '''
        # Probe the root directory with every entry in the wordlist
        first_crack_dir_pool = pool.Pool(self.concurrent_num)
        first_crack_dir_pool.map(
            self.__webdir_crawler_work,
            ['%s/' % str(dir_dic) for dir_dic in self.dir_dic])
        self.__deal_exist_file(bModel=False)
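        # Pool.map blocks until every probe greenlet has finished, so all
        # root-level probes are done before the loop below drains the queue
        # of discovered directories.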

        while not self.exist_dir_cache_que.empty():
            dir = self.exist_dir_cache_que.get_nowait()
            '''
            Check how deep the current path is
            '''
            if len(dir.split('/')) <= self.depth:
                # Bug fix: probe a directory that definitely does not exist
                # under the discovered directory and check the status first
                if self.__subdir_crawler_status(dir):
                    dir_tmpdic = [
                        '%s/%s/' % (dir, dir_dic) for dir_dic in self.dir_dic
                    ]
                    path_pool = pool.Pool(self.concurrent_num)
                    path_pool.map(self.__webdir_crawler_work, dir_tmpdic)
                    self.__deal_exist_file(bModel=False)
Example #2
 def __init__(self, log_file_name="my.log"):
     # Default domain
     self.host = "http://www.kuwo.cn"
     # API returning JSON with a song's rid for a search keyword
     self.rid_url = "/api/www/search/searchMusicBykeyWord?key={}"
     # API returning JSON with the download link for a given rid
     self.mp3_url = "/url?rid={}&type=convert_url3&br=128kmp3"
     # Music chart menu; provides the sourceid
     self.bang_menu = "/api/www/bang/bang/bangMenu"
     # API for song metadata
     self.music_info = "/api/www/music/musicInfo?mid={}"
     # Lyrics lookup by musicId
     self.song_lyric = "http://m.kuwo.cn/newh5/singles/songinfoandlrc?musicId={}"
     # Song list for a given bangId
     self.music_list = "/api/www/bang/bang/musicList?bangId={}&pn={}&rn={}"
     # Required request headers
     self.headers = {
         "User-Agent":
         "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.130 Safari/537.36",
         "Referer":
         "http://www.kuwo.cn/search/list",  # 这个请求头没有的话,会出现 403 Forbidden
         "csrf":
         "0HQ0UGKNAKR",  # CSRF Token Not Found!
         # CSRF Token Not Found!
         "Cookie":
         "Hm_lvt_cdb524f42f0ce19b169a8071123a4797=1584003311; _ga=GA1.2.208068437.1584003311; _gid=GA1.2.1613688009.1584003311; Hm_lpvt_cdb524f42f0ce19b169a8071123a4797=1584017980; kw_token=0HQ0UGKNAKR; _gat=1",
     }
     # Multithreading support
     self.mp3_q = queue.Queue()
     # gevent
     self.pool1 = pool.Pool()
     self.pool2 = pool.Pool()
     # My own proxy pool
     self.proxies = [
         {
             'http': '116.114.19.204:443'
         },
         {
             'http': '101.231.104.82:80'
         },
         {
             'http': '116.114.19.211:443'
         },
         {
             'http': '84.17.47.190:80'
         },
     ]
     # Log file name
     # self.log_file_name = log_file_name
     self.f = open("{}/{}".format(BASE_DIR, log_file_name),
                   "w",
                   encoding="utf-8")
     self.f.write("log: use time second = minute = hour\n")
     # Filter out invalid characters in item["name"]
     self.invalid_characters = r"[/\?]"
Example #3
    def __init__(self, page):
        self.url = 'http://book.zongheng.com/store/c0/c0/b0/u0/p{}/v9/s9/t0/u0/i1/ALL.html'
        # page number counter
        self.num = 1
        self.page = page
        self.useragent = UserAgent()
        # self.connect = pymysql.connect(host='localhost', port=3306, user='******', password='******', db='scrapytest')
        # self.cur = self.connect.cursor()
        self.queue = queue.Queue()
        self.pool = pool.Pool(2)

        self.queue_detail = queue.Queue()
        self.pool_detail = pool.Pool(3)
Example #4
 def __init__(self):
     """
     初始化方法
     """
     MyBaseSpider.__init__(self)
     # API calls
     self.api = Music163ComAPI()
     # Custom downloader
     self.downloader = Downloader()
     # Greenlet (coroutine) pool
     self.pool = pool.Pool()
     # Song queue
     self.m4a_q = queue.Queue()
     # Chart pool
     self.bang_pool = pool.Pool()
Example #5
def get_http_utilization_for_all_tasks(marathon_service_config, marathon_tasks, endpoint, json_mapper):
    """
    Gets the mean utilization of a service across all of its tasks by fetching
    json from an http endpoint and applying a function that maps it to a
    utilization

    :param marathon_service_config: the MarathonServiceConfig to get data from
    :param marathon_tasks: Marathon tasks to get data from
    :param endpoint: The http endpoint to get the stats from
    :param json_mapper: A function that takes a dictionary for a task and returns that task's utilization

    :returns: the service's mean utilization, from 0 to 1
    """

    endpoint = endpoint.lstrip('/')
    utilization = []
    service = marathon_service_config.get_service()

    monkey.patch_socket()
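    # patch_socket() makes blocking socket calls cooperative, so the 20
    # greenlets below can fetch from the tasks' endpoints concurrently.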
    gevent_pool = pool.Pool(20)
    jobs = [
        gevent_pool.spawn(get_http_utilization_for_a_task, task, service, endpoint, json_mapper)
        for task in marathon_tasks
    ]
    gevent.joinall(jobs)

    for job in jobs:
        if job.value is not None:
            utilization.append(job.value)

    if not utilization:
        raise MetricsProviderNoDataError("Couldn't get any data from http endpoint {} for {}.{}".format(
            endpoint, marathon_service_config.service, marathon_service_config.instance,
        ))
    return mean(utilization)
Example #6
File: main.py  Project: fcua/x8623
 def run(self):
     from gevent.server import StreamServer
     from gevent.pywsgi import WSGIServer
     from gevent.backdoor import BackdoorServer
     import gm.app  # NOQA
     from gm.init_app import application
     import settings as st
     # wait for proxy
     self.ping_proxy()
     threads = []
     logger.info('listening 0.0.0.0:%d', st.WORLD['port'])
     self.mainServer = StreamServer(('0.0.0.0', st.WORLD['port']),
                                    self.handle_client)
     threads.append(Greenlet.spawn(self.mainServer.serve_forever))
     logger.info('listening %s:%d', st.WORLD['managehost'],
                 st.WORLD['manageport'])
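     # spawn=pool.Pool(10) caps the management WSGIServer at 10 requests
     # handled concurrently.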
     threads.append(
         Greenlet.spawn(
             WSGIServer((st.WORLD['managehost'], st.WORLD['manageport']),
                        application,
                        spawn=pool.Pool(10)).serve_forever))
     if os.environ.get("DOCKER_MANAGEHOST"):
         backdoorhost = "0.0.0.0"
     else:
         backdoorhost = "127.0.0.1"
     logger.info('listening %s:%d', backdoorhost, st.WORLD['backdoorport'])
     threads.append(
         Greenlet.spawn(
             BackdoorServer(
                 (backdoorhost, st.WORLD['backdoorport'])).serve_forever))
     # start cron thread
     import cron_settings  # NOQA
     threads.append(Greenlet.spawn(self.heart_beat))
     joinall(threads)
Example #7
    def start(self):

        start_time = time.time()
        self.start_time = int(start_time)

        log.debug("useful_proxy proxy verify start")

        self.stat = dict(
            total = 0,
            succ = 0,
            fail = 0,
        )

        concurrency = ConfigManager.setting_config.setting.get("verify_useful_proxy_concurrency")
        task_pool = pool.Pool(concurrency)
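        # one verify greenlet per queued proxy; the pool lets at most
        # `concurrency` of them run at the same time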

        queue_size = self.queue.qsize()
        greenlet_list = []
        for _ in range(queue_size):
            greenlet_list.append(task_pool.spawn(self.verify))

        gevent.joinall(greenlet_list)

        end_time = time.time()
        elapsed_time = int(end_time - start_time)
        log.info('useful_proxy verify proxy finish, total:{total}, succ:{succ}, fail:{fail}, elapsed_time:{elapsed_time}s'
        .format(total=self.stat["total"], succ=self.stat["succ"], fail=self.stat["fail"], elapsed_time=elapsed_time))
Example #8
    def __init__(self, qqno, qqpwd, handler=None):
        self.handler = handler if handler else MessageHandner(self)
        self.uin = qqno
        self.qqpwd = qqpwd
        self.ptwebqq = ""
        self.psessionid = ""
        self.clientid = str(random.randint(1,99999999))
        self.vfwebqq = ""
        self.vcode = ""
        self.vcode2 = ""
        self.cookiefile = "/tmp/cookies.lwp"
        self.cookiejar = cookielib.LWPCookieJar(filename=self.cookiefile)
        self.fakeid = ""
        self.friends = None
        self.friendindex = 1
        self.uintoqq = {}
        self.referurl = "http://d.web2.qq.com/proxy.html?v=20110331002&callback=1&id=2"
        self.headers = {
                "User-Agent": "Mozilla/5.0 (X11; Linux i686; rv:16.0) Gecko/20100101 Firefox/16.0",
                "Referer": "http://d.web2.qq.com/proxy.html?v=20110331002&callback=1&id=2",
                "Content-Type": "application/x-www-form-urlencoded"
            }

        self.mq = queue.Queue(20)
        self.taskpool = pool.Pool(10)
        self.runflag = False
        from redis import Redis
        self.redisconn = Redis(host="localhost", db=10)
        self.logger = getLogger()

        self.session = requests.Session()
        self.session.headers = self.headers
Example #9
def start():
    global son, billing_cycles

    try:
        # Take the billing cycles of the previous three months
        cur_month = datetime.strftime(datetime.now(), '%Y%m')
        cur_cycle = ''.join(['1', str(cur_month)[2:]])
        billing_cycles = son.query(BillingCycle).filter(
            BillingCycle.billing_cycle_id < cur_cycle,
            BillingCycle.billing_cycle_id >= MIN_BILLING_CYCLE,
        ).order_by(BillingCycle.billing_cycle_id.desc()).limit(LIMIT_MONTH).all()

        # Supplementary red-packet table
        bs = son.query(ARedPacketBs).filter(
            ARedPacketUser.red_id == ARedPacketBs.red_id
        )

        packet_users = son.query(
            ARedPacketUser
        ).filter(
            ARedPacketUser.finish_flag == FINISH_FLAG_NORMAL,
            bs.exists(),
        ).all()

        logger.info('len(packet_list)=%d' % len(packet_users))

        p = pool.Pool(500)
        jobs = []

        for user in packet_users:
            jobs.append(p.spawn(process, user.red_id))

        gevent.joinall(jobs)
    except Exception as ex:
        logger.error(str(ex))
Example #10
def start():
    global son, billing_cycles

    # Take the billing cycles of the previous three months
    cur_month = datetime.strftime(datetime.now(), '%Y%m')
    cur_cycle = ''.join(['1', str(cur_month)[2:]])
    billing_cycles = son.query(BillingCycle).filter(
        BillingCycle.billing_cycle_id < cur_cycle,
        BillingCycle.billing_cycle_id >= MIN_BILLING_CYCLE,
    ).order_by(BillingCycle.billing_cycle_id.desc()).limit(LIMIT_MONTH).all()

    min_date = min(map(lambda e: e.cycle_begin_date, billing_cycles))
    max_date = max(map(lambda b: b.cycle_end_date, billing_cycles))

    packet_users = son.query(ARedPacketUser).filter(
        ARedPacketUser.finish_flag == FINISH_FLAG_NORMAL,
        ARedPacketUser.wing_eff_date < max_date,
        ARedPacketUser.wing_exp_date >= min_date,
        ARedPacketUser.red_id.in_([100439718])).all()

    p = pool.Pool(500)
    jobs = []

    for user in packet_users:
        jobs.append(p.spawn(process, user.red_id))

    gevent.joinall(jobs)
    return 'pool tasks finished'
Example #11
def main():
    total_count = get_total_count()
    args = xrange(0, total_count, 10000)

    pool_size = 20  # pool_size = total_count/10000
    p = pool.Pool(pool_size)
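    # map() hands each offset to update_url_batch, running at most pool_size
    # batches concurrently, and blocks until all of them finish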
    p.map(update_url_batch, args)
Example #12
def kickstart(itemset=None, poolsize=20):
    if itemset is None:
        itemset = getitems()
        logger.info("Found %s records to check", len(itemset))
        time.sleep(3)
    start = datetime.now()
    results = Counter()
    count = 0
    p = pool.Pool(poolsize)

    def wkfn(k):
        try:
            return (k, objfn(k))
        except Exception as e:
            logger.exception("Failed on %r", k)
            return (k, e)

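    # imap_unordered yields (key, result) pairs as soon as each greenlet
    # finishes, in completion order rather than input order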
    try:
        for k, result in p.imap_unordered(wkfn, list(itemset)):
            count += 1
            if isinstance(result, Exception):
                raise result
            if result is Status.ok or result is Status.rederive:
                itemset.remove(k)
            results[result] += 1
            if count % 100 == 0:
                rate = count / max([(datetime.now() - start).total_seconds(), 1])
                remaining = len(itemset) / rate
                logger.info("Processed %d records at %4.1f/s; %6.1fs remaining",
                            count, rate, remaining)
    finally:
        logger.info("Finished %d records, %r", count, results)
Example #13
def get_friends(user):
    api = twitter_api.Twitter(auth=twitter_api.OAuth(
        user.twitter_oauth.token,
        user.twitter_oauth.token_secret,
        twitter.consumer_key,
        twitter.consumer_secret,
    ))
    # Twitter allows lookup of 100 users at a time so we need to
    # chunk:
    chunk = lambda l, n: [l[x:x + n] for x in xrange(0, len(l), n)]
    friend_ids = list(api.friends.ids()['ids'])

    greenpool = pool.Pool(4)
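    # at most 4 chunk lookups run concurrently; imap yields the results in
    # input order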

    # Look up in parallel. Note that twitter has pretty strict 15
    # requests/second rate limiting.
    friends = []
    for result in greenpool.imap(
            lambda ids: api.users.lookup(user_id=','.join(
                str(id) for id in ids)), chunk(friend_ids, 100)):
        for r in result:
            friends.append(
                user_model.Friend(
                    screen_name=r['screen_name'],
                    name=r['name'],
                    profile_image_url=r['profile_image_url'],
                ))

    return sorted(friends)
Example #14
 def scanOnce(cls, concurrentNum=multiprocessing.cpu_count()):
     concurrentPool = pool.Pool(concurrentNum)
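     # one greenlet per ETF, capped at concurrentNum at a time; join() below
     # waits for all of them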
     cnEtfList = ETF.getCnETFList()
     for etf in cnEtfList:
         concurrentPool.spawn(cls.scanOne, etf)
     concurrentPool.join()
     print('scan once')
Example #15
    def begin_get_predict_data(self):
        # logger.info('开始获取"%s"专家预测数据。' % self.lottery_name)
        times = 0
        while 1:
            found_data = []  # start empty so the len() below never sees None
            find_data = {'lottery': self.lottery_name}
            filter_data = {'_id': 0}
            try:
                found_data = list(
                    self.predict_urls_db.find(find_data, filter_data))
            except Exception as e:
                logger.error(e)

            logger.info(
                'db:%s find_data:%s, found_data:%s' %
                (self.predict_urls_db, find_data, len(found_data)))

            if len(found_data) > 0:
                p = pool.Pool(100)
                for url_data in found_data:
                    expert_id = url_data['expert_id']
                    data_type = url_data['data_type']
                    url = url_data['url']
                    gpd = GPD()
                    gpd.set(self.lottery_name, expert_id, data_type, url)
                    p.spawn(gpd.start)
                    time.sleep(0.1)
                p.join()
                time.sleep(5)
                times += 1

            if (not found_data) or times > 4:
                print('尝试次数:%s' % times)
                break
Example #16
    def get_transactions_of_block_at_height(self, height: gw.CoinBlockHeight) -> List[gw.Transaction]:
        block_hash = self._ltc_proxy.getblockhash(height)
        block = self._ltc_proxy.getblock(block_hash)

        get_transaction_tasks = pool.Pool(1)  # litecoin server does not accept more than one parallel connection

        return [a for a in get_transaction_tasks.map(self.get_transaction, block['tx'])]
Example #17
    def run(self):
        scannerparam = self.scannerparam
        socket.setdefaulttimeout(scannerparam.timeout)
        _pool = pool.Pool(scannerparam.threadnum)
        time_start = time.time()
        print('开始执行...')
        params = []
        iplist = IpHelper.get_ip_list(scannerparam)
        portlist = PortHelper.get_port_list(scannerparam)

        for ip in iplist:
            for port in portlist:
                params.append({'ip': ip, 'port': port})

        self.taskcount = len(params)
        print('线程数:{0},ip总数:{1},待扫描任务总数:{2}'.format(scannerparam.threadnum,
                                                    len(iplist),
                                                    self.taskcount))

        for p in params:
            _pool.spawn(self.scan, p)
        # wait for every scan greenlet to finish before timing and printing results
        _pool.join()
        time_end = time.time()
        print('执行结束,共花费{0}秒'.format(time_end - time_start))
        for x in self.open_ports:
            print("{0}:{1} open \n".format(x['ip'], x['port']))
Example #18
 def test_imap(self):
     p = pool.Pool(1)
     it = p.imap(divide_by, [1, 0, 2])
     self.assertEqual(next(it), 1.0)
     self.assertRaises(ZeroDivisionError, next, it)
     self.assertEqual(next(it), 0.5)
     self.assertRaises(StopIteration, next, it)
Example #19
 def __init__(self, domain, flag):
     self.domain = domain
     self.flag = flag
     self.p = pool.Pool(20)
     self.tasks = []
     self.subDomainEngine = []
     self.zDomainList = []
Example #20
def main():
    # work()
    # return
    p = pool.Pool(200)
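    # spawn() blocks while all 200 slots are busy, so this loop keeps at
    # most 200 work greenlets alive at any time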

    while True:
        p.spawn(work)
Example #21
    def track(self) -> List[dict]:
        tracker_logger.info('Tracking branches for %s' % self._remote)
        include = re.compile(r'|'.join(self._branches or []), re.IGNORECASE)
        exclude = re.compile(r'|'.join(self._no_branches or []), re.IGNORECASE)
        branches = self._wrapper.get_branches(self._remote)
        filtered = []

        for branch, authordate in branches:
            if self._branches and not include.search(branch):
                continue

            if self._no_branches and exclude.search(branch):
                continue

            if self._after_date and authordate < self._after_date:
                continue

            if self._before_date and authordate > self._before_date:
                continue

            filtered.append(branch)

        def track_branch(branch):
            return branch, self._track_branch(branch)

        gpool = pool.Pool(self._greenlets)
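        # imap tracks up to self._greenlets branches concurrently while
        # yielding the results in the filtered order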
        for branch, tracked in gpool.imap(track_branch, filtered):
            yield self._remote, branch, tracked
Example #22
 def start_crawling(self,
                    url: str,
                    show_urls=False,
                    allow_query_string=False):
     """Initialize job specific variables and start a scheduler to assign crawling jobs. Return all unique urls."""
     self.base_url = urlparse(url)
     assert self.base_url.netloc != '', "Error: no url found in url {}".format(
         url)
     self.show_urls = show_urls  # set to true if want to show urls on console
     self.allow_query_string = allow_query_string  # set to true if a query string should be counted as a unique url
     self.worker_pool = pool.Pool(self.worker_pool_size)
     self.urlQ = queue.Queue(
         maxsize=self.queue_size
     )  # Queue to keep track of next urls to crawl, #TODO use database for more scalability.
     self.urlQ.put(url)  # initialize queue with the first url
     self.crawled_urls = set(
     )  # Set to keep track of visited urls, #TODO use database for more scalability.
     self.crawled_urls_file = open(
         "urls.txt", "w+"
     )  # A file to keep saving intermediate results, # TODO use as cache to resume a crawler.
     self.url_crawled = event.Event(
     )  # Event to announce crawling finished by a gevent, consumed by scheduler to reassign jobs if new urls
     self._scheduler = gevent.spawn(self.scheduler)  # start the scheduler
     self._scheduler.join()  # wait for scheduler to finish
     return self.crawled_urls  # return unique urls
Example #23
 def test_imap_unordered(self):
     p = pool.Pool(1)
     it = p.imap_unordered(divide_by, [1, 0, 2])
     self.assertEqual(it.next(), 1.0)
     self.assertRaises(ZeroDivisionError, it.next)
     self.assertEqual(it.next(), 0.5)
     self.assertRaises(StopIteration, it.next)
Example #24
 def write_in_slave(self, song_id, start, end, hot=False):
     start, end = int(start), int(end)
     data_gen = self.post_data(0, start=start, end=end)
     pool = g_pool.Pool(size=self._pool_size)
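     # one greenlet per `self._limit`-sized slice of the range, all sharing
     # the same data_gen generator; join() waits for them to finish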
     for _ in range(start, end, self._limit):
         pool.spawn(self.write_wrapper, song_id, data_gen, hot)
     pool.join()
Example #25
 def wait(self, timeout):
     p = pool.Pool()
     g = p.spawn(gevent.sleep, 10)
     try:
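         # join() returns once `timeout` expires even though the sleeping
         # greenlet is still running; raise_error=True would re-raise an
         # exception if the greenlet had failed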
         p.join(timeout=timeout, raise_error=True)
     finally:
         g.kill()
Example #26
    def run(self, seedfile, progress_queue, output_queue):
        task_total = count_file_linenum(seedfile)
        proc_name = current_process().name
        sys.stdout = ProcessIO(output_queue)

        def progress_tracking(greenlet):
            count = getattr(progress_tracking, 'count', 0) + 1
            setattr(progress_tracking, 'count', count)
            progress_queue.put((proc_name, count, task_total))
            return greenlet

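        # link() runs progress_tracking when each greenlet finishes, so one
        # progress update is pushed per seed line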
        po = pool.Pool(self.pool_size)
        with open(seedfile) as f:
            for line in f:
                g = po.apply_async(func=self.pool_task_with_timeout,
                                   args=(line, ),
                                   kwds=None,
                                   callback=self.callback)
                g.link(progress_tracking)
                po.add(g)

        try:
            po.join()
        except (KeyboardInterrupt, SystemExit) as ex:
            print(str(ex))
            po.kill()
Example #27
def process_ip_list(ip_list, output_file, start):
    ip_list_num = len(ip_list)
    for ip_pointer in range(start, start + ip_list_num, 1000):
        sem.acquire()
        print("[*] Start scanning at", ip_pointer)
        print(ip_port_dict)
        ip_port_dict.clear()
        with open("scan.log", "a") as f:
            f.write(str(ip_pointer))
        pool_ = pool.Pool(200)
        gevent_list = []
        for ip_addr in ip_list[ip_pointer:ip_pointer + 1000]:
            g = pool_.spawn(scan, ip_addr)
            gevent_list.append(g)
        gevent.joinall(gevent_list)
        headers = ["IP", "端口"]
        while flag[0] > 0:
            pass
        with open(output_file, 'a') as csv_file:
            csv_write = csv.writer(csv_file)
            if ip_pointer == 0:
                csv_write.writerow(headers)
            csv_write.writerows(list(ip_port_dict.items()))
        flag[0] = 1000
        sem.release()
Example #28
 def test_imap_unordered(self):
     p = pool.Pool(1)
     it = p.imap_unordered(divide_by, [1, 0, 2])
     self.assertEqual(six.advance_iterator(it), 1.0)
     self.assertRaises(ZeroDivisionError, six.advance_iterator, it)
     self.assertEqual(six.advance_iterator(it), 0.5)
     self.assertRaises(StopIteration, six.advance_iterator, it)
Example #29
def start():
    p = pool.Pool(pool_size)
    gevent.spawn(stop, p)

    while alive:
        st = time.time()
        p.spawn(work)
Example #30
 def run(self, proxyList):
     logger.info('Running Validator.')
     self.rator.begin()
     while 1:
         try:
             if proxyList:
                 self.rator.pull_table(self.db.table)
                 pen = len(proxyList)
                 logger.info(
                     'Proxies from Collector is detected,length : %d ' %
                     pen)
                 pop_len = pen if pen <= VALIDATE_AMOUNT else VALIDATE_AMOUNT
                 stanby_proxies = [proxyList.pop() for x in range(pop_len)]
                 logger.info(
                     'Start to verify the collected proxy data,amount: %d '
                     % pop_len)
                 gpool = pool.Pool(CONCURRENCY)
                 gevent.joinall([
                     gpool.spawn(self.validate_proxy, i)
                     for i in stanby_proxies if i
                 ])
                 logger.info(
                     'Validation finished.Left collected proxies:%d' %
                     len(proxyList))
                 time.sleep(VALIDATE_F)
         except Exception as e:
             logger.error('Error class : %s , msg : %s ' % (e.__class__, e))
             self.rator.end()
             logger.info('Validator shuts down.')
             return