def getUrlList(sUrl, jUrl):
    # Fetch the 10 nsrc records from this sUrl
    html = urllib2.urlopen(sUrl).read()
    nsrcList = regex_nsrc.findall(html)
    # Walk through the 10 fetched nsrc records
    jumpUrl = [jUrl + nsrc for nsrc in nsrcList]
    pool = threadPool(5)
    urlList = pool.map(getBaseUrl, jumpUrl)
    pool.close()
    pool.join()
    # for nsrc in nsrcList:
    #     print ('.'),
    #     # Test whether (jUrl+nsrc) is reachable;
    #     # if there are too many redirects or the request fails,
    #     # skip this (jUrl+nsrc)
    #     html2 = getHtml(jUrl+nsrc)
    #     if html2 == 'Error':
    #         print html2
    #         continue
    #     # Successfully fetched the Baidu transcoding page (html2)
    #     u = regex_url.search(html2)
    #     if u:
    #         urlList.append(u.group(1).replace('amp;',''))
    #     # Following the redirect straight to the real page is disabled, because
    #     # different sites' network conditions can make the request take far too long
    #     # else:
    #     #     u = urllib2.urlopen(jUrl+nsrc).geturl()
    #     #     urlList.append(u)
    # Return the real urls that were collected
    return urlList
def start(self,
          keywords,
          url_constructor,
          thread_num=10,
          search_handler=lambda k, t: [{'result': t}],
          result_handler=lambda k, t: (k, t)):
    self.start_time = time.time()
    self.search_handler = search_handler
    self.result_handler = result_handler
    keyword_urls = []
    results = []
    logging.info('Parser start')
    for keyword in keywords:
        keyword_urls.extend(url_constructor(keyword))
        logging.info('Construct url with {}'.format(keyword))
    logging.info('Start {} thread'.format(thread_num))
    pool = threadPool(thread_num)
    # single-threaded variant, useful when debugging the multi-thread path
    # [results.extend(r) for r in map(self.request, keyword_urls) if r]
    [results.extend(r) for r in pool.map(self.request, keyword_urls) if r]
    logging.info('End all thread')
    pool.close()
    pool.join()
    self.end_time = time.time()
    logging.info('Spend time {}'.format(self.time()))
    return filter(lambda t: bool(t), results)
def thControler(targetList):
    print 'starting checking threads.........'
    try:
        th = threadPool(8)
        th.map(check, targetList)
    except Exception as e:
        print e
    th.close()
    th.join()
def run_pool(target):
    '''
    param target: target function for each thread to execute
    '''
    pool = threadPool(50)
    pool.map_async(target, urls)
    pool.close()
    pool.join()
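# A minimal usage sketch for run_pool above (an assumption, not part of the
# original snippet): run_pool relies on a module-level `urls` iterable, so one
# is defined here, and `fetch` is a hypothetical worker function.
import requests

urls = ['http://www.python.org', 'http://www.python.org/doc/']

def fetch(url):
    # Print each page's status code from inside the worker thread.
    print(url, requests.get(url, timeout=5).status_code)

if __name__ == '__main__':
    run_pool(fetch)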
def main():
    wd = 'inurl:php?id='
    wd = urllib.quote(wd)
    ksearchUrl = searchUrl.replace('keyWord', wd)
    for n in range(6, 7):
        n = n * 10
        currentSearchUrl = ksearchUrl.replace('pageNum', str(n))
        print currentSearchUrl
        urlList = getUrlList(currentSearchUrl, jumpUrl)
        autoChk = autoSqli('http://127.0.0.1:8775')
        chkPool = threadPool(3)
        rsts = chkPool.map(autoChk.run, urlList)
        chkPool.close()
        chkPool.join()
    return (0)
def main():
    wd = 'inurl:php?id='
    wd = urllib.quote(wd)
    ksearchUrl = searchUrl.replace('keyWord', wd)
    for n in range(11, 12):
        n = n * 10
        currentSearchUrl = ksearchUrl.replace('pageNum', str(n))
        print currentSearchUrl
        urlList = getUrlList(currentSearchUrl, jumpUrl)
        chkPool = threadPool(processes=3)
        for u in urlList:
            t = autoSqli('http://127.0.0.1:8775')
            chkPool.apply_async(t.run, (u, ))
        chkPool.close()
        chkPool.join()
    return (0)
def _map(self, func, vals):
    """parallel mapping function

    Args:
        func (Function): to apply
        vals ([object]): list of values to apply to function

    Returns:
        ([object]) list of return values
    """
    cpuc = multiprocessing.cpu_count()
    pool = threadPool(cpuc if self.n_jobs <= -1 or self.n_jobs >= cpuc else self.n_jobs)
    vals = pool.map(func, vals)
    pool.close()
    pool.join()
    return vals
import time
import requests as re
from multiprocessing import Pool
from multiprocessing.dummy import Pool as threadPool

urls = [
    'http://www.python.org',
    'http://www.python.org/about/',
    'http://www.onlamp.com/pub/a/python/2003/04/17/metaclasses.html',
    'http://www.python.org/doc/',
    'http://www.python.org/download/',
    'http://www.python.org/getit/',
    'http://www.python.org/community/',
    'https://wiki.python.org/moin/',
    'http://planet.python.org/',
    'https://wiki.python.org/moin/LocalUserGroups',
    'http://www.python.org/psf/',
    'http://docs.python.org/devguide/',
    'http://www.python.org/community/awards/'
]

start_time = time.time()
pool = threadPool(5)
results = pool.map(re.get, urls)
pool.close()
pool.join()
print('Done! Time taken: {}'.format(time.time() - start_time))
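# For comparison (an assumption, not in the original snippet): the same fan-out
# with the process-based Pool that is imported above but never used. A small
# module-level wrapper returns only the status code, keeping the mapped results
# lightweight and easy to send back between processes.
def fetch_status(url):
    return re.get(url).status_code

if __name__ == '__main__':
    start_time = time.time()
    proc_pool = Pool(5)
    print(proc_pool.map(fetch_status, urls))
    proc_pool.close()
    proc_pool.join()
    print('Done! Time taken: {}'.format(time.time() - start_time))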
# Multi-threading
import os
import threading
from multiprocessing.dummy import Pool as threadPool

def f(x):
    pid = os.getpid()
    tid = threading.get_ident()
    print(pid, tid, x)

if __name__ == '__main__':
    tpool = threadPool(4)
    for i in range(1, 11):
        tpool.apply_async(f, args=[i])
    tpool.close()
    tpool.join()

# Coroutines
import os
import threading
import asyncio

async def f(x):
    pid = os.getpid()
    tid = threading.get_ident()
    print(pid, tid, x)
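# The coroutine example above stops at the definition of f; a minimal driver is
# sketched here as an assumption, not part of the original snippet. It schedules
# the ten coroutines concurrently on one event loop, mirroring the thread-pool loop.
async def run_all():
    await asyncio.gather(*(f(i) for i in range(1, 11)))

if __name__ == '__main__':
    asyncio.run(run_all())  # Python 3.7+; older versions need loop.run_until_complete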
def main():
    pool = threadPool(20)
    proxies = Proxy().get_proxies()
    with open('proxy.txt', 'w') as f:
        for proxy in filter(bool, pool.map(detect_alive_proxy, proxies)):
            f.write(proxy + '\n')
def multiThread(func, argList):
    connPool = threadPool(20)
    rtProcList = connPool.map(func, argList)
    connPool.close()
    connPool.join()
    return rtProcList
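# A hypothetical usage of the multiThread helper above; the worker function and
# inputs are illustration only, not from the original snippet.
def square(x):
    # CPU-trivial worker; threads pay off mainly for I/O-bound work,
    # but this keeps the example self-contained.
    return x * x

if __name__ == '__main__':
    print(multiThread(square, range(5)))  # [0, 1, 4, 9, 16]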
    for movie in movie_from_url(url):
        write_to_file(movie.json())

if __name__ == '__main__':
    # # Multi-process
    # start = clock()
    # pool = Pool(1)
    # pool.map(main, [i * 10 for i in range(10)])
    # pool.close()
    # pool.join()
    # end = clock()
    # print(start, end)
    # print((end - start))
    #
    # # Single process
    # start = clock()
    # for i in range(10):
    #     main(i * 10)
    # end = clock()
    # print((end - start))

    # Multi-thread
    start = clock()
    pool = threadPool(1)
    pool.map(main, [i * 10 for i in range(10)])
    pool.close()
    pool.join()
    end = clock()
    print(start, end)
    print((end - start))