def get_weibo_comment(weibo_content_id):
    url = URL_COMMIT_URL.replace('id=4184562986557218', 'id=' + weibo_content_id)
    content = request_url(url, 'get', '')
    try:
        result_json = json.loads(content)
        ok_flag = result_json['ok']
        test_comment_pool = ThreadPool(10)
        if ok_flag == 1:
            comment_data_arr = result_json['data']['data']
            get_comment_by_arr(weibo_content_id, comment_data_arr)
            page_count = result_json['data']['max']
            for page_index in range(2, page_count + 1):
                url = url.replace('&page=' + str(page_index - 1), '&page=' + str(page_index))
                content = request_url(url, 'get', '')
                try:
                    result_json = json.loads(content)
                    ok_flag = result_json['ok']
                    if ok_flag == 1:
                        comment_data_arr = result_json['data']['data']
                        #get_comment_by_arr(weibo_content_id,comment_data_arr)
                        test_comment_pool.run(get_comment_by_arr, (weibo_content_id, comment_data_arr, ), callback=None)
                except Exception as e:
                    continue
        test_comment_pool.close()
    except Exception as e:
        pass
def scrap(self, url):
    if url is not None and len(url) > 0:
        url_info = tldextract.extract(url)
        self.domain = url_info.domain
        print '[INFO] Scrapper::scrap, domain', self.domain
        tp = ThreadPool(max_workers=120)
        tp.add_job(threaded_function, [self.domain, url])
    else:
        print '[ALARM] Scrapper:scrap, invalid url'
def __init__(self, crawlername, workerThreadNum, pollInterval=0.5, pollTimeout=None, downloadTimeout=5):
    self.threadPool = ThreadPool(workerThreadNum)
    self.crawlername = crawlername
    self.pollTimeout = pollTimeout
    self.crawlerThread = CrawlerThread(self.threadPool, pollTimeout)
    self.mp3Downloader = MP3Downloader(downloadTimeout)
def siper_data(user_id, start_date, end_date):
    main_pool = ThreadPool(5)
    start_date = datetime.strptime(start_date, '%Y-%m-%d')
    end_date = datetime.strptime(end_date, '%Y-%m-%d')
    # Collect the Weibo user profile
    get_user_info(user_id)
    # Read the Weibo container ID
    container_id = get_containerid(user_id)
    # Collect Weibo post links, timestamps and IDs
    weibo_link_dict = get_weibo_link(user_id, container_id, start_date, end_date)
    main_pool.run(siper_weibo_content, (weibo_link_dict, ), callback=None)
    main_pool.run(siper_weibo_comment, (weibo_link_dict, ), callback=None)
def scan(self):
    if self.source:
        threadpool = ThreadPool.ThreadPool(1)
    else:
        threadpool = ThreadPool.ThreadPool(5)
    for rule in self.rules:
        threadpool.addtask(rule.detect, (), (self.results, rule))
        #if rule.detect():
        #    self.results.append(rule)
    threadpool.start()
    threadpool.wait()
    #threadpool.clear()
    #threadpool.stop()
def process_request(self, request, client_address):
    """Build a work item and submit it to the thread pool for execution."""
    work = ThreadPool.WorkRequest(super().process_request, args=(request, client_address))
    self.threadPool.putRequest(work)
def __init__(self, parent):
    super().__init__(parent)
    self.fixedSprites = list(range(0, 8))  # eight fixed top arrows; each arrow is stored at the index matching its number
    self.arrowSprites = []
    self.levelFeedbackSprites = [None] * 2  # level feedback of two players, [0] -> p1, [1] -> p2
    self.setFocusPolicy(Qt.StrongFocus)
    self.threadPool = ThreadPool(20)
    self.spritePrototypeFactory = SpritePrototypeFactory()
    self.spritePrototypeFactory.prepare(MyGameView.GAME_WIDTH, MyGameView.GAME_HEIGHT)
    self.createAllTopFixedArrow()
    self.scoreRecordersMap = defaultdict(ScoreRecorder)
def unzip_zip_file_async(zipfilename, unziptodir, end_cb=None):
    if not os.path.exists(unziptodir):
        os.makedirs(unziptodir)

    def unzip(zipfilename, unziptodir, end_cb=None):
        zfobj = zipfile.ZipFile(zipfilename, 'r')
        for name in zfobj.namelist():
            name = name.replace('\\', '/')
            if name.endswith('/'):
                os.makedirs(os.path.join(unziptodir, name))
            else:
                ext_filename = os.path.join(unziptodir, name)
                ext_dir = os.path.dirname(ext_filename)
                if not os.path.exists(ext_dir):
                    os.makedirs(ext_dir)
                data = zfobj.read(name)
                outfile = open(ext_filename, 'wb')
                outfile.write(data)
                outfile.close()
        if None != end_cb:
            end_cb(zipfilename, zfobj.namelist())

    t = ThreadPool.ThreadPool().Thread(target=unzip, args=(zipfilename, unziptodir, end_cb))
    t.start()
    return t
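# A minimal usage sketch for unzip_zip_file_async above; the archive and output
# directory are hypothetical placeholders. end_cb receives the archive path and
# the list of member names, matching the call made inside unzip(), and the
# returned worker object is assumed to support join() like a standard thread.
def on_unzipped(zip_path, names):
    print(zip_path, 'extracted', len(names), 'entries')

t = unzip_zip_file_async('/tmp/archive.zip',   # placeholder archive path
                         '/tmp/archive_out',   # placeholder output directory
                         end_cb=on_unzipped)
t.join()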
def main():
    global thread_pool, store
    global headers, host, user_id
    user_id, thread_count, db_name = option_parser()
    store = Store(db_name)
    thread_pool = ThreadPool.Thread_Pool(thread_count)
    host = "http://blog.csdn.net"
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36"
                      " (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36"
    }
    front_page_url = host + "/" + user_id
    param = {"url": front_page_url}
    # Queue the task that downloads the blog's front page.
    # It will then automatically parse and download every listing page, and from those every post.
    # The queue only becomes empty once all download tasks have finished; while work is in
    # flight it stays non-empty, so waiting for the queue to drain synchronizes the threads.
    thread_pool.add_work(Download_Front_Page, param)
    thread_pool.wait_queue_empty()
    store.store()
    print("---end---")
def preprocessPercentileRatios(self):
    print "preprocessPercentileRatios start"
    distributionsFile = self.getDatasetSlidingSizesFile()
    if os.path.isfile(distributionsFile):
        # The distributions file has already been processed
        print "The distribution file exists"
        return
    self.initializePropertiesComputeStructures(False)
    print "computing the ratios"
    try:
        zpa = zipfile.ZipFile(distributionsFile, "w", zipfile.ZIP_DEFLATED)
        zpa.writestr("dummy.txt", "dummy file")
        zpa.close()
        self.ziplock = threading.Lock()
        # Create a pool with five worker threads
        pool = ThreadPool.ThreadPool(5)
        sys.setcheckinterval(1000)
        for windowSize in self.slidingWindowSizes:
            if self.useWindowThreading:
                pool.queueTask(self.preprocessWindowSize, windowSize, None)
            else:
                self.preprocessWindowSize(windowSize)
        # When all tasks are finished, allow the threads to terminate
        if self.useWindowThreading:
            print "Joining the threads"
            pool.joinAll()
    except:
        os.unlink(distributionsFile)
        raise
    print "preprocessPercentileRatios end"
def __init__(self, store_location):
    #logging.debug("OFFLINE IMAGE CACHE STARTUP")
    self._store_location = store_location
    self._threadpool = ThreadPool.ThreadPool(5, "OfflineImageCache", daemon=True)
    self._cachers = {}  # dict of urlcacher objects?
def __init__(self, filepath):
    # Parse the JSON configuration file
    with open(filepath, 'r', encoding='utf-8') as f:
        self.config = json.loads(f.read())
    self.max_Task = self.config['max_Task']            # maximum number of tasks
    self.thread_count = self.config['threadCount']     # number of threads
    self.M_BLL = Mission_BLL.Mission()                 # mission table
    self.SI_BLL = SpiderInfo_BLL.SpiderInfo()          # spider configuration table
    self.HI_BLL = HoneyInfo_BLL.HoneyInfo()            # database info
    self.EI_BLL = ExceptionInfo_BLL.ExceptionInfo()    # exception info
    self.Exception_Model = Model.exceptionInfo("", "") # exception model
    self.PRI = ["high", "common", "low"]               # priority levels
    self.T_Pool = ThreadPool(self.thread_count, self.max_Task)  # task thread pool
    # Logging setup
    self.loger = logging.getLogger()
    self.loger.setLevel(logging.DEBUG)
    hfile = logging.FileHandler("./Hive/LOG/ServerLogInfo.log")
    formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')
    hfile.setFormatter(formatter)
    self.loger.addHandler(hfile)
    self.loger.info("Server initialized!")
def process_jobs(num_threads, job_queue):
    """Process the jobs in parallel."""
    # Create the thread pool
    tp = ThreadPool.ThreadPool(num_threads)
    # Process the jobs
    job_queue = tp.process_jobs(job_queue)
    for i in range(len(job_queue)):
        job = job_queue[i]
        print(job.thread.thread_id, job.start_time)
def __init__(self):
    self.countPosition = 0
    self.threadPool = ThreadPool.ThreadPool(1000)
    self.diffTime = 1114
    self.deviationTime = 200
    self.timer = QBasicTimer()
    self.__noteMap = collections.defaultdict(list)  # <startTime, note[]>
    self.__player = None
    self.__musicPlayerView = None
    self.sheetRecorder = None
    self.scoreRecorder = None
    self.sheet = None
def __init__(self):
    self.fairness_factor = 0.2
    self.fairness_scores = {}
    self.share_scores = {}
    self.dag_planners = {}
    self.alpha = 0.5
    self.available_bandwidth = 1200  # 10 Gbps ≈ 1.2 GBps = 1200 MBps
    self.cache_block_size = 1  # 1 MByte
    self.cache = cache.Cache(10000)
    self.n_thp = 10
    self.d3n_conn = thp.ThreadPool(self.n_thp)
def __init__(self, sizeOfBuffer, numberOfCores, numberOfThreads, timeQuantum, contextSwitchTime):
    Buffer.Buffer.initBufferCount()
    self.buffer = Buffer.Buffer(sizeOfBuffer)
    self.numberOfCores = numberOfCores
    self.cores = []
    for y in list(range(numberOfCores)):
        self.cores.append(Core.Core(y, 0))
    self.threadPool = ThreadPool.ThreadPool(numberOfThreads)
    self.timeQuantum = timeQuantum
    self.contextSwitchTime = contextSwitchTime
class MP3Crawler:
    '''
    @param crawlername
    @param workerThreadNum
    @param pollInterval: interval time to poll task from task queue
    @param pollTimeout: timeout seconds to poll a task from task queue
    @param downloadTimeout: timeout seconds to download media from web
    '''

    def __init__(self, crawlername, workerThreadNum, pollInterval=0.5, pollTimeout=None, downloadTimeout=5):
        self.threadPool = ThreadPool(workerThreadNum)
        self.crawlername = crawlername
        self.pollTimeout = pollTimeout
        self.crawlerThread = CrawlerThread(self.threadPool, pollTimeout)
        self.mp3Downloader = MP3Downloader(downloadTimeout)

    def start(self):
        '''start crawl'''
        self.crawlerThread.start()

    def stop(self):
        '''stop crawl, block until all tasks finish'''
        self.threadPool.stop()
        self.crawlerThread.dismiss()
        self.crawlerThread.join()

    def waitUtilComplete(self):
        '''wait until all tasks complete'''
        self.threadPool.wait()

    def __checkTask(self, task):
        if 'type' not in task or 'url' not in task:
            return False
        if task['type'] != 'mp3' and task['type'] != 'html' and task['type'] != 'json':
            return False
        if 'savePath' not in task:
            return False
        return True

    def downloadMP3(self, url, filePath):
        self.mp3Downloader.downloadM(url, filePath)

    def __printResult(self, request, result):
        print "---Result from request %s : %r" % (request.requestID, result)

    def addTask(self, task):
        '''add a mp3 download task'''
        if self.__checkTask(task) == False:
            print 'Task not Available:', task
            return
        req = WorkRequest(self.downloadMP3, args=[task['url'], task['savePath']], kwds={}, callback=self.__printResult)
        self.threadPool.putRequest(req)
        print "work request #%s added." % req.requestID
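# A minimal usage sketch for the MP3Crawler class above; the URL and save path
# are hypothetical placeholders, and it assumes ThreadPool, CrawlerThread and
# MP3Downloader are importable exactly as in the class definition.
if __name__ == '__main__':
    crawler = MP3Crawler('demo-crawler', workerThreadNum=4)
    crawler.start()
    # addTask() only accepts dicts carrying 'type' ('mp3', 'html' or 'json'),
    # 'url' and 'savePath'; anything else is rejected by __checkTask().
    crawler.addTask({'type': 'mp3',
                     'url': 'http://example.com/sample.mp3',  # placeholder URL
                     'savePath': '/tmp/sample.mp3'})          # placeholder path
    crawler.waitUtilComplete()
    crawler.stop()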
def __init__(self, view, home, share_path):
    PTVhtml.PTVhtml.__init__(self, view, home, share_path)
    self._htmlview = None
    self._document_lock = threading.Lock()
    self._image_cache = SimpleImageCache.SimpleImageCache()
    self._css = ""
    self._last_link_time = 0
    self._view = view
    f = open(os.path.join(share_path, "gtkhtml.css"))
    for l in f.readlines():
        self._css += l
    f.close()
    self._image_pool = ThreadPool.ThreadPool(5, "PlanetView")
    self._dl_total = 0
    self._dl_count = 0
def __init__(self, app, media_dir, progress_callback=None, finished_callback=None):
    self.index = 0  # should this be lucene compatible?
    if utils.RUNNING_HILDON:
        max_downloads = 1
    else:
        max_downloads = 5
    self._style = BYDATE
    self.pool = ThreadPool.ThreadPool(max_downloads, "MediaManager")
    self.downloads = []
    self.db = app.db
    self.time_appendix = 0
    self.bt_settings = {'min_port': 6881, 'max_port': 6999, 'ul_limit': 0}
    self.id_time = 0
    self.quitting = False
    self._net_connected = True
    self.pause_state = RUNNING
    if finished_callback:
        self.app_callback_finished = finished_callback
    else:
        self.app_callback_finished = self._basic_finished_callback
    if progress_callback:
        self.app_callback_progress = progress_callback
    else:
        self.app_callback_progress = self._basic_progress_callback
    home = self.db.home
    if media_dir[0] == '~':
        media_dir = os.getenv('HOME') + media_dir[1:]
    try:
        os.stat(media_dir)
    except:
        try:
            os.mkdir(media_dir)
        except:
            raise NoDir, "error creating " + media_dir
    self._media_dir = media_dir
    app.connect('online-status-changed', self.__online_status_changed)
    app.connect('new-database', self.__new_database_cb)
def siper_weibo_comment(weibo_link_dict):
    comment_pool = ThreadPool(10)
    print 'weibo comment siper start ...'
    for weibo_content_id in weibo_link_dict:
        created_at = weibo_link_dict[weibo_content_id]
        print weibo_content_id, created_at
        # parse the Weibo comments
        comment_pool.run(get_weibo_comment, (weibo_content_id, ), callback=None)
        #get_weibo_comment(weibo_content_id)
    print 'weibo comment siper end ...'
    comment_pool.close()
def siper_weibo_content(weibo_link_dict):
    content_pool = ThreadPool(10)
    print 'weibo content siper start ....'
    # collect the Weibo post content
    for weibo_content_id in weibo_link_dict:
        created_at = weibo_link_dict[weibo_content_id]
        content_pool.run(get_weibo_content, (user_id, weibo_content_id, created_at), callback=None)
        #get_weibo_content(user_id,weibo_content_id,created_at)
    print 'weibo content siper end ....'
    content_pool.close()
def unzip_7z_file_async(zipfilename, unziptodir, end_cb=None):
    if not os.path.exists(unziptodir):
        os.makedirs(unziptodir)

    def unzip(zipfilename, unziptodir, end_cb=None):
        try:
            archive = py7zr.SevenZipFile(zipfilename, mode='r')
            names = archive.getnames()
            archive.extractall(path=unziptodir)
            archive.close()
            if None != end_cb:
                end_cb(zipfilename, names)
        except Exception as e:
            print(e)

    t = ThreadPool.ThreadPool().Thread(target=unzip, args=(zipfilename, unziptodir, end_cb))
    t.start()
    return t
def getRemoteFileByUrlAsync(url, path, progress_cb=None, end_cb=None):
    file_size = 0
    try:
        file_size = int(urlopen(url).info().get('Content-Length', -1))
    except Exception as e:
        print(e)
        return None

    def getRemoteFunc(url, path, file_size, progress_cb=None, end_cb=None):
        try:
            if not os.path.exists(os.path.dirname(path)):
                os.makedirs(os.path.dirname(path))
            pbar = tqdm(total=file_size, initial=0, desc=path, unit='B', unit_scale=True)
            r = requests.get(url, stream=True, verify=False)
            curSize = 0.0
            chunksize = 1024 * 1024 * 4
            with open(path, "wb") as f:
                for chunk in r.iter_content(chunk_size=chunksize):
                    f.write(chunk)
                    curSize = curSize + len(chunk)
                    if None != progress_cb:
                        progress_cb(curSize, file_size)
                    pbar.update(chunksize)
                f.close()
            pbar.close()
            if None != end_cb:
                end_cb(path, True)
        except Exception as e:
            print(e)
            if None != end_cb:
                end_cb(path, False)

    t = ThreadPool.ThreadPool().Thread(target=getRemoteFunc, args=(url, path, file_size, progress_cb, end_cb))
    t.start()
    return t
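# A minimal usage sketch for getRemoteFileByUrlAsync above; the URL and target
# path are hypothetical placeholders. The callback signatures follow the calls
# made inside getRemoteFunc: progress_cb(bytes_so_far, total_bytes) and
# end_cb(path, success_flag). The function returns None if the size probe fails,
# otherwise the worker object, which is assumed to support join().
def on_progress(cur_size, file_size):
    pass  # e.g. update a progress bar

def on_done(path, ok):
    print(path, 'downloaded' if ok else 'failed')

t = getRemoteFileByUrlAsync('http://example.com/big.bin',  # placeholder URL
                            '/tmp/big.bin',                # placeholder path
                            progress_cb=on_progress,
                            end_cb=on_done)
if t is not None:
    t.join()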
def siper_weibo_comment(weibo_link_dict, user_id):
    comment_pool = ThreadPool(MAX_THREAD)
    print 'weibo comment siper start ...'
    for weibo_content_id in weibo_link_dict:
        created_at = weibo_link_dict[weibo_content_id]
        print weibo_content_id, created_at
        #get_weibo_comment_spider(weibo_content_id,user_id)
        # parse the Weibo comments
        comment_pool.run(get_weibo_comment_spider, (weibo_content_id, user_id, ), callback=None)
        #get_weibo_comment(weibo_content_id)
    print 'weibo comment siper end ...'
    comment_pool.close()
class WorkThread(QtCore.QThread):
    signal_str = QtCore.pyqtSignal(str)
    scan_thread_pool = ThreadPool.ThreadPoolManager(thread_num)  # thread pool

    def __init__(self, scan_range, result):
        super(WorkThread, self).__init__()
        self.scan_range = scan_range
        self.result = result

    def run(self):
        print('WorkThread is running')
        startIp = self.scan_range[0].split('.')
        endIp = self.scan_range[1].split('.')
        startPort = int(self.scan_range[2])
        endPort = int(self.scan_range[3])
        # for i in range(len(startIp)):
        #     startIp[i] = int(startIp[i])
        # for i in range(len(endIp)):
        #     endIp[i] = int(endIp[i])
        if startIp[0] != endIp[0] or startIp[1] != endIp[1]:
            self.signal_str.emit('Scan range too large, not scanning')
            return
        # Known assumption: startIp <= endIp and startPort <= endPort
        s_ip2 = int(startIp[2])
        s_ip3 = int(startIp[3])
        e_ip2 = int(endIp[2])
        e_ip3 = int(endIp[3])
        for i in range(startPort, endPort + 1):
            for j in range(s_ip3, e_ip3 + 1):
                for k in range(s_ip2, e_ip2 + 1):
                    if not stopTag:
                        dst_ip = startIp[0] + '.' + startIp[1] + '.' + str(k) + '.' + str(j)  # target ip (str)
                        dst_port = i  # target port (int)
                        # scan_info = (dst_ip, dst_port, open_port)
                        WorkThread.scan_thread_pool.add_job(scan, dst_ip, dst_port, self.result)
        print('WorkThread is exited')
def __init__(self, server_address, RequestHandlerClass, bind_and_activate=True, thread_num=10, RequestInterceptorClasses=[]):
    """Constructor.

    thread_num: default number of worker threads in the thread pool
    """
    self.requestInterceptorClasses = RequestInterceptorClasses
    # Initialize the thread pool
    self.threadPool = ThreadPool.ThreadPool(thread_num)
    # Set up the urllib opener: when the HTTP server receives a request it forwards it
    # via urllib, so the opener is installed globally here.
    cj = http.cookiejar.CookieJar()
    #proxyHandler = urllib.request.ProxyHandler({'http','127.0.0.1:8888'})  # proxy
    opener = urllib.request.build_opener(
        urllib.request.HTTPCookieProcessor(cj),
        KIZHTTPRedirectHandler,
        KIZHttpErrorHandler)
    urllib.request.install_opener(opener)
    super().__init__(server_address, RequestHandlerClass, bind_and_activate)
def spider_page(url, fid):
    content = request_url(url, 'get', '')
    soup = BeautifulSoup(content, 'lxml')
    tbody_list = soup.find('table', id='threadlisttableid').find_all('tbody')
    main_pool = ThreadPool(MAX_THREAD)
    for tbody in tbody_list:
        id = tbody.get('id')
        id = id.replace('separatorline', '').replace('stickthread', '').replace('normalthread', '').replace('_', '')
        if len(id) > 0 and id:
            content_url = 'https://bbs.ichunqiu.com/thread-' + str(id) + '-1-1.html'
            #ichunqiu_sipder(content_url,fid)
            main_pool.run(ichunqiu_sipder, (content_url, fid, ), callback=None)
    main_pool.close()
def get_weibo_comment_spider(content_id, user_id):
    url = 'https://weibo.com/aj/v6/comment/big?id=' + content_id + '&filter=all&page=1'
    try:
        page_count = get_weibo_comment_ext(url, content_id, user_id)
        comment_pool = ThreadPool(MAX_THREAD)
        for i in range(2, page_count + 1):
            url = 'https://weibo.com/aj/v6/comment/big?id=' + content_id + '&filter=all&page=' + str(i)
            try:
                #get_weibo_comment_ext(url,content_id,user_id)
                comment_pool.run(get_weibo_comment_ext, (url, content_id, user_id), callback=None)
            except Exception as e1:
                print e1
                traceback.print_exc()
                continue
        comment_pool.close()
    except Exception as e:
        traceback.print_exc()
import tkinter as tk
import tkinter.messagebox
import socket
import time
import threading
import queue
import sys

from Encryption import encryptPasswd, encodeId, decodeId, pad, readMessage, readRoomList, readUserList
from Settings import HOST, PORT, COMMAND_CODE
import ThreadPool

client_thread_pool = ThreadPool.ThreadPool(3, True)
MessageQueue = queue.Queue()
RoomsList = list()
# True for Rooms List HAVE BEEN Read, False for not
RoomsListFlag = True
UsersList = list()
# True for Users List HAVE BEEN Read, False for not
UsersListFlag = True
ReturnCode = 0
# True for message HAVE BEEN Read, False for HAVE NOT BEEN Read
ReturnCodeFlag = True
ChatRooms = dict()


def clientReceiveLogic(conn):
    global RoomsListFlag, RoomsList, ReturnCode, ReturnCodeFlag, loginPage
    conn = conn[0]
    while True:
        data = MessageQueue.get()
        Command = int.from_bytes(data[0:2], byteorder='big')
#python computeFVs.py videos vid_in vid_out
if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument("vid_path", help="Directory of the input videos", type=str)
    parser.add_argument("vid_in", help="list of input videos in .txt file", type=str)
    parser.add_argument("output_dir", help="output directory to save FVs (.fisher files)", type=str)
    parser.add_argument("gmm_list", help="File of saved list of GMMs", type=str)
    args = parser.parse_args()

    f = open(args.vid_in, 'r')
    input_videos = f.readlines()
    f.close()
    input_videos = [line.split()[0].split('/')[-1] for line in [video.rstrip() for video in input_videos]]

    ### Just to prevent overwriting already processed vids
    completed_vids = [filename.split('.')[0] for filename in os.listdir(args.output_dir) if filename.endswith('.npz')]
    overlap = [vid for vid in input_videos if vid.split('.')[0] in completed_vids]

    # Multi-threaded FV construction.
    numThreads = 10
    pool = ThreadPool.ThreadPool(numThreads)
    for vid in input_videos:
        if vid not in overlap:
            pool.add_task(processVideo, vid, args.vid_path, args.output_dir, args.gmm_list)
    pool.wait_completion()
    return asyncDecorator


if __name__ == "__main__":
    from time import sleep
    from ThreadPool import *

    try:
        xrange
    except NameError:
        xrange = range

    class TestClass():
        @Async()
        def testDecorated(self):
            print(345)

    testClass = TestClass()
    testClass.testDecorated()

    @Async(executor=ThreadPool(5))
    def func(a, b):
        print("func called")
        sleep(1)
        print("func exit:" + str(a))

    @Async()
    def funcWithoutExecutor(a):
        print(a)

    for x in xrange(1, 10):
        funcWithoutExecutor("noExecutor:" + str(x))
    for x in xrange(1, 15):
        func(x, 2)
            author = string_tag.find('a', class_='xi2').string
            break
    for date_div in soup.find_all('div', class_='cl', attrs={'style': 'font-size: 12px; color: #888888;'}):
        tmp_str = str(date_div)
        date_arr = re.findall('\d{4}-\d{1,2}-\d{1,2} \d{1,2}:\d{1,2}:\d{1,2}', tmp_str)
        if len(date_arr) > 0:
            content_date = date_arr[0]
            break
    content_id = MySQLdb.escape_string(content_id)
    title = MySQLdb.escape_string(title)
    author = MySQLdb.escape_string(author)
    content_date = MySQLdb.escape_string(content_date)
    connection = pool.connection()
    cursor = connection.cursor()
    sql = "INSERT IGNORE INTO ichunqiu_content(id,content_id,title,url,author,content_date,create_date,update_date) VALUES (DEFAULT,'" + content_id + "','" + title + "','" + url + "','" + author + "','" + content_date + "',NOW(),NOW())"
    #print sql
    cursor.execute(sql)
    connection.commit()
    cursor.close()
    connection.close()
    print 'content_id -->', content_id, ' ok'


if __name__ == '__main__':
    main_pool = ThreadPool(MAX_THREAD)
    for i in range(33742, 34832 + 1):
        url = 'https://bbs.ichunqiu.com/thread-' + str(i) + '-1-1.html'
        #ichunqiu_sipder(url)
        main_pool.run(ichunqiu_sipder, (url,), callback=None)
        #break
        # print url
    main_pool.close()
from ThreadPool import *


def tester(num):
    print("FXXXXXK %d" % (num, ))


def tester1():
    print("FXXXXXK")


if __name__ == '__main__':
    thread_pool = ThreadPool(10)
    for i in range(1000):
        thread_pool.append_job(tester, i)
    for i in range(1000):
        thread_pool.append_job(tester1)
    thread_pool.start()
    thread_pool.join()
def test():
    import random
    import time
    import datetime

    def do_work(data):
        time.sleep(random.randint(1, 3))
        res = str(datetime.datetime.now()) + "" + str(data)
        return res

    def print_result(request, result):
        print "---Result from request %s : %r" % (request.requestID, result)

    main = ThreadPool(3)
    for i in range(40):
        req = WorkRequest(do_work, args=[i], kwds={}, callback=print_result)
        main.putRequest(req)
        print "work request #%s added." % req.requestID
    print '-' * 20, main.workersize(), '-' * 20

    counter = 0
    while True:
        try:
            time.sleep(0.5)
            main.poll()
            if counter == 5:
                print "Add 3 more workers threads"
                main.createWorkers(3)
                print '-' * 20, main.workersize(), '-' * 20
            if counter == 10:
                print "dismiss 2 workers threads"
                main.dismissWorkers(2)
                print '-' * 20, main.workersize(), '-' * 20
            counter += 1
        except NoResultsPending:
            print "no pending results"
            break
    main.stop()
    print "Stop"
#!/usr/bin/python
import Queue

from ThreadPool import *
from util import *

hosts = get_hosts()
queue = Queue.Queue()
tp = ThreadPool(queue, len(hosts))
for hostname in hosts:
    tp.add_job(read_result, hostname)
tp.wait_for_complete()
#! /usr/bin/env python
from CULCrawler import CULCrawler
from ThreadPool import *
from Config import config
import urllib
from Daemon import *

if __name__ == "__main__":
    #createDaemon()
    crawler = CULCrawler()
    threadNum = 5
    pool = ThreadPool(threadNum)
    crawl_page_num = int(config.get_config('crawl_pages'))
    for i in xrange(crawl_page_num):
        url = 'http://www.citeulike.org/home/page/' + str(i + 1)
        pool.queueTask(crawler.crawl, url)
    # keywords search
    f = open("keywords", "r")
    for keyword in f.readlines():
        keyword = keyword.strip()
        query = urllib.urlencode({'q': keyword})
        url_prefix = 'http://www.citeulike.org/search/all/page/'
        for i in xrange(crawl_page_num):
            url = url_prefix + str(i + 1) + '?' + query
            #print url
            pool.queueTask(crawler.crawl, url)
    f.close()
    pool.joinAll()