        self.thread_num = thread_num

    def new_worker(self):
        def start_thread_manager():
            manager = MultiThreadWorkerManager(self.exit_flag, self.job_queue,
                                               self.done_queue, self.thread_num,
                                               self.worker_constructor,
                                               self.constructor_args,
                                               self.constructor_kwargs)
            manager.monitor_worker()

        # Each worker is a separate process running its own thread manager
        return multiprocessing.Process(target=start_thread_manager)


if __name__ == '__main__':
    begin_time = time.clock()

    name_red = 'testA'
    name_eu = 'testB'
    redis_que = RedisQueues(name_red)
    eu_que = EuQueue(name_eu, redis_que)
    # eu_que = EuQueue()

    hosts = ["http://www.yahoo.com", "http://google.com", "http://www.amazon.com",
             "http://www.taobao.com/", "http://www.apple.com", "http://www.baidu.com"]

    # Extract the URLs from the pages already downloaded and save them to the queue
    for item in hosts:
        print "item1:", item
        redis_que.put(item)

    exit_flag = multiprocessing.Value('i', 0, lock=True)
    print exit_flag

    worker_num = 1
    max_job_num = 1
    # crawler_mas = MultiThreadWorkerManager(exit_flag, redis_que, eu_que,
    #                                        worker_num, CuWorker, max_job_num)
    crawler_mas = MultiProcessMultiThreadWorkerManager(exit_flag, redis_que, eu_que,
                                                       worker_num, CuWorker, max_job_num)
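# --- Illustrative sketch (not part of the original source): a minimal
# RedisQueues-style FIFO wrapper backed by redis-py, assuming the real class
# (defined elsewhere in this repo) exposes the put/get/length/delete methods
# used above. Connection parameters are placeholders.
import redis

class RedisQueueSketch(object):
    def __init__(self, name, host='localhost', port=6379, db=0):
        self.name = name
        self.conn = redis.StrictRedis(host=host, port=port, db=db)

    def put(self, item):
        # Push onto the tail of the Redis list
        self.conn.rpush(self.name, item)

    def get(self, timeout=0):
        # Blocking pop from the head; returns None on timeout
        result = self.conn.blpop(self.name, timeout=timeout)
        return result[1] if result else None

    def length(self):
        return self.conn.llen(self.name)

    def delete(self):
        self.conn.delete(self.name)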
    # time.sleep(60)
    print 'the information about all people has saved!'
    print 'Done, exiting'

# url = 'https://s3.amazonaws.com/codecave/tut1.html'
# url = 'http://www.douban.com/group/search?cat=1019&q=%E6%9C%88%E5%AB%82'
# data_words = ['妈妈','宝宝','怀孕','待产包','月子餐','月嫂']

if __name__ == '__main__':
    # pool = multiprocessing.Pool(processes=4)
    start = time.clock()
    dbs = MongoCRUD()

    # Seed keywords for the Douban group search (mom/baby/pregnancy terms)
    data_words = ['妈妈', '宝宝', '怀孕', '待产包', '月子餐', '月嫂', '妈咪']
    data_words = ['妈妈']  # restrict to a single keyword for this run

    group_queue = RedisQueues('GROUP_QUEUE1')
    group_queue.delete()

    # all = get_url_all(data_words, group_queue)
    # get_url_all(data_words, group_queue, dbs)
    # print len(all)
    # print 'Store the Douban user UIDs in the database'
    # dbs.people_url_insert(all)
    save_data()

    elapsed = (time.clock() - start)
    print("Time used:", elapsed)

    '''
    people_queue = RedisQueues('PEOPLE_QUEUE1')
    dbs = MongoCRUD()
    # get_data(data_words, group_queue, people_queue)
    people_all = get_data(data_words, group_queue, people_queue)
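# --- Illustrative sketch (not part of the original source): one plausible shape
# for the commented-out get_url_all() step above -- build a Douban group-search
# URL per keyword and push it onto the Redis queue. The cat=1019 value and the
# URL pattern come from the commented example URL above; everything else is assumed.
import urllib

def get_url_all_sketch(data_words, group_queue):
    for word in data_words:
        url = 'http://www.douban.com/group/search?cat=1019&q=%s' % urllib.quote(word)
        group_queue.put(url)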
        # (fragment: continuation of the status == 'OK' branch of parse_html)
            self.crud.save_map_data_insert(results)
            if 'next_page_token' in josn_response:
                # Follow Google Places pagination: swap in the next_page_token
                pagetoken = '&pagetoken=%s' % josn_response['next_page_token']
                url = re.sub(r'&pagetoken=.*', pagetoken, url)
                save_url_ToQueue(url)
                # self.parse_html(url)
            else:
                pass
        elif status == 'OVER_QUERY_LIMIT':
            # Current API key is exhausted: rotate to the next key and retry
            self.key = self.app_keys_pop()
            url = re.sub(r'&key=.*&pagetoken', '&key=%s&pagetoken' % self.key, url)
            save_url_ToQueue(url)
            # self.parse_html(url)
        else:
            return

# from datetime import datetime

if __name__ == "__main__":
    # border_location_right = {
    #     "lat": 24.970,
    #     "lng": 121.666
    # }
    # print gpp.get_url(app_keys[0], border_location_right, 'food')
    name_red = 'testA'
    redis_que = RedisQueues(name_red)
    print redis_que.length()

    gpp = GooglePlacesParser(redis_que)
    gpp.save_url_ToQueue()
    print redis_que.length()
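# --- Illustrative sketch (not part of the original source): the two URL rewrites
# used in parse_html above, shown in isolation. The sample URL, token, and key
# values are made up; the regular expressions are the ones from the code above.
import re

def demo_url_rewrites():
    url = ('https://maps.googleapis.com/maps/api/place/nearbysearch/json'
           '?location=24.970,121.666&radius=500&type=food'
           '&key=OLD_KEY&pagetoken=OLD_TOKEN')

    # 1) Pagination: replace everything after '&pagetoken=' with the new token.
    url = re.sub(r'&pagetoken=.*', '&pagetoken=%s' % 'NEW_TOKEN', url)

    # 2) Key rotation on OVER_QUERY_LIMIT: swap the exhausted key for a fresh one,
    #    leaving the trailing '&pagetoken' part intact.
    url = re.sub(r'&key=.*&pagetoken', '&key=%s&pagetoken' % 'NEW_KEY', url)

    print url  # ...&key=NEW_KEY&pagetoken=NEW_TOKEN

demo_url_rewrites()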
        # (fragment: continues from the status check in parse_html)
            else:
                pass
        elif status == 'OVER_QUERY_LIMIT':
            # Rotate to a fresh API key and retry the same request
            self.key = self.app_keys_pop()
            url = re.sub(r'&key=.*&pagetoken', '&key=%s&pagetoken' % self.key, url)
            self.parse_html(url)
        else:
            return

    '''
    def run(self):
        all_locations = self.crud.read_all_locations()
        if len(self.keys) > 0:
            for location in all_locations:
                for type in types:
                    url = self.get_url(location, type)
                    print url
                    self.parse_html(url)
                self.crud.update_location_status(location['_id'])
        else:
            print "*------*-*all app keys have been used*-*-----*"
            sys.exit()
    '''


if __name__ == '__main__':
    name_red = 'testA'
    redis_que = RedisQueues(name_red)
    # eu_que = EuQueue(redis_que)
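# --- Illustrative sketch (not part of the original source): one plausible shape for
# get_url as called in the commented-out run() above. Assumptions: `location` is a
# dict with 'lat'/'lng' keys (as in the commented border_location_right example
# earlier), self.key holds the current Google Places API key, and the radius value
# is an arbitrary placeholder. The trailing empty '&pagetoken=' simply gives the
# re.sub rewrites in parse_html something to match on the first request.
def get_url_sketch(self, location, type):
    return ('https://maps.googleapis.com/maps/api/place/nearbysearch/json'
            '?location=%s,%s&radius=500&type=%s&key=%s&pagetoken='
            % (location['lat'], location['lng'], type, self.key))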