if task_list.is_empty(): print("所有任务已完成") sock.close() break # 等待并接收来自Slave的连接(程序会在这里停住,直到有Slave连接) conn, addr = sock.accept() conn.settimeout(10) try: # 接收到来自Slave的请求(程序会在这里停住,直到接收到Slave的消息) req = conn.recv(1024).decode("utf-8") if req.startswith("get"): # 如果Slave发送的消息以"get"开头,则给它发回一个用来爬取的URL # 消息的格式:"get,123456" # slave_id取得发来消息的Slave的ID slave_id = req.split(",")[1] task_url = task_list.get_task() # 把url发给Slave print("向'Slave {0}' 分配爬取 '{1}'".format(slave_id, task_url)) conn.send(task_url.encode("utf-8")) elif req.startswith("done"): # 如果Slave发送的消息以"done"开头,说明它是在告诉master它完成了一个任务 # 消息的格式:"done,123456,https://finance.sina.com.cn/china/gncj/2018-10-17/doc-ifxeuwws5236619.shtml" # slave_id取得发来消息的Slave的ID,done_url取得Slave发来的完成爬取的页面的链接 slave_id = req.split(",")[1] done_url = req.split(",")[2] # 这里将已爬取的页面完全从任务列表中删除 print("'Slave {0}' 完成爬取 '{1}'".format(slave_id, done_url)) task_list.done_task(done_url) conn.send("ok".encode("utf-8")) except socket.timeout: print("套接字连接超时")
def _fetch_html(url, code="gbk", timeout=10):
    """Download *url* and return its body decoded with *code*.

    Returns an empty string when the request fails for any reason reported
    by ``requests`` (connection failure, timeout, non-2xx status), so the
    caller can treat "" as "nothing could be fetched".
    """
    try:
        # A timeout keeps a stalled web server from hanging start-up forever.
        response = requests.get(url, timeout=timeout)
        response.raise_for_status()
    except requests.exceptions.RequestException:
        return ""
    response.encoding = code
    return response.text


def _extract_news_urls(html):
    """Return the article links found in the ``list_item clearfix`` divs.

    Each link is the ``href`` of the first ``<a>`` inside the first ``<h2>``
    of a matching div; divs without that structure are skipped. Every link
    found is also printed.
    """
    soup = BeautifulSoup(html, 'html.parser')
    news_url = []
    for item in soup.find_all('div', attrs={'class': 'list_item clearfix'}):
        try:
            url = item.find(name='h2').find(name='a').attrs['href']
        except (AttributeError, KeyError):
            # Missing <h2>/<a> (find() returned None) or no href attribute.
            continue
        news_url.append(url)
        print(url)
    return news_url


def _serve_client(conn, task_list):
    """Read one request from *conn* and answer it.

    Wire format (gbk-encoded text):
      * ``get,<client_id>``        -> reply with a URL to crawl
      * ``done,<client_id>,<url>`` -> mark <url> as finished, reply ``ok``

    Anything else is reported and ignored instead of crashing the server.
    """
    data = conn.recv(1024).decode("gbk")
    # maxsplit=2 keeps commas that are part of the URL in a "done" message.
    parts = data.split(',', 2)
    command = parts[0]
    if command == "get" and len(parts) >= 2:
        client_id = parts[1]
        # NOTE(review): assumes get_task() always returns a str here; if it
        # can return None while tasks are still pending, .encode() below
        # will raise -- confirm against TaskList.
        task_url = task_list.get_task()
        print("向client {0} 分配 {1}".format(client_id, task_url))
        conn.send(task_url.encode("gbk"))
    elif command == "done" and len(parts) >= 3:
        client_id = parts[1]
        client_url = parts[2]
        print("client {0}' 完成爬取 {1}".format(client_id, client_url))
        task_list.done_task(client_url)
        conn.send("ok".encode("gbk"))
    else:
        print("Malformed request: {0!r}".format(data))


def main():
    """Run the master: collect article URLs, then hand them out to clients.

    Scrapes the index page for article links, stores them in a ``TaskList``
    and then serves TCP clients on port 9992 one connection at a time until
    the task list reports that it is empty.
    """
    listen_addr = "0.0.0.0"
    port = 9992
    main_url = "http://money.163.com/special/00252C1E/gjcj.html"
    client_timeout = 10  # seconds to wait for a connected client's request

    task_list = TaskList(timeout=30)
    sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
    try:
        sock.bind((listen_addr, port))
        sock.listen(50)

        print("正在从网页中解析URL链接...")
        html = _fetch_html(main_url)
        try:
            if html == "":
                print("---html error1!---")
            task_list.put_tasks(_extract_news_urls(html))
        except Exception:
            # Best effort: a parsing failure leaves the task list as it is.
            print("---url error2!---")

        print("等待client中.......")
        while not task_list.is_empty():
            # Blocks until a client connects.
            conn, client_addr = sock.accept()
            print('Connected by\n', client_addr, conn)
            try:
                # Without a timeout a silent client would block the master
                # forever and the socket.timeout handler could never fire.
                conn.settimeout(client_timeout)
                _serve_client(conn, task_list)
            except socket.timeout:
                print("Timeout!")
            except (OSError, UnicodeDecodeError) as exc:
                # One broken client must not take the whole master down.
                print("Connection error: {0}".format(exc))
            finally:
                conn.close()
        print("====任务完成====")
    finally:
        sock.close()