# url = "http://list.jd.hk/list.html?cat=1319%2C1525%2C1548&gjz=0&go=0" # soup = commodity_list._Analyze_Soup(url) # 爬取完整的目录URL信息 try: lists = soup.find_all('ul', {'class': 'menu-drop-list clearfix'}) for list in lists: index = list.find_all('li') for li in index: name = li.a.string url = li.a['href'] print(name) head = "http://list.jd.hk" url = name + "@@@" + head + url print(url) # 写入txt文件 file.write(url + "\n") except: pass print("运行终了") if __name__ == '__main__': with open('JD_menu+_url.txt', mode='w', encoding="utf-8") as file: _All_Index() #文件去重 import Deduplication Deduplication._Deduplication("JD_menu+_url", "JD_index_url")
# Read the previously crawled static page from disk and build a BeautifulSoup object.
def _JDURL(html_path="/Users/fuzhe/PycharmProjects/Python_other_Tools/Python_other_Tools/JD/jdhk.html"):
    """Parse the locally saved JD page and write "name@@@url" category lines.

    Writes one line per category anchor to the globally opened `file`
    (opened in the ``__main__`` guard below).

    Args:
        html_path: Path of the saved static HTML page. Defaults to the
            original hard-coded location, kept for backward compatibility.
    """
    # `with` closes the handle — the original passed a bare open() to
    # BeautifulSoup and leaked the file object.
    with open(html_path, encoding="utf-8") as page:
        soup = BeautifulSoup(page)
    # Category-directory related info.
    items = soup.find_all('div', {'class': 'item'})
    for item in items:
        for child in item.find_all():
            if child.a is None:
                continue
            try:
                name = child.a.string
                url = child.a['href']
                print(name)
                print(url)
                file.write(name + "@@@" + url + '\n')
            except (AttributeError, KeyError, TypeError):
                # Anchor without an href, or non-string contents — skip it.
                # (Was a bare `except: pass` that hid every error.)
                continue


if __name__ == '__main__':
    with open('JD_menu_urls.txt', mode='w', encoding="utf-8") as file:
        _JDURL()
        Deduplication._Deduplication("JD_menu_urls.txt", "JD_menu_url.txt")
        print("运行终了")