def craw_urls(self, start=372):
    """Crawl listing pages and append the extracted link records to ``self.url_file``.

    For every page index in ``range(start, self.page_nums)`` the page at
    ``self.url + str(index)`` is fetched with the headers in ``self.params``,
    matched against ``self.url_regex``, and one line per match is appended
    to ``self.url_file``. Each line holds the first two captured groups, the
    last captured group, and the result of ``self.url_unqoate`` on that last
    group, joined by ``self.seperator``.

    Args:
        start: First page index to fetch. Defaults to 372, the value that
            was previously hard-coded.
    """
    for page in range(start, self.page_nums):
        request = Request.Request(self.url + str(page))
        for key, value in self.params.items():
            request.add_header(key, value)

        # Close the connection explicitly instead of relying on garbage
        # collection; try/finally works whether or not the response object
        # supports the context-manager protocol.
        response = Request.urlopen(request)
        try:
            html = response.read().decode('utf-8')
        finally:
            response.close()

        infos = re.findall(self.url_regex, html)
        rows = []
        for info in infos:
            new_url = self.url_unqoate(info[-1])
            rows.append(
                self.seperator.join([info[0], info[1], info[-1], new_url]) + "\n"
            )

        # The file is checked and written once per page, even when the page
        # produced no matches.
        tools.check_build_file(self.url_file)
        tools.write(self.url_file, content="".join(rows), mode="a")
        print(page, len(infos))
def craw_url(self, page_index, save_path):
    """Crawl one page of the weibo.cn breaking-news feed and append its links.

    The page is fetched with a fixed set of browser-like headers, matched
    against a regex with three capture groups, and for every match the
    captured URL is passed through ``self.url_unqoate`` and probed with a
    HEAD request. If the HEAD response carries a ``Location`` header, that
    value replaces the URL. Entries whose final URL contains ``pic`` or
    ``video`` are skipped; the remaining ones are appended to ``save_path``
    as tab-separated lines (first group, second group, final URL).

    Args:
        page_index: Page number appended to the feed URL.
        save_path: File the tab-separated records are appended to.

    Returns:
        The number of regex matches found on the page, including entries
        that were skipped by the ``pic``/``video`` filter.
    """
    url = "http://weibo.cn/breakingnews?page=" + str(page_index)
    # NOTE(review): the Cookie value embeds session credentials directly in
    # source; consider loading it from configuration instead.
    headers = (
        ('Host', 'weibo.cn'),
        ('User-Agent', 'Mozilla/5.0 (Windows NT 6.3; WOW64; rv:54.0) Gecko/20100101 Firefox/54.0'),
        ('Accept', 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8'),
        ('Accept-Language', 'zh-CN,zh;q=0.8,en-US;q=0.5,en;q=0.3'),
        ('Cookie', 'SCF=Ah6oK9ne4mmUNoYw4kUuNRmslSDJZqMC8SFA5i4tUHBOxdAcSzsIBEEOZfx3fQNj0BgpLdQSDXoBtnymKFxl8KA.; SUHB=0z1B6sSFzJ07wI; _T_WM=7fe561e14961c07e54388eb18a1b0902; SUB=_2A2502RrGDeRhGedG6loS-SbLzzuIHXVUJaaOrDV6PUJbkdANLVmtkW0WkE6llUm_KXMeRq22wEZ0nvVBRQ..; SSOLoginState=1507682966'),
        ('DNT', '1'),
        ('Connection', 'keep-alive'),
        ('Upgrade-Insecure-Requests', '1'),
    )
    request = Request.Request(url)
    for name, value in headers:
        request.add_header(name, value)

    # Close the connection explicitly instead of relying on garbage
    # collection; try/finally works whether or not the response object
    # supports the context-manager protocol.
    response = Request.urlopen(request)
    try:
        html = response.read().decode('utf-8')
    finally:
        response.close()

    regex = "【(.*?)】(.*?)<a href=\"(http.*?)[\u4e00-\u9fa5]*?\""
    infos = re.findall(regex, html)

    rows = []
    for info in infos:
        new_url = self.url_unqoate(info[-1])
        # NOTE(review): no timeout is passed, so a stalled server blocks
        # this call indefinitely.
        reheader = requests.head(new_url).headers
        reurl = reheader.get("Location", new_url)
        # The original filter only tested the misspelling "vedio", which
        # never matches URLs containing "video"; both spellings are checked
        # so nothing that was skipped before is now kept.
        if "pic" in reurl or "video" in reurl or "vedio" in reurl:
            continue
        rows.append('\t'.join([info[0], info[1], reurl]) + "\n")

    # The file is checked and written even when every entry was skipped.
    tools.check_build_file(save_path)
    tools.write(save_path, content="".join(rows), mode="a")
    return len(infos)