Example #1
    # Assumes: import re, urllib.request as Request, and a project-local
    # `tools` module providing check_build_file() / write(); self.url,
    # self.page_nums, self.params, self.url_regex, self.url_file,
    # self.seperator, and self.url_unqoate() are defined elsewhere on the class.
    def craw_urls(self):
        start = 372  # hard-coded start page, presumably where an earlier run left off
        for i in range(start, self.page_nums):
            # Build the request for page i and attach the stored headers.
            request = Request.Request(self.url + str(i))
            for key, value in self.params.items():
                request.add_header(key, value)
            response = Request.urlopen(request)

            html = response.read().decode('utf-8')
            # Extract one tuple per news entry via the class-level regex.
            infos = re.findall(self.url_regex, html)
            save_content = ""
            for info in infos:
                # Decode the raw link and keep both forms in the output row.
                new_url = self.url_unqoate(info[-1])
                new_info = [info[0], info[1], info[-1], new_url]
                save_content += self.seperator.join(new_info) + "\n"

            # Create the output file if missing, then append this page's rows.
            tools.check_build_file(self.url_file)
            tools.write(self.url_file, content=save_content, mode="a")
            print(i, len(infos))  # simple progress log: page index, entry count
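
Both examples lean on a project-local `tools` module that is not included in the listing. A minimal sketch of the two helpers, assuming `check_build_file` only creates the file (and any missing parent directories) and `write` writes text in the given mode:

    # Hypothetical stand-in for the project-local `tools` module used above;
    # the real helpers are not shown in the listing.
    import os

    def check_build_file(path):
        # Create missing parent directories, then an empty file if needed.
        directory = os.path.dirname(path)
        if directory and not os.path.isdir(directory):
            os.makedirs(directory)
        if not os.path.exists(path):
            open(path, "w", encoding="utf-8").close()

    def write(path, content="", mode="a"):
        # Write text to the file in the requested mode ("a" appends).
        with open(path, mode, encoding="utf-8") as f:
            f.write(content)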
Example #2
    # Assumes: import re, requests, urllib.request as Request, and the same
    # project-local `tools` module as Example #1.
    def craw_url(self, page_index, save_path):
        url = "http://weibo.cn/breakingnews?page=" + str(page_index)
        headers = {
            'Host': 'weibo.cn',
            'User-Agent': 'Mozilla/5.0 (Windows NT 6.3; WOW64; rv:54.0) Gecko/20100101 Firefox/54.0',
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
            'Accept-Language': 'zh-CN,zh;q=0.8,en-US;q=0.5,en;q=0.3',
            'Cookie': 'SCF=Ah6oK9ne4mmUNoYw4kUuNRmslSDJZqMC8SFA5i4tUHBOxdAcSzsIBEEOZfx3fQNj0BgpLdQSDXoBtnymKFxl8KA.; SUHB=0z1B6sSFzJ07wI; _T_WM=7fe561e14961c07e54388eb18a1b0902; SUB=_2A2502RrGDeRhGedG6loS-SbLzzuIHXVUJaaOrDV6PUJbkdANLVmtkW0WkE6llUm_KXMeRq22wEZ0nvVBRQ..; SSOLoginState=1507682966',
            'DNT': '1',
            'Connection': 'keep-alive',
            'Upgrade-Insecure-Requests': '1',
        }
        request = Request.Request(url)
        for key, value in headers.items():
            request.add_header(key, value)
        response = Request.urlopen(request)
        html = response.read().decode('utf-8')
        # Match entries shaped like 【title】summary<a href="http...">, keeping
        # any trailing Chinese characters ([\u4e00-\u9fa5]) out of the captured URL.
        regex = "【(.*?)】(.*?)<a href=\"(http.*?)[\u4e00-\u9fa5]*?\""
        infos = re.findall(regex, html)
        save_content = ""
        for info in infos:
            new_url = self.url_unqoate(info[-1])
            # Weibo short links usually redirect; resolve one hop with a HEAD
            # request, falling back to the original URL when there is no redirect.
            reheader = requests.head(new_url).headers
            if "Location" in reheader:
                reurl = reheader["Location"]
            else:
                reurl = new_url
            # Skip picture and video posts; "vedio" preserves the original
            # misspelling in case the target URLs actually use it.
            if "pic" in reurl or "video" in reurl or "vedio" in reurl:
                continue
            new_info = [info[0], info[1], reurl]
            save_content += '\t'.join(new_info) + "\n"

        # Create the output file if missing, then append this page's rows.
        tools.check_build_file(save_path)
        tools.write(save_path, content=save_content, mode="a")
        return len(infos)  # number of entries found on this page
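
Both methods also call a self.url_unqoate() helper (apparently a misspelling of "unquote") whose definition is not shown. A minimal sketch, assuming it only percent-decodes the href captured by the regex:

    # Hypothetical sketch of the url_unqoate helper; the real implementation
    # is not shown in the listing.
    from urllib.parse import unquote

    class Crawler:
        def url_unqoate(self, raw_url):
            # Percent-decode the captured href, e.g.
            # "http%3A%2F%2Fexample.com%2Fa" -> "http://example.com/a"
            return unquote(raw_url)

With the class assembled, Example #2 could be driven page by page, e.g. for page in range(1, 10): crawler.craw_url(page, "news.txt") (names assumed for illustration).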