def __init__(self):
    self.headers_one = {
        'Accept': '*/*',
        'Accept-Encoding': 'gzip, deflate, sdch',
        'Accept-Language': 'zh-CN,zh;q=0.8',
        'Cache-Control': 'max-age=0',
        # 'Connection': 'keep-alive',
        'Cookie': 'cn_1255169715_dplus=%7B%22distinct_id%22%3A%20%2216730471952668-0ecf0ba7ae41cb-414f0120-15f900-16730471953461%22%2C%22sp%22%3A%20%7B%22%24_sessionid%22%3A%200%2C%22%24_sessionTime%22%3A%201542776168%2C%22%24dp%22%3A%200%2C%22%24_sessionPVTime%22%3A%201542776168%7D%7D; UM_distinctid=16730471952668-0ecf0ba7ae41cb-414f0120-15f900-16730471953461; JSESSIONID=208cee9fea61049d61e7d18f9e9c275ecf530a9e308a94dde36658adc01a0594; wuid=154945905891357; wuid_createAt=2018-11-21 12:56:9',
        'Host': 'www.yidianzixun.com',
        'Referer': 'http://www.yidianzixun.com/channel/c11',
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/49.0.2623.112 Safari/537.36',
        'X-Requested-With': 'XMLHttpRequest'
    }
    self.proxies = ['218.95.55.154:4243']
    # Dedup list of already-crawled URLs
    self.set_list = []
    # URLs that failed to parse (appended to in get_news_list_port)
    self.error_url_list = []
    # Cookie and channel-id pool; entries look like:
    # {'channel_id': '', 'cookies': ''},
    # Compute the crawl window from the system time
    date = datetime.now() - timedelta(days=3)  # three days ago, excluding today
    str_time = str(date).split(' ')[0]
    yesterday = datetime.now() - timedelta(days=1)  # yesterday
    now_time = str(yesterday).split(' ')[0]
    print('Crawl window: {} to {}'.format(str_time, now_time))
    logging.info('Crawl window: {} to {}'.format(str_time, now_time))
    # Start of the window (y-m-d, farther from now)
    self.start_time = str_time
    # End of the window (y-m-d, closer to now)
    self.end_time = now_time
    try:
        self.page_ip = proxies.res_ip()
        # self.page_ip = '115.219.77.241:2316'
    except Exception:
        time.sleep(3)
        print('Error while fetching a proxy IP: {}'.format(traceback.format_exc()))
        logger.error('Error while fetching a proxy IP: {}'.format(traceback.format_exc()))
        self.page_ip = proxies.res_ip()
    self.ip_count = 0
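# The date-window logic above (repeated in the other __init__ methods) leans
# on str() formatting plus split(). A minimal sketch of the same computation
# with strftime; `crawl_window` is a hypothetical helper, not part of the
# original classes:
def crawl_window(days_back=3):
    """Return (start, end) 'YYYY-MM-DD' strings: `days_back` days ago through yesterday."""
    start = (datetime.now() - timedelta(days=days_back)).strftime('%Y-%m-%d')
    end = (datetime.now() - timedelta(days=1)).strftime('%Y-%m-%d')
    return start, end

# e.g. on 2019-01-16: crawl_window() -> ('2019-01-13', '2019-01-15')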
def run(self):
    url_list = [
        # Sports
        'http://www.yidianzixun.com/home/q/news_list_for_channel?channel_id=13402171666&cstart=0&cend=10&infinite=true&refresh=1&__from__=wap&_spt=yz~eaod%3B9%3E%3A8%3B%3D%3B%3C%3C%3C%3A%3B%3A&appid=web_yidian&_={}',
        # NBA
        'http://www.yidianzixun.com/home/q/news_list_for_channel?channel_id=13402171682&cstart=0&cend=10&infinite=true&refresh=1&__from__=wap&_spt=yz~eaod%3B9%3E%3A8%3B%3D%3B%3C28%3A%3B%3A&appid=web_yidian&_={}',
        # Finance
        'http://www.yidianzixun.com/home/q/news_list_for_channel?channel_id=13402171698&cstart=0&cend=10&infinite=true&refresh=1&__from__=wap&_spt=yz~eaod%3B9%3E%3A8%3B%3D%3B%3C32%3A%3B%3A&appid=web_yidian&_={}'
    ]
    for get_url in url_list:
        for i in range(2):
            try:
                for j in range(30):
                    # Fill the `_` slot with a truncated epoch timestamp as a cache buster
                    url = get_url.format(str(time.time()).replace('.', '')[:-4])
                    try:
                        self.get_news_list_port(url)
                    except requests.exceptions.ProxyError:
                        print(traceback.format_exc())
                        break
            except TypeError:
                print(traceback.format_exc())
                logger.error('Content parsing error')
            except Exception:
                print(traceback.format_exc())
                logger.error('Other error')
                time.sleep(10)
                self.page_ip = proxies.res_ip()
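# The `_` cache-buster above is built by string surgery on time.time(). A
# hedged sketch of the same idea expressed directly as a millisecond
# timestamp (the digit count can differ by one from the original, which
# depends on how the float happens to print):
def cache_buster():
    return str(int(time.time() * 1000))

# url = get_url.format(cache_buster())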
def get_channel_id(self):
    url = 'http://www.yidianzixun.com/channel/c11'
    try:
        response = requests.get(url, proxies={'http': self.page_ip}, timeout=30)
        data = response.content.decode()
        # '汽车' ("Auto") anchors the slice of page source that holds the channel id
        data = re.search('channel_id(.*?)汽车', data).group(0)
        channel_id = re.search(r'\d{8,15}', data).group(0)
        cookies = response.headers['Set-Cookie']
        print(cookies)
        session_id = re.search(r'JSESSIONID=([a-z0-9]{30,80});', cookies).group(1)
        return channel_id, session_id
    except Exception:
        print(traceback.format_exc())
        if self.ip_count < 10:
            self.page_ip = proxies.res_ip()
            print('Switching IP: ', self.page_ip)
            self.ip_count += 1
            time.sleep(5)
            # `return` is required here, otherwise the retried result is silently dropped
            return self.get_channel_id()
        else:
            raise IndexError
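# The recursive retry above grows the call stack on every failure. A sketch
# of the same policy as a loop; `get_channel_id_once` is a hypothetical
# single-attempt variant, not a method of the original class:
def get_channel_id_with_retry(self, max_attempts=10):
    for attempt in range(max_attempts):
        try:
            return self.get_channel_id_once()
        except Exception:
            self.page_ip = proxies.res_ip()  # rotate to a fresh proxy
            self.ip_count += 1
            time.sleep(5)
    raise IndexError('channel_id not found after {} attempts'.format(max_attempts))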
def run(self):
    url = 'http://www.yidianzixun.com/home/q/news_list_for_channel'
    get_time = ''.join(str(time.time()).split('.'))[:-2]
    start = 0
    for i in range(1, 16):
        time.sleep(5)
        print('Outer loop pass {}'.format(str(i)))
        try:
            channel_id, jession_id = self.get_channel_id()
            print(channel_id)
            for j in range(40):
                cookie = 'UM_distinctid=1683bb2492431a-094bc6de6a2a34-5d1e331c-15f900-1683bb249257ee; JSESSIONID={}; wuid=789835887113804; wuid_createAt=2019-01-16 9:36:16; weather_auth=2; Hm_lvt_15fafbae2b9b11d280c79eff3b840e45=1547602576; Hm_lpvt_15fafbae2b9b11d280c79eff3b840e45=1547602576; CNZZDATA1255169715=432644633-1547597213-http%253A%252F%252Fwww.yidianzixun.com%252F%7C1547597213; cn_1255169715_dplus=%7B%22distinct_id%22%3A%20%221683bb2492431a-094bc6de6a2a34-5d1e331c-15f900-1683bb249257ee%22%2C%22sp%22%3A%20%7B%22%24_sessionid%22%3A%200%2C%22%24_sessionTime%22%3A%201547602597%2C%22%24dp%22%3A%200%2C%22%24_sessionPVTime%22%3A%201547602597%7D%7D; captcha=s%3A229eaf5dc322b2b9390527804ef8124c.yGCMPLr%2FjIqPwCriUMcf0ZznNz7D3ZAabmz2DZWLIVM'.format(jession_id)
                ip = self.page_ip
                spt = self.get_spt(start, channel_id)  # build the signed _spt parameter
                print(spt)
                end = start + 10
                # NOTE: start is never advanced, so every request re-fetches
                # the head of the feed (items 0-10) for fresh content
                params = {
                    'channel_id': channel_id,
                    'cstart': start,
                    'cend': end,
                    'infinite': 'true',
                    'refresh': '1',
                    '__from__': 'pc',
                    'multi': '5',
                    '_spt': spt,
                    'appid': 'web_yidian',
                    '_': get_time
                }
                self.get_news_list_port(url, params, cookie, ip)
        except TypeError:
            logger.error('Content parsing error: {}'.format(traceback.format_exc()))
        except Exception:
            logger.error('Other error: {}'.format(traceback.format_exc()))
            self.page_ip = proxies.res_ip()
    # To lift the harvest count, run one extra pass through Baidu
    logger.info('Starting to collect via Baidu......')
    for i in range(0, 25):
        i = i * 10
        self.get_news_url(str(i))
    print(list(set(self.error_url_list)))
    logger.info('Crawl finished......')
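# The cookie string above is a hard-coded browser capture with the fresh
# JSESSIONID spliced in. A sketch of assembling it from parts instead; the
# field values here are illustrative, lifted from that capture:
def build_cookie(jsession_id):
    fields = [
        ('UM_distinctid', '1683bb2492431a-094bc6de6a2a34-5d1e331c-15f900-1683bb249257ee'),
        ('JSESSIONID', jsession_id),
        ('wuid', '789835887113804'),
        ('weather_auth', '2'),
    ]
    return '; '.join('{}={}'.format(k, v) for k, v in fields)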
def get_news_list_port(self, url, params, cookie, ip):
    self.headers_one['Cookie'] = cookie
    response = requests.get(url, params=params, headers=self.headers_one,
                            proxies={'https': ip})
    print(response.url)
    data = json.loads(response.content.decode())
    data = data['result']
    print(data)
    if data:
        for news in data:
            item = {}
            item['title'] = news['title']
            itemid = news['itemid']
            # `news_url`, not `url`: reusing the parameter name here would make
            # the empty-result retry below re-request an article URL
            news_url = 'http://www.yidianzixun.com/article/' + itemid
            news_date = news['date']
            get_date = re.search(r'\d{4}-\d{2}-\d{2}', news_date).group(0)
            if 'V_' not in itemid:
                if news_url not in self.set_list:
                    # self.write_news_jsonfile(item)
                    try:
                        print(news_url)
                        self.get_news_page_info(news_url)
                        self.set_list.append(news_url)
                    except IndexError:
                        print('Page parsing error', news_url)
                        self.error_url_list.append(news_url)
                        self.page_ip = proxies.res_ip()
                        time.sleep(10)
                        print('Switched IP:', self.page_ip)
    else:
        # Empty result: swap in a fresh proxy and retry (unbounded while empty)
        time.sleep(10)
        print('Retrying......')
        self.page_ip = proxies.res_ip()
        self.get_news_list_port(url, params, cookie, self.page_ip)
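# self.set_list does dedup with list membership, an O(n) scan per check. A
# sketch of the same bookkeeping with a set (O(1) average); a drop-in swap
# only if no other code relies on the list's ordering:
seen_urls = set()

def is_new(url):
    if url in seen_urls:
        return False
    seen_urls.add(url)
    return True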
def get_news_url(self, carts):
    """Search Baidu and collect the URLs of matching news pages.
    :param carts: value for the q1 search slot (callers pass a result offset)
    :return:
    """
    # Time range: from seven days ago up to now, as Unix timestamps
    get_time = time.time()
    str_time = str(get_time)[:-4]
    date = datetime.now() - timedelta(days=7)
    a = str(date)[:-7]
    timeArray = time.strptime(a, "%Y-%m-%d %H:%M:%S")
    # Convert to a Unix timestamp
    timeStamp = int(time.mktime(timeArray))
    end_time = str(timeStamp) + '.' + str_time.split('.')[1]
    print(str_time, end_time)
    # gpc=stf%3D<older>%2C<newer>%7Cstftype%3D1 bounds results by date
    url = 'https://www.baidu.com/s?q1={}&q2=&q3=&q4=&gpc=stf%3D{}%2C{}%7Cstftype%3D1&ft=&q5=&q6=xiaohongshu.com&tn=baiduadv'.format(
        str(carts), end_time, str_time)
    print(url)
    # ip = random.choice(self.proxies_list)
    response = requests.get(url, headers=self.headers_one, verify=False, timeout=30)  # , proxies={'https': ip}
    content = etree.HTML(response.content.decode())
    if content.xpath('.//h3[@class="t"]/a/@href'):
        url_list = content.xpath('.//h3[@class="t"]/a/@href')
        print(url_list)
        for url in url_list:
            news_url = url.replace('http', 'https')
            try:
                self.get_news_page(news_url)
            except KeyError:
                print('The IP may be banned')
            except IndexError:
                print('Possible slider captcha')
                if self.ip_count < 100:
                    print('Switching IP......')
                    self.ip = proxies.res_ip()
                    self.ip_count += 1
                    self.get_news_page(news_url)
                else:
                    print('IP budget exhausted')
            # time.sleep(2)
    elif content.xpath('.//div[@class="content_none"]/div/p//text()'):
        # "No results" page
        txt = content.xpath('.//div[@class="content_none"]/div/p//text()')
        print(txt)
    else:
        print('Other error', url)
    print('------------------------------------------------')
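# The stf window above is stitched together from string slices of two
# timestamps. A sketch of the same parameter built from plain arithmetic
# (unencoded form; the real URL percent-encodes '=', ',' and '|'):
def baidu_stf_window(days_back=7):
    now = time.time()
    start = now - days_back * 24 * 3600
    return 'stf={:.6f},{:.6f}|stftype=1'.format(start, now)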
def run(self):
    for url in open('./../toutiao/new_url_file.json'):
        if self.ip_count < 150:
            url = url.strip()
            print('A crawler is fetching {}'.format(url))
            logger.info('A crawler is fetching {}'.format(url))
            try:
                self.get_news_page(url, self.ip)
            except requests.exceptions.ProxyError:
                print('Remote connection unresponsive, retrying once.......')
                try:
                    if self.ip_count < 150:
                        print('Switching IP......')
                        self.ip = proxies.res_ip()
                        self.ip_count += 1
                        self.get_news_page(url, self.ip)
                    else:
                        self.get_news_page(url, self.ip)
                except requests.exceptions.ProxyError:
                    print('Retry failed....remote connection unresponsive......')
            except Exception as e:
                print('Other exception: {}'.format(e))
                print('Switching IP......')
                time.sleep(10)
                self.ip = proxies.res_ip()
                self.ip_count += 1
                try:
                    self.get_news_page(url, self.ip)
                except Exception:
                    pass
            time.sleep(1)
            print('Finished one URL.....')
        else:
            print('Already used {} IPs, crawler stopping......'.format(str(self.ip_count)))
            break
    logger.info('Crawl finished......')
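# The nested try/except blocks above all encode one policy: retry with a
# fresh proxy until the IP budget runs out. A sketch of that policy as a
# single hypothetical method (not in the original class):
def fetch_with_retry(self, url, max_tries=3):
    last_err = None
    for _ in range(max_tries):
        try:
            return self.get_news_page(url, self.ip)
        except requests.exceptions.ProxyError as err:
            last_err = err
            if self.ip_count >= 150:
                break
            self.ip = proxies.res_ip()
            self.ip_count += 1
            time.sleep(10)
    raise last_err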
def __init__(self, file_path, comment_path):
    self.headers_two = {
        'Accept': '*/*',
        'Accept-Encoding': 'gzip, deflate, sdch',
        'Accept-Language': 'zh-CN,zh;q=0.8',
        'Cache-Control': 'max-age=0',
        # 'Connection': 'keep-alive',
        'Cookie': 'cn_1255169715_dplus=%7B%22distinct_id%22%3A%20%2216730471952668-0ecf0ba7ae41cb-414f0120-15f900-16730471953461%22%2C%22sp%22%3A%20%7B%22%24_sessionid%22%3A%200%2C%22%24_sessionTime%22%3A%201542776168%2C%22%24dp%22%3A%200%2C%22%24_sessionPVTime%22%3A%201542776168%7D%7D; UM_distinctid=16730471952668-0ecf0ba7ae41cb-414f0120-15f900-16730471953461; JSESSIONID=208cee9fea61049d61e7d18f9e9c275ecf530a9e308a94dde36658adc01a0594; wuid=154945905891357; wuid_createAt=2018-11-21 12:56:9',
        'Host': 'www.baidu.com',
        'Referer': 'http://www.yidianzixun.com/channel/c11',
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/49.0.2623.112 Safari/537.36',
        'X-Requested-With': 'XMLHttpRequest'
    }
    self.proxies = ['218.95.55.154:4243']
    # Dedup list of already-crawled URLs
    self.set_list = []
    # URLs that failed to parse
    self.error_url_list = []
    self.headers_one = {
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
        'Accept-Encoding': 'gzip, deflate, br',
        'Accept-Language': 'zh-CN,zh;q=0.9',
        'Host': 'www.baidu.com',
        # 'Proxy-Connection': 'keep-alive',
        'Upgrade-Insecure-Requests': '1',
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.80 Safari/537.36'
    }
    self.user_agent = [
        'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.80 Safari/537.36',
        'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/71.0.3578.80 Safari/537.1',
        'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.6 (KHTML, like Gecko) Chrome/71.0.3578.80 Safari/536.6',
        'Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.6 (KHTML, like Gecko) Chrome/71.0.3578.80 Safari/536.6',
        'Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/71.0.3578.80 Safari/537.1',
    ]
    # Round the current hour down to a three-hour bucket
    a = str(datetime.now())
    hour = a.split(' ')[-1].split(':')[0]
    num = int(int(hour) / 3) * 3
    if num == 0:
        # Early-morning bucket (hours 0-2): date the window with yesterday
        date = datetime.now() - timedelta(days=1)
        news_start_time = str(date).split(' ')[0]
        yesterday = datetime.now() - timedelta(days=1)  # yesterday
        yesterday = str(yesterday).split(' ')[0]
    else:
        date = datetime.now() - timedelta(days=0)
        news_start_time = str(date).split(' ')[0]
        yesterday = datetime.now() - timedelta(days=0)  # today
        yesterday = str(yesterday).split(' ')[0]
    # Start of the window (y-m-d, farther from now)
    self.start_time = news_start_time
    # End of the window (y-m-d, closer to now)
    self.end_time = yesterday
    try:
        self.page_ip = proxies.res_ip()
        print('ip: ', self.page_ip)
        # self.page_ip = '116.248.160.138:4261'
    except Exception:
        time.sleep(3)
        print('Error while fetching a proxy IP: {}'.format(traceback.format_exc()))
        logger.error('Error while fetching a proxy IP: {}'.format(traceback.format_exc()))
        self.page_ip = proxies.res_ip()
    self.ip_count = 0
    # Comment-crawl window
    # self.comment_start_time = yesterday  # replies from the last day only
    self.comment_start_time = ''  # empty = no lower bound on reply time
    self.comment_end_time = yesterday
    self.is_get_comment = True
    self.file_name_time = self.get_file_name_time()
    self.file_path = file_path
    self.comment_apth = comment_path
    self.hdfsclient = HdfsClient(url='http://192.168.1.205:14000', user='******')
    # Name the two-hour output bucket, e.g. '08_10'
    hour = str(datetime.now()).split(' ')[-1].split(':')[0]
    if str(hour) != '00':
        two_hour_ago = int(hour) - 2
        if len(str(two_hour_ago)) == 1:
            two_hour_ago = '0' + str(two_hour_ago)
        self.hour_name = str(two_hour_ago) + '_' + str(hour)
    else:
        self.hour_name = '22_24'
    # Create the per-day/per-bucket directories
    self.hdfsclient.makedirs('{}/{}/{}'.format(
        self.file_path,
        self.file_name_time.split(' ')[0].replace('-', ''),
        self.hour_name))
    self.hdfsclient.makedirs('{}/{}/{}'.format(
        self.comment_apth,
        self.file_name_time.split(' ')[0].replace('-', ''),
        self.hour_name))
    self.time_time = str(time.time()).split('.')[0]
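# The bucket-name rule above, condensed into one function for clarity;
# `hour_bucket` is a hypothetical helper that reproduces the same strings,
# including the '22_24' midnight case:
def hour_bucket(hour):
    """'22_24' for midnight, else the two-hour window ending at `hour`."""
    if hour == 0:
        return '22_24'
    return '{:02d}_{:02d}'.format(hour - 2, hour)

# hour_bucket(10) -> '08_10'; hour_bucket(0) -> '22_24'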
def __init__(self):
    self.headers_one = {
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
        'Accept-Encoding': 'gzip, deflate, br',
        'Accept-Language': 'zh-CN,zh;q=0.9',
        'Host': 'www.baidu.com',
        # 'Proxy-Connection': 'keep-alive',
        'Upgrade-Insecure-Requests': '1',
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.110 Safari/537.36'
    }
    self.headers_two = {
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
        'Accept-Encoding': 'gzip, deflate, br',
        'Accept-Language': 'zh-CN,zh;q=0.9',
        # 'Connection': 'keep-alive',
        'Host': 'www.baidu.com',
        'Upgrade-Insecure-Requests': '1',
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.110 Safari/537.36'
    }
    self.headers_three = {
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
        'Accept-Encoding': 'gzip, deflate, br',
        'Accept-Language': 'zh-CN,zh;q=0.9',
        # 'Connection': 'keep-alive',
        'Host': 'www.xiaohongshu.com',
        'Upgrade-Insecure-Requests': '1',
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.110 Safari/537.36'
    }
    self.start_url = ''
    # Comment-endpoint template
    self.commnet_port_url = ''
    # # Open the json output files
    # self.news_jsonfile = open('./sina_newsfile.json', 'wb')
    # self.comment_jsonfile = open('./sina_commentfile.json', 'wb')
    date = datetime.now() - timedelta(days=3)
    news_start_time = str(date).split(' ')[0]
    yesterday = datetime.now() - timedelta(days=1)  # yesterday
    yesterday = str(yesterday).split(' ')[0]
    print('Crawl window: {} to {}'.format(news_start_time, yesterday))
    logging.info('Crawl window: {} to {}'.format(news_start_time, yesterday))
    # Start of the window (y-m-d, farther from now)
    self.start_time = news_start_time
    # End of the window (y-m-d, closer to now)
    self.end_time = yesterday
    # Crawler-active flag
    self.is_work = True
    # Proxy IPs
    self.proxies_list = [
        '121.231.226.210:4252',
    ]
    self.ip = proxies.res_ip()
    self.ip_count = 0
    # Dedup list for URLs
    self.set_list = []
def __init__(self):
    # The PC feed endpoint expects a signed browser-style request, e.g.:
    #   /api/pc/feed/?category=news_car&utm_source=toutiao&widen=1
    #     &max_behot_time=0&max_behot_time_tmp=0&tadrequire=true
    #     &as=A1E56B7F8CD9B35&cp=5BFC39BB43B5DE1&_signature=pMmtcAAA.0TvpJ9rFvhWIKTJrW
    # together with the usual cookies (tt_webid, csrftoken, uuid, ...).
    self.headers_one = {
        'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
        'accept-encoding': 'gzip, deflate, br',
        'accept-language': 'zh-CN,zh;q=0.9',
        'cache-control': 'max-age=0',
        'upgrade-insecure-requests': '1',
        'user-agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.110 Safari/537.36'
    }
    self.start_url = 'https://www.toutiao.com/api/pc/feed/'
    # Comment-endpoint template
    self.commnet_port_url = ''
    date = datetime.now() - timedelta(days=3)
    news_start_time = str(date).split(' ')[0]
    yesterday = datetime.now() - timedelta(days=1)  # yesterday
    yesterday = str(yesterday).split(' ')[0]
    print('Crawl window: {} to {}'.format(news_start_time, yesterday))
    logging.info('Crawl window: {} to {}'.format(news_start_time, yesterday))
    # Start of the window (y-m-d, farther from now)
    self.start_time = news_start_time
    # End of the window (y-m-d, closer to now)
    self.end_time = yesterday
    # Crawler-active flag
    self.is_work = True
    # Comment page counter
    self.comment_page_num = 1
    # Dedup list
    self.set_list = []
    # Proxy IPs
    self.proxies = [
        '112.245.235.249:4243',
        # '59.53.47.4:4249'
    ]
    # Q&A-style pages collected along the way
    self.questions_list = []
    # Load the URL list
    with open('./../toutiao/new_url_file.json', 'r') as f:
        self.url_list = f.readlines()
    # Get a proxy IP
    self.ip = proxies.res_ip()
    # IP usage counter
    self.ip_count = 0
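# A sketch of paging that feed: each response is assumed to carry a
# `next.max_behot_time` cursor that seeds the following request. The
# response shape and the omitted signing parameters (`as`, `cp`,
# `_signature`) are assumptions here, not verified against the live API:
def iter_feed(session, category='news_car', pages=5):
    behot = 0
    for _ in range(pages):
        resp = session.get('https://www.toutiao.com/api/pc/feed/',
                           params={'category': category,
                                   'max_behot_time': behot}).json()
        for item in resp.get('data') or []:
            yield item
        behot = (resp.get('next') or {}).get('max_behot_time', 0)
        if not behot:
            break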