def callback(ch, method, properties, body):
    """Consume a shop-detail message: fetch the shop page, archive the raw
    HTML once per URL, parse it, and update the shop document in MongoDB.

    On any failure the original message is re-published to ``all_url_queue``
    so the URL is retried later. The message is always acked at the end.
    """
    # The proxy identifier travels in the consumer tag.
    ip = method.consumer_tag
    body = json.loads(body.decode())
    city = body['city'][0]
    url = body['url']
    shop_id = body['shop_id']
    kind_code = body['kind_code']
    info = body['info']
    response = request_get(url, ip, connection)
    try:
        if response == 'un_url':
            # Sentinel from request_get: no shop exists behind this URL.
            log.info('此url没有商店,url={}'.format(url))
            ch.basic_ack(delivery_tag=method.delivery_tag)
            return
        html = response.text
        # 查询原网页保存了没 — archive the raw page only once per URL.
        is_exist = coll_html.find_one({'url': url})
        if not is_exist:
            data_html = {
                'html': html,
                'url': url,
            }
            coll_html.insert_one(data_html)
        data = anlayzer_mongo(html, shop_id, city, kind_code, info)
        if data:
            coll_update.update_one({'shop_id': shop_id}, {'$set': data})
    except Exception as e:
        # Fix: the error was silently discarded before requeueing; record it
        # so repeated failures are diagnosable.
        log.info('url={} error={}'.format(url, e))
        # Keep the heartbeat alive before republishing on this connection.
        connection.process_data_events()
        channel.basic_publish(
            exchange='',
            routing_key=all_url_queue,
            body=json.dumps(body),
        )
    ch.basic_ack(delivery_tag=method.delivery_tag)
def callback(ch, method, properties, body):
    """Consume a cuisine-category message: fetch the first listing page,
    queue its HTML, then queue URLs for every remaining result page.

    On failure the original message is re-published (persistent) to
    ``cooking_queue``. The message is always acked at the end.
    """
    ip = method.consumer_tag  # proxy identifier travels in the consumer tag
    body = json.loads(body.decode())
    city_name = body['city_name']
    cooking_url = body['cooking_url']
    region_name = body['region_name']
    street_name = body['street_name']
    kind_code = body['kind_code']
    pinyin = body['pinyin']
    logo = cooking_url.split('/')[-1]
    url = 'http://www.dianping.com/' + pinyin + '/' + kind_code + '/' + logo
    html = request_get(url, ip, connection)
    try:
        # Fix: decode the response body once instead of three times.
        page_html = html.content.decode()
        tree = etree.HTML(page_html)
        page_list = tree.xpath('//a[@class="PageLink"]')
        data1 = {
            'html': page_html,
            'kind_code': kind_code,
            'city': [city_name, region_name, street_name],
        }
        if not page_list:
            # No pagination links: this category has a single page.
            print('只有一页,url={}'.format(url))
            html_put_in_queue(data1)
            ch.basic_ack(delivery_tag=method.delivery_tag)
            return
        print('放入第一页')
        html_put_in_queue(data1)
        # Pages 2..last get queued as URLs for the list-page consumer.
        for i in range(2, int(page_list[-1].text) + 1):
            not_first_url = ('http://www.dianping.com/' + pinyin + '/' +
                             kind_code + '/' + logo + 'p' + str(i))
            data2 = {
                'url': not_first_url,
                'kind_code': kind_code,
                'city': [city_name, region_name, street_name],
            }
            print(data2)
            url_put_in_queue(data2)
    except Exception as e:
        # Fix: surface the error instead of silently requeueing.
        print(e)
        channel.basic_publish(
            exchange='',
            routing_key=cooking_queue,
            body=json.dumps(body),
            properties=pika.BasicProperties(
                delivery_mode=2,  # make message persistent
            ))
    ch.basic_ack(delivery_tag=method.delivery_tag)
def start_detail():
    """Sweep shop ids sequentially and fetch each detail page via the proxy.

    Pages whose URL has no shop behind it (``request_get`` returns the
    ``'un_url'`` sentinel) are skipped; any other failure is printed and the
    sweep continues with the next id.
    """
    # Fix: the proxy URL is loop-invariant — build it once, not once per
    # iteration of a billion-step loop.
    ip = "http://%(user)s:%(pass)s@%(host)s:%(port)s" % {
        "host": "http-pro.abuyun.com",
        "port": "9010",
        "user": "******",  # NOTE(review): credentials are masked placeholders
        "pass": "******",
    }
    for i in range(1000000000):
        url = 'http://www.dianping.com/shop/' + str(i)
        try:
            response = request_get(url, ip)
            if response == 'un_url':
                continue
            html = response.text
            print(html)
        except Exception as e:
            print(e)
def callback(ch, method, properties, body):
    """Consume a list-page-URL message: fetch the page and queue its HTML
    for the parser.

    On failure the original message is re-published to ``list_queue``.
    The message is always acked at the end.
    """
    ip = method.consumer_tag  # proxy identifier travels in the consumer tag
    body = json.loads(body.decode())
    url = body['url']
    city = body['city']
    kind_code = body['kind_code']
    response = request_get(url, ip, connection)
    try:
        if response == 'un_url':
            # Fix: request_get can return the 'un_url' sentinel (see the
            # shop-detail consumer). Previously this raised AttributeError on
            # response.text and the dead URL was requeued forever.
            ch.basic_ack(delivery_tag=method.delivery_tag)
            return
        data1 = {'html': response.text, 'kind_code': kind_code, 'city': city}
        print('放入队列,URL={}'.format(url))
        html_put_in_queue(data1)
        response.close()
    except Exception as e:
        # Fix: surface the error instead of silently requeueing.
        print(e)
        channel.basic_publish(
            exchange='',
            routing_key=list_queue,
            body=json.dumps(body),
        )
    ch.basic_ack(delivery_tag=method.delivery_tag)
def callback(ch, method, properties, body):
    """Consume a region message: fetch the region listing page and either
    queue its result pages, or — when the listing hits dianping's 50-page
    cap — fan out one message per street to ``street_queue``.

    On failure the original message is re-published to ``region_queue``.
    The message is always acked at the end.
    """
    ip = method.consumer_tag  # proxy identifier travels in the consumer tag
    body = json.loads(body.decode())
    city_name = body['city_name']
    region_url = body['region_url']
    region_name = body['region_name']
    pinyin = body['pinyin']
    kind_code = body['kind_code']
    logo = region_url.split('/')[-1]
    url = 'http://www.dianping.com/' + pinyin + '/' + kind_code + '/' + logo
    response = request_get(url, ip, connection)
    try:
        # Fix: decode the response body once instead of three times.
        page_html = response.content.decode()
        tree = etree.HTML(page_html)
        # 判断是否小于50页
        # 抓取所有的街道的url和名字
        # The 'ch90' category uses a different page template / selectors.
        if kind_code == 'ch90':
            page_list = tree.xpath('//a[@class="pageLink"]')
            street_url_list = tree.xpath(
                '//div[@id="J_shopsearch"]/div[2]/div/ul/li/a[@class="D"]')
        else:
            page_list = tree.xpath('//a[@class="PageLink"]')
            street_url_list = tree.xpath(
                '//*[@id="region-nav-sub"]/a[@data-cat-id]')
        if not page_list:
            # 放入队列 — single page, nothing to paginate.
            data1 = {
                'html': page_html,
                'city': [city_name, region_name],
                'kind_code': kind_code,
            }
            html_put_in_queue(data1)
            print('只有一页')
            ch.basic_ack(delivery_tag=method.delivery_tag)
            return
        if page_list[-1].text == '50':
            # Capped at 50 pages: drill down one level to streets.
            # Fix: declare the (idempotent) queue once, not per street.
            channel.queue_declare(queue=street_queue)
            for street_obj in street_url_list:
                if kind_code == 'ch90':
                    street_url = 'http:' + street_obj.attrib['href']
                    street_name = street_obj.xpath('text()')[0]
                else:
                    street_url = street_obj.attrib['href']
                    street_name = street_obj.xpath('span')[0].text
                # Skip the "no limit" / "more" pseudo-entries.
                if street_name == '不限' or street_name == '更多':
                    continue
                data = {
                    'city_name': city_name,
                    'region_name': region_name,
                    'street_name': street_name,
                    'street_url': street_url,
                    'pinyin': pinyin,
                    'kind_code': kind_code,
                }
                print(data)
                channel.basic_publish(
                    exchange='',
                    routing_key=street_queue,
                    body=json.dumps(data),
                    properties=pika.BasicProperties(
                        delivery_mode=2,  # make message persistent
                    ))
        else:
            # Under the cap: queue page 1's HTML and URLs for pages 2..last.
            data1 = {
                'html': page_html,
                'city': [city_name, region_name],
                'kind_code': kind_code,
            }
            html_put_in_queue(data1)
            for i in range(2, int(page_list[-1].text) + 1):
                not_first_url = ('http://www.dianping.com/' + pinyin + '/' +
                                 kind_code + '/' + logo + 'p' + str(i))
                data2 = {
                    'url': not_first_url,
                    'city': [city_name, region_name],
                    'kind_code': kind_code,
                }
                print(data2)
                url_put_in_queue(data2)
    except Exception as e:
        # Fix: surface the error instead of silently requeueing.
        print(e)
        channel.basic_publish(exchange='',
                              routing_key=region_queue,
                              body=json.dumps(body))
    ch.basic_ack(delivery_tag=method.delivery_tag)
def callback(ch, method, properties, body):
    """Consume a city message: fetch the city-wide listing page for one
    category and either queue its result pages, or — when the listing hits
    the 50-page cap — fan out one message per district to ``region_queue``.

    On failure the original message is re-published (persistent) to
    ``city_queue``. The message is always acked at the end.
    """
    ip = method.consumer_tag  # proxy identifier travels in the consumer tag
    body = json.loads(body.decode())
    city_name = body['city_name']
    pinyin = body['pinyin']
    kind_code = body['kind_code']
    url = 'http://www.dianping.com/' + pinyin + '/' + kind_code
    response = request_get(url, ip, connection)
    try:
        tree = etree.HTML(response.text)
        # 抓取所有的行政区的url和名字
        # The 'ch90' category uses a different page template / selectors.
        if kind_code == 'ch90':
            page_list = tree.xpath('//a[@class="pageLink"]')
            region_url_list = tree.xpath('//a[@data-click-bid="b_4wybqh04"]')
        else:
            page_list = tree.xpath('//a[@class="PageLink"]')
            region_url_list = tree.xpath(
                '//*[@id="region-nav"]/a[@data-cat-id]')
        if not page_list:
            # 放入队列 — single page, nothing to paginate.
            data1 = {
                'html': response.text,
                'city': [city_name],
                'kind_code': kind_code,
            }
            html_put_in_queue(data1)
            print('只有一页')
            ch.basic_ack(delivery_tag=method.delivery_tag)
            return
        # 判断是否小于50页
        if page_list[-1].text == '50':
            # Capped at 50 pages: drill down one level to districts.
            # Fix: declare the (idempotent) queue once, not per district.
            channel.queue_declare(queue=region_queue)
            for region_obj in region_url_list:
                if kind_code == 'ch90':
                    region_name = region_obj.xpath('text()')[0]
                    region_url = 'http:' + region_obj.attrib['href']
                else:
                    region_name = region_obj.xpath('span')[0].text
                    region_url = region_obj.attrib['href']
                data = {
                    'city_name': city_name,
                    'region_url': region_url,
                    'region_name': region_name,
                    'pinyin': pinyin,
                    'kind_code': kind_code
                }
                print(data)
                channel.basic_publish(exchange='',
                                      routing_key=region_queue,
                                      body=json.dumps(data),
                                      properties=pika.BasicProperties(
                                          delivery_mode=2,
                                      ))
        else:
            # Under the cap: queue page 1's HTML and URLs for pages 2..last.
            data1 = {
                'html': response.content.decode(),
                'city': [city_name],
                'kind_code': kind_code,
            }
            html_put_in_queue(data1)
            print('放入一个html页面')
            for i in range(2, int(page_list[-1].text) + 1):
                not_first_url = ('http://www.dianping.com/' + pinyin + '/' +
                                 kind_code + '/p' + str(i))
                data2 = {
                    'url': not_first_url,
                    'city': [city_name],
                    'kind_code': kind_code,
                }
                url_put_in_queue(data2)
                print('放入第%s个url' % (i - 1))
    except Exception as e:
        # Fix: surface the error instead of silently requeueing.
        print(e)
        channel.basic_publish(exchange='',
                              routing_key=city_queue,
                              body=json.dumps(body),
                              properties=pika.BasicProperties(
                                  delivery_mode=2,
                              ))
    ch.basic_ack(delivery_tag=method.delivery_tag)
# # 收集热门商圈 # hot_list = tree.xpath('//div[@id="bussi-nav"]/a') # hot_dict = {} # for hot in hot_dict: # hot_url = hot.xpath('@href')[0] # hot_code = hot_url.split('/')[-1] # hot_name = hot.xpath('span/text()')[0] # hot_dict[hot_code] = hot_name # 收集区域字典 region_list = tree.xpath('//*[@id="region-nav"]/a') region_dict = {} for region in region_list: region_url = region.xpath('@href')[0] # 区域 region_name = region.xpath('span/text()')[0] response = request_get(region_url, ip) if not response: print(region_url, '-' * 100) continue html_2 = response.text tree_2 = etree.HTML(html_2) try: street_list = tree_2.xpath('//div[@id="region-nav-sub"]/a/span/text()')[1:] if not street_list: data = { 'city': city, 'region': region_name, 'street': None } print(data) coll.insert_one(data)
def callback(ch, method, properties, body):
    """Consume a street message: fetch the street listing page and either
    queue its result pages, or — when the listing hits the 50-page cap —
    fan out one message per cuisine category to ``cooking_queue``.

    On failure the original message is re-published (persistent) to
    ``street_queue``. The message is always acked at the end.
    """
    ip = method.consumer_tag  # proxy identifier travels in the consumer tag
    body = json.loads(body.decode())
    city_name = body['city_name']
    street_url = body['street_url']
    street_name = body['street_name']
    region_name = body['region_name']
    pinyin = body['pinyin']
    kind_code = body['kind_code']
    logo = street_url.split('/')[-1]
    html = request_get(street_url, ip, connection)
    try:
        tree = etree.HTML(html.content.decode())
        # 判断是否小于50页
        page_list = tree.xpath('//a[@class="PageLink"]')
        # 抓取所有的街道的url和名字
        cooking_url_list = tree.xpath('//*[@id="classfy"]/a[@data-cat-id]')
        if not page_list:
            # 放入队列 — single page, nothing to paginate.
            data1 = {
                'html': html.text,
                'city': [city_name, region_name, street_name],
                'kind_code': kind_code,
            }
            print('只有一页,url={}'.format(street_url))
            html_put_in_queue(data1)
            ch.basic_ack(delivery_tag=method.delivery_tag)
            return
        if page_list[-1].text == '50':
            # Capped at 50 pages: drill down one level to cuisines.
            # Fix: declare the (idempotent) queue once, not per cuisine.
            channel.queue_declare(queue=cooking_queue)
            for cooking_obj in cooking_url_list:
                cooking_url = cooking_obj.attrib['href']
                data = {
                    'city_name': city_name,
                    'cooking_url': cooking_url,
                    'region_name': region_name,
                    'street_name': street_name,
                    'kind_code': kind_code,
                    'pinyin': pinyin
                }
                print(data)
                channel.basic_publish(
                    exchange='',
                    routing_key=cooking_queue,
                    body=json.dumps(data),
                    properties=pika.BasicProperties(
                        delivery_mode=2,  # make message persistent
                    ))
        else:
            # Under the cap: queue page 1's HTML and URLs for pages 2..last.
            data1 = {
                'html': html.text,
                'city': [city_name, region_name, street_name],
                'kind_code': kind_code,
            }
            html_put_in_queue(data1)
            for i in range(2, int(page_list[-1].text) + 1):
                not_first_url = ('http://www.dianping.com/' + pinyin + '/' +
                                 kind_code + '/' + logo + 'p' + str(i))
                data2 = {
                    'url': not_first_url,
                    'city': [city_name, region_name, street_name],
                    'kind_code': kind_code,
                }
                print(data2)
                url_put_in_queue(data2)
    except Exception as e:
        # Fix: surface the error instead of silently requeueing.
        print(e)
        channel.basic_publish(
            exchange='',
            routing_key=street_queue,
            body=json.dumps(body),
            properties=pika.BasicProperties(
                delivery_mode=2,  # make message persistent
            ))
    ch.basic_ack(delivery_tag=method.delivery_tag)