def get_links_from_web():
    """Collect the city links listed on the sindelantal.mx landing page.

    Loads the landing page in a browser, parses the rendered HTML and reads
    every ``<li class="internal-link">`` found inside the
    ``<div class="cities-rests">`` container.

    Returns:
        A list of dicts, one per link, with the keys ``"title"`` (the anchor's
        ``title`` attribute) and ``"url"`` (the site root concatenated with the
        anchor's ``href``).

    Raises:
        AttributeError: if the ``cities-rests`` container is not present in
            the page (``soup.find`` then returns ``None``).
    """
    url = 'https://www.sindelantal.mx'
    driver = launch_driver()
    try:
        driver.get(url)
        # Selenium stores the rendered source HTML in the driver's
        # page_source attribute.
        soup = BeautifulSoup(driver.page_source, 'html.parser')
    finally:
        # The browser is only needed to render the page; always release it,
        # otherwise every call leaves a browser session behind.
        driver.quit()

    cities_rests = soup.find("div", {"class": "cities-rests"})
    items = cities_rests.find_all("li", {"class": "internal-link"})
    return [
        {
            "title": item.a['title'],
            "url": url + item.a['href'],
        }
        for item in items
    ]
def get_restaurant_info(r):
    """Scrape one restaurant page and persist its info and comments.

    Args:
        r: Mapping describing the restaurant; read with ``.get`` for the keys
            ``'url'``, ``'state'`` and ``'rid'`` and also handed to
            ``save_comments``.

    Returns:
        0 when the restaurant was processed (including the "no ratings" case),
        -1 when the page redirected or any step raised.
    """
    driver = launch_driver()
    try:
        # Request the restaurant page.
        url = r.get('url')
        logging.info(url)
        driver.get(url)

        # A changed URL means we were redirected; report failure so the
        # caller can retry later.
        if driver.current_url != url:
            logging.info('该网页进行了重定向,稍后再请求.')
            return -1

        # Parse the fresh restaurant info out of the page first.
        new_info = parse_new_info(driver)
        new_info['state'] = r.get('state')
        new_info['rid'] = r.get('rid')

        # No ratings at all: store the info and skip comment loading.
        count_rating = new_info.get('count_rating')
        if count_rating is None or int(count_rating) == 0:
            logging.info('餐馆评论为0, 可以直接跳过')
            update_restaurant(new_info)
            return 0

        # Load all comments, then parse them from the rendered page.
        load_comments(driver)
        source_page = BeautifulSoup(driver.page_source, 'html.parser')
        comments = parse_comments(source_page, new_info)
        save_comments(comments, r)
        update_restaurant(new_info)
        return 0
    except Exception:
        logging.info(traceback.format_exc())
        return -1
    finally:
        # quit() ends the whole WebDriver session; close() would only close
        # the current window.
        driver.quit()
def get_restaurants_page(local_data):
    """Search ifood.com.br by street address and store the listed restaurants.

    Fills in the "search by address" form with the values from
    ``local_data``, selects the first address result, keeps scrolling until
    the end-of-list marker shows up (or 1000 scroll attempts were made), then
    merges the scraped restaurants into the ones already stored for the state
    and saves the result.

    Args:
        local_data: Mapping read with ``.get`` for the keys ``'state'``,
            ``'city'``, ``'street'`` and ``'number'``.

    Returns:
        0 on success, -1 if any step raised (the error and traceback are
        logged).
    """
    scroll_to_bottom = "$('html,body').animate({'scrollTop':'700000000'},1000)"

    # Launch the browser.
    driver = launch_driver()
    try:
        driver.get('https://www.ifood.com.br/')

        # Switch from the postcode form to the street-address form.
        switch_link = check_element_by_xpath(
            driver, '//*[@id="buscaCepPorEndereco"]')
        wait_to_display(switch_link)
        switch_link.click()

        # Pick the state in the first <select> of the address form.
        state_element = check_element_by_xpath(
            driver, '//*[@id="box7"]/form/div[1]/div[1]/select')
        wait_to_display(state_element)
        Select(state_element).select_by_value(local_data.get('state'))

        # Choosing the state reloads the city list, so wait before selecting.
        sleep(3)
        city_element = check_element_by_xpath(
            driver, '//*[@id="box7"]/form/div[1]/div[2]/select')
        wait_to_display(city_element)
        Select(city_element).select_by_visible_text(
            local_data.get('city').upper())

        # Fill in the street and house-number inputs.
        street_input = check_element_by_xpath(
            driver, '//*[@id="box7"]/form/div[2]/input[1]')
        number_input = check_element_by_xpath(
            driver, '//*[@id="box7"]/form/div[2]/input[2]')
        wait_to_display(street_input)
        wait_to_display(number_input)
        street_input.send_keys(local_data.get('street'))
        number_input.send_keys(local_data.get('number'))

        # Click the number input twice so the autocomplete dropdown is
        # collapsed before submitting.
        number_input.click()
        sleep(2)
        number_input.click()

        # Submit the form.
        submit_button = check_element_by_xpath(
            driver, '//*[@id="pre-search-naoSeiMeuCep"]/input[1]')
        wait_to_display(submit_button)
        submit_button.click()
        log('提交登录')

        # The address search succeeded: pick the first result.
        select_result = check_element_by_xpath(
            driver,
            '//*[@id="box7"]/form/div[4]/div/table/tbody/tr[1]/td[3]/a')
        wait_to_display(select_result)
        select_result.click()

        # The restaurant list should be showing now; scroll down until
        # everything is loaded, giving up after 1000 attempts.
        log('加载餐馆列表中。。。')
        for _ in range(1000):
            driver.execute_script(scroll_to_bottom)
            # Presumably the "still loading" element; while it is found,
            # keep scrolling.
            if check_element_by_xpath(
                    driver, '//*[@id="content"]/div[2]/div[2]') != -1:
                continue
            # Otherwise check whether the bottom of the list was reached.
            if check_element_by_xpath(driver, '//*[@id="suggestLink"]') != -1:
                log('加载结束')
                break

        log('开始获取所有餐馆的url')
        r_state = local_data.get('state')
        rs_new = get_restaurants_list(driver, r_state)
        log('新获取餐馆{}家。'.format(len(rs_new)))
        rs_origin = load_restaurants(r_state)
        log('原有餐馆{}家。'.format(len(rs_origin)))
        rs_origin.update(rs_new)
        log('更新后共有餐馆{}家。'.format(len(rs_origin)))
        save_restaurants(rs_origin, r_state)
        return 0
    except Exception as e:
        log(e)
        log(traceback.format_exc())
        return -1
    finally:
        # Single cleanup point: runs on success, on failure and when the
        # initial navigation itself raises.
        driver.quit()
def get_restaurants_page(link):
    """Load a sindelantal.mx city listing and store every restaurant on it.

    If the end-of-list marker is already on the first page, the restaurants
    shown are stored directly. Otherwise further pages are requested through
    the site's ``lista-restaurantes/filtro`` endpoint with jQuery, appended
    to the page, scraped, stored and removed again, until a page yields no
    restaurants (or page 500 is reached).

    Args:
        link: Mapping read with ``.get('url')``. The second-to-last path
            segment of the URL is expected to look like ``<city>-<state>``.

    Returns:
        0 on success, -1 if access was refused or any step raised.
    """
    filter_url = 'https://www.sindelantal.mx/lista-restaurantes/filtro'
    # The endpoint URL and the POST data are passed in as script arguments
    # (arguments[0] / arguments[1]) rather than being spliced into the JS.
    # NOTE(review): jQuery's settings key for failures is `error`; the `fail`
    # key below looks like it is ignored — confirm before relying on it.
    append_page_js = """
        $.ajax({
            url: arguments[0],
            method: 'post',
            data: arguments[1],
            success: function(res) { $(".tabs.flex-tab").append(res) },
            fail: function(error) { console.log(error) },
            complete: function() { }
        })"""

    # Launch the browser.
    driver = launch_driver()
    try:
        url = link.get('url')
        logging.info('当前请求:{}'.format(url))
        driver.get(url)

        # A search message element means the site refused the request.
        forbid = check_element_by_xpath(driver, '//*[@id="searchMsg"]')
        if forbid != -1:
            logging.info(forbid.text)
            logging.info('ip被禁止访问')
            return -1

        logging.info('加载餐馆列表中。。。')
        # If the list is empty or fits on one page, no more requests needed.
        if check_element_by_xpath(driver, '//*[@id="suggestLink"]') != -1:
            logging.info('加载结束')
            rs_new = get_restaurants_list(driver)
            logging.info('加载餐馆: {}'.format(len(rs_new)))
            insert_restaurants(rs_new)
            return 0

        logging.info('加载更多...')
        # Split on the LAST hyphen so hyphenated city names stay intact.
        local_info = url.split("/")[-2]
        city_name, separator, state_name = local_info.rpartition('-')
        if not separator:
            raise ValueError(
                'cannot split city and state from {!r}'.format(local_info))

        page = 1
        while page < 500:
            page += 1
            driver.execute_script(append_page_js, filter_url, {
                'page': page,
                'city': city_name,
                'state': state_name,
                'ordenacao': 0,
            })
            # NOTE(review): the ajax call is asynchronous; this relies on
            # get_restaurants_list seeing the appended cards — confirm it
            # waits for them.
            rs_new = get_restaurants_list(driver)
            logging.info('加载餐馆: {}'.format(len(rs_new)))
            # No restaurant cards: nothing left to load.
            if len(rs_new) == 0:
                logging.info('加载结束')
                break
            insert_restaurants(rs_new)
            # Drop the cards just stored so the next page is scraped alone.
            driver.execute_script("$('.restaurant-card-link').remove()")
        logging.info('共下拉加载次数:{}'.format(page))
        return 0
    except Exception as e:
        logging.info(e)
        logging.info(traceback.format_exc())
        return -1
    finally:
        # Runs on every exit path, including the early "forbidden" return
        # that previously left the browser open.
        driver.quit()
def driver():
    """Yield a freshly launched driver and quit it once the consumer is done.

    Yields:
        The object returned by ``launch_driver()``.
    """
    browser = launch_driver()
    try:
        yield browser
    finally:
        # Also runs when an exception is thrown in at the yield or the
        # generator is closed early, so the browser is never left running.
        browser.quit()
def get_restaurant_info(r):
    """Scrape one ifood restaurant page, export its info and save comments.

    Opens the ratings tab, keeps clicking the "load more" element until it is
    gone or hidden, parses all comments from the rendered page, writes the
    restaurant info through ``r_to_csv`` and stores the comments.

    Args:
        r: Mapping describing the restaurant; read with ``.get`` for the keys
            ``'url'``, ``'state'`` and ``'id'`` and also handed to
            ``save_comments``.

    Returns:
        0 when the restaurant was processed or deliberately skipped
        (redirected page, zero ratings), -1 when the ratings tab could not be
        opened or any step raised.
    """
    scroll_to_bottom = "$('html,body').animate({'scrollTop':'700000000'},1000)"

    driver = launch_driver()
    try:
        # Request the restaurant page.
        url = r.get('url')
        log(url)
        driver.get(url)

        # A changed URL means we were redirected; skip this restaurant.
        if driver.current_url != url:
            log('该网页进行了重定向,不再获取.')
            return 0

        # Parse the restaurant info first; with zero ratings there is no
        # point in opening the ratings tab.
        sleep(2)
        new_info = parse_new_info(driver)
        if int(new_info.get('count_rating')) == 0:
            log('餐馆评论为0, 可以直接跳过')
            return 0

        # Open the ratings tab. The lookup helper signals "not found" with
        # -1, so check before using the element.
        show_rate = check_element_by_xpath(driver, '//*[@id="showRating"]/a')
        if show_rate is None or show_rate == -1:
            log('评论加载失败,跳过')
            return -1
        show_rate.click()

        # If the JS or the click failed, the parent never becomes "active",
        # i.e. the comments were not loaded.
        parent_tag = check_element_by_xpath(show_rate, '..')
        if (parent_tag is None or parent_tag == -1
                or parent_tag.get_attribute('class') != 'active'):
            log('评论加载失败,跳过')
            return -1

        # Pull in all comments.
        log('拉取评论中...')
        while True:
            driver.execute_script(scroll_to_bottom)
            ver_mais = check_element_by_xpath(
                driver, '//*[@id="ratingContent"]/div[3]/div[2]')
            # No "load more" element: done. Normally it just stops being
            # displayed once everything is loaded.
            if ver_mais == -1:
                break
            wait_to_display(ver_mais)
            if ver_mais.is_displayed() is False:
                break
            ver_mais.click()
            # Wait two seconds for the next batch to load.
            sleep(2)

        # Loading finished: parse the rendered source.
        source_page = BeautifulSoup(driver.page_source, 'html.parser')
        comments = parse_comments(source_page, new_info)

        # The new info is written out via r_to_csv rather than merged back
        # into r.
        new_info['state'] = r.get('state')
        new_info['id'] = r.get('id')
        r_to_csv(new_info)
        save_comments(comments, r)
        return 0
    except Exception:
        log(traceback.format_exc())
        return -1
    finally:
        # quit() ends the whole WebDriver session; close() would only close
        # the current window.
        driver.quit()
def get_restaurant_info(r):
    """Scrape one ifood restaurant page and persist its info and comments.

    Dismisses the ad popup and the "new site" modal when present, opens the
    ratings tab, loads every comment, then stores the restaurant info (with
    ``count_rating`` recomputed from the parsed comments) and the comments.

    Args:
        r: Mapping describing the restaurant; read with ``.get`` for the keys
            ``'url'``, ``'state'`` and ``'rid'`` and also handed to
            ``save_comments``.

    Returns:
        0 when the restaurant was processed (including the "no ratings"
        cases), -1 when the page redirected, the ratings tab could not be
        opened, or any step raised.
    """
    scroll_to_bottom = "$('html,body').animate({'scrollTop':'700000000'},1000)"

    driver = launch_driver()
    try:
        # Request the restaurant page.
        url = r.get('url')
        logging.info(url)
        driver.get(url)

        # A changed URL means we were redirected; retry later.
        if driver.current_url != url:
            logging.info('该网页进行了重定向,稍后再请求.')
            return -1

        # Best effort: dismiss the ad popup if there is one. The lookup
        # raises when the button is absent, which is the normal case, so the
        # error is deliberately ignored.
        try:
            ad_button = driver.find_element_by_class_name('ab-message-button')
            logging.info('页面弹出了广告')
            ad_button.click()
            sleep(1)
        except Exception:
            pass

        # Parse the restaurant info first; with zero ratings, store it and
        # skip the ratings tab. An unknown count falls through.
        new_info = parse_new_info(driver)
        count_rating = new_info.get('count_rating')
        if count_rating is not None and int(count_rating) == 0:
            logging.info('餐馆评论为0, 可以直接跳过')
            new_info['state'] = r.get('state')
            new_info['rid'] = r.get('rid')
            update_restaurant(new_info)
            return 0

        # 2018-12-19: ifood rolled out a new site version; choose "cancel"
        # in its modal. Other call sites treat -1 as the helper's
        # "not found" value, so accept both None and -1 here.
        cancel_modal = check_element_by_xpath(driver, '//*[@id="cancelModal"]')
        if cancel_modal is None or cancel_modal == -1:
            logging.info('新页面提醒没有弹出')
        else:
            cancel_modal.click()
            logging.info('点击下一个按钮, 稍后再看')
        # Give the page a moment to settle before opening the ratings tab.
        sleep(2)

        # Open the ratings tab.
        show_rate = check_element_by_xpath(driver, '//*[@id="showRating"]/a')
        if show_rate is None or show_rate == -1:
            logging.info('评论加载失败,跳过')
            return -1
        show_rate.click()

        # If the JS or the click failed, the parent never becomes "active",
        # i.e. the comments were not loaded.
        parent_tag = check_element_by_xpath(show_rate, '..')
        if (parent_tag is None or parent_tag == -1
                or parent_tag.get_attribute('class') != 'active'):
            logging.info('评论加载失败,跳过')
            return -1

        # Before pulling comments, check whether there are none after all.
        # NOTE(review): this lookup raises if the element is missing from
        # the DOM, which ends in the generic -1 path — confirm it is always
        # present (hidden) on pages that do have comments.
        sleep(2)
        no_comment = driver.find_element_by_class_name('no-comments')
        if no_comment.is_displayed() is True:
            logging.info('评论还是为零, 结束下拉.')
            new_info['state'] = r.get('state')
            new_info['rid'] = r.get('rid')
            new_info['count_rating'] = 0
            update_restaurant(new_info)
            return 0

        # Pull in all comments.
        logging.info('拉取评论中...')
        while True:
            driver.execute_script(scroll_to_bottom)
            ver_mais = check_element_by_xpath(
                driver, '//*[@id="ratingContent"]/div[3]/div[2]')
            # No "load more" element: done. Normally it just stops being
            # displayed once everything is loaded.
            if ver_mais == -1:
                break
            wait_to_display(ver_mais)
            if ver_mais.is_displayed() is not True:
                break
            ver_mais.click()
            sleep(2)

        # Loading finished: parse the rendered source.
        source_page = BeautifulSoup(driver.page_source, 'html.parser')
        comments = parse_comments(source_page, new_info)

        new_info['state'] = r.get('state')
        new_info['rid'] = r.get('rid')
        # Recompute the rating count from the parsed comments whose 'source'
        # is "R" (per the original note: ratings without comment text).
        new_info['count_rating'] = sum(
            1 for comment in comments.values() if comment.get('source') == "R")
        update_restaurant(new_info)
        save_comments(comments, r)
        return 0
    except Exception:
        logging.info(traceback.format_exc())
        return -1
    finally:
        # quit() ends the whole WebDriver session; close() would only close
        # the current window.
        driver.quit()
def get_restaurants_page(link):
    """Load an ifood.com.br city listing and store every restaurant on it.

    If the end-of-list marker is already on the first page, the restaurants
    shown are stored directly. Otherwise further pages are requested through
    the site's ``lista-restaurantes/filtro`` endpoint with jQuery, appended
    to the page, scraped, stored and removed again, until a page yields no
    restaurants (or page 500 is reached).

    Args:
        link: Mapping read with ``.get('url')``. The second-to-last path
            segment of the URL is split at its last hyphen into city and
            state.

    Returns:
        0 on success, -1 if the page redirected, access was refused, or any
        step raised.
    """
    filter_url = 'https://www.ifood.com.br/lista-restaurantes/filtro'
    # The endpoint URL and the POST data are passed in as script arguments
    # (arguments[0] / arguments[1]) rather than being spliced into the JS.
    # NOTE(review): jQuery's settings key for failures is `error`; the `fail`
    # key below looks like it is ignored — confirm before relying on it.
    append_page_js = """
        $.ajax({
            url: arguments[0],
            method: 'post',
            data: arguments[1],
            success: function(res) { $(".tabs.flex-tab").append(res) },
            fail: function(error) { console.log(error) },
            complete: function() { }
        })"""

    # Launch the browser.
    driver = launch_driver()
    try:
        url = link.get('url')
        logging.info('当前请求:{}'.format(url))
        driver.get(url)

        # A changed URL means we were redirected; retry later.
        if driver.current_url != url:
            logging.info('该网页进行了重定向,稍后再请求.')
            return -1

        # A search message element means the site refused the request.
        forbid = check_element_by_xpath(driver, '//*[@id="searchMsg"]')
        if forbid != -1:
            logging.info(forbid.text)
            logging.info('ip被禁止访问')
            return -1

        # Best effort: dismiss the ad popup if there is one. The lookup
        # raises when the button is absent, which is the normal case, so the
        # error is deliberately ignored.
        try:
            ad_button = driver.find_element_by_class_name('ab-message-button')
            logging.info('页面弹出了广告')
            ad_button.click()
            sleep(1)
        except Exception:
            pass

        logging.info('加载餐馆列表中。。。')
        # If the list is empty or fits on one page, no more requests needed.
        if check_element_by_xpath(driver, '//*[@id="suggestLink"]') != -1:
            logging.info('加载结束')
            rs_new = get_restaurants_list(driver)
            logging.info('加载餐馆: {}'.format(len(rs_new)))
            insert_restaurants(rs_new)
            return 0

        logging.info('加载更多...')
        # Split "<city>-<state>" at the last hyphen so hyphenated city names
        # stay intact.
        local_info = url.split("/")[-2]
        index = local_info.rfind('-')
        city_name = local_info[:index]
        state_name = local_info[index + 1:]

        # Opening the listing already requested page=1, so start from 2.
        page = 1
        while page < 500:
            page += 1
            driver.execute_script(append_page_js, filter_url, {
                'page': page,
                'city': city_name,
                'state': state_name,
                'ordenacao': 0,
            })
            # NOTE(review): the ajax call is asynchronous; this relies on
            # get_restaurants_list seeing the appended cards — confirm it
            # waits for them.
            rs_new = get_restaurants_list(driver)
            logging.info('加载餐馆: {}'.format(len(rs_new)))
            # No restaurant cards: nothing left to load.
            if len(rs_new) == 0:
                logging.info('加载结束')
                break
            insert_restaurants(rs_new)
            # Drop the cards just stored so the next page is scraped alone.
            driver.execute_script("$('.restaurant-card-link').remove()")
        logging.info('共下拉加载次数:{}'.format(page))
        return 0
    except Exception as e:
        logging.info(e)
        logging.info(traceback.format_exc())
        return -1
    finally:
        # Runs on every exit path, including the early "redirected" and
        # "forbidden" returns that previously left the browser open.
        driver.quit()