Пример #1
0
def get_links_from_web():
    """Collect the city listing links from the sindelantal.mx home page.

    Returns:
        A list of dicts, one per ``li.internal-link`` element found inside
        the ``div.cities-rests`` container. Each dict has the keys
        ``"title"`` (the anchor's ``title`` attribute) and ``"url"`` (the
        site root concatenated with the anchor's ``href``).
    """
    url = 'https://www.sindelantal.mx'
    driver = launch_driver()
    try:
        driver.get(url)
        # Selenium stores the source HTML in the driver's page_source
        # attribute; snapshot it so the browser can be released right away.
        html = driver.page_source
    finally:
        # Release the browser on every path, including navigation failures.
        driver.close()

    # Parse the page.
    soup = BeautifulSoup(html, 'html.parser')
    cities_rests = soup.find("div", {"class": "cities-rests"})
    anchors = (
        li.a for li in cities_rests.find_all("li", {"class": "internal-link"})
    )
    return [
        {
            "title": tag_a['title'],
            "url": url + tag_a['href'],
        }
        for tag_a in anchors
    ]
Пример #2
0
def get_restaurant_info(r):
    """Fetch one restaurant page and persist its details and comments.

    Args:
        r: Mapping describing the restaurant. The keys ``url``, ``state``
            and ``rid`` are read here, and ``r`` itself is forwarded to
            ``save_comments``.

    Returns:
        0 when the restaurant was processed; -1 when the browser ended up on
        a different URL than the one requested or when any exception was
        raised along the way.
    """
    browser = launch_driver()

    try:
        # Request the restaurant page.
        target = r.get('url')
        logging.info(target)
        browser.get(target)

        # Redirect check: a differing final URL is reported as a failure.
        if browser.current_url != target:
            logging.info('该网页进行了重定向,稍后再请求.')
            return -1

        # Parse the fresh restaurant details and tag them with the
        # identifying fields of the record being refreshed.
        info = parse_new_info(browser)
        info['state'] = r.get('state')
        info['rid'] = r.get('rid')

        # A missing or zero rating count means there are no comments to load.
        rating_count = info.get('count_rating')
        has_comments = rating_count is not None and int(rating_count) != 0
        if has_comments:
            # Load the comments, then parse them from the rendered page.
            load_comments(browser)
            page = BeautifulSoup(browser.page_source, 'html.parser')
            save_comments(parse_comments(page, info), r)
        else:
            logging.info('餐馆评论为0, 可以直接跳过')

        update_restaurant(info)
        return 0
    except Exception:
        logging.info(traceback.format_exc())
        return -1
    finally:
        browser.close()
0
def get_restaurants_page(local_data):
    """Search ifood.com.br by street address and store the restaurants found.

    Fills in the address search form, picks the first address result,
    scrolls the result page, then merges the restaurants returned by
    ``get_restaurants_list`` into the set stored for the state.

    Args:
        local_data: Mapping with the keys ``state`` (used as the option
            value of the state select), ``city`` (upper-cased and matched
            against the visible option text), ``street`` and ``number``.

    Returns:
        0 on success, -1 if any step raised an exception.
    """
    # Start the browser.
    driver = launch_driver()
    try:
        host_url = 'https://www.ifood.com.br/'
        # Navigation happens inside the try block so that a failed page load
        # is logged, reported as -1 and still releases the browser.
        driver.get(host_url)

        # Find the link that switches from postcode to street search.
        switch_link = check_element_by_xpath(driver,
                                             '//*[@id="buscaCepPorEndereco"]')
        wait_to_display(switch_link)
        switch_link.click()

        # Choose the state in the first select of the address form.
        state_element = check_element_by_xpath(
            driver, '//*[@id="box7"]/form/div[1]/div[1]/select')
        wait_to_display(state_element)
        Select(state_element).select_by_value(local_data.get('state'))

        # The city select is reloaded after a state is chosen, so pause
        # before looking it up.
        sleep(3)
        city_element = check_element_by_xpath(
            driver, '//*[@id="box7"]/form/div[1]/div[2]/select')
        wait_to_display(city_element)
        Select(city_element).select_by_visible_text(
            local_data.get('city').upper())

        # Fill in the two text inputs (street and number).
        street_input = check_element_by_xpath(
            driver, '//*[@id="box7"]/form/div[2]/input[1]')
        number_input = check_element_by_xpath(
            driver, '//*[@id="box7"]/form/div[2]/input[2]')

        wait_to_display(street_input)
        wait_to_display(number_input)
        street_input.send_keys(local_data.get('street'))
        number_input.send_keys(local_data.get('number'))
        # Click the number input so the autocomplete dropdown collapses.
        number_input.click()
        sleep(2)
        number_input.click()

        # Submit the form.
        submit_button = check_element_by_xpath(
            driver, '//*[@id="pre-search-naoSeiMeuCep"]/input[1]')
        wait_to_display(submit_button)
        submit_button.click()
        log('提交登录')

        # The address search succeeded: pick the first address in the table.
        select_result = check_element_by_xpath(
            driver,
            '//*[@id="box7"]/form/div[4]/div/table/tbody/tr[1]/td[3]/a')
        wait_to_display(select_result)
        select_result.click()

        # The restaurant list should be available now; scroll down to load
        # all of it, giving up after 1000 scroll attempts.
        log('加载餐馆列表中。。。')
        for _ in range(1000):
            driver.execute_script(
                "$('html,body').animate({'scrollTop':'700000000'},1000)")
            # Loading element still present: keep scrolling.
            if check_element_by_xpath(
                    driver, '//*[@id="content"]/div[2]/div[2]') != -1:
                # log('拉取餐馆列表进度', count_mark)
                continue
            # Otherwise check whether the bottom of the list was reached.
            if check_element_by_xpath(driver,
                                      '//*[@id="suggestLink"]') != -1:
                log('加载结束')
                break

        log('开始获取所有餐馆的url')
        r_state = local_data.get('state')
        rs_new = get_restaurants_list(driver, r_state)
        log('新获取餐馆{}家。'.format(len(rs_new)))
        rs_origin = load_restaurants(r_state)
        log('原有餐馆{}家。'.format(len(rs_origin)))
        rs_origin.update(rs_new)
        log('更新后共有餐馆{}家。'.format(len(rs_origin)))
        save_restaurants(rs_origin, r_state)
        # Clearing cookies is intentionally left disabled:
        # driver.delete_all_cookies()
        return 0
    except Exception as e:
        log(e)
        log(traceback.format_exc())
        return -1
    finally:
        # Single cleanup point for both the success and the failure path.
        driver.close()
Пример #4
0
def get_restaurants_page(link):
    """Load every restaurant listed on a sindelantal.mx city page.

    If the first page already shows the ``suggestLink`` element, the
    restaurants currently in the page are stored and the function finishes.
    Otherwise further pages are requested by injecting a jQuery
    ``$.ajax`` POST to ``/lista-restaurantes/filtro`` into the page, and each
    batch is stored until a request yields no restaurants (or 500 pages).

    Args:
        link: Mapping whose ``url`` key holds the city listing URL. The
            second-to-last path segment must have the form
            ``<city>-<state>``.

    Returns:
        0 on success; -1 when the ``searchMsg`` element is present (logged
        as the IP being blocked) or when any exception was raised.
    """
    # Start the browser.
    driver = launch_driver()
    try:
        url = link.get('url')
        logging.info('当前请求:{}'.format(url))
        # Navigation happens inside the try block so that a failed page load
        # is reported as -1 and still releases the browser.
        driver.get(url)

        # The page shows a "searchMsg" element when access is refused.
        forbid = check_element_by_xpath(driver, '//*[@id="searchMsg"]')
        if forbid != -1:
            logging.info(forbid.text)
            logging.info('ip被禁止访问')
            return -1

        logging.info('加载餐馆列表中。。。')

        # First decide whether more pages are needed: when the list is empty
        # or fits on one page, no further requests are made.
        if check_element_by_xpath(driver, '//*[@id="suggestLink"]') != -1:
            logging.info('加载结束')
            rs_new = get_restaurants_list(driver)
            logging.info('加载餐馆: {}'.format(len(rs_new)))
            insert_restaurants(rs_new)
            return 0

        # Instead of scrolling, send the paging request from the page itself.
        # The injected script has this shape:
        #
        #   $.ajax({
        #       url: 'https://www.sindelantal.mx/lista-restaurantes/filtro',
        #       method: 'post',
        #       data: {page: 12, city: 'CUAJIMALPA', state: 'DF',
        #              ordenacao: 0},
        #       success: function(res) { ... },
        #       fail: function(error) { ... },
        #       complete: function() { }
        #   })
        #
        # NOTE(review): jQuery's documented failure setting is `error`, not
        # `fail`, so the failure callback is presumably never invoked.
        logging.info('加载更多...')
        page = 1
        local_info = url.split("/")[-2]
        # Split on the LAST hyphen so hyphenated city names stay intact
        # ("san-luis-potosi-slp" -> "san-luis-potosi", "slp"). A segment
        # without a hyphen raises and is reported as -1 below.
        city_name, state_name = local_info.rsplit('-', 1)
        # Loop-invariant tail of the script: appends the response markup to
        # the list container.
        filtro_script_end = """
                    ,success: function(res) {
                        $(".tabs.flex-tab").append(res)
                    },
                    fail: function(error) {
                        console.log(error)
                    },
                    complete: function() {
                    }
                    })"""
        while page < 500:
            page += 1
            post_data = {
                'url':
                'https://www.sindelantal.mx/lista-restaurantes/filtro',
                'method': 'post',
                'data': {
                    'page': page,
                    'city': city_name,
                    'state': state_name,
                    'ordenacao': 0
                }
            }
            # The dict's repr doubles as a JS object literal; its closing
            # brace is dropped so the callbacks can be appended to it.
            filtro_script_fore = '$.ajax({}'.format(post_data)[:-1]
            filtro_script = filtro_script_fore + filtro_script_end
            driver.execute_script(filtro_script)
            # NOTE(review): the list is read right after the script is
            # injected, with no wait for the response - confirm that
            # get_restaurants_list waits for the request to complete.
            # An empty result means there are no restaurants or loading
            # has finished.
            rs_new = get_restaurants_list(driver)
            logging.info('加载餐馆: {}'.format(len(rs_new)))
            if len(rs_new) == 0:
                logging.info('加载结束')
                break
            insert_restaurants(rs_new)
            # Clear the stored cards so the next batch is counted alone.
            driver.execute_script(
                "$('.restaurant-card-link').remove()")

        logging.info('共下拉加载次数:{}'.format(page))
        return 0
    except Exception as e:
        logging.info(e)
        logging.info(traceback.format_exc())
        return -1
    finally:
        # Single cleanup point: also covers the early returns above.
        driver.close()
Пример #5
0
def driver():
    """Yield a browser created by ``launch_driver`` and quit it afterwards.

    The browser is quit when the generator is resumed after the yield.
    """
    browser = launch_driver()
    yield browser
    browser.quit()
Пример #6
0
def get_restaurant_info(r):
    """Scrape one restaurant page, export its details and save its comments.

    Args:
        r: Mapping describing the restaurant. The keys ``url``, ``state``
            and ``id`` are read here, and ``r`` itself is forwarded to
            ``save_comments``.

    Returns:
        0 when the page was processed, when the browser was redirected to a
        different URL, or when the restaurant has zero ratings; -1 when the
        rating tab did not become active or any exception was raised.
    """
    browser = launch_driver()

    try:
        # Request the restaurant page.
        page_url = r.get('url')
        log(page_url)
        browser.get(page_url)

        # Redirect check: a redirected page is skipped for good.
        if browser.current_url != page_url:
            log('该网页进行了重定向,不再获取.')
            return 0

        # Parse the restaurant details first; with zero ratings there is
        # nothing to click through, so stop here.
        sleep(2)
        info = parse_new_info(browser)
        if int(info.get('count_rating')) == 0:
            log('餐馆评论为0, 可以直接跳过')
            return 0

        # Click the link that opens the ratings.
        rating_link = check_element_by_xpath(browser,
                                             '//*[@id="showRating"]/a')
        rating_link.click()

        # If the script or the click failed, the parent element is not
        # marked "active", meaning the comments did not load: skip.
        rating_tab = check_element_by_xpath(rating_link, '..')
        if rating_tab.get_attribute('class') != 'active':
            log('评论加载失败,跳过')
            return -1

        # Pull the comments: scroll, then click the "load more" element
        # until it is missing or no longer displayed.
        log('拉取评论中...')
        scroll_js = "$('html,body').animate({'scrollTop':'700000000'},1000)"
        more_xpath = '//*[@id="ratingContent"]/div[3]/div[2]'
        while True:
            browser.execute_script(scroll_js)
            more_button = check_element_by_xpath(browser, more_xpath)
            if more_button == -1:
                break
            wait_to_display(more_button)
            if more_button.is_displayed() is False:
                break
            more_button.click()
            # Give the page two seconds to load the next batch.
            sleep(2)

        # Loading finished: parse the comments out of the rendered source.
        soup = BeautifulSoup(browser.page_source, 'html.parser')
        comments = parse_comments(soup, info)

        # The fresh details are written to CSV instead of being merged back
        # into ``r``.
        info['state'] = r.get('state')
        info['id'] = r.get('id')
        r_to_csv(info)
        save_comments(comments, r)
        return 0
    except Exception:
        log(traceback.format_exc())
        return -1
    finally:
        browser.close()
Пример #7
0
def get_restaurant_info(r):
    """Scrape one ifood restaurant page and persist details and comments.

    Args:
        r: Mapping describing the restaurant. The keys ``url``, ``state``
            and ``rid`` are read here, and ``r`` itself is forwarded to
            ``save_comments``.

    Returns:
        0 when the restaurant was processed (including the cases where it
        turned out to have no comments); -1 when the browser was redirected
        to a different URL, the rating tab did not become active, or any
        exception was raised.
    """
    driver = launch_driver()

    try:
        # Request the restaurant page.
        url = r.get('url')
        logging.info(url)
        driver.get(url)

        # Redirect check: a differing final URL is reported as a failure.
        if driver.current_url != url:
            logging.info('该网页进行了重定向,稍后再请求.')
            return -1

        # Detect the ad popup and dismiss it. Best effort: the lookup raising
        # (e.g. because there is no popup) is deliberately ignored.
        try:
            ad_button = driver.find_element_by_class_name('ab-message-button')
            if ad_button != -1:
                logging.info('页面弹出了广告')
                ad_button.click()
                sleep(1)
        except Exception:
            pass

        # Parse the restaurant details first. With a rating count of zero
        # the comments are not opened at all; a missing count falls through
        # to the regular flow.
        new_info = parse_new_info(driver)
        count_rating = new_info.get('count_rating')
        if count_rating is not None and int(count_rating) == 0:
            logging.info('餐馆评论为0, 可以直接跳过')
            new_info['state'] = r.get('state')
            new_info['rid'] = r.get('rid')
            update_restaurant(new_info)
            return 0

        # 2018-12-19: ifood rolled out a new site version; choose "cancel"
        # in its dialog. Every other call site in this function treats -1 as
        # the helper's "not found" result, so accept it here as well as None;
        # testing for None alone would let a -1 result reach .click().
        cancelModal_tag = check_element_by_xpath(driver,
                                                 '//*[@id="cancelModal"]')
        if cancelModal_tag is None or cancelModal_tag == -1:
            logging.info('新页面提醒没有弹出')
        else:
            cancelModal_tag.click()
            logging.info('点击下一个按钮, 稍后再看')
            sleep(2)

        # Click the link that opens the ratings.
        show_rate = check_element_by_xpath(driver, '//*[@id="showRating"]/a')
        show_rate.click()

        # If the script or the click failed, the parent element is not
        # marked "active", meaning the comments did not load: skip.
        parent_tag = check_element_by_xpath(show_rate, '..')
        if parent_tag.get_attribute('class') != 'active':
            logging.info('评论加载失败,跳过')
            return -1

        # Before pulling comments, check whether the page says there are none.
        # NOTE(review): this lookup raises if the element is absent from the
        # DOM, which ends in the generic -1 path - confirm it always exists.
        sleep(2)
        no_comment = driver.find_element_by_class_name('no-comments')
        if no_comment.is_displayed() is True:
            logging.info('评论还是为零, 结束下拉.')
            new_info['state'] = r.get('state')
            new_info['rid'] = r.get('rid')
            new_info['count_rating'] = 0
            update_restaurant(new_info)
            return 0

        # Pull the comments: scroll, then click the "load more" element
        # until it is missing or no longer displayed.
        logging.info('拉取评论中...')
        while True:
            driver.execute_script(
                "$('html,body').animate({'scrollTop':'700000000'},1000)")
            ver_mais = check_element_by_xpath(
                driver, '//*[@id="ratingContent"]/div[3]/div[2]')
            if ver_mais == -1:
                break
            wait_to_display(ver_mais)
            if ver_mais.is_displayed() is True:
                ver_mais.click()
                sleep(2)
            else:
                break

        # Loading finished: parse the comments out of the rendered source.
        source_page = BeautifulSoup(driver.page_source, 'html.parser')
        comments = parse_comments(source_page, new_info)

        # Store the refreshed details and the comments.
        new_info['state'] = r.get('state')
        new_info['rid'] = r.get('rid')
        # Recount the ratings from the parsed comments: only entries whose
        # "source" is "R" are counted.
        new_info['count_rating'] = sum(
            1 for comment in comments.values()
            if comment.get('source') == "R")
        update_restaurant(new_info)
        save_comments(comments, r)
        return 0
    except Exception:
        logging.info(traceback.format_exc())
        return -1
    finally:
        driver.close()
Пример #8
0
def get_restaurants_page(link):
    """Load every restaurant listed on an ifood.com.br city page.

    If the first page already shows the ``suggestLink`` element, the
    restaurants currently in the page are stored and the function finishes.
    Otherwise further pages are requested by injecting a jQuery
    ``$.ajax`` POST to ``/lista-restaurantes/filtro`` into the page, and each
    batch is stored until a request yields no restaurants (or 500 pages).

    Args:
        link: Mapping whose ``url`` key holds the city listing URL. The
            second-to-last path segment is expected to have the form
            ``<city>-<state>``.

    Returns:
        0 on success; -1 when the browser was redirected to a different URL,
        when the ``searchMsg`` element is present (logged as the IP being
        blocked), or when any exception was raised.
    """
    # Start the browser.
    driver = launch_driver()
    try:
        url = link.get('url')
        logging.info('当前请求:{}'.format(url))
        # Navigation happens inside the try block so that a failed page load
        # is reported as -1 and still releases the browser.
        driver.get(url)

        # Redirect check: a differing final URL is reported as a failure.
        if driver.current_url != url:
            logging.info('该网页进行了重定向,稍后再请求.')
            return -1

        # The page shows a "searchMsg" element when access is refused.
        forbid = check_element_by_xpath(driver, '//*[@id="searchMsg"]')
        if forbid != -1:
            logging.info(forbid.text)
            logging.info('ip被禁止访问')
            return -1

        # Detect the ad popup and dismiss it. Best effort: the lookup raising
        # (e.g. because there is no popup) is deliberately ignored.
        try:
            ad_button = driver.find_element_by_class_name('ab-message-button')
            if ad_button != -1:
                logging.info('页面弹出了广告')
                ad_button.click()
                sleep(1)
        except Exception:
            pass

        logging.info('加载餐馆列表中。。。')
        # First decide whether more pages are needed: when the list is empty
        # or fits on one page, no further requests are made.
        if check_element_by_xpath(driver, '//*[@id="suggestLink"]') != -1:
            logging.info('加载结束')
            rs_new = get_restaurants_list(driver)
            logging.info('加载餐馆: {}'.format(len(rs_new)))
            insert_restaurants(rs_new)
            return 0

        # Instead of scrolling, send the paging request from the page itself.
        # The injected script has this shape:
        #
        #   $.ajax({
        #       url: 'https://www.ifood.com.br/lista-restaurantes/filtro',
        #       method: 'post',
        #       data: {page: 12, city: 'SAO-PAULO', state: 'SP',
        #              ordenacao: 0},
        #       success: function(res) { ... },
        #       fail: function(error) { ... },
        #       complete: function() { }
        #   })
        #
        # NOTE(review): jQuery's documented failure setting is `error`, not
        # `fail`, so the failure callback is presumably never invoked.
        logging.info('加载更多...')
        page = 1
        local_info = url.split("/")[-2]
        # Split on the LAST hyphen so hyphenated city names stay intact
        # ("sao-paulo-sp" -> "sao-paulo", "sp").
        city_name, _, state_name = local_info.rpartition('-')
        # Loop-invariant tail of the script: appends the response markup to
        # the list container.
        filtro_script_end = """
                    ,success: function(res) {
                        $(".tabs.flex-tab").append(res)
                    },
                    fail: function(error) {
                        console.log(error)
                    },
                    complete: function() {
                    }
                    })"""
        while page < 500:
            # Opening the listing page already requests page 1, so the
            # explicit requests start at page 2.
            page += 1
            post_data = {
                'url':
                'https://www.ifood.com.br/lista-restaurantes/filtro',
                'method': 'post',
                'data': {
                    'page': page,
                    'city': city_name,
                    'state': state_name,
                    'ordenacao': 0
                }
            }
            # The dict's repr doubles as a JS object literal; its closing
            # brace is dropped so the callbacks can be appended to it.
            filtro_script_fore = '$.ajax({}'.format(post_data)[:-1]
            filtro_script = filtro_script_fore + filtro_script_end
            driver.execute_script(filtro_script)
            # NOTE(review): the list is read right after the script is
            # injected, with no wait for the response - confirm that
            # get_restaurants_list waits for the request to complete.
            # An empty result means there are no restaurants or loading
            # has finished.
            rs_new = get_restaurants_list(driver)
            logging.info('加载餐馆: {}'.format(len(rs_new)))
            if len(rs_new) == 0:
                logging.info('加载结束')
                break
            insert_restaurants(rs_new)
            # Clear the stored cards so the next batch is counted alone.
            driver.execute_script(
                "$('.restaurant-card-link').remove()")

        logging.info('共下拉加载次数:{}'.format(page))
        return 0
    except Exception as e:
        logging.info(e)
        logging.info(traceback.format_exc())
        return -1
    finally:
        # Single cleanup point: also covers the early returns above.
        driver.close()