Python Crawl.crawl_by_get 예제들

프로그래밍 언어: Python

네임스페이스/패키지 이름: crawl

클래스/타입: Crawl

메소드/함수: crawl_by_get

hotexamples.com에서의 예제들: 10

Python Crawl.crawl_by_get - 10개의 예제가 발견되었습니다. 이것들은 오픈소스 프로젝트에서 추출된 Python의 crawl.Crawl.crawl_by_get에 대한 실세계 최고 등급의 예제들입니다. 예제들을 평가하여 예제의 품질 향상에 도움을 줄 수 있습니다.

자주 사용되는 메소드들

보기 숨기기

Crawl(30)

crawl_by_get(10)

crawl(5)

__init__(3)

get_incresed_fans(2)

get_fans_str(2)

get_cities_url(1)

get_name(1)

get_hotel_list(1)

get_history(1)

get_evaluation(1)

get(1)

get_article(1)

fetch(1)

crawl_followers(1)

crawl_by_post(1)

append_path(1)

append_extension(1)

all_come_to_bowl(1)

reverse_crawl(1)

예제 #1

파일 보기

파일: engine.py 프로젝트: myirelias/TransferStation

class Engine:
    def __init__(self):
        self.crawl = Crawl()
        self.analysis = Analysis()
        self.pipe = Pipeline()

    def _engine_city_link(self):
        """
        获取所有城市的名称和url链接，结果输出到file_city_list.txt文本中
        :return:
        """
        content = self.crawl.crawl_by_get(setting.START_URL,
                                          headers=setting.HEADERS,
                                          proxies=self._engine_use_proxy())
        element_city = self.analysis.analysis_by_xpath(content,
                                                       setting.XPATH_CITY_A)
        city_list = []
        for each_element in element_city:
            city_name = self.analysis.analysis_by_xpath(
                each_element, setting.XPATH_CITY_NAME)
            city_url = self.analysis.analysis_by_xpath(each_element,
                                                       setting.XPATH_CITY_URL)
            city_list.append('{}\u0001{}'.format(''.join(city_name),
                                                 ''.join(city_url)))
        self.pipe.pipe_txt_save(city_list, filename=setting.FILE_CITY_LIST)

    def _engine_amuse_link(self):
        """
        获取每个城市中所有的娱乐场所的链接
        :return:
        """
        city_list = self.pipe.pipe_txt_load(filename=setting.FILE_CITY_LIST)
        for each_city in city_list:
            try:
                url = each_city.strip().split('\u0001')[1] + '-wanle'
                name = each_city.strip().split('\u0001')[0]
                params_city = {'page': 0}
                maxpage = 200  # 默认最大页数
                while True:
                    save_list = []
                    params_city['page'] += 1
                    content = self.crawl.crawl_by_get(
                        url,
                        headers=setting.HEADERS,
                        params=params_city,
                        proxies=self._engine_use_proxy(),
                        retry=2,
                        timeout=15)
                    if not content:
                        break
                    # 获取总页数
                    if params_city['page'] == 1:
                        # 找到最大页数,使用map函数
                        pagecount = map(
                            lambda x: int(x) if x != '下一页' else -1,
                            self.analysis.analysis_by_xpath(
                                content, xpahter=setting.XPATH_NEXTPAGE))
                        try:
                            maxpage = max(pagecount)
                        except:
                            break
                    element_li = self.analysis.analysis_by_xpath(
                        content, xpahter=setting.XPATH_LI)
                    if not element_li:
                        break

                    for each_ele in element_li:
                        amuse_name = self.analysis.analysis_by_xpath(
                            each_ele, xpahter=setting.XPATH_AMUSE_NAME)
                        amuse_type = self.analysis.analysis_by_xpath(
                            each_ele, xpahter=setting.XPATH_AMUSE_TYPE)
                        amuse_url = self.analysis.analysis_by_xpath(
                            each_ele, xpahter=setting.XPATH_AMUSE_URL)
                        try:
                            save_info = '{}\u0001{}\u0001{}\u0001{}'.format(
                                name, ''.join(amuse_name), ''.join(amuse_type),
                                ''.join(amuse_url))
                        except:
                            continue
                        save_list.append(save_info)
                    self.pipe.pipe_txt_save(save_list,
                                            filename=setting.FILE_AMUSE_LIST,
                                            savetype='a')
                    if params_city['page'] >= maxpage:
                        break
                    time.sleep(0.2)
            except:
                continue

    def _engine_amuse_info(self):
        """
        获取所有娱乐场所详细数据
        :return:
        """
        amuse_list = self.pipe.pipe_txt_load(filename=setting.FILE_AMUSE_LIST)
        for each_amuse in amuse_list:
            try:
                # 娱乐场所数据
                amuse_info = each_amuse.strip().split('\u0001')
                city_name = amuse_info[0]
                amuse_name = amuse_info[1]
                amuse_type = amuse_info[2]
                amuse_url = amuse_info[3]
                find_id = re.search(re.compile(r'p-oi(\d+)-'), amuse_url)
                if find_id:
                    amuse_id = find_id.group(1)
                else:
                    amuse_id = 0
                # 获取娱乐场所详细信息
                content = self.crawl.crawl_by_get(
                    amuse_url,
                    headers=setting.HEADERS,
                    proxies=self._engine_use_proxy(),
                    retry=5,
                    timeout=10)
                detail = self.analysis.analysis_by_xpath(
                    content, xpahter=setting.XPATH_AMUSE_DETAIL)
                detail['city_name'] = city_name
                detail['amuse_name'] = amuse_name
                detail['amuse_type'] = amuse_type
                detail['amuse_url'] = amuse_url
                detail['amuse_id'] = amuse_id
                detail['get_time'] = datetime.datetime.now().strftime(
                    '%Y-%m-%d %H:%M:%S')
                # 存储数据
                # 字段顺序
                # city_name, amuse_name, amuse_type, amuse_id,
                # score, ranking, describe, address, tel, open_time, arrive, intro, web, get_time, amuse_url
                save_data = '{0[city_name]}\u0001{0[amuse_name]}\u0001{0[amuse_type]}\u0001' \
                            '{0[amuse_id]}\u0001{0[score]}\u0001{0[ranking]}\u0001' \
                            '{0[describe]}\u0001{0[address]}\u0001{0[tel]}\u0001' \
                            '{0[open_time]}\u0001{0[arrive]}\u0001{0[intro]}\u0001' \
                            '{0[web]}\u0001{0[get_time]}\u0001{0[amuse_url]}\u0001'.format(detail)
                self.pipe.pipe_txt_save(save_data,
                                        filename=setting.FILE_AMUSE_INFO,
                                        savetype='a')
                # self.pipe.pipe_mongo_save(detail, dbname='db_qunaer', colname='col_shop_info')
                time.sleep(0.1)
            except Exception as e:
                print('crawl error', e)
                continue

    def _engine_amuse_comments(self):
        """
        获取所有购物店评论数据
        :return:
        """
        amuse_list = self.pipe.pipe_txt_load(filename=setting.FILE_AMUSE_LIST)
        # 每个店铺最新评论时间表
        check_dict = self.pipe.pipe_pickle_load(
            filename=setting.FILE_COMMENTS_CHECK)
        if not check_dict:
            check_dict = {}
        for each_amuse in amuse_list:
            try:
                # 店铺数据
                city = each_amuse.strip().split('\u0001')[0]
                amuse = each_amuse.strip().split('\u0001')[1]
                type = each_amuse.strip().split('\u0001')[2]
                amuse_url = each_amuse.strip().split('\u0001')[3]
                find_id = re.search(re.compile(r'p-oi(\d+)-'), amuse_url)
                if not find_id:
                    break
                amuse_id = find_id.group(1)
                api = setting.COMMENTS_API.format(amuse_id)
                setting.HEADERS_COMMENTS['Referer'] = amuse_url
                params = {
                    'page': 0,
                    'pageSize': '10',
                    'poiList': 'true',
                    'rank': 0,  # 全部评论
                    'sortField': 0  # 按照时间排序
                }
                comments_time = set([])
                current_time = check_dict.get(amuse_id, '0')
                max_page = 1
                while True:
                    params['page'] += 1
                    content = self.crawl.crawl_by_get(
                        api,
                        headers=setting.HEADERS_COMMENTS,
                        proxies=self._engine_use_proxy(),
                        params=params,
                        retry=2,
                        timeout=15)
                    try:
                        content_dict = json.loads(content)
                    except:
                        break
                    if not content_dict.get('data'):
                        break
                    content_comments = content_dict.get('data')
                    # 第一遍抓取要确定评论页数
                    if params['page'] == 1:
                        page = self.analysis.analysis_by_xpath(
                            content_comments,
                            xpahter=setting.XPATH_COMMENTS_PAGE)
                        if page:
                            max_page = int(''.join(page))
                    elements_com = self.analysis.analysis_by_xpath(
                        content_comments, xpahter=setting.XPATH_COMMENTS_LI)
                    if not elements_com:
                        break
                    for each_element in elements_com:
                        title = self.analysis.analysis_by_xpath(
                            each_element, xpahter=setting.XPATH_COMMENTS_TITLE)
                        start = self.analysis.analysis_by_xpath(
                            each_element, xpahter=setting.XPATH_COMMENTS_START)
                        nick = self.analysis.analysis_by_xpath(
                            each_element, xpahter=setting.XPATH_COMMENTS_NICK)
                        more = self.analysis.analysis_by_xpath(
                            each_element, xpahter=setting.XPATH_COMMENTS_MORE)
                        if more:
                            content_more = self.crawl.crawl_by_get(
                                more[0],
                                headers=setting.HEADERS,
                                proxies=self._engine_use_proxy())
                            content = self.analysis.analysis_by_xpath(
                                content_more,
                                xpahter=setting.XPATH_COMMENTS_DETAIL)
                        else:
                            content = self.analysis.analysis_by_xpath(
                                each_element,
                                xpahter=setting.XPATH_COMMENTS_CONTENT)
                        date = self.analysis.analysis_by_xpath(
                            each_element, xpahter=setting.XPATH_COMMENTS_DATE)
                        deal_content = ''.join(
                            list(
                                map(
                                    lambda x: x.replace('\n', '').replace(
                                        '\r', '').replace('\t', '').replace(
                                            ' ', ''), content)))
                        if ''.join(date) > current_time:
                            commetents_info = {
                                'city':
                                city,
                                'amuse':
                                amuse,
                                'amuse_id':
                                amuse_id,
                                'type':
                                type,
                                'title':
                                ''.join(title),
                                'nick':
                                ''.join(nick),
                                'start':
                                ''.join(start),
                                'content':
                                deal_content,
                                'date':
                                ''.join(date),
                                'get_time':
                                datetime.datetime.now().strftime(
                                    '%Y-%m-%d %H:%M:%S'),
                                'url':
                                amuse_url
                            }
                            for eachkey in commetents_info.keys():
                                commetents_info[eachkey] = commetents_info[
                                    eachkey].replace('\n',
                                                     '').replace('\r', '')
                            # 存储数据
                            # 字段顺序
                            # city, amuse, amuse_id, type, title, nick, start, content, date, get_time, url
                            save_data = '{0[city]}\u0001{0[amuse]}\u0001{0[amuse_id]}\u0001' \
                                        '{0[type]}\u0001{0[title]}\u0001{0[nick]}\u0001' \
                                        '{0[start]}\u0001{0[content]}\u0001{0[date]}\u0001' \
                                        '{0[get_time]}\u0001{0[url]}'.format(commetents_info)
                            self.pipe.pipe_txt_save(
                                save_data,
                                filename=setting.FILE_AMUSE_COMMENTS,
                                savetype='a')
                            # self.pipe.pipe_mongo_save(commetents_info, dbname='db_qunaer', colname='col_shopping_comments')
                            comments_time.add(''.join(date))
                    # 超过评论最大页数则切换
                    if params['page'] >= max_page:
                        break
                    # 当前页面没有新增评论也切换至下一店铺
                    if not len(comments_time):
                        break
                # 每个店铺最新的评论时间
                if comments_time:
                    check_dict[amuse_id] = max(comments_time)
                # 抓取到的评论数据
                self.pipe.pipe_pickle_save(
                    check_dict, filename=setting.FILE_COMMENTS_CHECK)
            except:
                continue

    def _temp_city_info(self, cityname):
        """
        做22项数据处理时临时用
        :return:
        """
        citylist = self.pipe.pipe_txt_load(filename='city_list_total.txt')
        city_params = {
            '国别': '&',
            '省自治区全称': '&',
            '省自治区简称': '&',
            '市州全称': '&',
            '市州简称': '&',
            '区县全称': '&',
            '区县简称': '&',
            '地区编码': '&',
            '等级': '&'
        }
        spec_city = {
            '北京': '110000',
            '天津': '120000',
            '上海': '310000',
            '重庆': '500000'
        }
        for each in citylist:
            cityinfo = each.split('\u0001')
            if cityname in cityinfo:
                site = cityinfo.index(cityname)
                if site == 4 or site == 5:
                    city_params['国别'] = 'CN'
                    city_params['省自治区全称'] = cityinfo[0].strip()
                    city_params['省自治区简称'] = cityinfo[1].strip()
                    city_params['市州全称'] = cityinfo[2].strip()
                    city_params['市州简称'] = cityinfo[3].strip()
                    city_params['区县全称'] = cityinfo[4].strip()
                    city_params['区县简称'] = cityinfo[5].strip()
                    city_params['地区编码'] = cityinfo[-1].strip()
                    city_params['等级'] = '区县级'
                elif site == 2 or site == 3:
                    city_params['国别'] = 'CN'
                    city_params['省自治区全称'] = cityinfo[0].strip()
                    city_params['省自治区简称'] = cityinfo[1].strip()
                    city_params['市州全称'] = cityinfo[2].strip()
                    city_params['市州简称'] = cityinfo[3].strip()
                    city_params['地区编码'] = cityinfo[-1].strip()[:-2] + '00'
                    city_params['等级'] = '地市级'
                elif cityname in ['北京', '重庆', '上海', '天津']:
                    city_params['国别'] = 'CN'
                    city_params['省自治区全称'] = cityname + '市'
                    city_params['省自治区简称'] = cityname
                    city_params['市州全称'] = cityname + '市'
                    city_params['市州简称'] = cityname
                    city_params['地区编码'] = spec_city[cityname]
                    city_params['等级'] = '直辖'
                break

        return city_params

    @staticmethod
    def _engine_use_proxy():
        """
        使用代理ip
        :return: 代理ip
        """
        proxy_host = "proxy.abuyun.com"
        proxy_port = "9010"
        proxy_user = "******"
        proxy_pass = "******"
        proxy_meta = "http://%(user)s:%(pass)s@%(host)s:%(port)s" % {
            "host": proxy_host,
            "port": proxy_port,
            "user": proxy_user,
            "pass": proxy_pass
        }
        proxies = {"http": proxy_meta, "https": proxy_meta}

        return proxies

    def start_engine(self):
        # self._engine_city_link()
        # self._engine_amuse_link()
        # 店铺信息和店铺评论可以同时抓取的，用多进程实现，后期可根据需求添加该功能，目前未开发循环抓取功能
        # self._engine_amuse_info()
        self._engine_amuse_comments()

예제 #2

파일 보기

파일: engine.py 프로젝트: myirelias/TransferStation

class Engine:
    def __init__(self):
        self.crawl = Crawl()
        self.analysis = Analysis()
        self.pipe = Pipeline()

    def _engine_city_link(self):
        """
        获取所有城市的名称和url链接，结果输出到file_city_list.txt文本中
        :return:
        """
        content = self.crawl.crawl_by_get(setting.START_URL,
                                          headers=setting.HEADERS,
                                          proxies=self._engine_use_proxy())
        element_city = self.analysis.analysis_by_xpath(content,
                                                       setting.XPATH_CITY_A)
        city_list = []
        for each_element in element_city:
            city_name = self.analysis.analysis_by_xpath(
                each_element, setting.XPATH_CITY_NAME)
            city_url = self.analysis.analysis_by_xpath(each_element,
                                                       setting.XPATH_CITY_URL)
            city_list.append('{}\u0001{}'.format(''.join(city_name),
                                                 ''.join(city_url)))
        self.pipe.pipe_txt_save(city_list, filename=setting.FILE_CITY_LIST)

    def _engine_tactic_link(self):
        """
        获取每个城市中所有的攻略的链接
        :return:
        """
        city_list = self.pipe.pipe_txt_load(filename=setting.FILE_CITY_LIST)
        tactic_check = self.pipe.pipe_pickle_load(
            filename=setting.FILE_TACTIC_CHECK)
        if not tactic_check:
            tactic_check = set([])
        for each_city in city_list:
            """
            http://travel.qunar.com/travelbook/list/22-城市拼音-城市id/
            hot(hot为热门游记，elite为精华游记，start为行程计划)_ctime(ctime为按最新发表排序，heat为热度排序)/页码.htm 
            """
            try:
                url = each_city.strip().split('\u0001')[1]
                name = each_city.strip().split('\u0001')[0]
                pattern = re.compile(r'p-cs(\d+)-(\w+)')
                city_pname = re.search(pattern, url).group(2)
                city_id = re.search(pattern, url).group(1)
                # 拼接攻略所在url(1.城市拼音名称:city_pname, 2.城市id:city_id, 3.分类)
                tactic_type = ['hot', 'elite', 'start']  # 攻略分类，目前脚本先抓取hot类
                tactic_url = setting.TACTIC_URL.format(city_pname, city_id,
                                                       tactic_type[0])
                current_page = 0
                maxpage = 200  # 默认最大页数
                while True:
                    save_list = []
                    current_page += 1
                    content = self.crawl.crawl_by_get(
                        tactic_url + '{}.htm'.format(current_page),
                        headers=setting.HEADERS,
                        retry=2,
                        timeout=15,
                        proxies=self._engine_use_proxy())
                    if not content:
                        break
                    # 获取总页数
                    if current_page == 1:
                        # 找到最大页数,使用map函数
                        pagecount = map(
                            lambda x: int(x) if x != '下一页>' else -1,
                            self.analysis.analysis_by_xpath(
                                content, xpahter=setting.XPATH_NEXTPAGE))
                        try:
                            maxpage = max(pagecount)
                        except:
                            break
                    tactic_ids = self.analysis.analysis_by_xpath(
                        content, xpahter=setting.XPATH_ID)
                    for each_id in tactic_ids:
                        each_url = 'http://travel.qunar.com/youji/{}'.format(
                            each_id)
                        save_info = '{}\u0001{}\u0001{}\u0001{}\u0001{}'.format(
                            name, city_pname, city_id, each_id, each_url)
                        if each_id not in tactic_check:
                            save_list.append(save_info)
                            tactic_check.add(each_id)
                    if save_list:
                        self.pipe.pipe_txt_save(
                            save_list,
                            filename=setting.FILE_TACTIC_LIST,
                            savetype='a')
                    if current_page >= maxpage:
                        break
                    time.sleep(0.2)
            except:
                continue

    def _engine_tactic_info(self):
        """
        获取所有攻略详细数据
        :return:
        """
        tactic_list = self.pipe.pipe_txt_load(
            filename=setting.FILE_TACTIC_LIST)
        for each_tactic in tactic_list:
            try:
                # 攻略数据
                tactic_info = each_tactic.strip().split('\u0001')
                city_name = tactic_info[0]
                city_pname = tactic_info[1]
                city_id = tactic_info[2]
                tactic_id = tactic_info[3]
                tactic_url = tactic_info[4]
                # 获取娱乐场所详细信息
                content = self.crawl.crawl_by_get(
                    tactic_url,
                    headers=setting.HEADERS,
                    proxies=self._engine_use_proxy(),
                    retry=3,
                    timeout=15)
                detail = self.analysis.analysis_by_xpath(
                    content, xpahter=setting.XPATH_TACTIC_DETAIL)
                detail['city_name'] = city_name
                detail['city_pname'] = city_pname
                detail['city_id'] = city_id
                detail['tactic_id'] = tactic_id
                detail['tactic_url'] = tactic_url
                detail['get_time'] = datetime.datetime.now().strftime(
                    '%Y-%m-%d %H:%M:%S')
                # 存储数据
                # 字段顺序
                # city_name, city_pname, city_id,
                # tactic_id,title,author,
                # create_date,start_date,days,
                # avgs_price,person,play_type,
                # content,get_time, tactic_url
                save_data = '{0[city_name]}\u0001{0[city_pname]}\u0001{0[city_id]}\u0001' \
                            '{0[tactic_id]}\u0001{0[title]}\u0001{0[author]}\u0001' \
                            '{0[create_date]}\u0001{0[start_date]}\u0001{0[days]}\u0001' \
                            '{0[avgs_price]}\u0001{0[person]}\u0001{0[play_type]}\u0001' \
                            '{0[content]}\u0001{0[get_time]}\u0001{0[tactic_url]}\u0001'.format(detail)
                self.pipe.pipe_txt_save(save_data,
                                        filename=setting.FILE_TACTIC_INFO,
                                        savetype='a')
                # self.pipe.pipe_mongo_save(detail, dbname='db_qunaer', colname='col_shop_info')
                time.sleep(0.1)
            except Exception as e:
                print('crawl error', e)
                continue

    def _engine_tactic_comments(self):
        """
        获取所有攻略评论数据
        :return:
        """
        tactic_list = self.pipe.pipe_txt_load(
            filename=setting.FILE_TACTIC_LIST)
        # 每个店铺最新评论时间表

        for each_tactic in tactic_list:
            try:
                # 店铺数据
                each_info = each_tactic.strip().split('\u0001')
                city_name = each_info[0]
                city_pname = each_info[1]
                city_id = each_info[2]
                tactic_id = each_info[3]
                tactic_url = each_info[4]
                setting.HEADERS_COMMENTS['Referer'] = tactic_url
                params = {
                    'bookId': tactic_id,  # 攻略id
                    'csrfToken':
                    'o7mGNaK63wbEaYFJTnDue14WX7sPlyXB',  # 暂时固定token
                    'page': 0,  # 页码
                    'pageSize': 30,  # 每页数量
                }
                while True:
                    params['page'] += 1
                    content = self.crawl.crawl_by_get(
                        setting.COMMENTS_API,
                        headers=setting.HEADERS_COMMENTS,
                        proxies=self._engine_use_proxy(),
                        params=params,
                        retry=2,
                        timeout=15)
                    try:
                        content_dict = json.loads(content)
                    except:
                        break
                    if not content_dict.get('data', {}).get('html'):
                        break
                    content_comments = content_dict.get('data', {}).get('html')
                    # 第一遍抓取要确定评论页数
                    elements_com = self.analysis.analysis_by_xpath(
                        content_comments, xpahter=setting.XPATH_COMMENTS_LI)
                    if not elements_com:
                        break
                    for each_element in elements_com:
                        ask_content = self.analysis.analysis_by_xpath(
                            each_element,
                            xpahter=setting.XPATH_COMMENTS_ASK_CONTENT)
                        answer_content = self.analysis.analysis_by_xpath(
                            each_element,
                            xpahter=setting.XPATH_COMMENTS_ANSWER_CONTENT)
                        ask_date = self.analysis.analysis_by_xpath(
                            each_element,
                            xpahter=setting.XPATH_COMMENTS_ASK_DATE)
                        answer_date = self.analysis.analysis_by_xpath(
                            each_element,
                            xpahter=setting.XPATH_COMMENTS_ANSWER_DATE)

                        commetents_info = {
                            'city_name':
                            city_name,
                            'city_id':
                            city_id,
                            'tactic_id':
                            tactic_id,
                            'ask_content':
                            ask_content,
                            'answer_content':
                            answer_content,
                            'ask_date':
                            ask_date,
                            'answer_date':
                            answer_date,
                            'get_time':
                            datetime.datetime.now().strftime(
                                '%Y-%m-%d %H:%M:%S'),
                            'tactic_url':
                            tactic_url
                        }
                        for eachkey in commetents_info.keys():

                            if isinstance(commetents_info[eachkey], str):
                                commetents_info[eachkey] = commetents_info[eachkey]\
                                    .replace('\n', '').replace('\r', '').replace('\xa0', '')
                            elif isinstance(commetents_info[eachkey], list):
                                commetents_info[eachkey] = ''.join(commetents_info[eachkey])\
                                    .replace('\n', '').replace('\r', '')
                            # 存储数据
                            # 字段顺序
                            # city_name, city_id, tactic_id,
                            # ask_content, answer_content, ask_date,
                            # answer_date, get_time, tactic_url,
                        save_data = '{0[city_name]}\u0001{0[city_id]}\u0001{0[tactic_id]}\u0001' \
                                    '{0[ask_content]}\u0001{0[answer_content]}\u0001{0[ask_date]}\u0001' \
                                    '{0[answer_date]}\u0001{0[get_time]}\u0001' \
                                    '{0[tactic_url]}\u0001'.format(commetents_info)
                        self.pipe.pipe_txt_save(
                            save_data,
                            filename=setting.FILE_TACTIC_COMMENTS,
                            savetype='a')
                        # self.pipe.pipe_mongo_save(commetents_info, dbname='db_qunaer', colname='col_shopping_comments')
            except:
                continue

    def _temp_city_info(self, cityname):
        """
        做22项数据处理时临时用
        :return:
        """
        citylist = self.pipe.pipe_txt_load(filename='city_list_total.txt')
        city_params = {
            '国别': '&',
            '省自治区全称': '&',
            '省自治区简称': '&',
            '市州全称': '&',
            '市州简称': '&',
            '区县全称': '&',
            '区县简称': '&',
            '地区编码': '&',
            '等级': '&'
        }
        spec_city = {
            '北京': '110000',
            '天津': '120000',
            '上海': '310000',
            '重庆': '500000'
        }
        for each in citylist:
            cityinfo = each.split('\u0001')
            if cityname in cityinfo:
                site = cityinfo.index(cityname)
                if site == 4 or site == 5:
                    city_params['国别'] = 'CN'
                    city_params['省自治区全称'] = cityinfo[0].strip()
                    city_params['省自治区简称'] = cityinfo[1].strip()
                    city_params['市州全称'] = cityinfo[2].strip()
                    city_params['市州简称'] = cityinfo[3].strip()
                    city_params['区县全称'] = cityinfo[4].strip()
                    city_params['区县简称'] = cityinfo[5].strip()
                    city_params['地区编码'] = cityinfo[-1].strip()
                    city_params['等级'] = '区县级'
                elif site == 2 or site == 3:
                    city_params['国别'] = 'CN'
                    city_params['省自治区全称'] = cityinfo[0].strip()
                    city_params['省自治区简称'] = cityinfo[1].strip()
                    city_params['市州全称'] = cityinfo[2].strip()
                    city_params['市州简称'] = cityinfo[3].strip()
                    city_params['地区编码'] = cityinfo[-1].strip()[:-2] + '00'
                    city_params['等级'] = '地市级'
                elif cityname in ['北京', '重庆', '上海', '天津']:
                    city_params['国别'] = 'CN'
                    city_params['省自治区全称'] = cityname + '市'
                    city_params['省自治区简称'] = cityname
                    city_params['市州全称'] = cityname + '市'
                    city_params['市州简称'] = cityname
                    city_params['地区编码'] = spec_city[cityname]
                    city_params['等级'] = '直辖'
                break

        return city_params

    @staticmethod
    def _engine_use_proxy():
        """
        使用代理ip
        :return: 代理ip
        """
        proxy_host = "proxy.abuyun.com"
        proxy_port = "9010"
        proxy_user = "******"
        proxy_pass = "******"
        proxy_meta = "http://%(user)s:%(pass)s@%(host)s:%(port)s" % {
            "host": proxy_host,
            "port": proxy_port,
            "user": proxy_user,
            "pass": proxy_pass
        }
        proxies = {"http": proxy_meta, "https": proxy_meta}

        return proxies

    def start_engine(self):
        self._engine_city_link()
        # 本版块循环策略为循环抓取攻略，然后评论每次抓取一次攻略列表之后，抓取一遍所有攻略所有评论，并入存入新的文本
        self._engine_tactic_link()
        self._engine_tactic_info()
        self._engine_tactic_comments()

예제 #3

파일 보기

파일: engine.py 프로젝트: myirelias/TransferStation

class Engine:
    def __init__(self):
        self.crawl = Crawl()
        self.pipe = Pipeline()
        self.analysis = Analysis()

    # def _engine_residential_area_by_json(self):
    #     """
    #     获取小区数据,output为json，
    #     但是高德那边返回的json数据小区更位置对应不上，只能使用xml数据，故不用该模块，使用xml
    #     """
    #     citys = self.pipe.pipe_txt_load(filename=setting.FILE_CITY_ID)
    #     types = self.pipe.pipe_txt_load(filename=setting.FILE_TYPE_ID)
    #     current_params = deepcopy(setting.PARAMS)
    #     current_params['key'] = setting.KEY
    #     # 每种类型
    #     for each_type in types:
    #         typeinfo = each_type.strip().split('\u0001')
    #         type_id = typeinfo[0]  # 类型id
    #         type_large = typeinfo[1]  # 类型大分类
    #         type_middle = typeinfo[2]  # 类型中分类
    #         type_small = typeinfo[3]  # 类型小分类
    #         current_params['types'] = type_id
    #         save_filename = '{}_{}_{}_{}.txt'.format(type_id, type_large, type_middle, type_small)
    #         # 每个城市
    #         for each_city in citys:
    #             cityinfo = each_city.strip().split('\u0001')
    #             province = cityinfo[0]  # 省名
    #             city_name = cityinfo[1]  # 城市名
    #             city_id = cityinfo[2]  # 城市id
    #             current_params['city'] = city_id
    #             current_params['page'] = 0
    #             save_data = []
    #             while True:
    #                 current_params['page'] += 1
    #                 content_json = self.crawl.crawl_by_get(setting.SEARCH_API, params=current_params,
    #                                                        retry=2, timeout=30)
    #                 try:
    #                     data_json = json.loads(content_json)
    #                 except:
    #                     continue
    #                 pois_list = data_json.get('pois')
    #                 if not pois_list:
    #                     break
    #                 for each_poi in pois_list:
    #                     """
    #                     字段说明：
    #                     id: 唯一ID, name: 名称, pcode: poi所在省份编码,  pname: poi所在省份名称,citycode: 城市编码,
    #                     cityname: 城市名,adcode: 区域编码, adname: 区域名称,address: 地址,  alias: 别名,
    #                     biz_ext: 深度信息, biz_type: 行业类型, business_area: 所在商圈, discount_num: 优惠信息数目,
    #                     distance: 离中心点距离(此结果仅在周边搜索的时候有值), email: 该POI的电子邮箱, entr_location: 入口经纬度,
    #                     exit_location: 出口经纬度, gridcode: 地理格ID, groupbuy_num: 团购数据, indoor_data: 室内地图相关数据,
    #                     indoor_map: 是否有室内地图标志, location: 经纬度, navi_poiid: 地图编号, photos: 照片相关信息,
    #                     postcode: 邮编, tag: 该POI的特色内容, tel: 该POI的电话, type: 兴趣点类型, typecode: 兴趣点类型编码,
    #                     website: 该POI的网址
    #                     """
    #                     save_dict = {}
    #                     save_dict['id'] = each_poi.get('id', '')  # id: 唯一ID
    #                     save_dict['name'] = each_poi.get('name', '')  # name: 名称
    #                     save_dict['pcode'] = each_poi.get('pcode', '')  # pcode: poi所在省份编码
    #                     save_dict['pname'] = each_poi.get('pname', '')  # pname: poi所在省份名称
    #                     save_dict['citycode'] = each_poi.get('citycode', '')  # citycode: 城市编码
    #                     save_dict['cityname'] = each_poi.get('cityname', '')  # cityname: 城市名
    #                     save_dict['adcode'] = each_poi.get('adcode', '')  # adcode: 区域编码
    #                     save_dict['adname'] = each_poi.get('adname', '')  # adname: 区域名称
    #                     save_dict['address'] = each_poi.get('address', '')  # address: 地址
    #                     save_dict['alias'] = each_poi.get('alias', '')  # alias: 别名
    #                     save_dict['biz_ext'] = each_poi.get('biz_ext', '')  # biz_ext: 深度信息
    #                     save_dict['biz_type'] = each_poi.get('biz_type', '')  # biz_type: 行业类型
    #                     save_dict['business_area'] = each_poi.get('business_area', '')  # business_area: 所在商圈
    #                     save_dict['discount_num'] = each_poi.get('discount_num', '')  # discount_num: 优惠信息数目
    #                     save_dict['email'] = each_poi.get('email', '')  # email: 该POI的电子邮箱
    #                     save_dict['entr_location'] = each_poi.get('entr_location', '')  # entr_location: 入口经纬度
    #                     save_dict['exit_location'] = each_poi.get('exit_location', '')  # exit_location: 出口经纬度
    #                     save_dict['gridcode'] = each_poi.get('gridcode', '')  # gridcode: 地理格ID
    #                     save_dict['groupbuy_num'] = each_poi.get('groupbuy_num', '')  # groupbuy_num: 团购数据
    #                     save_dict['indoor_data'] = each_poi.get('indoor_data', '')  # indoor_data: 室内地图相关数据
    #                     save_dict['indoor_map'] = each_poi.get('indoor_map', '')  # indoor_map: 是否有室内地图标志
    #                     save_dict['location'] = each_poi.get('location', '')  # location: 经纬度
    #                     save_dict['navi_poiid'] = each_poi.get('navi_poiid', '')  # navi_poiid: 地图编号
    #                     photos = each_poi.get('photos', [])  # photos: 照片相关信息
    #                     save_dict['photo_info'] = ''
    #                     for each_photo in photos:
    #                         if isinstance(each_photo.get('title', {}), dict):
    #                             each_photo['title'] = 'notitle'
    #                         save_dict['photo_info'] += '{0[title]}-{0[url]},'.format(each_photo)
    #                     save_dict['postcode'] = each_poi.get('postcode', '')  # postcode: 邮编
    #                     save_dict['tag'] = each_poi.get('tag', '')  # tag: 该POI的特色内容
    #                     save_dict['tel'] = each_poi.get('tel', '')  # tel: 该POI的电话
    #                     save_dict['type'] = each_poi.get('type', '')  # type: 兴趣点类型
    #                     save_dict['typecode'] = each_poi.get('typecode', '')  # typecode: 兴趣点类型编码
    #                     save_dict['website'] = each_poi.get('website', '')  # website: 该POI的网址
    #                     for each_key in save_dict.keys():
    #                         save_dict[each_key] = \
    #                             save_dict[each_key] if not isinstance(save_dict[each_key], dict) else ''
    #                     # 存储字段类型
    #                     # id, name, pcode, pname, citycode, cityname, adcode, adname,
    #                     # address, alias, biz_type, business_area, discount_num, email,
    #                     # entr_location, exit_location, gridcode, groupbuy_num, indoor_data,
    #                     # indoor_map, location, navi_poiid, photo_info, postcode, tag, tel, type, typecode, website,
    #                     save_info = '{0[id]}\u0001{0[name]}\u0001{0[pcode]}\u0001{0[pname]}\u0001' \
    #                                 '{0[citycode]}\u0001{0[cityname]}\u0001{0[adcode]}\u0001{0[adname]}\u0001' \
    #                                 '{0[address]}\u0001{0[alias]}\u0001{0[biz_type]}\u0001{0[business_area]}\u0001' \
    #                                 '{0[discount_num]}\u0001{0[email]}\u0001{0[entr_location]}\u0001' \
    #                                 '{0[exit_location]}\u0001' \
    #                                 '{0[gridcode]}\u0001{0[groupbuy_num]}\u0001{0[indoor_data]}\u0001' \
    #                                 '{0[indoor_map]}\u0001' \
    #                                 '{0[location]}\u0001{0[navi_poiid]}\u0001{0[photo_info]}\u0001{0[postcode]}\u0001' \
    #                                 '{0[tag]}\u0001{0[tel]}\u0001{0[type]}\u0001{0[typecode]}\u0001' \
    #                                 '{0[website]}'.format(save_dict)
    #                     save_data.append(save_info)
    #                     time.sleep(0.1)
    #             self.pipe.pipe_txt_save(save_data, filename=save_filename, savetype='a')

    def _engine_residential_area(self):
        """获取小区数据"""
        citys = self.pipe.pipe_txt_load(filename=setting.FILE_CITY_ID)
        types = self.pipe.pipe_txt_load(filename=setting.FILE_TYPE_ID)
        current_params = deepcopy(setting.PARAMS)
        current_params['key'] = setting.KEY
        # 每种类型
        for each_type in types:
            typeinfo = each_type.strip().split('\u0001')
            type_id = typeinfo[0]  # 类型id
            type_large = typeinfo[1]  # 类型大分类
            type_middle = typeinfo[2]  # 类型中分类
            type_small = typeinfo[3]  # 类型小分类
            current_params['types'] = type_id
            save_filename = '{}_{}_{}_{}.txt'.format(type_id, type_large, type_middle, type_small)
            # 每个城市
            for each_city in citys:
                cityinfo = each_city.strip().split('\u0001')
                province = cityinfo[0]  # 省名
                city_name = cityinfo[1]  # 城市名
                city_id = cityinfo[2]  # 城市id
                current_params['city'] = city_id
                current_params['page'] = 0
                save_data = []
                while True:
                    current_params['page'] += 1
                    content = self.crawl.crawl_by_get(setting.SEARCH_API, params=current_params,
                                                      retry=2, timeout=30)
                    try:
                        con = re.search(re.compile(r'<response>(.*?)</response>', re.S), content).group(1)
                        pois_list = self.analysis.analysis_by_xpath(con, xpahter=setting.XPATH_POIS)
                    except:
                        continue
                    if not pois_list:
                        break
                    for each_poi in pois_list:
                        """
                        字段说明：
                        id: 唯一ID, name: 名称, pcode: poi所在省份编码,  pname: poi所在省份名称,citycode: 城市编码, 
                        cityname: 城市名,adcode: 区域编码, adname: 区域名称,address: 地址,  alias: 别名, 
                        biz_ext: 深度信息, biz_type: 行业类型, business_area: 所在商圈, discount_num: 优惠信息数目,
                        distance: 离中心点距离(此结果仅在周边搜索的时候有值), email: 该POI的电子邮箱, entr_location: 入口经纬度,
                        exit_location: 出口经纬度, gridcode: 地理格ID, groupbuy_num: 团购数据, indoor_data: 室内地图相关数据, 
                        indoor_map: 是否有室内地图标志, location: 经纬度, navi_poiid: 地图编号, photos: 照片相关信息, 
                        postcode: 邮编, tag: 该POI的特色内容, tel: 该POI的电话, type: 兴趣点类型, typecode: 兴趣点类型编码, 
                        website: 该POI的网址
                        """
                        save_dict = self.analysis.analysis_by_xpath(each_poi, xpahter=setting.XPATH_DETAIL)
                        photos = self.analysis.analysis_by_xpath(each_poi, xpahter=setting.XPATH_PHOTOS)
                        photo_info = ''
                        for each_photo in photos:
                            photo_dict = self.analysis.analysis_by_xpath(each_photo, xpahter=setting.XPATH_PHOTO_DETAIL)
                            photo_dict['title'] = photo_dict['title'] if photo_dict['title'] else 'no_title'
                            photo_info += '{0[title]}-{0[url]},'.format(photo_dict)
                        save_dict['photo_info'] = photo_info
                        # 存储字段类型
                        # id, name, pcode, pname, citycode, cityname, adcode, adname,
                        # address, alias, biz_type, business_area, discount_num, email,
                        # entr_location, exit_location, gridcode, groupbuy_num, indoor_data,
                        # indoor_map, location, navi_poiid, photo_info, postcode, tag, tel, type, typecode, website,
                        save_info = '{0[id]}\u0001{0[name]}\u0001{0[pcode]}\u0001{0[pname]}\u0001' \
                                    '{0[citycode]}\u0001{0[cityname]}\u0001{0[adcode]}\u0001{0[adname]}\u0001' \
                                    '{0[address]}\u0001{0[alias]}\u0001{0[biz_type]}\u0001{0[business_area]}\u0001' \
                                    '{0[discount_num]}\u0001{0[email]}\u0001{0[entr_location]}\u0001' \
                                    '{0[exit_location]}\u0001' \
                                    '{0[gridcode]}\u0001{0[groupbuy_num]}\u0001{0[indoor_data]}\u0001' \
                                    '{0[indoor_map]}\u0001' \
                                    '{0[location]}\u0001{0[navi_poiid]}\u0001{0[photo_info]}\u0001{0[postcode]}\u0001' \
                                    '{0[tag]}\u0001{0[tel]}\u0001{0[type]}\u0001{0[typecode]}\u0001' \
                                    '{0[website]}'.format(save_dict)
                        save_data.append(save_info)
                        time.sleep(5)
                self.pipe.pipe_txt_save(save_data, filename=save_filename, savetype='a')

    @staticmethod
    def _engine_use_proxy():
        """
        使用代理ip
        :return: 代理ip
        """
        proxy_host = "proxy.abuyun.com"
        proxy_port = "9010"
        proxy_user = "******"
        proxy_pass = "******"
        proxy_meta = "http://%(user)s:%(pass)s@%(host)s:%(port)s" % {"host": proxy_host,
                                                                     "port": proxy_port,
                                                                     "user": proxy_user,
                                                                     "pass": proxy_pass}
        proxies = {"http": proxy_meta,
                   "https": proxy_meta}

        return proxies

    def run_engine(self):
        self._engine_residential_area()

예제 #4

파일 보기

class Engine:
    def __init__(self):
        self.crawl = Crawl()
        self.analysis = Analysis()
        self.pipe = Pipeline()

    def _engine_city_link(self):
        """
        获取所有城市的名称和url链接，结果输出到file_city_list.txt文本中
        :return:
        """
        content = self.crawl.crawl_by_get(setting.START_URL, headers=setting.HEADERS, proxies=self._engine_use_proxy())
        element_city = self.analysis.analysis_by_xpath(content, setting.XPATH_CITY_A)
        city_list = []
        for each_element in element_city:
            city_name = self.analysis.analysis_by_xpath(each_element, setting.XPATH_CITY_NAME)
            city_url = self.analysis.analysis_by_xpath(each_element, setting.XPATH_CITY_URL)
            city_list.append('{}\u0001{}'.format(''.join(city_name), ''.join(city_url)))
        self.pipe.pipe_txt_save(city_list, filename=setting.FILE_CITY_LIST)

    def _engine_surround_link(self):
        """
        获取每个城市中所有的周边游玩地点的链接
        :return:
        """
        city_list = self.pipe.pipe_txt_load(filename=setting.FILE_CITY_LIST)
        for each_city in city_list:
            url = each_city.strip().split('\u0001')[1] + '-zhoubian'
            name = each_city.strip().split('\u0001')[0]
            page = 1
            maxpage = 200  # 默认最大页数
            while True:
                try:
                    next_url = url + '-2-1-{}'.format(page)
                    save_list = []
                    # 获取总页数

                    content = self.crawl.crawl_by_get(next_url, headers=setting.HEADERS,
                                                      proxies=self._engine_use_proxy(),
                                                      retry=3, timeout=15)
                    # 找到最大页数,使用map函数
                    if page == 1:
                        pagecount = map(lambda x: int(x) if x != '下一页' else -1,
                                        self.analysis.analysis_by_xpath(content, xpahter=setting.XPATH_NEXTPAGE))
                        if pagecount:
                            maxpage = max(pagecount)
                    element_li = self.analysis.analysis_by_xpath(content, xpahter=setting.XPATH_DIV)
                    if not element_li:
                        break
                    for each_ele in element_li:
                        surround_name = self.analysis.analysis_by_xpath(each_ele, xpahter=setting.XPATH_SURROUND_NAME)
                        surround_type = self.analysis.analysis_by_xpath(each_ele, xpahter=setting.XPATH_SURROUND_TYPE)
                        surround_url = self.analysis.analysis_by_xpath(each_ele, xpahter=setting.XPATH_SURROUND_URL)
                        try:
                            save_info = '{}\u0001{}\u0001{}\u0001{}'.format(name, ''.join(surround_name),
                                                                            '-'.join(surround_type),
                                                                            ''.join(surround_url))
                        except:
                            continue
                        save_list.append(save_info)
                    self.pipe.pipe_txt_save(save_list, filename=setting.FILE_SURROUND_LIST, savetype='a')
                    if page >= maxpage:
                        break
                    page += 1
                    time.sleep(0.2)
                except:
                    break

    def _engine_surround_info(self):
        """
        获取所有周边游场所详细数据
        :return:
        """
        surround_list = self.pipe.pipe_txt_load(filename=setting.FILE_SURROUND_LIST)
        for each_res in surround_list:
            try:
                # 景区数据
                surround_info = each_res.strip().split('\u0001')
                city_name = surround_info[0]
                surround_name = surround_info[1]
                surround_url = surround_info[3]
                surround_type = surround_info[2]
                find_id = re.search(re.compile(r'p-oi(\d+)-'), surround_url)
                if find_id:
                    surround_id = find_id.group(1)
                else:
                    surround_id = 0
                # 获取店铺详细信息
                content = self.crawl.crawl_by_get(surround_url, headers=setting.HEADERS, proxies=self._engine_use_proxy(),
                                                  retry=5, timeout=15)
                detail = self.analysis.analysis_by_xpath(content, xpahter=setting.XPATH_SURROUND_DETAIL)
                detail['city_name'] = city_name
                detail['surround_name'] = surround_name
                detail['surround_url'] = surround_url
                detail['surround_id'] = surround_id
                detail['surround_type'] = surround_type
                detail['get_time'] = datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')
                # 字段顺序：city_name, surround_name, surround_id, surround_type
                # score, ranking, describe, address, tel, web, time, open_time, arrive,
                # ticket, travel_time, tip, surround_url, get_time
                save_data = '{0[city_name]}\u0001{0[surround_name]}\u0001{0[surround_id]}\u0001{0[surround_type]}\u0001' \
                            '{0[score]}\u0001{0[ranking]}\u0001{0[describe]}\u0001' \
                            '{0[address]}\u0001{0[tel]}\u0001{0[web]}\u0001' \
                            '{0[time]}\u0001{0[open_time]}\u0001{0[arrive]}\u0001' \
                            '{0[ticket]}\u0001{0[travel_time]}\u0001{0[tip]}\u0001' \
                            '{0[surround_url]}\u0001{0[get_time]}'.format(detail)
                self.pipe.pipe_txt_save(save_data, filename=setting.FILE_SURROUND_INFO, savetype='a')
                # self.pipe.pipe_mongo_save(detail, dbname='db_qunaer', colname='col_scenic_info')
                time.sleep(0.2)
            except:
                continue

    def _engine_surround_comments(self):
        """
        获取所有景区评论数据
        :return:
        """
        scen_list = self.pipe.pipe_txt_load(filename=setting.FILE_SURROUND_LIST)
        # 每个景区最新评论时间表
        check_dict = self.pipe.pipe_pickle_load(filename=setting.FILE_COMMENTS_CHECK)
        if not check_dict:
            check_dict = {}
        for each_res in scen_list:
            try:
                # 景区数据
                city = each_res.strip().split('\u0001')[0]
                surround = each_res.strip().split('\u0001')[1]
                surround_type = each_res.strip().split('\u0001')[2]
                surround_url = each_res.strip().split('\u0001')[3]
                find_id = re.search(re.compile(r'p-oi(\d+)-'), surround_url)
                if find_id:
                    surround_id = find_id.group(1)
                else:
                    continue
                api = setting.COMMENTS_API.format(surround_id)
                setting.HEADERS_COMMENTS['Referer'] = surround_url
                params = {
                    'page': 0,
                    'pageSize': '10',
                    'poiList': 'true',
                    'rank': 0,  # 全部评论
                    'sortField': 0  # 按照时间排序
                }
                comments_time = set([])
                current_time = check_dict.get(surround_id, '0')
                max_page = 1
                while True:
                    params['page'] += 1
                    content = self.crawl.crawl_by_get(api, headers=setting.HEADERS_COMMENTS,
                                                      proxies=self._engine_use_proxy(),
                                                      params=params, retry=3, timeout=15)
                    try:
                        content_dict = json.loads(content)
                    except:
                        break
                    if not content_dict.get('data'):
                        break
                    content_comments = content_dict.get('data')
                    # 第一遍抓取要确定评论页数
                    if params['page'] == 1:
                        page = self.analysis.analysis_by_xpath(content_comments, xpahter=setting.XPATH_COMMENTS_PAGE)
                        if page:
                            max_page = int(''.join(page))
                    elements_com = self.analysis.analysis_by_xpath(content_comments, xpahter=setting.XPATH_COMMENTS_LI)
                    if not elements_com:
                        break
                    for each_element in elements_com:
                        title = self.analysis.analysis_by_xpath(each_element, xpahter=setting.XPATH_COMMENTS_TITLE)
                        start = self.analysis.analysis_by_xpath(each_element, xpahter=setting.XPATH_COMMENTS_START)
                        nick = self.analysis.analysis_by_xpath(each_element, xpahter=setting.XPATH_COMMENTS_NICK)
                        more = self.analysis.analysis_by_xpath(each_element, xpahter=setting.XPATH_COMMENTS_MORE)
                        if more:
                            content_more = self.crawl.crawl_by_get(more[0], headers=setting.HEADERS,
                                                                   proxies=self._engine_use_proxy(), retry=2,
                                                                   timeout=15)
                            content = self.analysis.analysis_by_xpath(content_more,
                                                                      xpahter=setting.XPATH_COMMENTS_DETAIL)
                        else:
                            content = self.analysis.analysis_by_xpath(each_element,
                                                                      xpahter=setting.XPATH_COMMENTS_CONTENT)
                        date = self.analysis.analysis_by_xpath(each_element, xpahter=setting.XPATH_COMMENTS_DATE)
                        deal_content = ''.join(
                            list(map(lambda x: x.replace('\n', '').replace('\r', '').replace('\t', '').
                                     replace(' ', ''), content)))
                        if ''.join(date) > current_time:
                            commetents_info = {
                                'city': city,
                                'surround': surround,
                                'surround_id': surround_id,
                                'title': ''.join(title),
                                'nick': ''.join(nick),
                                'start': ''.join(start),
                                'content': deal_content,
                                'date': ''.join(date),
                                'get_time': datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S'),
                                'url': surround_url
                            }
                            comments_time.add(''.join(date))
                            for eachkey in commetents_info.keys():
                                commetents_info[eachkey] = commetents_info[eachkey].replace('\n', '').replace('\r', '')
                            # 存储数据
                            # 字段顺序
                            # city, surround, surround_id, title, nick, start, content, date, get_time, url
                            save_data = '{0[city]}\u0001{0[surround]}\u0001{0[surround_id]}\u0001' \
                                        '{0[title]}\u0001{0[nick]}\u0001{0[start]}\u0001' \
                                        '{0[content]}\u0001{0[date]}\u0001{0[get_time]}\u0001' \
                                        '{0[url]}\u0001'.format(commetents_info)
                            self.pipe.pipe_txt_save(save_data, filename=setting.FILE_SURROUND_COMMENTS,
                                                    savetype='a')
                            # self.pipe.pipe_mongo_save(save_list, dbname='db_qunaer', colname='col_scenic_comments')

                    if params['page'] >= max_page:
                        break
                    # 当前页面没有新增评论也切换至下一店铺
                    if not len(comments_time):
                        break
                if comments_time:
                    check_dict[surround_id] = max(comments_time)
                # 抓取到的评论数据
                self.pipe.pipe_pickle_save(check_dict, filename=setting.FILE_COMMENTS_CHECK)
            except:
                continue
                # 每个店铺最新的评论时间

    @staticmethod
    def _engine_use_proxy():
        """
        使用代理ip
        :return: 代理ip
        """
        proxy_host = "proxy.abuyun.com"
        proxy_port = "9010"
        proxy_user = "******"
        proxy_pass = "******"
        proxy_meta = "http://%(user)s:%(pass)s@%(host)s:%(port)s" % {"host": proxy_host,
                                                                     "port": proxy_port,
                                                                     "user": proxy_user,
                                                                     "pass": proxy_pass}
        proxies = {"http": proxy_meta,
                   "https": proxy_meta}

        return proxies

    def start_engine(self):
        self._engine_city_link()
        self._engine_surround_link()
        self._engine_surround_info()
        self._engine_surround_comments()

예제 #5

파일 보기

파일: engine.py 프로젝트: myirelias/TransferStation

class Engine:
    def __init__(self):
        self.crawl = Crawl()
        self.analysis = Analysis()
        self.pipe = Pipeline()

    def _engine_city_link(self):
        """
        获取所有城市的名称和url链接，结果输出到file_city_list.txt文本中
        :return:
        """
        content = self.crawl.crawl_by_get(setting.START_URL,
                                          headers=setting.HEADERS,
                                          proxies=self._engine_use_proxy())
        element_city = self.analysis.analysis_by_xpath(content,
                                                       setting.XPATH_CITY_A)
        city_list = []
        for each_element in element_city:
            city_name = self.analysis.analysis_by_xpath(
                each_element, setting.XPATH_CITY_NAME)
            city_url = self.analysis.analysis_by_xpath(each_element,
                                                       setting.XPATH_CITY_URL)
            city_list.append('{}\u0001{}'.format(''.join(city_name),
                                                 ''.join(city_url)))
        self.pipe.pipe_txt_save(city_list, filename=setting.FILE_CITY_LIST)

    def _engine_scenic_link(self):
        """
        获取每个城市中所有的热门景点的链接
        :return:
        """
        city_list = self.pipe.pipe_txt_load(filename=setting.FILE_CITY_LIST)
        for each_city in city_list:
            url = each_city.strip().split('\u0001')[1] + '-jingdian'
            city_name = each_city.strip().split('\u0001')[0]
            content = self.crawl.crawl_by_get(url,
                                              headers=setting.HEADERS,
                                              proxies=self._engine_use_proxy(),
                                              retry=2,
                                              timeout=15)
            element_a = self.analysis.analysis_by_xpath(
                content, xpahter=setting.XPATH_HOT_A)
            save_list = []
            for each_ele in element_a:
                scenic_full_name = self.analysis.analysis_by_xpath(
                    each_ele, xpahter=setting.XPATH_HOT_NAME)
                current_url = self.analysis.analysis_by_xpath(
                    each_ele, xpahter=setting.XPATH_HOT_HREF)
                scenic_name = ''.join(scenic_full_name).replace('旅游攻略', '')
                scenic_url = ''.join(current_url)
                scenic_id = re.search(re.compile(r'p-oi(\d+)-'),
                                      scenic_url).group(1)
                # 存储字段
                # city_name, scenic_id, scenic_name, scenic_url
                save_info = '{}\u0001{}\u0001{}\u0001{}'.format(
                    city_name, scenic_id, scenic_name, scenic_url)
                save_list.append(save_info)
            self.pipe.pipe_txt_save(save_list,
                                    filename=setting.FILE_SCENIC_LIST,
                                    savetype='a')

    @staticmethod
    def _engine_use_proxy():
        """
        使用代理ip
        :return: 代理ip
        """
        proxy_host = "proxy.abuyun.com"
        proxy_port = "9010"
        proxy_user = "******"
        proxy_pass = "******"
        proxy_meta = "http://%(user)s:%(pass)s@%(host)s:%(port)s" % {
            "host": proxy_host,
            "port": proxy_port,
            "user": proxy_user,
            "pass": proxy_pass
        }
        proxies = {"http": proxy_meta, "https": proxy_meta}

        return proxies

    def start_engine(self):
        self._engine_city_link()
        self._engine_scenic_link()

예제 #6

파일 보기

class Engine:
    def __init__(self):
        self.crawl = Crawl()
        self.pipe = Pipeline()
        self.analysis = Analysis()

    def _engine_search_by_city(self):
        """指定城市检索关键字数据"""
        city_id = self.pipe.pipe_txt_load(filename=setting.FILE_CITY_ID)
        history_id = list(
            map(lambda x: x.strip(),
                self.pipe.pipe_txt_load(filename=setting.FILE_HISTORY_ID)))
        current_params = deepcopy(setting.PARAMS)
        current_params['ak'] = setting.KEY
        for k, v in setting.QUERY_DICT.items():
            filename = 'baidu_{}.txt'.format(k)
            for query in v.get('query'):
                current_params['query'] = query  # 检索内容
                for each_city in city_id:
                    current_params['page_num'] = 0
                    citycode = each_city.strip().split('\u0001')[1]
                    current_params['region'] = citycode  # citycode,检索行政区域
                    while True:
                        time.sleep(0.2)
                        # 每种类型
                        current_params['page_num'] += 1
                        content = self.crawl.crawl_by_get(
                            setting.SEARCH_API,
                            params=current_params,
                            retry=2,
                            timeout=20)
                        try:
                            content_dict = json.loads(content)
                        except:
                            continue
                        results = content_dict.get('results', [])
                        if not results:
                            break
                        for each in results:
                            """
                            字段说明：
                            uid: 唯一标识, name: 名称, address: 地址, province: 所在省, city: 所在城市, area: 所在区域, 
                            street_id: 街道id, location: 地图坐标
                            tag: 标签类型, type: 类型, detail_url: 详情url,
                            """
                            # 存储数据
                            # uid, name, address, province, city, area, street_id, location
                            # (detail_info) tag, type, detail_url,
                            lat = each.get('location', {}).get('lat', 0)
                            lng = each.get('location', {}).get('lng', 0)
                            tag = each.get('detail_info', {}).get('tag', '')
                            uid = each.get('uid', '')
                            if uid in history_id:
                                continue
                            check_tag = tag.split(';')[0]
                            # 过滤一下，如果抓取到的数据不存在标签也默认为是正确的数据
                            if check_tag in v.get('tag') or check_tag == '':
                                save_dict = {
                                    'uid':
                                    each.get('uid', ''),
                                    'name':
                                    each.get('name', ''),
                                    'address':
                                    each.get('address', ''),
                                    'province':
                                    each.get('province', ''),
                                    'city':
                                    each.get('city', ''),
                                    'area':
                                    each.get('area', ''),
                                    'street_id':
                                    each.get('street_id', ''),
                                    'location':
                                    '{},{}'.format(lat, lng),
                                    'tag':
                                    tag,
                                    'type':
                                    each.get('detail_info',
                                             {}).get('type', ''),
                                    'detail_url':
                                    each.get('detail_info',
                                             {}).get('detail_url', '')
                                }
                                save_info = '{0[uid]}\u0001{0[name]}\u0001{0[address]}\u0001' \
                                            '{0[province]}\u0001{0[city]}\u0001{0[area]}\u0001' \
                                            '{0[street_id]}\u0001{0[location]}\u0001' \
                                            '{0[tag]}\u0001' \
                                            '{0[type]}\u0001{0[detail_url]}'.format(save_dict)
                                self.pipe.pipe_txt_save(
                                    uid,
                                    filename=setting.FILE_HISTORY_ID,
                                    savetype='a')
                                self.pipe.pipe_txt_save(save_info,
                                                        filename=filename,
                                                        savetype='a')

    def _engine_search_by_location(self):
        """
        指定坐标点检索关键字数据
        所有坐标数据来自 _engine_search_by_city 模块根据城市检索关键字的数据
        此模块开发原因是百度返回数据量只有400，想通过坐标获取更多数据
        :return:
        """

        city_name = list(
            map(lambda x: x.strip().split('\u0001')[1],
                self.pipe.pipe_txt_load(filename=setting.FILE_CITY_ID)))
        location_list = self._engine_all_location()
        history_id = list(
            map(lambda x: x.strip(),
                self.pipe.pipe_txt_load(filename=setting.FILE_HISTORY_ID)))
        current_params = deepcopy(setting.PARAMS)
        current_params['ak'] = setting.KEY
        for k, v in setting.QUERY_DICT.items():
            filename = 'baidu_{}.txt'.format(k)
            for query in v.get('query'):
                current_params['query'] = query  # 检索内容
                for each_location in location_list:
                    current_params['page_num'] = 0
                    current_params['location'] = each_location  # 检索坐标
                    while True:
                        time.sleep(0.2)
                        # 每种类型
                        current_params['page_num'] += 1
                        content = self.crawl.crawl_by_get(
                            setting.SEARCH_API,
                            params=current_params,
                            retry=2,
                            timeout=20)
                        try:
                            content_dict = json.loads(content)
                        except:
                            continue
                        results = content_dict.get('results', [])
                        if not results:
                            break
                        for each in results:
                            """
                            字段说明：
                            uid: 唯一标识, name: 名称, address: 地址, province: 所在省, city: 所在城市, area: 所在区域, 
                            street_id: 街道id, location: 地图坐标
                            tag: 标签类型, type: 类型, detail_url: 详情url,
                            """
                            # 存储数据
                            # uid, name, address, province, city, area, street_id, location
                            # (detail_info) tag, type, detail_url,
                            area = each.get('area', '')
                            if area not in city_name:  # 根绝坐标点抓取数据可能会超出目前限制的大成都范围，所以限制个区域吧
                                continue
                            lat = each.get('location', {}).get('lat', 0)
                            lng = each.get('location', {}).get('lng', 0)
                            tag = each.get('detail_info', {}).get('tag', '')
                            uid = each.get('uid', '')
                            if uid in history_id:
                                continue
                            check_tag = tag.split(';')[0]
                            # 过滤一下，如果抓取到的数据不存在标签也默认为是正确的数据
                            if check_tag in v.get('tag') or check_tag == '':
                                save_dict = {
                                    'uid':
                                    each.get('uid', ''),
                                    'name':
                                    each.get('name', ''),
                                    'address':
                                    each.get('address', ''),
                                    'province':
                                    each.get('province', ''),
                                    'city':
                                    each.get('city', ''),
                                    'area':
                                    each.get('area', ''),
                                    'street_id':
                                    each.get('street_id', ''),
                                    'location':
                                    '{},{}'.format(lat, lng),
                                    'tag':
                                    tag,
                                    'type':
                                    each.get('detail_info',
                                             {}).get('type', ''),
                                    'detail_url':
                                    each.get('detail_info',
                                             {}).get('detail_url', '')
                                }
                                save_info = '{0[uid]}\u0001{0[name]}\u0001{0[address]}\u0001' \
                                            '{0[province]}\u0001{0[city]}\u0001{0[area]}\u0001' \
                                            '{0[street_id]}\u0001{0[location]}\u0001' \
                                            '{0[tag]}\u0001' \
                                            '{0[type]}\u0001{0[detail_url]}'.format(save_dict)
                                self.pipe.pipe_txt_save(
                                    uid,
                                    filename=setting.FILE_HISTORY_ID,
                                    savetype='a')
                                self.pipe.pipe_txt_save(save_info,
                                                        filename=filename,
                                                        savetype='a')

    def _engine_all_location(self):
        """
        获取所有坐标点
        :return:
        """
        all_location = []
        for k, v in setting.QUERY_DICT.items():
            filename = 'baidu_{}.txt'.format(k)
            area_list = self.pipe.pipe_txt_load(filename=filename)
            if not area_list:
                continue
            all_location.extend(
                list(map(lambda x: x.strip().split('\u0001')[7], area_list)))
        return set(all_location)

    @staticmethod
    def _engine_use_proxy():
        """
        使用代理ip
        :return: 代理ip
        """
        proxy_host = "proxy.abuyun.com"
        proxy_port = "9010"
        proxy_user = "******"
        proxy_pass = "******"
        proxy_meta = "http://%(user)s:%(pass)s@%(host)s:%(port)s" % {
            "host": proxy_host,
            "port": proxy_port,
            "user": proxy_user,
            "pass": proxy_pass
        }
        proxies = {"http": proxy_meta, "https": proxy_meta}

        return proxies

    def run_engine(self):
        while True:
            self._engine_search_by_city()
            self._engine_search_by_location()
            nowtime = datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')
            save_log = []
            for k, v in setting.QUERY_DICT.items():
                filename = 'baidu_{}.txt'.format(k)
                save_log.append('[{}] {}: {} 条'.format(
                    nowtime, k,
                    len(self.pipe.pipe_txt_load(filename=filename))))
            save_log.append('=' * 30)
            self.pipe.pipe_txt_save(save_log,
                                    filename=setting.FILE_LOG_INFO,
                                    savetype='a')

예제 #7

파일 보기

class Engine:
    def __init__(self):
        self.crawl = Crawl()
        self.analysis = Analysis()
        self.pipe = Pipeline()
        self._use_log()
        try:
            self.args_dict = eval(sys.argv[1:])
            if not isinstance(self.args_dict, dict):
                raise ValueError('args must be like key-value ')
        except Exception as e:
            self.args_dict = {}
            logging.warning('get args failed:{}'.format(e))
        self.proxies = self.args_dict.get('proxies')  # 代理配置
        self.hdfs = self.args_dict.get('hdfs', {})  # hdfs配置
        # 如果没有这两个参数 直接报异常 不执行
        if not self.hdfs or not self.proxies:
            raise ValueError('args not have hdfs or proxies')
        self.sleep_time = self.args_dict.get('sleep_time', 0.2)  # 休眠时间
        self.service_args = self.args_dict.get('service_args',
                                               {})  # PhantomJS代理配置
        self.aliyun_log = self.args_dict.get('aliyun_log', {})
        self.alilog = AliyunLog(
            '{}_{}'.format(setting.OTA_NAME, setting.CATEGORY_NAME),
            endp=self.aliyun_log.get('endpoint', endpoint),
            accid=self.aliyun_log.get('accessKeyId', accessKeyId),
            acckey=self.aliyun_log.get('accessKey', accessKey),
            proj=self.aliyun_log.get('project', project),
            logst=self.aliyun_log.get('logstore',
                                      logstore))  # 阿里云log配置文件,需要校验如果没有该参数会不会报错
        try:
            self.HDFS = HDFileSystem(host=self.hdfs.get(
                'ip', '192.168.100.178'),
                                     port=self.hdfs.get('port', 8020))
        except:
            pass

    def _engine_city_link(self):
        """
        获取所有城市的名称和url链接，结果输出到file_city_list.txt文本中
        :return:
        """
        content = self.crawl.crawl_by_get(setting.START_URL,
                                          headers=setting.HEADERS,
                                          proxies=self.proxies)
        element_city = self.analysis.analysis_by_xpath(content,
                                                       setting.XPATH_CITY_A)
        city_list = []
        for each_element in element_city:
            city_name = self.analysis.analysis_by_xpath(
                each_element, setting.XPATH_CITY_NAME)
            city_url = self.analysis.analysis_by_xpath(each_element,
                                                       setting.XPATH_CITY_URL)
            city_list.append('{}\u0001{}'.format(''.join(city_name),
                                                 ''.join(city_url)))
        self.pipe.pipe_txt_save(city_list, filename=setting.FILE_CITY_LIST)

    def _engine_restaurant_link(self):
        """
        获取每个城市中所有的美食店铺的链接
        抓取之前获取当前已抓取的美食店铺id，当前抓取的id或进行校验是否为新增
        新增数据则存入到对应的TEMP文件中，最后本次循化完毕后，统一推送新增数据到HDFS
        本次循化所有模块执行完毕后，新增数据要追加入历史数据中，追加成功后修改新增数据文件名称，以便后面的新增文件不与前一次数据冲突
        修改新政文件名称时候使用完成抓取当日的日期作为文件名称前缀
        :return:
        """
        city_list = self.pipe.pipe_txt_load(filename=setting.FILE_CITY_LIST)
        # 获取已经抓取店铺id，便于识别新增数据
        history_restautrant = self.pipe.pipe_txt_load(
            filename=setting.FILE_RESTAURANT_LIST)
        history_id = set(
            map(lambda x: x.strip().split('\u0001')[2],
                [each for each in history_restautrant]))
        for each_city in set(city_list):
            # try:
            url = each_city.strip().split('\u0001')[1] + '-meishi'
            name = each_city.strip().split('\u0001')[0]
            params_city = {'page': 0}
            maxpage = 200  # 默认最大页数
            while True:
                save_list = []
                params_city['page'] += 1
                content = self.crawl.crawl_by_get(url,
                                                  headers=setting.HEADERS,
                                                  params=params_city,
                                                  proxies=self.proxies,
                                                  retry=5)
                if not content:
                    break
                element_li = self.analysis.analysis_by_xpath(
                    content, xpahter=setting.XPATH_LI)
                if not element_li:
                    break
                for each_ele in element_li:
                    restaurant_name = self.analysis.analysis_by_xpath(
                        each_ele, xpahter=setting.XPATH_RES_NAME)
                    restaurant_type = self.analysis.analysis_by_xpath(
                        each_ele, xpahter=setting.XPATH_RES_TYPE)
                    restaurant_url = self.analysis.analysis_by_xpath(
                        each_ele, xpahter=setting.XPATH_RES_URL)
                    current_id = re.search(re.compile(r'p-oi(\d+)-'),
                                           ''.join(restaurant_url)).group(1)
                    if current_id in history_id:
                        continue
                    else:
                        history_id.add(current_id)
                    try:
                        # 存储字段
                        # name, restaurant_name, current_id, restaurant_type，, restaurant_url
                        save_info = '{}\u0001{}\u0001{}\u0001{}\u0001{}'.format(
                            name, ''.join(restaurant_name), current_id,
                            ''.join(restaurant_type), ''.join(restaurant_url))
                    except Exception as e:
                        self.alilog.warning('[list] {}'.format(e))
                        continue
                    save_list.append(save_info)
                if save_list:
                    self.pipe.pipe_txt_save(
                        save_list,
                        filename=setting.TEMP_RESTAURANT_LIST,
                        savetype='a')
                if params_city['page'] >= maxpage:
                    break
                time.sleep(self.sleep_time)
                # except:
                #     continue

    def _engine_restaurant_info(self):
        """
        获取所有餐厅详细数据
        :return:
        """
        res_list = self.pipe.pipe_txt_load(
            filename=setting.FILE_RESTAURANT_LIST)
        temp_list = self.pipe.pipe_txt_load(
            filename=setting.TEMP_RESTAURANT_LIST)
        res_list.extend(temp_list)
        history_restautrant = self.pipe.pipe_txt_load(
            filename=setting.FILE_RESTAURANT_INFO)
        history_id = set(
            map(lambda x: x.strip().split('\u0001')[2],
                [each for each in history_restautrant]))
        for each_res in set(res_list):
            try:
                # 店铺数据
                res_info = each_res.strip().split('\u0001')
                city_name = res_info[0]
                res_name = res_info[1]
                res_id = res_info[2]
                if res_id in history_id:
                    continue
                else:
                    history_id.add(res_id)
                res_type = res_info[3]
                res_url = res_info[4]
                # 获取店铺详细信息
                content = self.crawl.crawl_by_get(res_url,
                                                  headers=setting.HEADERS,
                                                  proxies=self.proxies,
                                                  retry=5,
                                                  timeout=10)
                detail = self.analysis.analysis_by_xpath(
                    content, xpahter=setting.XPATH_RES_DETAIL)
                detail['city_name'] = city_name
                detail['restaurant_name'] = res_name
                detail['restaurant_type'] = res_type
                detail['restaurant_url'] = res_url
                detail['restaurant_id'] = res_id
                detail['get_time'] = datetime.datetime.now().strftime(
                    '%Y-%m-%d %H:%M:%S')
                # 构建存储的数据
                # 字段：
                # city_name, restaurant_name, restaurant_id, restaurant_type,
                # score, ranking, price, describe, address, tel, open_time, dish, arrive, intro, restaurant_url,
                # get_time datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')
                savedata = '{0[city_name]}\u0001{0[restaurant_name]}\u0001{0[restaurant_id]}\u0001' \
                           '{0[restaurant_type]}\u0001{0[score]}\u0001{0[ranking]}\u0001{0[price]}\u0001' \
                           '{0[describe]}\u0001{0[address]}\u0001{0[tel]}\u0001{0[open_time]}\u0001' \
                           '{0[dish]}\u0001{0[arrive]}\u0001{0[intro]}\u0001{0[restaurant_url]}\u0001' \
                           '{0[get_time]}'.format(detail)
                self.pipe.pipe_txt_save(savedata,
                                        filename=setting.TEMP_RESTAURANT_INFO,
                                        savetype='a')
                time.sleep(self.sleep_time)
            except Exception as e:
                self.alilog.warning('[detail] {}'.format(e))
                continue

    def _engine_restaurant_comments(self):
        """
        获取所有餐厅评论数据
        :return:
        """
        res_list = self.pipe.pipe_txt_load(
            filename=setting.FILE_RESTAURANT_LIST)
        temp_list = self.pipe.pipe_txt_load(
            filename=setting.TEMP_RESTAURANT_LIST)
        res_list.extend(temp_list)
        # 每个店铺最新评论时间表
        check_dict = self.pipe.pipe_pickle_load(
            filename=setting.FILE_COMMENTS_CHECK)
        if not check_dict:
            check_dict = {}
        for each_res in res_list:
            try:
                # 店铺数据
                city = each_res.strip().split('\u0001')[0]
                food = each_res.strip().split('\u0001')[1]
                res_id = each_res.strip().split('\u0001')[2]
                type = each_res.strip().split('\u0001')[3]
                res_url = each_res.strip().split('\u0001')[4]
                api = setting.COMMENTS_API.format(res_id)
                setting.HEADERS_COMMENTS['Referer'] = res_url
                params = {
                    'page': 0,
                    'pageSize': '10',
                    'poiList': 'true',
                    'rank': 0,  # 全部评论
                    'sortField': 0  # 按照时间排序
                }
                comments_time = set([])
                current_time = check_dict.get(res_id, '0')
                while True:
                    time.sleep(self.sleep_time)
                    try:
                        params['page'] += 1
                        content = self.crawl.crawl_by_get(
                            api,
                            headers=setting.HEADERS_COMMENTS,
                            proxies=self.proxies,
                            params=params,
                            retry=3,
                            timeout=20)
                        content_dict = json.loads(content)
                        if not content_dict.get('data'):
                            break
                        content_comments = content_dict.get('data')
                        elements_com = self.analysis.analysis_by_xpath(
                            content_comments,
                            xpahter=setting.XPATH_COMMENTS_LI)
                        if not elements_com:
                            break
                        for each_element in elements_com:
                            title = self.analysis.analysis_by_xpath(
                                each_element,
                                xpahter=setting.XPATH_COMMENTS_TITLE)
                            start = self.analysis.analysis_by_xpath(
                                each_element,
                                xpahter=setting.XPATH_COMMENTS_START)
                            nick = self.analysis.analysis_by_xpath(
                                each_element,
                                xpahter=setting.XPATH_COMMENTS_NICK)
                            more = self.analysis.analysis_by_xpath(
                                each_element,
                                xpahter=setting.XPATH_COMMENTS_MORE)
                            if more:
                                content_more = self.crawl.crawl_by_get(
                                    more[0],
                                    headers=setting.HEADERS,
                                    proxies=self.proxies)
                                content = self.analysis.analysis_by_xpath(
                                    content_more,
                                    xpahter=setting.XPATH_COMMENTS_DETAIL)
                            else:
                                content = self.analysis.analysis_by_xpath(
                                    each_element,
                                    xpahter=setting.XPATH_COMMENTS_CONTENT)
                            date = self.analysis.analysis_by_xpath(
                                each_element,
                                xpahter=setting.XPATH_COMMENTS_DATE)
                            try:
                                deal_content = ''.join(
                                    list(
                                        map(
                                            lambda x: x.replace('\n', '').
                                            replace('\r', '').replace(
                                                '\t', '').replace(' ', ''),
                                            content)))
                            except:
                                self.alilog.info(
                                    '[review] have no deal_content')
                                deal_content = ''
                            if ''.join(date) > current_time:
                                commetents_info = {
                                    'city':
                                    city,
                                    'food':
                                    food,
                                    'food_id':
                                    res_id,
                                    'type':
                                    type,
                                    'title':
                                    ''.join(title),
                                    'nick':
                                    ''.join(nick),
                                    'start':
                                    ''.join(start),
                                    'content':
                                    deal_content,
                                    'date':
                                    ''.join(date),
                                    'get_time':
                                    datetime.datetime.now().strftime(
                                        '%Y-%m-%d %H:%M:%S'),
                                    'url':
                                    res_url
                                }
                                for eachkey in commetents_info.keys():
                                    commetents_info[eachkey] = commetents_info[
                                        eachkey].replace('\n',
                                                         '').replace('\r', '')
                                # 存储数据
                                # 字段顺序：city, food, food_id, type, title, nick, start, content, date, get_time, url
                                save_info = '{0[city]}\u0001{0[food]}\u0001{0[food_id]}\u0001' \
                                            '{0[type]}\u0001{0[title]}\u0001{0[nick]}\u0001' \
                                            '{0[start]}\u0001{0[content]}\u0001{0[date]}\u0001' \
                                            '{0[get_time]}\u0001{0[url]}'.format(commetents_info)

                                self.pipe.pipe_txt_save(
                                    save_info,
                                    filename=setting.TEMP_RESTAURANT_COMMENTS,
                                    savetype='a')
                                comments_time.add(''.join(date))
                        # 当前页面没有新增评论也切换至下一店铺
                        if not len(comments_time):
                            break
                    except Exception as e:
                        self.alilog.warning('[review] {}'.format(e))
                        break
                # 每个店铺最新的评论时间
                if comments_time:
                    check_dict[res_id] = max(comments_time)
                    # 抓取到的评论数据
                self.pipe.pipe_pickle_save(
                    check_dict, filename=setting.FILE_COMMENTS_CHECK)
            except Exception as e:
                self.alilog.warning('[review] {}'.format(e))
                continue

    def _engine_restaurant_link_by_args(self):
        """
        根据配置参数来进行抓取，从该模块提供参数的接口
        :return:
        """
        # 传入的参数中是否有dist参数,此处暂时默认arg_dist为一个字符串参数，实际是一个列表
        arg_dist = self.args_dict.get('dist', [])
        # 如果没该参数，则全部抓取所有城市数据
        if not arg_dist:
            self._engine_restaurant_link()
        else:
            try:
                city_dict = eval(
                    self.pipe.pipe_txt_load(
                        filename='./DATA/file_city_dict.txt'))
            except Exception as e:
                logging.warning('get city dict error: {}'.format(e))
            # 假设此处获取到了待抓取的url
            prov = arg_dist[0]  # 省
            city = arg_dist[1]  # 市
            area = arg_dist[2]  # 县

            city_dict = {
                '四川省': {
                    '成都市': {
                        '': 'http1'
                    },
                    '德阳市': {
                        '': 'http2'
                    },
                    '眉山市': {
                        '': 'http3'
                    },
                    '人寿市': {
                        '': 'http4'
                    },
                }
            }
            if prov and city and area:
                current_list = city_dict.get(prov, {}).get(city,
                                                           {}).get(area, '')
                city_list = [current_list]
            elif prov and city and not area:
                current_list = city_dict.get(prov, {}).get(city, {})
                city_list = set([])
                for name, url in current_list.items():
                    city_list.add(url)
            elif prov and not city and not area:
                current_list = city_dict.get(prov, {})
                city_list = set([])
                for eachkey in current_list.keys():
                    for url in current_list[eachkey].values():
                        city_list.add(url)
            else:
                raise ValueError('args_dist error')

            # 获取已经抓取店铺id，便于识别新增数据
            history_restautrant = self.pipe.pipe_txt_load(
                filename=setting.FILE_RESTAURANT_LIST)
            history_id = set(
                map(lambda x: x.strip().split('\u0001')[2],
                    [each for each in history_restautrant]))
            for each_city in set(city_list):
                # try:
                url = each_city.strip().split('\u0001')[1] + '-meishi'
                name = each_city.strip().split('\u0001')[0]
                params_city = {'page': 0}
                maxpage = 200  # 默认最大页数
                while True:
                    save_list = []
                    params_city['page'] += 1
                    content = self.crawl.crawl_by_get(url,
                                                      headers=setting.HEADERS,
                                                      params=params_city,
                                                      proxies=self.proxies,
                                                      retry=5)
                    if not content:
                        break
                    element_li = self.analysis.analysis_by_xpath(
                        content, xpahter=setting.XPATH_LI)
                    if not element_li:
                        break
                    for each_ele in element_li:
                        restaurant_name = self.analysis.analysis_by_xpath(
                            each_ele, xpahter=setting.XPATH_RES_NAME)
                        restaurant_type = self.analysis.analysis_by_xpath(
                            each_ele, xpahter=setting.XPATH_RES_TYPE)
                        restaurant_url = self.analysis.analysis_by_xpath(
                            each_ele, xpahter=setting.XPATH_RES_URL)
                        current_id = re.search(
                            re.compile(r'p-oi(\d+)-'),
                            ''.join(restaurant_url)).group(1)
                        if current_id in history_id:
                            continue
                        else:
                            history_id.add(current_id)
                        try:
                            # 存储字段
                            # name, restaurant_name, current_id, restaurant_type，, restaurant_url
                            save_info = '{}\u0001{}\u0001{}\u0001{}\u0001{}'.format(
                                name, ''.join(restaurant_name), current_id,
                                ''.join(restaurant_type),
                                ''.join(restaurant_url))
                        except Exception as e:
                            self.alilog.warning('[list] {}'.format(e))
                            continue
                        save_list.append(save_info)
                    if save_list:
                        self.pipe.pipe_txt_save(
                            save_list,
                            filename=setting.TEMP_RESTAURANT_LIST,
                            savetype='a')
                    if params_city['page'] >= maxpage:
                        break
                    time.sleep(self.sleep_time)
                    # except:
                    #     continue

    def _temp_city_info(self, cityname):
        """
        做22项数据处理时临时用
        :return:
        """
        citylist = self.pipe.pipe_txt_load(filename='city_list_total.txt')
        city_params = {
            '国别': '&',
            '省自治区全称': '&',
            '省自治区简称': '&',
            '市州全称': '&',
            '市州简称': '&',
            '区县全称': '&',
            '区县简称': '&',
            '地区编码': '&',
            '等级': '&'
        }
        spec_city = {
            '北京': '110000',
            '天津': '120000',
            '上海': '310000',
            '重庆': '500000'
        }
        for each in citylist:
            cityinfo = each.split('\u0001')
            if cityname in cityinfo:
                site = cityinfo.index(cityname)
                if site == 4 or site == 5:
                    city_params['国别'] = 'CN'
                    city_params['省自治区全称'] = cityinfo[0].strip()
                    city_params['省自治区简称'] = cityinfo[1].strip()
                    city_params['市州全称'] = cityinfo[2].strip()
                    city_params['市州简称'] = cityinfo[3].strip()
                    city_params['区县全称'] = cityinfo[4].strip()
                    city_params['区县简称'] = cityinfo[5].strip()
                    city_params['地区编码'] = cityinfo[-1].strip()
                    city_params['等级'] = '区县级'
                elif site == 2 or site == 3:
                    city_params['国别'] = 'CN'
                    city_params['省自治区全称'] = cityinfo[0].strip()
                    city_params['省自治区简称'] = cityinfo[1].strip()
                    city_params['市州全称'] = cityinfo[2].strip()
                    city_params['市州简称'] = cityinfo[3].strip()
                    city_params['地区编码'] = cityinfo[-1].strip()[:-2] + '00'
                    city_params['等级'] = '地市级'
                elif cityname in ['北京', '重庆', '上海', '天津']:
                    city_params['国别'] = 'CN'
                    city_params['省自治区全称'] = cityname + '市'
                    city_params['省自治区简称'] = cityname
                    city_params['市州全称'] = cityname + '市'
                    city_params['市州简称'] = cityname
                    city_params['地区编码'] = spec_city[cityname]
                    city_params['等级'] = '直辖'
                break

        return city_params

    @staticmethod
    def _engine_use_proxy():
        """
        使用代理ip
        :return: 代理ip
        """
        proxy_host = "****"
        proxy_port = "****"
        proxy_user = "******"
        proxy_pass = "******"
        proxy_meta = "http://%(user)s:%(pass)s@%(host)s:%(port)s" % {
            "host": proxy_host,
            "port": proxy_port,
            "user": proxy_user,
            "pass": proxy_pass
        }
        proxies = {"http": proxy_meta, "https": proxy_meta}

        return proxies

    # 集群操作
    def _engine_push_hdfs(self, filename):
        try:
            if os.path.exists('DATA/' + filename):
                # HDFS.put(当前文件，目标文件)
                self.HDFS.put('DATA/' + filename,
                              '/user/spider/everyday/{}'.format(filename))
            # 推送备份数据
            for eachfile in [
                    setting.FILE_RESTAURANT_LIST, setting.FILE_RESTAURANT_INFO,
                    setting.FILE_RESTAURANT_COMMENTS
            ]:
                if os.path.exists('DATA/' + eachfile):
                    # HDFS.put(当前文件，目标文件)
                    self.HDFS.put(
                        'DATA/' + eachfile,
                        '/user/spider/xieyangjie/Qunar/{}'.format(eachfile))
        except Exception as e:
            print('集群挂了', e)

    @staticmethod
    def _use_log(LOGFMT=None, DATEFMT=None):
        """
        本地日志记录
        该模块用于配置日志记录格式以及存储位置等
        :return:
        """
        LOGFMT = "%(asctime)s - %(levelname)s - %(message)s"
        DATEFMT = "%Y/%m/%d %H:%M:%S"
        logging.basicConfig(filename='./logbag/{}_{}_{}.log'.format(
            setting.OTA_NAME, setting.CATEGORY_NAME,
            datetime.datetime.today().strftime('%Y%m%d')),
                            format=LOGFMT,
                            datefmt=DATEFMT,
                            level=logging.INFO)

    def start_engine(self):
        self.alilog.debug('script {}_{} running'.format(
            setting.OTA_NAME, setting.CATEGORY_NAME))
        self._engine_restaurant_link_by_args()
        return
        self._engine_restaurant_link_by_args()
        self._engine_restaurant_info()
        self._engine_restaurant_comments()
        logging.info('{}_{} spider running'.format(setting.OTA_NAME,
                                                   setting.CATEGORY_NAME))
        try:
            self._engine_city_link()
            self.alilog.debug('script {}_{} running'.format(
                setting.OTA_NAME, setting.CATEGORY_NAME))
            while True:
                self._engine_restaurant_link()
                self._engine_restaurant_info()
                self._engine_restaurant_comments()
                current_time = datetime.datetime.now().strftime('%Y-%m-%d')
                file_dict = {
                    setting.FILE_RESTAURANT_LIST:
                    setting.TEMP_RESTAURANT_LIST,
                    setting.FILE_RESTAURANT_INFO:
                    setting.TEMP_RESTAURANT_INFO,
                    setting.FILE_RESTAURANT_COMMENTS:
                    setting.TEMP_RESTAURANT_COMMENTS
                }
                for f, t in file_dict.items():
                    newname = 'qunar{}({}).txt'.format(t[4:-4], current_time)
                    if os.path.exists('DATA/{}'.format(f)):
                        temp = self.pipe.pipe_txt_load(filename=t)
                        if temp:
                            self.pipe.pipe_txt_save(list(
                                map(lambda x: x.strip(), temp)),
                                                    filename=f,
                                                    savetype='a')
                            os.rename('DATA/{}'.format(t),
                                      'DATA/{}'.format(newname))
                        else:
                            self.pipe.pipe_txt_save('', filename=newname)
                    else:
                        os.rename('DATA/{}'.format(t), 'DATA/{}'.format(f))
                        shutil.copy('DATA/{}'.format(f),
                                    'DATA/{}'.format(newname))
                    self._engine_push_hdfs(newname)
                self.alilog.debug('script {}_{} finish'.format(
                    setting.OTA_NAME, setting.CATEGORY_NAME))
        except Exception as e:
            self.alilog.error('script {}_{} error {}'.format(
                setting.OTA_NAME, setting.CATEGORY_NAME, e))

예제 #8

파일 보기

class Engine(object):
    def __init__(self):
        self.crawl = Crawl()
        self.analysis = Analysis()
        self.pipe = Pipeline()

    def _engine_get_citylist(self):
        """
        获取城市列表，包括城市的url及城市名称
        :return:
        """
        content = self.crawl.crawl_by_get(setting.START_URL,
                                          headers=setting.HEADERS)
        res = self.analysis.analysis_by_xpath(content,
                                              xpahter=setting.XPATH_CITYLIST_A)
        saveinfo = set([])
        for each in res:
            cityname = self.analysis.analysis_by_xpath(
                each, xpahter=setting.XPATH_TEXT)
            cityhref = self.analysis.analysis_by_xpath(
                each, xpahter=setting.XPATH_HREF)
            citylink = setting.START_URL + cityhref[0][1:]
            try:
                savelist = '{}\u0001{}'.format(cityname[0], citylink)
                saveinfo.add(savelist)
            except:
                continue
        self.pipe.pipe_txt_save(saveinfo,
                                filename=setting.FILE_CITY_LIST,
                                savetype='w')

    def _engine_get_touristlist(self):
        """
        获取所有景区链接以及id
        :return:
        """
        # 清空文本
        self.pipe.pipe_remove_file(setting.FILE_TOURIST_LIST)
        citylist = self.pipe.pipe_txt_load(filename=setting.FILE_CITY_LIST,
                                           loadtype='r')
        for eachcity in citylist:
            try:
                saveinfo = set([])
                params = {
                    'from': 'mpshouye_hotdest_more',
                    'keyword': '柳州',
                    'page': 1
                }
                cityname = eachcity.strip().split('\u0001')[0]
                params['keyword'] = cityname
                while True:
                    content = self.crawl.crawl_by_get(
                        setting.TOURIS_URL,
                        params=params,
                        headers=setting.HEADERS,
                        proxies=self._engine_use_proxy(),
                        retry=3,
                        timeout=15)
                    res_element = self.analysis.analysis_by_xpath(
                        content, xpahter=setting.XPATH_TOURIST_A)
                    if not res_element:
                        break
                    for eachelement in res_element:
                        tourist_name = self.analysis.analysis_by_xpath(
                            eachelement, xpahter=setting.XPATH_TEXT)
                        tourist_href = self.analysis.analysis_by_xpath(
                            eachelement, xpahter=setting.XPATH_HREF)
                        tourist_link = setting.START_URL + tourist_href[0][1:]
                        pattern = re.compile(r'detail_(\d+)', re.S)
                        re_id = re.search(pattern, tourist_link)
                        if re_id:
                            tourist_id = re_id.group(1)
                        else:
                            tourist_id = ''
                        # 数据结构依次为 景区名字 景区id 景区链接
                        saveinfo.add('{}\u0001{}\u0001{}'.format(
                            tourist_name[0], tourist_id, tourist_link))
                        # print(saveinfo)
                    params['page'] += 1
                self.pipe.pipe_txt_save(saveinfo,
                                        filename=setting.FILE_TOURIST_LIST,
                                        savetype='a')
            except:
                continue

    def _engine_get_touristinfo(self):
        """
        获取每个景区详细信息
        :return:
        """
        tourist_list = self.pipe.pipe_txt_load(
            filename=setting.FILE_TOURIST_LIST)
        for eachtourist in tourist_list:
            try:
                tourist_url = eachtourist.strip().split('\u0001')[2]
                content = self.crawl.crawl_by_get(
                    tourist_url,
                    headers=setting.HEADERS,
                    proxies=self._engine_use_proxy())
                res = self.analysis.analysis_by_xpath(
                    content, setting.XPATH_TOURIST_DETAIL)
                # 存储数据
                # 字段顺序
                # t_name, t_type, t_des, address, score, price, describe
                save_data = '{0[t_name]}\u0001{0[t_type]}\u0001{0[t_des]}\u0001' \
                            '{0[address]}\u0001{0[score]}\u0001{0[price]}\u0001' \
                            '{0[describe]}'.format(res)
                self.pipe.pipe_txt_save(save_data,
                                        filename=setting.FILE_TOURIST_INFO,
                                        savetype='a')
                time.sleep(0.1)
            except:
                continue

    def _engine_get_comments(self):
        """
        获取景区评论数据
        :return:
        """
        # 景区名称/id/链接
        tourist_list = self.pipe.pipe_txt_load(
            filename='file_tourist_list.txt', loadtype='r')
        for each_tourist in tourist_list:
            try:
                tourist_id = each_tourist.strip().split('\u0001')[1]
                tourist_url = each_tourist.strip().split('\u0001')[2]
                tourist_name = each_tourist.strip().split('\u0001')[0]
                # 评论翻页参数
                params_comments = {
                    'sightId': '12579',
                    'index': 0,
                    'page': 0,
                    'pageSize': '10',
                    'tagType': '0',
                }
                # 查询景区节点数据
                check_node = self.pipe.pipe_pickle_load(
                    filename=setting.FILE_TOURIST_CHECK)
                if not check_node:
                    check_node = {}
                tourist_node = check_node.get(tourist_id, {})
                # 节点中记录的上次抓取评论数量
                node_count = tourist_node.get('comments_count', 0)
                # 节点中记录的上次抓取的最大时间节点
                node_latest = tourist_node.get('comments_latest', '0')
                savelist = []  # 有效评论(新增评论)
                latest_time = set([])  # 评论时间集合
                datanum = -1  # 评论数量
                while True:
                    params_comments['sightId'] = tourist_id
                    params_comments['index'] += 1
                    params_comments['page'] += 1
                    setting.HEADERS_COMMENTS['Referer'] = tourist_url
                    content = self.crawl.crawl_by_get(
                        setting.COMMENTS_API,
                        headers=setting.HEADERS_COMMENTS,
                        params=params_comments,
                        proxies=self._engine_use_proxy(),
                        retry=2,
                        timeout=15)
                    contnet_dict = json.loads(content)
                    # 查看当前评论数量,自在第一页的时候进行这一步检查
                    if params_comments['page'] == 1:
                        taglist = contnet_dict.get('data',
                                                   {}).get('tagList', [])
                        if taglist:
                            for each in taglist:
                                if each.get('tagName') == '全部':
                                    datanum = each.get('tagNum')
                                    break
                        # 如果节点中的数据数量与当前景区实时评论数量一致，则说明没有新增评论
                        if node_count == datanum:
                            break
                    # 获取评论列表
                    datalist = contnet_dict.get('data',
                                                {}).get('commentList', [])
                    if not datalist:
                        break
                    # 直接写到mongodb中去 如果后期需要输出到txt文本，请修改此处
                    current_data = False
                    for each in datalist:
                        current_time = each.get('date')
                        each['tourist_id'] = tourist_id
                        each['tourist_name'] = tourist_name
                        each['tourist_url'] = tourist_url
                        each['get_time'] = datetime.datetime.now().strftime(
                            '%Y-%m-%d %H:%M:%S')
                        if current_time > node_latest:
                            # 存储数据
                            # 字段顺序
                            # tourist_name, tourist_id, author, commentId, content, date, score, get_time,tourist_url
                            save_data = '{0[tourist_name]}\u0001{0[tourist_id]}\u0001{0[author]}\u0001' \
                                        '{0[commentId]}\u0001{0[content]}\u0001{0[date]}\u0001' \
                                        '{0[score]}\u0001{0[get_time]}\u0001{0[tourist_url]}\u0001'.format(each)
                            self.pipe.pipe_txt_save(
                                save_data,
                                filename=setting.FILE_TOURIST_COMMENTS,
                                savetype='a')
                            latest_time.add(current_time)
                            current_data = True
                    # 如果当前页面没有新增数据，且页码数大于15页面，则此景区本次评论抓取结束
                    if not current_data and params_comments['page'] >= 15:
                        break
                    time.sleep(0.2)
                # 若从页面获取数据量失败，则不更新数据量这一字段
                if datanum != -1:
                    tourist_node['comments_count'] = datanum
                # 若没有新增的数据 则不更新数据时间节点这一字段
                if latest_time:
                    tourist_node['comments_latest'] = max(latest_time)
                check_node = {tourist_id: tourist_node}
                self.pipe.pipe_pickle_save(check_node,
                                           filename=setting.FILE_TOURIST_CHECK)
            except:
                continue

    @staticmethod
    def _engine_use_proxy():
        """
        使用代理ip
        :return: 代理ip
        """
        proxy_host = "proxy.abuyun.com"
        proxy_port = "9010"
        proxy_user = "******"
        proxy_pass = "******"
        proxy_meta = "http://%(user)s:%(pass)s@%(host)s:%(port)s" % {
            "host": proxy_host,
            "port": proxy_port,
            "user": proxy_user,
            "pass": proxy_pass
        }
        proxies = {"http": proxy_meta, "https": proxy_meta}

        return proxies

    def engine_run(self):
        self._engine_get_citylist()
        self._engine_get_touristlist()
        self._engine_get_touristinfo()
        self._engine_get_comments()

예제 #9

파일 보기

class Engine:
    """
    成都公交线路数据抓取脚本
    """
    def __init__(self):
        self.crawl = Crawl()
        self.analysis = Analysis()
        self.pipe = Pipeline()

    def _engine_bus_info(self):
        """
        获取所有bus的urls
        :return:
        """
        content_home = self.crawl.crawl_by_get(setting.START_URL,
                                               headers=setting.HEADERS,
                                               retry=2,
                                               timeout=30)
        each_list = self.analysis.analysis_by_xpath(content_home,
                                                    xpahter=setting.XPATH_LIST)
        urls = list(map(lambda x: setting.DOMAIN_URL.format(x), each_list))
        for each in urls:
            content_bus = self.crawl.crawl_by_get(each,
                                                  headers=setting.HEADERS,
                                                  retry=2,
                                                  timeout=30)
            bus_list = self.analysis.analysis_by_xpath(
                content_bus, xpahter=setting.XPATH_BUS)
            bus_urls = list(
                map(lambda x: setting.DOMAIN_URL.format(x), bus_list))
            if bus_urls:
                self.pipe.pipe_txt_save(bus_urls,
                                        filename=setting.FILE_BUS_LIST)

    def _engine_bus_detail(self):
        """
        获取bus详细信息
        :return:
        """
        bus_urls = self.pipe.pipe_txt_load(filename=setting.FILE_BUS_LIST)
        for each_bus in bus_urls:
            content_detail = self.crawl.crawl_by_get(each_bus,
                                                     headers=setting.HEADERS,
                                                     retry=2,
                                                     timeout=30)
            detail_info = self.analysis.analysis_by_xpath(
                content_detail, xpahter=setting.XPATH_DETAIL)
            # 存储字段
            # name,time,ticket,company,update,station
            """
            name:线路名称，time:收发时间,ticket:票价,
            company:所属公司,update:最后更新时间,station:途经站点,
            """
            save_info = '{0[name]}\u0001{0[time]}\u0001{0[ticket]}\u0001' \
                        '{0[company]}\u0001{0[update]}\u0001{0[station]}'.format(detail_info)
            self.pipe.pipe_txt_save(save_info,
                                    filename=setting.FILE_BUS_DETAIL)
            time.sleep(2)

    @staticmethod
    def _engine_use_proxy():
        """
        使用代理ip
        :return: 代理ip
        """
        proxy_host = "proxy.abuyun.com"
        proxy_port = "9010"
        proxy_user = "******"
        proxy_pass = "******"
        proxy_meta = "http://%(user)s:%(pass)s@%(host)s:%(port)s" % {
            "host": proxy_host,
            "port": proxy_port,
            "user": proxy_user,
            "pass": proxy_pass
        }
        proxies = {"http": proxy_meta, "https": proxy_meta}

        return proxies

    def run_engine(self):
        self._engine_bus_info()
        self._engine_bus_detail()

예제 #10

파일 보기

파일: engine.py 프로젝트: myirelias/TransferStation

class Engine:
    def __init__(self):
        self.crawl = Crawl()
        self.analysis = Analysis()
        self.pipe = Pipeline()

    def _engine_city_link(self):
        """
        获取所有城市的名称和url链接，结果输出到file_city_list.txt文本中
        :return:
        """
        content = self.crawl.crawl_by_get(setting.START_URL,
                                          headers=setting.HEADERS,
                                          proxies=self._engine_use_proxy())
        element_city = self.analysis.analysis_by_xpath(content,
                                                       setting.XPATH_CITY_A)
        city_list = []
        for each_element in element_city:
            city_name = self.analysis.analysis_by_xpath(
                each_element, setting.XPATH_CITY_NAME)
            city_url = self.analysis.analysis_by_xpath(each_element,
                                                       setting.XPATH_CITY_URL)
            city_list.append('{}\u0001{}'.format(''.join(city_name),
                                                 ''.join(city_url)))
        self.pipe.pipe_txt_save(city_list, filename=setting.FILE_CITY_LIST)

    def _engine_restaurant_link(self):
        """
        获取每个城市中所有的美食店铺的链接
        抓取之前获取当前已抓取的美食店铺id，当前抓取的id或进行校验是否为新增
        新增数据则存入到对应的TEMP文件中，最后本次循化完毕后，统一推送新增数据到HDFS
        本次循化所有模块执行完毕后，新增数据要追加入历史数据中，追加成功后修改新增数据文件名称，以便后面的新增文件不与前一次数据冲突
        修改新政文件名称时候使用完成抓取当日的日期作为文件名称前缀
        :return:
        """
        city_list = self.pipe.pipe_txt_load(filename=setting.FILE_CITY_LIST)
        # 获取已经抓取店铺id，便于识别新增数据
        history_restautrant = self.pipe.pipe_txt_load(
            filename=setting.FILE_RESTAURANT_LIST)
        history_id = set(
            map(lambda x: x.strip().split('\u0001')[2],
                [each for each in history_restautrant]))
        for each_city in city_list:
            # try:
            url = each_city.strip().split('\u0001')[1] + '-meishi'
            name = each_city.strip().split('\u0001')[0]
            params_city = {'page': 0}
            maxpage = 200  # 默认最大页数
            while True:
                save_list = []
                params_city['page'] += 1
                content = self.crawl.crawl_by_get(
                    url,
                    headers=setting.HEADERS,
                    params=params_city,
                    proxies=self._engine_use_proxy(),
                    retry=5)
                if not content:
                    break
                element_li = self.analysis.analysis_by_xpath(
                    content, xpahter=setting.XPATH_LI)
                if not element_li:
                    break
                for each_ele in element_li:
                    restaurant_name = self.analysis.analysis_by_xpath(
                        each_ele, xpahter=setting.XPATH_RES_NAME)
                    restaurant_type = self.analysis.analysis_by_xpath(
                        each_ele, xpahter=setting.XPATH_RES_TYPE)
                    restaurant_url = self.analysis.analysis_by_xpath(
                        each_ele, xpahter=setting.XPATH_RES_URL)
                    current_id = re.search(re.compile(r'p-oi(\d+)-'),
                                           ''.join(restaurant_url)).group(1)
                    if current_id in history_id:
                        continue
                    try:
                        save_info = '{}\u0001{}\u0001{}\u0001{}\u0001{}'.format(
                            name, ''.join(restaurant_name), current_id,
                            ''.join(restaurant_type), ''.join(restaurant_url))
                    except:
                        continue
                    save_list.append(save_info)
                if save_list:
                    self.pipe.pipe_txt_save(
                        save_list,
                        filename=setting.TEMP_RESTAURANT_LIST,
                        savetype='a')
                if params_city['page'] >= maxpage:
                    break
                time.sleep(0.1)
            # except:
            #     continue

    def _engine_restaurant_info(self):
        """
        获取所有餐厅详细数据
        :return:
        """
        res_list = self.pipe.pipe_txt_load(
            filename=setting.FILE_RESTAURANT_LIST)
        temp_list = self.pipe.pipe_txt_load(
            filename=setting.TEMP_RESTAURANT_LIST)
        res_list.extend(temp_list)
        history_restautrant = self.pipe.pipe_txt_load(
            filename=setting.FILE_RESTAURANT_INFO)
        history_id = set(
            map(lambda x: x.strip().split('\u0001')[2],
                [each for each in history_restautrant]))
        for each_res in res_list:
            try:
                # 店铺数据
                res_info = each_res.strip().split('\u0001')
                city_name = res_info[0]
                res_name = res_info[1]
                res_id = res_info[2]
                if res_id in history_id:
                    continue
                res_type = res_info[3]
                res_url = res_info[4]
                # 获取店铺详细信息
                content = self.crawl.crawl_by_get(
                    res_url,
                    headers=setting.HEADERS,
                    proxies=self._engine_use_proxy(),
                    retry=5,
                    timeout=10)
                detail = self.analysis.analysis_by_xpath(
                    content, xpahter=setting.XPATH_RES_DETAIL)
                detail['city_name'] = city_name
                detail['restaurant_name'] = res_name
                detail['restaurant_type'] = res_type
                detail['restaurant_url'] = res_url
                detail['restaurant_id'] = res_id
                detail['get_time'] = datetime.datetime.now().strftime(
                    '%Y-%m-%d %H:%M:%S')
                # 构建存储的数据
                # 字段：
                # city_name, restaurant_name, restaurant_id, restaurant_type,
                # score, ranking, price, describe, address, tel, open_time, dish, arrive, intro, restaurant_url,
                # get_time datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')
                savedata = '{0[city_name]}\u0001{0[restaurant_name]}\u0001{0[restaurant_id]}\u0001' \
                           '{0[restaurant_type]}\u0001{0[score]}\u0001{0[ranking]}\u0001{0[price]}\u0001' \
                           '{0[describe]}\u0001{0[address]}\u0001{0[tel]}\u0001{0[open_time]}\u0001' \
                           '{0[dish]}\u0001{0[arrive]}\u0001{0[intro]}\u0001{0[restaurant_url]}\u0001' \
                           '{0[get_time]}'.format(detail)
                self.pipe.pipe_txt_save(savedata,
                                        filename=setting.TEMP_RESTAURANT_INFO,
                                        savetype='a')
                # self.pipe.pipe_mongo_save(detail, dbname='db_qunaer', colname='col_food_info')
                time.sleep(0.02)
            except Exception as e:
                print('crawl error', e)
                continue

    def _engine_restaurant_comments(self):
        """
        获取所有餐厅评论数据
        :return:
        """
        res_list = self.pipe.pipe_txt_load(
            filename=setting.FILE_RESTAURANT_LIST)
        temp_list = self.pipe.pipe_txt_load(
            filename=setting.TEMP_RESTAURANT_LIST)
        res_list.extend(temp_list)
        # 每个店铺最新评论时间表
        check_dict = self.pipe.pipe_pickle_load(
            filename=setting.FILE_COMMENTS_CHECK)
        if not check_dict:
            check_dict = {}
        for each_res in res_list:
            try:
                # 店铺数据
                city = each_res.strip().split('\u0001')[0]
                food = each_res.strip().split('\u0001')[1]
                res_id = each_res.strip().split('\u0001')[2]
                type = each_res.strip().split('\u0001')[3]
                res_url = each_res.strip().split('\u0001')[4]
                api = setting.COMMENTS_API.format(res_id)
                setting.HEADERS_COMMENTS['Referer'] = res_url
                params = {
                    'page': 0,
                    'pageSize': '10',
                    'poiList': 'true',
                    'rank': 0,  # 全部评论
                    'sortField': 0  # 按照时间排序
                }
                comments_time = set([])
                current_time = check_dict.get(res_id, '0')
                while True:
                    time.sleep(0.2)
                    try:
                        params['page'] += 1
                        content = self.crawl.crawl_by_get(
                            api,
                            headers=setting.HEADERS_COMMENTS,
                            proxies=self._engine_use_proxy(),
                            params=params,
                            retry=3,
                            timeout=20)
                        content_dict = json.loads(content)
                        if not content_dict.get('data'):
                            break
                        content_comments = content_dict.get('data')
                        elements_com = self.analysis.analysis_by_xpath(
                            content_comments,
                            xpahter=setting.XPATH_COMMENTS_LI)
                        if not elements_com:
                            break
                        for each_element in elements_com:
                            title = self.analysis.analysis_by_xpath(
                                each_element,
                                xpahter=setting.XPATH_COMMENTS_TITLE)
                            start = self.analysis.analysis_by_xpath(
                                each_element,
                                xpahter=setting.XPATH_COMMENTS_START)
                            nick = self.analysis.analysis_by_xpath(
                                each_element,
                                xpahter=setting.XPATH_COMMENTS_NICK)
                            more = self.analysis.analysis_by_xpath(
                                each_element,
                                xpahter=setting.XPATH_COMMENTS_MORE)
                            if more:
                                content_more = self.crawl.crawl_by_get(
                                    more[0],
                                    headers=setting.HEADERS,
                                    proxies=self._engine_use_proxy())
                                content = self.analysis.analysis_by_xpath(
                                    content_more,
                                    xpahter=setting.XPATH_COMMENTS_DETAIL)
                            else:
                                content = self.analysis.analysis_by_xpath(
                                    each_element,
                                    xpahter=setting.XPATH_COMMENTS_CONTENT)
                            date = self.analysis.analysis_by_xpath(
                                each_element,
                                xpahter=setting.XPATH_COMMENTS_DATE)
                            try:
                                deal_content = ''.join(
                                    list(
                                        map(
                                            lambda x: x.replace('\n', '').
                                            replace('\r', '').replace(
                                                '\t', '').replace(' ', ''),
                                            content)))
                            except:
                                deal_content = ''
                            if ''.join(date) > current_time:
                                commetents_info = {
                                    'city':
                                    city,
                                    'food':
                                    food,
                                    'food_id':
                                    res_id,
                                    'type':
                                    type,
                                    'title':
                                    ''.join(title),
                                    'nick':
                                    ''.join(nick),
                                    'start':
                                    ''.join(start),
                                    'content':
                                    deal_content,
                                    'date':
                                    ''.join(date),
                                    'get_time':
                                    datetime.datetime.now().strftime(
                                        '%Y-%m-%d %H:%M:%S'),
                                    'url':
                                    res_url
                                }
                                for eachkey in commetents_info.keys():
                                    commetents_info[eachkey] = commetents_info[
                                        eachkey].replace('\n',
                                                         '').replace('\r', '')
                                # 存储数据
                                # 字段顺序：city, food, food_id, type, title, nick, start, content, date, get_time, url
                                save_info = '{0[city]}\u0001{0[food]}\u0001{0[food_id]}\u0001' \
                                            '{0[type]}\u0001{0[title]}\u0001{0[nick]}\u0001' \
                                            '{0[start]}\u0001{0[content]}\u0001{0[date]}\u0001' \
                                            '{0[get_time]}\u0001{0[url]}'.format(commetents_info)

                                self.pipe.pipe_txt_save(
                                    save_info,
                                    filename=setting.TEMP_RESTAURANT_COMMENTS,
                                    savetype='a')
                                comments_time.add(''.join(date))
                        # 当前页面没有新增评论也切换至下一店铺
                        if not len(comments_time):
                            break
                    except:
                        break
                # 每个店铺最新的评论时间
                if comments_time:
                    check_dict[res_id] = max(comments_time)
                    # 抓取到的评论数据
                self.pipe.pipe_pickle_save(
                    check_dict, filename=setting.FILE_COMMENTS_CHECK)
            except Exception as e:
                print(e)
                continue

    def _temp_city_info(self, cityname):
        """
        做22项数据处理时临时用
        :return:
        """
        citylist = self.pipe.pipe_txt_load(filename='city_list_total.txt')
        city_params = {
            '国别': '&',
            '省自治区全称': '&',
            '省自治区简称': '&',
            '市州全称': '&',
            '市州简称': '&',
            '区县全称': '&',
            '区县简称': '&',
            '地区编码': '&',
            '等级': '&'
        }
        spec_city = {
            '北京': '110000',
            '天津': '120000',
            '上海': '310000',
            '重庆': '500000'
        }
        for each in citylist:
            cityinfo = each.split('\u0001')
            if cityname in cityinfo:
                site = cityinfo.index(cityname)
                if site == 4 or site == 5:
                    city_params['国别'] = 'CN'
                    city_params['省自治区全称'] = cityinfo[0].strip()
                    city_params['省自治区简称'] = cityinfo[1].strip()
                    city_params['市州全称'] = cityinfo[2].strip()
                    city_params['市州简称'] = cityinfo[3].strip()
                    city_params['区县全称'] = cityinfo[4].strip()
                    city_params['区县简称'] = cityinfo[5].strip()
                    city_params['地区编码'] = cityinfo[-1].strip()
                    city_params['等级'] = '区县级'
                elif site == 2 or site == 3:
                    city_params['国别'] = 'CN'
                    city_params['省自治区全称'] = cityinfo[0].strip()
                    city_params['省自治区简称'] = cityinfo[1].strip()
                    city_params['市州全称'] = cityinfo[2].strip()
                    city_params['市州简称'] = cityinfo[3].strip()
                    city_params['地区编码'] = cityinfo[-1].strip()[:-2] + '00'
                    city_params['等级'] = '地市级'
                elif cityname in ['北京', '重庆', '上海', '天津']:
                    city_params['国别'] = 'CN'
                    city_params['省自治区全称'] = cityname + '市'
                    city_params['省自治区简称'] = cityname
                    city_params['市州全称'] = cityname + '市'
                    city_params['市州简称'] = cityname
                    city_params['地区编码'] = spec_city[cityname]
                    city_params['等级'] = '直辖'
                break

        return city_params

    @staticmethod
    def _engine_use_proxy():
        """
        使用代理ip
        :return: 代理ip
        """
        proxy_host = "proxy.abuyun.com"
        proxy_port = "9010"
        proxy_user = "******"
        proxy_pass = "******"
        proxy_meta = "http://%(user)s:%(pass)s@%(host)s:%(port)s" % {
            "host": proxy_host,
            "port": proxy_port,
            "user": proxy_user,
            "pass": proxy_pass
        }
        proxies = {"http": proxy_meta, "https": proxy_meta}

        return proxies

    # 集群操作
    @staticmethod
    def _engine_push_hdfs(filename):
        try:
            if os.path.exists('DATA/' + filename):
                # HDFS.put(当前文件，目标文件)
                HDFS.put('DATA/' + filename,
                         '/user/spider/everyday/{}'.format(filename))

        except Exception as e:
            print('集群挂了', e)

    def start_engine(self):
        # self._engine_city_link()
        while True:
            # self._engine_restaurant_link()
            # self._engine_restaurant_info()
            # self._engine_restaurant_comments()
            current_time = datetime.datetime.now().strftime('%Y-%m-%d')
            file_dict = {
                setting.FILE_RESTAURANT_LIST: setting.TEMP_RESTAURANT_LIST,
                setting.FILE_RESTAURANT_INFO: setting.TEMP_RESTAURANT_INFO,
                setting.FILE_RESTAURANT_COMMENTS:
                setting.TEMP_RESTAURANT_COMMENTS
            }
            for f, t in file_dict.items():
                if os.path.exists('DATA/{}'.format(f)):
                    temp = self.pipe.pipe_txt_load(filename=t)
                    newname = 'qunar{}({}).txt'.format(t[4:-4], current_time)
                    if temp:
                        self.pipe.pipe_txt_save(temp, filename=f, savetype='a')
                        os.rename('DATA/{}'.format(t),
                                  'DATA/{}'.format(newname))
                else:
                    os.rename('DATA/{}'.format(t), 'DATA/{}'.format(f))
                    newname = 'qunar{}({}).txt'.format(t[4:-4], current_time)
                    shutil.copy('DATA/{}'.format(f), 'DATA/{}'.format(newname))
                self._engine_push_hdfs(newname)