Example #1
    def __init__(self):
        requests_times = global_config.getRaw('config', 'requests_times')
        self.cookie = global_config.getRaw('config', 'Cookie')
        self.ua = global_config.getRaw('config', 'user-agent')

        # TODO: creating the fake user-agent engine here noticeably slows down startup,
        #  but generating one on demand is cheap, so fake request headers are shelved for now.
        # TODO: as a workaround, require that the configured user agent is not None.
        # self.ua_engine = Factory.create()
        if self.ua is None:
            logger.error('an empty user-agent is not supported yet')
            sys.exit()

        self.cookie_pool = global_config.getRaw('config', 'use_cookie_pool')
        self.cookie_pool = self.cookie_pool == 'True'
        if self.cookie_pool:
            logger.info('Using the cookie pool')
            if not os.path.exists('cookies.txt'):
                logger.error('cookies.txt does not exist')
                sys.exit()
        try:
            self.stop_times = self.parse_stop_time(requests_times)
        except Exception:
            logger.error('Failed to parse requests_times in the config file; check the input (ASCII punctuation only)')
            sys.exit()
        self.global_time = 0
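Every example on this page reads its settings through global_config.getRaw(section, key). The wrapper itself is not shown in these snippets, so the following is only a sketch of what such a helper might look like, assuming it is a thin layer over the standard configparser module that returns the raw string, or None when the key is missing (which would explain the None checks above).

# Assumed sketch of a getRaw-style config wrapper; the real project's
# implementation is not shown here and may differ.
import configparser


class Config:
    def __init__(self, config_file='config.ini'):
        self._parser = configparser.ConfigParser()
        # keep option names case-sensitive so keys like 'Cookie' and 'cookie'
        # stay distinct, since both spellings appear in these examples
        self._parser.optionxform = str
        self._parser.read(config_file, encoding='utf-8')

    def getRaw(self, section, key):
        # return the value exactly as written in the INI file,
        # or None when the section or key is absent
        return self._parser.get(section, key, raw=True, fallback=None)


global_config = Config('config.ini')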
Example #2
    def __init__(self):
        requests_times = global_config.getRaw('config', 'requests_times')
        self.cookie = global_config.getRaw('config', 'Cookie')
        self.ua = global_config.getRaw('config', 'user-agent')

        self.ua_engine = Factory.create()
        if self.ua is None:
            logger.error('an empty user-agent is not supported yet')
            sys.exit()

        self.cookie_pool = global_config.getRaw('config', 'use_cookie_pool')
        self.cookie_pool = self.cookie_pool == 'True'
        if self.cookie_pool:
            logger.info('Using the cookie pool')
            if not os.path.exists('cookies.txt'):
                logger.error('cookies.txt does not exist')
                sys.exit()

        self.ip_proxy = global_config.getRaw('proxy', 'use_proxy')
        self.ip_proxy = self.ip_proxy == 'True'
        if self.ip_proxy:
            self.proxy_pool = []

        try:
            self.stop_times = self.parse_stop_time(requests_times)
        except Exception:
            logger.error('Failed to parse requests_times in the config file; check the input (ASCII punctuation only)')
            sys.exit()
        self.global_time = 0
Example #3
 def get_proxy(self):
     """
     Fetch a proxy from the proxy pool.
     """
     try:
         repeat_nub = int(global_config.getRaw('proxy', 'repeat_nub'))
     except Exception:
         logger.warning('repeat_nub is malformed; it must be a positive integer')
         sys.exit()
     # HTTP extraction mode
     if global_config.getRaw('proxy', 'http_extract') == '1':
         # The proxy pool is empty, so fetch a fresh batch of proxies
         if len(self.proxy_pool) == 0:
             proxy_url = global_config.getRaw('proxy', 'http_link')
             r = requests.get(proxy_url)
             r_json = r.json()
             for proxy in r_json:
                 # Append each proxy repeat_nub times so it can be reused
                 for _ in range(repeat_nub):
                     self.proxy_pool.append([proxy['ip'], proxy['port']])
         # Take the next ip/port pair from the pool
         ip, port = self.proxy_pool.pop(0)
         proxies = self.http_proxy_utils(ip, port)
         return proxies
     # Key-based extraction mode
     elif global_config.getRaw('proxy', 'key_extract') == '1':
         pass
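Example #3 hands each ip/port pair to self.http_proxy_utils, which is not included in these snippets. A hypothetical version of that helper, assuming all it has to do is wrap the pair into the proxies mapping that requests accepts, might look like this:

 def http_proxy_utils(self, ip, port):
     # Hypothetical helper (not part of the shown source): build the proxies
     # dict that requests.get(..., proxies=...) expects from an ip/port pair.
     proxy_address = 'http://{}:{}'.format(ip, port)
     return {'http': proxy_address, 'https': proxy_address}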
Example #4
 def __init__(self):
     self.location_id = global_config.getRaw('detail', 'location_id')
     self.channel_id = global_config.getRaw('detail', 'channel_id')
     self.custom_search_url = global_config.getRaw('detail', 'search_url')
     self.need_detail = global_config.getRaw('detail', 'need_detail')
     self.need_comment = global_config.getRaw('detail', 'need_comment')
     self.requests_util = requests_util
     self.jump_wait = False
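Note that need_detail and need_comment are kept here as the raw strings returned by getRaw, unlike the == 'True' comparisons used in some of the later examples. If boolean flags are wanted instead, the same conversion applies (an illustrative tweak, not part of the original code):

     self.need_detail = global_config.getRaw('detail', 'need_detail') == 'True'
     self.need_comment = global_config.getRaw('detail', 'need_comment') == 'True'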
Example #5
    def __init__(self):
        requests_times = global_config.getRaw('config', 'requests_times')
        self.cookie = global_config.getRaw('config', 'cookie')
        self.ua = global_config.getRaw('config', 'user-agent')

        self.ua_engine = Factory.create()

        try:
            self.stop_times = self.parse_stop_time(requests_times)
        except Exception:
            logger.error('Failed to parse requests_times in the config file; check the input (ASCII punctuation only)')
            sys.exit()
        self.global_time = 0
Example #6
 def get_header(self, cookie):
     ua = global_config.getRaw('config', 'user-agent')
     if ua is None:
         ua_engine = Factory.create()
         ua = ua_engine.user_agent()
     header = {'User-Agent': ua, 'Cookie': cookie}
     return header
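A minimal way to exercise get_header, assuming the class above is instantiated as requests_util (as suggested by Example #4) and a cookie string is at hand; the target URL is only a placeholder:

import requests

header = requests_util.get_header(cookie='your_cookie_string')
response = requests.get('https://example.com', headers=header)  # placeholder URL
print(response.status_code)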
Example #7
                    type=int,
                    required=False,
                    default=0,
                    help='spider as custom(just review)')
parser.add_argument('--shop_id',
                    type=str,
                    required=False,
                    default='',
                    help='custom shop id')
args = parser.parse_args()
if __name__ == '__main__':
    # args.review = 1
    # args.normal = 0
    # args.shop_id = 'l8QDQukrl2tXhzmY'
    if args.normal == 1:
        keyword = global_config.getRaw('detail', 'keyword')
        need_first = global_config.getRaw('detail', 'need_first') == 'True'
        need_pages = int(global_config.getRaw('detail', 'need_pages'))

        s = Search()
        s.search(keyword, need_first, need_pages)
    if args.detail == 1:
        from function.detail import Detail

        shop_id = args.shop_id
        logger.info('Crawling details for shop id: ' + shop_id)
        d = Detail()
        d.get_detail(shop_id)
    if args.review == 1:
        from function.review import Review
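For reference, the flags above would typically be combined on the command line roughly as follows (the script name main.py is an assumption; only the flag names come from the snippet):

    python main.py --normal 1                  # keyword search driven by the config file
    python main.py --detail 1 --shop_id <id>   # detail page for a single shop
    python main.py --review 1 --shop_id <id>   # reviews for a single shop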
Example #8
 def update_cookie(self):
     self.cookie = global_config.getRaw('config', 'Cookie')
Example #9
    def __init__(self):
        # [config] section of the main config
        self.USE_COOKIE_POOL = global_config.getRaw('config', 'use_cookie_pool') == 'True'
        self.COOKIE = global_config.getRaw('config', 'Cookie')
        self.USER_AGENT = global_config.getRaw('config', 'user-agent')
        self.SAVE_MODE = global_config.getRaw('config', 'save_mode')
        self.MONGO_PATH = global_config.getRaw('config', 'mongo_path')
        self.REQUESTS_TIMES = global_config.getRaw('config', 'requests_times')
        self.UUID = global_config.getRaw('config', 'uuid')
        self.TCV = global_config.getRaw('config', 'tcv')

        # [detail] section of the main config
        self.KEYWORD = global_config.getRaw('detail', 'keyword')
        self.LOCATION_ID = global_config.getRaw('detail', 'location_id')
        self.CHANNEL_ID = global_config.getRaw('detail', 'channel_id')
        self.SEARCH_URL = global_config.getRaw('detail', 'search_url')
        assert self.SEARCH_URL == '' or self.SEARCH_URL.endswith('p'), 'search_url is missing the trailing p'
        self.NEED_FIRST = global_config.getRaw('detail', 'need_first') == 'True'
        try:
            self.NEED_SEARCH_PAGES = int(global_config.getRaw('detail', 'need_pages'))
        except Exception:
            logger.error('need_pages must be an integer')
            exit()

        # [proxy] section of the main config
        self.USE_PROXY = global_config.getRaw('proxy', 'use_proxy') == 'True'
        if self.USE_PROXY:
            try:
                self.REPEAT_NUMBER = int(global_config.getRaw('proxy', 'repeat_nub'))
            except Exception:
                logger.error('repeat_nub must be an integer')
                exit()
        else:
            self.REPEAT_NUMBER = 0
        self.HTTP_EXTRACT = global_config.getRaw('proxy', 'http_extract') == 'True'
        self.HTTP_LINK = global_config.getRaw('proxy', 'http_link')
        self.KEY_EXTRACT = global_config.getRaw('proxy', 'key_extract') == 'True'
        self.KEY_ID = global_config.getRaw('proxy', 'key_id')
        self.KEY_KEY = global_config.getRaw('proxy', 'key_key')
        assert not (self.HTTP_EXTRACT and self.KEY_EXTRACT), 'the proxy extraction modes cannot both be True'

        # [shop_phone] section of the require config
        self.NEED_DETAIL = require_config.getRaw('shop_phone', 'need') == 'True'
        self.NEED_PHONE_DETAIL = require_config.getRaw('shop_phone', 'need_detail') == 'True'
        if self.NEED_PHONE_DETAIL:
            logger.warning('Phone detail mode is enabled; it slows crawling and raises the chance of triggering anti-crawling measures')

        # [shop_review] section of the require config
        self.NEED_REVIEW = require_config.getRaw('shop_review', 'need') == 'True'
        self.NEED_REVIEW_DETAIL = require_config.getRaw('shop_review', 'more_detail') == 'True'
        if self.NEED_REVIEW_DETAIL:
            logger.warning('Review detail mode is enabled; it slows crawling and raises the chance of triggering anti-crawling measures')
            try:
                self.NEED_REVIEW_PAGES = int(require_config.getRaw('shop_review', 'need_pages'))
            except Exception:
                logger.error('need_pages must be an integer')
                exit()
        else:
            self.NEED_REVIEW_PAGES = 0
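Taken together, the getters above imply a config.ini shaped roughly like the sketch below. The section and key names come from the calls in these examples; every value is a placeholder, and formats such as requests_times (parsed by the unshown parse_stop_time) are assumptions. The shop_phone and shop_review keys live in a separate "require" config file and are omitted here.

    ; illustrative sketch only - key names taken from the examples, values are placeholders
    [config]
    use_cookie_pool = False
    Cookie = <your cookie string>
    user-agent = <browser user agent>
    save_mode = <output mode>
    mongo_path = mongodb://localhost:27017
    requests_times = <value understood by parse_stop_time>
    uuid = <uuid>
    tcv = <tcv>

    [detail]
    keyword = <search keyword>
    location_id = <city id>
    channel_id = <channel id>
    search_url =
    need_first = True
    need_pages = 5
    need_detail = True
    need_comment = True

    [proxy]
    use_proxy = False
    repeat_nub = 3
    http_extract = 1
    http_link = <provider API url>
    key_extract = 0
    key_id = <key id>
    key_key = <key secret>

    [save]
    review_pages = 10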
Example #10
        ┃   ┃   Divine beast, bless us:
        ┃   ┃   code without bugs!
        ┃   ┗━━━━━━━━━┓
        ┃CREATE BY SNIPER┣┓
        ┃             ┏┛
        ┗━┓ ┓ ┏━━━┳ ┓ ┏━┛
          ┃ ┫ ┫   ┃ ┫ ┫
          ┗━┻━┛   ┗━┻━┛

"""
import requests
from function.search import Search
from utils.config import global_config
from utils.get_font_map import get_review_map_file

cookie = global_config.getRaw('config', 'cookie')
ua = global_config.getRaw('config', 'user-agent')


def get_header():
    """
    Build the request headers from the configured cookie and user-agent.
    :return: dict with User-Agent and Cookie
    """
    header = {'User-Agent': ua, 'Cookie': cookie}
    return header


if __name__ == '__main__':
    # debug search
    Search().search('一方', only_need_first=False, needed_pages=10)
Example #11
 def __init__(self):
     self.requests_util = requests_util
     self.pages_needed = global_config.getRaw('save', 'review_pages')
Example #12
 def __init__(self):
     self.cookie = global_config.getRaw('config', 'cookie')
     self.ua = global_config.getRaw('config', 'user-agent')
     self.location_id = global_config.getRaw('config', 'location_id')
     self.ua_engine = Factory.create()
     self.saver = Saver()