Example #1
    def __init__(self, brand_id, extra_cond=None, max_images=15):
        print str.format(
            'Publishing (brand_id={0}, max_images={1}, extra_cond="{2}")...',
            brand_id, max_images, extra_cond)
        # Maximum number of images to publish for a single product
        self.max_images = max_images
        self.brand_id = brand_id
        if not extra_cond:
            extra_cond = ['1']
        elif not iterable(extra_cond):
            extra_cond = [extra_cond]
        self.extra_cond = extra_cond
        self.tot = 0
        self.progress = 0
        # Display order of the regions
        self.region_order = {
            k: v['weight']
            for k, v in info.region_info().items()
        }

        self.products_tbl = 'products'
        self.prod_mt_tbl = 'products_mfashion_tags'
        self.mt_tbl = 'mfashion_tags'
        self.prod_ot_tbl = 'products_original_tags'
        self.ot_tbl = 'original_tags'
        self.price_hist = 'products_price_history'
Example #2
    def __init__(self, name, region, *a, **kw):
        self.name = str.format('{0}-{1}', name, '-'.join(region) if region else 'all')
        super(MFashionSpider, self).__init__(*a, **kw)

        # Restrict the requested regions to those the spider actually supports
        if not region:
            self.region_list = self.get_supported_regions()
        else:
            self.region_list = list((set(region) if iterable(region) else
                                     {region}).intersection(set(self.get_supported_regions())))
Example #3
    def start_requests(self):
        for region in self.region_list:
            metadata = {'region': region, 'brand_id': getattr(self, 'spider_data')['brand_id'],
                        'tags_mapping': {}, 'category': []}

            tmp = getattr(self, 'spider_data')['home_urls'][region]
            start_urls = tmp if iterable(tmp) else [tmp]
            for url in start_urls:
                # Each request gets its own copy of the shared metadata
                m = copy.deepcopy(metadata)
                yield Request(url=url, meta={'userdata': m}, callback=self.parse, errback=self.onerr)
Example #4
    def query_match(self,
                    selects,
                    table,
                    matches=None,
                    extra=None,
                    tail_str=None,
                    use_result=False,
                    distinct=False):
        """
        查询:相当于SELECT ... FROM ... WHERE col=val
        :param selects: 需要select的字段
        :param table: 查询的表名称
        :param matches: dict类型,查询条件
        :param extra: 其它的查询条件
        :param tail_str: 添加在查询语句末尾的字符串
        :param use_result:
        :return:
        """
        if not extra:
            extra = ['1']
        elif not iterable(extra):
            extra = [extra]

        if not iterable(selects):
            selects = [selects]

        def func(arg):
            k, v = arg
            return unicode.format(u'{0}="{1}"', k,
                                  self.sql_escape(v)) if v else unicode.format(
                                      u'{0} IS NULL', k)

        match_str = ' AND '.join(map(func,
                                     matches.items())) if matches else '1'
        extra_cond = ' AND '.join(extra)
        statement = unicode.format(
            u'SELECT {5} {0} FROM {1} WHERE {2} AND {3} {4}',
            ', '.join(selects), table, match_str, extra_cond,
            tail_str if tail_str else '', 'DISTINCT' if distinct else '')
        self.db.query(statement.encode('utf-8'))
        return self.db.use_result() if use_result else self.db.store_result()
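
A minimal usage sketch for query_match, assuming db is an instance of the wrapper class above; the table and column names here are hypothetical.

    # Roughly equivalent to:
    # SELECT DISTINCT brand_id, region FROM products
    #   WHERE region="cn" AND price IS NOT NULL ORDER BY brand_id
    rs = db.query_match(['brand_id', 'region'],
                        'products',
                        matches={'region': 'cn'},
                        extra='price IS NOT NULL',
                        tail_str='ORDER BY brand_id',
                        distinct=True)
    for row in rs.fetch_row(maxrows=0):
        print row
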
Example #5
 def __init__(self,
              src_spec=getattr(glob, 'DATABASE')['DB_SPEC'],
              dst_spec=getattr(glob, 'DATABASE')['DB_SPEC'],
              cond=None):
     self.progress = 0
     self.tot = 1
     # Normalize the condition into a list; '1' is the always-true default
     if cond:
         if iterable(cond):
             self.cond = cond
         else:
             self.cond = [cond]
     else:
         self.cond = ['1']
     self.src_spec = src_spec
     self.dst_spec = dst_spec
Example #6
    def __init__(self, region):
        # The Chinese store has its own domain; the other regions follow the store-{region} pattern
        if iterable(region):
            HogoBossSpider.spider_data['home_urls'] = {
                reg: str.format('http://store-{0}.hugoboss.com', reg)
                if reg != 'cn' else 'http://store.hugoboss.cn'
                for reg in region
            }
        else:
            k = region
            HogoBossSpider.spider_data['home_urls'] = {
                k: str.format('http://store-{0}.hugoboss.com', k)
                if k != 'cn' else 'http://store.hugoboss.cn'
            }

        super(HogoBossSpider, self).__init__('Hugo Boss', region)
Example #7
 def start_requests(self):
     for region in self.region_list:
         metadata = {
             'region': region,
             'brand_id': self.spider_data['brand_id'],
             'tags_mapping': {}
         }
         tmp = self.spider_data['home_urls']['common']
         # Cookie selecting the region (upper-cased) and its currency on the storefront
         cookie = {
             'DKI_FiftyOneInternationalCookie':
             str.format('{0}-{1}', region.upper(),
                        self.spider_data['curreny'][region])
         }
         start_urls = tmp if iterable(tmp) else [tmp]
         for url in start_urls:
             m = copy.deepcopy(metadata)
             yield Request(url=url,
                           meta={'userdata': m},
                           callback=self.parse,
                           errback=self.onerr,
                           cookies=cookie,
                           dont_filter=True)
Example #8
    def run(self):
        last_update = self.last_update
        extra_cond = self.extra_cond

        if not extra_cond:
            extra_cond = []
        elif not iterable(extra_cond):
            extra_cond = [extra_cond]
        if last_update:
            extra_cond.append(
                unicode.format(u'update_time > "{0}"', last_update))
        extra_cond.append('mapping_list IS NOT NULL')

        # Cache of MFashion tags already looked up (tag -> idmfashion_tags)
        cached_mfashion = {}

        # How tag updates work: original_tags stores the raw tags, and the update_time column tells
        # which tags were updated recently. Because the tag system is somewhat contagious, all tags
        # under the corresponding brand/region have to be redone.
        rs = self.db.query_match(['brand_id', 'region'],
                                 self.original_tags_tbl, {},
                                 extra=extra_cond,
                                 distinct=True)
        # Tags that need to be processed
        tag_dict = {}
        for i in xrange(rs.num_rows()):
            brand_id, region = rs.fetch_row()[0]
            for val in self.db.query_match(
                ['idmappings', 'mapping_list'],
                    self.original_tags_tbl, {
                        'brand_id': brand_id,
                        'region': region
                    },
                    extra='mapping_list IS NOT NULL').fetch_row(maxrows=0):
                tag_dict[val[0]] = json.loads(val[1].replace("'", '"'))

            # Remove the old product/tag relations
            self.db.execute(
                str.format(
                    'DELETE FROM p2 USING {0} AS p1, {1} AS p2 WHERE p1.idproducts=p2.idproducts '
                    'AND p1.brand_id={2} AND region="{3}"', self.products,
                    self.prod_mt_tbl, brand_id, region))

        self.tot = len(tag_dict)
        self.progress = 0
        for tid, rule in tag_dict.items():
            self.progress += 1
            self.db.start_transaction()
            try:
                # All products related to this original tag
                pid_list = [
                    int(val[0]) for val in self.db.query_match(
                        ['idproducts'], self.prod_tag_tbl, {
                            'id_original_tags': tid
                        }).fetch_row(maxrows=0)
                ]

                # Add the MFashion tags
                for tag in rule:
                    if tag not in cached_mfashion:
                        self.db.insert({'tag': tag},
                                       self.mfashion_tags_tbl,
                                       ignore=True)
                        mf_tid = int(
                            self.db.query_match(['idmfashion_tags'],
                                                self.mfashion_tags_tbl, {
                                                    'tag': tag
                                                }).fetch_row()[0][0])
                        cached_mfashion[tag] = mf_tid

                    self.db.insert([{
                        'idproducts': pid,
                        'id_mfashion_tags': cached_mfashion[tag]
                    } for pid in pid_list],
                                   self.prod_mt_tbl,
                                   ignore=True)

                self.db.commit()
            except ValueError:
                self.db.rollback()
            except:
                self.db.rollback()
                raise
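
A small sketch of how a mapping_list row is assumed to be turned into a tag_dict entry in run() above: the column appears to store a JSON-like list written with single quotes, hence the quote replacement before json.loads. The row contents below are made up.

    import json

    # Hypothetical (idmappings, mapping_list) row
    val = ('123', "['shoes', 'sneakers']")
    tag_dict = {val[0]: json.loads(val[1].replace("'", '"'))}
    print tag_dict
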
Example #9
def set_up_spider(spider_class, data, spider_type='default'):
    """
    设置爬虫对象
    @param spider_type: 爬虫类型,是update还是monitor,还是普通爬虫
    @param spider_class:
    @param data: 爬虫的配置参数
    @return:
    """

    crawler = Crawler(Settings())
    crawler.settings.values['BOT_NAME'] = 'mstore_bot'

    if spider_type == 'update':
        crawler.settings.values['ITEM_PIPELINES'] = {'scrapper.pipelines.UpdatePipeline': 800}
        brand_list = [int(tmp) for tmp in (data['brand'] if 'brand' in data else [])]
        if 'region' in data:
            region_list = data['region']
        elif 'r' in data:
            region_list = data['r']
        else:
            region_list = None
        spider = spider_class(brand_list, region_list, getattr(glob, 'DATABASE')['DB_SPEC'])
        welcome_msg = str.format('Updating started, processing the following brands: {0}',
                                 ', '.join(str(tmp) for tmp in brand_list))

        # TODO An 'update' spider can be given multiple brands, which makes the user agent hard to handle.
        # This is a weak design; consider changing it so that each UpdateSpider maps to exactly one brand.
        major_brand = brand_list[0]
    elif spider_type == 'monitor':
        crawler.settings.values['ITEM_PIPELINES'] = {'scrapper.pipelines.MonitorPipeline': 800}
        brand = int(data['brand'][0])
        region = data['region'][0]
        idmonitor = int(data['idmonitor'][0])
        parameter = {'brand_id': brand, 'region': region}
        spider = spider_class(idmonitor, parameter, getattr(glob, 'DATABASE')['DB_SPEC'])
        welcome_msg = str.format('STARTING MONITORING, idmonitory={0}, brand={1}, region={2}', idmonitor, brand,
                                 region)
        major_brand = brand
    else:
        crawler.settings.values['ITEM_PIPELINES'] = {'scrapper.pipelines.ProductImagePipeline': 800,
                                                     'scrapper.pipelines.ProductPipeline': 300} \
            if getattr(glob, 'DATABASE')['WRITE_DATABASE'] else {}
        if 'job' in data:
            job_path = get_job_path(spider_class.spider_data['brand_id']) + '-1'
            if 'rst-job' in data:
                shutil.rmtree(job_path, ignore_errors=True)
            crawler.settings.values['JOBDIR'] = job_path

        # Telnet support
        # crawler.settings.values['TELNETCONSOLE_HOST'] = '127.0.0.1'
        # if 'telnet' in data and data['telnet']:
        #     start_port = int(data['telnet'][0])
        # else:
        #     start_port = spider_class.spider_data['brand_id']
        # crawler.settings.values['TELNETCONSOLE_PORT'] = [start_port, start_port + 8]

        # Image storage
        crawler.settings.values['IMAGES_STORE'] = get_images_store(spider_class.spider_data['brand_id'])
        # crawler.settings.values['IMAGES_THUMBS'] = {'small': (480, 480), 'medium': (1200, 1200)}
        crawler.settings.values['IMAGES_MIN_HEIGHT'] = 128
        crawler.settings.values['IMAGES_MIN_WIDTH'] = 128

        # Determine the regions to crawl
        region_list = data['r']
        if not region_list:
            region_list = spider_class.get_supported_regions()
        elif not iterable(region_list):
            region_list = [region_list]

        region_list = filter(lambda val: info.region_info()[val]['status'], region_list)

        if 'exclude-region' in data:
            for r in data['exclude-region']:
                if r in region_list:
                    region_list.pop(region_list.index(r))

        spider = spider_class(region_list)
        welcome_msg = str.format('Spider started, processing the following regions: {0}', ', '.join(region_list))

        major_brand = spider_class.spider_data['brand_id']

    crawler.settings.values['AUTOTHROTTLE_ENABLED'] = False

    # Set the spider's user agent
    # Priority:
    # 1. the --user-agent command-line option
    # 2. the configuration file; see the global_settings['USER_AGENT'] entry
    # 3. spider_data['user_agent'] in spider_class
    # 4. default: chrome
    # TODO item 2 above is not implemented yet
    ua_map = {
        'chrome': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/30.0.1599.69 Safari/537.36',
        'iphone': 'Mozilla/5.0 (iPhone; U; CPU iPhone OS 4_3_2 like Mac OS X; en-us) AppleWebKit/533.17.9 (KHTML, like Gecko) Version/5.0.2 Mobile/8H7 Safari/6533.18.5',
        'ipad': 'Mozilla/5.0 (iPad; U; CPU OS 3_2 like Mac OS X; en-us) AppleWebKit/531.21.10 (KHTML, like Gecko) Version/4.0.4 Mobile/7B334b Safari/531.21.10'}

    if 'user-agent' in data:
        ua = data['user-agent'][0]
    else:
        spider_spec = info.spider_info()[major_brand]['spider_class']
        if 'user_agent' in spider_spec.spider_data:
            ua = spider_spec.spider_data['user_agent']
        else:
            ua = 'chrome'
    crawler.settings.values['USER_AGENT'] = ua_map[ua.lower()] if ua.lower() in ua_map else ua

    # Configure the spider's proxy settings
    crawler.settings.values['DOWNLOADER_MIDDLEWARES'] = {
        'scrapy.contrib.downloadermiddleware.httpproxy.HttpProxyMiddleware': 1}
    crawler.settings.values['PROXY_ENABLED'] = 'proxy' in data

    # TODO deal with cookies
    # cookie_flag = getattr(glob, 'COOKIES_ENABLED', False)
    # try:
    #     cookie_flag = (data['cookie'][0].lower() == 'true')
    # except (IndexError, KeyError):
    #     pass
    # crawler.settings.values['COOKIES_ENABLED'] = cookie_flag
    #
    # try:
    #     crawler.settings.values['COOKIES_DEBUG'] = getattr(glob, 'DEBUG')['COOKIES_DEBUG']
    # except (AttributeError, KeyError):
    #     crawler.settings.values['COOKIES_DEBUG'] = False

    crawler.signals.connect(reactor.stop, signal=signals.spider_closed)
    crawler.configure()

    spider.log(welcome_msg, log.INFO)
    crawler.crawl(spider)
    crawler.start()

    return spider
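
A hedged sketch of how set_up_spider might be driven for a regular crawl; the data dict, the spider class, and the reactor.run() call are assumptions about the surrounding launcher script, not part of the function above.

    from twisted.internet import reactor

    # HogoBossSpider is the spider class from Example #6; 'r' and 'user-agent'
    # mirror the keys read inside set_up_spider.
    data = {'r': ['cn', 'us'], 'user-agent': ['chrome']}
    spider = set_up_spider(HogoBossSpider, data)   # default spider_type
    reactor.run()   # returns once the spider_closed signal triggers reactor.stop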