示例#1
0
def process_company_item(item, spider):
    """Normalize a scraped company document in place.

    Removes crawl-bookkeeping URLs from ``item['doc']``, computes derived
    fields (owner type, site-URL hash), and fills in default values for
    every field the spider may not have extracted.  Keys already present
    in ``doc`` are never overwritten (``setdefault``).

    :param item: scraped item carrying the company document under ``'doc'``
    :param spider: the spider instance; supplies ``shop_site_type``
    """
    now = datetime.datetime.utcnow()
    doc = item['doc']
    # Remove bookkeeping URLs that must not be persisted with the document.
    doc.pop('detail_url', None)
    doc.pop('about_url', None)
    doc.pop('contact_url', None)
    # Derived fields computed from the extracted data.
    calc_doc = {
        'shop_owner_type': calc_shop_owner_type(doc['shop_name']),
        'shop_site_url_hash': str(fnvhash.fnv_64a_str(doc['shop_site_url'])),
    }
    # Defaults for fields the extractor is expected to produce.
    default_doc1 = {
        'crawl_time': now,
        'shop_site_type': spider.shop_site_type,
        'shop_name': None,
        'shop_site_url': None,
        'shop_products': None,
        'shop_launch_time': None,
        'shop_address': None,
        'shop_contacts': None,
        'shop_phone': None,
        'shop_cellphone': None,
        'shop_fax': None,
        'shop_email': None,
        'shop_qq': None,
    }
    # Pure default fill values (no extraction source).
    default_doc2 = {
        'shop_type_id': None,
        'shop_area_id': None,
        'shop_certified': 1,
        'city_id': 1,
        'is_bad_url': 0,
        'is_bad_time': None,
        'deleted': 0,
        'isRead': 0,
        'isImport': 0,
    }
    # .items() instead of the Python-2-only .iteritems(): identical behavior
    # here, and the function keeps working under Python 3.
    all_doc = chain(calc_doc.items(), default_doc1.items(), default_doc2.items())
    for k, v in all_doc:
        doc.setdefault(k, v)
示例#2
0
    def parse_list_page(self, response):
        """Parse one search-result list page and yield an item per shop.

        For each result row: extract the shop name, address and product
        description text, normalize the "creditdetail" URL (tracking query
        parameter stripped), derive the site URL, owner type and URL hash,
        then either follow the detail page (when ``self.visit_detail`` is
        set) or yield the assembled ``LegItem`` directly.

        :param response: the list-page response to parse
        :yields: ``Request`` objects for detail pages, or ``LegItem``
        """
        multi_xpath = '//*[@id="sw_mod_searchlist"]/ul/li'
        html5_response = response_html5parse(response)
        page_hxs = HtmlXPathSelector(html5_response)
        multi_hxs = page_hxs.select(multi_xpath)
        for hxs in multi_hxs:
            # --- extraction -----------------------------------------------
            # shop_products aggregates all shop-related description text:
            # main products plus the brief summary.
            shop_name = ''.join(hxs.select('.//a[@class="sw-ui-font-title14"]//text()').extract())
            shop_address = ''.join(hxs.select('.//div[@class="sm-offerResult-address"]/a/text()').extract())
            # Partial main-products text only; the full list lives in the
            # title of the main-products link on the detail page.
            shop_part_products = ''.join(hxs.select('.//div[@class="sm-offerResult-sale"]//text()').extract())
            shop_brief = ''.join(hxs.select('.//div[@class="sm-offerResult-sub"]//text()').extract())
            creditdetail_url = ''.join(hxs.select('.//a[@class="sw-ui-font-title14"]/@href').extract())
            creditdetail_url = urllib.unquote(creditdetail_url).strip()
            # --- derived values -------------------------------------------
            shop_products = shop_brief + shop_part_products
            # Drop the 'tracelog' tracking parameter so equivalent detail
            # URLs compare (and hash) equal.
            creditdetail_url_query = get_url_query(creditdetail_url)
            creditdetail_url_query.pop('tracelog', None)
            creditdetail_url = change_url_query(creditdetail_url, creditdetail_url_query)
            shop_site_url = get_site_url(creditdetail_url)
            shop_owner_type = calc_shop_owner_type(shop_name)
            # NOTE(review): stored raw here, but process_company_item wraps
            # the same hash in str() — confirm which representation the
            # collection expects.
            shop_site_url_hash = fnvhash.fnv_64a_str(shop_site_url)
            now = datetime.datetime.utcnow()

            doc = {
                # Extracted / computed fields.
                'shop_name': shop_name,
                'shop_address': shop_address,
                'shop_products': shop_products,
                'shop_site_url': shop_site_url,
                'shop_site_url_hash': shop_site_url_hash,
                # BUG FIX: key was misspelled 'show_owner_type'; the rest of
                # this file reads/writes 'shop_owner_type'.
                'shop_owner_type': shop_owner_type,
                'crawl_time': now,
                # Fields with no source data on the list page.
                'shop_qq': None,
                'shop_email': None,
                # Default fill values.
                'shop_type_id': None,
                'shop_area_id': None,
                'shop_site_type': self.shop_site_type,
                'shop_certified': 1,
                'city_id': 1,
                'is_bad_url': 0,
                'is_bad_time': None,
                'deleted': 0,
                'isRead': 0,
                'isImport': 0,
            }

            detail_url = creditdetail_url
            list_url = response.url
            query = response.meta.get('query')
            item = LegItem(collection=self.collection, doc=doc,
                              detail_url=detail_url, list_url=list_url, query=query)
            if detail_url and self.visit_detail:
                # Chain into the detail page; the item rides along in meta
                # so the detail parser can enrich it before yielding.
                detail_request = Request(detail_url, callback=self.parse_detail_page)
                detail_request.meta['item'] = item
                detail_request.meta['query'] = query
                yield detail_request
            else:
                yield item