def parse_intr(self, tree, xpath):
    # Parse the product introduce list: "title:value" items become key/value
    # pairs; a bare "title:" is remembered and paired with the next text node.
    dom = tree.xpath(xpath)
    introduce = {}
    temporary = ''
    for item in dom:
        item = item.strip()
        if item == '':
            continue
        elif item.find(':') > 0:
            item = item.split(':', 1)
            if item[1] == '':
                temporary = extract_title(item[0])
            else:
                introduce[extract_title(item[0])] = extract_text(item[1])
        else:
            if temporary != '':
                introduce[temporary] = extract_text(item)
                temporary = ''
            else:
                continue
    return introduce
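# extract_title and extract_text are helpers defined elsewhere in this repo; the
# parsers above only assume they normalize a label and a value string. A minimal
# sketch of that assumed contract (hypothetical stand-ins, not the real helpers):
def _extract_title_sketch(raw):
    # assumption: trim whitespace and any trailing ASCII or full-width colon
    return raw.strip().rstrip(u':').rstrip(u'\uff1a')

def _extract_text_sketch(raw):
    # assumption: return the value with surrounding whitespace removed
    return raw.strip()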
def crawl(self):
    # wareId = '1229271'
    # wareId = '1391817787'
    # priorcategory = ["家居家装","清洁用品","衣物清洁"]
    # presentcategory = ['1','2','3']
    # ids = uuid.uuid1()
    wareId = self.key
    ids = self.data.get('uuid')
    category_data = extract_category(self)

    # Fetch the specifications (guige) page and parse the Ptable, whose text
    # cells alternate between a title and its value.
    url = 'http://m.360buy.com/product/guige/%s.html' % str(wareId)
    html_stream = ProcessData.get_web_data(url)
    if html_stream == {}:
        return
    tree = etree.HTML(html_stream.text)
    xpath = "//table[@class='Ptable']/tr/td/text()"
    dom = tree.xpath(xpath)
    specifications = {}
    temporary = ''
    i = 0
    for item in dom:
        item = item.strip()
        if item == '':
            continue
        if i % 2 == 0:
            temporary = extract_title(item)
        else:
            specifications[temporary] = extract_text(item)
        i += 1
    # data = {'ecnorms': specifications}
    # specifications = json.dumps(specifications, ensure_ascii=False)

    introduce = IntroduceCrawler.crawl(wareId, ids)
    ecbrands = introduce[u'品牌'] if introduce.get(u'品牌') else ''
    # ecnames = introduce[u'商品名称'].replace('\'',' ') if introduce.get(u'商品名称') else ''
    ecnames = introduce[u'商品名称'] if introduce.get(u'商品名称') else ''

    crawl_data = {
        'id': ids,
        'source': self.data.get('source'),
        'source_id': wareId,
        'summary': specifications,
        'introduce': introduce,
        'name': ecnames,
        'brand': ecbrands
    }
    crawl_data.update(category_data)
    model = EcDetailModel(crawl_data)
    export(model)
def crawl(wareId, ids):
    import sys
    reload(sys)
    sys.setdefaultencoding("utf-8")

    # Fetch the item page and parse the detail list into a dict of
    # title -> text, pairing bare "title:" entries with the following node.
    url = 'http://item.jd.com/%s.html' % str(wareId)
    html_stream = ProcessData.get_web_data(url)
    if html_stream == {}:
        return {}
    html_stream.encoding = 'gb2312'
    tree = etree.HTML(html_stream.text)
    xpath = "//div[@id='product-detail-1']/ul[@class='detail-list']/li//text()"
    dom = tree.xpath(xpath)
    introduce = {}
    temporary = ''
    for item in dom:
        item = item.strip()
        if item == '':
            continue
        elif item.find(':') > 0:
            item = item.split(':', 1)
            if item[1] == '':
                temporary = extract_title(item[0])
            else:
                introduce[extract_title(item[0])] = extract_text(item[1])
        else:
            if temporary != '':
                introduce[temporary] = extract_text(item)
                temporary = ''
            else:
                continue
    return introduce
def parse_summary(self, tree, xpath):
    # Parse alternating title/value text nodes into a specifications dict.
    dom = tree.xpath(xpath)
    specifications = {}
    temporary = ''
    i = 0
    for item in dom:
        item = item.strip()
        if item == '':
            continue
        if i % 2 == 0:
            temporary = extract_title(item)
        else:
            specifications[temporary] = extract_text(item)
        i += 1
    return specifications
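# A minimal, self-contained illustration of the alternating title/value parsing
# used above, assuming an lxml tree built from a JD-style Ptable fragment. The
# HTML below is made up for the example; the real pages are fetched in crawl().
if __name__ == '__main__':
    from lxml import etree
    _html = ("<table class='Ptable'><tr>"
             "<td>Brand</td><td>Acme</td>"
             "<td>Model</td><td>X-100</td>"
             "</tr></table>")
    _tree = etree.HTML(_html)
    _cells = _tree.xpath("//table[@class='Ptable']/tr/td/text()")
    _specs = {}
    _title = ''
    for _n, _cell in enumerate(c.strip() for c in _cells if c.strip()):
        if _n % 2 == 0:
            _title = _cell
        else:
            _specs[_title] = _cell
    print _specs  # each title cell is paired with the following value cell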
class ThirdCrawler(Crawler):

    type = "ecommerce.jd.thirdlvl"

    def crawl(self):
        fid = self.key
        categorys = self.data['priorcategory']
        # fid = '1625'
        # categorys = ["家居家装","清洁用品"]
        start_urls = "http://gw.m.360buy.com/client.action?functionId=catelogy&body="
        third_urls = {
            'catelogyId': str(fid),
            'isDescription': 'false',
            'isIcon': 'false',
            'level': '2'
        }
        url = start_urls + quote(str(third_urls))
        try:
            jsons = ProcessData.get_json_data(url)
            lists = jsons['catelogyList']
        except Exception, e:
            self.logger.error(url)
            self.logger.error(e)
            return
        if lists == []:
            return {}
        # Schedule a list crawl for each third-level category.
        for i in range(len(lists)):
            cid = lists[i]['cid']
            # presentcategory = []
            priorcategory = []
            priorcategory.extend(categorys)
            priorcategory.append(extract_title(lists[i]['name']))
            data = {
                'priorcategory': priorcategory,
                # 'presentcategory': presentcategory
            }
            Scheduler.schedule(ListCrawler.type, key=cid, data=data)
class FirstCrawler(Crawler):

    type = "ecommerce.jd.firstlvl"

    @staticmethod
    def init(conf=None):
        pass
        # Scheduler.schedule(FirstCrawler.type, interval=86400)

    def crawl(self):
        # Fetch the top-level category list and schedule a second-level
        # crawl for each entry.
        start_urls = "http://gw.m.360buy.com/client.action?functionId=catelogy&body="
        second_urls = {
            'catelogyId': '0',
            'isDescription': 'true',
            'isIcon': 'true',
            'level': '0'
        }
        url = start_urls + quote(str(second_urls))
        try:
            jsons = ProcessData.get_json_data(url)
            lists = jsons['catelogyList']
        except Exception, e:
            self.logger.error(url)
            self.logger.error(e)
            return
        for i in range(len(lists)):
            cid = lists[i]['cid']
            priorcategory = []
            # presentcategory = []
            priorcategory.append(extract_title(lists[i]['name']))
            data = {
                'priorcategory': priorcategory,
                # 'presentcategory': presentcategory
            }
            Scheduler.schedule(SecondCrawler.type, key=cid, data=data)
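# The gateway URL above is assembled by percent-encoding the repr of a Python dict
# into the body query parameter. A minimal sketch of that assembly, assuming
# urllib.quote (Python 2) as used by the crawlers in this module:
if __name__ == '__main__':
    from urllib import quote
    body = {
        'catelogyId': '0',
        'isDescription': 'true',
        'isIcon': 'true',
        'level': '0'
    }
    url = ("http://gw.m.360buy.com/client.action?functionId=catelogy&body="
           + quote(str(body)))
    print url  # the body value is the percent-encoded dict repr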
def crawl(self):
    # fid = '1620'
    # categorys = ["家居家装"]
    fid = self.key
    categorys = self.data['priorcategory']
    start_urls = "http://gw.m.360buy.com/client.action?functionId=catelogy&body="
    second_urls = {
        'catelogyId': str(fid),
        'isDescription': 'true',
        'isIcon': 'true',
        'level': '1'
    }
    url = start_urls + quote(str(second_urls))
    # print 'url ', url
    try:
        jsons = ProcessData.get_json_data(url)
        lists = jsons['catelogyList']
    except Exception, e:
        self.logger.error(url)
        self.logger.error(e)
        return
    if lists == []:
        return {}
    # Schedule a third-level crawl for each second-level category.
    for i in range(len(lists)):
        cid = lists[i]['cid']
        # presentcategory = []
        priorcategory = []
        priorcategory.extend(categorys)
        priorcategory.append(extract_title(lists[i]['name']))
        data = {
            'priorcategory': priorcategory,
            # 'presentcategory': presentcategory
        }
        Scheduler.schedule(ThirdCrawler.type, key=cid, data=data)