Python TopItem示例，spider_tophub.items.TopItem Python示例

示例#1

0

显示文件

文件： catchen.py 项目： w898275706/tophub

    def parse(self, response):
        """Yield one ``TopItem`` per ``div.date-outer`` node in the response.

        Title and link are read from the anchor inside the ``post-title``
        heading, the description from the first ``post-labels`` link, and the
        publication time from the ``title`` attribute of ``abbr.published``.
        """
        now = datetime.now()

        for post in response.xpath('//div[@class="date-outer"]'):
            headline = post.xpath(
                './/h3[contains(@class,"post-title")]/a/text()').get()
            link = post.xpath(
                './/h3[contains(@class,"post-title")]/a/@href').get()
            # Sample raw value: "2019-11-15T21:04:00-08:00".  The last six
            # characters are cut off before parsing, so the result is a
            # naive datetime.
            raw_stamp = post.xpath('.//abbr[@class="published"]/@title').get()
            published = datetime.strptime(raw_stamp[:-6], "%Y-%m-%dT%H:%M:%S")
            label = post.xpath(
                './/span[@class="post-labels"]/a/text()').get()
            yield TopItem(node_id=68,
                          title=headline,
                          url=link,
                          publish_at=published,
                          description_content=label,
                          create_at=now,
                          update_at=now,
                          position=None)

示例#2

0

显示文件

文件： qdaily_all.py 项目： w898275706/tophub

    def parse(self, response):
        """Yield a ``TopItem`` for each titled article block in the page content.

        Blocks whose ``h3`` heading yields no text are skipped.  The article
        query is evaluated once and reused for both the diagnostic count and
        the iteration (the original ran the identical XPath twice).
        """
        now = datetime.now()
        articles = response.xpath(
            '//div[@class="page-content"]/div/div[contains(@class,"article")]'
        )
        # Diagnostic output: the label means "length" (number of matches).
        print("长度", len(articles))

        for article in articles:
            title = article.xpath('./a/div[2]/h3').xpath('string(.)').get()
            if not title:
                continue

            url = 'https://www.qdaily.com' + article.xpath('./a/@href').get()
            # Sample raw value: "2019-12-02 14:27:56 +0800".  The offset is
            # matched as literal text, so the result is a naive datetime.
            publish_at = datetime.strptime(
                article.xpath(
                    './/span[@class="smart-date"]/@data-origindate').get(),
                '%Y-%m-%d %H:%M:%S +0800')

            yield TopItem(publish_at=publish_at,
                          title=title,
                          url=url,
                          node_id=72,
                          description_content=None,
                          create_at=now,
                          update_at=now,
                          position=None)

示例#3

0

显示文件

 def parse(self, response):
     """Yield a ranked ``TopItem`` for every ``td.keyword`` cell.

     ``position`` is the 1-based index of the cell in document order and
     the description is the formatted integer read from a sibling cell.
     """
     now = datetime.now()
     cells = response.xpath('//td[@class="keyword"]')
     for rank, cell in enumerate(cells, start=1):
         headline = cell.xpath('a/text()').get()
         link = cell.xpath('a/@href').get()
         # The number lives in the span of the second following <td>.
         raw_count = cell.xpath(
             './following-sibling::td[2]/span/text()').get()
         summary = int_to_str(int(raw_count))
         yield TopItem(node_id=13,
                       title=headline,
                       url=link,
                       publish_at=now,
                       description_content=summary,
                       create_at=now,
                       update_at=now,
                       position=rank)

示例#4

0

显示文件

    def parse(self, response):
        """Yield a ``TopItem`` for each complete card in the packery container.

        A card is used only when it has an image ``alt`` text, a link and a
        ``data-origindate`` value.  ``.get()`` returns ``None`` when a query
        matches nothing; previously that raised ``TypeError`` inside the
        generator and dropped every remaining card of the response.  Cards
        missing any of the three values are now skipped instead.
        """
        now = datetime.now()
        for card in response.xpath(
                '//div[contains(@class,"packery-container")]/div'):
            alt_text = card.xpath('.//img/@alt').get()
            href = card.xpath('./a/@href').get()
            raw_date = card.xpath(
                './/span[@class="smart-date"]/@data-origindate').get()
            if alt_text is None or href is None or raw_date is None:
                # Not a complete article card; nothing usable to emit.
                continue

            # The first six characters of the alt text are discarded.
            title = alt_text[6:]
            url = 'https://www.qdaily.com' + href
            # Sample raw value: "2019-12-02 14:27:56 +0800".  The offset is
            # matched as literal text, so the result is a naive datetime.
            publish_at = datetime.strptime(raw_date,
                                           '%Y-%m-%d %H:%M:%S +0800')

            yield TopItem(publish_at=publish_at,
                          title=title,
                          url=url,
                          node_id=71,
                          description_content=None,
                          create_at=now,
                          update_at=now,
                          position=None)

示例#5

0

显示文件

文件： ruanyifeng.py 项目： w898275706/tophub

 def parse(self, response):
     """Yield items for the archive list, then one for the ``entry-asset`` block.

     Each ``<li>`` under the ``alpha-inner`` list produces an item whose
     date is parsed from the list item's own text.  Afterwards a single
     extra item is built from the ``entry-asset`` element, whose date is
     read from its ``abbr`` text.
     """
     now = datetime.now()

     rows = response.xpath('//*[@id="alpha-inner"]/div[3]/div/ul/li')
     for row in rows:
         headline = row.xpath('./a/text()').get()
         link = row.xpath('./a/@href').get()
         # The <li>'s own first text node holds the date, including the
         # trailing full-width colon matched by the format string.
         published = datetime.strptime(
             row.xpath('./text()').get(), "%Y.%m.%d：")
         yield TopItem(publish_at=published,
                       title=headline,
                       url=link,
                       node_id=48,
                       description_content=None,
                       create_at=now,
                       update_at=now,
                       position=None)

     # Separate entry rendered outside the list.
     featured = response.xpath('//div[contains(@class,"entry-asset")]')
     headline = featured.xpath('./div[1]/h2/a/text()').get()
     link = featured.xpath('./div[1]/h2/a/@href').get()
     published = datetime.strptime(
         featured.xpath('.//abbr/text()').get(), "%Y年%m月%d日 %H:%M")
     yield TopItem(publish_at=published,
                   title=headline,
                   url=link,
                   node_id=48,
                   description_content=None,
                   create_at=now,
                   update_at=now,
                   position=None)

示例#6

0

显示文件

 def parse_node(self, response, node):
     """Build and return a ``TopItem`` from one ``atom``-namespaced node.

     The ``atom:published`` text is parsed with a literal ``.000Z`` suffix
     and shifted forward by eight hours.
     NOTE(review): presumably converts UTC to a UTC+8 local time - confirm.
     """
     now = datetime.now()
     entry = TopItem()
     entry['node_id'] = 56
     entry['title'] = node.xpath('atom:title/text()').get()
     entry['url'] = node.xpath('atom:link/@href').get()
     raw_published = node.xpath('atom:published/text()').get()
     parsed = datetime.strptime(raw_published, "%Y-%m-%dT%H:%M:%S.000Z")
     entry['publish_at'] = parsed + timedelta(hours=8)
     entry['description_content'] = None
     entry['create_at'] = now
     entry['update_at'] = now
     entry['position'] = None
     return entry

示例#7

0

显示文件

文件： bilibili_ribang.py 项目： w898275706/tophub

 def parse(self, response):
     """Yield a ``TopItem`` for every ``li.rank-item`` element.

     NOTE(review): ``position`` is passed through as the raw text of the
     first child ``div`` (a string, not an int) - confirm the consumer
     accepts that.
     """
     now = datetime.now()
     for entry in response.xpath('//li[@class="rank-item"]'):
         headline = entry.xpath('.//a[@class="title"]/text()').get()
         # The href has no scheme; "https:" is prepended.
         link = 'https:' + entry.xpath('.//a[@class="title"]/@href').get()
         rank_text = entry.xpath('./div[1]/text()').get()
         detail = entry.xpath('.//div[@class="detail"]/span/text()').get()
         yield TopItem(publish_at=now,
                       title=headline,
                       url=link,
                       node_id=95,
                       description_content=detail,
                       create_at=now,
                       update_at=now,
                       position=rank_text)

示例#8

0

显示文件

文件： ifanr_chanpin.py 项目： w898275706/tophub

    def parse(self, response):
        """Yield a ``TopItem`` for each element of the JSON ``objects`` list.

        ``created_at`` is converted to an int and interpreted by
        ``datetime.fromtimestamp``; every item gets position ``0``.
        """
        now = datetime.now()
        payload = json.loads(response.text)

        for post in payload['objects']:
            published = datetime.fromtimestamp(int(post["created_at"]))
            headline = post['post_title']
            link = post['post_url']
            yield TopItem(node_id=81,
                          title=headline,
                          url=link,
                          publish_at=published,
                          description_content=None,
                          create_at=now,
                          update_at=now,
                          position=int(0))

示例#9

0

显示文件

文件： weibo_xinshidai.py 项目： w898275706/tophub

 def parse(self, response):
     """Yield a ranked ``TopItem`` for every row of the ``pl_top_realtimehot`` table.

     ``position`` is the 1-based row index.  The link prefers the anchor's
     ``href_to`` attribute and falls back to ``href`` when that is empty
     or missing.
     """
     now = datetime.now()
     rows = response.xpath('//*[@id="pl_top_realtimehot"]/table/tbody/tr')
     for rank, row in enumerate(rows, start=1):
         # The first and last character of the anchor text are dropped.
         headline = row.xpath('td[2]/a/text()').get()[1:-1]
         target = (row.xpath('td[2]/a/@href_to').get()
                   or row.xpath('td[2]/a/@href').get())
         link = 'https://s.weibo.com' + target
         yield TopItem(publish_at=now,
                       title=headline,
                       url=link,
                       node_id=34,
                       description_content=None,
                       create_at=now,
                       update_at=now,
                       position=rank)

示例#10

0

显示文件

    def parse(self, response):
        """Yield a ``TopItem`` for each story in the JSON response.

        The ``stories`` list is followed by the ``top_stories`` list; all
        items get position ``0`` and the crawl time as publication time.
        """
        now = datetime.now()
        payload = json.loads(response.text)
        combined = payload['stories'] + payload['top_stories']

        for story in combined:
            headline = story['title']
            link = story['url']
            yield TopItem(node_id=11,
                          title=headline,
                          url=link,
                          publish_at=now,
                          description_content=None,
                          create_at=now,
                          update_at=now,
                          position=0)

示例#11

0

显示文件

文件： changhai.py 项目： w898275706/tophub

 def parse_node(self, response, selector):
     """Yield a ``TopItem`` for every ``channel/item`` below ``selector``.

     ``pubDate`` is parsed with a literal ``EST`` suffix and shifted
     forward by thirteen hours.
     NOTE(review): presumably converts EST to a UTC+8 local time - confirm.
     """
     now = datetime.now()
     for feed_entry in selector.xpath('./channel/item'):
         record = TopItem()
         record['node_id'] = 65
         record['title'] = feed_entry.xpath('./title/text()').get()
         record['url'] = feed_entry.xpath('./link/text()').get()
         # Sample raw value: "28 Nov 2019 07:40:00 EST"
         raw_date = feed_entry.xpath('./pubDate/text()').get()
         parsed = datetime.strptime(raw_date, "%d %b %Y %H:%M:%S EST")
         record['publish_at'] = parsed + timedelta(hours=13)
         record['description_content'] = None
         record['create_at'] = now
         record['update_at'] = now
         record['position'] = None
         yield record

示例#12

0

显示文件

 def parse(self, response):
     """Yield a ``TopItem`` for every entry of the nested ``data.list`` JSON.

     ``data.list`` is iterated as a sequence of groups, each group being a
     sequence of article dicts.
     """
     now = datetime.now()
     groups = json.loads(response.text)['data']['list']
     for group in groups:
         for article in group:
             headline = article['article_title']
             link = 'https://www.smzdm.com/p/' + article['article_id']
             rank = article['sort']
             price = article['article_price']
             yield TopItem(node_id=105,
                           title=headline,
                           url=link,
                           publish_at=now,
                           description_content=price,
                           create_at=now,
                           update_at=now,
                           position=rank)

示例#13

0

显示文件

文件： douban_huati24.py 项目： w898275706/tophub

 def parse(self, response):
     """Yield a ranked ``TopItem`` for each ``<li>`` in the selected content block.

     ``position`` is the 1-based index of the list item; the description
     is the text of the item's direct ``span`` child.
     """
     now = datetime.now()
     entries = response.xpath('//*[@id="content"]/div/div[2]/div[3]//li')
     for rank, entry in enumerate(entries, start=1):
         headline = entry.xpath('./a/text()').get()
         link = entry.xpath('./a/@href').get()
         note = entry.xpath('./span/text()').get()
         yield TopItem(publish_at=now,
                       title=headline,
                       url=link,
                       node_id=45,
                       description_content=note,
                       create_at=now,
                       update_at=now,
                       position=rank)

示例#14

0

显示文件

文件： douban_group.py 项目： w898275706/tophub

 def parse(self, response):
     """Yield a ranked ``TopItem`` for the second ``div`` of each ``channel-item``.

     ``position`` is the 1-based index of the match; the description is
     the link text inside ``span.from``.
     """
     now = datetime.now()
     bodies = response.xpath('//*[@class="channel-item"]/div[2]')
     for rank, body in enumerate(bodies, start=1):
         headline = body.xpath('./h3/a/text()').get()
         link = body.xpath('./h3/a/@href').get()
         origin = body.xpath('.//span[@class="from"]/a/text()').get()
         yield TopItem(publish_at=now,
                       title=headline,
                       url=link,
                       node_id=47,
                       description_content=origin,
                       create_at=now,
                       update_at=now,
                       position=rank)

示例#15

0

显示文件

文件： douyin_zhengnengliang.py 项目： w898275706/tophub

 def parse(self, response):
     """Yield a ranked ``TopItem`` for each element of the JSON ``aweme_list``.

     Title and link come from the nested ``aweme_info`` dict, the
     description from ``hot_value`` via ``int_to_str``, and ``position``
     is the 1-based list index.
     """
     now = datetime.now()
     entries = json.loads(response.text)['aweme_list']
     for rank, entry in enumerate(entries, start=1):
         info = entry['aweme_info']
         headline = info['desc']
         link = entry['aweme_info']['share_url']
         heat = int_to_str(entry['hot_value'])
         yield TopItem(node_id=100,
                       title=headline,
                       url=link,
                       publish_at=now,
                       description_content=heat,
                       create_at=now,
                       update_at=now,
                       position=rank)

示例#16

0

显示文件

    def parse(self, response):
        """Yield a ``TopItem`` for every ``<article>`` element in the response.

        The publication time is read from the ``datetime`` attribute of the
        header's ``<time>`` element; the ``+08:00`` suffix is matched as
        literal text, so the result is a naive datetime.
        """
        now = datetime.now()

        for article in response.xpath('//article'):
            headline = article.xpath('./header/h1/a/text()').get()
            link = article.xpath('./header/h1/a/@href').get()
            # Sample raw value: '2019-10-23T22:42:06+08:00'
            raw_stamp = article.xpath('./header/div/a/time/@datetime').get()
            published = datetime.strptime(raw_stamp,
                                          '%Y-%m-%dT%H:%M:%S+08:00')
            yield TopItem(node_id=64,
                          title=headline,
                          url=link,
                          publish_at=published,
                          description_content=None,
                          create_at=now,
                          update_at=now,
                          position=None)

示例#17

0

显示文件

文件： iamsujie.py 项目： w898275706/tophub

    def parse(self, response):
        """Yield a ``TopItem`` for every ``channel/item`` element of the feed.

        ``pubDate`` is parsed with a literal ``+0000`` offset and shifted
        forward by eight hours; the first ``category`` text becomes the
        description.
        """
        now = datetime.now()

        for feed_entry in response.xpath('//channel/item'):
            headline = feed_entry.xpath('title/text()').get()
            link = feed_entry.xpath('link/text()').get()
            # Sample raw value: 'Wed, 13 Nov 2019 07:01:48 +0000'
            raw_date = feed_entry.xpath('pubDate/text()').get()
            parsed = datetime.strptime(raw_date,
                                       "%a, %d %b %Y %H:%M:%S +0000")
            published = parsed + timedelta(hours=8)
            category = feed_entry.xpath('category[1]/text()').get()
            yield TopItem(node_id=62,
                          title=headline,
                          url=link,
                          publish_at=published,
                          description_content=category,
                          create_at=now,
                          update_at=now,
                          position=None)

示例#18

0

显示文件

文件： est.py 项目： w898275706/tophub

    def parse(self, response):
        """Yield a ``TopItem`` for every ``<article>`` inside a ``blog-card`` div.

        The link is the anchor's ``href`` appended to the blog's base URL.
        The ``<time>`` element's ``datetime`` attribute has its last six
        characters removed before being parsed.
        """
        now = datetime.now()

        for card in response.xpath(
                '//div[contains(@class,"blog-card")]/article'):
            headline = card.xpath('./a/h2/text()').get()
            link = "https://blog.est.im/" + card.xpath('./a/@href').get()
            # After trimming, the value looks like '2019-10-31T19:59:27'.
            raw_stamp = card.xpath('./div/p[1]/time/@datetime').get()
            published = datetime.strptime(raw_stamp[:-6],
                                          '%Y-%m-%dT%H:%M:%S')
            yield TopItem(node_id=70,
                          title=headline,
                          url=link,
                          publish_at=published,
                          description_content=None,
                          create_at=now,
                          update_at=now,
                          position=None)

示例#19

0

显示文件

    def parse(self, response):
        """Yield a ``TopItem`` for every ``div.entry`` under ``alpha-inner``.

        The title is the entry's ``h3`` text and the link comes from the
        ``entry-more-link`` paragraph.  The date is read from the text of a
        preceding sibling element.
        """
        now = datetime.now()

        for entry in response.xpath(
                '//*[@id="alpha-inner"]/div[@class="entry"]'):
            headline = entry.xpath('./h3/text()').get()
            link = entry.xpath(
                './/p[@class="entry-more-link"]/a/@href').get()
            # The date text sits two siblings before the entry element.
            raw_date = entry.xpath('./preceding-sibling::*[2]/text()').get()
            published = datetime.strptime(raw_date, "%B %d, %Y")
            yield TopItem(node_id=55,
                          title=headline,
                          url=link,
                          publish_at=published,
                          description_content=None,
                          create_at=now,
                          update_at=now,
                          position=None)

示例#20

0

显示文件

文件： taobao_quantianbang.py 项目： w898275706/tophub

 def parse(self, response):
     """Yield a ranked ``TopItem`` for each element of the JSON ``data`` list.

     The title joins ``dtitle`` with the ``yuanjia`` and ``jiage`` values,
     the description is built from ``xiaoliang`` via ``int_to_str``, and
     ``position`` is the 1-based list index.
     """
     now = datetime.now()
     products = json.loads(response.text)['data']
     for rank, product in enumerate(products, start=1):
         original_price = str(product['yuanjia'])
         current_price = str(product['jiage'])
         headline = (product['dtitle'] + " 原价" + original_price
                     + " 现价" + current_price)
         link = 'http://shop.fulibus.net/index.php?r=/detailed&id={0}&jump=2'.format(
             str(product['id']))
         sales = "月销" + int_to_str(product['xiaoliang'])
         yield TopItem(node_id=104,
                       title=headline,
                       url=link,
                       publish_at=now,
                       description_content=sales,
                       create_at=now,
                       update_at=now,
                       position=rank)

示例#21

0

显示文件

 def parse(self, response):
     """Yield a ranked ``TopItem`` for each entry of ``data[0]["words"]``.

     The search URL is built by appending the raw ``word`` value, the
     description is ``params.fake_click_cnt`` via ``int_to_str``, and
     ``position`` is the 1-based list index.
     """
     now = datetime.now()
     words = json.loads(response.text)['data'][0]["words"]
     for rank, entry in enumerate(words, start=1):
         keyword = entry['word']
         link = 'https://m.toutiao.com/search/?keyword=' + entry['word']
         clicks = int_to_str(entry['params']['fake_click_cnt'])
         yield TopItem(node_id=14,
                       title=keyword,
                       url=link,
                       publish_at=now,
                       description_content=clicks,
                       create_at=now,
                       update_at=now,
                       position=rank)

示例#22

0

显示文件

 def parse(self, response):
     """Yield a ranked ``TopItem`` for each hot-list entry embedded in the page.

     The entries are read from the JSON object assigned to
     ``window.initialState`` in the first ``<script>`` of the body, at
     ``newsflashCatalogData.data.hotlist.data``.
     """
     now = datetime.now()
     script = response.xpath('/html/body/script[1]')
     # Index 1 of the extracted strings is the second capture group, i.e.
     # the object literal that follows the assignment.
     state_text = script.re(r"(window.initialState=)(.*\})")[1]
     state = json.loads(state_text)
     hot_entries = state['newsflashCatalogData']['data']['hotlist']['data']

     for rank, entry in enumerate(hot_entries, start=1):
         record = TopItem()
         record['node_id'] = 3
         record['publish_at'] = datetime.strptime(entry['published_at'],
                                                  "%Y-%m-%d %H:%M:%S")
         record['description_content'] = int_to_str(
             entry['counters']['view_count'])
         record['create_at'] = now
         record['update_at'] = now
         record['position'] = rank
         record['title'] = entry['title']
         record['url'] = 'https://36kr.com/p/' + str(entry['id'])
         yield record

示例#23

0

显示文件

    def parse(self, response):
        """Yield a ``TopItem`` for every ``<li>`` in the ``posts`` list.

        The first ``span`` holds the date (``YYYY-MM-DD``) and the second
        ``span`` supplies the description.
        """
        now = datetime.now()

        for post in response.xpath('//ul[@class="posts"]/li'):
            headline = post.xpath('./a/text()').get()
            link = "https://limboy.me" + post.xpath('./a/@href').get()
            raw_date = post.xpath('./span[1]/text()').get()
            published = datetime.strptime(raw_date, "%Y-%m-%d")
            note = post.xpath('./span[2]/text()').get()
            yield TopItem(node_id=58,
                          title=headline,
                          url=link,
                          publish_at=published,
                          description_content=note,
                          create_at=now,
                          update_at=now,
                          position=None)

示例#24

0

显示文件

文件： pengpai_hotnews.py 项目： w898275706/tophub

 def parse(self, response):
     """Yield a ranked ``TopItem`` for the first ten ``listhot0`` entries.

     ``position`` is the 1-based index within those (at most) ten items.
     """
     now = datetime.now()
     top_ten = response.xpath('//ul[@id="listhot0"]/li')[:10]
     for rank, entry in enumerate(top_ten, start=1):
         headline = entry.xpath('a/text()').get()
         link = 'https://www.thepaper.cn/' + entry.xpath('a/@href').get()
         yield TopItem(publish_at=now,
                       title=headline,
                       url=link,
                       node_id=12,
                       description_content=None,
                       create_at=now,
                       update_at=now,
                       position=rank)

示例#25

0

显示文件

    def parse(self, response):
        """Yield a ``TopItem`` for each element of the JSON ``data`` list.

        ``time_into_pool`` is divided by 1,000,000 and truncated to an int
        before being passed to ``datetime.fromtimestamp``.
        NOTE(review): presumably the raw value is in microseconds - confirm.
        """
        now = datetime.now()
        payload = json.loads(response.text)

        for link_entry in payload['data']:
            seconds = int(link_entry["time_into_pool"] / 1000000)
            published = datetime.fromtimestamp(seconds)
            headline = link_entry['title']
            link = 'https://m.chouti.com/link/' + str(link_entry['id'])
            yield TopItem(node_id=4,
                          title=headline,
                          url=link,
                          publish_at=published,
                          description_content=None,
                          create_at=now,
                          update_at=now,
                          position=int(0))

示例#26

0

显示文件

    def parse(self, response):
        """Yield a ``TopItem`` for each element of the JSON ``list`` array.

        ``recommend_to_home_at`` is passed to ``datetime.fromtimestamp`` as
        the publication time and the author's ``nickname`` is used as the
        description; every item gets position ``0``.
        """
        now = datetime.now()
        posts = json.loads(response.text)['list']

        for post in posts:
            headline = post['title']
            link = 'https://sspai.com/post/' + str(post['id'])
            published = datetime.fromtimestamp(post["recommend_to_home_at"])
            author = post['author']['nickname']
            yield TopItem(node_id=9,
                          title=headline,
                          url=link,
                          publish_at=published,
                          description_content=author,
                          create_at=now,
                          update_at=now,
                          position=0)

示例#27

0

显示文件

文件： pojie52_jinriretie.py 项目： w898275706/tophub

 def parse(self, response):
     """Yield a ranked ``TopItem`` for every class-less ``<tr>`` under ``#wp``.

     ``position`` is the 1-based index of the row among the matches.
     """
     now = datetime.now()
     rows = response.xpath('//*[@id="wp"]//tr[not(@class)]')
     for rank, row in enumerate(rows, start=1):
         headline = row.xpath('./th/a/text()').get()
         relative = row.xpath('./th/a/@href').get()
         link = 'https://www.52pojie.cn/' + relative
         yield TopItem(publish_at=now,
                       title=headline,
                       url=link,
                       node_id=87,
                       description_content=None,
                       create_at=now,
                       update_at=now,
                       position=rank)

示例#28

0

显示文件

文件： news163_quanhot24.py 项目： w898275706/tophub

 def parse(self, response):
     """Yield a ``TopItem`` for each table row after the first in the selected block.

     The rank is parsed as an int from the ``span`` in the first cell and
     the description is the second cell's number via ``int_to_str``.
     """
     now = datetime.now()
     rows = response.xpath(
         '//div[contains(@class,"areabg1")]/div[2]/div/div[2]//tr[position()>=2]'
     )
     for row in rows:
         headline = row.xpath('td[1]/a/text()').get()
         link = row.xpath('td[1]/a/@href').get()
         rank = int(row.xpath('td[1]/span/text()').get())
         raw_count = row.xpath('td[2]/text()').get()
         summary = int_to_str(int(raw_count))
         yield TopItem(publish_at=now,
                       title=headline,
                       url=link,
                       node_id=26,
                       description_content=summary,
                       create_at=now,
                       update_at=now,
                       position=rank)

示例#29

0

显示文件

 def parse(self, response):
     """Yield a ranked ``TopItem`` for each element of the JSON ``rankList``.

     The description is ``viewCount`` via ``int_to_str`` and ``position``
     is the 1-based list index.
     """
     now = datetime.now()
     ranking = json.loads(response.text)['rankList']
     for rank, entry in enumerate(ranking, start=1):
         headline = entry['contentTitle']
         link = entry['shareUrl']
         views = int_to_str(entry['viewCount'])
         yield TopItem(node_id=101,
                       title=headline,
                       url=link,
                       publish_at=now,
                       description_content=views,
                       create_at=now,
                       update_at=now,
                       position=rank)

示例#30

0

显示文件

 def parse(self, response):
     """Yield a ranked ``TopItem`` for every ``article.Box-row`` element.

     The title is the heading link's ``href`` without its first character
     and the URL is the same ``href`` appended to the GitHub origin.  The
     description is the text of the first link in the second ``div`` with
     spaces, newlines and commas removed, followed by " stars".
     """
     now = datetime.now()
     rows = response.xpath('//article[@class="Box-row"]')
     for rank, row in enumerate(rows, start=1):
         headline = row.xpath('h1/a/@href').get()[1:]
         link = 'https://github.com' + row.xpath('h1/a/@href').get()
         counter_link = row.xpath('div[2]/a[1]')[0]
         counter_text = counter_link.xpath('string(.)').get()
         compact = counter_text.replace(' ', '').replace('\n', '')
         summary = compact.replace(',', '') + " stars"
         yield TopItem(node_id=85,
                       title=headline,
                       url=link,
                       publish_at=now,
                       description_content=summary,
                       create_at=now,
                       update_at=now,
                       position=rank)