def parse(self, response):
    """Yield one ``TopItem`` per ``div.date-outer`` element of *response*.

    The ``abbr.published/@title`` attribute holds an ISO-8601 timestamp with
    a trailing ``±HH:MM`` UTC offset, e.g. ``2019-11-15T21:04:00-08:00``.
    The offset is applied (rather than discarded) and ``publish_at`` is
    stored as a naive datetime expressed in UTC+8.

    Raises:
        TypeError: if an entry has no ``abbr.published/@title`` attribute.
        ValueError: if the timestamp or its offset cannot be parsed.
    """
    # Local import keeps this method self-contained; only the ``datetime``
    # class is known to be imported at module level.
    from datetime import timedelta

    crawled_at = datetime.now()
    for entry in response.xpath('//div[@class="date-outer"]'):
        title = entry.xpath(
            './/h3[contains(@class,"post-title")]/a/text()').get()
        url = entry.xpath(
            './/h3[contains(@class,"post-title")]/a/@href').get()

        # Example value: "2019-11-15T21:04:00-08:00"
        raw = entry.xpath('.//abbr[@class="published"]/@title').get()
        stamp, offset = raw[:-6], raw[-6:]
        sign = -1 if offset[0] == '-' else 1
        utc_offset = sign * timedelta(hours=int(offset[1:3]),
                                      minutes=int(offset[4:6]))
        # local time - offset = UTC; UTC + 8h = stored wall-clock time.
        publish_at = (datetime.strptime(stamp, "%Y-%m-%dT%H:%M:%S")
                      - utc_offset + timedelta(hours=8))

        description_content = entry.xpath(
            './/span[@class="post-labels"]/a/text()').get()
        yield TopItem(node_id=68,
                      title=title,
                      url=url,
                      publish_at=publish_at,
                      description_content=description_content,
                      create_at=crawled_at,
                      update_at=crawled_at,
                      position=None)
def parse(self, response):
    """Yield a ``TopItem`` for each titled article block under ``page-content``.

    Blocks whose ``h3`` text is empty are skipped.  The number of matched
    blocks is printed before iteration.  ``publish_at`` is parsed from the
    ``smart-date`` span's ``data-origindate`` attribute, which must end in
    the literal ``+0800``.
    """
    crawled_at = datetime.now()
    article_path = (
        '//div[@class="page-content"]/div/div[contains(@class,"article")]')
    print("长度", len(response.xpath(article_path)))

    for article in response.xpath(article_path):
        title = article.xpath('./a/div[2]/h3').xpath('string(.)').get()
        if not title:
            continue
        url = 'https://www.qdaily.com' + article.xpath('./a/@href').get()
        # Example value: "2019-12-02 14:27:56 +0800"
        origin_date = article.xpath(
            './/span[@class="smart-date"]/@data-origindate').get()
        publish_at = datetime.strptime(origin_date, '%Y-%m-%d %H:%M:%S +0800')
        yield TopItem(publish_at=publish_at,
                      title=title,
                      url=url,
                      node_id=72,
                      description_content=None,
                      create_at=crawled_at,
                      update_at=crawled_at,
                      position=None)
def parse(self, response):
    """Yield a ranked ``TopItem`` for every ``td.keyword`` cell.

    ``position`` is the 1-based order of the cell in the document.  The
    description is the integer text of the ``span`` inside the second
    following ``td`` sibling, passed through ``int_to_str``.
    """
    now = datetime.now()
    keyword_cells = response.xpath('//td[@class="keyword"]')
    for rank, cell in enumerate(keyword_cells, start=1):
        title = cell.xpath('a/text()').get()
        url = cell.xpath('a/@href').get()
        # Second td sibling after the keyword cell.
        raw_count = cell.xpath(
            './following-sibling::td[2]/span/text()').get()
        description_content = int_to_str(int(raw_count))
        yield TopItem(node_id=13,
                      title=title,
                      url=url,
                      publish_at=now,
                      description_content=description_content,
                      create_at=now,
                      update_at=now,
                      position=rank)
def parse(self, response):
    """Yield a ``TopItem`` for each child ``div`` of the packery container.

    The title is the image ``alt`` text with its first six characters
    dropped; the URL is the anchor ``href`` prefixed with the qdaily host.
    ``publish_at`` comes from the ``smart-date`` span's ``data-origindate``
    attribute, which must end in the literal ``+0800``.
    """
    crawled_at = datetime.now()
    cards = response.xpath('//div[contains(@class,"packery-container")]/div')
    for card in cards:
        title = card.xpath('.//img/@alt').get()[6:]
        url = 'https://www.qdaily.com' + card.xpath('./a/@href').get()
        # Example value: "2019-12-02 14:27:56 +0800"
        origin_date = card.xpath(
            './/span[@class="smart-date"]/@data-origindate').get()
        publish_at = datetime.strptime(origin_date, '%Y-%m-%d %H:%M:%S +0800')
        yield TopItem(publish_at=publish_at,
                      title=title,
                      url=url,
                      node_id=71,
                      description_content=None,
                      create_at=crawled_at,
                      update_at=crawled_at,
                      position=None)
def parse(self, response):
    """Yield ``TopItem`` objects for the list entries and the ``entry-asset`` block.

    First, one item per ``li`` under ``#alpha-inner``; the ``li`` text node
    is parsed as a ``YYYY.MM.DD:`` date.  Then one further item built from
    the ``entry-asset`` div, whose ``abbr`` text is parsed with the
    ``%Y年%m月%d日 %H:%M`` format.
    """
    crawled_at = datetime.now()

    for entry in response.xpath('//*[@id="alpha-inner"]/div[3]/div/ul/li'):
        title = entry.xpath('./a/text()').get()
        url = entry.xpath('./a/@href').get()
        publish_at = datetime.strptime(entry.xpath('./text()').get(),
                                       "%Y.%m.%d:")
        yield TopItem(publish_at=publish_at,
                      title=title,
                      url=url,
                      node_id=48,
                      description_content=None,
                      create_at=crawled_at,
                      update_at=crawled_at,
                      position=None)

    # NOTE(review): the original layout was flattened; this assumes the
    # entry-asset item is emitted once, after the loop — confirm.
    featured = response.xpath('//div[contains(@class,"entry-asset")]')
    featured_title = featured.xpath('./div[1]/h2/a/text()').get()
    featured_url = featured.xpath('./div[1]/h2/a/@href').get()
    featured_published = datetime.strptime(
        featured.xpath('.//abbr/text()').get(), "%Y年%m月%d日 %H:%M")
    yield TopItem(publish_at=featured_published,
                  title=featured_title,
                  url=featured_url,
                  node_id=48,
                  description_content=None,
                  create_at=crawled_at,
                  update_at=crawled_at,
                  position=None)
def parse_node(self, response, node):
    """Build and return one ``TopItem`` from an Atom entry *node*.

    ``atom:published`` must match ``%Y-%m-%dT%H:%M:%S.000Z``; eight hours
    are added to the parsed value before it is stored as ``publish_at``.
    """
    crawled_at = datetime.now()
    published_text = node.xpath('atom:published/text()').get

    entry = TopItem()
    entry['node_id'] = 56
    entry['title'] = node.xpath('atom:title/text()').get()
    entry['url'] = node.xpath('atom:link/@href').get()
    entry['publish_at'] = (
        datetime.strptime(published_text(), "%Y-%m-%dT%H:%M:%S.000Z")
        + timedelta(hours=8))
    entry['description_content'] = None
    entry['create_at'] = crawled_at
    entry['update_at'] = crawled_at
    entry['position'] = None
    return entry
def parse(self, response):
    """Yield a ``TopItem`` for each ``li.rank-item`` element.

    The URL is the title anchor's ``href`` prefixed with ``https:``.
    ``position`` is the raw text of the item's first child ``div`` (kept
    as a string), and the description is the ``div.detail`` span text.
    """
    now = datetime.now()
    for entry in response.xpath('//li[@class="rank-item"]'):
        title = entry.xpath('.//a[@class="title"]/text()').get()
        url = 'https:' + entry.xpath('.//a[@class="title"]/@href').get()
        position = entry.xpath('./div[1]/text()').get()
        description_content = entry.xpath(
            './/div[@class="detail"]/span/text()').get()
        yield TopItem(publish_at=now,
                      title=title,
                      url=url,
                      node_id=95,
                      description_content=description_content,
                      create_at=now,
                      update_at=now,
                      position=position)
def parse(self, response):
    """Yield a ``TopItem`` for each element of the JSON ``objects`` array.

    ``created_at`` is converted with ``int`` and interpreted by
    ``datetime.fromtimestamp``; ``position`` is always ``0``.
    """
    crawled_at = datetime.now()
    for post in json.loads(response.text)['objects']:
        publish_at = datetime.fromtimestamp(int(post["created_at"]))
        title = post['post_title']
        url = post['post_url']
        yield TopItem(node_id=81,
                      title=title,
                      url=url,
                      publish_at=publish_at,
                      description_content=None,
                      create_at=crawled_at,
                      update_at=crawled_at,
                      position=0)
def parse(self, response):
    """Yield a ranked ``TopItem`` for each row of the ``pl_top_realtimehot`` table.

    The title is the anchor text in the second cell with its first and
    last characters removed.  The link prefers the anchor's ``href_to``
    attribute and falls back to ``href`` when ``href_to`` is missing or
    empty; it is prefixed with the s.weibo.com host.
    """
    now = datetime.now()
    rows = response.xpath('//*[@id="pl_top_realtimehot"]/table/tbody/tr')
    for rank, row in enumerate(rows, start=1):
        title = row.xpath('td[2]/a/text()').get()[1:-1]
        link = (row.xpath('td[2]/a/@href_to').get()
                or row.xpath('td[2]/a/@href').get())
        url = 'https://s.weibo.com' + link
        yield TopItem(publish_at=now,
                      title=title,
                      url=url,
                      node_id=34,
                      description_content=None,
                      create_at=now,
                      update_at=now,
                      position=rank)
def parse(self, response):
    """Yield a ``TopItem`` for every story in the JSON payload.

    Entries of ``stories`` are emitted first, followed by those of
    ``top_stories``.  All three timestamps are the crawl time and
    ``position`` is always ``0``.
    """
    crawled_at = datetime.now()
    payload = json.loads(response.text)
    all_stories = payload['stories'] + payload['top_stories']
    for story in all_stories:
        title = story['title']
        url = story['url']
        yield TopItem(node_id=11,
                      title=title,
                      url=url,
                      publish_at=crawled_at,
                      description_content=None,
                      create_at=crawled_at,
                      update_at=crawled_at,
                      position=0)
def parse_node(self, response, selector):
    """Yield one ``TopItem`` per ``./channel/item`` element under *selector*.

    ``pubDate`` looks like ``28 Nov 2019 07:40:00 EST``.  It is shifted to
    UTC+8 wall-clock time: +13 hours for ``EST`` (UTC-5) and +12 hours for
    ``EDT`` (UTC-4).  Accepting ``EDT`` keeps daylight-saving timestamps
    from aborting the crawl; ``EST`` input is handled exactly as before.

    Raises:
        TypeError: if an item has no ``pubDate`` text.
        ValueError: if ``pubDate`` matches neither the EST nor EDT layout.
    """
    now_time = datetime.now()
    for node in selector.xpath('./channel/item'):
        item = TopItem()
        item['node_id'] = 65
        item['title'] = node.xpath('./title/text()').get()
        item['url'] = node.xpath('./link/text()').get()

        # Example value: "28 Nov 2019 07:40:00 EST"
        raw_date = node.xpath('./pubDate/text()').get()
        try:
            publish_at = (datetime.strptime(raw_date, "%d %b %Y %H:%M:%S EST")
                          + timedelta(hours=13))
        except ValueError:
            # Daylight-saving variant; a second failure propagates.
            publish_at = (datetime.strptime(raw_date, "%d %b %Y %H:%M:%S EDT")
                          + timedelta(hours=12))
        item['publish_at'] = publish_at

        item['description_content'] = None
        item['create_at'] = now_time
        item['update_at'] = now_time
        item['position'] = None
        yield item
def parse(self, response):
    """Yield a ``TopItem`` for every article in the nested ``data.list`` JSON.

    ``data.list`` is iterated as a sequence of groups, each itself a
    sequence of article dicts.  ``position`` is the article's ``sort``
    value and the description is its ``article_price``.
    """
    crawled_at = datetime.now()
    groups = json.loads(response.text)['data']['list']
    for group in groups:
        for article in group:
            title = article['article_title']
            url = 'https://www.smzdm.com/p/' + article['article_id']
            position = article['sort']
            description_content = article['article_price']
            yield TopItem(node_id=105,
                          title=title,
                          url=url,
                          publish_at=crawled_at,
                          description_content=description_content,
                          create_at=crawled_at,
                          update_at=crawled_at,
                          position=position)
def parse(self, response):
    """Yield a ranked ``TopItem`` for each ``li`` in the selected content list.

    ``position`` is the 1-based order of the ``li``; the description is
    the text of its direct ``span`` child.
    """
    now = datetime.now()
    list_items = response.xpath('//*[@id="content"]/div/div[2]/div[3]//li')
    for rank, entry in enumerate(list_items, start=1):
        title = entry.xpath('./a/text()').get()
        url = entry.xpath('./a/@href').get()
        description_content = entry.xpath('./span/text()').get()
        yield TopItem(publish_at=now,
                      title=title,
                      url=url,
                      node_id=45,
                      description_content=description_content,
                      create_at=now,
                      update_at=now,
                      position=rank)
def parse(self, response):
    """Yield a ranked ``TopItem`` for each ``channel-item`` body block.

    Each block is the second ``div`` child of a ``channel-item`` element.
    ``position`` is its 1-based order; the description is the anchor text
    inside ``span.from``.
    """
    now = datetime.now()
    blocks = response.xpath('//*[@class="channel-item"]/div[2]')
    for rank, block in enumerate(blocks, start=1):
        title = block.xpath('./h3/a/text()').get()
        url = block.xpath('./h3/a/@href').get()
        description_content = block.xpath(
            './/span[@class="from"]/a/text()').get()
        yield TopItem(publish_at=now,
                      title=title,
                      url=url,
                      node_id=47,
                      description_content=description_content,
                      create_at=now,
                      update_at=now,
                      position=rank)
def parse(self, response):
    """Yield a ranked ``TopItem`` for each entry of the JSON ``aweme_list``.

    Title and URL come from the nested ``aweme_info`` dict (``desc`` and
    ``share_url``); the description is ``hot_value`` passed through
    ``int_to_str``.  ``position`` is the 1-based list order.
    """
    crawled_at = datetime.now()
    entries = json.loads(response.text)['aweme_list']
    for rank, entry in enumerate(entries, start=1):
        title = entry['aweme_info']['desc']
        url = entry['aweme_info']['share_url']
        description_content = int_to_str(entry['hot_value'])
        yield TopItem(node_id=100,
                      title=title,
                      url=url,
                      publish_at=crawled_at,
                      description_content=description_content,
                      create_at=crawled_at,
                      update_at=crawled_at,
                      position=rank)
def parse(self, response):
    """Yield a ``TopItem`` for each ``article`` element in *response*.

    ``publish_at`` is parsed from the header ``time/@datetime`` attribute,
    which must end in the literal ``+08:00``.
    """
    crawled_at = datetime.now()
    for article in response.xpath('//article'):
        title = article.xpath('./header/h1/a/text()').get()
        url = article.xpath('./header/h1/a/@href').get()
        # Example value: '2019-10-23T22:42:06+08:00'
        stamp = article.xpath('./header/div/a/time/@datetime').get()
        publish_at = datetime.strptime(stamp, '%Y-%m-%dT%H:%M:%S+08:00')
        yield TopItem(node_id=64,
                      title=title,
                      url=url,
                      publish_at=publish_at,
                      description_content=None,
                      create_at=crawled_at,
                      update_at=crawled_at,
                      position=None)
def parse(self, response):
    """Yield a ``TopItem`` for each ``channel/item`` element of the feed.

    ``pubDate`` must end in the literal ``+0000``; eight hours are added
    to the parsed value.  The description is the first ``category`` text.
    """
    crawled_at = datetime.now()
    for entry in response.xpath('//channel/item'):
        title = entry.xpath('title/text()').get()
        url = entry.xpath('link/text()').get()
        # Example value: 'Wed, 13 Nov 2019 07:01:48 +0000'
        parsed = datetime.strptime(entry.xpath('pubDate/text()').get(),
                                   "%a, %d %b %Y %H:%M:%S +0000")
        publish_at = parsed + timedelta(hours=8)
        description_content = entry.xpath('category[1]/text()').get()
        yield TopItem(node_id=62,
                      title=title,
                      url=url,
                      publish_at=publish_at,
                      description_content=description_content,
                      create_at=crawled_at,
                      update_at=crawled_at,
                      position=None)
def parse(self, response):
    """Yield a ``TopItem`` for each ``article`` inside a ``blog-card`` div.

    The URL is the anchor ``href`` prefixed with ``https://blog.est.im/``.
    The last six characters of the ``time/@datetime`` attribute are cut
    off before the remainder is parsed as ``%Y-%m-%dT%H:%M:%S``.
    """
    crawled_at = datetime.now()
    cards = response.xpath('//div[contains(@class,"blog-card")]/article')
    for card in cards:
        title = card.xpath('./a/h2/text()').get()
        url = "https://blog.est.im/" + card.xpath('./a/@href').get()
        # After trimming the value looks like '2019-10-31T19:59:27'.
        stamp = card.xpath('./div/p[1]/time/@datetime').get()[:-6]
        publish_at = datetime.strptime(stamp, '%Y-%m-%dT%H:%M:%S')
        yield TopItem(node_id=70,
                      title=title,
                      url=url,
                      publish_at=publish_at,
                      description_content=None,
                      create_at=crawled_at,
                      update_at=crawled_at,
                      position=None)
def parse(self, response):
    """Yield a ``TopItem`` for each ``div.entry`` directly under ``#alpha-inner``.

    The title is the entry's ``h3`` text and the URL comes from the
    ``entry-more-link`` paragraph.  ``publish_at`` is parsed with the
    ``%B %d, %Y`` format (full month name, day, year).
    """
    now = datetime.now()
    for entry in response.xpath('//*[@id="alpha-inner"]/div[@class="entry"]'):
        title = entry.xpath('./h3/text()').get()
        url = entry.xpath('.//p[@class="entry-more-link"]/a/@href').get()
        # The date text is read from the entry's second preceding sibling.
        date_text = entry.xpath('./preceding-sibling::*[2]/text()').get()
        publish_at = datetime.strptime(date_text, "%B %d, %Y")
        yield TopItem(node_id=55,
                      title=title,
                      url=url,
                      publish_at=publish_at,
                      description_content=None,
                      create_at=now,
                      update_at=now,
                      position=None)
def parse(self, response):
    """Yield a ranked ``TopItem`` for each product in the JSON ``data`` array.

    The title joins ``dtitle`` with the ``yuanjia`` and ``jiage`` values;
    the URL embeds the product ``id``; the description is ``xiaoliang``
    passed through ``int_to_str`` with a fixed prefix.  ``position`` is
    the 1-based list order.
    """
    crawled_at = datetime.now()
    products = json.loads(response.text)['data']
    link_template = (
        'http://shop.fulibus.net/index.php?r=/detailed&id={0}&jump=2')
    for rank, product in enumerate(products, start=1):
        original_price = str(product['yuanjia'])
        current_price = str(product['jiage'])
        title = (product['dtitle'] + " 原价" + original_price
                 + " 现价" + current_price)
        url = link_template.format(str(product['id']))
        description_content = "月销" + int_to_str(product['xiaoliang'])
        yield TopItem(node_id=104,
                      title=title,
                      url=url,
                      publish_at=crawled_at,
                      description_content=description_content,
                      create_at=crawled_at,
                      update_at=crawled_at,
                      position=rank)
def parse(self, response):
    """Yield a ranked ``TopItem`` for each word in ``data[0].words``.

    The URL is a toutiao search link with the word appended verbatim.
    The description is ``params.fake_click_cnt`` passed through
    ``int_to_str``; ``position`` is the 1-based list order.
    """
    crawled_at = datetime.now()
    hot_words = json.loads(response.text)['data'][0]["words"]
    for rank, entry in enumerate(hot_words, start=1):
        title = entry['word']
        url = 'https://m.toutiao.com/search/?keyword=' + entry['word']
        description_content = int_to_str(entry['params']['fake_click_cnt'])
        yield TopItem(node_id=14,
                      title=title,
                      url=url,
                      publish_at=crawled_at,
                      description_content=description_content,
                      create_at=crawled_at,
                      update_at=crawled_at,
                      position=rank)
def parse(self, response):
    """Yield a ranked ``TopItem`` for each hot-list entry embedded in the page.

    The first ``script`` in ``body`` is matched against a two-group regex;
    the second group (the text after ``window.initialState=``) is decoded
    as JSON and the entries under
    ``newsflashCatalogData.data.hotlist.data`` are emitted in order.
    """
    crawled_at = datetime.now()
    # .re() returns the captured groups; index 1 is the JSON text.
    state_text = response.xpath('/html/body/script[1]').re(
        r"(window.initialState=)(.*\})")[1]
    state = json.loads(state_text)
    hot_entries = state['newsflashCatalogData']['data']['hotlist']['data']

    for rank, entry in enumerate(hot_entries, start=1):
        item = TopItem()
        item['node_id'] = 3
        item['publish_at'] = datetime.strptime(entry['published_at'],
                                               "%Y-%m-%d %H:%M:%S")
        item['description_content'] = int_to_str(
            entry['counters']['view_count'])
        item['create_at'] = crawled_at
        item['update_at'] = crawled_at
        item['position'] = rank
        item['title'] = entry['title']
        item['url'] = 'https://36kr.com/p/' + str(entry['id'])
        yield item
def parse(self, response):
    """Yield a ``TopItem`` for each ``li`` in the ``ul.posts`` list.

    The URL is the anchor ``href`` prefixed with ``https://limboy.me``.
    The first ``span`` is parsed as a ``YYYY-MM-DD`` date and the second
    ``span`` text becomes the description.
    """
    now = datetime.now()
    for post in response.xpath('//ul[@class="posts"]/li'):
        title = post.xpath('./a/text()').get()
        url = "https://limboy.me" + post.xpath('./a/@href').get()
        publish_at = datetime.strptime(post.xpath('./span[1]/text()').get(),
                                       "%Y-%m-%d")
        description_content = post.xpath('./span[2]/text()').get()
        yield TopItem(node_id=58,
                      title=title,
                      url=url,
                      publish_at=publish_at,
                      description_content=description_content,
                      create_at=now,
                      update_at=now,
                      position=None)
def parse(self, response):
    """Yield a ranked ``TopItem`` for the first ten ``li`` of ``ul#listhot0``.

    The URL is the anchor ``href`` prefixed with
    ``https://www.thepaper.cn/``; ``position`` is the 1-based list order.
    """
    now = datetime.now()
    top_ten = response.xpath('//ul[@id="listhot0"]/li')[:10]
    for rank, entry in enumerate(top_ten, start=1):
        title = entry.xpath('a/text()').get()
        url = 'https://www.thepaper.cn/' + entry.xpath('a/@href').get()
        yield TopItem(publish_at=now,
                      title=title,
                      url=url,
                      node_id=12,
                      description_content=None,
                      create_at=now,
                      update_at=now,
                      position=rank)
def parse(self, response):
    """Yield a ``TopItem`` for each element of the JSON ``data`` array.

    ``time_into_pool`` is divided by 1,000,000 and truncated to an int
    before being passed to ``datetime.fromtimestamp``.  The URL embeds
    the link ``id``; ``position`` is always ``0``.
    """
    crawled_at = datetime.now()
    for link in json.loads(response.text)['data']:
        epoch_seconds = int(link["time_into_pool"] / 1000000)
        publish_at = datetime.fromtimestamp(epoch_seconds)
        title = link['title']
        url = 'https://m.chouti.com/link/' + str(link['id'])
        yield TopItem(node_id=4,
                      title=title,
                      url=url,
                      publish_at=publish_at,
                      description_content=None,
                      create_at=crawled_at,
                      update_at=crawled_at,
                      position=0)
def parse(self, response):
    """Yield a ``TopItem`` for each element of the JSON ``list`` array.

    ``publish_at`` is ``recommend_to_home_at`` passed to
    ``datetime.fromtimestamp``; the description is the author's
    ``nickname``; ``position`` is always ``0``.
    """
    crawled_at = datetime.now()
    for post in json.loads(response.text)['list']:
        title = post['title']
        url = 'https://sspai.com/post/' + str(post['id'])
        publish_at = datetime.fromtimestamp(post["recommend_to_home_at"])
        description_content = post['author']['nickname']
        yield TopItem(node_id=9,
                      title=title,
                      url=url,
                      publish_at=publish_at,
                      description_content=description_content,
                      create_at=crawled_at,
                      update_at=crawled_at,
                      position=0)
def parse(self, response):
    """Yield a ranked ``TopItem`` for each class-less ``tr`` under ``#wp``.

    The URL is the header-cell anchor ``href`` prefixed with
    ``https://www.52pojie.cn/``; ``position`` is the 1-based row order.
    """
    now = datetime.now()
    rows = response.xpath('//*[@id="wp"]//tr[not(@class)]')
    for rank, row in enumerate(rows, start=1):
        title = row.xpath('./th/a/text()').get()
        url = 'https://www.52pojie.cn/' + row.xpath('./th/a/@href').get()
        yield TopItem(publish_at=now,
                      title=title,
                      url=url,
                      node_id=87,
                      description_content=None,
                      create_at=now,
                      update_at=now,
                      position=rank)
def parse(self, response):
    """Yield a ``TopItem`` for each table row from the second row onward.

    ``position`` is the integer text of the ``span`` in the first cell;
    the description is the integer text of the second cell passed through
    ``int_to_str``.
    """
    now = datetime.now()
    row_path = (
        '//div[contains(@class,"areabg1")]/div[2]/div/div[2]'
        '//tr[position()>=2]')
    for row in response.xpath(row_path):
        title = row.xpath('td[1]/a/text()').get()
        url = row.xpath('td[1]/a/@href').get()
        position = int(row.xpath('td[1]/span/text()').get())
        count = int(row.xpath('td[2]/text()').get())
        description_content = int_to_str(count)
        yield TopItem(publish_at=now,
                      title=title,
                      url=url,
                      node_id=26,
                      description_content=description_content,
                      create_at=now,
                      update_at=now,
                      position=position)
def parse(self, response):
    """Yield a ranked ``TopItem`` for each entry of the JSON ``rankList``.

    Title and URL are ``contentTitle`` and ``shareUrl``; the description
    is ``viewCount`` passed through ``int_to_str``; ``position`` is the
    1-based list order.
    """
    crawled_at = datetime.now()
    ranking = json.loads(response.text)['rankList']
    for rank, entry in enumerate(ranking, start=1):
        title = entry['contentTitle']
        url = entry['shareUrl']
        description_content = int_to_str(entry['viewCount'])
        yield TopItem(node_id=101,
                      title=title,
                      url=url,
                      publish_at=crawled_at,
                      description_content=description_content,
                      create_at=crawled_at,
                      update_at=crawled_at,
                      position=rank)
def parse(self, response):
    """Yield a ranked ``TopItem`` for each ``article.Box-row`` element.

    The title is the heading link's ``href`` without its first character,
    and the URL is the same ``href`` prefixed with ``https://github.com``.
    The description is the text of the first anchor in the second ``div``
    with spaces, newlines and commas removed, followed by `` stars``.
    """
    now = datetime.now()
    rows = response.xpath('//article[@class="Box-row"]')
    for rank, row in enumerate(rows, start=1):
        href = row.xpath('h1/a/@href').get()
        title = href[1:]
        url = 'https://github.com' + href

        counter_text = row.xpath('div[2]/a[1]')[0].xpath('string(.)').get()
        for noise in (' ', '\n', ','):
            counter_text = counter_text.replace(noise, '')
        description_content = counter_text + " stars"

        yield TopItem(node_id=85,
                      title=title,
                      url=url,
                      publish_at=now,
                      description_content=description_content,
                      create_at=now,
                      update_at=now,
                      position=rank)