def parse(self, response):
    """Scrape repository name and last-update time from the listing page,
    follow each repository's own page (parse_info adds commit/branch/release
    counts via the item carried in request meta), then follow pagination."""
    for repo in response.css('div#user-repositories-list li'):
        item = ShiyanlouItem()
        item['name'] = repo.css(
            'div.d-inline-block.mb-1 h3 a::text').re_first('\S+')
        item['update_time'] = repo.xpath(
            './/div[@class="f6 text-gray mt-2"]/relative-time/@datetime'
        ).extract_first()
        # Follow the repository's detail page; pass the half-filled item
        # along in meta so parse_info can finish it.
        request = response.follow(
            repo.css('div.d-inline-block.mb-1 h3 a::attr(href)')[0],
            callback=self.parse_info)
        request.meta['item'] = item
        yield request

    # Locate the pagination anchor labelled 'Next'; on the last page no
    # such anchor exists and next_page remains None.
    next_page = None
    hrefs = response.css('div.pagination a::attr(href)').extract()
    labels = response.css('div.pagination a::text').extract()
    for href, label in zip(hrefs, labels):
        if label == 'Next':
            next_page = href
    if next_page:
        # Queue the following listing page through the same callback.
        yield response.follow(next_page, callback=self.parse)
def parse(self, response):
    """Yield one item per repository row: its name and last-update time."""
    for repo in response.css('li.col-12'):
        yield ShiyanlouItem({
            'name': repo.css(
                'div.d-inline-block a::text').re_first('\s*([-\w]+)\s*'),
            'update_time': repo.css(
                'relative-time::attr(datetime)').extract_first(),
        })
def parse(self, response):
    """Yield repo_name/update_time items for each owned repository,
    then queue the next listing page while one exists."""
    for repo in response.xpath('//li[@itemprop="owns"]'):
        item = ShiyanlouItem()
        item["repo_name"] = repo.xpath(
            ".//a[@itemprop='name codeRepository']/text()"
        ).extract_first().strip()
        item["update_time"] = repo.xpath(
            ".//relative-time/@datetime").extract_first()
        yield item

    # The last pagination anchor reads 'Next' on every page except the final
    # one, where it is rendered differently — so this check stops the crawl.
    last_anchor_text = response.css(
        "div.paginate-container a:last-child::text")
    if last_anchor_text.extract_first() == "Next":
        next_url = response.css(
            "div.paginate-container a:last-child::attr(href)"
        ).extract_first()
        yield response.follow(next_url, callback=self.parse)
def parse_item(self, response):
    """Yield one item per repository row (name + last-update time).

    Fix: the original constructed a single ShiyanlouItem *before* the loop
    and mutated/re-yielded that same object on every iteration, so every
    yielded item shared (and overwrote) the same state.  A fresh item is
    now created per row.
    """
    for row in response.css('li.col-12'):
        item = ShiyanlouItem()
        item['repo_name'] = row.xpath(
            './/h3/a/text()').extract_first().strip()
        item['update_time'] = row.xpath(
            './/relative-time/@datetime').extract_first()
        yield item
def parse(self, response):
    """Yield a name/update-time item for every repository entry."""
    for entry in response.css('li.col-12'):
        name = entry.css('a::text').extract_first().strip()
        updated = entry.css('relative-time::attr(datetime)').extract_first()
        yield ShiyanlouItem({'name': name, 'update_time': updated})
def parse(self, response):
    """Yield repo_name/update_time items for each owned repository."""
    for repo in response.xpath('//li[@itemprop="owns"]'):
        name = repo.xpath(
            ".//a[@itemprop='name codeRepository']/text()"
        ).extract_first().strip()
        updated = repo.xpath(".//relative-time/@datetime").extract_first()
        item = ShiyanlouItem()
        item['repo_name'] = name
        item['update_time'] = updated
        yield item
def parse(self, response):
    """Yield name/update_time items for each repository row.

    Fix: the original ran two independent page-wide queries (one for all
    names, one for all datetimes) and zip()-ed the lists, which silently
    misaligns or drops rows whenever one selector matches fewer nodes than
    the other.  Both fields are now extracted from the same <li> node so
    they can never get out of step.
    """
    for row in response.xpath('//*[@id="user-repositories-list"]/ul/li'):
        name = row.xpath('.//h3/a/text()').re_first(r'\s*(.+)')
        update_time = row.xpath(
            './/relative-time/@datetime').extract_first()
        yield ShiyanlouItem({
            'name': name,
            'update_time': update_time,
        })
def parse(self, response):
    """Yield one item per repository with its name and update time."""
    for entry in response.css('li.col-12'):
        item = ShiyanlouItem()
        item['name'] = entry.xpath(
            ".//h3/a/text()").extract_first().strip()
        item['update_time'] = entry.xpath(
            './/relative-time/@datetime').extract_first()
        yield item
def parse(self, response):
    """Start an item per repository row, then follow the repository's own
    page so parse_info can complete it (item travels in request meta)."""
    for row in response.css('li.col-12'):
        item = ShiyanlouItem()
        item['name'] = row.css(
            'div.d-inline-block.mb-1 a::text').re_first('[^\s]+[\w]*')
        item['update_time'] = row.xpath(
            './/relative-time/@datetime').extract_first()
        repo_url = response.urljoin(row.xpath('.//a/@href').extract_first())
        request = scrapy.Request(repo_url, callback=self.parse_info)
        request.meta['item'] = item
        yield request
def parse(self, response):
    """Yield name/update_time items for each public repository."""
    for repo in response.css('li.public'):
        name = repo.xpath(
            './/a[@itemprop="name codeRepository"]/text()'
        ).re_first("\n\s*(.*)")
        updated = repo.xpath('.//relative-time/@datetime').extract_first()
        yield ShiyanlouItem({'name': name, 'update_time': updated})
def parse(self, response):
    """Yield name/update_time items for each repository entry."""
    for entry in response.css('li.col-12'):
        item = ShiyanlouItem()
        item['name'] = entry.css(
            'div.d-inline-block.mb-1 a::text').re_first('[^\s]+[\w]*')
        item['update_time'] = entry.xpath(
            './/relative-time/@datetime').extract_first()
        yield item
def parse(self, response):
    """Yield repo_name/update_time items for each repository list entry."""
    entries = response.xpath('//*[@id="user-repositories-list"]/ul/li')
    for entry in entries:
        name = entry.xpath(
            './/a[@itemprop="name codeRepository"]/text()'
        ).extract_first().strip()
        # First datetime attribute anywhere under the row is the update stamp.
        updated = entry.xpath('.//@datetime').extract_first().strip()
        item = ShiyanlouItem()
        item['repo_name'] = name
        item['update_time'] = updated
        yield item
def parse(self, response):
    """Start an item per repository row, then follow the repo page so
    parse_otherinfo can fill the remaining fields (item in request meta)."""
    for row in response.css('#user-repositories-list ul li'):
        item = ShiyanlouItem()
        item['name'] = row.xpath(
            './/div[1]/h3/a/text()').re_first(r'\n\s*([\w_-]*)')
        item['update_time'] = row.xpath(
            './/div[3]/relative-time/@datetime').extract_first()
        detail_url = response.urljoin(
            row.xpath('.//div[1]/h3/a/@href').extract_first())
        request = scrapy.Request(detail_url, callback=self.parse_otherinfo)
        request.meta['item'] = item
        yield request
def parse(self, response):
    """Begin an item per public repository, then follow its own page so
    parse_repo can complete it (item carried in request meta)."""
    for repo in response.css('li.public'):
        item = ShiyanlouItem()
        item['name'] = repo.xpath(
            './/a[@itemprop="name codeRepository"]/text()'
        ).re_first('\n\s*(.*)')
        item['update_time'] = repo.xpath(
            './/relative-time/@datetime').extract_first()
        detail_url = response.urljoin(
            repo.xpath('.//a/@href').extract_first())
        request = scrapy.Request(detail_url, callback=self.parse_repo)
        request.meta['item'] = item
        yield request
def parse(self, response):
    """Yield repo_name/update_time items, then follow the 'Next' page link.

    Fix: the update time was obtained by regex-matching the serialized
    '<relative-time datetime="' markup and truncating to exactly 20
    characters, which breaks if the attribute order or timestamp length
    ever changes.  The @datetime attribute is now read directly.
    """
    for repo in response.xpath('//div[@class="col-9 d-inline-block"]'):
        yield ShiyanlouItem({
            'repo_name': repo.xpath('.//h3/a/text()').re_first(r'\n\s*(.+)'),
            'update_time': repo.xpath(
                './/relative-time/@datetime').extract_first(),
        })

    # The selector matches only when a 'Next' anchor exists, so the loop
    # body is simply skipped on the last page.
    for next_link in response.xpath(
            '//div[@class="pagination"]/a[contains(text(),"Next")]'):
        yield response.follow(next_link, callback=self.parse)
def parse(self, response):
    """Yield name/update_time items for each public source repository."""
    row_selector = (
        'li[class="col-12 d-block width-full py-4 border-bottom public source"]'
    )
    for row in response.css(row_selector):
        item = ShiyanlouItem()
        item['name'] = row.css(
            'div[class="d-inline-block mb-1"] a::text'
        ).re_first('[^\w]*(\w*)[^\w]*')
        item['update_time'] = row.css(
            'div[class="f6 text-gray mt-2"] relative-time::attr(datetime)'
        ).extract_first()
        yield item
def parse(self, response):
    """Start an item per public repo, then follow the repo page so
    parse_repo can finish populating it (item passed via request meta)."""
    for repo in response.css('li.public'):
        item = ShiyanlouItem()
        item['name'] = repo.css(
            'div[class="d-inline-block mb-1"] a::text'
        ).re_first('[^\w]*(\w*)[^\w]*')
        item['update_time'] = repo.css(
            'div[class="f6 text-gray mt-2"] relative-time::attr(datetime)'
        ).extract_first()
        detail_url = response.urljoin(
            repo.xpath('.//a/@href').extract_first())
        request = scrapy.Request(detail_url, callback=self.parse_repo)
        request.meta['item'] = item
        yield request
def parse(self, response):
    """Yield course items: name, description, type and student count."""
    for course in response.css('div.course-body'):
        item = ShiyanlouItem()
        item['name'] = course.css('div.course-name::text').extract_first()
        item['description'] = course.css(
            'div.course-desc::text').extract_first()
        # Courses without a pull-right badge fall back to the 'mf' type.
        item['type'] = course.css(
            'div.course-footer span.pull-right::text'
        ).extract_first(default='mf')
        item['students'] = course.xpath(
            './/span[contains(@class, "pull-left")]/text()[2]'
        ).re_first('[^\d]*(\d*)[^\d]*')
        yield item
def parse(self, response):
    """Start an item per repository row, then follow the repository page
    so parse_repo can complete it (item rides in request meta)."""
    for row in response.css('li.col-12'):
        item = ShiyanlouItem()
        item['name'] = row.xpath(
            './/div[@class="d-inline-block mb-1"]/h3/a/text()'
        ).re_first(r'\s*(.+)')
        item['update_time'] = row.xpath(
            './/div[@class="f6 text-gray mt-2"]/relative-time/@datetime'
        ).extract_first()
        href = row.xpath(
            './/div[@class="d-inline-block mb-1"]/h3/a/@href'
        ).extract_first()
        request = scrapy.Request(
            response.urljoin(href), callback=self.parse_repo)
        request.meta['item'] = item
        yield request
def parse(self, response):
    """Yield name/update_time items for every repository row.

    Fix: the original looped a hard-coded index 1..30, rebuilding an
    absolute XPath for each index — silently missing any rows beyond 30
    and re-querying the whole document per index.  Iterating the <li>
    nodes directly handles any row count with one query.
    """
    for row in response.xpath('//*[@id="user-repositories-list"]/ul/li'):
        name = "".join(
            row.xpath('./div[1]/h3/a/text()').re('(.+)')).strip()
        update_time = "".join(
            row.xpath('./div[3]/relative-time').re('datetime="(.+)"'))
        yield ShiyanlouItem({
            'name': name,
            'update_time': update_time,
        })
def parse(self, response):
    """Yield a partially-filled item per public repo, follow each repo page
    (parse_content completes the item via request meta), then paginate.

    Fix: when the page carries no pagination block at all, next_url is
    None and response.follow(None) raises — a None-guard is added.
    """
    for course in response.css('li.public'):
        item = ShiyanlouItem()
        item['name'] = course.xpath(
            './/a[@itemprop="name codeRepository"]/text()').re_first(
                r'\n\s*(.*)')
        item['update_time'] = course.xpath(
            './/relative-time/@datetime').extract_first()
        course_url = response.urljoin(
            course.xpath('.//a/@href').extract_first())
        request = scrapy.Request(course_url, callback=self.parse_content)
        request.meta['item'] = item
        yield request

    # 'Next' is rendered as a disabled <span> on the last page; anything
    # else means there is a further page to fetch.
    spans = response.css('div.pagination span.disabled::text')
    if len(spans) == 0 or spans[-1].extract() != 'Next':
        next_url = response.css(
            'div.pagination a:last-child::attr(href)').extract_first()
        if next_url:  # guard: single-page results have no pagination links
            yield response.follow(next_url, callback=self.parse)
def parse(self, response):
    """Yield repo_name/update_time items, then queue the next page.

    Fix: on a result set with no pagination block at all, next_url comes
    back None and response.follow(None) raises — a None-guard is added.
    """
    for repo in response.xpath('//li[@itemprop="owns"]'):
        item = ShiyanlouItem()
        item['repo_name'] = repo.xpath(
            ".//a[@itemprop='name codeRepository']/text()").extract_first(
            ).strip()
        item['update_time'] = repo.xpath(
            ".//relative-time/@datetime").extract_first()
        yield item

    # If the 'Next' button is not the disabled span, a next page exists.
    spans = response.css('div.pagination span.disabled::text')
    if len(spans) == 0 or spans[-1].extract() != 'Next':
        next_url = response.css(
            'div.pagination a:last-child::attr(href)').extract_first()
        if next_url:  # guard: single-page results have no pagination links
            yield response.follow(next_url, callback=self.parse)
def parse(self, response):
    """Start an item per public repo and follow its page for more info
    (parse_more completes the item carried in request meta).

    Fixes: the repo URL was looked up with a separate page-wide XPath and
    a manually-kept counter `i`, which desynchronizes whenever the two
    selectors match different node sets — the href is now taken from the
    current row.  A leftover debug print(response) was also removed.
    """
    for course in response.css(
            'li.col-12.d-block.width-full.py-4.border-bottom.public'):
        item = ShiyanlouItem({
            'name':
            course.css('div[class="d-inline-block mb-1"] h3 a::text'
                       ).re_first('\s*(\w+)'),
            'update_time':
            course.css('div[class="f6 text-gray mt-2"] ::attr(datetime)'
                       ).extract_first()
        })
        # Build the absolute URL from the row's own anchor.
        github_url = 'https://github.com/' + course.xpath(
            './/h3/a/@href').extract_first()
        request = scrapy.Request(github_url, callback=self.parse_more)
        request.meta['item'] = item
        yield request
def parse(self, response):
    """Yield repo items, then decide which pagination link to follow.

    Fix: the original called sys.exit() when it detected the last page,
    which kills the entire crawler process mid-run (and left `url`
    unbound on that path).  It now simply returns, letting Scrapy drain
    its remaining requests cleanly.
    """
    for repo in response.xpath('//li[@itemprop="owns"]'):
        item = ShiyanlouItem()
        item['repo_name'] = repo.xpath(
            './/div/h3/a/text()').re_first('\S+')
        item['update_time'] = repo.xpath(
            './/div[3]/relative-time/@datetime').re_first('\S+')
        yield item

    # The disabled pagination <span> tells us where we are:
    #   'Previous' disabled -> first page: the first <a> is the Next link;
    #   'Next' disabled     -> last page: stop paginating;
    #   otherwise           -> middle page: the second <a> is the Next link.
    disabled_label = response.xpath(
        './/div[@class="pagination"]/span[@class="disabled"]/text()'
    ).re_first('\S+')
    if disabled_label == 'Previous':
        url = response.xpath(
            './/div[@class="pagination"]/a/@href').extract_first()
    elif disabled_label == 'Next':
        return  # last page reached; do NOT sys.exit() the whole process
    else:
        url = response.xpath(
            './/div[@class="pagination"]/a[2]/@href').extract_first()
    yield Request(url, callback=self.parse)
def parse(self, response):
    """Yield repo_name/update_time items for each owned repository.

    (Earlier experimental selector variants that were kept commented out
    have been removed; only the working per-<li> extraction remains.)
    """
    for repo in response.xpath('//li[@itemprop="owns"]'):
        name = repo.xpath('./div/div/h3/a/text()').extract_first().strip()
        updated = repo.xpath(
            './div/div/relative-time/@datetime').extract_first()
        item = ShiyanlouItem()
        item['repo_name'] = name
        item['update_time'] = updated
        yield item
def parse(self, response):
    """Yield a name/update-time item per repository row."""
    for row in response.css('li.col-12'):
        item = ShiyanlouItem()
        item['name'] = row.xpath(
            './/a[@itemprop="name codeRepository"]/text()'
        ).re_first("\n\s*(.*)")
        item['update_time'] = row.xpath(
            './/relative-time/@datetime').extract_first()
        yield item