Example #1
 def parse_details(self, response):
     item = ArticlesItem()
     # default to "" so .strip() is safe when a selector matches nothing
     item["title"] = response.css("h1.sna_content_heading::text").extract_first(default="").strip()
     item["article_summary"] = response.css("span.article-summary::text").extract_first(default="").strip()
     item["article_content"] = [i.strip() for i in response.css("div.article-body p::text").extract()]
     item["tags"] = [i.strip() for i in response.css("div.article-tags h2.tags::text").extract()]
     yield item
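All of the snippets on this page rely on an ArticlesItem class declared in each project's items.py, which none of them show. A minimal sketch consistent with the fields used in Example #1 (only the field names come from the snippet above; everything else is assumed, and the other examples clearly use their own variants with fields such as Title, Authors, file_urls, or name):

    # items.py -- hypothetical sketch, not any original project's definition
    import scrapy

    class ArticlesItem(scrapy.Item):
        title = scrapy.Field()
        article_summary = scrapy.Field()
        article_content = scrapy.Field()
        tags = scrapy.Field()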
Example #2
    def parse_abstract_page(self, response):
        # first check abstract using @class="abstract"
        abstract = response.xpath(
            '//p[@class="abstract"]/text()').extract_first()
        if abstract is None:
            # extract abstract
            abstract = response.xpath('//div[@id="content"]/text()').extract()
            # remove white spaces
            abstract = list(map(lambda x: x.strip(), abstract))
            # keep only fragments longer than one character
            abstract = '.'.join([item for item in abstract if len(item) > 1])
        else:
            abstract = abstract.strip()

        # create object ArticlesItem and initialize its attributes
        item = ArticlesItem()
        item['Title'] = response.meta['title']
        item['Authors'] = response.meta['authors']
        item['Year'] = response.meta['year']
        item['Volume'] = response.meta['volume']
        item['Pdf_url'] = response.meta['pdf_url']
        item['Abstract'] = abstract
        item['Journal_Conference'] = "JMLR"

        yield item
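Example #2 reads most of its fields from response.meta, so an earlier callback must have attached them when scheduling the request. That callback is not shown; a plausible sketch (the method name and selector paths are assumptions) would be:

    # hypothetical listing-page callback that feeds parse_abstract_page above
    def parse_volume_page(self, response):
        for paper in response.xpath('//dl/dt'):
            yield scrapy.Request(
                url=response.urljoin(paper.xpath('.//a/@href').extract_first()),
                callback=self.parse_abstract_page,
                meta={
                    'title': paper.xpath('text()').extract_first(),
                    'authors': paper.xpath('following-sibling::dd[1]/text()').extract_first(),
                    'year': response.meta.get('year'),
                    'volume': response.meta.get('volume'),
                    'pdf_url': paper.xpath('.//a[contains(@href, "pdf")]/@href').extract_first(),
                },
            )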
Example #3
 def parse_details(self, response):
     article = ArticlesItem()
     my_article = ""
     # concatenate every paragraph of the main content into one string
     for i in response.css("div#readspeaker_maincontent p::text").extract():
         my_article += i
     article["article_content"] = my_article.replace("\n", "")
     article["tags"] = "صحة"  # hard-coded tag, Arabic for "health"
     yield article
Example #4
	def parse(self, response):
		# HtmlXPathSelector is deprecated; modern Scrapy exposes response.xpath directly
		items = []
		for title_div in response.xpath("//div[@class='c-article-title']"):
			item = ArticlesItem()
			item["article_title"] = title_div.xpath("a/text()").extract()
			items.append(item)
		return items
Example #5
    def parse(self, response):
        # if response.status == 200:
        #     print("download URL parsed successfully")
        # else:
        #     print("failed to parse the download URL")

        item = ArticlesItem()
        url = response.url
        item['file_urls'] = [url]
        yield item
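The file_urls field yielded in Example #5 is the field Scrapy's stock FilesPipeline watches for. For the item to actually trigger a download, the project settings must enable that pipeline, roughly like this (the storage path is an assumption):

    # settings.py -- enable the built-in FilesPipeline
    ITEM_PIPELINES = {
        'scrapy.pipelines.files.FilesPipeline': 1,
    }
    FILES_STORE = '/tmp/article_files'  # assumed path; point it wherever files should land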
Example #6
	def parse_article_page(self,response):
		# create object ArticlesItem
		item = ArticlesItem()
		item['Title'] = response.xpath('//h2[@class="subtitle"]/text()').extract_first()
		item['Authors'] = ','.join(response.xpath('//li[@class="author"]/a/text()').extract())
		item['Year'] = response.meta['year']
		item['Volume'] = '-'
		item['Abstract'] = response.xpath('//p[@class="abstract"]/text()').extract_first(default='').strip()
		item['Pdf_url'] = 'https://papers.nips.cc' + response.xpath('//div[@class="main wrapper clearfix"]/a/@href').extract_first()
		item['Journal_Conference'] = "NIPS"
		
		yield item
Example #7
 def parse_details(self, response):
     article = ArticlesItem()
     my_article = ""
     article["title"] = response.css("h1.dmi-title::text").extract_first()
     # keep only paragraphs long enough to be real body text
     for i in response.css("div.dmi-entry-content p::text").extract():
         if len(i) > 20:
             my_article += i
     article["article_content"] = my_article.replace("\xa0", "")
     article["tags"] = "صحة"  # hard-coded tag, Arabic for "health"
     if len(article["article_content"]) > 20:
         yield article
Example #8
 def parse_details(self, response):
     article = ArticlesItem()
     my_article = ""
     article["title"] = response.css("h1.ft-ptitle::text").extract_first()
     # keep only paragraphs long enough to be real body text
     for i in response.css("section.ft-entry p::text").extract():
         if len(i) > 20:
             my_article += i
     article["article_content"] = my_article.replace("\xa0", "")
     article["tags"] = [
         i.strip() for i in response.css("div.ft-ptags a::text").extract()
     ]
     if len(article["article_content"]) > 20 and len(article["tags"]) >= 1:
         yield article
Example #9
    def parse_details(self, response):
        article = ArticlesItem()
        list_content = []
        article["title"] = response.css("h1.post-title.entry-title::text").extract_first().strip()
        # collect the full text of each paragraph, nested tags included
        for i in response.css("div.entry-content p"):
            list_content.append("".join(i.xpath('descendant-or-self::text()').extract()))
        # normalise whitespace once, after the loop, and drop empty strings
        clear_line_list = [i.replace("\n", " ") for i in list_content]
        clear_space_list = [i.replace("\xa0", "") for i in clear_line_list]
        final_output = list(filter(None, clear_space_list))

        article["article_content"] = final_output
        article["tags"] = response.css("span.tagcloud a::text").extract()
        if len(article["tags"]) >= 2 and article["article_content"]:
            yield article
Example #10
 def parse_details(self, response):
     article = ArticlesItem()
     list_content = []
     article["title"] = response.css("div.article h1.heading::text").extract_first()
     # skip the first and last <p>; they are not part of the body text
     for i in response.css("div.text.js-text.js-mediator-article p:not(:first-child):not(:last-child)"):
         list_content.append("".join(i.xpath('descendant-or-self::text()').extract()))
     # normalise whitespace once, after the loop, and drop empty strings
     clear_line_list = [i.replace("\n", " ") for i in list_content]
     clear_space_list = [i.replace("\xa0", "") for i in clear_line_list]
     final_output = list(filter(None, clear_space_list))
     if final_output:
         del final_output[-1]  # drop the trailing reference paragraph
     article["article_content"] = final_output
     article["tags"] = response.css("div.news-tags.news-tags_article a::text").extract()
     if article["article_content"] and len(article["tags"]) > 1:
         yield article
Example #11
    def parse(self, response):
        items = []
        sites = response.css('.manual-list')
        for dl in sites.css('dl.manual-item-standard'):
            book = dl.css('.recommend-book')
            item = ArticlesItem()
            item['name'] = dl.css('.name::text').extract_first().strip()
            # the label is the last path segment of the book's link
            item['label'] = book.css('::attr(href)').extract_first().split('/')[-1]
            item['cover_url'] = book.css('img::attr(src)').extract_first().strip()
            item['author'] = 'bookstack'
            item['description'] = ''
            item['tags'] = []
            items.append(item)

        return items
Example #12
    def parse_details(self, response):
        article = ArticlesItem()
        list_content = []

        article["title"] = response.css(
            "div.articleHeader h1::text").extract_first().strip()
        for i in response.css("div#articleBody p"):
            list_content.append("".join(
                i.xpath('descendant-or-self::text()').extract()))
        # clean the collected paragraphs in one pass, then drop empty strings
        clear_line_list = self.clean_articles(list_content)
        final_output = list(filter(None, clear_line_list))
        article["article_content"] = final_output
        article["tags"] = response.css("div.tags h3 a::text").extract()

        if article["article_content"] and len(article["tags"]) > 1:
            yield article
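Example #12 delegates text cleanup to a clean_articles helper defined elsewhere in the spider. Judging by the inline cleanup in Examples #9 and #10, a hypothetical implementation might be:

    # hypothetical helper -- the real clean_articles is not shown in Example #12
    def clean_articles(self, paragraphs):
        # normalise newlines and non-breaking spaces, as the sibling examples do inline
        return [p.replace("\n", " ").replace("\xa0", "") for p in paragraphs]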
Example #13
    def parse_details(self, response):
        var = ArticlesItem()
        list_content = []
        var["title"] = response.css(
            "h1._2JPm2UuC56::text").extract_first().strip()
        # skip the first paragraph; collect the rest, nested tags included
        for i in response.css(
                "div.clearfix.wysiwyg._2A-9LYJ7eK p:nth-child(n+2)"):
            list_content.append("".join(
                i.xpath('descendant-or-self::text()').extract()))
        final_output = [i.replace("\n", " ") for i in list_content]

        var["article_content"] = final_output
        var["tags"] = [
            i.strip()
            for i in response.css("ul.AsCeVPiOdE li a::text").extract()
        ]
        # do not save an article that lacks either tags or content
        if len(var["tags"]) >= 1 and var["article_content"]:
            yield var
Example #14
	def parse(self, response):
		soup = BeautifulSoup(response.body, 'html.parser')

		print('crawled', response.url)

		# initialise author up front so it is defined even when no
		# domain in domain_router matches the current URL
		author = ''
		for domain, attrs in self.domain_router.items():
			if domain in response.url:
				attrs = json.loads(attrs)             # e.g. attrs={"href": "/users/.+"}
				for key, value in attrs.items():
					attrs[key] = re.compile(value)    # compile each value into a regex
				try:
					author = soup.find_all(attrs=attrs)[0].text.strip()
				except (IndexError, AttributeError):
					author = ''
				break

		'''
		print 'crawled',response.url
		if soup.select('.author_name'):
			author = soup.select('.author_name')[0].text.strip()  #class='author_name'
			rule = 1
		elif soup.select('.author'):
			author = soup.select('.author')[0].text.strip()
			rule = 2
		elif soup.select('.author-link'):    
			author = soup.select('.author-link')[0].text.strip()
			rule = 3
		elif soup.select('.byline__author'):
			author = soup.select('.byline__author')[0].text.strip()
			rule = 4
		elif soup.find_all(href=re.compile("/users/.+")):
			author = soup.find_all(href=re.compile("/users/.+"))[0].text.strip()
			rule = 5
		elif soup.select('.cat_desc'):
			author = soup.select('.cat_desc')[0].text.strip()
			rule = 6
		elif soup.select('.stat-author'):
			author = soup.select('.stat-author')[0].text.strip()
			rule = 7
		elif soup.select('.js-authors-list'):
			author = soup.select('.js-authors-list')[0].text
			rule = 12		
		elif soup.find_all(rel="author"):
			author = soup.find_all(rel="author")[0].text.strip()
			rule = 8
		elif soup.find_all(href=re.compile("/user/.+")):
			author = soup.find_all(href=re.compile("/user/.+"))[0].text.strip()
			rule = 9
		elif soup.select('.fn'):
			author = soup.select('.fn')[0].text.strip()
			rule = 10
		else:
			author = ''
			rule = 11
			#error_articles.append('{}, '.format(response.url))
		'''
		
		item = ArticlesItem()
		item['url'] = response.url
		item['author'] = author
		return item
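Example #14 drives its author extraction from a domain_router attribute whose values are JSON-encoded BeautifulSoup attribute filters (the inline comment shows the shape). A hypothetical configuration matching that format:

	# hypothetical spider attribute -- keys are domain fragments,
	# values are JSON strings whose values get compiled to regexes in parse()
	domain_router = {
		'stackoverflow.com': '{"href": "/users/.+"}',
		'example-news.com': '{"class": "author_name"}',
	}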