def parse_details(self, response):
    """Extract title, summary and body paragraphs from an article page.

    Populates a ``CnnMiddleEastArticles`` item with:

    * ``title`` - stripped text of the ``h1._2JPm2UuC56`` heading.
    * ``summary`` - text of the first ``p strong`` node in the article
      container, or of ``p strong span`` when the former is absent.  Newlines
      become spaces and everything up to and including a ``"(CNN)"`` marker
      is dropped.  Empty string when neither node exists.
    * ``article_content`` - one string per ``<p>`` matched by
      ``p:nth-child(n+2)``, with ``\\n``, ``\\r`` and non-breaking spaces
      replaced by plain spaces.

    Args:
        response: The page response; must support ``.css()``.

    Yields:
        The populated item (always exactly one).

    Raises:
        AttributeError: If the page has no title heading, because
            ``extract_first()`` then returns ``None``.
    """
    var = CnnMiddleEastArticles()
    var["title"] = response.css("h1._2JPm2UuC56::text").extract_first().strip()

    # Fetch the raw value first: chaining .replace() directly onto
    # extract_first() raised AttributeError on a missing node, so the
    # fallback selector below could never be reached.
    summary = response.css(
        "div.clearfix.wysiwyg._2A-9LYJ7eK p strong::text").extract_first()
    if summary is None:
        summary = response.css(
            "div.clearfix.wysiwyg._2A-9LYJ7eK p strong span::text"
        ).extract_first()
    summary = (summary or "").replace("\n", " ")

    # Keep only what follows the "(CNN)" marker when it is present.
    _, marker, after_marker = summary.partition("(CNN)")
    if marker:
        summary = after_marker
    var["summary"] = summary

    # One pass instead of three chained replace() sweeps over the list.
    to_space = str.maketrans({"\n": " ", "\r": " ", "\xa0": " "})
    var["article_content"] = [
        "".join(paragraph.xpath('descendant-or-self::text()').extract())
        .translate(to_space)
        for paragraph in response.css(
            "div.clearfix.wysiwyg._2A-9LYJ7eK p:nth-child(n+2)")
    ]
    yield var
def parse_details(self, response):
    """Scrape one article page into a ``CnnMiddleEastArticles`` item.

    The item receives the stripped heading text as ``title``, the joined
    text of every matched body paragraph as ``article_content`` (one string
    per ``<p>``), and the tag labels as ``tags``.

    Args:
        response: The page response; must support ``.css()``.

    Yields:
        The item, but only when it has body content and at least two tags.

    Raises:
        AttributeError: If the heading is missing, since ``extract_first()``
            then returns ``None`` and ``.strip()`` fails.
    """
    article = CnnMiddleEastArticles()

    heading = response.css(
        "div.sna_content_head_cont h1.sna_content_heading::text"
    ).extract_first()
    article["title"] = heading.strip()

    # Join all descendant text nodes so inline markup inside a paragraph
    # does not split it into several entries.
    paragraphs = response.css(
        "div.article-body div#firstBodyDiv > p:nth-child(n+1)")
    article["article_content"] = [
        "".join(paragraph.xpath('descendant-or-self::text()').extract())
        for paragraph in paragraphs
    ]

    article["tags"] = response.css(
        "div.article-tags.noprint div a h2::text").extract()

    # Drop pages without body text or with fewer than two tags.
    if not article["article_content"] or len(article["tags"]) <= 1:
        return
    yield article
def parse_details(self, response):
    """Extract title, body text and tags from an article page.

    The article body is marked up differently from page to page, so a list
    of CSS selectors is tried in order and the first one that produces
    usable text wins.  A selector is skipped when it matches nothing, when
    it yields only a single empty string, or when ``self.clear_input``
    returns an empty result for it.

    Args:
        response: The page response; must support ``.css()``.

    Yields:
        The populated ``CnnMiddleEastArticles`` item, only when both tags
        and article content are non-empty.  When no selector matches, a
        message is printed and nothing is yielded.

    Raises:
        AttributeError: If the page has no title heading, because
            ``extract_first()`` then returns ``None``.
    """
    var = CnnMiddleEastArticles()
    var["title"] = response.css("h1._2JPm2UuC56::text").extract_first().strip()

    # Tried in this order.  Previously the fallbacks sat in ``elif`` branches
    # whose condition repeated the already-failed ``len(temp) != 1`` test and
    # read var["article_content"] before it was set, so none could ever run.
    content_selectors = (
        "div.wysiwyg p:not(div.first-child)::text",
        "div.wysiwyg p:not(:first-child) > strong > span > span > span"
        " > span > span > span > span > span::text",
        "div.wysiwyg p:not(:first-child) > span > span > span > span > span > span > span "
        "> span::text",
        "div.wysiwyg p:not(:first-child)> span > span > span > span > span > "
        "span:nth-child(3) > span > span::text",
        "div.wysiwyg p:not(:first-child)> span > span > span > span > span > "
        "span:nth-child(2) > span > span::text",
        "div.wysiwyg._2A-9LYJ7eK p::text",
    )

    article_content = None
    for selector in content_selectors:
        texts = [text.rstrip() for text in response.css(selector).extract()]
        # A lone "" means the paragraph text lives in nested elements that
        # this selector does not reach; an empty list means no match at all.
        if not texts or texts == [""]:
            continue
        cleaned = self.clear_input(texts)
        if cleaned:
            article_content = cleaned
            break

    if not article_content:
        # Report the gap instead of storing a placeholder string as content,
        # which would have passed the "has content" check below.
        print("you did not cover this case.")
        return

    var["article_content"] = article_content
    var["tags"] = [
        tag.strip()
        for tag in response.css("ul.AsCeVPiOdE li a::text").extract()
    ]
    # Do not save any article that lacks tags or content.
    if var["tags"] and var["article_content"]:
        yield var