def parse(self, response):
    # The feed endpoint returns a JSON array of article records.
    jsonresponse = json.loads(response.text)  # response.text replaces the deprecated body_as_unicode()
    for item in jsonresponse:
        headline = Headline()
        headline["title"] = item["title"]
        headline["title2"] = item["typeAttributes"]["deck"]
        headline["description"] = item["description"]
        headline["url"] = item["typeAttributes"]["url"]
        headline["imgurl"] = item["typeAttributes"]["imageLarge"]
        headline["tags"] = item["typeAttributes"]["urlSlug"].split("-")
        headline["score"] = item["typeAttributes"]["trending"]["numViewers"]
        headline["timestamp"] = item["updatedAt"]
        headline["id"] = item["id"]
        if self.should_get_article(headline["id"]):
            # Fetch the full article body; bypass the cache for fresh stories.
            yield scrapy.Request(url=headline["url"],
                                 meta={"dont_cache": False, "headline": headline},
                                 callback=self.parse_body)
    # Paginate through the first ten feed pages.
    self.page += 1
    if self.page < 10:
        yield scrapy.Request(url=self.HOST.format(self.page),
                             meta={"dont_cache": self.dont_cache})
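# The parse callbacks in this section all fill a Headline container using
# dict-style assignment. A minimal sketch of such a container, assuming it is
# a scrapy.Item: the field list is inferred from the assignments above and
# below, and the real class may differ (some helpers later in this section
# construct Headline with positional arguments, which suggests a plain class
# in those modules).
import scrapy


class Headline(scrapy.Item):
    id = scrapy.Field()
    url = scrapy.Field()
    title = scrapy.Field()
    title2 = scrapy.Field()
    description = scrapy.Field()
    imgurl = scrapy.Field()
    tags = scrapy.Field()
    score = scrapy.Field()
    timestamp = scrapy.Field()
    author = scrapy.Field()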
def parse(self, response):
    posts = response.css("article.post")
    for post in posts:
        headline = Headline()
        headline["title"] = post.css(".entry-title").xpath("a/text()").get()
        # Bug fix: the excerpt must come from this post, not from
        # response.css("article.post"), which always matched the first post.
        headline["title2"] = post.css(".entry-content::text").get().strip()
        headline["url"] = post.css(".entry-title").xpath("a/@href").get()
        # Thumbnails may be lazy-loaded: prefer the real src, fall back to the
        # pm-lazy-src attribute, and skip inline data: URIs.
        imgsrc = post.css("figure.thumbnail").xpath("a/img/@src").get()
        imgsrc2 = post.css("figure.thumbnail").xpath("a/img/@pm-lazy-src").get()
        if imgsrc is not None and "data:" not in imgsrc and "http" in imgsrc:
            headline["imgurl"] = imgsrc
        elif imgsrc2 is not None and "http" in imgsrc2:
            headline["imgurl"] = imgsrc2
        headline["id"] = post.xpath("@data-event-tracking").get().split("|")[-2]
        if self.should_get_article(headline["id"]):
            yield scrapy.Request(url=headline["url"],
                                 meta={"dont_cache": False, "headline": headline},
                                 callback=self.parse_body)
    self.page += 1
    yield scrapy.Request(url=self.HOST.format(self.page),
                         meta={"dont_cache": self.dont_cache})
def parse(self, response):
    # The endpoint returns JSON whose "rendering" field holds raw HTML; wrap
    # it in an HtmlResponse so CSS/XPath selectors can run against it.
    html = json.loads(response.text)["rendering"]
    res = scrapy.http.HtmlResponse(url=response.url, body=html,
                                   encoding="utf-8")
    stories = res.css("div.c-card>a")
    for story in stories:
        url = f"{self.HOST}{story.xpath('@href').get()}"
        story_id = url.split("/")[-2]  # renamed from `id` to avoid shadowing the built-in
        author = story.css("span.c-card__author::text").get()
        title = story.css("div.c-card__hed-text::text").get()
        imgurl = story.css("img.c-image").xpath("@src").get()
        headline = Headline({
            "id": story_id,
            "url": url,
            "author": author,
            "title": title,
            "imgurl": imgurl
        })
        if self.should_get_article(headline["id"]):
            yield scrapy.Request(url=headline["url"],
                                 meta={"dont_cache": False, "headline": headline},
                                 callback=self.parse_body)
    # This feed pages by offset, ten stories at a time, up to 80.
    self.last_id += 10
    if self.last_id <= 80:
        yield scrapy.Request(url=self.FETCH_HOST.format(self.last_id),
                             meta={"dont_cache": self.dont_cache})
def get_headlines(url):
    """
    Gets headlines from http://www.newsapi.org

    :param url: NewsAPI endpoint URL to fetch
    :return: list of headlines
    """
    headlines = []
    req = urllib.request.Request(url)
    response = urllib.request.urlopen(req).read().decode('utf8')
    r = json.loads(response)
    # Articles missing a publication date inherit the previous article's date,
    # seeded with today's date.
    prev_published_at = str(datetime.now()).split(" ")[0]
    for article in r['articles']:  # renamed from `re`, which shadowed the re module
        if str(article['publishedAt']) == 'None':
            published_at = prev_published_at
        else:
            published_at = str(article['publishedAt']).split('T')[0]
        prev_published_at = published_at
        h = Headline(article['title'].split('\n')[0], article['url'],
                     article['source']['id'] or article['source']['name'],
                     published_at, article['urlToImage'])
        headlines.append(h)
    return headlines
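# A minimal usage sketch for get_headlines(); the endpoint, source, and
# YOUR_API_KEY placeholder below are assumptions -- NewsAPI requires an
# apiKey query parameter on every request.
if __name__ == "__main__":
    url = ("https://newsapi.org/v2/top-headlines"
           "?sources=bbc-news&apiKey=YOUR_API_KEY")
    for h in get_headlines(url):
        print(h)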
def classify_headlines(headlines, classifier):
    # Return the fraction of headlines the classifier labels as clickbait.
    features = [bait_features(Headline(headline)) for headline in headlines]
    label_list = [classifier.classify(feat) for feat in features]
    bait_count = label_list.count('bait')
    return bait_count / len(label_list)
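# Usage sketch for classify_headlines(); `train_set` and the bait_features()
# feature extractor are assumed to be defined elsewhere in the project, and
# NaiveBayesClassifier is one plausible choice of NLTK classifier:
#
#     import nltk
#     classifier = nltk.NaiveBayesClassifier.train(train_set)
#     ratio = classify_headlines(["You won't believe what happened next",
#                                 "Central bank raises interest rates"],
#                                classifier)
#     print(f"{ratio:.0%} of headlines classified as clickbait")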
def parse(self, response):
    stories = response.css("div.story")
    for story in stories:
        headline = Headline()
        article = story.css("article")
        a = story.css("h3.story-h").xpath("a")
        headline["url"] = a.xpath("@href").get()
        headline["title"] = a.xpath("text()").get()
        headline["title2"] = article.css("div.story-txt").css("p::text").get()
        headline["id"] = story.xpath("@data-post_id").get()
        headline["imgurl"] = article.css("img.story-img").xpath("@src").get()
        self.last_id = int(headline["id"])
        if self.should_get_article(headline["id"]):
            yield scrapy.Request(url=headline["url"],
                                 meta={"dont_cache": False, "headline": headline},
                                 callback=self.parse_body)
    url = self.HOST.format(self.last_id)
    yield scrapy.Request(url=url, meta={"dont_cache": self.dont_cache})
def parse(self, response):
    stories = response.css("article.post")
    for story in stories:
        headline = Headline()
        headline["url"] = story.css(".entry-title>a").xpath("@href").get()
        headline["title"] = story.css(".entry-title>a::text").get()
        data = json.loads(story.xpath("@data-evt-val").get())
        # Fall back to the URL slug when the tracking payload carries no id.
        headline["id"] = (data["story"]["id"] if data["story"]["id"]
                          else headline["url"].split("/")[-1])
        headline["imgurl"] = story.css(
            "img.attachment-post-thumbnail").xpath("@src").get()
        if self.should_get_article(headline["id"]):
            yield scrapy.Request(url=headline["url"],
                                 meta={"dont_cache": False, "headline": headline},
                                 callback=self.parse_body)
def extract_headlines(self):
    """
    Gather all headline information from the passed html code and return
    them as a list.

    :return: list of headlines
    """
    self.headlines.clear()
    result_block = self.raw_html.find_all('div', attrs={'class': 'g'})
    for result in result_block:
        link = result.find('a', href=True)
        title = result.find('h3')
        description = result.find('span', attrs={'class': 'st'})
        if link and title:
            link = link['href']
            title = title.get_text()
            if description:
                description = description.get_text()
            if link != '#':
                self.headlines.append(Headline(title, link, description))
    return self.headlines
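# extract_headlines() assumes self.raw_html is an already-parsed
# BeautifulSoup tree (it calls find_all/find on it), built from a search
# results page whose hits sit in div.g blocks. A sketch of that setup;
# everything here except the raw_html attribute name is an assumption:
#
#     from bs4 import BeautifulSoup
#     self.raw_html = BeautifulSoup(html_text, 'html.parser')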
def parse(self, response):
    stories = response.css(".story")
    for story in stories:
        headline = Headline()
        a = story.xpath("div[@class='story__body']/span/span/a")
        url = f"https://www.thestar.com{a.xpath('@href').get()}"
        title = a.xpath("span[@class='story__headline']/text()").get()
        title2 = a.xpath("p[@class='story__abstract']/text()").get()
        headline["url"] = url
        headline["title"] = title
        headline["title2"] = title2
        headline["id"] = url.split("/")[-1].split(".")[0]
        if self.should_get_article(headline["id"]):
            yield scrapy.Request(url=headline["url"],
                                 meta={"dont_cache": False, "headline": headline},
                                 callback=self.parse_body)
def parse(self, response):
    stories = response.css("article.post")
    for story in stories:
        headline = Headline()
        headline["url"] = story.css("div.row>div.text>header>a").xpath(
            "@href").get()
        headline["title"] = story.css("div.row>div.text>header>a").xpath(
            "@title").get()
        headline["title2"] = story.css(
            "div.row>div.text>header>a>div.excerpt>p::text").get()
        headline["id"] = story.xpath("@id").get().split("-")[-1]
        headline["imgurl"] = story.css("img").xpath("@data-src").get()
        if self.should_get_article(headline["id"]):
            yield scrapy.Request(url=headline["url"],
                                 meta={"dont_cache": False, "headline": headline},
                                 callback=self.parse_body)
    self.page += 1
    if self.page <= 10:
        yield scrapy.Request(url=self.HOST.format(self.page),
                             meta={"dont_cache": self.dont_cache})
def create_headlines():
    # Load pickled 2-tuples, wrap each in a Headline object, and shuffle.
    with open('headlines.p', 'rb') as f:  # context manager so the file is closed
        headline_tuples = pickle.load(f)
    headlines = [Headline(h[0], h[1]) for h in headline_tuples]
    random.shuffle(headlines)
    return headlines
def create_predictable_list(headline_string):
    # Wrap a single headline string so it can be fed to the model as a
    # one-row feature list.
    predictor_as_object = Headline(headline_string, "none")
    predictable_values = [create_x_vals(predictor_as_object)]
    return predictable_values
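# Usage sketch tying the two helpers above together; the `model` object and
# its predict() interface are assumptions, and create_x_vals() is assumed to
# be the project's feature extractor:
#
#     headlines = create_headlines()  # shuffled corpus from headlines.p
#     X = create_predictable_list("Scientists hate this one weird trick")
#     print(model.predict(X))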