Example #1
 def parse(self, response):
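     # The endpoint returns a JSON list of stories: build a Headline per entry,
     # follow each article URL that should_get_article accepts, and queue the
     # next page while self.page < 10. (response.body_as_unicode() is deprecated
     # in newer Scrapy; response.text is the equivalent.)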
     jsonresponse = json.loads(response.body_as_unicode())
     for item in jsonresponse:
         headline = Headline()
         headline["title"] = item["title"]
         headline["title2"] = item["typeAttributes"]["deck"]
         headline["description"] = item["description"]
         headline["url"] = item["typeAttributes"]["url"]
         headline["imgurl"] = item["typeAttributes"]["imageLarge"]
         headline["tags"] = item["typeAttributes"]["urlSlug"].split("-")
         headline["score"] = item["typeAttributes"]["trending"][
             "numViewers"]
         headline["timestamp"] = item["updatedAt"]
         headline["id"] = item["id"]
         if self.should_get_article(headline["id"]):
             yield scrapy.Request(url=headline["url"],
                                  meta={
                                      "dont_cache": False,
                                      "headline": headline
                                  },
                                  callback=self.parse_body)
     self.page += 1
     if self.page < 10:
         yield scrapy.Request(url=self.HOST.format(self.page),
                              meta={"dont_cache": self.dont_cache})
Example #2
 def parse(self, response):
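     # Each "article.post" block becomes a Headline; the thumbnail URL may sit
     # in either the img src or its lazy-loading pm-lazy-src attribute.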
     posts = response.css("article.post")
     for post in posts:
         headline = Headline()
         headline["title"] = post.css(".entry-title").xpath(
             "a/text()").get()
         headline["title2"] = response.css("article.post").css(
             ".entry-content::text").get().strip()
         headline["url"] = post.css(".entry-title").xpath("a/@href").get()
         imgsrc = post.css("figure.thumbnail").xpath("a/img/@src").get()
         imgsrc2 = post.css("figure.thumbnail").xpath(
             "a/img/@pm-lazy-src").get()
         if imgsrc is not None and imgsrc.find(
                 "data:") == -1 and imgsrc.find("http") != -1:
             headline["imgurl"] = imgsrc
         elif imgsrc2 is not None and imgsrc2.find("http") != -1:
             headline["imgurl"] = imgsrc2
         headline["id"] = post.xpath("@data-event-tracking").get().split(
             "|")[-2]
         if self.should_get_article(headline["id"]):
             yield scrapy.Request(url=headline["url"],
                                  meta={
                                      "dont_cache": False,
                                      "headline": headline
                                  },
                                  callback=self.parse_body)
     self.page += 1
     yield scrapy.Request(url=self.HOST.format(self.page),
                          meta={"dont_cache": self.dont_cache})
Example #3
 def parse(self, response):
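     # The API returns pre-rendered HTML inside the JSON "rendering" field; wrap
     # it in an HtmlResponse so the story cards can be queried with CSS selectors.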
     html = json.loads(response.body_as_unicode())["rendering"]
     res = scrapy.http.HtmlResponse(url=response.url,
                                    body=html,
                                    encoding="utf-8")
     stories = res.css("div.c-card>a")
     for story in stories:
         url = f"{self.HOST}{story.xpath('@href').get()}"
         id = url.split("/")[-2]
         author = story.css("span.c-card__author::text").get()
         title = story.css("div.c-card__hed-text::text").get()
         imgurl = story.css("img.c-image").xpath("@src").get()
         headline = Headline({
             "id": id,
             "url": url,
             "author": author,
             "title": title,
             "imgurl": imgurl
         })
         if self.should_get_article(headline["id"]):
             yield scrapy.Request(url=headline["url"],
                                  meta={
                                      "dont_cache": False,
                                      "headline": headline
                                  },
                                  callback=self.parse_body)
     self.last_id += 10
     if self.last_id <= 80:
         yield scrapy.Request(url=self.FETCH_HOST.format(self.last_id),
                              meta={"dont_cache": self.dont_cache})
Example #4
def get_headlines(url):
    """
    Gets headlines from http://www.newsapi.org

    :param url: url
    :return: array of headlines
    """
    headlines = []

    req = urllib.request.Request(url)
    response = urllib.request.urlopen(req).read().decode('utf8')
    r = json.loads(response)

    # Today's date; reused for any article that lacks a publishedAt value.
    prev_published_at = str(datetime.now()).split(" ")[0]

    for article in r['articles']:

        if str(article['publishedAt']) == 'None':
            published_at = prev_published_at
        else:
            published_at = str(article['publishedAt']).split('T')[0]
            prev_published_at = published_at

        h = Headline(article['title'].split('\n')[0], article['url'],
                     article['source']['id'] or article['source']['name'],
                     published_at, article['urlToImage'])
        headlines.append(h)

    return headlines
Example #5
def classify_headlines(headlines, classifier):
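    # Compute bait features for every headline and return the fraction that the
    # classifier labels as 'bait'.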
    features = [bait_features(Headline(headline)) for headline in headlines]
    label_list = []
    for feat in features:
        label_list.append(classifier.classify(feat))
    bait_count = label_list.count('bait')
    return bait_count / len(label_list)
Example #6
 def parse(self, response):
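    # Build a Headline for each "div.story" card, track the last seen story id,
    # and use it to request the next batch of stories.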
   stories = response.css("div.story")
   for story in stories:
     headline = Headline()
     article = story.css("article")
     a = story.css("h3.story-h").xpath("a")
     headline["url"] = a.xpath("@href").get()
     headline["title"] = a.xpath("text()").get()
     headline["title2"] = article.css("div.story-txt").css("p::text").get()
     headline["id"] = story.xpath("@data-post_id").get()
     headline["imgurl"] = article.css("img.story-img").xpath("@src").get()
     self.last_id = int(headline["id"])
     if self.should_get_article(headline["id"]):
        yield scrapy.Request(url=headline["url"],
                             meta={"dont_cache": False, "headline": headline},
                             callback=self.parse_body)
   url = self.HOST.format(self.last_id)
    yield scrapy.Request(url=url, meta={"dont_cache": self.dont_cache})
Example #7
 def parse(self, response):
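     # Story metadata is embedded as JSON in the data-evt-val attribute; fall
     # back to the last URL segment when it contains no id.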
     stories = response.css("article.post")
     for story in stories:
         headline = Headline()
         headline["url"] = story.css(".entry-title>a").xpath("@href").get()
         headline["title"] = story.css(".entry-title>a::text").get()
         data = json.loads(story.xpath("@data-evt-val").get())
         headline["id"] = data["story"]["id"] if data["story"][
             "id"] else headline["url"].split("/")[-1]
         headline["imgurl"] = story.css(
             "img.attachment-post-thumbnail").xpath("@src").get()
         if self.should_get_article(headline["id"]):
             yield scrapy.Request(url=headline["url"],
                                  meta={
                                      "dont_cache": False,
                                      "headline": headline
                                  },
                                  callback=self.parse_body)
Example #8
 def extract_headlines(self):
     """ Gather all headline information from the passed html code and return them as a list.
     :return list of headlines
     """
     self.headlines.clear()
     result_block = self.raw_html.find_all('div', attrs={'class': 'g'})
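     # Each result block is expected to hold a link, an <h3> title and an
     # optional description snippet.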
     for result in result_block:
         link = result.find('a', href=True)
         title = result.find('h3')
         description = result.find('span', attrs={'class': 'st'})
         if link and title:
             link = link['href']
             title = title.get_text()
             if description:
                 description = description.get_text()
             if link != '#':
                 self.headlines.append(Headline(title, link, description))
     return self.headlines
Example #9
 def parse(self, response):
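     # Each ".story" block links to a thestar.com article; the last URL segment
     # (minus its extension) is used as the headline id.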
     stories = response.css(".story")
     for story in stories:
         headline = Headline()
         a = story.xpath("div[@class='story__body']/span/span/a")
         url = f"https://www.thestar.com{a.xpath('@href').get()}"
         title = a.xpath("span[@class='story__headline']/text()").get()
         title2 = a.xpath("p[@class='story__abstract']/text()").get()
         headline["url"] = url
         headline["title"] = title
         headline["title2"] = title2
         headline["id"] = url.split("/")[-1].split(".")[0]
         if self.should_get_article(headline["id"]):
             yield scrapy.Request(url=headline["url"],
                                  meta={
                                      "dont_cache": False,
                                      "headline": headline
                                  },
                                  callback=self.parse_body)
Example #10
 def parse(self, response):
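     # The post id comes from the article element's id attribute and the image
     # URL from its lazy-loading data-src; pages are requested up to page 10.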
     stories = response.css("article.post")
     for story in stories:
         headline = Headline()
         headline["url"] = story.css("div.row>div.text>header>a").xpath(
             "@href").get()
         headline["title"] = story.css("div.row>div.text>header>a").xpath(
             "@title").get()
         headline["title2"] = story.css(
             "div.row>div.text>header>a>div.excerpt>p::text").get()
         headline["id"] = story.xpath("@id").get().split("-")[-1]
         headline["imgurl"] = story.css("img").xpath("@data-src").get()
         if self.should_get_article(headline["id"]):
             yield scrapy.Request(url=headline["url"],
                                  meta={
                                      "dont_cache": False,
                                      "headline": headline
                                  },
                                  callback=self.parse_body)
     self.page += 1
     if self.page <= 10:
         yield scrapy.Request(url=self.HOST.format(self.page),
                              meta={"dont_cache": self.dont_cache})
Example #11
def create_headlines():
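    # Load the pickled headline tuples, wrap each in a Headline, and return
    # them shuffled.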
    with open('headlines.p', 'rb') as f:
        headline_tuples = pickle.load(f)
    headlines = [Headline(h[0], h[1]) for h in headline_tuples]
    random.shuffle(headlines)
    return headlines
Example #12
def create_predictable_list(headline_string):
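    # Wrap a single headline string in a Headline object and return its
    # x-values as a one-element list, ready to feed to a predictor.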
    predictor_as_object = Headline(headline_string, "none")
    predictable_values = []
    predictable_values.append(create_x_vals(predictor_as_object))
    return predictable_values