예제 #1
0
    def parse_article(self, response):
        """Load one Destructoid article page into a news item and yield it.

        The author is not scraped from the page itself; it is read from
        ``response.meta['author']``.
        """
        loader = NewsScraperItemLoader(selector=response.selector)

        # Article fields.
        loader.add_value("author", response.meta['author'])
        loader.add_value("publication", "Destructoid")
        loader.add_xpath("headline", "//h3[@class='fancy-title']//text()")
        date_xpaths = ["//h6[1]/b/text()", "//h6[1]/text()"]
        loader.add_xpath("date_published", date_xpaths, NormalizedJoin())
        loader.add_xpath("body", "//div[@id='desktoppadding']//p", Declutter())

        # Scrape provenance: when, by which spider, and from which URL.
        provenance = (
            ("date_scraped", str(datetime.datetime.now())),
            ("scraped_by", self.name),
            ("scraped_from", response.url),
        )
        for field, value in provenance:
            loader.add_value(field, value)

        yield loader.load_item()
    def parse_article(self, response):
        """Load one RockPaperShotgun article page into a news item and yield it.

        The publication date is cut out of the second text node of the
        entry's ``aside/p`` element with the regex ``on (.*)``.
        """
        loader = NewsScraperItemLoader(selector=response.selector)

        # Article fields.
        loader.add_xpath("headline", "//div[@class='post-inner']//h2//text()")
        author_xpath = "//div[@class='entry']//a[contains(@href, 'mailto')]//text()"
        loader.add_xpath("author", author_xpath)
        date_xpath = "//div[@class='entry']//aside/p/text()[2]"
        loader.add_xpath("date_published", date_xpath, re=r"on (.*)")
        loader.add_value("publication", "RockPaperShotgun")
        loader.add_xpath("body", "//div[@class='entry']/p", Declutter())

        # Scrape provenance: when, by which spider, and from which URL.
        provenance = (
            ("date_scraped", str(datetime.datetime.now())),
            ("scraped_by", self.name),
            ("scraped_from", response.url),
        )
        for field, value in provenance:
            loader.add_value(field, value)

        yield loader.load_item()
예제 #3
0
    def parse_article(self, response):
        """Load one Eurogamer article page into a news item and yield it.

        Note that the ``date_published`` XPath selects the
        ``span[@itemprop='datePublished']`` element node itself rather than
        its text.
        """
        loader = NewsScraperItemLoader(selector=response.selector)

        # Article fields.
        loader.add_xpath("headline", "//header/h1/text()", NormalizedJoin())
        loader.add_value("publication", "Eurogamer")
        author_xpath = "//p[@class='byline']//a[contains(@href, 'author')]/text()"
        loader.add_xpath("author", author_xpath)
        date_xpath = "//p[@class='byline']//span[@itemprop='datePublished']"
        loader.add_xpath("date_published", date_xpath)
        loader.add_xpath("body", "//article/section/p", Declutter())

        # Scrape provenance: when, by which spider, and from which URL.
        provenance = (
            ("date_scraped", str(datetime.datetime.now())),
            ("scraped_by", self.name),
            ("scraped_from", response.url),
        )
        for field, value in provenance:
            loader.add_value(field, value)

        yield loader.load_item()
    def parse_article(self, response):
        """Load one DigitalTrends article page into a news item and yield it.

        Headline, author and date are each passed through ``NormalizedJoin``;
        the body is passed through ``Declutter`` and then ``NormalizedJoin``.
        """
        loader = NewsScraperItemLoader(selector=response.selector)

        # Article fields.
        headline_xpath = "//header/h1[@class='title']//text()"
        loader.add_xpath("headline", headline_xpath, NormalizedJoin())
        loader.add_xpath("author", "//span[@class='vcard']//a/text()", NormalizedJoin())
        date_xpath = "//span[@class='value-title']/time//text()"
        loader.add_xpath("date_published", date_xpath, NormalizedJoin())
        loader.add_value("publication", "DigitalTrends")
        body_xpath = "//article[contains(@class, 'm-content')]//p"
        loader.add_xpath("body", body_xpath, Declutter(), NormalizedJoin())

        # Scrape provenance: when, by which spider, and from which URL.
        provenance = (
            ("date_scraped", str(datetime.datetime.now())),
            ("scraped_by", self.name),
            ("scraped_from", response.url),
        )
        for field, value in provenance:
            loader.add_value(field, value)

        yield loader.load_item()
예제 #5
0
    def parse_article(self, response):
        """Load one TechCrunch article page into a news item and yield it.

        No ``date_published`` value is collected by this callback.
        """
        loader = NewsScraperItemLoader(selector=Selector(response))

        # Article fields.
        headline_xpath = "//h1[contains(@class, 'tweet-title')]//text()"
        loader.add_xpath("headline", headline_xpath)
        loader.add_value("publication", "TechCrunch")
        loader.add_xpath("author", "//a[@rel='author']//text()", Join())
        body_xpath = "//div[contains(@class, 'article-entry')]//p//text()"
        loader.add_xpath("body", body_xpath, Join())

        # Scrape provenance: when, by which spider, and from which URL.
        provenance = (
            ("date_scraped", str(datetime.datetime.now())),
            ("scraped_by", self.name),
            ("scraped_from", response.url),
        )
        for field, value in provenance:
            loader.add_value(field, value)

        yield loader.load_item()
    def parse_article(self, response):
        """Load one Killscreen Daily article page into a news item and yield it."""
        loader = NewsScraperItemLoader(selector=response.selector)

        # Article fields.
        headline_xpath = "//div[@class='headline']//h1[@class='title']//text()"
        loader.add_xpath("headline", headline_xpath)
        loader.add_xpath("date_published", "//div[@class='date']//text()")
        loader.add_value("publication", "Killscreen Daily")
        loader.add_xpath("author", "//div[@class='author']//a/text()")
        loader.add_xpath("body", "//div[@class='article-content']//p", Declutter())

        # Scrape provenance: when, by which spider, and from which URL.
        provenance = (
            ("date_scraped", str(datetime.datetime.now())),
            ("scraped_by", self.name),
            ("scraped_from", response.url),
        )
        for field, value in provenance:
            loader.add_value(field, value)

        yield loader.load_item()
예제 #7
0
    def parse_article(self, response):
        """Load one TIGSource blog post page into a news item and yield it.

        Author and date are cut out of the ``h2`` / ``h3`` text with the
        regexes ``By: (.*)`` and ``On: (.*)`` respectively.
        """
        loader = NewsScraperItemLoader(selector=response.selector)

        # Article fields.
        loader.add_xpath("headline", "//h1//text()")
        loader.add_xpath("author", "//h2//text()", re=r"By: (.*)")
        loader.add_xpath("date_published", "//h3//text()", re=r"On: (.*)")
        loader.add_xpath("body", "//div[@class='blog_post']//p", Declutter())
        loader.add_value("publication", "TIGSource")

        # Scrape provenance: when, by which spider, and from which URL.
        provenance = (
            ("date_scraped", str(datetime.datetime.now())),
            ("scraped_by", self.name),
            ("scraped_from", response.url),
        )
        for field, value in provenance:
            loader.add_value(field, value)

        yield loader.load_item()
    def parse_article(self, response):
        """Load one GamesRadar article page into a news item and yield it.

        The headline field is not populated by this callback.
        """
        loader = NewsScraperItemLoader(selector=response.selector)

        # NOTE: headline extraction is disabled here; the XPath it previously
        # used was //h1[@class='headline']//text()
        author_xpath = "//div[@class='author-description']//h5/a[contains(@href, 'profile')]//text()"
        loader.add_xpath("author", author_xpath)
        loader.add_xpath("date_published", "//span[@class='featureDate']//text()")
        loader.add_value("publication", "GamesRadar")
        body_xpath = "//div[contains(@class, 'grArticleBody_contents')]//p/text()"
        loader.add_xpath("body", body_xpath)

        # Scrape provenance: when, by which spider, and from which URL.
        provenance = (
            ("date_scraped", str(datetime.datetime.now())),
            ("scraped_by", self.name),
            ("scraped_from", response.url),
        )
        for field, value in provenance:
            loader.add_value(field, value)

        yield loader.load_item()
예제 #9
0
    def parse_body(self, response):
        """Load one ExtremeTech article page into a news item and yield it.

        Author and publication date are read from the same byline text and
        separated with the regexes ``(.*) on`` and ``on (.*)``.
        """
        loader = NewsScraperItemLoader(selector=Selector(response))

        # Both date and author are extracted from this one byline node.
        byline_xpath = "//div[@class='title']//span[contains(@class, 'by vcard')]//text()"

        # Article fields.
        loader.add_xpath("headline", "//div[@class='title']//h2/text()")
        loader.add_value("publication", "ExtremeTech")
        loader.add_xpath("date_published", byline_xpath, re=r"on (.*)")
        loader.add_xpath("author", byline_xpath, re=r"(.*) on")
        loader.add_xpath("body", "//div[@class='content']//p//text()")

        # Scrape provenance: by which spider, from which URL, and when.
        provenance = (
            ("scraped_by", self.name),
            ("scraped_from", response.url),
            ("date_scraped", str(datetime.datetime.now())),
        )
        for field, value in provenance:
            loader.add_value(field, value)

        yield loader.load_item()
예제 #10
0
    def parse_article(self, response):
        """Load one Engadget article page into a news item and yield it.

        Besides the usual article fields this also collects a category and
        the name and link of the source article.
        """
        loader = NewsScraperItemLoader(selector=Selector(response))

        # Article fields.
        loader.add_xpath("headline", "//h1[@itemprop='headline']//text()", Join())
        loader.add_value("publication", "Engadget")
        loader.add_xpath("date_published", "//span[@class='timeago']/@datetime", Join())
        loader.add_xpath("author", "//strong[@itemprop='author']//text()")
        # The body is every <p> sibling that precedes the "read-more" paragraph.
        loader.add_xpath("body", "//p[@class='read-more']//preceding-sibling::p", Declutter())
        category_xpath = "//strong[contains(text(), 'ags')]//following-sibling::span/a[1]/text()"
        loader.add_xpath("category", category_xpath, Join())

        # Source article: text and href of the first link after the
        # <strong> label whose text contains 'ource'.
        source_name_xpath = "//strong[contains(text(), 'ource')]/following-sibling::a[1]/text()"
        source_link_xpath = "//strong[contains(text(), 'ource')]/following-sibling::a[1]/@href"
        loader.add_xpath("source_article_name", source_name_xpath, Join())
        loader.add_xpath("source_article_link", source_link_xpath, Join())

        # Scrape provenance: when, by which spider, and from which URL.
        provenance = (
            ("date_scraped", str(datetime.datetime.now())),
            ("scraped_by", self.name),
            ("scraped_from", response.url),
        )
        for field, value in provenance:
            loader.add_value(field, value)

        yield loader.load_item()
예제 #11
0
    def parse_review(self, response):
        """Load one GamesRadar review page into a news item and yield it.

        The publication date is cut out of the review header's byline text
        with the regex ``on (.*)``.
        """
        nsil = NewsScraperItemLoader(selector=response.selector)

        nsil.add_xpath("headline", "//h1[@itemprop='name']//text()")
        nsil.add_xpath("author", "//div[@class='review_header']//div[@class='byline']/a[@rel='author']//text()")
        nsil.add_xpath("date_published", "//div[@class='review_header']//div[@class='byline']/text()", re=r"on (.*)")
        # "GamesRadar" is a literal value, not an XPath expression. Passing it
        # to add_xpath would query the page for <GamesRadar> elements and
        # leave the field empty, so it has to go through add_value.
        nsil.add_value("publication", "GamesRadar")
        nsil.add_xpath("body", "//div[contains(@class, 'grArticleBody_contents')]//p/text()")

        # Scrape provenance: when, by which spider, and from which URL.
        nsil.add_value("date_scraped", str(datetime.datetime.now()))
        nsil.add_value("scraped_by", self.name)
        nsil.add_value("scraped_from", response.url)

        yield nsil.load_item()
예제 #12
0
    def parse_page(self, response):
        """Load one Ars Technica article page into a news item and yield it.

        Author and publication date are both parsed out of the byline text
        with the module-level ``AUTHOR_RE`` and ``DATE_RE`` patterns. The
        category is read from ``response.meta["category"]``.
        """
        nsil = NewsScraperItemLoader(selector=Selector(response))

        # Data pre-processing: author and date come from the very same byline
        # node, so it is extracted once and fed to both regexes instead of
        # evaluating the identical XPath twice.
        byline = nsil.get_xpath("//p[contains(@itemprop, 'author')]//text()",
            NormalizedJoin())

        author = "".join(AUTHOR_RE.findall(byline))
        date = "".join(DATE_RE.findall(byline))

        # Article data first.
        nsil.add_xpath("headline", "//header/h1[@class='heading']//text()", Join())
        nsil.add_value("publication", "Ars Technica")
        nsil.add_value("date_published", date)
        nsil.add_value("category", response.meta["category"])
        nsil.add_value("author", author)
        nsil.add_xpath("body", "//div[@itemprop='articleBody']//text()", NormalizedJoin())

        # Metadata: when, by which spider, and from which URL.
        nsil.add_value("date_scraped", str(datetime.datetime.now()))
        nsil.add_value("scraped_by", self.name)
        nsil.add_value("scraped_from", response.url)

        yield nsil.load_item()
예제 #13
0
    def parse_article(self, response):
        """Load one PC Gamer article page into a news item and yield it.

        Page templates on the site are inconsistent, so author, body and
        publication date each have several candidate XPaths. The candidates
        are handed to ``self.select_first_xpath`` (defined outside this
        method; presumably it returns the result of the first candidate that
        matches -- confirm against its definition).
        """
        loader = NewsScraperItemLoader(selector=response.selector)

        # Candidate XPaths per field, worked out from samples of scraped pages.
        body_candidates = [
            "//div[@class='body']//p/.",
            "//div[@class='section-wrap']//div[@class='textcomponent']//p",
            "//div[@class='gallery_desc']//p",
        ]
        author_candidates = [
            "//h3[@class='author']//text()",
            "//div[@class='review_header']//h3//text()",
        ]
        date_candidates = [
            "//span[@class='localized byline']//text()",
            "//div[@class='review_header']//span[@class='localized']//text()",
        ]

        # Article fields.
        loader.add_xpath("headline", "//h1//text()", NormalizedJoin())
        loader.add_value("publication", "PC Gamer")
        author = self.select_first_xpath(author_candidates, loader)
        loader.add_value("author", author)
        body = self.select_first_xpath(body_candidates, loader, Declutter())
        loader.add_value("body", body)
        date_published = self.select_first_xpath(date_candidates, loader)
        loader.add_value("date_published", date_published)

        # Scrape provenance: when, by which spider, and from which URL.
        provenance = (
            ("date_scraped", str(datetime.datetime.now())),
            ("scraped_by", self.name),
            ("scraped_from", response.url),
        )
        for field, value in provenance:
            loader.add_value(field, value)

        yield loader.load_item()