# Exemplo n.º 1 (score: 0)
        self.data = None


class WikimediaScraper(PageScraper):
    """Scrapes Wikimedia Commons search-result entries from a results page.

    Each `<li>` in the search-result list contributes one WikimediaItem
    carrying the result's title, link href and description text.
    """

    def extract(self, page):
        """Return a list of WikimediaItem objects, one per search result on `page`."""
        titles = page.xpath(
            "//ul[@class='mw-search-results']/li/div[1]/a/@title")
        urls = page.xpath("//ul[@class='mw-search-results']/li/div[1]/a/@href")
        data = page.xpath("//ul[@class='mw-search-results']/li/div[3]/text()")

        # zip pairs the three parallel result lists element-wise, replacing
        # manual index bookkeeping; if the lists ever differ in length the
        # extra tail entries are dropped instead of raising IndexError.
        items = []
        for title, url, description in zip(titles, urls, data):
            item = WikimediaItem()
            item.title = title
            item.url = url
            item.data = description
            items.append(item)
        return items


# Seed the crawler with five search-result pages (offsets 20..100)
# for the query "water", then write the scraped rows to CSV.
start_pages = [
    Page(
        "https://commons.wikimedia.org/w/index.php?title=Special:Search&limit=20&offset="
        + str(page_number * 20)
        + "&profile=default&search=water",
        WikimediaScraper(),
    )
    for page_number in range(1, 6)
]

crawler = XCrawler(start_pages)
crawler.config.output_file_name = "wikimedia_search_results_crawler_output.csv"
crawler.run()
# Exemplo n.º 2 (score: 0)
from xcrawler import XCrawler, Page, PageScraper


class Scraper(PageScraper):
    """Trivial scraper: the extracted value is the page's own string form."""

    def extract(self, page):
        # Same result as page.__str__() for ordinary objects.
        return str(page)


# Fetch a single LAN address with a custom cookie and a short
# (connect, read) timeout, dumping the page text to CSV.
router_page = Page("http://192.168.5.5", Scraper())
router_page.request.cookies = {"theme": "classic"}

router_crawler = XCrawler([router_page])
router_crawler.config.request_timeout = (5, 5)
router_crawler.config.output_file_name = "router_request_example_output.csv"
router_crawler.run()

    def extract(self, page):
        """Describe a tagged-questions listing page as a StackOverflowItem."""
        listing = StackOverflowItem()
        listing.description = "A web page with tagged questions"
        listing.url = page.url
        listing.tag = page.xpath("//div[@class='tagged']/a/text()").get(0)
        listing.related_tags = page.xpath(
            "//div[@class='module js-gps-related-tags']//div[not(@*)]/a/text()")
        return listing

    def visit(self, page):
        """Queue the first two question links on the page for scraping."""
        first_two = page.xpath("//a[@class='question-hyperlink']/@href")[:2]
        next_pages = []
        for question_url in page.to_urls(first_two):
            next_pages.append(Page(question_url, QuestionScraper()))
        return next_pages


class QuestionScraper(PageScraper):
    """Scrapes the title and vote count from a question-details page."""

    def extract(self, page):
        question = StackOverflowItem()
        question.description = "A web page with question details"
        question.url = page.url
        question.title = page.css_text("h1 a").get(0)
        raw_votes = page.css_text(".question .vote-count-post").get(0)
        question.votes = raw_votes.strip()
        return question


# Start from the tags listing and fan out with three worker threads.
entry_page = Page("http://stackoverflow.com/tags", TagsScraper())

crawler = XCrawler([entry_page])
crawler.config.output_file_name = "stackoverflow_three_level_crawler_output.csv"
crawler.config.number_of_threads = 3
crawler.run()

# Exemplo n.º 4 (score: 0)
def setup_xcrawler():
    """Build an XCrawler preconfigured from the module-level QUERY and SURE_VALUE."""
    configured = XCrawler()
    configured.query = QUERY
    configured.sure_value = SURE_VALUE
    return configured