Example #1
    def __init__(self, config_file='tablesuf.cfg', start_dir=None, *args, **kwargs):
        """
        :param config_file: path to the spider's configuration file
        :param start_dir:   directory for spider's meta-data
        """
        super(TdWebSpider, self).__init__(*args, **kwargs)
        # self.frontier = URLFrontier(max=100, max_crawl=100)
        self.next_id = 0
        self.countries = Countries(start_dir + '/' + "countries")
        self.transports = Transports(start_dir + '/' + "transports")
        self.links = Links(start_dir + '/' + "seeds")
        self.table_counter = itertools.count()
        self.config = Config(config_file)
        self.table_limit = int(self.config.get_attribute('tablelimit'))

        # load start URLs from the seed file; abort if there are none
        if not self.links.empty():
            self.start_urls = self.links.contents()
        else:
            raise IOError("No seeds!")
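
The constructor above relies on small project-local helpers (Countries, Transports, Links) that are not included in the excerpt. As a rough, hypothetical sketch of the interface it expects, something like the following would satisfy the calls made above (empty() and contents(), reading one URL per line); the real implementation may differ.

# Hypothetical sketch: only the empty()/contents() interface is visible in the
# spider code; the real Links/Countries/Transports classes are not shown here.
class Links(object):
    """Seed-list helper backed by a plain text file, one URL per line."""

    def __init__(self, path):
        with open(path) as fh:
            self._urls = [line.strip() for line in fh if line.strip()]

    def empty(self):
        return len(self._urls) == 0

    def contents(self):
        return list(self._urls)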
Example #2
import itertools

import scrapy
from scrapy import Request
from scrapy.exceptions import CloseSpider
from scrapy.selector import Selector

# Countries, Transports, Links, Config, Utility and HTMLTableItem are
# project-local helpers; their import lines are not shown in the original.


class TdWebSpider(scrapy.Spider):
    """
    HTML table spider
    """
    name = "tablespider"
    # allowed_domains = [".vn", ".com"]


    def __init__(self, config_file='tablesuf.cfg', start_dir=None, *args, **kwargs):
        """
        :param config_file: path to the spider's configuration file
        :param start_dir:   directory for spider's meta-data
        """
        super(TdWebSpider, self).__init__(*args, **kwargs)
        # self.frontier = URLFrontier(max=100, max_crawl=100)
        self.next_id = 0
        self.countries = Countries(start_dir + '/' + "countries")
        self.transports = Transports(start_dir + '/' + "transports")
        self.links = Links(start_dir + '/' + "seeds")
        self.table_counter = itertools.count()
        self.config = Config(config_file)
        self.table_limit = int(self.config.get_attribute('tablelimit'))

        # load start URLs from the seed file; abort if there are none
        if not self.links.empty():
            self.start_urls = self.links.contents()
        else:
            raise IOError("No seeds!")


    def parse(self, response):
        """Extract tables from a start page, then follow its outgoing links."""
        # self.store_html(response)
        for item in self.parse_tables(response):
            yield item

        links = Selector(response).xpath("//a/@href").extract()
        for link in links:
            norm_link = Utility.normalize(response.url, link)
            yield Request(norm_link, callback=self.follow_links)


    def follow_links(self, response):
        """Extract tables from a followed page and keep crawling its links."""
        # the shared counter drives both table ids and the stop condition
        if next(self.table_counter) > self.table_limit:
            raise CloseSpider(reason="Enough tables")

        # self.store_html(response)
        for item in self.parse_tables(response):
            yield item

        links = Selector(response).xpath("//a/@href").extract()
        for link in links:
            norm_link = Utility.normalize(response.url, link)
            yield Request(norm_link, callback=self.follow_links)

    def parse_tables(self, response):
        """Yield one HTMLTableItem per <table> element on the page."""
        sel = Selector(response)
        page_title = sel.xpath('//title/text()').extract_first(default='')
        tables = sel.xpath("//table")
        for table in tables:
            item = HTMLTableItem()
            item['table'] = table.extract()
            item['source'] = response.url
            item['table_id'] = next(self.table_counter)
            item['page_title'] = page_title
            yield item

    def get_path(self):
        return self.config.get_attribute('storagedir')
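
For context, here is a minimal sketch of how this spider could be wired up and run from a script. The HTMLTableItem definition is an assumption reconstructed from the four fields assigned in parse_tables, and the settings and start_dir path are illustrative placeholders; the original project's item module and configuration are not shown above.

# Minimal sketch (assumptions): HTMLTableItem is inferred from the fields
# used in parse_tables; settings and paths below are placeholders.
import scrapy
from scrapy.crawler import CrawlerProcess


class HTMLTableItem(scrapy.Item):
    # fields inferred from parse_tables above
    table = scrapy.Field()       # raw HTML of the <table> element
    source = scrapy.Field()      # URL of the page the table came from
    table_id = scrapy.Field()    # running counter assigned by the spider
    page_title = scrapy.Field()  # contents of the page's <title> tag


if __name__ == '__main__':
    # keyword arguments passed to crawl() are forwarded to TdWebSpider.__init__
    process = CrawlerProcess(settings={'LOG_LEVEL': 'INFO'})
    process.crawl(TdWebSpider, config_file='tablesuf.cfg', start_dir='./meta')
    process.start()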