import itertools

import scrapy
from scrapy import Request
from scrapy.exceptions import CloseSpider
from scrapy.selector import Selector

# Project-specific helpers (Countries, Transports, Links, Config, Utility) and the
# HTMLTableItem item class are assumed to be importable from this project's own modules.


class TdWebSpider(scrapy.Spider):
    """HTML table spider."""

    name = "tablespider"
    # allowed_domains = [".vn", ".com"]

    def __init__(self, config_file='tablesuf.cfg', start_dir=None, *args, **kwargs):
        """
        :param config_file: path to the spider's configuration file
        :param start_dir: directory holding the spider's meta-data (countries, transports, seeds)
        """
        super(TdWebSpider, self).__init__(*args, **kwargs)
        # self.frontier = URLFrontier(max=100, max_crawl=100)
        if start_dir is None:
            raise ValueError("start_dir is required")
        self.next_id = 0
        self.countries = Countries(start_dir + '/' + "countries")
        self.transports = Transports(start_dir + '/' + "transports")
        self.links = Links(start_dir + '/' + "seeds")
        self.table_counter = itertools.count()
        self.config = Config(config_file)
        self.table_limit = int(self.config.get_attribute('tablelimit'))
        # Load the start URLs from the seeds file.
        if not self.links.empty():
            self.start_urls = self.links.contents()
        else:
            raise IOError("No seeds!")

    def parse(self, response):
        # Emit one item per table on the seed page, then follow its outgoing links.
        # self.store_html(response)
        for item in self.parse_tables(response):
            yield item
        links = Selector(response).xpath("//a/@href").extract()
        for link in links:
            norm_link = Utility.normalize(response.url, link)
            yield Request(norm_link, callback=self.follow_links)

    def follow_links(self, response):
        # Stop the crawl once enough tables have been collected.
        if next(self.table_counter) > self.table_limit:
            raise CloseSpider(reason="Enough tables")
        # self.store_html(response)
        for item in self.parse_tables(response):
            yield item
        links = Selector(response).xpath("//a/@href").extract()
        for link in links:
            norm_link = Utility.normalize(response.url, link)
            yield Request(norm_link, callback=self.follow_links)

    def parse_tables(self, response):
        # Extract every <table> element on the page as an HTMLTableItem.
        sel = Selector(response)
        page_title = sel.xpath('//title/text()').extract_first(default='')
        tables = sel.xpath("//table")
        for table in tables:
            item = HTMLTableItem()
            item['table'] = table.extract()
            item['source'] = response.url
            item['table_id'] = next(self.table_counter)
            item['page_title'] = page_title
            yield item

    def get_path(self):
        return self.config.get_attribute('storagedir')
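
# Illustrative usage sketch (not part of the original module): run the spider
# programmatically with Scrapy's CrawlerProcess. The './meta' start directory is an
# assumption for demonstration; 'tablesuf.cfg' is the spider's default config file name.
# The equivalent command-line invocation from a Scrapy project root would be:
#     scrapy crawl tablespider -a config_file=tablesuf.cfg -a start_dir=./meta
if __name__ == '__main__':
    from scrapy.crawler import CrawlerProcess

    process = CrawlerProcess()
    process.crawl(TdWebSpider, config_file='tablesuf.cfg', start_dir='./meta')
    process.start()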