def parse_materials(self, response):
    """Yield one ``NrcScrapedMaterial`` item per materials row in *response*.

    The report number is read from ``response.request.meta['reportnum']``
    and added to every item.  Rows are the ``<tr>`` elements of the table
    with class ``t16Standard``; the first row is discarded as the header.
    Each item field whose metadata carries an ``'xpath'`` key is populated
    from that XPath, evaluated relative to the row.

    Once every row has been yielded, the task for this report is marked
    ``'DONE'`` via ``self.db.setBotTaskStatus``.  This is a generator, so
    neither the parsing nor the status update runs until it is iterated.
    """
    reportnum = response.request.meta['reportnum']
    # Decode the raw body using the encoding reported on the response.
    text = unicode(response.body, response.encoding)
    rows = HtmlXPathSelector(text=text).select(
        '//table[@class="t16Standard"]/tr')

    if not rows:
        self.log('Materials data not present in response from {0}'
                 .format(response.url), log.INFO)
    else:
        # Skip the first report record because this is the header row
        rows.pop(0)
        if rows:
            self.log('Retrieved {0} materials records in report {1}'
                     .format(len(rows), reportnum), log.INFO)
        else:
            self.log('No materials reports found in response {0}'
                     .format(reportnum), log.INFO)

    def truncate_names(values):
        # Input processor for the 'name' field: keep at most 32 characters
        # of each extracted value.
        # NOTE(review): presumably matches a storage column width - confirm.
        return [value[:32] for value in values]

    # The field -> XPath mapping is the same for every row, so build it once.
    xpath_fields = [
        (field, meta['xpath'])
        for field, meta in NrcScrapedMaterial.fields.items()
        if 'xpath' in meta
    ]

    for row in rows:
        loader = XPathItemLoader(NrcScrapedMaterial(), row)
        loader.name_in = truncate_names
        loader.add_value('reportnum', reportnum)
        for field, xpath in xpath_fields:
            loader.add_xpath(field, xpath)
        yield loader.load_item()

    # NOTE(review): the source layout was flattened; this call is placed
    # after the loop (runs once, even when no rows were found) - confirm.
    self.db.setBotTaskStatus(reportnum, self.name, 'DONE')