def refine_item(self, response, item):
    # Key/value pairs from the person chronology table (th -> td),
    # e.g. "Born in" / "Deceased in".
    data = text.extract_dict(
        response,
        'xpath:.//div[@id="person-chronology"]//table//th',
        'xpath:.//div[@id="person-chronology"]//table//td',
        sep=' ')
    # Resolve the relative links collected earlier against the page URL.
    item['other']['publications'] = [
        self.make_url_absolute(response.url, url)
        for url in item['other']['publications']]
    item['other']['websites'] = [
        self.make_url_absolute(response.url, url)
        for url in item['other']['websites']]
    item['other']['bibliography'] = [
        self.make_url_absolute(response.url, url)
        for url in item['other']['bibliography']]
    item['other']['participated_in'] = [
        self.make_url_absolute(response.url, url)
        for url in item['other']['participated_in']]
    item['other']['biography'] = data
    item['birth'] = data.get('Born in')
    item['death'] = data.get('Deceased in', data.get('Deceased on'))
    return super(StructuraeNetSpider, self).refine_item(response, item)

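# Hypothetical sketch (an assumption, not the project's code) of the
# make_url_absolute helper called above; it presumably just resolves a
# possibly-relative href against the response URL.
from urlparse import urljoin  # urllib.parse.urljoin on Python 3

def make_url_absolute(self, base_url, url):
    # If 'url' is already absolute, urljoin returns it unchanged.
    return urljoin(base_url, url)
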
def refine_item(self, response, item):
    item['other'] = {
        # Key/value table on the person page.
        'biography': text.extract_dict(
            response,
            'xpath:.//td[@id="keyColumn"]',
            'xpath:.//td[@id="valueColumn"]'),
        # Rewrite the javascript: pseudo-links to direct edit URLs, keeping
        # only the script UOID between the parentheses.
        'scripts': [
            ('http://cesar.org.uk/cesar2/titles/titles.php?fct=edit&script_UOID='
             + script[len('javascript:scriptClicked('):-1])
            for script in response.xpath(
                './/td[@id="keywordColumn"]//a/@href').extract()],
    }
    item['name'] = '%s, %s' % (
        item['other']['biography'].get('Last name', ''),
        item['other']['biography'].get('First name', ''))
    return super(CesarOrgUkSpider, self).refine_item(response, item)

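# Worked example of the href slicing above; the sample href is made up, but
# the slice bounds and URL prefix are taken from the code.
script = 'javascript:scriptClicked(12345)'
uoid = script[len('javascript:scriptClicked('):-1]
assert uoid == '12345'
edit_url = ('http://cesar.org.uk/cesar2/titles/titles.php?fct=edit&script_UOID='
            + uoid)
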
def refine_item(self, response, item):
    # Details table at the bottom of the page: split each value on ';' and
    # drop fragments that are empty, contain a URL, or are viaf.org ids.
    data = {
        key: [v.strip() for v in value.split(';')
              if v.strip() and 'http://' not in v and 'viaf.org' not in v]
        for key, value in text.extract_dict(
            response,
            'xpath:(.//table)[last()]//tr/td[@class="td_label_details"]',
            'xpath:(.//table)[last()]//tr/td[@class="td_value_details"]'
        ).iteritems()
    }
    last = ' '.join(data.get('Last Name', ''))
    first = ' '.join(data.get('Given Name', ''))
    item['name'] = '%s, %s' % (last, first)
    item['other'] = data
    return super(DsiSpider, self).refine_item(response, item)

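# Worked example (made-up cell value) of the value cleaning above: split on
# ';', then drop fragments that are empty, contain a URL, or mention viaf.org.
value = 'Smith; http://example.org/x; viaf.org/viaf/123; John '
cleaned = [v.strip() for v in value.split(';')
           if v.strip() and 'http://' not in v and 'viaf.org' not in v]
assert cleaned == ['Smith', 'John']
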
def parse_microdata(self, response):
    # The same spans serve as both keys and values: the itemprop attribute
    # becomes the key, the span's text content the value.
    return text.extract_dict(
        response,
        'xpath:.//section[@id="mdata"]//span',
        'xpath:.//section[@id="mdata"]//span',
        './@itemprop', './text()')

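# Rough sketch, not the project's actual implementation: text.extract_dict is
# assumed to select key nodes and value nodes with the two selectors, drill
# into each node with an optional sub-XPath (as in parse_microdata above),
# join the text fragments with `sep`, and zip keys and values into a dict.
# Only the 'xpath:' selector prefix seen in the calls above is handled here.
def extract_dict(response, key_sel, value_sel,
                 key_xpath='.//text()', value_xpath='.//text()', sep=''):
    key_nodes = response.xpath(
        key_sel[len('xpath:'):] if key_sel.startswith('xpath:') else key_sel)
    value_nodes = response.xpath(
        value_sel[len('xpath:'):] if value_sel.startswith('xpath:') else value_sel)
    keys = [sep.join(n.xpath(key_xpath).extract()).strip() for n in key_nodes]
    values = [sep.join(n.xpath(value_xpath).extract()).strip()
              for n in value_nodes]
    return dict(zip(keys, values))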