示例#1
0
    def refine_item(self, response, item):
        """Finish populating ``item`` from a person page, then defer to the parent.

        The ``th``/``td`` cells of the table inside ``div#person-chronology``
        are handed to ``text.extract_dict`` (with ``sep=' '``); the resulting
        mapping is stored as ``item['other']['biography']`` and is also the
        source for ``item['birth']`` and ``item['death']``.

        Every entry of the four link lists already present under
        ``item['other']`` is passed through ``self.make_url_absolute`` together
        with ``response.url``.

        Returns whatever the parent class's ``refine_item`` returns.
        """
        chronology = text.extract_dict(
            response,
            'xpath:.//div[@id="person-chronology"]//table//th',
            'xpath:.//div[@id="person-chronology"]//table//td',
            sep=' ')

        other = item['other']
        link_fields = ('publications', 'websites', 'bibliography',
                       'participated_in')
        for field in link_fields:
            other[field] = [
                self.make_url_absolute(response.url, link)
                for link in other[field]
            ]

        other['biography'] = chronology

        item['birth'] = chronology.get('Born in')
        # 'Deceased on' is only the fallback when 'Deceased in' is absent.
        death_fallback = chronology.get('Deceased on')
        item['death'] = chronology.get('Deceased in', death_fallback)

        return super(StructuraeNetSpider, self).refine_item(response, item)
示例#2
0
    def refine_item(self, response, item):
        """Fill ``item['other']`` and ``item['name']``, then defer to the parent.

        ``item['other']`` is replaced with a dict holding:

        * ``'biography'`` -- the result of ``text.extract_dict`` over the
          ``td#keyColumn`` / ``td#valueColumn`` cells;
        * ``'scripts'`` -- one edit URL per ``href`` found under
          ``td#keywordColumn``.

        ``item['name']`` becomes ``'<Last name>, <First name>'`` taken from the
        biography mapping, with empty strings for missing entries.

        Returns whatever the parent class's ``refine_item`` returns.
        """
        edit_url = 'http://cesar.org.uk/cesar2/titles/titles.php?fct=edit&script_UOID='
        js_prefix = 'javascript:scriptClicked('

        biography = text.extract_dict(response,
                                      'xpath:.//td[@id="keyColumn"]',
                                      'xpath:.//td[@id="valueColumn"]')

        hrefs = response.xpath('.//td[@id="keywordColumn"]//a/@href').extract()
        # Slice off len(js_prefix) leading characters and the final character
        # (presumably the closing parenthesis); the prefix is not verified.
        scripts = [edit_url + href[len(js_prefix):-1] for href in hrefs]

        item['other'] = {'biography': biography, 'scripts': scripts}

        last_name = biography.get('Last name', '')
        first_name = biography.get('First name', '')
        item['name'] = '%s, %s' % (last_name, first_name)

        return super(CesarOrgUkSpider, self).refine_item(response, item)
示例#3
0
    def refine_item(self, response, item):
        """Attach biography data, script edit links and a display name to ``item``.

        The key/value cells (``td#keyColumn`` and ``td#valueColumn``) go through
        ``text.extract_dict`` and are stored under ``item['other']['biography']``.
        Each ``href`` under ``td#keywordColumn`` is turned into an edit URL and
        collected under ``item['other']['scripts']``. ``item['name']`` is built
        as ``'<Last name>, <First name>'`` from the biography, using ``''`` for
        anything missing.

        Returns the result of the parent class's ``refine_item``.
        """
        def to_edit_url(href):
            # Keep what sits between the leading 'javascript:scriptClicked('
            # (removed by length, not checked) and the last character.
            script_id = href[len('javascript:scriptClicked('):-1]
            return ('http://cesar.org.uk/cesar2/titles/titles.php?fct=edit&script_UOID=' +
                    script_id)

        details = text.extract_dict(
            response,
            'xpath:.//td[@id="keyColumn"]',
            'xpath:.//td[@id="valueColumn"]'
        )
        links = response.xpath('.//td[@id="keywordColumn"]//a/@href').extract()

        item['other'] = {
            'biography': details,
            'scripts': [to_edit_url(link) for link in links],
        }

        name_parts = (details.get('Last name', ''), details.get('First name', ''))
        item['name'] = '%s, %s' % name_parts

        return super(CesarOrgUkSpider, self).refine_item(response, item)
示例#4
0
文件: dsi.py 项目: rpatil524/StrepHit
    def refine_item(self, response, item):
        """Populate ``item['other']`` and ``item['name']`` from the details table.

        The label/value cells (``td.td_label_details`` and
        ``td.td_value_details``) of the last ``<table>`` matched by the XPath
        are handed to ``text.extract_dict``. Every value is then split on
        ``';'``; each piece is stripped, and pieces that are empty or contain
        ``'http://'`` or ``'viaf.org'`` are dropped. The resulting
        ``{label: [pieces]}`` mapping becomes ``item['other']``.

        ``item['name']`` is ``'<Last Name>, <Given Name>'``, where each part is
        the space-joined list for that label (empty when the label is absent).

        Returns whatever the parent class's ``refine_item`` returns.
        """
        raw = text.extract_dict(
            response,
            'xpath:(.//table)[last()]//tr/td[@class="td_label_details"]',
            'xpath:(.//table)[last()]//tr/td[@class="td_value_details"]'
        )

        # items() instead of iteritems(): the latter exists only on Python 2
        # dicts and raises AttributeError on Python 3; items() works on both.
        data = {
            key: [v.strip() for v in value.split(';')
                  if v.strip() and 'http://' not in v and 'viaf.org' not in v]
            for key, value in raw.items()
        }

        # Values are lists of strings, so default to an empty sequence.
        last = ' '.join(data.get('Last Name', ()))
        first = ' '.join(data.get('Given Name', ()))
        item['name'] = '%s, %s' % (last, first)
        item['other'] = data

        return super(DsiSpider, self).refine_item(response, item)
示例#5
0
    def refine_item(self, response, item):
        """Complete ``item`` with chronology data and rewritten link lists.

        ``text.extract_dict`` is called with the ``th`` and ``td`` cells of the
        table inside ``div#person-chronology`` (``sep=' '``). Its result is kept
        as ``item['other']['biography']`` and queried for the birth
        (``'Born in'``) and death (``'Deceased in'``, else ``'Deceased on'``)
        entries.

        The ``publications``, ``websites``, ``bibliography`` and
        ``participated_in`` lists under ``item['other']`` are rebuilt by passing
        each entry, along with ``response.url``, to ``self.make_url_absolute``.

        Returns the result of the parent class's ``refine_item``.
        """
        def resolve_all(urls):
            return [self.make_url_absolute(response.url, u) for u in urls]

        facts = text.extract_dict(response,
                                  'xpath:.//div[@id="person-chronology"]//table//th',
                                  'xpath:.//div[@id="person-chronology"]//table//td',
                                  sep=' ')

        extras = item['other']
        for key in ['publications', 'websites', 'bibliography', 'participated_in']:
            extras[key] = resolve_all(extras[key])

        extras['biography'] = facts

        item['birth'] = facts.get('Born in')
        # Prefer 'Deceased in'; fall back to 'Deceased on' when it is missing.
        deceased_on = facts.get('Deceased on')
        item['death'] = facts.get('Deceased in', deceased_on)

        return super(StructuraeNetSpider, self).refine_item(response, item)
示例#6
0
 def parse_microdata(self, response):
     """Return ``text.extract_dict``'s result for the ``section#mdata`` spans.

     The same ``span`` selector is supplied for both selector arguments;
     ``'./@itemprop'`` and ``'./text()'`` follow as the next two positional
     arguments (presumably the per-node key and value sub-selectors --
     confirm against ``text.extract_dict``).
     """
     span_selector = 'xpath:.//section[@id="mdata"]//span'
     return text.extract_dict(response, span_selector, span_selector,
                              './@itemprop', './text()')
示例#7
0
 def parse_microdata(self, response):
     """Run ``text.extract_dict`` over the spans inside ``section#mdata``.

     Both selector arguments point at the same ``span`` nodes. The trailing
     ``'./@itemprop'`` and ``'./text()'`` are forwarded positionally
     (NOTE(review): they look like key/value sub-selectors; verify in
     ``text.extract_dict``).

     Returns whatever ``text.extract_dict`` returns.
     """
     selectors = (
         'xpath:.//section[@id="mdata"]//span',
         'xpath:.//section[@id="mdata"]//span',
         './@itemprop',
         './text()',
     )
     return text.extract_dict(response, *selectors)