Python extract_text示例，jonas.items.extract_text Python示例

示例#1

0

显示文件

文件： jonas_manuscript.py 项目： aipberoun/spider-farm

    def parse(self, response):
        """Populate a ``JonasManuscriptItem`` from a manuscript detail page.

        Most fields are read straight from fixed XPath locations.  Three
        need post-processing: ``number`` (the digits captured from the
        works-count heading), ``composition_period`` / ``known_work``
        (text shaped like ``<period> (<note>)``) and ``bibliography``
        (one cleaned string per ``div.bibliolink``).

        A missing source node is handled like a non-matching pattern: the
        field falls back to ``u''`` instead of raising ``IndexError`` and
        aborting the whole item.

        :param response: page response; must provide ``xpath()`` and is
            also handed to ``ItemLoader``.
        :returns: the result of ``ItemLoader.load_item()``.
        """
        self.log("Our starting URL is %s" % self.start_urls)

        def first_or_empty(xpath):
            # First node selected by ``xpath``, or u'' when nothing matches.
            nodes = response.xpath(xpath).extract()
            return nodes[0] if nodes else u''

        loader = ItemLoader(item=JonasManuscriptItem(), response=response)

        loader.add_xpath('signature', '/html/body/div[1]/div[3]/fieldset[1]/a[2]/p/span/text()')
        loader.add_xpath('permalink', '/html/body/div[1]/div[3]/fieldset[2]/p/a/text()')

        loader.add_xpath('main_dating', '/html/body/div[1]/div[3]/div[2]/div[2]/table[1]/tr[4]/td[2]/text()')
        loader.add_xpath('language', '/html/body/div[1]/div[3]/div[2]/div[2]/table[1]/tr[5]/td[2]/text()')
        loader.add_xpath('input_status', '/html/body/div[1]/div[3]/div[2]/div[2]/table[1]/tr[7]/td[2]/text()')

        # Backslashes are doubled (a ur'' literal is not valid on Python 3)
        # so the pattern value is unchanged but contains no invalid string
        # escapes.
        count_re = re.compile(u'(\\d*) œuvre\\(s\\) :')
        count_match = count_re.search(
            first_or_empty('//div[@class="titre_contenu_bloc"]/text()'))
        number = count_match.group(1) if count_match else u''
        loader.add_value('number', number)

        loader.add_xpath('author', '//span[@class="auteur"]/text()')
        loader.add_xpath('title', '//span[@class="titre"]/text()')
        loader.add_xpath('incipit', '//span[@class="incipit"]/text()')
        # 'foliation' is not extracted: no foliation on the test page.

        loader.add_xpath('state_of_witness', '//div[@class="contenu_temoin"]/table/tr[1]/td[2]/text()')
        period_re = re.compile(u'(.*) (?:\\((.*)\\))')
        com_text = first_or_empty('//div[@class="contenu_temoin"]/table/tr[2]/td[2]/text()')
        period_match = period_re.search(com_text)
        if period_match:
            # A list comprehension (not map) so the result can be unpacked
            # on Python 3 as well; falsy groups become u''.
            composition_period, known_work = [
                group or u'' for group in period_match.groups()]
        else:
            composition_period = known_work = u''
            self.log('Error while parsing composition period.\n%s' % com_text)
        loader.add_value('composition_period', composition_period)
        loader.add_value('known_work', known_work)
        loader.add_xpath('acronym', '//div[@class="contenu_temoin"]/table/tr[8]/td[2]/text()')

        # Keep the first element returned by extract_text() for each entry
        # and turn non-breaking spaces (u'\xa0') into plain spaces.
        bibliography = [
            extract_text(entry)[0].replace(u'\xa0', u' ')
            for entry in response.xpath('//div[@class="bibliolink"]').extract()
        ]
        loader.add_value('bibliography', bibliography)

        return loader.load_item()

示例#2

0

显示文件

文件： jonas_author.py 项目： aipberoun/spider-farm

    def parse(self, response):
        """Populate a ``JonasAuthorItem`` from an author detail page.

        Collects the author's identification fields, the associated works
        (links, incipits, titles and editorial notes), the associated
        manuscripts, the bibliography (links, author/date, and the
        name/title/in-work/pages parts split out by a regex), the per-entry
        topic and the list of signatures.

        :param response: page response; must provide ``xpath()`` and
            ``url`` and is also handed to ``ItemLoader``.
        :returns: the result of ``ItemLoader.load_item()``.
        """
        self.log("Our starting URL is %s" % self.start_urls)

        def absolutize_url(rel_url):
            # Resolve a (possibly relative) href against the page URL.
            return urljoin(response.url, rel_url)

        l = ItemLoader(item=JonasAuthorItem(), response=response)

        l.add_xpath('author', '/html/body/div[1]/div[3]/div[2]/div[2]/table[1]/tr/td[2]/text()')
        l.add_xpath('permalink', '/html/body/div[1]/div[3]/fieldset[2]/p/a/text()')
        l.add_xpath('born_after', '/html/body/div[1]/div[3]/div[2]/div[2]/table[2]/tr/td[2]/text()')
        l.add_xpath('born_before', '/html/body/div[1]/div[3]/div[2]/div[2]/table[3]/tr/td[2]/text()')
        l.add_xpath('dead_after', '/html/body/div[1]/div[3]/div[2]/div[2]/table[4]/tr/td[2]/text()')
        l.add_xpath('dead_before', '/html/body/div[1]/div[3]/div[2]/div[2]/table[5]/tr/td[2]/text()')

        l.add_xpath('oeuvres_link_detailed_works',
                    '//div[@id="blocAssociationsOeuvres"]/div/a/@href',
                    MapCompose(absolutize_url)
                    )
        # The whole <span> is selected (not its text()) because some text()
        # nodes are empty.
        oeuvres = '//div[@id="blocAssociationsOeuvres"]/div/span[@class="%s"]'
        l.add_xpath('incipit', oeuvres % 'curincipitoeuvre')
        l.add_xpath('oeuvres_title', oeuvres % 'curtitreoeuvre')

        # One slot per "association" div; an "ed_com" div fills the slot of
        # the association that precedes it in document order.
        editorial_notes = []
        blocks = response.xpath('//div[@id="blocAssociationsOeuvres"]/div[@class="ed_com" or @class="association"]')
        for block in blocks:
            if block.xpath('@class').extract()[0] == u'association':
                editorial_notes.append(u'')
                continue
            if not editorial_notes:
                # Previously an IndexError: there is no slot to attach to.
                self.log('Editorial note found before any association; ignored.')
                continue
            note = block.xpath('text()').extract()
            if note:  # an empty note div leaves the slot as u''
                editorial_notes[-1] = note[0]
        l.add_value('editorial_note', editorial_notes)

        l.add_xpath('associated_link_detailed_works',
                    '//div[@id="blocAssociationsParutions"]/div/a/@href',
                    MapCompose(absolutize_url)
                    )
        l.add_xpath('associated_manuscripts', '//div[@id="blocAssociationsParutions"]/div[@class="association"]')

        l.add_xpath('bibliography_link_detailed_works',
                    '//div[@id="blocBibliographies"]/div/a/@href',
                    MapCompose(absolutize_url)
                    )
        l.add_xpath('author_date', '//div[@class="bibliolink"]/span[1]/text()')

        # NOTE(review): entries the regex cannot parse are only logged, so
        # these four lists may be shorter than 'author_date' / 'topic'.
        name, title, in_work, pages = [], [], [], []
        bib_re = re.compile(r'(.*?), (.*?),(?: in : (.*),)? \d{4}(?: ?[:,] (.*))?\.')
        for span in response.xpath('//div[@class="bibliolink"]/span[2]').extract():
            # u'\xa0' is a non-breaking space.
            extracted_text = extract_text(span)[0].replace(u'\xa0', u' ')
            found = bib_re.search(extracted_text)
            if not found:
                self.log('Bibliography not parsed correctly:\n\n%s\n' % extracted_text)
                continue
            # Optional groups that did not take part in the match are None;
            # store u'' instead.  A list (not map) keeps this indexable on
            # Python 3.
            parts = [group or u'' for group in found.groups()]
            name.append(parts[0])
            title.append(parts[1])
            in_work.append(parts[2])
            pages.append(parts[3])
        l.add_value('complete_name', name)
        l.add_value('bibliography_title', title)
        l.add_value('in_work', in_work)
        l.add_value('pages', pages)

        # One topic per bibliography entry, u'' when it has no "info" div text.
        topics = []
        for entry in response.xpath('//div[@class="bibliolink"]'):
            info = entry.xpath('./div[@class="info"]/text()').extract()
            topics.append(info[0] if info else u'')
        l.add_value('topic', topics)

        # A lambda instead of ``unicode.strip``: ``unicode`` does not exist
        # on Python 3.
        l.add_xpath('signature', '//ul[@id="listeSignatures"]/li/text()',
                    MapCompose(lambda text: text.strip()))

        return l.load_item()

示例#3

0

显示文件

文件： jonas_author.py 项目： orazionelson/spider-farm

    def parse(self, response):
        """Build a ``JonasAuthorItem`` out of an author detail page.

        Fields gathered: identification data (author, permalink, birth and
        death bounds), associated works with their editorial notes,
        associated manuscripts, bibliography (links, author/date and the
        regex-split name / title / in-work / pages), per-entry topic and
        signatures.

        :param response: page response; must provide ``xpath()`` and
            ``url`` and is also handed to ``ItemLoader``.
        :returns: the result of ``ItemLoader.load_item()``.
        """
        self.log("Our starting URL is %s" % self.start_urls)

        def absolutize_url(rel_url):
            # Resolve a (possibly relative) href against the page URL.
            return urljoin(response.url, rel_url)

        l = ItemLoader(item=JonasAuthorItem(), response=response)

        l.add_xpath(
            'author',
            '/html/body/div[1]/div[3]/div[2]/div[2]/table[1]/tr/td[2]/text()')
        l.add_xpath('permalink',
                    '/html/body/div[1]/div[3]/fieldset[2]/p/a/text()')
        l.add_xpath(
            'born_after',
            '/html/body/div[1]/div[3]/div[2]/div[2]/table[2]/tr/td[2]/text()')
        l.add_xpath(
            'born_before',
            '/html/body/div[1]/div[3]/div[2]/div[2]/table[3]/tr/td[2]/text()')
        l.add_xpath(
            'dead_after',
            '/html/body/div[1]/div[3]/div[2]/div[2]/table[4]/tr/td[2]/text()')
        l.add_xpath(
            'dead_before',
            '/html/body/div[1]/div[3]/div[2]/div[2]/table[5]/tr/td[2]/text()')

        l.add_xpath('oeuvres_link_detailed_works',
                    '//div[@id="blocAssociationsOeuvres"]/div/a/@href',
                    MapCompose(absolutize_url))
        # Select the <span> itself rather than text(): some text() nodes
        # are empty.
        oeuvres = '//div[@id="blocAssociationsOeuvres"]/div/span[@class="%s"]'
        l.add_xpath('incipit', oeuvres % 'curincipitoeuvre')
        l.add_xpath('oeuvres_title', oeuvres % 'curtitreoeuvre')

        # Each "association" div opens a new (initially empty) note slot;
        # an "ed_com" div supplies the note for the most recent slot.
        notes = []
        sel = response.xpath(
            '//div[@id="blocAssociationsOeuvres"]/div[@class="ed_com" or @class="association"]'
        )
        for div in sel:
            is_association = div.xpath('@class').extract()[0] == u'association'
            if is_association:
                notes.append(u'')
            elif notes:
                note_text = div.xpath('text()').extract()
                if note_text:  # an empty note div keeps the u'' placeholder
                    notes[-1] = note_text[0]
            else:
                # Used to raise IndexError: no association precedes the note.
                self.log(
                    'Editorial note found before any association; ignored.')
        l.add_value('editorial_note', notes)

        l.add_xpath('associated_link_detailed_works',
                    '//div[@id="blocAssociationsParutions"]/div/a/@href',
                    MapCompose(absolutize_url))
        l.add_xpath(
            'associated_manuscripts',
            '//div[@id="blocAssociationsParutions"]/div[@class="association"]')

        l.add_xpath('bibliography_link_detailed_works',
                    '//div[@id="blocBibliographies"]/div/a/@href',
                    MapCompose(absolutize_url))
        l.add_xpath('author_date', '//div[@class="bibliolink"]/span[1]/text()')

        # NOTE(review): entries that fail the regex are logged and skipped,
        # so these lists can end up shorter than 'author_date' / 'topic'.
        name, title, in_work, pages = [], [], [], []
        r = re.compile(
            r'(.*?), (.*?),(?: in : (.*),)? \d{4}(?: ?[:,] (.*))?\.')
        for raw_span in response.xpath(
                '//div[@class="bibliolink"]/span[2]').extract():
            extracted_text = extract_text(raw_span)[0].replace(
                u'\xa0', u' ')  # u'\xa0' is a non-breaking space
            res = r.search(extracted_text)
            if res:
                # Replace None (unmatched optional groups) with u''.  Built
                # as a list so indexing also works on Python 3, where map()
                # returns an iterator.
                fields = [g if g else u'' for g in res.groups()]
                name.append(fields[0])
                title.append(fields[1])
                in_work.append(fields[2])
                pages.append(fields[3])
            else:
                self.log('Bibliography not parsed correctly:\n\n%s\n' %
                         extracted_text)
        l.add_value('complete_name', name)
        l.add_value('bibliography_title', title)
        l.add_value('in_work', in_work)
        l.add_value('pages', pages)

        # One topic per bibliography entry; u'' when the entry has no
        # "info" div text.
        topic = []
        for bib_entry in response.xpath('//div[@class="bibliolink"]'):
            addition = bib_entry.xpath('./div[@class="info"]/text()').extract()
            topic.append(addition[0] if addition else u'')
        l.add_value('topic', topic)

        # ``unicode`` is undefined on Python 3, so strip through a lambda.
        l.add_xpath('signature', '//ul[@id="listeSignatures"]/li/text()',
                    MapCompose(lambda value: value.strip()))

        return l.load_item()

示例#4

0

显示文件

文件： jonas_work.py 项目： orazionelson/spider-farm

    def parse(self, response):
        """Populate a ``JonasWorkItem`` from a work detail page.

        Besides the fields read directly by XPath, this splits the
        composition cell (shaped like ``<period> (<note>)``) into
        ``composition_period`` and ``note_work``, extracts the digits in
        front of "témoin" as ``number_of_witnesses`` and builds one cleaned
        bibliography string per ``div.bibliolink``.

        When a source node is absent the corresponding field falls back to
        ``u''`` (and the usual parse error is logged) instead of raising
        ``IndexError``.

        :param response: page response; must provide ``xpath()`` and
            ``url`` and is also handed to ``ItemLoader``.
        :returns: the result of ``ItemLoader.load_item()``.
        """
        self.log("Our starting URL is %s" % self.start_urls)

        def absolutize_url(rel_url):
            # Resolve a (possibly relative) href against the page URL.
            return urljoin(response.url, rel_url)

        def first_or_empty(xpath):
            # First node selected by ``xpath``, or u'' when nothing matches.
            nodes = response.xpath(xpath).extract()
            return nodes[0] if nodes else u''

        l = ItemLoader(item=JonasWorkItem(), response=response)

        l.add_xpath('permalink', '/html/body/div[1]/div[3]/fieldset[2]/p/a/text()')

        l.add_xpath('title', '//td[@class="titre"]/text()')
        l.add_xpath('author', '//td[@class="auteur"]/text()')
        l.add_xpath('incipit', '//td[@class="incipit"]/text()')
        l.add_xpath('shape', '//table[@class="table_identification"]/tr[8]/td[2]/text()')

        # Backslashes are doubled (a ur'' literal is not valid on Python 3)
        # so the pattern value is unchanged but free of invalid escapes.
        period_re = re.compile(u'(.*) (?:\\((.*)\\))')
        com_text = first_or_empty('//table[@class="table_identification"]/tr[12]/td[2]/text()')
        period_match = period_re.search(com_text)
        if period_match:
            # List comprehension rather than map(): unpackable on Python 3
            # too; falsy groups become u''.
            composition_period, note_work = [
                group or u'' for group in period_match.groups()]
        else:
            composition_period = note_work = u''
            self.log('Error while parsing composition period.\n%s' % com_text)
        l.add_value('composition_period', composition_period)
        l.add_value('note_work', note_work)

        l.add_xpath('language', '//table[@class="table_identification"]/tr[13]/td[2]/text()')
        l.add_xpath('other_authors', '//table[@class="table_autres"]/tr/td[2]/ul/li/a/span/text()')
        l.add_xpath('role', '//table[@class="table_autres"]/tr/td[2]/ul/li/span[1]/text()')
        l.add_xpath('hierarchy', '//ul[@class="thesaurus"]//text()')

        l.add_xpath('associated_link_detailed_works',
                    '//div[@class="association"]/a/@href',
                    MapCompose(absolutize_url)
                    )
        l.add_xpath('associated_author', '//div[@class="association"]/span[@class="curauteuroeuvre"]/text()')
        l.add_xpath('associated_title', '//div[@class="association"]/span[@class="curtitreoeuvre"]/text()')
        l.add_xpath('associated_incipit', '//div[@class="association"]/span[@class="curincipitoeuvre"]/text()')

        witness_re = re.compile(u'(\\d*) témoin')
        # u'\xa0' is a non-breaking space; normalise it before matching.
        wit = first_or_empty('//div[@id="temoins"]/div[2]/text()').replace(u'\xa0', u' ')
        witness_match = witness_re.search(wit)
        if witness_match:
            num_wit = witness_match.group(1)
        else:
            num_wit = u''
            self.log("Error parsing number of witnesses:\n%s" % wit)
        l.add_value('number_of_witnesses', num_wit)

        l.add_xpath('manuscripts', '//div[@class="un_temoin temoin"]')

        l.add_xpath('bibliography_link',
                    '//div[@id="blocBibliographies"]/div/a/@href',
                    MapCompose(absolutize_url)
                    )
        # Keep the first element returned by extract_text() for each entry,
        # with non-breaking spaces replaced by plain spaces.
        bib = [
            extract_text(entry)[0].replace(u'\xa0', u' ')
            for entry in response.xpath('//div[@class="bibliolink"]').extract()
        ]
        l.add_value('bibliography', bib)

        return l.load_item()

示例#5

0

显示文件

文件： jonas_manuscript.py 项目： orazionelson/spider-farm

    def parse(self, response):
        """Build a ``JonasManuscriptItem`` out of a manuscript detail page.

        Direct XPath fields are loaded as-is.  ``number`` is the digit run
        captured from the works-count heading, ``composition_period`` and
        ``known_work`` come from a cell shaped like ``<period> (<note>)``,
        and ``bibliography`` holds one cleaned string per
        ``div.bibliolink``.

        If the heading or the composition cell is absent, the affected
        fields are set to ``u''`` rather than letting ``IndexError`` abort
        the item; this mirrors what already happens when the regex does
        not match.

        :param response: page response; must provide ``xpath()`` and is
            also handed to ``ItemLoader``.
        :returns: the result of ``ItemLoader.load_item()``.
        """
        self.log("Our starting URL is %s" % self.start_urls)

        l = ItemLoader(item=JonasManuscriptItem(), response=response)

        l.add_xpath('signature',
                    '/html/body/div[1]/div[3]/fieldset[1]/a[2]/p/span/text()')
        l.add_xpath('permalink',
                    '/html/body/div[1]/div[3]/fieldset[2]/p/a/text()')

        l.add_xpath(
            'main_dating',
            '/html/body/div[1]/div[3]/div[2]/div[2]/table[1]/tr[4]/td[2]/text()'
        )
        l.add_xpath(
            'language',
            '/html/body/div[1]/div[3]/div[2]/div[2]/table[1]/tr[5]/td[2]/text()'
        )
        l.add_xpath(
            'input_status',
            '/html/body/div[1]/div[3]/div[2]/div[2]/table[1]/tr[7]/td[2]/text()'
        )

        # Doubled backslashes keep the pattern value identical while
        # avoiding invalid string escapes (ur'' is not valid on Python 3).
        r = re.compile(u'(\\d*) œuvre\\(s\\) :')
        headings = response.xpath(
            '//div[@class="titre_contenu_bloc"]/text()').extract()
        # No heading at all is treated the same as a heading that does not
        # match: the number is left empty.
        res = r.search(headings[0]) if headings else None
        number = res.group(1) if res else u''
        l.add_value('number', number)

        l.add_xpath('author', '//span[@class="auteur"]/text()')
        l.add_xpath('title', '//span[@class="titre"]/text()')
        l.add_xpath('incipit', '//span[@class="incipit"]/text()')
        # 'foliation' is not extracted: no foliation on the test page.

        l.add_xpath('state_of_witness',
                    '//div[@class="contenu_temoin"]/table/tr[1]/td[2]/text()')
        r = re.compile(u'(.*) (?:\\((.*)\\))')
        cells = response.xpath(
            '//div[@class="contenu_temoin"]/table/tr[2]/td[2]/text()'
        ).extract()
        com_text = cells[0] if cells else u''
        res = r.search(com_text)
        if res:
            # Built as a list so it can be indexed on Python 3 as well
            # (map() returns an iterator there); falsy groups become u''.
            groups = [g if g else u'' for g in res.groups()]
            composition_period = groups[0]
            known_work = groups[1]
        else:
            composition_period = u''
            known_work = u''
            self.log('Error while parsing composition period.\n%s' % com_text)
        l.add_value('composition_period', composition_period)
        l.add_value('known_work', known_work)
        l.add_xpath('acronym',
                    '//div[@class="contenu_temoin"]/table/tr[8]/td[2]/text()')

        bib = []
        for raw_entry in response.xpath(
                '//div[@class="bibliolink"]').extract():
            # First element returned by extract_text(), with non-breaking
            # spaces (u'\xa0') replaced by plain spaces.
            bib.append(extract_text(raw_entry)[0].replace(u'\xa0', u' '))
        l.add_value('bibliography', bib)

        return l.load_item()