def parse(self, response):
    """Build a ``JonasManuscriptItem`` from a manuscript detail page.

    Most fields are filled directly from XPath expressions through an
    ``ItemLoader`` bound to ``response``. The work count and the
    composition period are first isolated from free text with regular
    expressions and then added as plain values.

    Args:
        response: page to scrape; it must offer ``xpath()`` and is handed
            to the ``ItemLoader``.

    Returns:
        The item produced by ``ItemLoader.load_item()``.
    """
    self.log("Our starting URL is %s" % self.start_urls)

    def first_text(xpath):
        # A page lacking the node yields u'' instead of an IndexError from
        # indexing an empty extract() result.
        matches = response.xpath(xpath).extract()
        return matches[0] if matches else u''

    loader = ItemLoader(item=JonasManuscriptItem(), response=response)
    loader.add_xpath('signature', '/html/body/div[1]/div[3]/fieldset[1]/a[2]/p/span/text()')
    loader.add_xpath('permalink', '/html/body/div[1]/div[3]/fieldset[2]/p/a/text()')
    loader.add_xpath('main_dating', '/html/body/div[1]/div[3]/div[2]/div[2]/table[1]/tr[4]/td[2]/text()')
    loader.add_xpath('language', '/html/body/div[1]/div[3]/div[2]/div[2]/table[1]/tr[5]/td[2]/text()')
    loader.add_xpath('input_status', '/html/body/div[1]/div[3]/div[2]/div[2]/table[1]/tr[7]/td[2]/text()')

    # Number of works: the digits in front of the "œuvre(s) :" heading.
    number_text = first_text('//div[@class="titre_contenu_bloc"]/text()')
    count_match = re.compile(u'(\d*) œuvre\(s\) :').search(number_text)
    loader.add_value('number', count_match.group(1) if count_match else u'')

    loader.add_xpath('author', '//span[@class="auteur"]/text()')
    loader.add_xpath('title', '//span[@class="titre"]/text()')
    loader.add_xpath('incipit', '//span[@class="incipit"]/text()')
    # 'foliation' is not scraped: no foliation on the test page.
    loader.add_xpath('state_of_witness', '//div[@class="contenu_temoin"]/table/tr[1]/td[2]/text()')

    # The cell reads "<period> (<note>)"; split it into its two parts.
    com_text = first_text('//div[@class="contenu_temoin"]/table/tr[2]/td[2]/text()')
    period_match = re.compile(u'(.*) (?:\((.*)\))').search(com_text)
    if period_match:
        # Empty or unmatched groups are normalised to u''.
        composition_period, known_work = [g or u'' for g in period_match.groups()]
    else:
        composition_period = known_work = u''
        self.log('Error while parsing composition period.\n%s' % com_text)
    loader.add_value('composition_period', composition_period)
    loader.add_value('known_work', known_work)

    loader.add_xpath('acronym', '//div[@class="contenu_temoin"]/table/tr[8]/td[2]/text()')

    bibliography = [
        extract_text(html)[0].replace(u'\xa0', u' ')  # u'\xa0' is a non-breaking space
        for html in response.xpath('//div[@class="bibliolink"]').extract()
    ]
    loader.add_value('bibliography', bibliography)
    return loader.load_item()
def parse(self, response):
    """Build a ``JonasAuthorItem`` from an author detail page.

    Fields are filled through an ``ItemLoader`` bound to ``response``.
    Link fields are made absolute against ``response.url``. Editorial
    notes, bibliography entries and topics are assembled by hand into
    lists before being added as values.

    Args:
        response: page to scrape; it must offer ``xpath()`` and ``url``
            and is handed to the ``ItemLoader``.

    Returns:
        The item produced by ``ItemLoader.load_item()``.
    """
    self.log("Our starting URL is %s" % self.start_urls)

    def absolutize_url(rel_url):
        return urljoin(response.url, rel_url)

    loader = ItemLoader(item=JonasAuthorItem(), response=response)
    loader.add_xpath('author', '/html/body/div[1]/div[3]/div[2]/div[2]/table[1]/tr/td[2]/text()')
    loader.add_xpath('permalink', '/html/body/div[1]/div[3]/fieldset[2]/p/a/text()')
    loader.add_xpath('born_after', '/html/body/div[1]/div[3]/div[2]/div[2]/table[2]/tr/td[2]/text()')
    loader.add_xpath('born_before', '/html/body/div[1]/div[3]/div[2]/div[2]/table[3]/tr/td[2]/text()')
    loader.add_xpath('dead_after', '/html/body/div[1]/div[3]/div[2]/div[2]/table[4]/tr/td[2]/text()')
    loader.add_xpath('dead_before', '/html/body/div[1]/div[3]/div[2]/div[2]/table[5]/tr/td[2]/text()')

    loader.add_xpath('oeuvres_link_detailed_works',
                     '//div[@id="blocAssociationsOeuvres"]/div/a/@href',
                     MapCompose(absolutize_url))
    # Whole <span> elements are selected because some text() nodes are empty.
    oeuvres = '//div[@id="blocAssociationsOeuvres"]/div/span[@class="%s"]'
    loader.add_xpath('incipit', oeuvres % 'curincipitoeuvre')
    loader.add_xpath('oeuvres_title', oeuvres % 'curtitreoeuvre')

    # One slot per "association" div; a following "ed_com" div overwrites
    # the slot of the association before it.
    editorial_notes = []
    blocks = response.xpath(
        '//div[@id="blocAssociationsOeuvres"]/div[@class="ed_com" or @class="association"]')
    for block in blocks:
        # The predicate above guarantees @class is present.
        if block.xpath('@class').extract()[0] == u'association':
            editorial_notes.append(u'')
            continue
        note = block.xpath('text()').extract()
        if not note:
            # An "ed_com" div without text leaves the slot empty.
            continue
        if editorial_notes:
            editorial_notes[-1] = note[0]
        else:
            # A note with no preceding association has no slot to fill.
            self.log('Editorial note without a preceding work:\n%s' % note[0])
    loader.add_value('editorial_note', editorial_notes)

    loader.add_xpath('associated_link_detailed_works',
                     '//div[@id="blocAssociationsParutions"]/div/a/@href',
                     MapCompose(absolutize_url))
    loader.add_xpath('associated_manuscripts',
                     '//div[@id="blocAssociationsParutions"]/div[@class="association"]')
    loader.add_xpath('bibliography_link_detailed_works',
                     '//div[@id="blocBibliographies"]/div/a/@href',
                     MapCompose(absolutize_url))
    loader.add_xpath('author_date', '//div[@class="bibliolink"]/span[1]/text()')

    # Entry layout: "<name>, <title>,[ in : <work>,] <year>[: <pages>]."
    # NOTE(review): this literal was split across a line break in the
    # reviewed source; confirm the single space before \d{4}.
    bib_pattern = re.compile('(.*?), (.*?),(?: in : (.*),)? \d{4}(?: ?[:,] (.*))?\.')
    name, title, in_work, pages = [], [], [], []
    for html in response.xpath('//div[@class="bibliolink"]/span[2]').extract():
        extracted_text = extract_text(html)[0].replace(u'\xa0', u' ')  # u'\xa0' is a non-breaking space
        match = bib_pattern.search(extracted_text)
        if match is None:
            # NOTE(review): unparsed entries are skipped, so these four
            # lists can be shorter than 'author_date' and 'topic'.
            self.log('Bibliography not parsed correctly:\n\n%s\n' % extracted_text)
            continue
        # Optional groups that did not take part in the match become u''.
        groups = [g or u'' for g in match.groups()]
        name.append(groups[0])
        title.append(groups[1])
        in_work.append(groups[2])
        pages.append(groups[3])
    loader.add_value('complete_name', name)
    loader.add_value('bibliography_title', title)
    loader.add_value('in_work', in_work)
    loader.add_value('pages', pages)

    # One topic per bibliography entry; u'' where the entry has no info div.
    topics = []
    for entry in response.xpath('//div[@class="bibliolink"]'):
        addition = entry.xpath('./div[@class="info"]/text()').extract()
        topics.append(addition[0] if addition else u'')
    loader.add_value('topic', topics)

    loader.add_xpath('signature', '//ul[@id="listeSignatures"]/li/text()',
                     MapCompose(unicode.strip))
    return loader.load_item()
def parse(self, response):
    """Build a ``JonasAuthorItem`` from an author detail page.

    Fields are filled through an ``ItemLoader`` bound to ``response``.
    Link fields are made absolute against ``response.url``. Editorial
    notes, bibliography entries and topics are assembled by hand into
    lists before being added as values.

    Args:
        response: page to scrape; it must offer ``xpath()`` and ``url``
            and is handed to the ``ItemLoader``.

    Returns:
        The item produced by ``ItemLoader.load_item()``.
    """
    self.log("Our starting URL is %s" % self.start_urls)

    def absolutize_url(rel_url):
        return urljoin(response.url, rel_url)

    loader = ItemLoader(item=JonasAuthorItem(), response=response)
    loader.add_xpath(
        'author',
        '/html/body/div[1]/div[3]/div[2]/div[2]/table[1]/tr/td[2]/text()')
    loader.add_xpath('permalink',
                     '/html/body/div[1]/div[3]/fieldset[2]/p/a/text()')
    loader.add_xpath(
        'born_after',
        '/html/body/div[1]/div[3]/div[2]/div[2]/table[2]/tr/td[2]/text()')
    loader.add_xpath(
        'born_before',
        '/html/body/div[1]/div[3]/div[2]/div[2]/table[3]/tr/td[2]/text()')
    loader.add_xpath(
        'dead_after',
        '/html/body/div[1]/div[3]/div[2]/div[2]/table[4]/tr/td[2]/text()')
    loader.add_xpath(
        'dead_before',
        '/html/body/div[1]/div[3]/div[2]/div[2]/table[5]/tr/td[2]/text()')

    loader.add_xpath('oeuvres_link_detailed_works',
                     '//div[@id="blocAssociationsOeuvres"]/div/a/@href',
                     MapCompose(absolutize_url))
    # Whole <span> elements are selected because some text() nodes are empty.
    oeuvres = '//div[@id="blocAssociationsOeuvres"]/div/span[@class="%s"]'
    loader.add_xpath('incipit', oeuvres % 'curincipitoeuvre')
    loader.add_xpath('oeuvres_title', oeuvres % 'curtitreoeuvre')

    # One slot per "association" div; a following "ed_com" div overwrites
    # the slot of the association before it.
    editorial_notes = []
    blocks = response.xpath(
        '//div[@id="blocAssociationsOeuvres"]/div[@class="ed_com" or @class="association"]'
    )
    for block in blocks:
        # The predicate above guarantees @class is present.
        is_work = block.xpath('@class').extract()[0] == u'association'
        if is_work:
            editorial_notes.append(u'')
            continue
        note = block.xpath('text()').extract()
        if not note:
            # An "ed_com" div without text leaves the slot empty.
            continue
        if not editorial_notes:
            # A note with no preceding association has no slot to fill.
            self.log('Editorial note without a preceding work:\n%s' % note[0])
            continue
        editorial_notes[-1] = note[0]
    loader.add_value('editorial_note', editorial_notes)

    loader.add_xpath('associated_link_detailed_works',
                     '//div[@id="blocAssociationsParutions"]/div/a/@href',
                     MapCompose(absolutize_url))
    loader.add_xpath(
        'associated_manuscripts',
        '//div[@id="blocAssociationsParutions"]/div[@class="association"]')
    loader.add_xpath('bibliography_link_detailed_works',
                     '//div[@id="blocBibliographies"]/div/a/@href',
                     MapCompose(absolutize_url))
    loader.add_xpath('author_date',
                     '//div[@class="bibliolink"]/span[1]/text()')

    # Entry layout: "<name>, <title>,[ in : <work>,] <year>[: <pages>]."
    # NOTE(review): this literal was split across a line break in the
    # reviewed source; confirm the single space before \d{4}.
    bib_pattern = re.compile('(.*?), (.*?),(?: in : (.*),)? \d{4}(?: ?[:,] (.*))?\.')
    name, title, in_work, pages = [], [], [], []
    for html in response.xpath(
            '//div[@class="bibliolink"]/span[2]').extract():
        extracted_text = extract_text(html)[0].replace(
            u'\xa0', u' ')  # u'\xa0' is a non-breaking space
        match = bib_pattern.search(extracted_text)
        if match is None:
            # NOTE(review): unparsed entries are skipped, so these four
            # lists can be shorter than 'author_date' and 'topic'.
            self.log('Bibliography not parsed correctly:\n\n%s\n' %
                     extracted_text)
            continue
        # Optional groups that did not take part in the match become u''.
        groups = [g or u'' for g in match.groups()]
        name.append(groups[0])
        title.append(groups[1])
        in_work.append(groups[2])
        pages.append(groups[3])
    loader.add_value('complete_name', name)
    loader.add_value('bibliography_title', title)
    loader.add_value('in_work', in_work)
    loader.add_value('pages', pages)

    # One topic per bibliography entry; u'' where the entry has no info div.
    topics = []
    for entry in response.xpath('//div[@class="bibliolink"]'):
        addition = entry.xpath('./div[@class="info"]/text()').extract()
        topics.append(addition[0] if addition else u'')
    loader.add_value('topic', topics)

    loader.add_xpath('signature', '//ul[@id="listeSignatures"]/li/text()',
                     MapCompose(unicode.strip))
    return loader.load_item()
def parse(self, response):
    """Build a ``JonasWorkItem`` from a work detail page.

    Most fields are filled directly from XPath expressions through an
    ``ItemLoader`` bound to ``response``. The composition period and the
    number of witnesses are first isolated from free text with regular
    expressions and then added as plain values.

    Args:
        response: page to scrape; it must offer ``xpath()`` and ``url``
            and is handed to the ``ItemLoader``.

    Returns:
        The item produced by ``ItemLoader.load_item()``.
    """
    self.log("Our starting URL is %s" % self.start_urls)

    def absolutize_url(rel_url):
        return urljoin(response.url, rel_url)

    def first_text(xpath):
        # A page lacking the node yields u'' instead of an IndexError from
        # indexing an empty extract() result.
        matches = response.xpath(xpath).extract()
        return matches[0] if matches else u''

    loader = ItemLoader(item=JonasWorkItem(), response=response)
    loader.add_xpath('permalink', '/html/body/div[1]/div[3]/fieldset[2]/p/a/text()')
    loader.add_xpath('title', '//td[@class="titre"]/text()')
    loader.add_xpath('author', '//td[@class="auteur"]/text()')
    loader.add_xpath('incipit', '//td[@class="incipit"]/text()')
    loader.add_xpath('shape', '//table[@class="table_identification"]/tr[8]/td[2]/text()')

    # The cell reads "<period> (<note>)"; split it into its two parts.
    com_text = first_text('//table[@class="table_identification"]/tr[12]/td[2]/text()')
    period_match = re.compile(u'(.*) (?:\((.*)\))').search(com_text)
    if period_match:
        # Empty or unmatched groups are normalised to u''.
        composition_period, note_work = [g or u'' for g in period_match.groups()]
    else:
        composition_period = note_work = u''
        self.log('Error while parsing composition period.\n%s' % com_text)
    loader.add_value('composition_period', composition_period)
    loader.add_value('note_work', note_work)

    loader.add_xpath('language', '//table[@class="table_identification"]/tr[13]/td[2]/text()')
    loader.add_xpath('other_authors', '//table[@class="table_autres"]/tr/td[2]/ul/li/a/span/text()')
    loader.add_xpath('role', '//table[@class="table_autres"]/tr/td[2]/ul/li/span[1]/text()')
    loader.add_xpath('hierarchy', '//ul[@class="thesaurus"]//text()')

    loader.add_xpath('associated_link_detailed_works',
                     '//div[@class="association"]/a/@href',
                     MapCompose(absolutize_url))
    loader.add_xpath('associated_author', '//div[@class="association"]/span[@class="curauteuroeuvre"]/text()')
    loader.add_xpath('associated_title', '//div[@class="association"]/span[@class="curtitreoeuvre"]/text()')
    loader.add_xpath('associated_incipit', '//div[@class="association"]/span[@class="curincipitoeuvre"]/text()')

    # Number of witnesses: the digits in front of the word "témoin".
    wit = first_text('//div[@id="temoins"]/div[2]/text()')
    wit = wit.replace(u'\xa0', u' ')  # u'\xa0' is a non-breaking space
    witness_match = re.compile(u'(\d*) témoin').search(wit)
    if witness_match:
        num_wit = witness_match.group(1)
    else:
        num_wit = u''
        self.log("Error parsing number of witnesses:\n%s" % wit)
    loader.add_value('number_of_witnesses', num_wit)

    loader.add_xpath('manuscripts', '//div[@class="un_temoin temoin"]')
    loader.add_xpath('bibliography_link',
                     '//div[@id="blocBibliographies"]/div/a/@href',
                     MapCompose(absolutize_url))

    bibliography = [
        extract_text(html)[0].replace(u'\xa0', u' ')  # u'\xa0' is a non-breaking space
        for html in response.xpath('//div[@class="bibliolink"]').extract()
    ]
    loader.add_value('bibliography', bibliography)
    return loader.load_item()
def parse(self, response):
    """Build a ``JonasManuscriptItem`` from a manuscript detail page.

    Most fields are filled directly from XPath expressions through an
    ``ItemLoader`` bound to ``response``. The work count and the
    composition period are first isolated from free text with regular
    expressions and then added as plain values.

    Args:
        response: page to scrape; it must offer ``xpath()`` and is handed
            to the ``ItemLoader``.

    Returns:
        The item produced by ``ItemLoader.load_item()``.
    """
    self.log("Our starting URL is %s" % self.start_urls)

    def first_text(xpath):
        # A page lacking the node yields u'' instead of an IndexError from
        # indexing an empty extract() result.
        matches = response.xpath(xpath).extract()
        if matches:
            return matches[0]
        return u''

    loader = ItemLoader(item=JonasManuscriptItem(), response=response)
    loader.add_xpath('signature',
                     '/html/body/div[1]/div[3]/fieldset[1]/a[2]/p/span/text()')
    loader.add_xpath('permalink',
                     '/html/body/div[1]/div[3]/fieldset[2]/p/a/text()')
    loader.add_xpath(
        'main_dating',
        '/html/body/div[1]/div[3]/div[2]/div[2]/table[1]/tr[4]/td[2]/text()'
    )
    loader.add_xpath(
        'language',
        '/html/body/div[1]/div[3]/div[2]/div[2]/table[1]/tr[5]/td[2]/text()'
    )
    loader.add_xpath(
        'input_status',
        '/html/body/div[1]/div[3]/div[2]/div[2]/table[1]/tr[7]/td[2]/text()'
    )

    # Number of works: the digits in front of the "œuvre(s) :" heading.
    number_text = first_text('//div[@class="titre_contenu_bloc"]/text()')
    count_match = re.compile(u'(\d*) œuvre\(s\) :').search(number_text)
    if count_match:
        number = count_match.group(1)
    else:
        number = u''
    loader.add_value('number', number)

    loader.add_xpath('author', '//span[@class="auteur"]/text()')
    loader.add_xpath('title', '//span[@class="titre"]/text()')
    loader.add_xpath('incipit', '//span[@class="incipit"]/text()')
    # 'foliation' is not scraped: no foliation on the test page.
    loader.add_xpath('state_of_witness',
                     '//div[@class="contenu_temoin"]/table/tr[1]/td[2]/text()')

    # The cell reads "<period> (<note>)"; split it into its two parts.
    com_text = first_text(
        '//div[@class="contenu_temoin"]/table/tr[2]/td[2]/text()')
    period_match = re.compile(u'(.*) (?:\((.*)\))').search(com_text)
    if period_match:
        # Empty or unmatched groups are normalised to u''.
        parts = [g or u'' for g in period_match.groups()]
        composition_period = parts[0]
        known_work = parts[1]
    else:
        composition_period = u''
        known_work = u''
        self.log('Error while parsing composition period.\n%s' % com_text)
    loader.add_value('composition_period', composition_period)
    loader.add_value('known_work', known_work)

    loader.add_xpath('acronym',
                     '//div[@class="contenu_temoin"]/table/tr[8]/td[2]/text()')

    bibliography = []
    for html in response.xpath('//div[@class="bibliolink"]').extract():
        # u'\xa0' is a non-breaking space
        bibliography.append(extract_text(html)[0].replace(u'\xa0', u' '))
    loader.add_value('bibliography', bibliography)
    return loader.load_item()