def parse_related_documents(self, response):
    """
    Collect the relations listed on a document's related-documents page.

    A ``LegalAct`` loader is created for the page and its ``_id`` is read
    from the header row.  When no ``_id`` can be extracted, a placeholder
    of the form ``NONUMBER-<p_id>`` is used instead, where ``p_id`` comes
    from the query string of ``response.url``.

    Every row of the relations table is then inspected: the text of the
    sixth cell is stripped, lower-cased and looked up in a label table.
    Rows with a recognised label contribute the document id taken from the
    fourth cell to the matching relation list; rows with a missing or
    unrecognised label are ignored.

    Yields a single loaded item.  The ``relations`` field is only set when
    at least one relation was recognised.
    """
    # Lower-cased relation label as shown on the page -> key under which
    # the related document id is stored.
    relation_keys = {
        u'pakeistas dokumentas': 'amends',
        u'ankstesnė dokumento redakcija': 'amends',
        u'priimtas dokumentas': 'adopts',
        u'ryšys su taikymą nusakančiu dokumentu': 'defines_applicability',
        u'ryšys su galiojimą nusakančiu dokumentu': 'defines_validity',
        u'negalioja de jure': 'defines_as_no_longer_valid',
        u'kitas projekto variantas': 'new_draft_version',
        u'ryšys su ratifikavimo dokumentu': 'ratification',
    }

    # One selector serves both the header and the relations table.
    page = HtmlXPathSelector(response)

    xpath = '/html/body/div/table/tr[3]/td/table/tr/td/table/tr'
    hxs = page.select(xpath)
    act = Loader(self, response, LegalAct(), hxs, required=('_id', ))
    act.add_xpath('_id', 'td[2]/b/text()')
    if not act.get_output_value('_id'):
        p_id = unicode(self._get_query_attr(response.url, 'p_id'))
        act.replace_value('_id', u'NONUMBER-%s' % p_id)

    relations = defaultdict(list)
    xpath = '/html/body/div/table/tr[3]/td/table/tr/td/align/table/tr'
    for row in page.select(xpath):
        docid = get_all(row, 'td[4]/span//text()')
        rel_type = row.select('td[6]/span/text()')
        if not rel_type:
            # No relation label in this row; an empty selector list is
            # not hashable, so it must not reach the lookup below.
            continue
        label = rel_type.extract()[0].strip().lower()
        key = relation_keys.get(label)
        if key is not None:
            relations[key].append(docid)

    if relations:
        act.add_value('relations', dict(relations))

    yield act.load_item()
def _parse_law_act(self, response, hxs, base=False):
    """
    Read the basic document fields and hand back a law act loader.

    The document language is taken from the fourth cell of the first row
    of ``hxs``.  An unrecognised language is reported through
    ``self.error``; any language other than Lithuanian makes the method
    return ``None``.

    Parameters:

    base
        When true, return the loader as soon as ``_id`` is set.  This is
        used when a law act is being filled in from several law act
        documents and only the identifying information is needed.

    """
    known_languages = (u'lietuvių', u'rusų', u'anglų', u'ispanų')
    language_texts = hxs.select('tr[1]/td[4]/b/text()').extract()
    lang = language_texts[0].strip().lower()

    if lang not in known_languages:
        self.error(response, 'Unknown language: %s' % lang)

    if lang != u'lietuvių':
        return None

    act = Loader(self, response, LegalAct(), hxs, required=REQUIRED_FIELDS)
    act.add_xpath('_id', 'tr[1]/td[2]/b/text()')

    source = self._get_source(response.url, 'p_id')
    if not act.get_output_value('_id'):
        # Documents without a number get a placeholder id built from the
        # source id.
        act.replace_value('_id', u'NONUMBER-%s' % source['id'])

    if not base:
        full_fields = (
            ('name', 'caption/text()'),
            ('kind', 'tr[1]/td[1]/b/text()'),
            ('number', 'tr[1]/td[2]/b/text()'),
            ('date', 'tr[1]/td[3]/b/text()'),
        )
        for field_name, field_xpath in full_fields:
            act.add_xpath(field_name, field_xpath)
        act.add_value('source', source)
        self._fix_name_case(act)

    return act
def _parse_person_details(self, response):
    """
    Build a ``Person`` item from the table whose ``summary`` attribute is
    "Seimo narys".

    Fields are filled in this order: ``_id``, the labelled details
    (constituency, raised_by, email, phone, office address, home page,
    candidate page), ``source``, photo, parliament term and membership
    group, name, party group, the groups handled by ``_parse_groups``,
    date of birth, biography and finally earlier parliament terms.

    Raises ``IndexError`` when the main table, the photo or the name
    cannot be found, because their first match is taken unconditionally.
    """
    hxs = Selector(response).xpath('//table[@summary="Seimo narys"]')[0]

    source = self._get_source(response.url, 'p_asm_id')
    seimas_code = self._get_query_attr(response.url, 'p_r')
    if seimas_code:
        source['version'] = seimas_version_map[int(seimas_code)]
    person_id = source['id']

    person_hxs = hxs.xpath('tr/td/table/tr/td[2]/table/tr[2]/td[2]')
    person = Loader(self, response, Person(), person_hxs,
                    required=('first_name', 'last_name'))
    person.add_value('_id', '%sp' % person_id)

    # Details: labels that split the flattened cell text into key/value
    # pairs.
    detail_labels = [
        u'asmeniniai puslapiai',
        u'asmeninis elektroninis paštas',
        u'biuro adresas',
        u'darbo telefonas',
        u'iškėlė',
        u'išrinktas',
        u'kabinetas',
        u'kandidato puslapis',
        u'padėjėja sekretorė',
        u'seimo narys',
    ]
    detail_text = ' '.join(person_hxs.xpath('descendant::text()').extract())
    label_aliases = mapwords({
        u'išrinkta': u'išrinktas',
        u'seimo narė': u'seimo narys',
        u'el p': u'asmeninis elektroninis paštas',
        u'asmeninė svetainė': u'asmeniniai puslapiai',
    })
    details = dict(str2dict(detail_labels, detail_text,
                            normalize=label_aliases))

    person.add_value('constituency', [details.get(u'išrinktas', '')])
    person.add_value('raised_by', [details.get(u'iškėlė', '')])
    person.add_value(
        'email',
        split_by_comma(details.get(u'asmeninis elektroninis paštas', '')))
    person.add_value(
        'phone',
        split_by_comma(details.get(u'darbo telefonas', '')))
    person.add_value('office_address', [details.get(u'biuro adresas', '')])
    person.add_xpath(
        'home_page',
        u'a[contains(font/text(), "Asmeniniai puslapiai") or contains(font/text(), "Asmeninė svetainė")]/@href'
    )
    person.add_xpath('candidate_page',
                     'a[contains(text(), "Kandidato puslapis")]/@href')
    person.add_value('source', source)

    # Photo: the same URL feeds both the item field and the image list.
    photo_url = hxs.xpath('tr/td/table/tr/td/div/img/@src').extract()[0]
    for photo_field in ('photo', 'image_urls'):
        person.add_value(photo_field, photo_url)

    header_hxs = hxs.xpath('tr/td/table/tr/td[2]/table/tr/td[2]')

    # Parliament term, with the \x97 separator replaced by a hyphen.
    term_parts = header_hxs.xpath('div/b/font/text()').re(
        r'(\d{4}[-\x97]\d{4})')
    parliament = ''.join(term_parts).replace(u'\x97', u'-')
    person.add_value('parliament', parliament)

    if u'seimo narys' in details:
        membership = dict(str2dict(['nuo', 'iki'], details[u'seimo narys']))
        person.add_value('groups', [{
            'type': 'parliament',
            'name': parliament,
            'position': u'seimo narys',
            'membership': [membership['nuo'], membership.get('iki')],
        }])

    # Name (first name, last name)
    full_name = header_hxs.xpath('div/b/font[2]/text()').extract()[0]
    self._parse_name(person, full_name)

    # Groups: the nominating party first, then the rest.
    party_name = person.get_output_value('raised_by')
    if party_name:
        party_group = {'type': 'party', 'name': party_name,
                       'position': 'narys'}
        person.add_value('groups', [party_group])

    self._parse_groups(response, hxs, person)

    # Date of birth: non-breaking spaces are translated to plain spaces
    # before the text is matched against dob_re.
    dob_xpath = (u'tr/td/table/'
                 u'tr[contains(descendant::text(), "Biografija")]/'
                 u'following-sibling::tr/td/'
                 u'descendant::*[contains(text(), "Gimė")]/text()')
    dob_parts = hxs.xpath(u'translate(%s, "\xa0", " ")' % dob_xpath).re(dob_re)
    if dob_parts:
        year, month_name, day = dob_parts
        person.add_value('dob', u'%s-%02d-%s' % (
            year, month_names_map[month_name], day.zfill(2)))

    # Biography
    bio_xpath = (u'tr/td/table/'
                 u'tr[contains(descendant::text(), "Biografija")]/'
                 u'following-sibling::tr/td/div')
    self._parse_biography(response, person, hxs.xpath(bio_xpath))

    # Parliamentary history: one 'parliament' value per matched text node.
    history_xpath = (u'//table[@summary="Istorija"]/'
                     u'tr/td/a[starts-with(b/text(), "Buvo išrinkta")]/'
                     u'following-sibling::text()')
    for history_item in hxs.xpath(history_xpath):
        past_term = ''.join(history_item.re(r'(\d{4}) (-) (\d{4})'))
        person.add_value('parliament', [past_term])

    return person.load_item()
def _parse_person_details(self, response):
    """
    Build a ``Person`` item from the ``page-content`` div of the response.

    Fields are filled in this order: ``_id``, phone, email, office address
    (always empty for now), home page, raised_by, constituency,
    ``source``, photo, parliament term and membership group, first and
    last name, party group, the groups handled by ``_parse_groups`` and
    finally earlier parliament terms.

    Raises ``IndexError`` when the ``page-content`` div or either of the
    name elements cannot be found, because their first match is taken
    unconditionally.
    """
    # One selector is enough for every document-wide query below.
    page = Selector(response)
    hxs = page.xpath('//div[contains(@id,"page-content")]')[0]

    source = self._get_source(response.url, 'p_asm_id')
    seimas_code = self._get_query_attr(response.url, 'p_r')
    if seimas_code:
        source['version'] = seimas_version_map[int(seimas_code)]
    _id = source['id']

    person_hxs = hxs.xpath('div/div[contains(@class, "col1")]')
    person = Loader(self, response, Person(), person_hxs,
                    required=('first_name', 'last_name'))
    person.add_value('_id', '%sp' % _id)

    # Details: labels that split the flattened column text into key/value
    # pairs.
    split = [
        u'asmeniniai puslapiai',
        u'asmeninis elektroninis paštas',
        u'biuro adresas',
        u'darbo telefonas',
        u'iškėlė',
        u'išrinktas',
        u'kabinetas',
        u'kandidato puslapis',
        u'padėjėja sekretorė',
        u'seimo narys',
        u'buvo išrinktas',
        u'buvo išrinkta',
        u'kontaktai',
    ]
    details = ' '.join(person_hxs.xpath('descendant::text()').extract())
    details = str2dict(split, details, normalize=mapwords({
        u'išrinkta': u'išrinktas',
        u'seimo narė': u'seimo narys',
    }))
    details = dict(details)

    # Contacts live in a separate column and are split the same way.
    contacts_hxs = hxs.xpath(
        'div/div[contains(@class, "col3")]'
        '/div[contains(@class, "kontaktai")]'
    )
    contacts = ' '.join(contacts_hxs.xpath('descendant::text()').extract())
    contacts_split = [
        u'el p',
        u'tel',
        u'asmeninė svetainė',
    ]
    contacts = dict(str2dict(contacts_split, contacts))

    tel = contacts.get(u'tel')
    if tel:
        # Keep digits only.
        person.add_value('phone', re.sub("[^0-9]", "", tel))

    email_xpath = 'div/div[contains(descendant::text(), "El. p.")]/a/text()'
    for email in contacts_hxs.xpath(email_xpath):
        person.add_value('email', email.extract())

    # TODO: the office address is not extracted from this page layout.
    person.add_value('office_address', [''])

    website_hxs = contacts_hxs.xpath(
        'div/div[contains(@class, "site")]/a/@href'
    )
    if website_hxs:
        person.add_value('home_page', website_hxs.extract()[0])

    person.add_value('raised_by', [details.get(u'iškėlė', '')])
    person.add_value('constituency', [details.get(u'išrinktas', '')])
    person.add_value('source', source)

    # photo
    # first for P leader
    # second for the rest
    photo_selectors = [
        '//*[@id="page-content"]/div/div[1]/div[1]/img/@src',
        '//*[contains(@class, "seimo-nario-foto")]/img/@src',
    ]
    photo = None
    for photo_selector in photo_selectors:
        photo = page.xpath(photo_selector).extract()
        if photo:
            break
    if photo:
        person.add_value('photo', photo[0])
        person.add_value('image_urls', photo[0])

    # parliament
    parliament = hxs.xpath(
        'div/div/div[contains(@class, "smn-kadencija")]/span/text()'
    )
    parliament = parliament.re(r'(\d{4}[^-]\d{4})')
    parliament = ''.join(parliament).replace(u'\u2013', u'-')
    person.add_value('parliament', parliament)

    if u'seimo narys' in details:
        keys = ['nuo', 'iki']
        membership = dict(str2dict(keys, details[u'seimo narys']))
        parliament_group = {
            'type': 'parliament',
            'name': parliament,
            'position': u'seimo narys',
            'membership': [membership['nuo'], membership.get('iki')],
        }
        person.add_value('groups', [parliament_group])

    first_name = page.xpath(
        '//*/div[contains(@class, "smn-name")]/text()').extract()[0]
    last_name = page.xpath(
        '//*/span[contains(@class, "smn-pavarde")]/text()').extract()[0]
    person.add_value('first_name', unicode(first_name))
    person.add_value('last_name', unicode(last_name.title()))

    # groups
    party_name = person.get_output_value('raised_by')
    if party_name:
        person.add_value('groups', [{'type': 'party',
                                     'name': party_name,
                                     'position': 'narys'}])

    self._parse_groups(response, hxs, person)

    # biography_xpath = 'div/div[2]/div[3]/div/table[2]/tbody'
    # biography_hxs = hxs.xpath(biography_xpath)
    # self._parse_biography(person, biography_hxs)

    # parliamentary history
    xpath = (
        u'div/div[contains(@class, "col1")]/'
        u'p[contains(@class, "buvo-isrinkta")]/descendant::text()'
    )
    for item in hxs.xpath(xpath):
        term = ''.join(item.re(r'(\d{4}[^-]\d{4})'))
        # Normalise the separator the same way as the current term above
        # (u'\u2013'), so all 'parliament' values share one format.  The
        # u'\x97' replacement is kept from the previous code.
        term = term.replace(u'\u2013', u'-').replace(u'\x97', u'-')
        person.add_value('parliament', [term])

    return person.load_item()