def _parse_question_speakers(self, response, hxs, item, position):
    for speaker in hxs.select('b[position()>%d]' % position):
        dspeaker = Loader(self, response, QuestionDocumentSpeaker(),
                          speaker, required=('name',))
        dspeaker.add_xpath('name', 'text()')
        speaker_details = (speaker.select('following::text()').
                           extract()[0])
        if (speaker_details and speaker_details.startswith(', ') and
                len(speaker_details) > 4):
            # Workaround for names that themselves contain a comma: such
            # commas are replaced with the url-quoted escape '%2c', the
            # whole string is then split by commas, and the resulting
            # parts are unquoted back.
            speaker_details = speaker_details.replace(
                u'Švietimo, mokslo', u'Švietimo%2c mokslo')
            speaker_details = map(lambda x: urllib.unquote(x.strip()),
                                  speaker_details.split(','))
            # speaker_details = filter(None, speaker_details)
            dspeaker.reset_required('name', 'position',)
            inc = Increment(-1)
            if len(speaker_details) > 0:
                dspeaker.add_value('position', speaker_details[inc()])
            if len(speaker_details) == 3:
                dspeaker.add_value('committee', speaker_details[inc()])
            if len(speaker_details) > 1:
                dspeaker.add_value('institution', speaker_details[inc()])
        item.add_value('speakers', dict(dspeaker.load_item()))
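# `Increment` is imported from the scraper's helper utilities and is not
# part of this excerpt. A minimal sketch consistent with the usage above
# (seeded with -1 and called once per value pulled out of
# `speaker_details`) might look like the following; whether the real
# helper returns the seed first and which direction it steps are
# assumptions here:
class Increment(object):
    """Callable counter: returns its current value, then advances it."""

    def __init__(self, start=0):
        self.value = start

    def __call__(self):
        value = self.value
        self.value += 1
        return value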
def _parse_project_row(self, xs, response):
    loader = Loader(self, item=ProposedLawProjectProposer(), selector=xs,
                    response=response)
    doc_id = self._get_query_attr(xs.xpath('td[3]/a/@href').extract()[0],
                                  'p_id')
    loader.add_value('id', doc_id)

    isodate = xs.xpath('td[2]/text()').extract()[0]
    proposal_date = datetime.date(*map(int, isodate.split('-')))
    loader.add_value('date', proposal_date)

    loader.add_xpath('project_name', 'td[3]/text()')
    loader.add_xpath('project_url', 'td[3]/a/@href')
    loader.add_value('source', self._get_source(response.url, 'p_asm_id'))
    loader.add_value('project_number', self._extract_proposal_no(xs))

    passed_xs = xs.xpath('td[4]/a')
    if passed_xs:
        passed = Loader(self, item=PassedLawProjectProposer(),
                        selector=passed_xs, response=response)
        doc_id = self._get_query_attr(
            passed_xs.xpath('@href').extract()[0], 'p_id'
        )
        passed.add_value('id', doc_id)
        doc_number = self._extract_passed_no(passed_xs)
        passed.add_value('passing_number', doc_number)
        passed.add_xpath('passing_url', '@href')
        passed.add_value('source',
                         self._get_source(response.url, 'p_asm_id'))
        loader.add_value('passed', passed.load_item())

    yield loader
def _parse_question_agenda(self, response, hxs, question):
    date = question['session']['date']
    registration = None
    for item in hxs:
        # Registration
        if item.select('td[2]/a[1][contains(@href, "p_reg_id=")]'):
            registration = Loader(self, response, Registration(), item,
                                  required=('datetime',))
            url = item.select('td[2]/a[1]/@href').extract()[0]
            _id = text_type(self._get_query_attr(url, 'p_reg_id'))
            registration.add_value('id', _id)
            registration.add_value('datetime', date)
            registration.add_xpath('datetime', 'td[1]/text()')
            registration.add_xpath('joined', 'td[2]/b[1]/text()')

        # Voting
        if item.select('td[2]/a[1][contains(@href, "p_bals_id=")]'):
            voting = self._get_question_voting(response, item, question)
            votes = sum([int(voting.get_output_value('vote_%s' % f))
                         for f in ('aye', 'no', 'abstain')])
            voting.add_value('total_votes', text_type(votes))
            if registration:
                registration = dict(registration.load_item())
                joined = int(registration['joined'])
                # Registered MPs who did not cast a vote.
                voting.add_value('no_vote', text_type(joined - votes))
                voting.add_value('registration', registration)
                registration = None
            yield voting.load_item()
def _get_question_voting(self, response, item, question):
    date = question['session']['date']

    required = (
        '_id',
        'datetime',
        'vote_aye',
        'vote_no',
        'vote_abstain',
        'total_votes',
        'question',
        'source',
    )

    if item.select(u'td[2][contains(a,"alternatyvus balsavimas:")]'):
        voting_type = u'alternatyvus'
        required += ('formulation_a', 'formulation_b',)
    else:
        voting_type = u'paprastas'

    voting = Loader(self, response, Voting(), item, required=required)

    url = item.select('td[2]/a[1]/@href').extract()[0]
    source = self._get_source_absolute_url(response, url, 'p_bals_id')
    _id = source['id']
    voting.add_value('_id', '%sv' % _id)
    voting.add_value('type', voting_type)
    voting.add_value('datetime', date)
    voting.add_xpath('datetime', 'td[1]/text()')
    voting.add_value('question', question['_id'])

    if voting_type == u'alternatyvus':
        voting.add_xpath('formulation_a', 'td[2]/text()[3]')
        voting.add_xpath('formulation_b', 'td[2]/text()[5]')
        self._parse_question_votes(voting, (2, 4, 5, 6))
    else:
        formulation = item.select('td[2]/text()[2]').extract()[0].strip()
        # If the formulation node is equal to '(už', it means there is no
        # formulation at all.
        if formulation.endswith(u'(už'):
            if not formulation == u'(už':
                voting.add_value('formulation', formulation[:-3])
            voting_positions = (1, 2, 3)
        else:
            voting.add_value('formulation', formulation)
            voting_positions = (2, 3, 4, 1)

        if item.select('td[2]/b'):
            self._parse_question_votes(voting, voting_positions)
        else:
            self._parse_question_votes(voting, None)

    voting.add_value('source', source)

    return voting
def _get_session(self, response, hxs):
    session_id = hxs.select('div[1]/a[1]').re(r'p_ses_id=(\d+)')
    hxs = hxs.select("div[2]/b")
    session = Loader(self, response, Session(), hxs, required=(
        'id', 'fakt_pos_id', 'number', 'date', 'type',))
    session.add_value('id', session_id)
    session.add_value('fakt_pos_id',
                      hxs.select('a[1]').re(r'p_fakt_pos_id=(-\d+)'))
    session.add_value('number', hxs.select('a[1]/text()').re(r'Nr. (\d+)'))
    session.add_xpath('date', 'a[2]/text()')
    session.add_xpath('type', 'a[3]/text()')
    return dict(session.load_item())
def parse_related_documents(self, response):
    xpath = '/html/body/div/table/tr[3]/td/table/tr/td/table/tr'
    hxs = HtmlXPathSelector(response).select(xpath)
    act = Loader(self, response, LegalAct(), hxs, required=('_id',))
    act.add_xpath('_id', 'td[2]/b/text()')
    if not act.get_output_value('_id'):
        p_id = unicode(self._get_query_attr(response.url, 'p_id'))
        act.replace_value('_id', u'NONUMBER-%s' % p_id)

    relations = defaultdict(list)
    xpath = '/html/body/div/table/tr[3]/td/table/tr/td/align/table/tr'
    for row in HtmlXPathSelector(response).select(xpath):
        docid = get_all(row, 'td[4]/span//text()')
        rel_type = row.select('td[6]/span/text()')
        if rel_type:
            rel_type = rel_type.extract()[0].strip().lower()
            if rel_type in (u'pakeistas dokumentas',
                            u'ankstesnė dokumento redakcija'):
                relations['amends'].append(docid)
            elif rel_type == u'priimtas dokumentas':
                relations['adopts'].append(docid)
            elif rel_type == u'ryšys su taikymą nusakančiu dokumentu':
                relations['defines_applicability'].append(docid)
            elif rel_type == u'ryšys su galiojimą nusakančiu dokumentu':
                relations['defines_validity'].append(docid)
            elif rel_type == u'negalioja de jure':
                relations['defines_as_no_longer_valid'].append(docid)
            elif rel_type == u'kitas projekto variantas':
                relations['new_draft_version'].append(docid)
            elif rel_type == u'ryšys su ratifikavimo dokumentu':
                relations['ratification'].append(docid)

    if relations:
        act.add_value('relations', dict(relations))

    yield act.load_item()
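# `get_all` comes from the scraper's helper module and is not shown in
# this excerpt. A plausible minimal sketch, assuming it simply extracts
# and joins every text node matched by the given XPath (the stripping and
# the empty joining separator are assumptions):
def get_all(selector, xpath):
    return u''.join(part.strip() for part in selector.select(xpath).extract())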
def parse_person_votes(self, response):
    xpath = ('/html/body/div/table/tr[3]/td/table/tr/td/align/'
             'div[contains(h4,"rezultatai")]/table')
    hxs = HtmlXPathSelector(response).select(xpath)[0]

    source = self._get_source_absolute_url(response, response.url,
                                           'p_bals_id')
    _id = source['id']

    voting = Loader(self, response, Voting(), hxs, required=(
        '_id', 'datetime', 'votes',))
    voting.add_value('_id', '%sv' % _id)

    datetime_xpath_base = '/html/body/div/table/tr[3]/td/table/tr/td/'
    date = hxs.xpath(datetime_xpath_base +
                     'div[2]/b/a[2]/text()')[0].extract()
    time = hxs.xpath(datetime_xpath_base + (
        'align/text()[contains(., "Balsavimo laikas")]/'
        'following-sibling::b[1]/text()'
    ))[0].extract()
    timestamp = '%s %s' % (date, time)
    # Validate the timestamp format before storing it.
    datetime.datetime.strptime(timestamp, '%Y-%m-%d %H:%M:%S')
    voting.add_value('datetime', timestamp)

    self._parse_voting_legal_acts(response, voting)

    for person in hxs.select('tr'):
        if person.select('th'):
            continue  # Skip header row.
        p_vote = Loader(self, response, PersonVote(), person, required=(
            '_id', 'voting_id', 'person', 'fraction', 'vote',))
        p_id = person.select('td[1]/a/@href').re(r'p_asm_id=(-?\d+)')[0]
        vote_value = self._get_vote_value(person)
        # Only include votes that were actually cast.
        if vote_value != 'no-vote':
            p_vote.add_value('_id', '%s:%s' % (_id, p_id))
            p_vote.add_value('voting_id', '%sv' % _id)
            p_vote.add_value('person', '%sp' % p_id)
            p_vote.add_xpath('name', 'td[1]/a/text()')
            p_vote.add_xpath('fraction', 'td[2]/text()')
            p_vote.add_value('datetime', timestamp)
            p_vote.add_value('vote', vote_value)
            p_vote = p_vote.load_item()
            voting.add_value('votes', dict(p_vote))
            yield p_vote

    yield voting.load_item()
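# `_get_vote_value` is defined elsewhere in the spider. A purely
# hypothetical sketch of the mapping it performs, assuming the vote label
# is readable as text in the last cell of the person's table row; the real
# selector and the exact markup of the results table may differ:
def _get_vote_value(self, person):
    label = u''.join(
        person.select('td[last()]//text()').extract()
    ).strip().lower()
    return {
        u'už': 'aye',
        u'prieš': 'no',
        u'susilaikė': 'abstain',
    }.get(label, 'no-vote')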
def _parse_group_items(self, response, person, items, group_type):
    for group_hxs in items.xpath('tr'):
        group_data_hxs = group_hxs.xpath('td[2]')
        group = Loader(self, response, Group(), group_data_hxs,
                       required=('name', 'position'))
        group.add_value('type', group_type)
        group.add_xpath('name', 'a/text()')
        group.add_xpath('source', 'a/@href')
        meta = ''.join(group_hxs.xpath('text() | */text()').extract())
        position, membership = group_meta_re.match(meta).groups()
        group.add_value('position', position)
        membership = date_re.findall(membership or '')
        if len(membership) == 1:
            membership.append(None)
        group.add_value('membership', membership)
        person.add_value('groups', [dict(group.load_item())])
def _parse_group_items(self, response, person, items, group_type):
    for group_hxs in items:
        group = Loader(self, response, Group(), group_hxs,
                       required=('name', 'position'))
        group.add_value('type', group_type)
        group.add_xpath('name', 'a/text()')
        group.add_xpath('source', 'a/@href')
        meta = ''.join(group_hxs.xpath('text() | */text()').extract())
        position, membership = group_meta_re.match(meta).groups()
        group.add_value('position', position)
        membership = date_re.findall(membership or '')
        if len(membership) == 1:
            membership.append(None)
        group.add_value('membership', membership)
        person.add_value('groups', [dict(group.load_item())])
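# `group_meta_re` and `date_re` are module-level patterns defined outside
# this excerpt. Purely hypothetical stand-ins, assuming the row text looks
# roughly like ", narys (nuo 2012-11-16 iki 2016-11-14)"; the real
# patterns and the exact page text may differ:
import re

group_meta_re = re.compile(r'^[\s,]*([^(]+?)\s*(?:\((.+)\))?\s*$')
date_re = re.compile(r'\d{4}-\d{2}-\d{2}')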
def _parse_law_act(self, response, hxs, base=False):
    """
    Extracts basic document information and returns a law act loader.

    Parameters:

    base
        Return only base information about the document. This is used
        when filling in some bits of information for a law act from
        several law act documents.

    """
    lang = hxs.select('tr[1]/td[4]/b/text()').extract()[0].strip().lower()
    if lang not in (u'lietuvių', u'rusų', u'anglų', u'ispanų'):
        self.error(response, 'Unknown language: %s' % lang)
    if lang != u'lietuvių':
        return None

    act = Loader(self, response, LegalAct(), hxs, required=REQUIRED_FIELDS)
    act.add_xpath('_id', 'tr[1]/td[2]/b/text()')
    source = self._get_source(response.url, 'p_id')
    if not act.get_output_value('_id'):
        act.replace_value('_id', u'NONUMBER-%s' % source['id'])

    if base:
        return act

    act.add_xpath('name', 'caption/text()')
    act.add_xpath('kind', 'tr[1]/td[1]/b/text()')
    act.add_xpath('number', 'tr[1]/td[2]/b/text()')
    act.add_xpath('date', 'tr[1]/td[3]/b/text()')
    act.add_value('source', source)
    self._fix_name_case(act)
    return act
def _get_voting_documents(self, response, hxs):
    qdoc = Loader(self, response, VotingDocument(), hxs, required=(
        'id', 'name', 'type', 'number',))
    d_id = hxs.select('b[2]/a[1]/@href').re(r'p_id=(-?\d+)')
    if not d_id:
        return None
    qdoc.add_value('id', u'%sd' % d_id[0])
    qdoc.add_xpath('name', 'b[1]/a/text()')
    qdoc.add_value(
        'type',
        hxs.select('b[1]/following::text()[1]').re('^; (.+)')
    )
    # Matches document numbers such as "XIP-1234" or "XIP-1234(2)".
    number_re = (r'[A-Z]{1,4}'
                 r'-'
                 r'\d+'
                 r'(([a-zA-Z0-9]{1,2})?(\([^)]{1,4}\))?)*')
    qdoc.add_value(
        'number',
        hxs.select('b[1]//text()').re(r'\(Nr. (%s)\)' % number_re)[0]
    )
    return qdoc.load_item()
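# A quick illustration of what the number pattern above is meant to pick
# up, using a made-up snippet of document text; the sample string is an
# assumption, not actual page content:
import re

_number_re = (r'[A-Z]{1,4}'
              r'-'
              r'\d+'
              r'(([a-zA-Z0-9]{1,2})?(\([^)]{1,4}\))?)*')
_m = re.search(r'\(Nr. (%s)\)' % _number_re,
               u'Projekto pataisos (Nr. XIP-1234(2))')
assert _m.group(1) == u'XIP-1234(2)'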
def _parse_person_details(self, response):
    xpath = '//table[@summary="Seimo narys"]'
    hxs = Selector(response).xpath(xpath)[0]

    source = self._get_source(response.url, 'p_asm_id')
    seimas_code = self._get_query_attr(response.url, 'p_r')
    if seimas_code:
        source['version'] = seimas_version_map[int(seimas_code)]
    _id = source['id']

    person_hxs = hxs.xpath('tr/td/table/tr/td[2]/table/tr[2]/td[2]')
    person = Loader(self, response, Person(), person_hxs,
                    required=('first_name', 'last_name'))
    person.add_value('_id', '%sp' % _id)

    # Details
    split = [
        u'asmeniniai puslapiai',
        u'asmeninis elektroninis paštas',
        u'biuro adresas',
        u'darbo telefonas',
        u'iškėlė',
        u'išrinktas',
        u'kabinetas',
        u'kandidato puslapis',
        u'padėjėja sekretorė',
        u'seimo narys',
    ]
    details = ' '.join(person_hxs.xpath('descendant::text()').extract())
    details = str2dict(split, details, normalize=mapwords({
        u'išrinkta': u'išrinktas',
        u'seimo narė': u'seimo narys',
        u'el p': u'asmeninis elektroninis paštas',
        u'asmeninė svetainė': u'asmeniniai puslapiai',
    }))
    details = dict(details)
    email = details.get(u'asmeninis elektroninis paštas', '')
    phone = details.get(u'darbo telefonas', '')
    person.add_value('constituency', [details.get(u'išrinktas', '')])
    person.add_value('raised_by', [details.get(u'iškėlė', '')])
    person.add_value('email', split_by_comma(email))
    person.add_value('phone', split_by_comma(phone))
    person.add_value('office_address', [details.get(u'biuro adresas', '')])
    person.add_xpath(
        'home_page',
        u'a[contains(font/text(), "Asmeniniai puslapiai") or '
        u'contains(font/text(), "Asmeninė svetainė")]/@href'
    )
    person.add_xpath('candidate_page',
                     'a[contains(text(), "Kandidato puslapis")]/@href')
    person.add_value('source', source)

    # photo
    photo = hxs.xpath('tr/td/table/tr/td/div/img/@src').extract()[0]
    person.add_value('photo', photo)
    person.add_value('image_urls', photo)

    header_hxs = hxs.xpath('tr/td/table/tr/td[2]/table/tr/td[2]')

    # parliament
    parliament = header_hxs.xpath('div/b/font/text()')
    parliament = parliament.re(r'(\d{4}[-\x97]\d{4})')
    parliament = ''.join(parliament).replace(u'\x97', u'-')
    person.add_value('parliament', parliament)

    if u'seimo narys' in details:
        keys = ['nuo', 'iki']
        membership = dict(str2dict(keys, details[u'seimo narys']))
        parliament_group = {
            'type': 'parliament',
            'name': parliament,
            'position': u'seimo narys',
            'membership': [membership['nuo'], membership.get('iki')],
        }
        person.add_value('groups', [parliament_group])

    # name (first name, last name)
    name = header_hxs.xpath('div/b/font[2]/text()').extract()[0]
    self._parse_name(person, name)

    # groups
    party_name = person.get_output_value('raised_by')
    if party_name:
        person.add_value('groups', [{'type': 'party',
                                     'name': party_name,
                                     'position': 'narys'}])
    self._parse_groups(response, hxs, person)

    # date of birth
    xpath = (u'tr/td/table/'
             u'tr[contains(descendant::text(), "Biografija")]/'
             u'following-sibling::tr/td/'
             u'descendant::*[contains(text(), "Gimė")]/text()')
    dob_hxs = hxs.xpath(u'translate(%s, "\xa0", " ")' % xpath)
    dob_match = dob_hxs.re(dob_re)
    if dob_match:
        year, month, day = dob_match
        month = month_names_map[month]
        dob = u'%s-%02d-%s' % (year, month, day.zfill(2))
        person.add_value('dob', dob)

    # biography
    xpath = (u'tr/td/table/'
             u'tr[contains(descendant::text(), "Biografija")]/'
             u'following-sibling::tr/td/div')
    bio_hxs = hxs.xpath(xpath)
    self._parse_biography(response, person, bio_hxs)

    # parliamentary history
    xpath = (u'//table[@summary="Istorija"]/'
             u'tr/td/a[starts-with(b/text(), "Buvo išrinkta")]/'
             u'following-sibling::text()')
    history_hxs = hxs.xpath(xpath)
    if history_hxs:
        for item in history_hxs:
            parliament = ''.join(item.re(r'(\d{4}) (-) (\d{4})'))
            person.add_value('parliament', [parliament])

    return person.load_item()
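# `month_names_map` is defined elsewhere in the scraper. A sketch of what
# it most plausibly contains: Lithuanian month names in the genitive case,
# as they appear in biography lines such as "Gimė 1956 m. kovo 5 d."
# (the exact keys used by the real module are an assumption):
month_names_map = {
    u'sausio': 1, u'vasario': 2, u'kovo': 3, u'balandžio': 4,
    u'gegužės': 5, u'birželio': 6, u'liepos': 7, u'rugpjūčio': 8,
    u'rugsėjo': 9, u'spalio': 10, u'lapkričio': 11, u'gruodžio': 12,
}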