def parse_field(self, html, fn):
    """Wrap *html* in a minimal XML document, run *fn* on the first
    ``entry`` node and load its result into a LobbyistDeclaration.

    Returns the loaded item converted to a plain dict.
    """
    body = '<book><row>%s</row></book>' % html
    response = XmlResponse('http://localhost/test.html', body=body)
    xml_row = response.css('row')[0]
    entry = response.css('entry')[0]
    loader = Loader(self.spider, response, LobbyistDeclaration(), xml_row)
    loader.add_value(None, fn(entry))
    return dict(loader.load_item())
def parse_field(self, html, fn):
    """Wrap *html* in a one-row HTML table, run *fn* on the first
    ``td`` cell and load its result into a Lobbyist item.

    Returns the loaded item converted to a plain dict.
    """
    body = '<table><tr>%s</tr></table>' % html
    response = HtmlResponse('http://localhost/test.html', body=body)
    table_row = response.css('tr')[0]
    cell = response.css('td')[0]
    loader = Loader(self.spider, response, Lobbyist(), table_row)
    loader.add_value(None, fn(cell))
    return dict(loader.load_item())
def parse_related_documents(self, response):
    """Parse a legal act's related-documents page.

    Yields one LegalAct item whose ``relations`` value (set only when
    at least one relation was found) maps a relation kind to the list
    of related document ids.
    """
    xpath = '/html/body/div/table/tr[3]/td/table/tr/td/table/tr'
    hxs = HtmlXPathSelector(response).select(xpath)
    act = Loader(self, response, LegalAct(), hxs, required=('_id', ))
    act.add_xpath('_id', 'td[2]/b/text()')
    if not act.get_output_value('_id'):
        # Acts without a number get a synthetic id from the p_id query arg.
        p_id = unicode(self._get_query_attr(response.url, 'p_id'))
        act.replace_value('_id', u'NONUMBER-%s' % p_id)

    # Map the (lower-cased) relation label used on the site to the key
    # stored in the item's ``relations`` dict.  The original elif chain
    # contained a duplicated, unreachable 'kitas projekto variantas'
    # branch; a lookup table removes that class of bug.
    relation_keys = {
        u'pakeistas dokumentas': 'amends',
        u'ankstesnė dokumento redakcija': 'amends',
        u'priimtas dokumentas': 'adopts',
        u'ryšys su taikymą nusakančiu dokumentu': 'defines_applicability',
        u'ryšys su galiojimą nusakančiu dokumentu': 'defines_validity',
        u'negalioja de jure': 'defines_as_no_longer_valid',
        u'kitas projekto variantas': 'new_draft_version',
        u'ryšys su ratifikavimo dokumentu': 'ratification',
    }

    relations = defaultdict(list)
    xpath = '/html/body/div/table/tr[3]/td/table/tr/td/align/table/tr'
    for row in HtmlXPathSelector(response).select(xpath):
        docid = get_all(row, 'td[4]/span//text()')
        rel_type = row.select('td[6]/span/text()')
        if rel_type:
            rel_type = rel_type.extract()[0].strip().lower()
            key = relation_keys.get(rel_type)
            # Unknown relation labels are silently ignored, matching
            # the original elif chain which had no final else.
            if key:
                relations[key].append(docid)
    if relations:
        act.add_value('relations', dict(relations))
    yield act.load_item()
def parse_question(self, response):
    """Parse a question page.

    Yields the Question item first, followed by any agenda items
    (votings with their registrations) found on the same page.
    """
    content = HtmlXPathSelector(response).select(
        '/html/body/div/table/tr[3]/td/table/tr/td')[0]
    source = self._get_source_absolute_url(
        response, response.url, 'p_svarst_kl_stad_id')
    question_id = source['id']
    loader = Loader(self, response, Question(), content, required=(
        '_id', 'session', 'documents', 'source',))
    loader.add_value('_id', '%sq' % question_id)
    self._parse_question_documents(response, content, loader)
    loader.add_value('session', self._get_session(response, content))
    loader.add_value('source', source)
    yield loader.load_item()
    agenda_rows = content.select('table[@class="basic"]/tr')
    agenda = self._parse_question_agenda(
        response, agenda_rows, loader.item) or []
    for agenda_item in agenda:
        yield agenda_item
def parse_person_votes(self, response):
    """Parse a voting-results page.

    Yields one PersonVote item per MP who actually voted, followed by
    the Voting item that aggregates them.
    """
    xpath = ('/html/body/div/table/tr[3]/td/table/tr/td/align/'
             'div[contains(h4,"rezultatai")]/table')
    hxs = HtmlXPathSelector(response).select(xpath)[0]
    source = self._get_source_absolute_url(response, response.url,
                                           'p_bals_id')
    _id = source['id']
    voting = Loader(self, response, Voting(), hxs, required=(
        '_id', 'datetime', 'votes',))
    voting.add_value('_id', '%sv' % _id)
    datetime_xpath_base = '/html/body/div/table/tr[3]/td/table/tr/td/'
    date = hxs.xpath(datetime_xpath_base +
                     'div[2]/b/a[2]/text()')[0].extract()
    time = hxs.xpath(datetime_xpath_base + (
        'align/text()[contains(., "Balsavimo laikas")]'
        '/following-sibling::b[1]/text()'
    ))[0].extract()
    timestamp = '%s %s' % (date, time)
    # Validation only: raises ValueError if the page layout changed and
    # the scraped timestamp is not "YYYY-MM-DD HH:MM:SS".
    datetime.datetime.strptime(timestamp, '%Y-%m-%d %H:%M:%S')
    voting.add_value('datetime', timestamp)
    self._parse_voting_legal_acts(response, voting)
    for person in hxs.select('tr'):
        if person.select('th'):
            continue  # Skip header
        p_vote = Loader(self, response, PersonVote(), person, required=(
            '_id', 'voting_id', 'person', 'fraction', 'vote',))
        p_id = person.select('td[1]/a/@href').re(r'p_asm_id=(-?\d+)')[0]
        vote_value = self._get_vote_value(person)
        # Only include MPs that actually voted; 'no-vote' rows are
        # skipped entirely.
        if vote_value != 'no-vote':
            p_vote.add_value('_id', '%s:%s' % (_id, p_id))
            p_vote.add_value('voting_id', '%sv' % _id)
            p_vote.add_value('person', '%sp' % p_id)
            p_vote.add_xpath('name', 'td[1]/a/text()')
            p_vote.add_xpath('fraction', 'td[2]/text()')
            p_vote.add_value('datetime', timestamp)
            p_vote.add_value('vote', vote_value)
            p_vote = p_vote.load_item()
            # Each person vote is both embedded in the voting item and
            # yielded as a stand-alone item.
            voting.add_value('votes', dict(p_vote))
            yield p_vote
    yield voting.load_item()
def _get_voting_documents(self, response, hxs):
    """Build a VotingDocument item from *hxs*.

    Returns None when the selector carries no document id link.
    """
    loader = Loader(self, response, VotingDocument(), hxs, required=(
        'id', 'name', 'type', 'number',))
    doc_ids = hxs.select('b[2]/a[1]/@href').re(r'p_id=(-?\d+)')
    if not doc_ids:
        return None
    loader.add_value('id', u'%sd' % doc_ids[0])
    loader.add_xpath('name', 'b[1]/a/text()')
    doc_type = hxs.select('b[1]/following::text()[1]').re('^; (.+)')
    loader.add_value('type', doc_type)
    # Document numbers look like e.g. "XIIP-123(2)".
    number_re = (r'[A-Z]{1,4}'
                 r'-'
                 r'\d+'
                 r'(([a-zA-Z0-9]{1,2})?(\([^)]{1,4}\))?)*')
    number = hxs.select('b[1]//text()').re(r'\(Nr. (%s)\)' % number_re)[0]
    loader.add_value('number', number)
    return loader.load_item()
def _parse_question_agenda(self, response, hxs, question):
    """Walk the question agenda rows and yield Voting items.

    A registration row precedes the voting row it belongs to; the
    loop therefore carries the last seen registration loader forward
    and attaches it to the next voting before clearing it.
    """
    date = question['session']['date']
    registration = None
    for item in hxs:
        # Registration row: remember it for the next voting.
        if item.select('td[2]/a[1][contains(@href, "p_reg_id=")]'):
            registration = Loader(self, response, Registration(), item,
                                  required=('datetime',))
            url = item.select('td[2]/a[1]/@href').extract()[0]
            _id = text_type(self._get_query_attr(url, 'p_reg_id'))
            registration.add_value('id', _id)
            # Session date first; the row's own time (td[1]) is then
            # added on top of it.
            registration.add_value('datetime', date)
            registration.add_xpath('datetime', 'td[1]/text()')
            registration.add_xpath('joined', 'td[2]/b[1]/text()')
        # Voting row: aggregate vote counts and attach the pending
        # registration, if any.
        if item.select('td[2]/a[1][contains(@href, "p_bals_id=")]'):
            voting = self._get_question_voting(response, item, question)
            votes = sum([int(voting.get_output_value('vote_%s' % f))
                         for f in ('aye', 'no', 'abstain')])
            voting.add_value('total_votes', text_type(votes))
            if registration:
                registration = dict(registration.load_item())
                joined = int(registration['joined'])
                # MPs registered but not voting.
                voting.add_value('no_vote', text_type(joined - votes))
                voting.add_value('registration', registration)
                registration = None
            yield voting.load_item()
def _parse_question_speakers(self, response, hxs, item, position):
    """Extract document speakers (name plus optional position,
    committee and institution) and append them to *item*'s
    ``speakers`` field.
    """
    for speaker in hxs.select('b[position()>%d]' % position):
        dspeaker = Loader(self, response, QuestionDocumentSpeaker(),
                          speaker, required=('name',))
        dspeaker.add_xpath('name', 'text()')
        speaker_details = (speaker.select('following::text()').
                           extract()[0])
        if (speaker_details and speaker_details.startswith(', ') and
                len(speaker_details) > 4):
            # Workaround for detail strings whose parts themselves
            # contain a comma: the known offender is url-quoted first,
            # the string is split on commas, and each part is unquoted
            # back afterwards.
            speaker_details = speaker_details.replace(
                u'Švietimo, mokslo', u'Švietimo%2c mokslo')
            speaker_details = map(lambda x: urllib.unquote(x.strip()),
                                  speaker_details.split(','))
            # speaker_details = filter(None, speaker_details)
            dspeaker.reset_required('name', 'position',)
            # Increment(-1) walks the details list from its end:
            # last element is the position, etc.
            inc = Increment(-1)
            if len(speaker_details) > 0:
                dspeaker.add_value('position', speaker_details[inc()])
            if len(speaker_details) == 3:
                dspeaker.add_value('committee', speaker_details[inc()])
            if len(speaker_details) > 1:
                dspeaker.add_value('institution', speaker_details[inc()])
        item.add_value('speakers', dict(dspeaker.load_item()))
def _parse_group_items(self, response, person, items, group_type):
    """Append one group membership entry per ``tr`` row in *items*
    to *person*'s ``groups`` field."""
    for row_hxs in items.xpath('tr'):
        data_hxs = row_hxs.xpath('td[2]')
        loader = Loader(self, response, Group(), data_hxs,
                        required=('name', 'position'))
        loader.add_value('type', group_type)
        loader.add_xpath('name', 'a/text()')
        loader.add_xpath('source', 'a/@href')
        meta_text = ''.join(row_hxs.xpath('text() | */text()').extract())
        position, membership_text = group_meta_re.match(meta_text).groups()
        loader.add_value('position', position)
        dates = date_re.findall(membership_text or '')
        if len(dates) == 1:
            # A single date means the membership is still open-ended.
            dates.append(None)
        loader.add_value('membership', dates)
        person.add_value('groups', [dict(loader.load_item())])
def _get_question_voting(self, response, item, question):
    """Build and return a Voting loader for one agenda voting row.

    Handles two layouts: 'alternatyvus' (two alternative
    formulations) and 'paprastas' (single optional formulation).
    """
    date = question['session']['date']
    required = (
        '_id', 'datetime', 'vote_aye', 'vote_no', 'vote_abstain',
        'total_votes', 'question', 'source',
    )
    if item.select(u'td[2][contains(a,"alternatyvus balsavimas:")]'):
        voting_type = u'alternatyvus'
        required += ('formulation_a', 'formulation_b',)
    else:
        voting_type = u'paprastas'
    voting = Loader(self, response, Voting(), item, required=required)
    url = item.select('td[2]/a[1]/@href').extract()[0]
    source = self._get_source_absolute_url(response, url, 'p_bals_id')
    _id = source['id']
    voting.add_value('_id', '%sv' % _id)
    voting.add_value('type', voting_type)
    # Session date first; the row's own time (td[1]) is then added.
    voting.add_value('datetime', date)
    voting.add_xpath('datetime', 'td[1]/text()')
    voting.add_value('question', question['_id'])
    if voting_type == u'alternatyvus':
        voting.add_xpath('formulation_a', 'td[2]/text()[3]')
        voting.add_xpath('formulation_b', 'td[2]/text()[5]')
        # Positions of vote-count text nodes in the cell for this
        # layout — presumably tied to the site's markup; confirm
        # against _parse_question_votes.
        self._parse_question_votes(voting, (2, 4, 5, 6))
    else:
        formulation = item.select('td[2]/text()[2]').extract()[0].strip()
        # If the formulation node is equal to '(už' it means that
        # there is no formulation at all.
        if formulation.endswith(u'(už'):
            if not formulation == u'(už':
                # Strip the trailing '(už' marker.
                voting.add_value('formulation', formulation[:-3])
            voting_positions = (1, 2, 3)
        else:
            voting.add_value('formulation', formulation)
            voting_positions = (2, 3, 4, 1)
        if item.select('td[2]/b'):
            self._parse_question_votes(voting, voting_positions)
        else:
            self._parse_question_votes(voting, None)
    voting.add_value('source', source)
    return voting
def _involved_parts(self, response, hxs, act):
    """Parse the "involved parties" line of a legal act page and add
    a DocumentInvolved entry to *act*.

    Returns None (without touching *act*) when the line is absent or
    does not match DOCUMENT_INVOLVED_PARTS.
    """
    involved_string = hxs.select('tr[3]/td[1]/b/text()').extract()
    involved_string = ' '.join(involved_string)
    if not involved_string:
        return None
    m = DOCUMENT_INVOLVED_PARTS.match(involved_string)
    if not m:
        return None
    involved = Loader(self, response, DocumentInvolved(), hxs, required=(
        'date', 'how', 'institution',
    ))
    involved.add_value('date', m.group(1))
    involved.add_value('how', m.group(2).lower())
    institution = m.group(3)
    if ',' in institution:
        # TODO: move this to utility function, same code is also used
        # in manoseimas/scrapy/spiders/mps.py:171
        # Quote the one known institution name that itself contains a
        # comma, split on commas, then unquote each part back.
        spl = institution.replace(u'Švietimo, mokslo',
                                  u'Švietimo%2c mokslo')
        spl = map(lambda x: urllib.unquote(x.strip()), spl.split(','))
        spl = filter(None, spl)
        if len(spl) == 2:
            # "person, institution"
            person, institution = spl
        else:
            # "person, group, institution" — classify the group by a
            # leading or trailing group-type word.
            person, group, institution = spl
            spl = group.strip().split()
            group_types = (u'komitetas', u'grupė', u'frakcija',
                           u'komisija')
            if spl[-1].lower() in group_types:
                group_type = spl[-1].lower()
            elif spl[0].lower() in group_types:
                group_type = spl[0].lower()
            else:
                group_type = None
            if group_type:
                involved.add_value('group', group)
                involved.add_value('group_type', group_type)
            else:
                self.error(response, 'Not committee: %s' % group)
        involved.add_value('person', person)
    involved.add_value('institution', institution)
    act.add_value('involved', dict(involved.load_item()))
def _parse_law_act(self, response, hxs, base=False):
    """
    Extracts basic document information and returns law act loader.

    Parameters:

    base
        Return only base information about document. This is used,
        when filling some information bits to a law act from several
        law act documents.

    Returns None for documents that are not in Lithuanian.
    """
    language = (hxs.select('tr[1]/td[4]/b/text()')
                .extract()[0].strip().lower())
    if language not in (u'lietuvių', u'rusų', u'anglų', u'ispanų'):
        self.error(response, 'Unknown language: %s' % language)
    if language != u'lietuvių':
        return None
    loader = Loader(self, response, LegalAct(), hxs,
                    required=REQUIRED_FIELDS)
    loader.add_xpath('_id', 'tr[1]/td[2]/b/text()')
    source = self._get_source(response.url, 'p_id')
    if not loader.get_output_value('_id'):
        # Acts without a number get a synthetic id from the source id.
        loader.replace_value('_id', u'NONUMBER-%s' % source['id'])
    if base:
        return loader
    loader.add_xpath('name', 'caption/text()')
    loader.add_xpath('kind', 'tr[1]/td[1]/b/text()')
    loader.add_xpath('number', 'tr[1]/td[2]/b/text()')
    loader.add_xpath('date', 'tr[1]/td[3]/b/text()')
    loader.add_value('source', source)
    self._fix_name_case(loader)
    return loader
def _parse_lobbyist(self, response, row_group):
    """Build a LobbyistDeclaration item from a group of table rows.

    Rows have either 4 columns (no clients) or 5 columns (with a
    clients column).  In the 5-column layout the client cell may be
    empty for continuation rows, so the current client is carried
    across the loop; its law projects accumulate on that client until
    a new client (or NO_CLIENT) appears.
    """
    row = row_group[0]
    entries = row.xpath('entry')
    columns = len(entries)
    if columns == 4:
        nr, name, law_projects, comments = entries
    else:
        assert len(entries) == 5
        nr, name, clients, law_projects, comments = entries
    declaration = Loader(self, response, LobbyistDeclaration(), row,
                         required=('name', ))
    declaration.add_value('source_url', response.url)
    declaration.add_value('raw_data',
                          '\n'.join(row.extract() for row in row_group))
    declaration.add_value(None, self._parse_number(nr))
    declaration.add_value(None, self._parse_name(name))
    client = None
    for row in row_group:
        if columns == 5:
            new_client = self._parse_client(row.xpath('entry')[-3])
            if new_client is NO_CLIENT:
                # Explicit "no client" marker resets the carry-over.
                client = None
            elif new_client is not None:
                client = new_client
                declaration.add_value("clients", [client])
        law_projects = self._parse_law_projects(row.xpath('entry')[-2])
        if client is not None:
            # Projects belong to the current client, not the
            # declaration itself.
            client['law_projects'].extend(law_projects)
        else:
            declaration.add_value('law_projects', law_projects)
    declaration.add_value(None, self._parse_comments(comments))
    return declaration.load_item()
def _parse_project_row(self, xs, response):
    """Yield a ProposedLawProjectProposer loader for one table row,
    attaching a PassedLawProjectProposer item when the row links to a
    passed law."""
    loader = Loader(self, item=ProposedLawProjectProposer(), selector=xs,
                    response=response)
    project_href = xs.xpath('td[3]/a/@href').extract()[0]
    loader.add_value('id', self._get_query_attr(project_href, 'p_id'))
    isodate = xs.xpath('td[2]/text()').extract()[0]
    year, month, day = map(int, isodate.split('-'))
    loader.add_value('date', datetime.date(year, month, day))
    loader.add_xpath('project_name', 'td[3]/text()')
    loader.add_xpath('project_url', 'td[3]/a/@href')
    loader.add_value('source', self._get_source(response.url, 'p_asm_id'))
    loader.add_value('project_number', self._extract_proposal_no(xs))
    passed_xs = xs.xpath('td[4]/a')
    if passed_xs:
        passed = Loader(self, item=PassedLawProjectProposer(),
                        selector=passed_xs, response=response)
        passed_href = passed_xs.xpath('@href').extract()[0]
        passed.add_value('id', self._get_query_attr(passed_href, 'p_id'))
        passed.add_value('passing_number',
                         self._extract_passed_no(passed_xs))
        passed.add_xpath('passing_url', '@href')
        passed.add_value('source',
                         self._get_source(response.url, 'p_asm_id'))
        loader.add_value('passed', passed.load_item())
    # Note: the loader itself is yielded, not the loaded item.
    yield loader
def _parse_group_items(self, response, person, items, group_type):
    """Append one group membership entry per selector in *items* to
    *person*'s ``groups`` field."""
    for row_hxs in items:
        loader = Loader(self, response, Group(), row_hxs,
                        required=('name', 'position'))
        loader.add_value('type', group_type)
        loader.add_xpath('name', 'a/text()')
        loader.add_xpath('source', 'a/@href')
        meta_text = ''.join(row_hxs.xpath('text() | */text()').extract())
        position, membership_text = group_meta_re.match(meta_text).groups()
        loader.add_value('position', position)
        dates = date_re.findall(membership_text or '')
        if len(dates) == 1:
            # A single date means the membership is still open-ended.
            dates.append(None)
        loader.add_value('membership', dates)
        person.add_value('groups', [dict(loader.load_item())])
def _parse_person_details(self, response):
    """Parse an MP profile page (table-based layout) into a Person
    item and return it loaded.

    Extracts contact details, photo, parliament term, party/group
    memberships, date of birth, biography and parliamentary history.
    """
    xpath = '//table[@summary="Seimo narys"]'
    hxs = Selector(response).xpath(xpath)[0]
    source = self._get_source(response.url, 'p_asm_id')
    seimas_code = self._get_query_attr(response.url, 'p_r')
    if seimas_code:
        source['version'] = seimas_version_map[int(seimas_code)]
    _id = source['id']
    person_hxs = hxs.xpath('tr/td/table/tr/td[2]/table/tr[2]/td[2]')
    person = Loader(self, response, Person(), person_hxs,
                    required=('first_name', 'last_name'))
    person.add_value('_id', '%sp' % _id)
    # Details: labels used to split the free-form details text into a
    # label -> value mapping (see str2dict).
    split = [
        u'asmeniniai puslapiai',
        u'asmeninis elektroninis paštas',
        u'biuro adresas',
        u'darbo telefonas',
        u'iškėlė',
        u'išrinktas',
        u'kabinetas',
        u'kandidato puslapis',
        u'padėjėja sekretorė',
        u'seimo narys',
    ]
    details = ' '.join(person_hxs.xpath('descendant::text()').extract())
    # Normalize label variants (feminine forms, abbreviations) to the
    # canonical labels listed above.
    details = str2dict(split, details, normalize=mapwords({
        u'išrinkta': u'išrinktas',
        u'seimo narė': u'seimo narys',
        u'el p': u'asmeninis elektroninis paštas',
        u'asmeninė svetainė': u'asmeniniai puslapiai',
    }))
    details = dict(details)
    email = details.get(u'asmeninis elektroninis paštas', '')
    phone = details.get(u'darbo telefonas', '')
    person.add_value('constituency', [details.get(u'išrinktas', '')])
    person.add_value('raised_by', [details.get(u'iškėlė', '')])
    person.add_value('email', split_by_comma(email))
    person.add_value('phone', split_by_comma(phone))
    person.add_value('office_address', [details.get(u'biuro adresas', '')])
    person.add_xpath(
        'home_page',
        u'a[contains(font/text(), "Asmeniniai puslapiai") or '
        u'contains(font/text(), "Asmeninė svetainė")]/@href'
    )
    person.add_xpath('candidate_page',
                     'a[contains(text(), "Kandidato puslapis")]/@href')
    person.add_value('source', source)
    # photo
    photo = hxs.xpath('tr/td/table/tr/td/div/img/@src').extract()[0]
    person.add_value('photo', photo)
    person.add_value('image_urls', photo)
    header_hxs = hxs.xpath('tr/td/table/tr/td[2]/table/tr/td[2]')
    # parliament term, e.g. "2008-2012"; \x97 (em dash) is normalized
    # to a plain hyphen.
    parliament = header_hxs.xpath('div/b/font/text()')
    parliament = parliament.re(r'(\d{4}[-\x97]\d{4})')
    parliament = ''.join(parliament).replace(u'\x97', u'-')
    person.add_value('parliament', parliament)
    if u'seimo narys' in details:
        keys = ['nuo', 'iki']
        membership = dict(str2dict(keys, details[u'seimo narys']))
        parliament_group = {
            'type': 'parliament',
            'name': parliament,
            'position': u'seimo narys',
            # 'iki' (until) may be missing for current members.
            'membership': [membership['nuo'], membership.get('iki')],
        }
        person.add_value('groups', [parliament_group])
    # name (first name, last name)
    name = header_hxs.xpath('div/b/font[2]/text()').extract()[0]
    self._parse_name(person, name)
    # groups
    party_name = person.get_output_value('raised_by')
    if party_name:
        person.add_value('groups', [{'type': 'party',
                                     'name': party_name,
                                     'position': 'narys'}])
    self._parse_groups(response, hxs, person)
    # date of birth; translate() maps non-breaking spaces to plain
    # spaces before matching dob_re.
    xpath = (u'tr/td/table/'
             u'tr[contains(descendant::text(), "Biografija")]/'
             u'following-sibling::tr/td/'
             u'descendant::*[contains(text(), "Gimė")]/text()')
    dob_hxs = hxs.xpath(u'translate(%s, "\xa0", " ")' % xpath)
    dob_match = dob_hxs.re(dob_re)
    if dob_match:
        year, month, day = dob_match
        month = month_names_map[month]
        dob = u'%s-%02d-%s' % (year, month, day.zfill(2))
        person.add_value('dob', dob)
    # biography
    xpath = (u'tr/td/table/'
             u'tr[contains(descendant::text(), "Biografija")]/'
             u'following-sibling::tr/td/div')
    bio_hxs = hxs.xpath(xpath)
    self._parse_biography(response, person, bio_hxs)
    # parliamentary history (earlier terms)
    xpath = (u'//table[@summary="Istorija"]/'
             u'tr/td/a[starts-with(b/text(), "Buvo išrinkta")]/'
             u'following-sibling::text()')
    history_hxs = hxs.xpath(xpath)
    if history_hxs:
        for item in history_hxs:
            parliament = ''.join(item.re(r'(\d{4}) (-) (\d{4})'))
            person.add_value('parliament', [parliament])
    return person.load_item()
def parse_stenogram(self, response):
    """Parse a stenogram page into StenogramTopic items, one per
    grouped topic.  Topics with incomplete metadata are skipped."""
    sel = Selector(response)
    meta_xs = sel.xpath('/html/body/div[@class="WordSection1"]')
    meta = self._parse_stenogram_meta(response, meta_xs)
    paragraph_xs = sel.xpath('/html/body/div[@class="WordSection2"]/p')
    for topic in self._group_topics(self._parse_paragraphs(paragraph_xs)):
        try:
            loader = Loader(self, response, StenogramTopic(), required=(
                '_id', 'title', 'date', 'sitting_no', 'statements'))
            loader.add_value('title', topic['title'])
            loader.add_value(
                'date', datetime.combine(meta['date'], topic['time']))
            loader.add_value('sitting_no', meta['sitting_no'])
            loader.add_value('statements', topic['statements'])
            loader.add_value('source', meta['source'])
            loader.add_value('_id', meta['_id'])
            loader.add_value('session', meta.get('session'))
        except KeyError:
            # A missing meta/topic key means the topic is incomplete;
            # silently skip it.
            pass
        else:
            yield loader.load_item()
def _get_session(self, response, hxs):
    """Extract session information from the page header and return
    the loaded Session item as a plain dict."""
    session_id = hxs.select('div[1]/a[1]').re(r'p_ses_id=(\d+)')
    header = hxs.select("div[2]/b")
    loader = Loader(self, response, Session(), header, required=(
        'id', 'fakt_pos_id', 'number', 'date', 'type',))
    loader.add_value('id', session_id)
    loader.add_value('fakt_pos_id',
                     header.select('a[1]').re(r'p_fakt_pos_id=(-\d+)'))
    loader.add_value('number',
                     header.select('a[1]/text()').re(r'Nr. (\d+)'))
    loader.add_xpath('date', 'a[2]/text()')
    loader.add_xpath('type', 'a[3]/text()')
    return dict(loader.load_item())
def _parse_lobbyist(self, response, row):
    """Load one Lobbyist item from a four-cell table row."""
    number, name, company_code, inclusion = row.css('td')
    loader = Loader(self, response, Lobbyist(), row,
                    required=('name', 'date_of_inclusion', 'decision'))
    loader.add_value('source_url', response.url)
    loader.add_value('raw_data', row.extract())
    # Each helper returns field values that the loader merges in.
    for parsed in (self._parse_number(number),
                   self._parse_name(name),
                   self._parse_company_code(company_code),
                   self._parse_inclusion(inclusion)):
        loader.add_value(None, parsed)
    return loader.load_item()
def _parse_person_details(self, response):
    """Parse an MP profile page (div-based layout) into a Person item
    and return it loaded.

    Extracts contacts, photo, parliament term, name, group
    memberships and parliamentary history.
    """
    xpath = '//div[contains(@id,"page-content")]'
    hxs = Selector(response).xpath(xpath)[0]
    source = self._get_source(response.url, 'p_asm_id')
    seimas_code = self._get_query_attr(response.url, 'p_r')
    if seimas_code:
        source['version'] = seimas_version_map[int(seimas_code)]
    _id = source['id']
    person_hxs = hxs.xpath('div/div[contains(@class, "col1")]')
    person = Loader(self, response, Person(), person_hxs,
                    required=('first_name', 'last_name'))
    person.add_value('_id', '%sp' % _id)
    # Details: labels used to split the free-form details text into a
    # label -> value mapping (see str2dict).
    split = [
        u'asmeniniai puslapiai',
        u'asmeninis elektroninis paštas',
        u'biuro adresas',
        u'darbo telefonas',
        u'iškėlė',
        u'išrinktas',
        u'kabinetas',
        u'kandidato puslapis',
        u'padėjėja sekretorė',
        u'seimo narys',
        u'buvo išrinktas',
        u'buvo išrinkta',
        u'kontaktai',
    ]
    details = ' '.join(person_hxs.xpath('descendant::text()').extract())
    # Normalize feminine label variants to the canonical labels.
    details = str2dict(split, details, normalize=mapwords({
        u'išrinkta': u'išrinktas',
        u'seimo narė': u'seimo narys',
    }))
    details = dict(details)
    # Contacts live in a separate "kontaktai" box in the third column.
    contacts_hxs = hxs.xpath(
        'div/div[contains(@class, "col3")]'
        '/div[contains(@class, "kontaktai")]'
    )
    contacts = ' '.join(contacts_hxs.xpath('descendant::text()').extract())
    contacts_split = [
        u'el p',
        u'tel',
        u'asmeninė svetainė'
    ]
    contacts = str2dict(contacts_split, contacts)
    contacts = dict(contacts)
    if contacts.get('tel'):
        # Keep digits only.
        phone = re.sub("[^0-9]", "", contacts.get(u'tel'))
        person.add_value('phone', phone)
    email_xpath = ('div/div[contains(descendant::text(), "El. '
                   'p.")]/a/text()')
    email_hxs = contacts_hxs.xpath(email_xpath)
    for email in email_hxs:
        person.add_value('email', email.extract())
    # TODO: office address is not available in this layout yet.
    person.add_value('office_address', [''])
    website_hxs = contacts_hxs.xpath(
        'div/div[contains(@class, "site")]/a/@href'
    )
    if website_hxs:
        person.add_value(
            'home_page', website_hxs.extract()[0]
        )
    person.add_value('raised_by', [details.get(u'iškėlė', '')])
    person.add_value('constituency', [details.get(u'išrinktas', '')])
    person.add_value('source', source)
    # photo
    # first for P leader
    # second for the rest
    photo_selectors = [
        '//*[@id="page-content"]/div/div[1]/div[1]/img/@src',
        '//*[contains(@class, "seimo-nario-foto")]/img/@src',
    ]
    photo = None
    for photo_selector in photo_selectors:
        photo = Selector(response).xpath(photo_selector).extract()
        if photo:
            break
    if photo:
        person.add_value('photo', photo[0])
        person.add_value('image_urls', photo[0])
    # parliament term, e.g. "2012–2016"; the en dash (\u2013) is
    # normalized to a plain hyphen.
    parliament = hxs.xpath(
        'div/div/div[contains(@class, "smn-kadencija")]/span/text()'
    )
    parliament = parliament.re(r'(\d{4}[^-]\d{4})')
    parliament = ''.join(parliament).replace(u'\u2013', u'-')
    person.add_value('parliament', parliament)
    if u'seimo narys' in details:
        keys = ['nuo', 'iki']
        membership = dict(str2dict(keys, details[u'seimo narys']))
        parliament_group = {
            'type': 'parliament',
            'name': parliament,
            'position': u'seimo narys',
            # 'iki' (until) may be missing for current members.
            'membership': [membership['nuo'], membership.get('iki')],
        }
        person.add_value('groups', [parliament_group])
    first_name = Selector(response).xpath(
        '//*/div[contains(@class, "smn-name")]/text()').extract()[0]
    last_name = Selector(response).xpath(
        '//*/span[contains(@class, "smn-pavarde")]/text()').extract()[0]
    person.add_value('first_name', unicode(first_name))
    # Last names are rendered upper-case on the site; title-case them.
    person.add_value('last_name', unicode(last_name.title()))
    # groups
    party_name = person.get_output_value('raised_by')
    if party_name:
        person.add_value('groups', [{'type': 'party',
                                     'name': party_name,
                                     'position': 'narys'}])
    self._parse_groups(response, hxs, person)
    # biography_xpath = 'div/div[2]/div[3]/div/table[2]/tbody'
    # biography_hxs = hxs.xpath(biography_xpath)
    # self._parse_biography(person, biography_hxs)
    # parliamentary history (earlier terms)
    xpath = (
        u'div/div[contains(@class, "col1")]/'
        u'p[contains(@class, "buvo-isrinkta")]/descendant::text()'
    )
    history_hxs = hxs.xpath(xpath)
    if history_hxs:
        for item in history_hxs:
            parliament = ''.join(item.re(r'(\d{4}[^-]\d{4})'))
            parliament = parliament.replace(u'\x97', '-')
            person.add_value('parliament', [parliament])
    return person.load_item()