Example #1
    def _parse_question_agenda(self, response, hxs, question):
        date = question['session']['date']
        registration = None
        for item in hxs:
            # Registration
            if item.select('td[2]/a[1][contains(@href, "p_reg_id=")]'):
                registration = Loader(self, response, Registration(), item,
                                      required=('datetime',))

                url = item.select('td[2]/a[1]/@href').extract()[0]
                _id = text_type(self._get_query_attr(url, 'p_reg_id'))

                registration.add_value('id', _id)
                registration.add_value('datetime', date)
                registration.add_xpath('datetime', 'td[1]/text()')
                registration.add_xpath('joined', 'td[2]/b[1]/text()')

            # Voting
            if item.select('td[2]/a[1][contains(@href, "p_bals_id=")]'):
                voting = self._get_question_voting(response, item, question)
                votes = sum([int(voting.get_output_value('vote_%s' % f))
                             for f in ('aye', 'no', 'abstain')])
                voting.add_value('total_votes', text_type(votes))

                if registration:
                    registration = dict(registration.load_item())
                    joined = int(registration['joined'])
                    voting.add_value('no_vote', text_type(joined - votes))
                    voting.add_value('registration', registration)

                registration = None

                yield voting.load_item()
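The registration/voting join in this example boils down to simple arithmetic: total_votes is the sum of the aye/no/abstain counts, and no_vote is the number of registered members (registration['joined']) who did not cast a vote. A minimal sketch of that arithmetic with made-up numbers:

# Hypothetical counts, only to illustrate the arithmetic used above.
vote_counts = {'vote_aye': 52, 'vote_no': 17, 'vote_abstain': 6}
joined = 90  # members present at registration (registration['joined'])

total_votes = sum(vote_counts.values())  # 75, stored as 'total_votes'
no_vote = joined - total_votes           # 15, stored as 'no_vote'
print(total_votes, no_vote)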
Example #2
 def parse_field(self, html, fn):
     response = XmlResponse('http://localhost/test.html',
                            body='<book><row>%s</row></book>' % html)
     row = response.css('row')[0]
     node = response.css('entry')[0]
     declaration = Loader(self.spider, response, LobbyistDeclaration(), row)
     declaration.add_value(None, fn(node))
     item = declaration.load_item()
     actual = dict(item)
     return actual
 def parse_field(self, html, fn):
     response = HtmlResponse('http://localhost/test.html',
                             body='<table><tr>%s</tr></table>' % html)
     row = response.css('tr')[0]
     node = response.css('td')[0]
     lobbyist = Loader(self.spider, response, Lobbyist(), row)
     lobbyist.add_value(None, fn(node))
     item = lobbyist.load_item()
     actual = dict(item)
     return actual
Example #5
 def _parse_lobbyist(self, response, row):
     nr, name, company_code, inclusion = row.css('td')
     lobbyist = Loader(self, response, Lobbyist(), row,
                       required=('name', 'date_of_inclusion', 'decision'))
     lobbyist.add_value('source_url', response.url)
     lobbyist.add_value('raw_data', row.extract())
     lobbyist.add_value(None, self._parse_number(nr))
     lobbyist.add_value(None, self._parse_name(name))
     lobbyist.add_value(None, self._parse_company_code(company_code))
     lobbyist.add_value(None, self._parse_inclusion(inclusion))
     return lobbyist.load_item()
    def parse_related_documents(self, response):
        xpath = '/html/body/div/table/tr[3]/td/table/tr/td/table/tr'
        hxs = HtmlXPathSelector(response).select(xpath)
        act = Loader(self, response, LegalAct(), hxs, required=('_id', ))
        act.add_xpath('_id', 'td[2]/b/text()')

        if not act.get_output_value('_id'):
            p_id = unicode(self._get_query_attr(response.url, 'p_id'))
            act.replace_value('_id', u'NONUMBER-%s' % p_id)

        relations = defaultdict(list)
        xpath = '/html/body/div/table/tr[3]/td/table/tr/td/align/table/tr'
        for row in HtmlXPathSelector(response).select(xpath):
            docid = get_all(row, 'td[4]/span//text()')
            rel_type = row.select('td[6]/span/text()')
            if rel_type:
                rel_type = rel_type.extract()[0].strip().lower()

            if rel_type in (u'pakeistas dokumentas',
                            u'ankstesnė dokumento redakcija'):
                relations['amends'].append(docid)

            elif rel_type == u'priimtas dokumentas':
                relations['adopts'].append(docid)

            elif rel_type == u'ryšys su taikymą nusakančiu dokumentu':
                relations['defines_applicability'].append(docid)

            elif rel_type == u'ryšys su galiojimą nusakančiu dokumentu':
                relations['defines_validity'].append(docid)

            elif rel_type == u'negalioja de jure':
                relations['defines_as_no_longer_valid'].append(docid)

            elif rel_type == u'kitas projekto variantas':
                relations['new_draft_version'].append(docid)

            elif rel_type == u'ryšys su ratifikavimo dokumentu':
                relations['ratification'].append(docid)

        if relations:
            act.add_value('relations', dict(relations))
            yield act.load_item()
    def _parse_law_act(self, response, hxs, base=False):
        """
        Extracts basic document information and returns law act loader.

        Parameters:

        base
            Return only base information about the document. This is used
            when filling in parts of a law act from several law act
            documents.

        """
        lang = hxs.select('tr[1]/td[4]/b/text()').extract()[0].strip().lower()

        if lang not in (u'lietuvių', u'rusų', u'anglų', u'ispanų'):
            self.error(response, 'Unknown language: %s' % lang)

        if lang != u'lietuvių':
            return None

        act = Loader(self, response, LegalAct(), hxs, required=REQUIRED_FIELDS)

        act.add_xpath('_id', 'tr[1]/td[2]/b/text()')

        source = self._get_source(response.url, 'p_id')

        if not act.get_output_value('_id'):
            act.replace_value('_id', u'NONUMBER-%s' % source['id'])

        if base:
            return act

        act.add_xpath('name', 'caption/text()')
        act.add_xpath('kind', 'tr[1]/td[1]/b/text()')
        act.add_xpath('number', 'tr[1]/td[2]/b/text()')
        act.add_xpath('date', 'tr[1]/td[3]/b/text()')

        act.add_value('source', source)

        self._fix_name_case(act)

        return act
Example #8
    def _parse_question_speakers(self, response, hxs, item, position):
        for speaker in hxs.select('b[position()>%d]' % position):
            dspeaker = Loader(self, response, QuestionDocumentSpeaker(),
                              speaker, required=('name',))
            dspeaker.add_xpath('name', 'text()')
            speaker_details = (speaker.select('following::text()').
                               extract()[0])
            if (speaker_details and speaker_details.startswith(', ') and
                    len(speaker_details) > 4):

                # This is a workaround for names that contain a comma: known
                # commas are replaced with urlquotes, then the whole string is
                # split by commas and the resulting list is unquoted back.
                speaker_details = speaker_details.replace(
                    u'Švietimo, mokslo', u'Švietimo%2c mokslo')

                speaker_details = map(lambda x: urllib.unquote(x.strip()),
                                      speaker_details.split(','))
                # speaker_details = filter(None, speaker_details)

                dspeaker.reset_required('name', 'position',)

                inc = Increment(-1)
                if len(speaker_details) > 0:
                    dspeaker.add_value('position', speaker_details[inc()])
                if len(speaker_details) == 3:
                    dspeaker.add_value('committee', speaker_details[inc()])
                if len(speaker_details) > 1:
                    dspeaker.add_value('institution', speaker_details[inc()])
            item.add_value('speakers', dict(dspeaker.load_item()))
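The comma workaround above can be exercised on its own. The standalone sketch below uses Python 3's urllib.parse.unquote (the original code is Python 2 and uses urllib.unquote), and the speaker-details tail is a hypothetical example:

# Standalone illustration of the comma workaround above: a known comma inside
# the committee name is escaped, the tail is split on commas, then unquoted.
# The tail string is hypothetical.
from urllib.parse import unquote

tail = u', Švietimo, mokslo ir kultūros komiteto pirmininkas'
tail = tail.replace(u'Švietimo, mokslo', u'Švietimo%2c mokslo')
parts = [unquote(p.strip()) for p in tail.split(',') if p.strip()]
print(parts)  # ['Švietimo, mokslo ir kultūros komiteto pirmininkas']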
Example #9
    def _get_session(self, response, hxs):
        session_id = hxs.select('div[1]/a[1]').re(r'p_ses_id=(\d+)')

        hxs = hxs.select("div[2]/b")

        session = Loader(self, response, Session(), hxs, required=(
            'id', 'fakt_pos_id', 'number', 'date', 'type',))

        session.add_value('id', session_id)
        session.add_value('fakt_pos_id',
                          hxs.select('a[1]').re(r'p_fakt_pos_id=(-\d+)'))
        session.add_value('number', hxs.select('a[1]/text()').re(r'Nr. (\d+)'))
        session.add_xpath('date', 'a[2]/text()')
        session.add_xpath('type', 'a[3]/text()')

        return dict(session.load_item())
Example #10
    def _parse_group_items(self, response, person, items, group_type):
        for group_hxs in items.xpath('tr'):
            group_data_hxs = group_hxs.xpath('td[2]')
            group = Loader(self, response, Group(), group_data_hxs,
                           required=('name', 'position'))
            group.add_value('type', group_type)
            group.add_xpath('name', 'a/text()')
            group.add_xpath('source', 'a/@href')
            meta = ''.join(group_hxs.xpath('text() | */text()').extract())
            position, membership = group_meta_re.match(meta).groups()
            group.add_value('position', position)

            membership = date_re.findall(membership or '')
            if len(membership) == 1:
                membership.append(None)
            group.add_value('membership', membership)

            person.add_value('groups', [dict(group.load_item())])
Example #11
    def _parse_group_items(self, response, person, items, group_type):
        for group_hxs in items:
            group = Loader(self, response, Group(), group_hxs,
                           required=('name', 'position'))

            group.add_value('type', group_type)
            group.add_xpath('name', 'a/text()')
            group.add_xpath('source', 'a/@href')

            meta = ''.join(group_hxs.xpath('text() | */text()').extract())
            position, membership = group_meta_re.match(meta).groups()
            group.add_value('position', position)

            membership = date_re.findall(membership or '')
            if len(membership) == 1:
                membership.append(None)
            group.add_value('membership', membership)

            person.add_value('groups', [dict(group.load_item())])
Example #12
    def parse_question(self, response):
        xpath = '/html/body/div/table/tr[3]/td/table/tr/td'
        hxs = HtmlXPathSelector(response).select(xpath)[0]

        source = self._get_source_absolute_url(response, response.url, 'p_svarst_kl_stad_id')
        _id = source['id']

        question = Loader(self, response, Question(), hxs, required=(
            '_id', 'session', 'documents', 'source',))
        question.add_value('_id', '%sq' % _id)

        self._parse_question_documents(response, hxs, question)

        question.add_value('session', self._get_session(response, hxs))
        question.add_value('source', source)

        yield question.load_item()

        agenda_hxs = hxs.select('table[@class="basic"]/tr')
        agenda = self._parse_question_agenda(response, agenda_hxs,
                                             question.item) or []
        for item in agenda:
            yield item
Example #13
    def _get_voting_documents(self, response, hxs):
        qdoc = Loader(self, response, VotingDocument(), hxs, required=(
            'id', 'name', 'type', 'number',))

        d_id = hxs.select('b[2]/a[1]/@href').re(r'p_id=(-?\d+)')
        if not d_id:
            return None

        qdoc.add_value('id', u'%sd' % d_id[0])
        qdoc.add_xpath('name', 'b[1]/a/text()')
        qdoc.add_value(
            'type',
            hxs.select('b[1]/following::text()[1]').re('^; (.+)')
        )
        number_re = (r'[A-Z]{1,4}'
                     r'-'
                     r'\d+'
                     r'(([a-zA-Z0-9]{1,2})?(\([^)]{1,4}\))?)*')
        qdoc.add_value(
            'number',
            hxs.select('b[1]//text()').re(r'\(Nr. (%s)\)' % number_re)[0]
        )

        return qdoc.load_item()
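The number_re pattern above can be checked in isolation with the standard re module; the document number in the sample string below is made up, chosen only to exercise the optional suffix groups:

import re

# Minimal check of the document-number pattern used above; the sample
# string is hypothetical.
number_re = (r'[A-Z]{1,4}'
             r'-'
             r'\d+'
             r'(([a-zA-Z0-9]{1,2})?(\([^)]{1,4}\))?)*')
m = re.search(r'\(Nr\. (%s)\)' % number_re, u'Projektas (Nr. XIIP-123(2))')
print(m.group(1))  # XIIP-123(2)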
Example #14
    def _parse_person_details(self, response):
        xpath = '//div[contains(@id,"page-content")]'
        hxs = Selector(response).xpath(xpath)[0]

        source = self._get_source(response.url, 'p_asm_id')

        seimas_code = self._get_query_attr(response.url, 'p_r')
        if seimas_code:
            source['version'] = seimas_version_map[int(seimas_code)]

        _id = source['id']
        person_hxs = hxs.xpath('div/div[contains(@class, "col1")]')
        person = Loader(self, response, Person(), person_hxs,
                        required=('first_name', 'last_name'))
        person.add_value('_id', '%sp' % _id)

        # Details
        split = [
            u'asmeniniai puslapiai',
            u'asmeninis elektroninis paštas',
            u'biuro adresas',
            u'darbo telefonas',
            u'iškėlė',
            u'išrinktas',
            u'kabinetas',
            u'kandidato puslapis',
            u'padėjėja sekretorė',
            u'seimo narys',
            u'buvo išrinktas',
            u'buvo išrinkta',
            u'kontaktai',
        ]
        details = ' '.join(person_hxs.xpath('descendant::text()').extract())

        details = str2dict(split, details, normalize=mapwords({
            u'išrinkta': u'išrinktas',
            u'seimo narė': u'seimo narys',
        }))

        details = dict(details)

        contacts_hxs = hxs.xpath(
            'div/div[contains(@class, "col3")]'
            '/div[contains(@class, "kontaktai")]'
        )
        contacts = ' '.join(contacts_hxs.xpath('descendant::text()').extract())
        contacts_split = [
            u'el p',
            u'tel',
            u'asmeninė svetainė'
        ]
        contacts = str2dict(contacts_split, contacts)
        contacts = dict(contacts)

        if contacts.get('tel'):
            phone = re.sub("[^0-9]", "", contacts.get(u'tel'))
            person.add_value('phone', phone)

        email_xpath = 'div/div[contains(descendant::text(), "El. p.")]/a/text()'
        email_hxs = contacts_hxs.xpath(email_xpath)

        for email in email_hxs:
            person.add_value('email', email.extract())

        # TODO

        person.add_value('office_address', [''])

        website_hxs = contacts_hxs.xpath(
            'div/div[contains(@class, "site")]/a/@href'
        )
        if website_hxs:
            person.add_value(
                'home_page',
                website_hxs.extract()[0]
            )
        person.add_value('raised_by', [details.get(u'iškėlė', '')])
        person.add_value('constituency', [details.get(u'išrinktas', '')])

        person.add_value('source', source)

        # photo
        # first for P leader
        # second for the rest
        photo_selectors = [
            '//*[@id="page-content"]/div/div[1]/div[1]/img/@src',
            '//*[contains(@class, "seimo-nario-foto")]/img/@src',
        ]
        photo = None
        for photo_selector in photo_selectors:
            photo = Selector(response).xpath(photo_selector).extract()
            if photo:
                break
        if photo:
            person.add_value('photo', photo[0])
            person.add_value('image_urls', photo[0])
        # parliament

        parliament = hxs.xpath(
            'div/div/div[contains(@class, "smn-kadencija")]/span/text()'
        )
        parliament = parliament.re(r'(\d{4}[^-]\d{4})')
        parliament = ''.join(parliament).replace(u'\u2013', u'-')
        person.add_value('parliament', parliament)
        if u'seimo narys' in details:
            keys = ['nuo', 'iki']
            membership = dict(str2dict(keys, details[u'seimo narys']))
            parliament_group = {
                'type': 'parliament',
                'name': parliament,
                'position': u'seimo narys',
                'membership': [membership['nuo'], membership.get('iki')],
            }
            person.add_value('groups', [parliament_group])

        first_name = Selector(response).xpath('//*/div[contains(@class, "smn-name")]/text()').extract()[0]
        last_name = Selector(response).xpath('//*/span[contains(@class, "smn-pavarde")]/text()').extract()[0]

        person.add_value('first_name', unicode(first_name))
        person.add_value('last_name', unicode(last_name.title()))

        # groups
        party_name = person.get_output_value('raised_by')
        if party_name:
            person.add_value('groups', [{'type': 'party',
                                         'name': party_name,
                                         'position': 'narys'}])
        self._parse_groups(response, hxs, person)

        # biography_xpath = 'div/div[2]/div[3]/div/table[2]/tbody'
        # biography_hxs = hxs.xpath(biography_xpath)
        # self._parse_biography(person, biography_hxs)

        # parliamentary history
        xpath = (
                u'div/div[contains(@class, "col1")]/'
                u'p[contains(@class, "buvo-isrinkta")]/descendant::text()'
                )
        history_hxs = hxs.xpath(xpath)

        if history_hxs:
            for item in history_hxs:
                parliament = ''.join(item.re(r'(\d{4}[^-]\d{4})'))
                parliament = parliament.replace(u'\x97', '-')
                person.add_value('parliament', [parliament])

        return person.load_item()
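str2dict and mapwords are project helpers not shown in these examples. As a rough sketch of the idea (an assumption, not the project's implementation), str2dict can be pictured as locating a list of known labels in one flat text blob and yielding (label, value) pairs, with mapwords presumably normalizing synonymous labels (e.g. u'išrinkta' to u'išrinktas') before the split:

# A rough, assumed sketch of what a str2dict-style helper does. This is NOT
# the project's implementation, only an illustration of the idea.
def str2dict_sketch(keys, text):
    text = text.lower()
    positions = sorted(
        (idx, key)
        for key in keys
        for idx in [text.find(key)]
        if idx >= 0
    )
    bounds = positions + [(len(text), None)]
    for (start, key), (end, _) in zip(positions, bounds[1:]):
        yield key, text[start + len(key):end].strip(' :')

# Hypothetical details blob, in the same spirit as the scraped pages.
details = dict(str2dict_sketch(
    [u'darbo telefonas', u'biuro adresas'],
    u'Darbo telefonas: 239 6615 Biuro adresas: Gedimino pr. 53',
))
print(details[u'darbo telefonas'])  # '239 6615'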
Example #15
    def _parse_person_details(self, response):
        xpath = '//table[@summary="Seimo narys"]'
        hxs = Selector(response).xpath(xpath)[0]

        source = self._get_source(response.url, 'p_asm_id')

        seimas_code = self._get_query_attr(response.url, 'p_r')
        if seimas_code:
            source['version'] = seimas_version_map[int(seimas_code)]

        _id = source['id']

        person_hxs = hxs.xpath('tr/td/table/tr/td[2]/table/tr[2]/td[2]')
        person = Loader(self, response, Person(), person_hxs,
                        required=('first_name', 'last_name'))
        person.add_value('_id', '%sp' % _id)

        # Details

        split = [
            u'asmeniniai puslapiai',
            u'asmeninis elektroninis paštas',
            u'biuro adresas',
            u'darbo telefonas',
            u'iškėlė',
            u'išrinktas',
            u'kabinetas',
            u'kandidato puslapis',
            u'padėjėja sekretorė',
            u'seimo narys',
        ]
        details = ' '.join(person_hxs.xpath('descendant::text()').extract())
        details = str2dict(split, details, normalize=mapwords({
            u'išrinkta': u'išrinktas',
            u'seimo narė': u'seimo narys',
            u'el p': u'asmeninis elektroninis paštas',
            u'asmeninė svetainė': u'asmeniniai puslapiai',
        }))
        details = dict(details)

        email = details.get(u'asmeninis elektroninis paštas', '')
        phone = details.get(u'darbo telefonas', '')

        person.add_value('constituency', [details.get(u'išrinktas', '')])
        person.add_value('raised_by', [details.get(u'iškėlė', '')])
        person.add_value('email', split_by_comma(email))
        person.add_value('phone', split_by_comma(phone))
        person.add_value('office_address', [details.get(u'biuro adresas', '')])

        person.add_xpath(
            'home_page',
            u'a[contains(font/text(), "Asmeniniai puslapiai") or contains(font/text(), "Asmeninė svetainė")]/@href'
        )
        person.add_xpath('candidate_page',
                         'a[contains(text(), "Kandidato puslapis")]/@href')

        person.add_value('source', source)

        # photo
        photo = hxs.xpath('tr/td/table/tr/td/div/img/@src').extract()[0]
        person.add_value('photo', photo)
        person.add_value('image_urls', photo)

        header_hxs = hxs.xpath('tr/td/table/tr/td[2]/table/tr/td[2]')

        # parliament
        parliament = header_hxs.xpath('div/b/font/text()')
        parliament = parliament.re(r'(\d{4}[-\x97]\d{4})')
        parliament = ''.join(parliament).replace(u'\x97', u'-')
        person.add_value('parliament', parliament)
        if u'seimo narys' in details:
            keys = ['nuo', 'iki']
            membership = dict(str2dict(keys, details[u'seimo narys']))
            parliament_group = {
                'type': 'parliament',
                'name': parliament,
                'position': u'seimo narys',
                'membership': [membership['nuo'], membership.get('iki')],
            }
            person.add_value('groups', [parliament_group])

        # name (first name, last name)
        name = header_hxs.xpath('div/b/font[2]/text()').extract()[0]
        self._parse_name(person, name)

        # groups
        party_name = person.get_output_value('raised_by')
        if party_name:
            person.add_value('groups', [{'type': 'party',
                                         'name': party_name,
                                         'position': 'narys'}])

        self._parse_groups(response, hxs, person)

        # date of birth
        xpath = (u'tr/td/table/'
                 u'tr[contains(descendant::text(), "Biografija")]/'
                 u'following-sibling::tr/td/'
                 u'descendant::*[contains(text(), "Gimė")]/text()')
        dob_hxs = hxs.xpath(u'translate(%s, "\xa0", " ")' % xpath)
        dob_match = dob_hxs.re(dob_re)
        if dob_match:
            year, month, day = dob_match
            month = month_names_map[month]
            dob = u'%s-%02d-%s' % (year, month, day.zfill(2))
            person.add_value('dob', dob)

        # biography
        xpath = (u'tr/td/table/'
                 u'tr[contains(descendant::text(), "Biografija")]/'
                 u'following-sibling::tr/td/div')
        bio_hxs = hxs.xpath(xpath)
        self._parse_biography(response, person, bio_hxs)

        # parliamentary history
        xpath = (u'//table[@summary="Istorija"]/'
                 u'tr/td/a[starts-with(b/text(), "Buvo išrinkta")]/'
                 u'following-sibling::text()')
        history_hxs = hxs.xpath(xpath)
        if history_hxs:
            for item in history_hxs:
                parliament = ''.join(item.re(r'(\d{4}) (-) (\d{4})'))
                person.add_value('parliament', [parliament])

        return person.load_item()
Example #16
    def _involved_parts(self, response, hxs, act):
        involved_string = hxs.select('tr[3]/td[1]/b/text()').extract()
        involved_string = ' '.join(involved_string)
        if not involved_string:
            return None

        m = DOCUMENT_INVOLVED_PARTS.match(involved_string)
        if not m:
            return None

        involved = Loader(self,
                          response,
                          DocumentInvolved(),
                          hxs,
                          required=(
                              'date',
                              'how',
                              'institution',
                          ))
        involved.add_value('date', m.group(1))
        involved.add_value('how', m.group(2).lower())
        institution = m.group(3)
        if ',' in institution:
            # TODO: move this to utility function, same code is also used
            # in manoseimas/scrapy/spiders/mps.py:171
            spl = institution.replace(u'Švietimo, mokslo',
                                      u'Švietimo%2c mokslo')
            spl = map(lambda x: urllib.unquote(x.strip()), spl.split(','))
            spl = filter(None, spl)
            if len(spl) == 2:
                person, institution = spl
            else:
                person, group, institution = spl
                spl = group.strip().split()
                group_types = (u'komitetas', u'grupė', u'frakcija',
                               u'komisija')
                if spl[-1].lower() in group_types:
                    group_type = spl[-1].lower()
                elif spl[0].lower() in group_types:
                    group_type = spl[0].lower()
                else:
                    group_type = None

                if group_type:
                    involved.add_value('group', group)
                    involved.add_value('group_type', group_type)
                else:
                    self.error(response, 'Not committee: %s' % group)
            involved.add_value('person', person)
        involved.add_value('institution', institution)
        act.add_value('involved', dict(involved.load_item()))
Example #17
    def parse_person_votes(self, response):
        xpath = ('/html/body/div/table/tr[3]/td/table/tr/td/align/'
                 'div[contains(h4,"rezultatai")]/table')
        hxs = HtmlXPathSelector(response).select(xpath)[0]

        source = self._get_source_absolute_url(response, response.url, 'p_bals_id')
        _id = source['id']

        voting = Loader(self, response, Voting(), hxs, required=(
            '_id', 'datetime', 'votes',))
        voting.add_value('_id', '%sv' % _id)

        datetime_xpath_base = '/html/body/div/table/tr[3]/td/table/tr/td/'
        date = hxs.xpath(datetime_xpath_base + 'div[2]/b/a[2]/text()')[0].extract()
        time = hxs.xpath(datetime_xpath_base + (
            'align/text()[contains(., "Balsavimo laikas")]/following-sibling::b[1]/text()'
        ))[0].extract()
        timestamp = '%s %s' % (date, time)
        datetime.datetime.strptime(timestamp, '%Y-%m-%d %H:%M:%S')
        voting.add_value('datetime', timestamp)

        self._parse_voting_legal_acts(response, voting)

        for person in hxs.select('tr'):
            if person.select('th'):
                continue  # Skip header

            p_vote = Loader(self, response, PersonVote(), person, required=(
                '_id', 'voting_id', 'person', 'fraction', 'vote',))

            p_id = person.select('td[1]/a/@href').re(r'p_asm_id=(-?\d+)')[0]
            vote_value = self._get_vote_value(person)
            # Only include persons who actually cast a vote.
            if vote_value != 'no-vote':
                p_vote.add_value('_id', '%s:%s' % (_id, p_id))
                p_vote.add_value('voting_id', '%sv' % _id)
                p_vote.add_value('person', '%sp' % p_id)
                p_vote.add_xpath('name', 'td[1]/a/text()')
                p_vote.add_xpath('fraction', 'td[2]/text()')
                p_vote.add_value('datetime', timestamp)
                p_vote.add_value('vote', vote_value)
                p_vote = p_vote.load_item()
                voting.add_value('votes', dict(p_vote))
                yield p_vote

        yield voting.load_item()
 def _parse_lobbyist(self, response, row_group):
     row = row_group[0]
     entries = row.xpath('entry')
     columns = len(entries)
     if columns == 4:
         nr, name, law_projects, comments = entries
     else:
         assert len(entries) == 5
         nr, name, clients, law_projects, comments = entries
     declaration = Loader(self, response, LobbyistDeclaration(), row,
                          required=('name', ))
     declaration.add_value('source_url', response.url)
     declaration.add_value('raw_data', '\n'.join(row.extract() for row in row_group))
     declaration.add_value(None, self._parse_number(nr))
     declaration.add_value(None, self._parse_name(name))
     client = None
     for row in row_group:
         if columns == 5:
             new_client = self._parse_client(row.xpath('entry')[-3])
             if new_client is NO_CLIENT:
                 client = None
             elif new_client is not None:
                 client = new_client
                 declaration.add_value("clients", [client])
         law_projects = self._parse_law_projects(row.xpath('entry')[-2])
         if client is not None:
             client['law_projects'].extend(law_projects)
         else:
             declaration.add_value('law_projects', law_projects)
     declaration.add_value(None, self._parse_comments(comments))
     return declaration.load_item()
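The client bookkeeping above spans several <row> elements of one declaration: a five-column row may introduce a new client (or the NO_CLIENT sentinel), and following rows append their law projects either to the current client or to the declaration itself. A standalone sketch of that accumulation with plain dicts and made-up rows:

# Standalone sketch of the client/law-project accumulation above, using
# plain dicts and hypothetical rows instead of XML selectors.
NO_CLIENT = object()

rows = [
    {'client': {'name': 'Client A', 'law_projects': []},
     'law_projects': ['Project 1']},
    {'client': None, 'law_projects': ['Project 2']},       # same client continues
    {'client': NO_CLIENT, 'law_projects': ['Project 3']},  # lobbyist's own project
]

clients, own_projects = [], []
client = None
for row in rows:
    if row['client'] is NO_CLIENT:
        client = None
    elif row['client'] is not None:
        client = row['client']
        clients.append(client)
    if client is not None:
        client['law_projects'].extend(row['law_projects'])
    else:
        own_projects.extend(row['law_projects'])

print(clients)       # [{'name': 'Client A', 'law_projects': ['Project 1', 'Project 2']}]
print(own_projects)  # ['Project 3']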
Example #19
 def _parse_project_row(self, xs, response):
     loader = Loader(self, item=ProposedLawProjectProposer(), selector=xs,
                     response=response)
     doc_id = self._get_query_attr(xs.xpath('td[3]/a/@href').extract()[0],
                                   'p_id')
     loader.add_value('id', doc_id)
     isodate = xs.xpath('td[2]/text()').extract()[0]
     proposal_date = datetime.date(*map(int, isodate.split('-')))
     loader.add_value('date', proposal_date)
     loader.add_xpath('project_name', 'td[3]/text()')
     loader.add_xpath('project_url', 'td[3]/a/@href')
     loader.add_value('source', self._get_source(response.url, 'p_asm_id'))
     loader.add_value('project_number', self._extract_proposal_no(xs))
     passed_xs = xs.xpath('td[4]/a')
     if passed_xs:
         passed = Loader(self, item=PassedLawProjectProposer(),
                         selector=passed_xs, response=response)
         doc_id = self._get_query_attr(
             passed_xs.xpath('@href').extract()[0], 'p_id'
         )
         passed.add_value('id', doc_id)
         doc_number = self._extract_passed_no(passed_xs)
         passed.add_value('passing_number', doc_number)
         passed.add_xpath('passing_url', '@href')
         passed.add_value('source', self._get_source(response.url,
                                                     'p_asm_id'))
         loader.add_value('passed', passed.load_item())
     yield loader
Example #20
    def _get_question_voting(self, response, item, question):
        date = question['session']['date']
        required = (
            '_id', 'datetime', 'vote_aye', 'vote_no', 'vote_abstain',
            'total_votes', 'question', 'source',
        )

        if item.select(u'td[2][contains(a,"alternatyvus balsavimas:")]'):
            voting_type = u'alternatyvus'
            required += ('formulation_a', 'formulation_b',)
        else:
            voting_type = u'paprastas'

        voting = Loader(self, response, Voting(), item, required=required)

        url = item.select('td[2]/a[1]/@href').extract()[0]
        source = self._get_source_absolute_url(response, url, 'p_bals_id')
        _id = source['id']

        voting.add_value('_id', '%sv' % _id)
        voting.add_value('type', voting_type)
        voting.add_value('datetime', date)
        voting.add_xpath('datetime', 'td[1]/text()')
        voting.add_value('question', question['_id'])

        if voting_type == u'alternatyvus':
            voting.add_xpath('formulation_a', 'td[2]/text()[3]')
            voting.add_xpath('formulation_b', 'td[2]/text()[5]')
            self._parse_question_votes(voting, (2, 4, 5, 6))
        else:
            formulation = item.select('td[2]/text()[2]').extract()[0].strip()

            # If the formulation node equals '(už', it means that there is
            # no formulation at all.
            if formulation.endswith(u'(už'):
                if not formulation == u'(už':
                    voting.add_value('formulation', formulation[:-3])
                voting_positions = (1, 2, 3)
            else:
                voting.add_value('formulation', formulation)
                voting_positions = (2, 3, 4, 1)

            if item.select('td[2]/b'):
                self._parse_question_votes(voting, voting_positions)
            else:
                self._parse_question_votes(voting, None)

        voting.add_value('source', source)

        return voting
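The '(už' check above separates rows that carry a formulation text from rows whose second cell starts directly with the vote counts. A tiny illustration on hypothetical cell texts:

# Hypothetical td[2]/text()[2] values, only to illustrate the '(už' rule above.
for formulation in (u'(už', u'Pritarti (už', u'Pritarti po svarstymo'):
    if formulation.endswith(u'(už'):
        # The vote counts start in the same text node; anything before
        # '(už' (if present) is the formulation itself.
        text = formulation[:-3].strip() or None
    else:
        text = formulation
    print(repr(text))
# None
# 'Pritarti'
# 'Pritarti po svarstymo'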
Example #21
    def parse_stenogram(self, response):
        sel = Selector(response)
        meta_xs = sel.xpath('/html/body/div[@class="WordSection1"]')
        meta = self._parse_stenogram_meta(response, meta_xs)
        paragraphs = sel.xpath('/html/body/div[@class="WordSection2"]/p')
        topics = self._group_topics(self._parse_paragraphs(paragraphs))
        for topic in topics:
            try:
                loader = Loader(self, response, StenogramTopic(),
                                required=('_id', 'title', 'date', 'sitting_no',
                                          'statements'))
                loader.add_value('title', topic['title'])
                loader.add_value('date', datetime.combine(meta['date'],
                                                          topic['time']))
                loader.add_value('sitting_no', meta['sitting_no'])

                loader.add_value('statements', topic['statements'])
                loader.add_value('source', meta['source'])
                loader.add_value('_id', meta['_id'])
                loader.add_value('session', meta.get('session'))
            except KeyError:
                pass
            else:
                yield loader.load_item()
Example #22
 def _parse_lobbyist(self, response, row_group):
     row = row_group[0]
     entries = row.xpath('entry')
     columns = len(entries)
     if columns == 4:
         nr, name, law_projects, comments = entries
     else:
         assert len(entries) == 5
         nr, name, clients, law_projects, comments = entries
     declaration = Loader(self,
                          response,
                          LobbyistDeclaration(),
                          row,
                          required=('name', ))
     declaration.add_value('source_url', response.url)
     declaration.add_value('raw_data',
                           '\n'.join(row.extract() for row in row_group))
     declaration.add_value(None, self._parse_number(nr))
     declaration.add_value(None, self._parse_name(name))
     client = None
     for row in row_group:
         if columns == 5:
             new_client = self._parse_client(row.xpath('entry')[-3])
             if new_client is NO_CLIENT:
                 client = None
             elif new_client is not None:
                 client = new_client
                 declaration.add_value("clients", [client])
         law_projects = self._parse_law_projects(row.xpath('entry')[-2])
         if client is not None:
             client['law_projects'].extend(law_projects)
         else:
             declaration.add_value('law_projects', law_projects)
     declaration.add_value(None, self._parse_comments(comments))
     return declaration.load_item()