def _parse_question_speakers(self, response, hxs, item, position):
    for speaker in hxs.select('b[position()>%d]' % position):
        dspeaker = Loader(self, response, QuestionDocumentSpeaker(),
                          speaker, required=('name',))
        dspeaker.add_xpath('name', 'text()')
        speaker_details = (speaker.select('following::text()').
                           extract()[0])
        if (speaker_details and speaker_details.startswith(', ') and
                len(speaker_details) > 4):
            # Workaround for names that themselves contain a comma: such
            # commas are replaced with the url-quoted escape '%2c', the
            # whole string is then split by commas, and the resulting
            # parts are unquoted back.
            speaker_details = speaker_details.replace(
                u'Švietimo, mokslo', u'Švietimo%2c mokslo')
            speaker_details = map(lambda x: urllib.unquote(x.strip()),
                                  speaker_details.split(','))
            # speaker_details = filter(None, speaker_details)
            dspeaker.reset_required('name', 'position',)
            inc = Increment(-1)
            if len(speaker_details) > 0:
                dspeaker.add_value('position', speaker_details[inc()])
            if len(speaker_details) == 3:
                dspeaker.add_value('committee', speaker_details[inc()])
            if len(speaker_details) > 1:
                dspeaker.add_value('institution', speaker_details[inc()])
        item.add_value('speakers', dict(dspeaker.load_item()))
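# `Increment` is imported from the scraper's helper utilities and is not
# part of this excerpt. A minimal sketch consistent with the usage above
# (seeded with -1 and called once per value pulled out of
# `speaker_details`) might look like the following; whether the real
# helper returns the seed first and which direction it steps are
# assumptions here:
class Increment(object):
    """Callable counter: returns its current value, then advances it."""

    def __init__(self, start=0):
        self.value = start

    def __call__(self):
        value = self.value
        self.value += 1
        return value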
def _parse_project_row(self, xs, response):
    loader = Loader(self, item=ProposedLawProjectProposer(), selector=xs,
                    response=response)
    doc_id = self._get_query_attr(xs.xpath('td[3]/a/@href').extract()[0],
                                  'p_id')
    loader.add_value('id', doc_id)

    isodate = xs.xpath('td[2]/text()').extract()[0]
    proposal_date = datetime.date(*map(int, isodate.split('-')))
    loader.add_value('date', proposal_date)

    loader.add_xpath('project_name', 'td[3]/text()')
    loader.add_xpath('project_url', 'td[3]/a/@href')
    loader.add_value('source', self._get_source(response.url, 'p_asm_id'))
    loader.add_value('project_number', self._extract_proposal_no(xs))

    passed_xs = xs.xpath('td[4]/a')
    if passed_xs:
        passed = Loader(self, item=PassedLawProjectProposer(),
                        selector=passed_xs, response=response)
        doc_id = self._get_query_attr(
            passed_xs.xpath('@href').extract()[0], 'p_id'
        )
        passed.add_value('id', doc_id)
        doc_number = self._extract_passed_no(passed_xs)
        passed.add_value('passing_number', doc_number)
        passed.add_xpath('passing_url', '@href')
        passed.add_value('source',
                         self._get_source(response.url, 'p_asm_id'))
        loader.add_value('passed', passed.load_item())

    yield loader
def _parse_question_agenda(self, response, hxs, question):
    date = question['session']['date']
    registration = None
    for item in hxs:
        # Registration
        if item.select('td[2]/a[1][contains(@href, "p_reg_id=")]'):
            registration = Loader(self, response, Registration(), item,
                                  required=('datetime',))
            url = item.select('td[2]/a[1]/@href').extract()[0]
            _id = text_type(self._get_query_attr(url, 'p_reg_id'))
            registration.add_value('id', _id)
            registration.add_value('datetime', date)
            registration.add_xpath('datetime', 'td[1]/text()')
            registration.add_xpath('joined', 'td[2]/b[1]/text()')

        # Voting
        if item.select('td[2]/a[1][contains(@href, "p_bals_id=")]'):
            voting = self._get_question_voting(response, item, question)
            votes = sum([int(voting.get_output_value('vote_%s' % f))
                         for f in ('aye', 'no', 'abstain')])
            voting.add_value('total_votes', text_type(votes))
            if registration:
                registration = dict(registration.load_item())
                joined = int(registration['joined'])
                # Registered MPs who did not cast a vote.
                voting.add_value('no_vote', text_type(joined - votes))
                voting.add_value('registration', registration)
                registration = None
            yield voting.load_item()
def _get_question_voting(self, response, item, question):
    date = question['session']['date']

    required = (
        '_id',
        'datetime',
        'vote_aye',
        'vote_no',
        'vote_abstain',
        'total_votes',
        'question',
        'source',
    )

    if item.select(u'td[2][contains(a,"alternatyvus balsavimas:")]'):
        voting_type = u'alternatyvus'
        required += ('formulation_a', 'formulation_b',)
    else:
        voting_type = u'paprastas'

    voting = Loader(self, response, Voting(), item, required=required)

    url = item.select('td[2]/a[1]/@href').extract()[0]
    source = self._get_source_absolute_url(response, url, 'p_bals_id')
    _id = source['id']
    voting.add_value('_id', '%sv' % _id)
    voting.add_value('type', voting_type)
    voting.add_value('datetime', date)
    voting.add_xpath('datetime', 'td[1]/text()')
    voting.add_value('question', question['_id'])

    if voting_type == u'alternatyvus':
        voting.add_xpath('formulation_a', 'td[2]/text()[3]')
        voting.add_xpath('formulation_b', 'td[2]/text()[5]')
        self._parse_question_votes(voting, (2, 4, 5, 6))
    else:
        formulation = item.select('td[2]/text()[2]').extract()[0].strip()
        # If the formulation node is equal to '(už', it means there is no
        # formulation at all.
        if formulation.endswith(u'(už'):
            if not formulation == u'(už':
                voting.add_value('formulation', formulation[:-3])
            voting_positions = (1, 2, 3)
        else:
            voting.add_value('formulation', formulation)
            voting_positions = (2, 3, 4, 1)

        if item.select('td[2]/b'):
            self._parse_question_votes(voting, voting_positions)
        else:
            self._parse_question_votes(voting, None)

    voting.add_value('source', source)

    return voting
def _get_session(self, response, hxs):
    session_id = hxs.select('div[1]/a[1]').re(r'p_ses_id=(\d+)')
    hxs = hxs.select("div[2]/b")
    session = Loader(self, response, Session(), hxs, required=(
        'id', 'fakt_pos_id', 'number', 'date', 'type',))
    session.add_value('id', session_id)
    session.add_value('fakt_pos_id',
                      hxs.select('a[1]').re(r'p_fakt_pos_id=(-\d+)'))
    session.add_value('number', hxs.select('a[1]/text()').re(r'Nr. (\d+)'))
    session.add_xpath('date', 'a[2]/text()')
    session.add_xpath('type', 'a[3]/text()')
    return dict(session.load_item())
def parse_related_documents(self, response):
    xpath = '/html/body/div/table/tr[3]/td/table/tr/td/table/tr'
    hxs = HtmlXPathSelector(response).select(xpath)
    act = Loader(self, response, LegalAct(), hxs, required=('_id',))
    act.add_xpath('_id', 'td[2]/b/text()')
    if not act.get_output_value('_id'):
        p_id = unicode(self._get_query_attr(response.url, 'p_id'))
        act.replace_value('_id', u'NONUMBER-%s' % p_id)

    relations = defaultdict(list)
    xpath = '/html/body/div/table/tr[3]/td/table/tr/td/align/table/tr'
    for row in HtmlXPathSelector(response).select(xpath):
        docid = get_all(row, 'td[4]/span//text()')
        rel_type = row.select('td[6]/span/text()')
        if rel_type:
            rel_type = rel_type.extract()[0].strip().lower()
            if rel_type in (u'pakeistas dokumentas',
                            u'ankstesnė dokumento redakcija'):
                relations['amends'].append(docid)
            elif rel_type == u'priimtas dokumentas':
                relations['adopts'].append(docid)
            elif rel_type == u'ryšys su taikymą nusakančiu dokumentu':
                relations['defines_applicability'].append(docid)
            elif rel_type == u'ryšys su galiojimą nusakančiu dokumentu':
                relations['defines_validity'].append(docid)
            elif rel_type == u'negalioja de jure':
                relations['defines_as_no_longer_valid'].append(docid)
            elif rel_type == u'kitas projekto variantas':
                relations['new_draft_version'].append(docid)
            elif rel_type == u'ryšys su ratifikavimo dokumentu':
                relations['ratification'].append(docid)

    if relations:
        act.add_value('relations', dict(relations))

    yield act.load_item()
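# `get_all` comes from the scraper's helper module and is not shown in
# this excerpt. A plausible minimal sketch, assuming it simply extracts
# and joins every text node matched by the given XPath (the stripping and
# the empty joining separator are assumptions):
def get_all(selector, xpath):
    return u''.join(part.strip() for part in selector.select(xpath).extract())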
def parse_person_votes(self, response):
    xpath = ('/html/body/div/table/tr[3]/td/table/tr/td/align/'
             'div[contains(h4,"rezultatai")]/table')
    hxs = HtmlXPathSelector(response).select(xpath)[0]

    source = self._get_source_absolute_url(response, response.url,
                                           'p_bals_id')
    _id = source['id']

    voting = Loader(self, response, Voting(), hxs, required=(
        '_id', 'datetime', 'votes',))
    voting.add_value('_id', '%sv' % _id)

    datetime_xpath_base = '/html/body/div/table/tr[3]/td/table/tr/td/'
    date = hxs.xpath(datetime_xpath_base +
                     'div[2]/b/a[2]/text()')[0].extract()
    time = hxs.xpath(datetime_xpath_base + (
        'align/text()[contains(., "Balsavimo laikas")]/'
        'following-sibling::b[1]/text()'
    ))[0].extract()
    timestamp = '%s %s' % (date, time)
    # Validate the timestamp format before storing it.
    datetime.datetime.strptime(timestamp, '%Y-%m-%d %H:%M:%S')
    voting.add_value('datetime', timestamp)

    self._parse_voting_legal_acts(response, voting)

    for person in hxs.select('tr'):
        if person.select('th'):
            continue  # Skip header row.
        p_vote = Loader(self, response, PersonVote(), person, required=(
            '_id', 'voting_id', 'person', 'fraction', 'vote',))
        p_id = person.select('td[1]/a/@href').re(r'p_asm_id=(-?\d+)')[0]
        vote_value = self._get_vote_value(person)
        # Only include votes that were actually cast.
        if vote_value != 'no-vote':
            p_vote.add_value('_id', '%s:%s' % (_id, p_id))
            p_vote.add_value('voting_id', '%sv' % _id)
            p_vote.add_value('person', '%sp' % p_id)
            p_vote.add_xpath('name', 'td[1]/a/text()')
            p_vote.add_xpath('fraction', 'td[2]/text()')
            p_vote.add_value('datetime', timestamp)
            p_vote.add_value('vote', vote_value)
            p_vote = p_vote.load_item()
            voting.add_value('votes', dict(p_vote))
            yield p_vote

    yield voting.load_item()
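# `_get_vote_value` is defined elsewhere in the spider. A purely
# hypothetical sketch of the mapping it performs, assuming the vote label
# is readable as text in the last cell of the person's table row; the real
# selector and the exact markup of the results table may differ:
def _get_vote_value(self, person):
    label = u''.join(
        person.select('td[last()]//text()').extract()
    ).strip().lower()
    return {
        u'už': 'aye',
        u'prieš': 'no',
        u'susilaikė': 'abstain',
    }.get(label, 'no-vote')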
def _parse_group_items(self, response, person, items, group_type):
    for group_hxs in items.xpath('tr'):
        group_data_hxs = group_hxs.xpath('td[2]')
        group = Loader(self, response, Group(), group_data_hxs,
                       required=('name', 'position'))
        group.add_value('type', group_type)
        group.add_xpath('name', 'a/text()')
        group.add_xpath('source', 'a/@href')
        meta = ''.join(group_hxs.xpath('text() | */text()').extract())
        position, membership = group_meta_re.match(meta).groups()
        group.add_value('position', position)
        membership = date_re.findall(membership or '')
        if len(membership) == 1:
            membership.append(None)
        group.add_value('membership', membership)
        person.add_value('groups', [dict(group.load_item())])
def _parse_group_items(self, response, person, items, group_type):
    for group_hxs in items:
        group = Loader(self, response, Group(), group_hxs,
                       required=('name', 'position'))
        group.add_value('type', group_type)
        group.add_xpath('name', 'a/text()')
        group.add_xpath('source', 'a/@href')
        meta = ''.join(group_hxs.xpath('text() | */text()').extract())
        position, membership = group_meta_re.match(meta).groups()
        group.add_value('position', position)
        membership = date_re.findall(membership or '')
        if len(membership) == 1:
            membership.append(None)
        group.add_value('membership', membership)
        person.add_value('groups', [dict(group.load_item())])
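# `group_meta_re` and `date_re` are module-level patterns defined outside
# this excerpt. Purely hypothetical stand-ins, assuming the row text looks
# roughly like ", narys (nuo 2012-11-16 iki 2016-11-14)"; the real
# patterns and the exact page text may differ:
import re

group_meta_re = re.compile(r'^[\s,]*([^(]+?)\s*(?:\((.+)\))?\s*$')
date_re = re.compile(r'\d{4}-\d{2}-\d{2}')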
def _parse_law_act(self, response, hxs, base=False):
    """
    Extracts basic document information and returns a law act loader.

    Parameters:

    base
        Return only base information about the document. This is used
        when filling in some bits of information for a law act from
        several law act documents.

    """
    lang = hxs.select('tr[1]/td[4]/b/text()').extract()[0].strip().lower()
    if lang not in (u'lietuvių', u'rusų', u'anglų', u'ispanų'):
        self.error(response, 'Unknown language: %s' % lang)
    if lang != u'lietuvių':
        return None

    act = Loader(self, response, LegalAct(), hxs, required=REQUIRED_FIELDS)
    act.add_xpath('_id', 'tr[1]/td[2]/b/text()')
    source = self._get_source(response.url, 'p_id')
    if not act.get_output_value('_id'):
        act.replace_value('_id', u'NONUMBER-%s' % source['id'])

    if base:
        return act

    act.add_xpath('name', 'caption/text()')
    act.add_xpath('kind', 'tr[1]/td[1]/b/text()')
    act.add_xpath('number', 'tr[1]/td[2]/b/text()')
    act.add_xpath('date', 'tr[1]/td[3]/b/text()')
    act.add_value('source', source)
    self._fix_name_case(act)
    return act
def _get_voting_documents(self, response, hxs):
    qdoc = Loader(self, response, VotingDocument(), hxs, required=(
        'id', 'name', 'type', 'number',))
    d_id = hxs.select('b[2]/a[1]/@href').re(r'p_id=(-?\d+)')
    if not d_id:
        return None
    qdoc.add_value('id', u'%sd' % d_id[0])
    qdoc.add_xpath('name', 'b[1]/a/text()')
    qdoc.add_value(
        'type',
        hxs.select('b[1]/following::text()[1]').re('^; (.+)')
    )
    # Matches document numbers such as "XIP-1234" or "XIP-1234(2)".
    number_re = (r'[A-Z]{1,4}'
                 r'-'
                 r'\d+'
                 r'(([a-zA-Z0-9]{1,2})?(\([^)]{1,4}\))?)*')
    qdoc.add_value(
        'number',
        hxs.select('b[1]//text()').re(r'\(Nr. (%s)\)' % number_re)[0]
    )
    return qdoc.load_item()
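# A quick illustration of what the number pattern above is meant to pick
# up, using a made-up snippet of document text; the sample string is an
# assumption, not actual page content:
import re

_number_re = (r'[A-Z]{1,4}'
              r'-'
              r'\d+'
              r'(([a-zA-Z0-9]{1,2})?(\([^)]{1,4}\))?)*')
_m = re.search(r'\(Nr. (%s)\)' % _number_re,
               u'Projekto pataisos (Nr. XIP-1234(2))')
assert _m.group(1) == u'XIP-1234(2)'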
def _parse_person_details(self, response):
    xpath = '//table[@summary="Seimo narys"]'
    hxs = Selector(response).xpath(xpath)[0]

    source = self._get_source(response.url, 'p_asm_id')
    seimas_code = self._get_query_attr(response.url, 'p_r')
    if seimas_code:
        source['version'] = seimas_version_map[int(seimas_code)]
    _id = source['id']

    person_hxs = hxs.xpath('tr/td/table/tr/td[2]/table/tr[2]/td[2]')
    person = Loader(self, response, Person(), person_hxs,
                    required=('first_name', 'last_name'))
    person.add_value('_id', '%sp' % _id)

    # Details
    split = [
        u'asmeniniai puslapiai',
        u'asmeninis elektroninis paštas',
        u'biuro adresas',
        u'darbo telefonas',
        u'iškėlė',
        u'išrinktas',
        u'kabinetas',
        u'kandidato puslapis',
        u'padėjėja sekretorė',
        u'seimo narys',
    ]
    details = ' '.join(person_hxs.xpath('descendant::text()').extract())
    details = str2dict(split, details, normalize=mapwords({
        u'išrinkta': u'išrinktas',
        u'seimo narė': u'seimo narys',
        u'el p': u'asmeninis elektroninis paštas',
        u'asmeninė svetainė': u'asmeniniai puslapiai',
    }))
    details = dict(details)
    email = details.get(u'asmeninis elektroninis paštas', '')
    phone = details.get(u'darbo telefonas', '')
    person.add_value('constituency', [details.get(u'išrinktas', '')])
    person.add_value('raised_by', [details.get(u'iškėlė', '')])
    person.add_value('email', split_by_comma(email))
    person.add_value('phone', split_by_comma(phone))
    person.add_value('office_address', [details.get(u'biuro adresas', '')])
    person.add_xpath(
        'home_page',
        u'a[contains(font/text(), "Asmeniniai puslapiai") or '
        u'contains(font/text(), "Asmeninė svetainė")]/@href'
    )
    person.add_xpath('candidate_page',
                     'a[contains(text(), "Kandidato puslapis")]/@href')
    person.add_value('source', source)

    # photo
    photo = hxs.xpath('tr/td/table/tr/td/div/img/@src').extract()[0]
    person.add_value('photo', photo)
    person.add_value('image_urls', photo)

    header_hxs = hxs.xpath('tr/td/table/tr/td[2]/table/tr/td[2]')

    # parliament
    parliament = header_hxs.xpath('div/b/font/text()')
    parliament = parliament.re(r'(\d{4}[-\x97]\d{4})')
    parliament = ''.join(parliament).replace(u'\x97', u'-')
    person.add_value('parliament', parliament)

    if u'seimo narys' in details:
        keys = ['nuo', 'iki']
        membership = dict(str2dict(keys, details[u'seimo narys']))
        parliament_group = {
            'type': 'parliament',
            'name': parliament,
            'position': u'seimo narys',
            'membership': [membership['nuo'], membership.get('iki')],
        }
        person.add_value('groups', [parliament_group])

    # name (first name, last name)
    name = header_hxs.xpath('div/b/font[2]/text()').extract()[0]
    self._parse_name(person, name)

    # groups
    party_name = person.get_output_value('raised_by')
    if party_name:
        person.add_value('groups', [{'type': 'party',
                                     'name': party_name,
                                     'position': 'narys'}])
    self._parse_groups(response, hxs, person)

    # date of birth
    xpath = (u'tr/td/table/'
             u'tr[contains(descendant::text(), "Biografija")]/'
             u'following-sibling::tr/td/'
             u'descendant::*[contains(text(), "Gimė")]/text()')
    dob_hxs = hxs.xpath(u'translate(%s, "\xa0", " ")' % xpath)
    dob_match = dob_hxs.re(dob_re)
    if dob_match:
        year, month, day = dob_match
        month = month_names_map[month]
        dob = u'%s-%02d-%s' % (year, month, day.zfill(2))
        person.add_value('dob', dob)

    # biography
    xpath = (u'tr/td/table/'
             u'tr[contains(descendant::text(), "Biografija")]/'
             u'following-sibling::tr/td/div')
    bio_hxs = hxs.xpath(xpath)
    self._parse_biography(response, person, bio_hxs)

    # parliamentary history
    xpath = (u'//table[@summary="Istorija"]/'
             u'tr/td/a[starts-with(b/text(), "Buvo išrinkta")]/'
             u'following-sibling::text()')
    history_hxs = hxs.xpath(xpath)
    if history_hxs:
        for item in history_hxs:
            parliament = ''.join(item.re(r'(\d{4}) (-) (\d{4})'))
            person.add_value('parliament', [parliament])

    return person.load_item()
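# `month_names_map` is defined elsewhere in the scraper. A sketch of what
# it most plausibly contains: Lithuanian month names in the genitive case,
# as they appear in biography lines such as "Gimė 1956 m. kovo 5 d."
# (the exact keys used by the real module are an assumption):
month_names_map = {
    u'sausio': 1, u'vasario': 2, u'kovo': 3, u'balandžio': 4,
    u'gegužės': 5, u'birželio': 6, u'liepos': 7, u'rugpjūčio': 8,
    u'rugsėjo': 9, u'spalio': 10, u'lapkričio': 11, u'gruodžio': 12,
}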