예제 #1
0
    def parse_related_documents(self, response):
        """
        Yield a legal act item that lists the documents related to it.

        The act ``_id`` is read from the document header table; when the
        header gives no value, ``NONUMBER-<p_id>`` is used instead, with
        ``p_id`` taken from the response URL.

        Every row of the relations table contributes its document id to the
        list of the relation key that matches the row's relation label.
        Rows without a label, or with a label that is not listed below, are
        ignored.

        Parameters:

        response
            Response of the related documents page.

        Nothing is yielded when no known relation was found.

        """
        xpath = '/html/body/div/table/tr[3]/td/table/tr/td/table/tr'
        hxs = HtmlXPathSelector(response).select(xpath)
        act = Loader(self, response, LegalAct(), hxs, required=('_id', ))
        act.add_xpath('_id', 'td[2]/b/text()')

        if not act.get_output_value('_id'):
            p_id = unicode(self._get_query_attr(response.url, 'p_id'))
            act.replace_value('_id', u'NONUMBER-%s' % p_id)

        # Relation label as shown on the page (stripped, lower-cased) mapped
        # to the key under which the related document id is stored.
        relation_keys = {
            u'pakeistas dokumentas': 'amends',
            u'ankstesnė dokumento redakcija': 'amends',
            u'priimtas dokumentas': 'adopts',
            u'ryšys su taikymą nusakančiu dokumentu': 'defines_applicability',
            u'ryšys su galiojimą nusakančiu dokumentu': 'defines_validity',
            u'negalioja de jure': 'defines_as_no_longer_valid',
            u'kitas projekto variantas': 'new_draft_version',
            u'ryšys su ratifikavimo dokumentu': 'ratification',
        }

        relations = defaultdict(list)
        xpath = '/html/body/div/table/tr[3]/td/table/tr/td/align/table/tr'
        for row in HtmlXPathSelector(response).select(xpath):
            docid = get_all(row, 'td[4]/span//text()')
            rel_type = row.select('td[6]/span/text()')
            if not rel_type:
                # No label cell in this row; an empty selector result is a
                # list and must not be used as a dictionary key.
                continue

            rel_type = rel_type.extract()[0].strip().lower()
            relation = relation_keys.get(rel_type)
            if relation is not None:
                relations[relation].append(docid)

        if relations:
            act.add_value('relations', dict(relations))
            yield act.load_item()
예제 #2
0
    def _parse_law_act(self, response, hxs, base=False):
        """
        Build a law act loader from the document header selected by ``hxs``.

        Parameters:

        response
            Response the document was fetched with; its URL provides the
            document source.

        hxs
            Selector of the header table the fields are read from.

        base
            When true, stop after the ``_id`` has been set. This is used
            when several law act documents each contribute some
            information bits to the same law act.

        Returns the loader, or ``None`` when the document language is not
        Lithuanian. An unrecognised language is additionally reported
        through ``self.error``.

        """
        known_languages = (u'lietuvių', u'rusų', u'anglų', u'ispanų')

        language_cell = hxs.select('tr[1]/td[4]/b/text()').extract()
        language = language_cell[0].strip().lower()

        if language not in known_languages:
            self.error(response, 'Unknown language: %s' % language)

        # Only Lithuanian documents are turned into law acts.
        if language != u'lietuvių':
            return None

        loader = Loader(self, response, LegalAct(), hxs,
                        required=REQUIRED_FIELDS)
        loader.add_xpath('_id', 'tr[1]/td[2]/b/text()')

        source = self._get_source(response.url, 'p_id')

        # Fall back to an id derived from the source when the header has none.
        if not loader.get_output_value('_id'):
            loader.replace_value('_id', u'NONUMBER-%s' % source['id'])

        if not base:
            header_fields = (
                ('name', 'caption/text()'),
                ('kind', 'tr[1]/td[1]/b/text()'),
                ('number', 'tr[1]/td[2]/b/text()'),
                ('date', 'tr[1]/td[3]/b/text()'),
            )
            for field_name, field_xpath in header_fields:
                loader.add_xpath(field_name, field_xpath)

            loader.add_value('source', source)
            self._fix_name_case(loader)

        return loader
예제 #3
0
    def _parse_person_details(self, response):
        """
        Parse a parliament member profile page into a ``Person`` item.

        Works on the ``table[@summary="Seimo narys"]`` element of the page
        and fills a ``Person`` loader with the person id, contact details,
        photo, parliament term, group memberships, name, date of birth,
        biography and parliamentary history.

        Parameters:

        response
            Response of the profile page. The source is built from its URL
            using the ``p_asm_id`` query attribute; the optional ``p_r``
            query attribute selects the source version.

        Returns the item produced by ``person.load_item()``.

        The first match is taken unconditionally for the main table, the
        photo and the name, so a page lacking any of them fails with an
        index error.

        """
        xpath = '//table[@summary="Seimo narys"]'
        hxs = Selector(response).xpath(xpath)[0]

        source = self._get_source(response.url, 'p_asm_id')

        # Translate the numeric ``p_r`` code through seimas_version_map.
        seimas_code = self._get_query_attr(response.url, 'p_r')
        if seimas_code:
            source['version'] = seimas_version_map[int(seimas_code)]

        _id = source['id']

        person_hxs = hxs.xpath('tr/td/table/tr/td[2]/table/tr[2]/td[2]')
        person = Loader(self, response, Person(), person_hxs,
                        required=('first_name', 'last_name'))
        # Person ids are the source id with a "p" suffix.
        person.add_value('_id', '%sp' % _id)

        # Details: the text of the details cell is flattened into a single
        # string and handed to str2dict together with the labels below.
        # NOTE(review): presumably str2dict cuts the text at these labels
        # and returns (label, value) pairs - confirm against str2dict.

        split = [
            u'asmeniniai puslapiai',
            u'asmeninis elektroninis paštas',
            u'biuro adresas',
            u'darbo telefonas',
            u'iškėlė',
            u'išrinktas',
            u'kabinetas',
            u'kandidato puslapis',
            u'padėjėja sekretorė',
            u'seimo narys',
        ]
        details = ' '.join(person_hxs.xpath('descendant::text()').extract())
        # Alternative label spellings are mapped onto labels listed in
        # ``split`` above.
        details = str2dict(split, details, normalize=mapwords({
            u'išrinkta': u'išrinktas',
            u'seimo narė': u'seimo narys',
            u'el p': u'asmeninis elektroninis paštas',
            u'asmeninė svetainė': u'asmeniniai puslapiai',
        }))
        details = dict(details)

        email = details.get(u'asmeninis elektroninis paštas', '')
        phone = details.get(u'darbo telefonas', '')

        # Missing details are passed on as empty strings.
        person.add_value('constituency', [details.get(u'išrinktas', '')])
        person.add_value('raised_by', [details.get(u'iškėlė', '')])
        person.add_value('email', split_by_comma(email))
        person.add_value('phone', split_by_comma(phone))
        person.add_value('office_address', [details.get(u'biuro adresas', '')])

        # Links are matched by their caption; both caption variants of the
        # personal home page are accepted.
        person.add_xpath(
            'home_page',
            u'a[contains(font/text(), "Asmeniniai puslapiai") or contains(font/text(), "Asmeninė svetainė")]/@href'
        )
        person.add_xpath('candidate_page',
                         'a[contains(text(), "Kandidato puslapis")]/@href')

        person.add_value('source', source)

        # Photo: the same URL is stored both as ``photo`` and ``image_urls``.
        photo = hxs.xpath('tr/td/table/tr/td/div/img/@src').extract()[0]
        person.add_value('photo', photo)
        person.add_value('image_urls', photo)

        header_hxs = hxs.xpath('tr/td/table/tr/td[2]/table/tr/td[2]')

        # Parliament: a "YYYY-YYYY" year range from the header. The
        # separator may be a hyphen or the \x97 character, which is
        # normalised to a hyphen.
        parliament = header_hxs.xpath('div/b/font/text()')
        parliament = parliament.re(r'(\d{4}[-\x97]\d{4})')
        parliament = ''.join(parliament).replace(u'\x97', u'-')
        person.add_value('parliament', parliament)
        if u'seimo narys' in details:
            # The membership text is split on "nuo" and "iki"; "nuo" must be
            # present, "iki" is optional (None when absent).
            keys = ['nuo', 'iki']
            membership = dict(str2dict(keys, details[u'seimo narys']))
            parliament_group = {
                'type': 'parliament',
                'name': parliament,
                'position': u'seimo narys',
                'membership': [membership['nuo'], membership.get('iki')],
            }
            person.add_value('groups', [parliament_group])

        # Name (first name, last name) is taken from the second <font> of
        # the header and split by self._parse_name.
        name = header_hxs.xpath('div/b/font[2]/text()').extract()[0]
        self._parse_name(person, name)

        # Groups: the party is whatever the loader outputs for
        # ``raised_by``; further groups are added by self._parse_groups.
        party_name = person.get_output_value('raised_by')
        if party_name:
            person.add_value('groups', [{'type': 'party',
                                         'name': party_name,
                                         'position': 'narys'}])

        self._parse_groups(response, hxs, person)

        # Date of birth: text containing "Gimė" in the rows that follow the
        # "Biografija" row. Non-breaking spaces are turned into plain spaces
        # by XPath translate() before dob_re is applied.
        xpath = (u'tr/td/table/'
                 u'tr[contains(descendant::text(), "Biografija")]/'
                 u'following-sibling::tr/td/'
                 u'descendant::*[contains(text(), "Gimė")]/text()')
        dob_hxs = hxs.xpath(u'translate(%s, "\xa0", " ")' % xpath)
        dob_match = dob_hxs.re(dob_re)
        if dob_match:
            # Expects exactly three captured values: year, month name, day.
            year, month, day = dob_match
            # month_names_map must yield an integer, it is formatted
            # with %02d below.
            month = month_names_map[month]
            dob = u'%s-%02d-%s' % (year, month, day.zfill(2))
            person.add_value('dob', dob)

        # Biography: the <div> elements in the rows that follow the
        # "Biografija" row are handed to self._parse_biography.
        xpath = (u'tr/td/table/'
                 u'tr[contains(descendant::text(), "Biografija")]/'
                 u'following-sibling::tr/td/div')
        bio_hxs = hxs.xpath(xpath)
        self._parse_biography(response, person, bio_hxs)

        # Parliamentary history: text nodes following the "Buvo išrinkta"
        # link of the "Istorija" table. The path starts with "//", so it is
        # evaluated from the document root rather than relative to ``hxs``.
        xpath = (u'//table[@summary="Istorija"]/'
                 u'tr/td/a[starts-with(b/text(), "Buvo išrinkta")]/'
                 u'following-sibling::text()')
        history_hxs = hxs.xpath(xpath)
        if history_hxs:
            for item in history_hxs:
                # "YYYY - YYYY" is captured in three groups and joined into
                # "YYYY-YYYY". NOTE(review): a text node without a match
                # still adds an empty string to ``parliament``.
                parliament = ''.join(item.re(r'(\d{4}) (-) (\d{4})'))
                person.add_value('parliament', [parliament])

        return person.load_item()
예제 #4
0
    def _parse_person_details(self, response):
        """
        Parse a parliament member profile page into a ``Person`` item.

        Works on the ``div`` whose id contains ``page-content`` and fills a
        ``Person`` loader with the person id, contact details, photo,
        parliament term, group memberships, name and parliamentary history.

        Parameters:

        response
            Response of the profile page. The source is built from its URL
            using the ``p_asm_id`` query attribute; the optional ``p_r``
            query attribute selects the source version.

        Returns the item produced by ``person.load_item()``.

        The first match is taken unconditionally for the page content
        element, the first name and the last name, so a page lacking any of
        them fails with an index error.

        """
        def normalize_dash(text):
            # Year ranges are matched with any non-hyphen separator; both
            # the en dash and the legacy \x97 character are turned into a
            # plain hyphen so that every ``parliament`` value has the same
            # "YYYY-YYYY" form.
            return text.replace(u'\u2013', u'-').replace(u'\x97', u'-')

        xpath = '//div[contains(@id,"page-content")]'
        hxs = Selector(response).xpath(xpath)[0]

        source = self._get_source(response.url, 'p_asm_id')

        # Translate the numeric ``p_r`` code through seimas_version_map.
        seimas_code = self._get_query_attr(response.url, 'p_r')
        if seimas_code:
            source['version'] = seimas_version_map[int(seimas_code)]

        _id = source['id']
        person_hxs = hxs.xpath('div/div[contains(@class, "col1")]')
        person = Loader(self, response, Person(), person_hxs,
                        required=('first_name', 'last_name'))
        # Person ids are the source id with a "p" suffix.
        person.add_value('_id', '%sp' % _id)

        # Details: the text of the first column is flattened into a single
        # string and handed to str2dict together with the labels below.
        split = [
            u'asmeniniai puslapiai',
            u'asmeninis elektroninis paštas',
            u'biuro adresas',
            u'darbo telefonas',
            u'iškėlė',
            u'išrinktas',
            u'kabinetas',
            u'kandidato puslapis',
            u'padėjėja sekretorė',
            u'seimo narys',
            u'buvo išrinktas',
            u'buvo išrinkta',
            u'kontaktai',
        ]
        details = ' '.join(person_hxs.xpath('descendant::text()').extract())

        # Alternative label spellings are mapped onto labels listed above.
        details = str2dict(split, details, normalize=mapwords({
            u'išrinkta': u'išrinktas',
            u'seimo narė': u'seimo narys',
        }))

        details = dict(details)

        # Contacts live in a separate block in the third column.
        contacts_hxs = hxs.xpath(
            'div/div[contains(@class, "col3")]'
            '/div[contains(@class, "kontaktai")]'
        )
        contacts = ' '.join(contacts_hxs.xpath('descendant::text()').extract())
        contacts_split = [
            u'el p',
            u'tel',
            u'asmeninė svetainė'
        ]
        contacts = str2dict(contacts_split, contacts)
        contacts = dict(contacts)

        if contacts.get(u'tel'):
            # Keep the digits only.
            phone = re.sub("[^0-9]", "", contacts.get(u'tel'))
            person.add_value('phone', phone)

        # E-mail addresses are read from the link texts rather than from
        # the flattened contacts text.
        email_xpath = 'div/div[contains(descendant::text(), "El. p.")]/a/text()'
        email_hxs = contacts_hxs.xpath(email_xpath)

        for email in email_hxs:
            person.add_value('email', email.extract())

        # TODO: the office address is not extracted from this page layout
        # yet, an empty placeholder is stored instead.
        person.add_value('office_address', [''])

        website_hxs = contacts_hxs.xpath(
            'div/div[contains(@class, "site")]/a/@href'
        )
        if website_hxs:
            person.add_value(
                'home_page',
                website_hxs.extract()[0]
            )
        # Missing details are passed on as empty strings.
        person.add_value('raised_by', [details.get(u'iškėlė', '')])
        person.add_value('constituency', [details.get(u'išrinktas', '')])

        person.add_value('source', source)

        # Photo: the selectors are tried in order and the first one that
        # matches wins. The first selector is for the P leader page, the
        # second one for the rest.
        photo_selectors = [
            '//*[@id="page-content"]/div/div[1]/div[1]/img/@src',
            '//*[contains(@class, "seimo-nario-foto")]/img/@src',
        ]
        photo = None
        for photo_selector in photo_selectors:
            photo = Selector(response).xpath(photo_selector).extract()
            if photo:
                break
        if photo:
            # The same URL is stored both as ``photo`` and ``image_urls``.
            person.add_value('photo', photo[0])
            person.add_value('image_urls', photo[0])

        # Parliament: a year range of the current term, normalised to
        # "YYYY-YYYY".
        parliament = hxs.xpath(
            'div/div/div[contains(@class, "smn-kadencija")]/span/text()'
        )
        parliament = parliament.re(r'(\d{4}[^-]\d{4})')
        parliament = normalize_dash(''.join(parliament))
        person.add_value('parliament', parliament)
        if u'seimo narys' in details:
            # The membership text is split on "nuo" and "iki"; "nuo" must be
            # present, "iki" is optional (None when absent).
            keys = ['nuo', 'iki']
            membership = dict(str2dict(keys, details[u'seimo narys']))
            parliament_group = {
                'type': 'parliament',
                'name': parliament,
                'position': u'seimo narys',
                'membership': [membership['nuo'], membership.get('iki')],
            }
            person.add_value('groups', [parliament_group])

        # Name: first and last name have dedicated elements; the last name
        # is stored title-cased.
        first_name = Selector(response).xpath('//*/div[contains(@class, "smn-name")]/text()').extract()[0]
        last_name = Selector(response).xpath('//*/span[contains(@class, "smn-pavarde")]/text()').extract()[0]

        person.add_value('first_name', unicode(first_name))
        person.add_value('last_name', unicode(last_name.title()))

        # Groups: the party is whatever the loader outputs for
        # ``raised_by``; further groups are added by self._parse_groups.
        party_name = person.get_output_value('raised_by')
        if party_name:
            person.add_value('groups', [{'type': 'party',
                                         'name': party_name,
                                         'position': 'narys'}])
        self._parse_groups(response, hxs, person)

        # TODO: biography parsing (self._parse_biography) is not wired up
        # for this page layout.

        # Parliamentary history: every text node of the "buvo-isrinkta"
        # paragraph contributes one ``parliament`` value.
        xpath = (
                u'div/div[contains(@class, "col1")]/'
                u'p[contains(@class, "buvo-isrinkta")]/descendant::text()'
                )
        history_hxs = hxs.xpath(xpath)

        for item in history_hxs:
            # NOTE(review): a text node without a year range still adds an
            # empty string, as before.
            parliament = normalize_dash(''.join(item.re(r'(\d{4}[^-]\d{4})')))
            person.add_value('parliament', [parliament])

        return person.load_item()